In [31]:
import sys
import pandas as pd
import os
from metient.util import data_extraction_util as dutil
from metient.util import vertex_labeling_util as vutil

repo_dir = os.path.join(os.getcwd(), "../")
data_dir = os.path.join(repo_dir, 'data', 'mcpherson_ovarian_2016')

## (1) Read in data from McPherson et.al. supplemental tables

In [2]:
# First, read in targeted deep sequencing read data
# Taken directly from supplementary table 10 of McPherson et.al.
reads_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_10.csv'))
patient_ids = reads_df['patient_id'].unique()
print("patient IDs:", patient_ids)
reads_df['gene_name'] = reads_df['gene_name'].astype(str)
def label_snv(row):
    label = []
    if row['gene_name'] != "nan":
        label.append(row['gene_name'])
    label += [str(row['chrom']), str(int(row['coord']))]
    return (":").join(label)
reads_df['character_label'] = reads_df.apply(lambda row: label_snv(row), axis=1)
reads_df

patient IDs: [ 1  2  3  4  7  9 10]


Unnamed: 0,sample_id,patient_id,malignant,primer_id,chrom,coord,ref,alt,ref_counts,alt_counts,depth,alt_freq,background_average_alt_freq,ref_p_value,alt_p_value,status,gene_name,snpeff_impact,character_label
0,normal_blood,1,no,1_b_amplicrazy,1,17090971.0,G,A,641,0,641,0.000000,0.002034,0.0,1.000000e+00,normal_sample,,,1:17090971
1,omentum_site_1,1,yes,1_b_amplicrazy,1,17090971.0,G,A,449,0,449,0.000000,0.001597,0.0,1.000000e+00,wildtype,,,1:17090971
2,right_ovary_site_1,1,yes,1_b_amplicrazy,1,17090971.0,G,A,395,64,459,0.139434,0.002375,0.0,7.800000e-90,somatic,,,1:17090971
3,right_ovary_site_2,1,yes,1_b_amplicrazy,1,17090971.0,G,A,415,1,416,0.002404,0.002754,0.0,6.825260e-01,wildtype,,,1:17090971
4,right_ovary_site_3,1,yes,1_b_amplicrazy,1,17090971.0,G,A,609,4,613,0.006525,0.002145,0.0,4.440600e-02,wildtype,,,1:17090971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15293,normal_blood,10,no,10_b_amplicrazy,X,110000000.0,A,G,3669,10,3679,0.002718,0.002239,0.0,3.130860e-01,normal_sample,RGAG1,LOW,RGAG1:X:110000000
15294,right_ovary_site_1,10,yes,10_b_amplicrazy,X,110000000.0,A,G,2037,295,2332,0.126501,0.002306,0.0,0.000000e+00,somatic,RGAG1,LOW,RGAG1:X:110000000
15295,right_ovary_site_2,10,yes,10_b_amplicrazy,X,110000000.0,A,G,1369,4,1373,0.002913,0.002442,0.0,4.314750e-01,wildtype,RGAG1,LOW,RGAG1:X:110000000
15296,right_ovary_site_3,10,yes,10_b_amplicrazy,X,110000000.0,A,G,2675,4,2679,0.001493,0.002189,0.0,8.364490e-01,wildtype,RGAG1,LOW,RGAG1:X:110000000


In [3]:
pyclone_to_clone_id_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_8.csv'))
pyclone_to_clone_id_df

Unnamed: 0,patient_id,clone_id,pyclone_cluster_id,present
0,1,A,2,1
1,1,A,3,0
2,1,A,4,0
3,1,A,5,1
4,1,A,6,1
...,...,...,...,...
372,10,F,3,1
373,10,F,7,1
374,10,F,11,1
375,10,F,12,0


In [25]:
sample_df = pd.read_csv(os.path.join(data_dir, 'supplemental_table_1.csv'))
sample_df = sample_df[sample_df['sample_id']!='normal_blood']
sample_df['anatomical_site_label'] = sample_df.apply(lambda row: ''.join([i for i in row['anatomy'] if not i.isdigit()]).replace("Site", "").strip(), axis=1)
sample_df


Unnamed: 0,patient_id,sample_id,paper_id,malignant,discovery_sample,tissue_source,anatomy,anatomical_site_label
1,1,appendix_site_c1,ApC1,yes,no,ffpe,Appendix,Appendix
2,1,left_fallopian_tube_site_b4,LFTB4,yes,no,ffpe,Left Fallopian Tube,Left Fallopian Tube
3,1,left_ovary_site_b2,LOvB2,yes,no,ffpe,Left Ovary,Left Ovary
4,1,right_fallopian_tube_site_a16,RFTA16,yes,no,ffpe,Right Fallopian Tube,Right Fallopian Tube
5,1,right_ovary_site_a4,ROvA4,yes,no,ffpe,Right Ovary,Right Ovary
...,...,...,...,...,...,...,...,...
70,10,right_ovary_site_a9,ROvA9,yes,no,ffpe,Right Ovary,Right Ovary
71,10,right_ovary_site_1,ROv1,yes,yes,fresh_frozen,Right Ovary Site 1,Right Ovary
72,10,right_ovary_site_2,ROv2,yes,yes,fresh_frozen,Right Ovary Site 2,Right Ovary
73,10,right_ovary_site_3,ROv3,yes,yes,fresh_frozen,Right Ovary Site 3,Right Ovary


In [79]:
sample_df['anatomical_site_label'].unique()

array(['Appendix', 'Left Fallopian Tube', 'Left Ovary',
       'Right Fallopian Tube', 'Right Ovary', 'Small Bowel', 'Omentum',
       'Cul de Sac', 'Sigmoid Colon Deposit',
       'Left Fallopian Tube Fimbriae', 'Adnexa', 'Left Ovary Surface',
       'Left Pelvic Sidewall', 'Right Pelvic Sidewall',
       'Brain Metastasis', 'Bowel Implant', 'Right Uterosacral',
       'Righ Pelvic Mass'], dtype=object)

In [34]:
prevalences_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_9.csv'))
prevalences_df

Unnamed: 0,patient_id,paper_id,clone_id,prevalence
0,1,ApC1,A,0.055
1,1,ApC1,B,0.936
2,1,ApC1,C,0.003
3,1,ApC1,D,0.002
4,1,ApC1,E,0.001
...,...,...,...,...
404,10,ROvA9,B,0.241
405,10,ROvA9,C,0.246
406,10,ROvA9,D,0.009
407,10,ROvA9,E,0.229


From McPherson et.al.:
Presence of each clone in each site was calculated as follows. First, samples were grouped into more broad
anatomic locations, for instance ROv1, 2 etc. were grouped as the ROv site. A clone was said to be present in a
sample if it was assigned a clonal prevalence greater than 0.01. Parent clones are often predicted to coexist with child
clones at small proportions, though based on single cell data, such a scenario is likely an artifact. Parent and child
clones are usually distinguished by an absence of mutations in the parent. If the mutations that identify the child
clone are estimated to have slightly lower prevalence than more ancestral mutations, that slight deviation will results
in prediction of a minor population of the parent clone in the sample. On the other hand, a sample composed of 95%
parent and 5% child is more likely to represent a true mixture. To account for these issues, we use an additional rule
for determining whether a parent clone co-exists with its child in a sample. A predicted cellular prevalence of non-leaf
clone X must be at least 50% of the combined cellular prevalence of X and its descendants to be considered present in
the sample. A clone is said to be present in a site if the clone, by the above rules, is present in any of the samples of
that site.

In [82]:
cluster_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']

def get_combined_prevalence(pid, clone_ids, sample):
    total_prevalence = 0.0
    for clone_id in clone_ids:
        subset_df = prevalences_df[(prevalences_df['patient_id']==pid)&(prevalences_df['clone_id']==clone_id)&(prevalences_df['paper_id']==sample)]
        if len(subset_df) > 0:
            prevalence = subset_df['prevalence'].item()
            total_prevalence += float(prevalence)
    return total_prevalence

def get_presence_in_site(tree, path_matrix, pid, clone_id, site):
    pid_clone_df = prevalences_df[(prevalences_df['patient_id']==pid)&(prevalences_df['clone_id']==clone_id)]
    site_samples = sample_df[sample_df['anatomical_site_label']==site]['paper_id'].unique()
    
    descendants = [cluster_labels[i[0].item()] for i in path_matrix[cluster_labels.index(clone_id)].nonzero()]
    present_in_site = False
    for sample in site_samples:
        clone_prevalence = get_combined_prevalence(pid, [clone_id], sample)
        descendants_prevalence = get_combined_prevalence(pid, [clone_id]+descendants, sample)
#         print(clone_id, descendants, sample, clone_prevalence, descendants_prevalence)
        if clone_prevalence > 0.01 and clone_prevalence > 0.5*descendants_prevalence:
            present_in_site = True
    return present_in_site

def get_parent(tree, node_index):
    # Look for a '1' in the column corresponding to the node_index
    parent_index = (tree[:, node_index] == 1).nonzero(as_tuple=True)[0]
    return parent_index

def get_num_mutations(tree, clone_id)

cols = ['anatomical_site_label', 'cluster_index', 'cluster_label', 'present', 'site_category', 'num_mutations']
for pid in patient_ids:
    tree = dutil.get_adjacency_matrix_from_txt_edge_list(os.path.join(data_dir, f"patient{pid}_tree.txt"))
    path_matrix = vutil.get_path_matrix(tree, remove_self_loops=True)
    clones = cellular_prevalences_df[cellular_prevalences_df['patient_id']==pid]['clone_id'].unique()
    unique_sites = sample_df[sample_df['patient_id']==pid]['anatomical_site_label'].unique()
    print(clones, unique_sites)
    data = []
    for clone in clones:
        for site in unique_sites:
            present_in_site = get_presence_in_site(tree, path_matrix, pid, clone, site)
            if present_in_site:
                print(clone, site)
            site_category = 'primary' if 'Ovary' in site or 'Uterosacral' in site else 'metastasis'
            data.append([site, cluster_labels.index(clone), clone, 1 if present_in_site else 0, site_category, 1])
    # Build a dataframe for each patient w/ the information in cols
    df = pd.DataFrame(data, columns=cols)
    print(df)
    

['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I'] ['Appendix' 'Left Fallopian Tube' 'Left Ovary' 'Right Fallopian Tube'
 'Right Ovary' 'Small Bowel' 'Omentum']
A Left Fallopian Tube
A Right Fallopian Tube
B Appendix
B Left Fallopian Tube
B Left Ovary
B Small Bowel
B Omentum
C Small Bowel
D Left Ovary
E Right Ovary
G Right Ovary
H Left Ovary
H Right Ovary
I Right Ovary
   anatomical_site_label  cluster_index cluster_label  present site_category  \
0               Appendix              0             A        0    metastasis   
1    Left Fallopian Tube              0             A        1    metastasis   
2             Left Ovary              0             A        0       primary   
3   Right Fallopian Tube              0             A        1    metastasis   
4            Right Ovary              0             A        0       primary   
..                   ...            ...           ...      ...           ...   
58            Left Ovary              8             I        0       primary   
5

B Left Ovary Surface
B Left Pelvic Sidewall
B Right Ovary
B Right Pelvic Sidewall
D Right Ovary
E Right Ovary
F Right Ovary
G Right Ovary
H Right Ovary
    anatomical_site_label  cluster_index cluster_label  present site_category  \
0      Left Ovary Surface              0             A        0       primary   
1    Left Pelvic Sidewall              0             A        0    metastasis   
2             Right Ovary              0             A        0       primary   
3   Right Pelvic Sidewall              0             A        0    metastasis   
4      Left Ovary Surface              1             B        1       primary   
5    Left Pelvic Sidewall              1             B        1    metastasis   
6             Right Ovary              1             B        1       primary   
7   Right Pelvic Sidewall              1             B        1    metastasis   
8      Left Ovary Surface              2             C        0       primary   
9    Left Pelvic Sidewall             

In [76]:
prevalences_df[(prevalences_df['patient_id']==1)&(prevalences_df['clone_id']=='A')]

Unnamed: 0,patient_id,paper_id,clone_id,prevalence
0,1,ApC1,A,0.055
9,1,LFTB4,A,0.543
18,1,LOvB2,A,0.3
27,1,Om1,A,0.024
36,1,RFTA16,A,0.986
45,1,ROv1,A,0.004
54,1,ROv2,A,0.002
63,1,ROv3,A,0.002
72,1,ROv4,A,0.002
81,1,ROvA4,A,0.002


In [3]:
pyclone_cluster_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_16.csv'))
pyclone_cluster_df[pyclone_cluster_df['patient_id']==10]

Unnamed: 0,patient_id,sample_id,chrom,coord,ref,alt,primer_set,cluster_id,mean,std,ci_length,ml_loss,ml_origin,ml_presence,deletion
5516,10,right_ovary_site_1,10,101648657,C,A,amplicrazy,7,0.996462,0.003303,0.009970,0,0,1,0
5517,10,right_ovary_site_1,10,123969955,C,T,amplicrazy,7,0.989874,0.049450,0.018570,0,0,1,0
5518,10,right_ovary_site_1,10,127350427,T,C,amplicrazy,7,0.965689,0.095702,0.263609,0,0,1,0
5519,10,right_ovary_site_1,10,31815848,G,A,amplicrazy,7,0.995950,0.014094,0.010017,0,0,1,0
5520,10,right_ovary_site_1,11,247341,G,A,amplicrazy,7,0.996430,0.004301,0.009970,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,10,right_ovary_site_4,6,53142417,T,C,10_b_1,13,0.408940,0.114234,0.385519,0,1,1,0
6496,10,right_ovary_site_1,3,142681930,C,T,amplicrazy,8,0.814847,0.231407,0.654302,0,0,1,0
6497,10,right_ovary_site_3,3,142681930,C,T,amplicrazy,8,0.685740,0.355491,0.986055,0,0,1,0
6498,10,right_ovary_site_2,3,142681930,C,T,amplicrazy,8,0.650817,0.371326,0.967168,0,0,1,0


In [5]:
pyclone_cluster_df
def get_clone_id(row):
    pid = row['patient_id']
    pyclone_cluster_id = row['cluster_id']
    pt_cluster_df =  pyclone_to_clone_id_df[(pyclone_to_clone_id_df['pyclone_cluster_id']==pyclone_cluster_id)&(pyclone_to_clone_id_df['patient_id']==pid)]
    if len(pt_cluster_df) != 0:
        return pt_cluster_df['clone_id'].unique()
    return None
pyclone_cluster_df['clone_id'] = pyclone_cluster_df.apply(lambda row:get_clone_id(row), axis=1)


In [8]:
pyclone_to_clone_id_df[(pyclone_to_clone_id_df['pyclone_cluster_id']==7)&(pyclone_to_clone_id_df['patient_id']==10)]


Unnamed: 0,patient_id,clone_id,pyclone_cluster_id,present
343,10,A,7,1
349,10,B,7,1
355,10,C,7,1
361,10,D,7,1
367,10,E,7,1
373,10,F,7,1


In [10]:
reads_df[reads_df['patient_id']==10]['sample_id'].unique()

array(['left_fallopian_tube_site_b2', 'normal_blood', 'omentum_site_c1',
       'right_ovary_site_1', 'right_ovary_site_2', 'right_ovary_site_3',
       'right_ovary_site_4', 'right_ovary_site_a4', 'right_ovary_site_a9'],
      dtype=object)

## (2) Generate inputs for PyClone clustering

### (2a) Create input for PyClone clustering

In [4]:

# get CNA info from supplement of McPherson et. al. this is only available for the discovery samples
cna_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_7.csv'))
print(cna_df)

def subset_cna(row):
    chrom = row['chrom']
    coord = row['coord']
    patient_id = row['patient_id']
    sample_id = row['sample_id']
    df = cna_df[(cna_df['patient_id'] == patient_id) & (cna_df['sample_id'] == sample_id) & (cna_df['chrom'] == chrom) & (cna_df['start'] <= coord) & (cna_df['end'] >= coord)]
    return df

def major_cn(row):
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        #print("Not found in CNA:\n", row['chrom'], row['coord'])
        return 0
    return int(coord_df['major'])

def minor_cn(row):
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        return 0
    return int(coord_df['minor'])


# Add CNAs to reads_df
cna_reads_df = reads_df.copy(deep=True)
cna_reads_df['major_cn'] =  cna_reads_df.apply(lambda row: major_cn(row), axis=1)
cna_reads_df['minor_cn'] =  cna_reads_df.apply(lambda row: minor_cn(row), axis=1)
cna_reads_df

        patient_id           sample_id chrom        start          end  major  \
0                1  small_bowel_site_1     1          1.0      71283.0     16   
1                1  small_bowel_site_1     1      73560.0      94700.0      7   
2                1  small_bowel_site_1     1     101640.0     110479.0     13   
3                1  small_bowel_site_1     1     110479.0     141357.0     22   
4                1  small_bowel_site_1     1     144162.0     247926.0     24   
...            ...                 ...   ...          ...          ...    ...   
135154          10  right_ovary_site_4     X  153000000.0  153000000.0      0   
135155          10  right_ovary_site_4     X  153000000.0  154000000.0      2   
135156          10  right_ovary_site_4     X  154000000.0  155000000.0      2   
135157          10  right_ovary_site_4     X  155000000.0  155000000.0      2   
135158          10  right_ovary_site_4     X  155000000.0  155000000.0      2   

        minor  major_sub  m


KeyboardInterrupt



In [5]:
# do some pre-processing to remove any mutations where CNA info is not available or
# the major allele CN is 0 (if the malignant cells have no copies of the region 
# overlapping the mutation, the mutation cannot exist.) or minor allele CN > major allele CN
cna_reads_df = cna_reads_df[cna_reads_df['major_cn'] != 0]
cna_reads_df = cna_reads_df[(cna_reads_df['minor_cn'] <= cna_reads_df['major_cn'])]
cna_reads_df['normal_cn'] = 2 # copy number of the locus in normal cells is 2 (no male chromosomes here)

## (4a) Take PyClone generated clusters and create csvs with ref and var counts pooled by cluster 

In [15]:
# Need a tsv for each patient with ['#sample_index', 'sample_label', 'anatomical_site_index','anatomical_site_label', 'character_index', 'character_label', 'ref', 'var']
import re
import numpy as np
from metient.util import data_extraction_util as dutil
pyclone_dir = os.path.join(data_dir, "pyclone_analysis")


cols = [ 'anatomical_site_index','anatomical_site_label',  'cluster_index', 'character_index', 'character_label', 'ref', 'var', 'var_read_prob', 'site_category']
#agg_rules = {'sample_label': lambda x: ';'.join(set(x))}

for patient_id in patient_ids:
    patient_pyclone_dir = os.path.join(pyclone_dir, f"patient_{patient_id}", "tables", f"loci.tsv")
    mut_name_to_clstr_id, clstr_id_to_name, mutation_names = dutil.get_mut_to_cluster_map_from_pyclone_output(patient_pyclone_dir, min_mut_thres=MIN_MUT_THRES)
  
    patient_subset = cna_reads_df[cna_reads_df['patient_id'] == patient_id]
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[(patient_subset['sample_id'].isin(discovery_samples)) & (patient_subset['sample_id'] != 'normal_blood')]   
    sample_names = list(patient_subset['sample_id'].unique())
    print("patient", patient_id, len(clstr_id_to_name), "clusters", len(mutation_names), "mutations", len(sample_names), "samples")
    print(sample_names)
    data = []
    
    anat_sites = list(set([re.sub(r'_[0-9]+$', '', sample) for sample in sample_names]))
    print(anat_sites)
    for midx, mut in enumerate(mutation_names):
        mut_patient_subset = patient_subset[patient_subset['character_label'] == mut]
        for sidx, sample in enumerate(sample_names):
            mut_patient_sample = mut_patient_subset[mut_patient_subset['sample_id'] == sample]
            var = mut_patient_sample['alt_counts'].values[0]
            ref = mut_patient_sample['ref_counts'].values[0]
            tumour_prop_subset = discovery_samples_df[(discovery_samples_df['patient_id']==patient_id) & (discovery_samples_df['sample_id']==sample)]
            p = tumour_prop_subset['tumour_cell_proportion'].item() 
            major_cn = mut_patient_sample['major_cn'].values[0]
            minor_cn = mut_patient_sample['minor_cn'].values[0]
            var_read_prob = dutil.calc_var_read_prob(major_cn, minor_cn, p)
            
            site_label = re.sub(r'_[0-9]+$', '', sample)
            #print(sample, site_label, anat_sites.index(site_label))
            category = 'primary' if 'ovary_site' in site_label else 'metastasis'
            data.append([anat_sites.index(site_label), site_label, mut_name_to_clstr_id[mut], midx, mut.split(":")[0], ref, var, var_read_prob, category])
            
    patient_df = pd.DataFrame(data, columns=cols)
    patient_df.to_csv(os.path.join(data_dir,"pyclone_clustered_tsvs", f"{patient_id}_SNVs.tsv"), sep="\t", index=False)

#     dutil.write_pooled_tsv_from_clusters(patient_df, mut_name_to_clstr_id, clstr_id_to_name, agg_rules, 
#                                          os.path.join(data_dir,"pyclone_clustered_tsvs"), f"patient{patient_id}", ";", ":")


patient 1 13 clusters 177 mutations 6 samples
['omentum_site_1', 'right_ovary_site_1', 'right_ovary_site_2', 'right_ovary_site_3', 'right_ovary_site_4', 'small_bowel_site_1']
['omentum_site', 'small_bowel_site', 'right_ovary_site']
patient 2 6 clusters 120 mutations 4 samples
['omentum_site_1', 'omentum_site_2', 'right_ovary_site_1', 'right_ovary_site_2']
['omentum_site', 'right_ovary_site']
patient 3 9 clusters 168 mutations 4 samples
['adnexa_site_1', 'omentum_site_1', 'right_ovary_site_1', 'right_ovary_site_2']
['omentum_site', 'adnexa_site', 'right_ovary_site']
patient 4 10 clusters 161 mutations 5 samples
['left_pelvic_sidewall_site_1', 'right_ovary_site_1', 'right_ovary_site_2', 'right_ovary_site_3', 'right_ovary_site_4']
['right_ovary_site', 'left_pelvic_sidewall_site']
patient 7 8 clusters 221 mutations 3 samples
['brain_metastasis', 'left_ovary_site_1', 'righ_pelvic_mass']
['righ_pelvic_mass', 'left_ovary_site', 'brain_metastasis']
patient 9 5 clusters 132 mutations 5 samples
