In [None]:
import sys
import pandas as pd
import os
from metient.util import data_extraction_util as dutil
from metient.util import vertex_labeling_util as vutil

repo_dir = os.path.join(os.getcwd(), "../")
data_dir = os.path.join(repo_dir, 'data', 'mcpherson_ovarian_2016')

### (1) Read in data from McPherson et.al. supplemental tables

In [9]:
sample_df = pd.read_csv(os.path.join(data_dir, 'supplemental_table_1.csv'))
sample_df = sample_df[sample_df['sample_id']!='normal_blood']
sample_df['anatomical_site_label'] = sample_df.apply(lambda row: ''.join([i for i in row['anatomy'] if not i.isdigit()]).replace("Site", "").strip(), axis=1)
patient_ids = sample_df['patient_id'].unique()
print(patient_ids)
sample_df


[ 1  2  3  4  7  9 10]


Unnamed: 0,patient_id,sample_id,paper_id,malignant,discovery_sample,tissue_source,anatomy,anatomical_site_label
1,1,appendix_site_c1,ApC1,yes,no,ffpe,Appendix,Appendix
2,1,left_fallopian_tube_site_b4,LFTB4,yes,no,ffpe,Left Fallopian Tube,Left Fallopian Tube
3,1,left_ovary_site_b2,LOvB2,yes,no,ffpe,Left Ovary,Left Ovary
4,1,right_fallopian_tube_site_a16,RFTA16,yes,no,ffpe,Right Fallopian Tube,Right Fallopian Tube
5,1,right_ovary_site_a4,ROvA4,yes,no,ffpe,Right Ovary,Right Ovary
...,...,...,...,...,...,...,...,...
70,10,right_ovary_site_a9,ROvA9,yes,no,ffpe,Right Ovary,Right Ovary
71,10,right_ovary_site_1,ROv1,yes,yes,fresh_frozen,Right Ovary Site 1,Right Ovary
72,10,right_ovary_site_2,ROv2,yes,yes,fresh_frozen,Right Ovary Site 2,Right Ovary
73,10,right_ovary_site_3,ROv3,yes,yes,fresh_frozen,Right Ovary Site 3,Right Ovary


In [4]:
pyclone_to_clone_id_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_8.csv'))
pyclone_to_clone_id_df

Unnamed: 0,patient_id,clone_id,pyclone_cluster_id,present
0,1,A,2,1
1,1,A,3,0
2,1,A,4,0
3,1,A,5,1
4,1,A,6,1
...,...,...,...,...
372,10,F,3,1
373,10,F,7,1
374,10,F,11,1
375,10,F,12,0


In [6]:
pyclone_cluster_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_16.csv'))
pyclone_cluster_df['mut_label'] = pyclone_cluster_df.apply(lambda row: f"{row['chrom']},{row['coord']},{row['ref']},{row['alt']}", axis=1)
pyclone_cluster_df
                                                           

Unnamed: 0,patient_id,sample_id,chrom,coord,ref,alt,primer_set,cluster_id,mean,std,ci_length,ml_loss,ml_origin,ml_presence,deletion,mut_label
0,1,small_bowel_site_1,10,116076901,C,T,1_b_1,23,0.011,0.007,0.021,0,0,1,0,"10,116076901,C,T"
1,1,small_bowel_site_1,10,121191039,G,A,amplicrazy,23,0.011,0.007,0.021,0,0,0,0,"10,121191039,G,A"
2,1,small_bowel_site_1,15,101599691,T,C,1_b_1,23,0.011,0.007,0.021,0,1,1,0,"15,101599691,T,C"
3,1,small_bowel_site_1,2,3743333,T,C,amplicrazy,23,0.011,0.007,0.021,0,0,0,0,"2,3743333,T,C"
4,1,small_bowel_site_1,4,3211580,C,G,amplicrazy,23,0.011,0.007,0.021,0,0,0,0,"4,3211580,C,G"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,10,right_ovary_site_4,6,53142417,T,C,10_b_1,13,0.409,0.114,0.386,0,1,1,0,"6,53142417,T,C"
6496,10,right_ovary_site_1,3,142681930,C,T,amplicrazy,8,0.815,0.231,0.654,0,0,1,0,"3,142681930,C,T"
6497,10,right_ovary_site_3,3,142681930,C,T,amplicrazy,8,0.686,0.355,0.986,0,0,1,0,"3,142681930,C,T"
6498,10,right_ovary_site_2,3,142681930,C,T,amplicrazy,8,0.651,0.371,0.967,0,0,1,0,"3,142681930,C,T"


In [11]:
prevalences_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_9.csv'))
prevalences_df

Unnamed: 0,patient_id,paper_id,clone_id,prevalence
0,1,ApC1,A,0.055
1,1,ApC1,B,0.936
2,1,ApC1,C,0.003
3,1,ApC1,D,0.002
4,1,ApC1,E,0.001
...,...,...,...,...
404,10,ROvA9,B,0.241
405,10,ROvA9,C,0.246
406,10,ROvA9,D,0.009
407,10,ROvA9,E,0.229


From McPherson et.al.:
Presence of each clone in each site was calculated as follows. First, samples were grouped into more broad
anatomic locations, for instance ROv1, 2 etc. were grouped as the ROv site. A clone was said to be present in a
sample if it was assigned a clonal prevalence greater than 0.01. Parent clones are often predicted to coexist with child
clones at small proportions, though based on single cell data, such a scenario is likely an artifact. Parent and child
clones are usually distinguished by an absence of mutations in the parent. If the mutations that identify the child
clone are estimated to have slightly lower prevalence than more ancestral mutations, that slight deviation will results
in prediction of a minor population of the parent clone in the sample. On the other hand, a sample composed of 95%
parent and 5% child is more likely to represent a true mixture. To account for these issues, we use an additional rule
for determining whether a parent clone co-exists with its child in a sample. A predicted cellular prevalence of non-leaf
clone X must be at least 50% of the combined cellular prevalence of X and its descendants to be considered present in
the sample. A clone is said to be present in a site if the clone, by the above rules, is present in any of the samples of
that site.

In [84]:
cluster_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']

def get_combined_prevalence(pid, clone_ids, sample):
    '''
    sum of cellular prevalences of all clone_ids in sample
    '''
    total_prevalence = 0.0
    for clone_id in clone_ids:
        subset_df = prevalences_df[(prevalences_df['patient_id']==pid)&(prevalences_df['clone_id']==clone_id)&(prevalences_df['paper_id']==sample)]
        if len(subset_df) > 0:
            prevalence = subset_df['prevalence'].item()
            total_prevalence += float(prevalence)
    return total_prevalence

def is_present_in_site(tree, path_matrix, pid, clone_id, site):
    '''
    using McPherson definition from above
    '''
    pid_clone_df = prevalences_df[(prevalences_df['patient_id']==pid)&(prevalences_df['clone_id']==clone_id)]
    site_samples = sample_df[sample_df['anatomical_site_label']==site]['paper_id'].unique()
    
    descendants = [cluster_labels[i[0].item()] for i in path_matrix[cluster_labels.index(clone_id)].nonzero()]
    present_in_site = False
    for sample in site_samples:
        clone_prevalence = get_combined_prevalence(pid, [clone_id], sample)
        descendants_prevalence = get_combined_prevalence(pid, [clone_id]+descendants, sample)
        print(clone_id, descendants, sample, clone_prevalence, descendants_prevalence)
        if clone_prevalence > 0.01 and clone_prevalence > 0.5*descendants_prevalence:
            present_in_site = True
    return present_in_site

def get_parent(tree, node_index):
    # Look for a '1' in the column corresponding to the node_index
    parent_index = (tree[:, node_index] == 1).nonzero(as_tuple=True)[0]
    return cluster_labels[parent_index[0].item()]

def get_pyclone_clusters(pid, clone_id):
    return set(pyclone_to_clone_id_df[(pyclone_to_clone_id_df['clone_id']==clone_id)&(pyclone_to_clone_id_df['patient_id']==pid)&(pyclone_to_clone_id_df['present']==1)]['pyclone_cluster_id'])

def num_muts_in_clusters(pid, pyclone_clusters):
    muts = set(pyclone_cluster_df[(pyclone_cluster_df['cluster_id'].isin(pyclone_clusters))&(pyclone_cluster_df['patient_id']==pid)]['mut_label'])
    return len(muts)
                                  
def get_num_mutations(tree, pid, clone_id):
    pyclone_clusters = get_pyclone_clusters(pid, clone_id)
    if clone_id == 'A': # root node
        return num_muts_in_clusters(pid, pyclone_clusters)
    # get the clusters which are either gained or lost between child and parent
    parent = get_parent(tree, cluster_labels.index(clone_id))
    parent_pyclone_clusters = get_pyclone_clusters(pid, parent)
    gained_or_lost_clusters = parent_pyclone_clusters ^ pyclone_clusters
    return num_muts_in_clusters(pid, gained_or_lost_clusters)

cols = ['anatomical_site_index', 'anatomical_site_label', 'cluster_index', 'cluster_label', 'present', 'site_category', 'num_mutations']
for pid in patient_ids[:1]:
    tree = dutil.get_adjacency_matrix_from_txt_edge_list(os.path.join(data_dir, f"patient{pid}_tree.txt"))
    path_matrix = vutil.get_path_matrix(tree, remove_self_loops=True)
    clones = prevalences_df[prevalences_df['patient_id']==pid]['clone_id'].unique()
    unique_sites = sample_df[sample_df['patient_id']==pid]['anatomical_site_label'].unique()
    # First find which sites clones are actually detected in 
    observed_sites = set()
    for clone in clones:
        for site in unique_sites:
            present_in_site = is_present_in_site(tree, path_matrix, pid, clone, site)
            if present_in_site:
                observed_sites.add(site)
    
    observed_sites = sorted(list(observed_sites), key=lambda x: ('Ovary' not in x and 'Uterosacral' not in x, x))
    data = []         
    for clone in clones:
        for site in observed_sites:
            present_in_site = is_present_in_site(tree, path_matrix, pid, clone, site)
            num_muts = get_num_mutations(tree, pid, clone)
            site_category = 'primary' if 'Ovary' in site or 'Uterosacral' in site else 'metastasis'
            data.append([observed_sites.index(site), site, cluster_labels.index(clone), clone, 1 if present_in_site else 0, site_category, num_muts])
    print(len(observed_sites), observed_sites, )
    # Build a dataframe for each patient w/ the information in cols
    df = pd.DataFrame(data, columns=cols)
    df.to_csv(os.path.join(data_dir, f"patient{pid}_SNVs.tsv"), sep="\t", index=False)
    #print(df.head())
    

A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] ApC1 0.054538437 1.0000000000000002
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LFTB4 0.542667605 1.0000000000000002
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LFTB2 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvB2 0.300349609 1.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvD3 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvC5 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvA10 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvA4 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOv1 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOv2 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] RFTA16 0.986180615 1.000000001
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] RFTC10 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] RFTA2 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] ROvA4 0.001921998 1.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] ROv1 0.003611262 1.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] ROv2 0.00150505 0

I [] ROv3 0.966088951 0.966088951
I [] ROv4 0.002901882 0.002901882
I [] ROvC2 0.0 0.0
I [] ROvC4 0.0 0.0
I [] ROvA7 0.0 0.0
I [] ROvA5 0.0 0.0
I [] ROvC5 0.0 0.0
I [] ROvC6 0.0 0.0
I [] ROvA9 0.0 0.0
I [] SBwlE4 0.000483684 0.000483684
I [] SBwl 0.000520877 0.000520877
I [] Om1 0.000810635 0.000810635
I [] OmA2 0.0 0.0
I [] OmB1 0.0 0.0
I [] Om2 0.0 0.0
I [] OmF2 0.0 0.0
I [] OmC1 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvB2 0.300349609 1.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvD3 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvC5 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvA10 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOvA4 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOv1 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] LOv2 0.0 0.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] ROvA4 0.001921998 1.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] ROv1 0.003611262 1.0
A ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'] ROv2 0.00150505 0.99999999999

I [] LOvB2 0.000792996 0.000792996
I [] LOvD3 0.0 0.0
I [] LOvC5 0.0 0.0
I [] LOvA10 0.0 0.0
I [] LOvA4 0.0 0.0
I [] LOv1 0.0 0.0
I [] LOv2 0.0 0.0
I [] ROvA4 0.97751603 0.97751603
I [] ROv1 0.003278466 0.003278466
I [] ROv2 0.002368866 0.002368866
I [] ROv3 0.966088951 0.966088951
I [] ROv4 0.002901882 0.002901882
I [] ROvC2 0.0 0.0
I [] ROvC4 0.0 0.0
I [] ROvA7 0.0 0.0
I [] ROvA5 0.0 0.0
I [] ROvC5 0.0 0.0
I [] ROvC6 0.0 0.0
I [] ROvA9 0.0 0.0
I [] ApC1 0.000640804 0.000640804
I [] LFTB4 0.000636686 0.000636686
I [] LFTB2 0.0 0.0
I [] Om1 0.000810635 0.000810635
I [] OmA2 0.0 0.0
I [] OmB1 0.0 0.0
I [] Om2 0.0 0.0
I [] OmF2 0.0 0.0
I [] OmC1 0.0 0.0
I [] RFTA16 0.000849746 0.000849746
I [] RFTC10 0.0 0.0
I [] RFTA2 0.0 0.0
I [] SBwlE4 0.000483684 0.000483684
I [] SBwl 0.000520877 0.000520877
7 ['Left Ovary', 'Right Ovary', 'Appendix', 'Left Fallopian Tube', 'Omentum', 'Right Fallopian Tube', 'Small Bowel']
