### NOTE: this was done for validation, we use the McPherson published trees/clonal presences to 

In [1]:
import sys
import pandas as pd
import os

repo_dir = os.path.join(os.getcwd(), "../../")
data_dir = os.path.join(repo_dir, 'data', 'mcpherson_ovarian_2016')

## (1) Read in data from McPherson et.al. supplemental tables

In [2]:
# First, read in WGS reads data from McPherson and reformat into 
# file structure needed for MACHINA's and PyClone's clustering
# TODO: how to handle "status" column? -> shows somatic, wildtype, germline?

# Taken directly from supplementary table 10 of McPherson et.al.
reads_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_10.csv'))
patient_ids = reads_df['patient_id'].unique()
print("patient IDs:", patient_ids)
reads_df['gene_name'] = reads_df['gene_name'].astype(str)
def label_snv(row):
    label = []
    if row['gene_name'] != "nan":
        label.append(row['gene_name'])
    label += [str(row['chrom']), str(int(row['coord']))]
    return (":").join(label)
reads_df['character_label'] = reads_df.apply(lambda row: label_snv(row), axis=1)
# Remove germline variants
reads_df = reads_df[reads_df['status'] != 'germline']
reads_df

patient IDs: [ 1  2  3  4  7  9 10]


Unnamed: 0,sample_id,patient_id,malignant,primer_id,chrom,coord,ref,alt,ref_counts,alt_counts,depth,alt_freq,background_average_alt_freq,ref_p_value,alt_p_value,status,gene_name,snpeff_impact,character_label
0,normal_blood,1,no,1_b_amplicrazy,1,17090971.0,G,A,641,0,641,0.000000,0.002034,0.0,1.000000e+00,normal_sample,,,1:17090971
1,omentum_site_1,1,yes,1_b_amplicrazy,1,17090971.0,G,A,449,0,449,0.000000,0.001597,0.0,1.000000e+00,wildtype,,,1:17090971
2,right_ovary_site_1,1,yes,1_b_amplicrazy,1,17090971.0,G,A,395,64,459,0.139434,0.002375,0.0,7.800000e-90,somatic,,,1:17090971
3,right_ovary_site_2,1,yes,1_b_amplicrazy,1,17090971.0,G,A,415,1,416,0.002404,0.002754,0.0,6.825260e-01,wildtype,,,1:17090971
4,right_ovary_site_3,1,yes,1_b_amplicrazy,1,17090971.0,G,A,609,4,613,0.006525,0.002145,0.0,4.440600e-02,wildtype,,,1:17090971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15293,normal_blood,10,no,10_b_amplicrazy,X,110000000.0,A,G,3669,10,3679,0.002718,0.002239,0.0,3.130860e-01,normal_sample,RGAG1,LOW,RGAG1:X:110000000
15294,right_ovary_site_1,10,yes,10_b_amplicrazy,X,110000000.0,A,G,2037,295,2332,0.126501,0.002306,0.0,0.000000e+00,somatic,RGAG1,LOW,RGAG1:X:110000000
15295,right_ovary_site_2,10,yes,10_b_amplicrazy,X,110000000.0,A,G,1369,4,1373,0.002913,0.002442,0.0,4.314750e-01,wildtype,RGAG1,LOW,RGAG1:X:110000000
15296,right_ovary_site_3,10,yes,10_b_amplicrazy,X,110000000.0,A,G,2675,4,2679,0.001493,0.002189,0.0,8.364490e-01,wildtype,RGAG1,LOW,RGAG1:X:110000000


In [10]:
reads_df[reads_df['patient_id']==10]['sample_id'].unique()

array(['left_fallopian_tube_site_b2', 'normal_blood', 'omentum_site_c1',
       'right_ovary_site_1', 'right_ovary_site_2', 'right_ovary_site_3',
       'right_ovary_site_4', 'right_ovary_site_a4', 'right_ovary_site_a9'],
      dtype=object)

In [3]:
# Load the table that contains which "high-quality DNA extractions (discovery samples)"
# were used in the paper's analysis
discovery_samples_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_2.csv'))
discovery_samples_df

Unnamed: 0,patient_id,sample_id,paper_id,total_reads,aligned_reads,coverage,mutation_seq_snvs,strelka_snvs,all_snvs,high_quality_snvs,validated_snvs,ploidy,tumour_cell_proportion,subclone_frequency
0,1,normal_blood,Nml,1290000000.0,1050000000.0,34.47713,,,,,,,,
1,1,omentum_site_1,Om1,988000000.0,885000000.0,28.78891,5715.0,6665.0,7281.0,3550.0,120.0,3.606594,0.463669,0.318225
2,1,right_ovary_site_1,ROv1,1260000000.0,1080000000.0,34.99681,6276.0,7902.0,8514.0,3986.0,132.0,1.880553,0.826552,0.251911
3,1,right_ovary_site_2,ROv2,1390000000.0,1190000000.0,38.78504,6004.0,7256.0,7912.0,3757.0,111.0,1.847366,0.695751,0.268036
4,1,right_ovary_site_3,ROv3,1210000000.0,1060000000.0,33.31238,5803.0,6756.0,7410.0,3612.0,118.0,1.854135,0.678331,0.265984
5,1,right_ovary_site_4,ROv4,1480000000.0,1300000000.0,36.7946,6501.0,7920.0,8664.0,4017.0,112.0,1.85722,0.702755,0.240692
6,1,small_bowel_site_1,SBwl,1240000000.0,1100000000.0,35.65017,7100.0,8323.0,9137.0,4357.0,127.0,3.611431,0.448979,0.270676
7,2,normal_blood,Nml,1400000000.0,1190000000.0,37.44915,,,,,,,,
8,2,omentum_site_1,Om1,1060000000.0,937000000.0,30.63196,1860.0,2705.0,2849.0,1228.0,106.0,3.016544,0.196412,0.347316
9,2,omentum_site_2,Om2,1190000000.0,1070000000.0,34.93282,3046.0,4036.0,4242.0,1980.0,107.0,3.014453,0.23995,0.348032


## (2) Generate inputs for PyClone clustering

### (2a) Create input for PyClone clustering

In [4]:

# get CNA info from supplement of McPherson et. al.
cna_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_7.csv'))
print(cna_df)

def subset_cna(row):
    chrom = row['chrom']
    coord = row['coord']
    patient_id = row['patient_id']
    sample_id = row['sample_id']
    df = cna_df[(cna_df['patient_id'] == patient_id) & (cna_df['sample_id'] == sample_id) & (cna_df['chrom'] == chrom) & (cna_df['start'] <= coord) & (cna_df['end'] >= coord)]
    return df

def major_cn(row):
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        #print("Not found in CNA:\n", row['chrom'], row['coord'])
        return 0
    return int(coord_df['major'])

def minor_cn(row):
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        return 0
    return int(coord_df['minor'])


# Add CNAs to reads_df
cna_reads_df = reads_df.copy(deep=True)
cna_reads_df['major_cn'] =  cna_reads_df.apply(lambda row: major_cn(row), axis=1)
cna_reads_df['minor_cn'] =  cna_reads_df.apply(lambda row: minor_cn(row), axis=1)
cna_reads_df

        patient_id           sample_id chrom        start          end  major  \
0                1  small_bowel_site_1     1          1.0      71283.0     16   
1                1  small_bowel_site_1     1      73560.0      94700.0      7   
2                1  small_bowel_site_1     1     101640.0     110479.0     13   
3                1  small_bowel_site_1     1     110479.0     141357.0     22   
4                1  small_bowel_site_1     1     144162.0     247926.0     24   
...            ...                 ...   ...          ...          ...    ...   
135154          10  right_ovary_site_4     X  153000000.0  153000000.0      0   
135155          10  right_ovary_site_4     X  153000000.0  154000000.0      2   
135156          10  right_ovary_site_4     X  154000000.0  155000000.0      2   
135157          10  right_ovary_site_4     X  155000000.0  155000000.0      2   
135158          10  right_ovary_site_4     X  155000000.0  155000000.0      2   

        minor  major_sub  m

Unnamed: 0,sample_id,patient_id,malignant,primer_id,chrom,coord,ref,alt,ref_counts,alt_counts,...,alt_freq,background_average_alt_freq,ref_p_value,alt_p_value,status,gene_name,snpeff_impact,character_label,major_cn,minor_cn
0,normal_blood,1,no,1_b_amplicrazy,1,17090971.0,G,A,641,0,...,0.000000,0.002034,0.0,1.000000e+00,normal_sample,,,1:17090971,0,0
1,omentum_site_1,1,yes,1_b_amplicrazy,1,17090971.0,G,A,449,0,...,0.000000,0.001597,0.0,1.000000e+00,wildtype,,,1:17090971,13,5
2,right_ovary_site_1,1,yes,1_b_amplicrazy,1,17090971.0,G,A,395,64,...,0.139434,0.002375,0.0,7.800000e-90,somatic,,,1:17090971,4,2
3,right_ovary_site_2,1,yes,1_b_amplicrazy,1,17090971.0,G,A,415,1,...,0.002404,0.002754,0.0,6.825260e-01,wildtype,,,1:17090971,6,2
4,right_ovary_site_3,1,yes,1_b_amplicrazy,1,17090971.0,G,A,609,4,...,0.006525,0.002145,0.0,4.440600e-02,wildtype,,,1:17090971,6,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15293,normal_blood,10,no,10_b_amplicrazy,X,110000000.0,A,G,3669,10,...,0.002718,0.002239,0.0,3.130860e-01,normal_sample,RGAG1,LOW,RGAG1:X:110000000,0,0
15294,right_ovary_site_1,10,yes,10_b_amplicrazy,X,110000000.0,A,G,2037,295,...,0.126501,0.002306,0.0,0.000000e+00,somatic,RGAG1,LOW,RGAG1:X:110000000,1,0
15295,right_ovary_site_2,10,yes,10_b_amplicrazy,X,110000000.0,A,G,1369,4,...,0.002913,0.002442,0.0,4.314750e-01,wildtype,RGAG1,LOW,RGAG1:X:110000000,1,0
15296,right_ovary_site_3,10,yes,10_b_amplicrazy,X,110000000.0,A,G,2675,4,...,0.001493,0.002189,0.0,8.364490e-01,wildtype,RGAG1,LOW,RGAG1:X:110000000,1,0


In [5]:
# do some pre-processing to remove any mutations where CNA info is not available or
# the major allele CN is 0 (if the malignant cells have no copies of the region 
# overlapping the mutation, the mutation cannot exist.) or minor allele CN > major allele CN
cna_reads_df = cna_reads_df[cna_reads_df['major_cn'] != 0]
cna_reads_df = cna_reads_df[(cna_reads_df['minor_cn'] <= cna_reads_df['major_cn'])]
cna_reads_df['normal_cn'] = 2 # copy number of the locus in normal cells is 2 (no male chromosomes here)

In [6]:
pyclone_dir = os.path.join(data_dir, "pyclone_analysis")

cols = ['sample_id', 'ref_counts', 'alt_counts', 'character_label', "chrom", "coord", "major_cn", "minor_cn", "normal_cn"]
# put df in format used in PyClone, with columns:
#'mutation_id', 'ref_counts','var_counts', 'normal_cn', 'minor_cn', 'major_cn'
for patient_id in patient_ids:
    patient_dir = os.path.join(pyclone_dir, f"patient_{patient_id}")
    if not os.path.exists(patient_dir): 
        os.makedirs(patient_dir)
    patient_subset = cna_reads_df[cna_reads_df['patient_id'] == patient_id][cols]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[patient_subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    patient_subset = patient_subset[patient_subset['sample_id'] != 'normal_blood'] 
    sample_ids = list(patient_subset['sample_id'].unique())
    patient_subset = patient_subset.rename(columns={"ref_counts": "ref_counts", "alt_counts": "var_counts", "character_label": "mutation_id"})
    
    for sample_id in sample_ids:
        sample_subset = patient_subset[patient_subset['sample_id'] == sample_id]
        sample_subset = sample_subset[['mutation_id', 'ref_counts', 'var_counts', 'normal_cn', 'minor_cn', 'major_cn']]
        sample_subset.to_csv(os.path.join(patient_dir,  f"patient{patient_id}_{sample_id}.tsv"), index=False, sep="\t")    


In [7]:
# Setup PyClone commands to run for each patient

for patient_id in patient_ids:
    cmd = [f"bsub -n 8 -W 20:00 -R rusage[mem=8] -o output_{patient_id}.log -e error_{patient_id}.log", "PyClone run_analysis_pipeline", "--in_files"]
    patient_dir = os.path.join(pyclone_dir, f"patient_{patient_id}")
    discovery_samples = list(discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id'])
    discovery_samples.remove("normal_blood")
    # Add tsvs for each sample
    for sample_id in discovery_samples:
        cmd.append(os.path.join(patient_dir,  f"patient{patient_id}_{sample_id}.tsv"))
    cmd += ["--working_dir", patient_dir, "--tumour_contents"]
    # Add tumour cell proportions for each sample
    for sample_id in discovery_samples:
        tumour_prop_subset = discovery_samples_df[(discovery_samples_df['patient_id']==patient_id) & (discovery_samples_df['sample_id']==sample_id)]
        tumour_prop = tumour_prop_subset['tumour_cell_proportion']                                                                                            
        cmd.append(str(tumour_prop.values[0]))
    # Add sample names
    cmd.append("--samples")
    for sample_id in discovery_samples:
        cmd.append(sample_id)
    cmd += ["--num_iters", "100000", "--burnin", "50000"]
    print(" ".join(cmd))
    

bsub -n 8 -W 20:00 -R rusage[mem=8] -o output_1.log -e error_1.log PyClone run_analysis_pipeline --in_files /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/../data/mcpherson_ovarian_2016/pyclone_analysis/patient_1/patient1_omentum_site_1.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/../data/mcpherson_ovarian_2016/pyclone_analysis/patient_1/patient1_right_ovary_site_1.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/../data/mcpherson_ovarian_2016/pyclone_analysis/patient_1/patient1_right_ovary_site_2.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/../data/mcpherson_ovarian_2016/pyclone_analysis/patient_1/patient1_right_ovary_site_3.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/../data/mcpherson_ovarian_2016/pyclone_analysis/patient_1/patient1_right_ovary_site_4.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/../data/mcpherson_ovarian_2016/p

## (3) Generate inputs for tree inference algorithms (PairTree/Orchard)

## (3a) Create input for PairTree/Orchard (using PyClone clustering)

In [8]:
import json
from metient.util import data_extraction_util as dutil

MIN_MUT_THRES = 5
pairtree_dir = os.path.join(data_dir, "orchard_trees")
header = ["id", "name", "var_reads", "total_reads", "var_read_prob"]

# id, name, var_reads, total_reads, var_read_prob
for patient_id in patient_ids:
    # Use the CNA reads df since this has filtered out the variants without CN information
    # and was used as input to PyClone, which our clusters are made from
    patient_subset = cna_reads_df[cna_reads_df['patient_id'] == patient_id]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[patient_subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    patient_subset = patient_subset[patient_subset['sample_id'] != 'normal_blood'] 
    
    patient_pyclone_dir = os.path.join(pyclone_dir, f"patient_{patient_id}", "tables", f"loci.tsv")
    mut_name_to_clstr_id, clstr_id_to_name, mutation_names = dutil.get_mut_to_cluster_map_from_pyclone_output(patient_pyclone_dir, min_mut_thres=MIN_MUT_THRES)
    cluster_id_to_mut_names = {}
    for mut in mut_name_to_clstr_id:
        cid = mut_name_to_clstr_id[mut]
        if cid not in cluster_id_to_mut_names:
            cluster_id_to_mut_names[cid]= []
        cluster_id_to_mut_names[cid].append(mut)
   
    sample_names = list(patient_subset['sample_id'].unique())
    print(len(cluster_id_to_mut_names), "clusters", len(mutation_names), "mutations")
    #print(mutation_names)
    mut_name_to_mut_id = {}
    with open(os.path.join(pairtree_dir, f"patient{patient_id}.ssm"), "w") as f:
        
        f.write("\t".join(header))
        f.write("\n")
        for i, mut in enumerate(mutation_names):
            mut_name_to_mut_id[mut] = f"m{i}"
            row = [f"m{i}", mut]
            mut_patient_subset = patient_subset[patient_subset['character_label'] == mut]
            #print(mut_patient_subset)
            var_reads = []
            total_reads = []
            var_read_probs = []
            for sample in sample_names:
                #print(patient_id, mut, sample)
                mut_patient_sample = mut_patient_subset[mut_patient_subset['sample_id'] == sample]
                #print(mut_patient_sample)
                var = mut_patient_sample['alt_counts'].values[0]
                ref = mut_patient_sample['ref_counts'].values[0]
                var_reads.append(str(var))
                total_reads.append(str(var+ref))
                tumour_prop_subset = discovery_samples_df[(discovery_samples_df['patient_id']==patient_id) & (discovery_samples_df['sample_id']==sample)]
                p = tumour_prop_subset['tumour_cell_proportion'].item() 
                major_cn = mut_patient_sample['major_cn'].values[0]
                minor_cn = mut_patient_sample['minor_cn'].values[0]
                var_read_prob = dutil.calc_var_read_prob(major_cn, minor_cn, p)
                var_read_probs.append(str(var_read_prob))

            row += [",".join(var_reads), ",".join(total_reads), ",".join(var_read_probs)]
            f.write("\t".join(row))
            f.write("\n")
    json_data = {"samples": sample_names, "clusters": [], "garbage": []}
    for x in range(0,len(cluster_id_to_mut_names)):
        json_data["clusters"].append([mut_name_to_mut_id[t] for t in cluster_id_to_mut_names[x]])
    
    with open(os.path.join(pairtree_dir, f"patient{patient_id}.params.json"), 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
    


  return torch._C._cuda_getDeviceCount() > 0


CUDA GPU: False
13 clusters 177 mutations
6 clusters 120 mutations
9 clusters 168 mutations
10 clusters 161 mutations
8 clusters 221 mutations
5 clusters 132 mutations
3 clusters 189 mutations


## (4) Prepare data for migration history inference

## (4a) Take PyClone generated clusters and create csvs with ref and var counts pooled by cluster 

In [15]:
# Need a tsv for each patient with ['#sample_index', 'sample_label', 'anatomical_site_index','anatomical_site_label', 'character_index', 'character_label', 'ref', 'var']
import re
import numpy as np
from metient.util import data_extraction_util as dutil
pyclone_dir = os.path.join(data_dir, "pyclone_analysis")


cols = [ 'anatomical_site_index','anatomical_site_label',  'cluster_index', 'character_index', 'character_label', 'ref', 'var', 'var_read_prob', 'site_category']
#agg_rules = {'sample_label': lambda x: ';'.join(set(x))}

for patient_id in patient_ids:
    patient_pyclone_dir = os.path.join(pyclone_dir, f"patient_{patient_id}", "tables", f"loci.tsv")
    mut_name_to_clstr_id, clstr_id_to_name, mutation_names = dutil.get_mut_to_cluster_map_from_pyclone_output(patient_pyclone_dir, min_mut_thres=MIN_MUT_THRES)
  
    patient_subset = cna_reads_df[cna_reads_df['patient_id'] == patient_id]
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[(patient_subset['sample_id'].isin(discovery_samples)) & (patient_subset['sample_id'] != 'normal_blood')]   
    sample_names = list(patient_subset['sample_id'].unique())
    print("patient", patient_id, len(clstr_id_to_name), "clusters", len(mutation_names), "mutations", len(sample_names), "samples")
    print(sample_names)
    data = []
    
    anat_sites = list(set([re.sub(r'_[0-9]+$', '', sample) for sample in sample_names]))
    print(anat_sites)
    for midx, mut in enumerate(mutation_names):
        mut_patient_subset = patient_subset[patient_subset['character_label'] == mut]
        for sidx, sample in enumerate(sample_names):
            mut_patient_sample = mut_patient_subset[mut_patient_subset['sample_id'] == sample]
            var = mut_patient_sample['alt_counts'].values[0]
            ref = mut_patient_sample['ref_counts'].values[0]
            tumour_prop_subset = discovery_samples_df[(discovery_samples_df['patient_id']==patient_id) & (discovery_samples_df['sample_id']==sample)]
            p = tumour_prop_subset['tumour_cell_proportion'].item() 
            major_cn = mut_patient_sample['major_cn'].values[0]
            minor_cn = mut_patient_sample['minor_cn'].values[0]
            var_read_prob = dutil.calc_var_read_prob(major_cn, minor_cn, p)
            
            site_label = re.sub(r'_[0-9]+$', '', sample)
            #print(sample, site_label, anat_sites.index(site_label))
            category = 'primary' if 'ovary_site' in site_label else 'metastasis'
            data.append([anat_sites.index(site_label), site_label, mut_name_to_clstr_id[mut], midx, mut.split(":")[0], ref, var, var_read_prob, category])
            
    patient_df = pd.DataFrame(data, columns=cols)
    patient_df.to_csv(os.path.join(data_dir,"pyclone_clustered_tsvs", f"{patient_id}_SNVs.tsv"), sep="\t", index=False)

#     dutil.write_pooled_tsv_from_clusters(patient_df, mut_name_to_clstr_id, clstr_id_to_name, agg_rules, 
#                                          os.path.join(data_dir,"pyclone_clustered_tsvs"), f"patient{patient_id}", ";", ":")


patient 1 13 clusters 177 mutations 6 samples
['omentum_site_1', 'right_ovary_site_1', 'right_ovary_site_2', 'right_ovary_site_3', 'right_ovary_site_4', 'small_bowel_site_1']
['omentum_site', 'small_bowel_site', 'right_ovary_site']
patient 2 6 clusters 120 mutations 4 samples
['omentum_site_1', 'omentum_site_2', 'right_ovary_site_1', 'right_ovary_site_2']
['omentum_site', 'right_ovary_site']
patient 3 9 clusters 168 mutations 4 samples
['adnexa_site_1', 'omentum_site_1', 'right_ovary_site_1', 'right_ovary_site_2']
['omentum_site', 'adnexa_site', 'right_ovary_site']
patient 4 10 clusters 161 mutations 5 samples
['left_pelvic_sidewall_site_1', 'right_ovary_site_1', 'right_ovary_site_2', 'right_ovary_site_3', 'right_ovary_site_4']
['right_ovary_site', 'left_pelvic_sidewall_site']
patient 7 8 clusters 221 mutations 3 samples
['brain_metastasis', 'left_ovary_site_1', 'righ_pelvic_mass']
['righ_pelvic_mass', 'left_ovary_site', 'brain_metastasis']
patient 9 5 clusters 132 mutations 5 samples
