In [1]:
import sys
import pandas as pd
import os

# Root of the local checkout of the repository. The original author's path is
# kept as the default so existing behaviour is unchanged; set the
# MET_HISTORY_REPO_DIR environment variable to run the notebook elsewhere.
repo_dir = os.environ.get(
    "MET_HISTORY_REPO_DIR",
    "/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/",
)

# Directory holding the McPherson et al. (2016) supplementary tables read below.
data_dir = os.path.join(repo_dir, 'src/data/mcpherson_ovarian_2016')

In [2]:
# (1) Read the WGS read counts from McPherson et al. and reformat them into the
# file structure needed for MACHINA's and PyClone's clustering.
# TODO: how to handle "status" column? -> shows somatic, wildtype, germline?

# Taken directly from supplementary table 10 of McPherson et al.
reads_df = pd.read_csv(os.path.join(data_dir, "supplement_table_10.csv"))

# Cast gene names to str so that missing names become the literal string "nan",
# which is the sentinel label_snv checks for.
reads_df["gene_name"] = reads_df["gene_name"].astype(str)

patient_ids = reads_df["patient_id"].unique()
print("patient IDs:", patient_ids)
def label_snv(row):
    """Build the character label for one SNV row.

    The label is "<gene_name>:<chrom>:<coord>", or just "<chrom>:<coord>" when
    the gene name is the string "nan" (i.e. it was missing before the column
    was cast to str).
    """
    gene = row['gene_name']
    locus = [str(row['chrom']), str(row['coord'])]
    parts = locus if gene == "nan" else [gene] + locus
    return ":".join(parts)
# Label every row with its SNV identifier (row-wise apply of label_snv).
reads_df["character_label"] = reads_df.apply(label_snv, axis=1)
reads_df

patient IDs: [ 1  2  3  4  7  9 10]


Unnamed: 0,sample_id,patient_id,malignant,primer_id,chrom,coord,ref,alt,ref_counts,alt_counts,depth,alt_freq,background_average_alt_freq,ref_p_value,alt_p_value,status,gene_name,snpeff_impact,character_label
0,normal_blood,1,no,1_b_amplicrazy,1,17090971,G,A,641,0,641,0.000000,0.002034,0.0,1.000000e+00,normal_sample,,,1:17090971
1,omentum_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,449,0,449,0.000000,0.001597,0.0,1.000000e+00,wildtype,,,1:17090971
2,right_ovary_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,395,64,459,0.139434,0.002375,0.0,7.795800e-90,somatic,,,1:17090971
3,right_ovary_site_2,1,yes,1_b_amplicrazy,1,17090971,G,A,415,1,416,0.002404,0.002754,0.0,6.825264e-01,wildtype,,,1:17090971
4,right_ovary_site_3,1,yes,1_b_amplicrazy,1,17090971,G,A,609,4,613,0.006525,0.002145,0.0,4.440635e-02,wildtype,,,1:17090971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15293,normal_blood,10,no,10_b_amplicrazy,X,109694487,A,G,3669,10,3679,0.002718,0.002239,0.0,3.130859e-01,normal_sample,RGAG1,LOW,RGAG1:X:109694487
15294,right_ovary_site_1,10,yes,10_b_amplicrazy,X,109694487,A,G,2037,295,2332,0.126501,0.002306,0.0,0.000000e+00,somatic,RGAG1,LOW,RGAG1:X:109694487
15295,right_ovary_site_2,10,yes,10_b_amplicrazy,X,109694487,A,G,1369,4,1373,0.002913,0.002442,0.0,4.314746e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487
15296,right_ovary_site_3,10,yes,10_b_amplicrazy,X,109694487,A,G,2675,4,2679,0.001493,0.002189,0.0,8.364492e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487


In [3]:
# Supplementary table 2 lists the "high-quality DNA extractions (discovery
# samples)" that were used in the paper's analysis; later cells restrict the
# reads to these samples.
discovery_samples_df = pd.read_csv(
    os.path.join(data_dir, "supplement_table_2.csv")
)
discovery_samples_df

Unnamed: 0,patient_id,sample_id,paper_id,total_reads,aligned_reads,coverage,mutation_seq_snvs,strelka_snvs,all_snvs,high_quality_snvs,validated_snvs,ploidy,tumour_cell_proportion,subclone_frequency
0,1,normal_blood,Nml,1288936606,1045839192,34.477126,,,,,,,,
1,1,omentum_site_1,Om1,987945944,884565254,28.788908,5715.0,6665.0,7281.0,3550.0,120.0,3.606594,0.463669,0.318225
2,1,right_ovary_site_1,ROv1,1261617726,1076429948,34.996814,6276.0,7902.0,8514.0,3986.0,132.0,1.880553,0.826552,0.251911
3,1,right_ovary_site_2,ROv2,1389016408,1194767689,38.785042,6004.0,7256.0,7912.0,3757.0,111.0,1.847366,0.695751,0.268036
4,1,right_ovary_site_3,ROv3,1205431286,1056428422,33.312377,5803.0,6756.0,7410.0,3612.0,118.0,1.854135,0.678331,0.265984
5,1,right_ovary_site_4,ROv4,1480564778,1300349864,36.794602,6501.0,7920.0,8664.0,4017.0,112.0,1.85722,0.702755,0.240692
6,1,small_bowel_site_1,SBwl,1242912402,1096324079,35.650173,7100.0,8323.0,9137.0,4357.0,127.0,3.611431,0.448979,0.270676
7,2,normal_blood,Nml,1395804938,1194671812,37.449153,,,,,,,,
8,2,omentum_site_1,Om1,1058316688,936624472,30.631961,1860.0,2705.0,2849.0,1228.0,106.0,3.016544,0.196412,0.347316
9,2,omentum_site_2,Om2,1191965188,1070042684,34.932824,3046.0,4036.0,4242.0,1980.0,107.0,3.014453,0.23995,0.348032


In [4]:
# Sanity check: discovery sample ids recorded for patient 1.
discovery_samples_df.loc[discovery_samples_df['patient_id'] == 1, 'sample_id']

0          normal_blood
1        omentum_site_1
2    right_ovary_site_1
3    right_ovary_site_2
4    right_ovary_site_3
5    right_ovary_site_4
6    small_bowel_site_1
Name: sample_id, dtype: object

In [5]:
# (2) Prepare data for clustering algorithms (MACHINA, PyClone)
# (2a) prep data for MACHINA clustering
cols = ['sample_id', 'ref_counts', 'alt_counts', 'character_label']

# put df in format used in MACHINA, with columns:
#'#sample_index', 'sample_label','anatomical_site_index', 'anatomical_site_label', 'character_index', 'character_label', 'ref', 'var'
for patient_id in patient_ids:
    subset = reads_df[reads_df['patient_id'] == patient_id][cols]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    subset = subset[subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    subset = subset[subset['sample_id'] != 'normal_blood']
    subset = subset.rename(columns={"ref_counts": "ref", "alt_counts": "var", "sample_id": "sample_label"})

    # Indices are assigned in order of first appearance. A dict lookup gives
    # the same numbering as list.index() without rescanning the list for
    # every row, and also works when the subset is empty.
    char_labels = list(subset['character_label'].unique())
    sites = list(subset['sample_label'].unique())
    char_index = {label: idx for idx, label in enumerate(char_labels)}
    site_index = {label: idx for idx, label in enumerate(sites)}
    subset['character_index'] = subset['character_label'].map(char_index)
    subset['#sample_index'] = subset['sample_label'].map(site_index)

    # Each sample is treated as its own anatomical site.
    subset['anatomical_site_index'] = subset['#sample_index']
    subset['anatomical_site_label'] = subset['sample_label']
    subset = subset[['#sample_index', 'sample_label','anatomical_site_index', 'anatomical_site_label', 'character_index', 'character_label', 'ref', 'var']]
    subset.to_csv(os.path.join(data_dir,  f"reads_patient{patient_id}.csv"), index=False)


In [6]:
# (2b) prep data for PyClone clustering
# Copy-number (CNA) segments come from supplementary table 7 of McPherson et al.
cna_df = pd.read_csv(
    os.path.join(data_dir, "supplement_table_7.csv")
)
print(cna_df)

def subset_cna(row):
    """Return the rows of cna_df whose segment covers the SNV described by `row`.

    A segment matches when it has the same patient_id, sample_id and chrom as
    `row` and its [start, end] interval (inclusive on both ends) contains
    row['coord']. The result may hold zero, one or several rows.
    """
    position = row['coord']
    same_sample = (cna_df['patient_id'] == row['patient_id']) & (cna_df['sample_id'] == row['sample_id'])
    same_chrom = cna_df['chrom'] == row['chrom']
    covers_position = (cna_df['start'] <= position) & (cna_df['end'] >= position)
    return cna_df[same_sample & same_chrom & covers_position]

def major_cn(row):
    """Return the major-allele copy number of the CNA segment covering `row`.

    Returns 0 when the SNV is not covered by exactly one segment (no match or
    an ambiguous match). Callers therefore cannot distinguish "no CNA info"
    from a genuine major copy number of 0; both are filtered out downstream.
    """
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        #print("Not found in CNA:\n", row['chrom'], row['coord'])
        return 0
    # Take the scalar explicitly: int(Series) on a one-element Series is
    # deprecated in recent pandas and slated to raise TypeError.
    return int(coord_df['major'].iloc[0])

def minor_cn(row):
    """Return the minor-allele copy number of the CNA segment covering `row`.

    Returns 0 when the SNV is not covered by exactly one segment (no match or
    an ambiguous match).
    """
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        return 0
    # Take the scalar explicitly: int(Series) on a one-element Series is
    # deprecated in recent pandas and slated to raise TypeError.
    return int(coord_df['minor'].iloc[0])


# Annotate a copy of reads_df with the major/minor copy number of the CNA
# segment covering each SNV (0 when no unique segment is found).
cna_reads_df = reads_df.copy(deep=True)
cna_reads_df['major_cn'] = cna_reads_df.apply(major_cn, axis=1)
cna_reads_df['minor_cn'] = cna_reads_df.apply(minor_cn, axis=1)
cna_reads_df

        patient_id           sample_id chrom      start        end  major  \
0                1  small_bowel_site_1     1          1      71283     16   
1                1  small_bowel_site_1     1      73560      94700      7   
2                1  small_bowel_site_1     1     101640     110479     13   
3                1  small_bowel_site_1     1     110479     141357     22   
4                1  small_bowel_site_1     1     144162     247926     24   
...            ...                 ...   ...        ...        ...    ...   
135154          10  right_ovary_site_4     X  152951113  152955404      0   
135155          10  right_ovary_site_4     X  152955404  154151513      2   
135156          10  right_ovary_site_4     X  154151513  154887231      2   
135157          10  right_ovary_site_4     X  154887231  154915957      2   
135158          10  right_ovary_site_4     X  154915957  155255695      2   

        minor  major_sub  minor_sub  subclonal  
0           7          0  

Unnamed: 0,sample_id,patient_id,malignant,primer_id,chrom,coord,ref,alt,ref_counts,alt_counts,...,alt_freq,background_average_alt_freq,ref_p_value,alt_p_value,status,gene_name,snpeff_impact,character_label,major_cn,minor_cn
0,normal_blood,1,no,1_b_amplicrazy,1,17090971,G,A,641,0,...,0.000000,0.002034,0.0,1.000000e+00,normal_sample,,,1:17090971,0,0
1,omentum_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,449,0,...,0.000000,0.001597,0.0,1.000000e+00,wildtype,,,1:17090971,13,5
2,right_ovary_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,395,64,...,0.139434,0.002375,0.0,7.795800e-90,somatic,,,1:17090971,4,2
3,right_ovary_site_2,1,yes,1_b_amplicrazy,1,17090971,G,A,415,1,...,0.002404,0.002754,0.0,6.825264e-01,wildtype,,,1:17090971,6,2
4,right_ovary_site_3,1,yes,1_b_amplicrazy,1,17090971,G,A,609,4,...,0.006525,0.002145,0.0,4.440635e-02,wildtype,,,1:17090971,6,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15293,normal_blood,10,no,10_b_amplicrazy,X,109694487,A,G,3669,10,...,0.002718,0.002239,0.0,3.130859e-01,normal_sample,RGAG1,LOW,RGAG1:X:109694487,0,0
15294,right_ovary_site_1,10,yes,10_b_amplicrazy,X,109694487,A,G,2037,295,...,0.126501,0.002306,0.0,0.000000e+00,somatic,RGAG1,LOW,RGAG1:X:109694487,1,0
15295,right_ovary_site_2,10,yes,10_b_amplicrazy,X,109694487,A,G,1369,4,...,0.002913,0.002442,0.0,4.314746e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487,1,0
15296,right_ovary_site_3,10,yes,10_b_amplicrazy,X,109694487,A,G,2675,4,...,0.001493,0.002189,0.0,8.364492e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487,1,0


In [7]:
# do some pre-processing to remove any mutations where CNA info is not available or
# the major allele CN is 0 (if the malignant cells have no copies of the region
# overlapping the mutation, the mutation cannot exist.) or minor allele CN > major allele CN
cna_reads_df = cna_reads_df[cna_reads_df['major_cn'] != 0]
# .copy() so the column assignment below acts on an independent frame rather
# than on a filtered view (avoids pandas' SettingWithCopyWarning).
cna_reads_df = cna_reads_df[(cna_reads_df['minor_cn'] <= cna_reads_df['major_cn'])].copy()
cna_reads_df['normal_cn'] = 2 # copy number of the locus in normal cells is 2 (no male chromosomes here)

pyclone_dir = os.path.join(data_dir, "pyclone_preprocessing")

cols = ['sample_id', 'ref_counts', 'alt_counts', 'character_label', "chrom", "coord", "major_cn", "minor_cn", "normal_cn"]
# put df in format used in PyClone, with columns:
#'mutation_id', 'ref_counts','var_counts', 'normal_cn', 'minor_cn', 'major_cn'
for patient_id in patient_ids:
    patient_dir = os.path.join(pyclone_dir, f"reads_patient_{patient_id}")
    os.makedirs(patient_dir, exist_ok=True)
    patient_subset = cna_reads_df[cna_reads_df['patient_id'] == patient_id][cols]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[patient_subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    patient_subset = patient_subset[patient_subset['sample_id'] != 'normal_blood']
    sample_ids = list(patient_subset['sample_id'].unique())
    patient_subset = patient_subset.rename(columns={"alt_counts": "var_counts", "character_label": "mutation_id"})

    # One file per sample. The files are written tab-separated, without the
    # index column and with a .tsv extension, so that they match the
    # reads_patient{patient_id}_{sample_id}.tsv paths that the PyClone command
    # built in the next cell passes to --in_files.
    for sample_id in sample_ids:
        sample_subset = patient_subset[patient_subset['sample_id'] == sample_id]
        sample_subset = sample_subset[['mutation_id', 'ref_counts', 'var_counts', 'normal_cn', 'minor_cn', 'major_cn']]
        sample_subset.to_csv(os.path.join(patient_dir,  f"reads_patient{patient_id}_{sample_id}.tsv"), sep="\t", index=False)


In [8]:
# Build and print one PyClone command line per patient (the commands are only
# printed here, not executed).

for patient_id in patient_ids:
    patient_dir = os.path.join(pyclone_dir, f"reads_patient_{patient_id}")
    patient_rows = discovery_samples_df[discovery_samples_df['patient_id'] == patient_id]
    discovery_samples = list(patient_rows['sample_id'])
    discovery_samples.remove("normal_blood")

    # One input tsv per tumour sample
    in_files = [
        os.path.join(patient_dir, f"reads_patient{patient_id}_{sample_id}.tsv")
        for sample_id in discovery_samples
    ]
    # Tumour cell proportion of each sample, in the same order as the input files
    tumour_contents = [
        str(patient_rows.loc[patient_rows['sample_id'] == sample_id, 'tumour_cell_proportion'].values[0])
        for sample_id in discovery_samples
    ]

    cmd = (
        ["PyClone", "run_analysis_pipeline", "--in_files"]
        + in_files
        + ["--working_dir", patient_dir, "--tumour_contents"]
        + tumour_contents
        + ["--samples"]
        + discovery_samples
    )
    print(" ".join(cmd))
    

PyClone run_analysis_pipeline --in_files /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_omentum_site_1.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_1.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_2.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_3.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_4.tsv /Us

In [47]:
# (3) Preparing for tree inference algorithms (SPRUCE, PairTree)
# (3a) Export the PyClone clustering so it can be used as input for SPRUCE (tree inference).
# One file per patient; line k holds the ';'-joined mutation ids of cluster k.

for patient_id in patient_ids:
    patient_dir = os.path.join(pyclone_dir, f"reads_patient_{patient_id}", "tables")
    cluster_df = pd.read_csv(os.path.join(patient_dir, 'loci.tsv'), sep="\t")
    clusters = sorted(cluster_df['cluster_id'].unique())
    # Cluster ids must be exactly 0..n-1, because readers of this file use the
    # line number as the cluster id.
    assert clusters[0] == 0 and clusters[-1] == len(clusters) - 1
    out_path = os.path.join(pyclone_dir, f"patient{patient_id}_pyclone_clusters.txt")
    with open(out_path, 'w') as f:
        for cluster in clusters:
            members = cluster_df.loc[cluster_df['cluster_id'] == cluster, 'mutation_id'].unique()
            f.write(";".join(members) + "\n")
            

In [48]:
# Create a tsv for each patient with ref reads, var reads, and f upper bound and f lower bound
# pooled by all SNVs in a cluster. This is used for SPRUCE

# repo_dir is defined in the first cell. The working directory is switched to
# the repository root, presumably so that the `src` package import below resolves.
os.chdir(repo_dir)

from src.util import create_conf_intervals_from_reads as conf


def cluster_split(cluster_name):
    """Split a ';'-joined cluster line into its individual mutation names."""
    return cluster_name.split(";")


for patient_id in patient_ids:
    conf.write(os.path.join(data_dir, f"reads_patient{patient_id}.csv"),
               os.path.join(pyclone_dir, f"patient{patient_id}_pyclone_clusters.txt"),
               data_dir,
               cluster_split_function=cluster_split)


num variants: 299
anatomical site labels: ['omentum_site_1' 'right_ovary_site_1' 'right_ovary_site_2'
 'right_ovary_site_3' 'right_ovary_site_4' 'small_bowel_site_1']
num variants: 224
anatomical site labels: ['omentum_site_1' 'omentum_site_2' 'right_ovary_site_1'
 'right_ovary_site_2']
num variants: 218
anatomical site labels: ['adnexa_site_1' 'omentum_site_1' 'right_ovary_site_1'
 'right_ovary_site_2']
num variants: 233
anatomical site labels: ['left_pelvic_sidewall_site_1' 'right_ovary_site_1' 'right_ovary_site_2'
 'right_ovary_site_3' 'right_ovary_site_4']
num variants: 327
anatomical site labels: ['brain_metastasis' 'left_ovary_site_1' 'righ_pelvic_mass']
num variants: 229
anatomical site labels: ['left_ovary_site_1' 'left_ovary_site_2' 'omentum_site_1' 'omentum_site_2'
 'right_ovary_site_1']
num variants: 273
anatomical site labels: ['right_ovary_site_1' 'right_ovary_site_2' 'right_ovary_site_3'
 'right_ovary_site_4']


In [76]:
# (3b) Prepare input for PairTree
import json
pairtree_dir = os.path.join(data_dir, "pairtree")
# The .ssm / .params.json files are opened for writing inside this directory,
# which fails if it does not exist yet (e.g. on a fresh checkout).
os.makedirs(pairtree_dir, exist_ok=True)
# Column names of the PairTree .ssm file written below.
header = ["id", "name", "var_reads", "total_reads", "var_read_prob"]

def load_pyclone_clusters(pyclone_dir, patient_id):
    '''
    Read patient{patient_id}_pyclone_clusters.txt from pyclone_dir.

    Each line of the file is one cluster, given as ';'-separated mutation
    names; the 0-based line number is used as the cluster ID.

    Returns (1) a dict mapping cluster ID to the list of mutation names on
    that line, and (2) a flat list of all mutation names in file order.
    '''
    cluster_id_to_mut_names = {}
    mutations = []
    cluster_path = os.path.join(pyclone_dir, f"patient{patient_id}_pyclone_clusters.txt")
    with open(cluster_path, "r") as f:
        for cluster_id, line in enumerate(f):
            names = line.strip().split(";")
            cluster_id_to_mut_names[cluster_id] = list(names)
            mutations.extend(names)
    return cluster_id_to_mut_names, mutations

    
# Write, per patient, the PairTree inputs:
#   patient{patient_id}.ssm          tab-separated: id, name, var_reads, total_reads, var_read_prob
#   patient{patient_id}.params.json  sample names and clusters (as lists of mutation ids)
for patient_id in patient_ids:
    # NOTE(review): read counts are taken from the unfiltered reads_df, not from
    # the CNA-filtered cna_reads_df that was exported for PyClone. The set of
    # mutations is still restricted to those in the PyClone cluster file loaded
    # below. Confirm that using reads_df here is intended.
    patient_subset = reads_df[reads_df['patient_id'] == patient_id]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[patient_subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    patient_subset = patient_subset[patient_subset['sample_id'] != 'normal_blood']

    cluster_id_to_mut_names, mutation_names = load_pyclone_clusters(pyclone_dir, patient_id)
    sample_names = list(patient_subset['sample_id'].unique())
    print(len(mutation_names))

    # (mutation, sample) -> (var reads, ref reads), built once per patient so
    # the frame is not re-filtered for every mutation/sample pair. setdefault
    # keeps the first row if a pair occurs more than once.
    read_counts = {}
    for label, sample, alt, ref in zip(patient_subset['character_label'],
                                       patient_subset['sample_id'],
                                       patient_subset['alt_counts'],
                                       patient_subset['ref_counts']):
        read_counts.setdefault((label, sample), (alt, ref))

    mut_name_to_mut_id = {}
    with open(os.path.join(pairtree_dir, f"patient{patient_id}.ssm"), "w") as f:
        f.write("\t".join(header))
        f.write("\n")
        for i, mut in enumerate(mutation_names):
            mut_name_to_mut_id[mut] = f"m{i}"
            row = [f"m{i}", mut]
            var_reads = []
            total_reads = []
            var_read_probs = []
            for sample in sample_names:
                if (mut, sample) not in read_counts:
                    raise KeyError(
                        f"patient {patient_id}: no read counts for mutation {mut!r} in sample {sample!r}"
                    )
                var, ref = read_counts[(mut, sample)]
                var_reads.append(str(var))
                total_reads.append(str(var + ref))
                # TODO: Add CNA, for now assume no CNA diploid cells
                var_read_probs.append(str(0.5))

            row += [",".join(var_reads), ",".join(total_reads), ",".join(var_read_probs)]
            f.write("\t".join(row))
            f.write("\n")

    # Clusters are expressed with the m{i} ids assigned above.
    json_data = {"samples": sample_names, "clusters": [], "garbage": []}
    for cid in cluster_id_to_mut_names:
        json_data["clusters"].append([mut_name_to_mut_id[x] for x in cluster_id_to_mut_names[cid]])

    with open(os.path.join(pairtree_dir, f"patient{patient_id}.params.json"), 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
    


299
224
218
233
327
229
273
