In [1]:
import sys
import pandas as pd
import os

repo_dir = "/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/"

data_dir = os.path.join(repo_dir, 'src/data/mcpherson_ovarian_2016')

In [2]:
# (1) First, read in WGS reads data from McPherson and reformat into 
# file structure needed for MACHINA's and PyClone's clustering
# TODO: how to handle "status" column? -> shows somatic, wildtype, germline?

# Taken directly from supplementary table 10 of McPherson et.al.
reads_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_10.csv'))
patient_ids = reads_df['patient_id'].unique()
print("patient IDs:", patient_ids)
reads_df['gene_name'] = reads_df['gene_name'].astype(str)
def label_snv(row):
    label = []
    if row['gene_name'] != "nan":
        label.append(row['gene_name'])
    label += [str(row['chrom']), str(row['coord'])]
    return (":").join(label)
reads_df['character_label'] = reads_df.apply(lambda row: label_snv(row), axis=1)
reads_df

patient IDs: [ 1  2  3  4  7  9 10]


Unnamed: 0,sample_id,patient_id,malignant,primer_id,chrom,coord,ref,alt,ref_counts,alt_counts,depth,alt_freq,background_average_alt_freq,ref_p_value,alt_p_value,status,gene_name,snpeff_impact,character_label
0,normal_blood,1,no,1_b_amplicrazy,1,17090971,G,A,641,0,641,0.000000,0.002034,0.0,1.000000e+00,normal_sample,,,1:17090971
1,omentum_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,449,0,449,0.000000,0.001597,0.0,1.000000e+00,wildtype,,,1:17090971
2,right_ovary_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,395,64,459,0.139434,0.002375,0.0,7.795800e-90,somatic,,,1:17090971
3,right_ovary_site_2,1,yes,1_b_amplicrazy,1,17090971,G,A,415,1,416,0.002404,0.002754,0.0,6.825264e-01,wildtype,,,1:17090971
4,right_ovary_site_3,1,yes,1_b_amplicrazy,1,17090971,G,A,609,4,613,0.006525,0.002145,0.0,4.440635e-02,wildtype,,,1:17090971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15293,normal_blood,10,no,10_b_amplicrazy,X,109694487,A,G,3669,10,3679,0.002718,0.002239,0.0,3.130859e-01,normal_sample,RGAG1,LOW,RGAG1:X:109694487
15294,right_ovary_site_1,10,yes,10_b_amplicrazy,X,109694487,A,G,2037,295,2332,0.126501,0.002306,0.0,0.000000e+00,somatic,RGAG1,LOW,RGAG1:X:109694487
15295,right_ovary_site_2,10,yes,10_b_amplicrazy,X,109694487,A,G,1369,4,1373,0.002913,0.002442,0.0,4.314746e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487
15296,right_ovary_site_3,10,yes,10_b_amplicrazy,X,109694487,A,G,2675,4,2679,0.001493,0.002189,0.0,8.364492e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487


In [3]:
# Load the table that contains which "high-quality DNA extractions (discovery samples)"
# were used in the paper's analysis
discovery_samples_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_2.csv'))
discovery_samples_df

Unnamed: 0,patient_id,sample_id,paper_id,total_reads,aligned_reads,coverage,mutation_seq_snvs,strelka_snvs,all_snvs,high_quality_snvs,validated_snvs,ploidy,tumour_cell_proportion,subclone_frequency
0,1,normal_blood,Nml,1288936606,1045839192,34.477126,,,,,,,,
1,1,omentum_site_1,Om1,987945944,884565254,28.788908,5715.0,6665.0,7281.0,3550.0,120.0,3.606594,0.463669,0.318225
2,1,right_ovary_site_1,ROv1,1261617726,1076429948,34.996814,6276.0,7902.0,8514.0,3986.0,132.0,1.880553,0.826552,0.251911
3,1,right_ovary_site_2,ROv2,1389016408,1194767689,38.785042,6004.0,7256.0,7912.0,3757.0,111.0,1.847366,0.695751,0.268036
4,1,right_ovary_site_3,ROv3,1205431286,1056428422,33.312377,5803.0,6756.0,7410.0,3612.0,118.0,1.854135,0.678331,0.265984
5,1,right_ovary_site_4,ROv4,1480564778,1300349864,36.794602,6501.0,7920.0,8664.0,4017.0,112.0,1.85722,0.702755,0.240692
6,1,small_bowel_site_1,SBwl,1242912402,1096324079,35.650173,7100.0,8323.0,9137.0,4357.0,127.0,3.611431,0.448979,0.270676
7,2,normal_blood,Nml,1395804938,1194671812,37.449153,,,,,,,,
8,2,omentum_site_1,Om1,1058316688,936624472,30.631961,1860.0,2705.0,2849.0,1228.0,106.0,3.016544,0.196412,0.347316
9,2,omentum_site_2,Om2,1191965188,1070042684,34.932824,3046.0,4036.0,4242.0,1980.0,107.0,3.014453,0.23995,0.348032


In [4]:
discovery_samples_df[discovery_samples_df['patient_id']==1]['sample_id']

0          normal_blood
1        omentum_site_1
2    right_ovary_site_1
3    right_ovary_site_2
4    right_ovary_site_3
5    right_ovary_site_4
6    small_bowel_site_1
Name: sample_id, dtype: object

In [5]:
# (2) Prepare data for clustering algorithms (MACHINA, PyClone)
# (2a) prep data for MACHINA clustering
cols = ['sample_id', 'ref_counts', 'alt_counts', 'character_label']

# put df in format used in MACHINA, with columns:
#'#sample_index', 'sample_label','anatomical_site_index', 'anatomical_site_label', 'character_index', 'character_label', 'ref', 'var'
for patient_id in patient_ids:
    subset = reads_df[reads_df['patient_id'] == patient_id][cols]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    subset = subset[subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    subset = subset[subset['sample_id'] != 'normal_blood'] 
    subset = subset.rename(columns={"ref_counts": "ref", "alt_counts": "var", "sample_id": "sample_label"})
    char_labels = list(subset['character_label'].unique())
    sites = list(subset['sample_label'].unique())
    subset['character_index'] = subset.apply(lambda row: char_labels.index(row['character_label']), axis=1)
    subset['#sample_index'] = subset.apply(lambda row: sites.index(row['sample_label']), axis=1)
    subset['anatomical_site_index'] = subset['#sample_index']
    subset['anatomical_site_label'] = subset['sample_label']
    subset = subset[['#sample_index', 'sample_label','anatomical_site_index', 'anatomical_site_label', 'character_index', 'character_label', 'ref', 'var']]
    subset.to_csv(os.path.join(data_dir,  f"reads_patient{patient_id}.csv"), index=False)


In [6]:
# (2b) prep data for PyClone clustering
# get CNA info from supplement of McPherson et. al.
cna_df = pd.read_csv(os.path.join(data_dir, 'supplement_table_7.csv'))
print(cna_df)

def subset_cna(row):
    chrom = row['chrom']
    coord = row['coord']
    patient_id = row['patient_id']
    sample_id = row['sample_id']
    df = cna_df[(cna_df['patient_id'] == patient_id) & (cna_df['sample_id'] == sample_id) & (cna_df['chrom'] == chrom) & (cna_df['start'] <= coord) & (cna_df['end'] >= coord)]
    return df

def major_cn(row):
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        #print("Not found in CNA:\n", row['chrom'], row['coord'])
        return 0
    return int(coord_df['major'])

def minor_cn(row):
    coord_df = subset_cna(row)
    if len(coord_df) != 1:
        return 0
    return int(coord_df['minor'])


# Add CNAs to reads_df
cna_reads_df = reads_df.copy(deep=True)
cna_reads_df['major_cn'] =  cna_reads_df.apply(lambda row: major_cn(row), axis=1)
cna_reads_df['minor_cn'] =  cna_reads_df.apply(lambda row: minor_cn(row), axis=1)
cna_reads_df

        patient_id           sample_id chrom      start        end  major  \
0                1  small_bowel_site_1     1          1      71283     16   
1                1  small_bowel_site_1     1      73560      94700      7   
2                1  small_bowel_site_1     1     101640     110479     13   
3                1  small_bowel_site_1     1     110479     141357     22   
4                1  small_bowel_site_1     1     144162     247926     24   
...            ...                 ...   ...        ...        ...    ...   
135154          10  right_ovary_site_4     X  152951113  152955404      0   
135155          10  right_ovary_site_4     X  152955404  154151513      2   
135156          10  right_ovary_site_4     X  154151513  154887231      2   
135157          10  right_ovary_site_4     X  154887231  154915957      2   
135158          10  right_ovary_site_4     X  154915957  155255695      2   

        minor  major_sub  minor_sub  subclonal  
0           7          0  

Unnamed: 0,sample_id,patient_id,malignant,primer_id,chrom,coord,ref,alt,ref_counts,alt_counts,...,alt_freq,background_average_alt_freq,ref_p_value,alt_p_value,status,gene_name,snpeff_impact,character_label,major_cn,minor_cn
0,normal_blood,1,no,1_b_amplicrazy,1,17090971,G,A,641,0,...,0.000000,0.002034,0.0,1.000000e+00,normal_sample,,,1:17090971,0,0
1,omentum_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,449,0,...,0.000000,0.001597,0.0,1.000000e+00,wildtype,,,1:17090971,13,5
2,right_ovary_site_1,1,yes,1_b_amplicrazy,1,17090971,G,A,395,64,...,0.139434,0.002375,0.0,7.795800e-90,somatic,,,1:17090971,4,2
3,right_ovary_site_2,1,yes,1_b_amplicrazy,1,17090971,G,A,415,1,...,0.002404,0.002754,0.0,6.825264e-01,wildtype,,,1:17090971,6,2
4,right_ovary_site_3,1,yes,1_b_amplicrazy,1,17090971,G,A,609,4,...,0.006525,0.002145,0.0,4.440635e-02,wildtype,,,1:17090971,6,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15293,normal_blood,10,no,10_b_amplicrazy,X,109694487,A,G,3669,10,...,0.002718,0.002239,0.0,3.130859e-01,normal_sample,RGAG1,LOW,RGAG1:X:109694487,0,0
15294,right_ovary_site_1,10,yes,10_b_amplicrazy,X,109694487,A,G,2037,295,...,0.126501,0.002306,0.0,0.000000e+00,somatic,RGAG1,LOW,RGAG1:X:109694487,1,0
15295,right_ovary_site_2,10,yes,10_b_amplicrazy,X,109694487,A,G,1369,4,...,0.002913,0.002442,0.0,4.314746e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487,1,0
15296,right_ovary_site_3,10,yes,10_b_amplicrazy,X,109694487,A,G,2675,4,...,0.001493,0.002189,0.0,8.364492e-01,wildtype,RGAG1,LOW,RGAG1:X:109694487,1,0


In [7]:
# do some pre-processing to remove any mutations where CNA info is not available or
# the major allele CN is 0 (if the malignant cells have no copies of the region 
# overlapping the mutation, the mutation cannot exist.) or minor allele CN > major allele CN
cna_reads_df = cna_reads_df[cna_reads_df['major_cn'] != 0]
cna_reads_df = cna_reads_df[(cna_reads_df['minor_cn'] <= cna_reads_df['major_cn'])]
cna_reads_df['normal_cn'] = 2 # copy number of the locus in normal cells is 2 (no male chromosomes here)

pyclone_dir = os.path.join(data_dir, "pyclone_preprocessing")

cols = ['sample_id', 'ref_counts', 'alt_counts', 'character_label', "chrom", "coord", "major_cn", "minor_cn", "normal_cn"]
# put df in format used in PyClone, with columns:
#'mutation_id', 'ref_counts','var_counts', 'normal_cn', 'minor_cn', 'major_cn'
for patient_id in patient_ids:
    patient_dir = os.path.join(pyclone_dir, f"reads_patient_{patient_id}")
    if not os.path.exists(patient_dir): 
        os.makedirs(patient_dir)
    patient_subset = cna_reads_df[cna_reads_df['patient_id'] == patient_id][cols]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[patient_subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    patient_subset = patient_subset[patient_subset['sample_id'] != 'normal_blood'] 
    sample_ids = list(patient_subset['sample_id'].unique())
    patient_subset = patient_subset.rename(columns={"ref_counts": "ref_counts", "alt_counts": "var_counts", "character_label": "mutation_id"})
    
    for sample_id in sample_ids:
        sample_subset = patient_subset[patient_subset['sample_id'] == sample_id]
        sample_subset = sample_subset[['mutation_id', 'ref_counts', 'var_counts', 'normal_cn', 'minor_cn', 'major_cn']]
        sample_subset.to_csv(os.path.join(patient_dir,  f"reads_patient{patient_id}_{sample_id}.csv"))    


In [8]:
# Setup PyClone commands to run for each patient

for patient_id in patient_ids:
    cmd = ["PyClone", "run_analysis_pipeline", "--in_files"]
    patient_dir = os.path.join(pyclone_dir, f"reads_patient_{patient_id}")
    discovery_samples = list(discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id'])
    discovery_samples.remove("normal_blood")
    # Add tsvs for each sample
    for sample_id in discovery_samples:
        cmd.append(os.path.join(patient_dir,  f"reads_patient{patient_id}_{sample_id}.tsv"))
    cmd += ["--working_dir", patient_dir, "--tumour_contents"]
    # Add tumour cell proportions for each sample
    for sample_id in discovery_samples:
        tumour_prop_subset = discovery_samples_df[(discovery_samples_df['patient_id']==patient_id) & (discovery_samples_df['sample_id']==sample_id)]
        tumour_prop = tumour_prop_subset['tumour_cell_proportion']                                                                                            
        cmd.append(str(tumour_prop.values[0]))
    # Add sample names
    cmd.append("--samples")
    for sample_id in discovery_samples:
        cmd.append(sample_id)
    print(" ".join(cmd))
    

PyClone run_analysis_pipeline --in_files /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_omentum_site_1.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_1.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_2.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_3.tsv /Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/mcpherson_ovarian_2016/pyclone_preprocessing/reads_patient_1/reads_patient1_right_ovary_site_4.tsv /Us

In [47]:
# (3) Preparing for tree inference algorithms (SPRUCE, PairTree)
# (3a) Write results of PyClone clustering out to file to use as input for SPRUCE (tree inference)
# For each patient, write a file where each line contains the mutations in each cluster

for patient_id in patient_ids:
    patient_dir = os.path.join(pyclone_dir, f"reads_patient_{patient_id}", "tables")
    cluster_df = pd.read_csv(os.path.join(patient_dir, 'loci.tsv'), sep = "\t")
    clusters = cluster_df['cluster_id'].unique()
    clusters.sort()
    assert((clusters[0] == 0) and (clusters[-1] == len(clusters)-1))
    with open(os.path.join(pyclone_dir, f"patient{patient_id}_pyclone_clusters.txt"), 'w') as f:
        for cluster in clusters:
            subset = cluster_df[cluster_df['cluster_id']==cluster]
            mutations = subset['mutation_id'].unique()
            f.write(";".join(mutations))
            f.write("\n")
            

In [48]:
# Create a tsv for each patient with ref reads, var reads, and f upper bound and f lower bound
# pooled by all SNVs in a cluster. This is used for SPRUCE

repo_dir = "/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/"
os.chdir(repo_dir)

from src.util import create_conf_intervals_from_reads as conf

cluster_split = lambda cluster_name: [x for x in cluster_name.split(";")]

for patient_id in patient_ids:
    patient_dir = os.path.join(pyclone_dir, f"reads_patient_{patient_id}")
    conf.write(os.path.join(data_dir, f"reads_patient{patient_id}.csv"),
               os.path.join(pyclone_dir, f"patient{patient_id}_pyclone_clusters.txt"),
               data_dir,
               cluster_split_function=cluster_split)


num variants: 299
anatomical site labels: ['omentum_site_1' 'right_ovary_site_1' 'right_ovary_site_2'
 'right_ovary_site_3' 'right_ovary_site_4' 'small_bowel_site_1']
num variants: 224
anatomical site labels: ['omentum_site_1' 'omentum_site_2' 'right_ovary_site_1'
 'right_ovary_site_2']
num variants: 218
anatomical site labels: ['adnexa_site_1' 'omentum_site_1' 'right_ovary_site_1'
 'right_ovary_site_2']
num variants: 233
anatomical site labels: ['left_pelvic_sidewall_site_1' 'right_ovary_site_1' 'right_ovary_site_2'
 'right_ovary_site_3' 'right_ovary_site_4']
num variants: 327
anatomical site labels: ['brain_metastasis' 'left_ovary_site_1' 'righ_pelvic_mass']
num variants: 229
anatomical site labels: ['left_ovary_site_1' 'left_ovary_site_2' 'omentum_site_1' 'omentum_site_2'
 'right_ovary_site_1']
num variants: 273
anatomical site labels: ['right_ovary_site_1' 'right_ovary_site_2' 'right_ovary_site_3'
 'right_ovary_site_4']


In [55]:
# (3b) Prepare input for PairTree
import json
pairtree_dir = os.path.join(data_dir, "pairtree")
header = ["id", "name", "var_reads", "total_reads", "var_read_prob"]

# id, name, var_reads, total_reads, var_read_prob
for patient_id in patient_ids:
    patient_subset = reads_df[reads_df['patient_id'] == patient_id]
    # only keep the high quality discovery samples used in the paper
    discovery_samples = discovery_samples_df[discovery_samples_df['patient_id']==patient_id]['sample_id']
    patient_subset = patient_subset[patient_subset['sample_id'].isin(discovery_samples)]
    # remove normal samples
    patient_subset = patient_subset[patient_subset['sample_id'] != 'normal_blood'] 
    mutation_names = list(patient_subset['character_label'].unique())
    sample_names = list(patient_subset['sample_id'].unique())
    
    # Load map from mutation name to PyClone cluster
    cluster_id_to_mut_names = {}
    with open(os.path.join(pyclone_dir, f"patient{patient_id}_pyclone_clusters.txt"), "r") as f:
        for i, line in enumerate(f):
            muts = line.strip().split(";")
            for mut in muts:
                if i not in cluster_id_to_mut_names:
                    cluster_id_to_mut_names[i] = []
                cluster_id_to_mut_names[i].append(mut)
    
    mut_name_to_mut_id = {}
    with open(os.path.join(pairtree_dir, f"patient_{patient_id}.ssm"), "w") as f:
        
        f.write("\t".join(header))
        f.write("\n")
        for i, mut in enumerate(mutation_names):
            mut_name_to_mut_id[mut] = f"m{i}"
            
            row = []
            row += [f"m{i}", mut]
            mut_patient_subset = patient_subset[patient_subset['character_label'] == mut]
            
            var_reads = []
            total_reads = []
            var_read_probs = []
            for sample in sample_names:
                mut_patient_sample = mut_patient_subset[mut_patient_subset['sample_id'] == sample]
                var = mut_patient_sample['alt_counts'].values[0]
                ref = mut_patient_sample['ref_counts'].values[0]
                var_reads.append(str(var))
                total_reads.append(str(var+ref))
                # TODO: Add CNA, for now assume no CNA diploid cells
                var_read_probs.append(str(0.5))

            row += [",".join(var_reads), ",".join(total_reads), ",".join(var_read_probs)]
            f.write("\t".join(row))
            f.write("\n")
    print(cluster_id_to_mut_names)
    json_data = {"samples": sample_names, "clusters": [], "garbage": []}
    for cid in cluster_id_to_mut_names:
        json_data["clusters"].append([mut_name_to_mut_id[x] for x in cluster_id_to_mut_names[cid]])
    print(json_data)
    
    with open(os.path.join(pairtree_dir, f"patient_{patient_id}.params.json"), 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False)
    


{0: ['12:115357954', '14:104528407', '6:159331238', 'ADAM8:10:135084844', 'BEST2:19:12866749', 'CDH12:5:21755737', 'COQ4:9:131088175', 'CPNE4:3:131436587', 'FAM63A:1:150974845', 'KRT76:12:53162562', 'POLR3C:1:145598544', 'SPAST:2:32353532', 'TRAP1:16:3724366', 'USO1:4:76726328', 'YTHDF1:20:61834739'], 1: ['11:102856321', '11:71249415', '14:102551257', '2:137238776', '3:147113881', '6:56918610', '7:100646172', '7:113219135', '8:37024639', 'ADPRHL1:13:114087226', 'ARSF:X:3007569', 'B3GALNT1:3:160818934', 'CACNA1F:X:49075401', 'CC2D2A:4:15518362', 'CDH13:16:83813772', 'DYNC2H1:11:103104860', 'EP400:12:132471331', 'FAM129B:9:130289573', 'GAREM:18:29850212', 'GLRA3:4:175654597', 'LRP4:11:46921474', 'LYN:8:56922615', 'MANSC1:12:12496026', 'MBD6:12:57920107', 'MFAP3L:4:170943584', 'MORC3:21:37736412', 'NAP1L2:X:72433430', 'NR4A1:12:52451189', 'OR2L13:1:248263077', 'PFKFB3:10:6188390', 'PLB1:2:28825725', 'RP11-923I11.4:12:52222861', 'SHOX2:3:157816006', 'SMURF1:7:98636062', 'SNORA40:10:2345400

{0: ['ACMSD:2:135616830', 'ARHGAP6:X:11272678', 'ARMCX2:X:100911994', 'COL4A6:X:107406153', 'COL4A6:X:107406157', 'DMD:X:32509541', 'ESRRG:1:216850786', 'GCA:2:163213283', 'GJB1:X:70444239', 'HSPB7:1:16345804', 'HSPB7:1:16345806', 'KIF16B:20:16496283', 'LPAR5:12:6729877', 'MAMLD1:X:149638448', 'MED12:X:70354674', 'MED1:17:37566545', 'MSANTD3-TMEFF1:9:103323781', 'MUC3A:7:100551374', 'NLRP3:1:247587425', 'PTPRR:12:71054765', 'TRIM65:17:73887350', 'WDTC1:1:27627718', 'ZNF296:19:45575767', 'ZNF527:19:37879386'], 1: ['10:102783678', '10:103338730', '10:103340714', '10:103343533', '10:104491702', '10:116659253', '10:120445794', '10:131277348', '10:131291834', '10:131293228', '10:131316390', '10:131318748', '10:131321853', '10:131323689', '10:131324776', '10:131325138', '10:131329224', '10:131331612', '10:131412605', '10:131547178', '10:19933410', '10:28436534', '10:73537978', '10:85981801', '10:95791613', '10:99220707', '10:99225738', '12:51752897', '13:48617747', '13:70753966', '13:9911606

{0: ['1:55936055', '3:190723516', '7:96077932', 'AC144449.1:2:150688158', 'CDK5RAP2:9:123290096', 'GNAQ:9:80547591', 'LNX1:4:54352324', 'LSAMP:3:115752836', 'PTPRN2:7:158378699', 'RP11-481J13.1:2:56394958'], 1: ['10:1216799', '12:64017896', '12:66440131', '12:95689974', '13:41086968', '14:48538408', '1:116408179', '1:18893145', '4:29661793', '4:43932117', '4:59189783', '4:97201193', '6:107035652', '7:143657108', '8:73249461', '9:90342546', '9:90502655', 'ABCA4:1:94522334', 'ADAMTS2:5:178556980', 'ARSD:X:2823060', 'BRCA2:13:32944624', 'BRCA2:13:32977979', 'C17orf53:17:42225396', 'CAPN10:2:241537289', 'CSMD1:8:2949102', 'DDX27:20:47858705', 'DHRS9:2:169948460', 'DLG5:10:79581848', 'ELL:19:18632836', 'ENTPD8:9:140330683', 'FGD5:3:14861524', 'FHAD1:1:15654801', 'FIBIN:11:27016214', 'FRMPD3:X:106845055', 'GABRA5:15:27193297', 'GABRB1:4:47163486', 'GABRQ:X:151804840', 'GALNT16:14:69813784', 'GHSR:3:172166164', 'GIMAP8:7:150171184', 'GPR137B:1:236347180', 'GZMA:5:54403738', 'INPPL1:11:7194448

{0: ['11:37351145', '11:74330980', '13:41705884', '13:53811341', '13:63596631', '13:84266086', '14:80858152', '18:41823577', '19:44891134', '2:103879320', '2:166905668', '2:195437326', '3:21158655', '3:76177985', '4:190509513', '5:163536989', '5:23666278', '6:130923365', '6:36976637', '6:57536605', '6:64286881', '7:108204936', '8:145737795', '8:20661859', '8:36201828', '8:52157078', '8:62987883', '8:64097997', '9:11044594', '9:28908932', '9:36879510', '9:36941475', '9:36951399', '9:37030129', 'ALPK3:15:85400381', 'ARHGEF38:4:106599060', 'ATXN3L:X:13332396', 'BACH1:21:30803838', 'BRAF:7:140510749', 'C12orf42:12:103745440', 'C5orf17:5:24118379', 'CACNA2D1:7:81877436', 'CD59:11:33739002', 'CEP152:15:49048502', 'COL11A1:1:103422610', 'CRYBG3:3:97550037', 'CSMD3:8:113339964', 'CSMD3:8:113365031', 'CSMD3:8:114225881', 'CSMD3:8:114274626', 'CXorf36:X:45015620', 'FAM208B:10:5777338', 'HAS2:8:122641005', 'HDHD1:X:7043536', 'HEPH:X:65488081', 'HLCS:21:38309208', 'HS3ST3A1:17:13465428', 'IGSF8:1:

In [49]:
# Get PyClone clusters and write paramaters file for pairtree
for patient_id in patient_ids:
    mutation_id_to_cluster = {}
    with open(os.path.join(pyclone_dir, f"patient{patient_id}_pyclone_clusters.txt"), "r") as f:
        for i, line in enumerate(f):
            muts = line.strip().split(";")
            for mut in muts:
                mutation_id_to_cluster[mut] = i
    print(mutation_id_to_cluster)


{'12:115357954': 0, '14:104528407': 0, '6:159331238': 0, 'ADAM8:10:135084844': 0, 'BEST2:19:12866749': 0, 'CDH12:5:21755737': 0, 'COQ4:9:131088175': 0, 'CPNE4:3:131436587': 0, 'FAM63A:1:150974845': 0, 'KRT76:12:53162562': 0, 'POLR3C:1:145598544': 0, 'SPAST:2:32353532': 0, 'TRAP1:16:3724366': 0, 'USO1:4:76726328': 0, 'YTHDF1:20:61834739': 0, '11:102856321': 1, '11:71249415': 1, '14:102551257': 1, '2:137238776': 1, '3:147113881': 1, '6:56918610': 1, '7:100646172': 1, '7:113219135': 1, '8:37024639': 1, 'ADPRHL1:13:114087226': 1, 'ARSF:X:3007569': 1, 'B3GALNT1:3:160818934': 1, 'CACNA1F:X:49075401': 1, 'CC2D2A:4:15518362': 1, 'CDH13:16:83813772': 1, 'DYNC2H1:11:103104860': 1, 'EP400:12:132471331': 1, 'FAM129B:9:130289573': 1, 'GAREM:18:29850212': 1, 'GLRA3:4:175654597': 1, 'LRP4:11:46921474': 1, 'LYN:8:56922615': 1, 'MANSC1:12:12496026': 1, 'MBD6:12:57920107': 1, 'MFAP3L:4:170943584': 1, 'MORC3:21:37736412': 1, 'NAP1L2:X:72433430': 1, 'NR4A1:12:52451189': 1, 'OR2L13:1:248263077': 1, 'PFKFB3