### Load paths

In [1]:
import pyreadr
import os
import pandas as pd

# Show every column and the full contents of each cell when displaying dataframes
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Resolve the repository root (two directories above the kernel's starting directory) and
# chdir into it. This is guarded so it only happens once per kernel: re-running this cell
# would otherwise climb two more directories each time and silently break every path below.
# NOTE(review): assumes the kernel is started from the notebook's own directory - confirm.
if "REPO_DIR" not in globals():
    REPO_DIR = os.path.abspath(os.path.join(os.getcwd(), "../../"))
    os.chdir(REPO_DIR)

# Input/output locations, all relative to the repository root
TRACERX_DATA_DIR = os.path.join(REPO_DIR, "src/data/tracerx_nsclc/")
OUTPUT_DIR = os.path.join(REPO_DIR, "src/data/tracerx_nsclc/patient_data")
CONIPHER_DIR = os.path.join(REPO_DIR, "src/data/tracerx_nsclc/conipher_inputs")

### Load sample overview information (from TRACERx publicly available data)

In [2]:
# Read the tab-separated sample overview table
sample_overview_path = os.path.join(TRACERX_DATA_DIR, "sample_overview_original.txt")
sample_info_df = pd.read_csv(sample_overview_path, sep="\t")

# Tally samples by coarse type and then by detailed type
for type_column in ('sampleType', 'sampleTypeDetail'):
    print(sample_info_df[type_column].value_counts())

sample_info_df

sampleType
primary       476
metastasis    218
Name: count, dtype: int64
sampleTypeDetail
primary            476
LN                 148
metachronousMet     67
synchronousMet       3
Name: count, dtype: int64


Unnamed: 0,patient_id,tumour_id,region,sampleType,sampleTypeDetail
0,CRUK0010,CRUK0010,CRUK0010_SU_T1.R1,primary,primary
1,CRUK0010,CRUK0010,CRUK0010_SU_T1.R2,primary,primary
2,CRUK0010,CRUK0010,CRUK0010_SU_FLN1,metastasis,LN
3,CRUK0010,CRUK0010,CRUK0010_BR_LN1,metastasis,metachronousMet
4,CRUK0010,CRUK0010,CRUK0010_BR_LN2,metastasis,metachronousMet
...,...,...,...,...,...
689,CRUK0872,CRUK0872,CRUK0872_SU_T1.R1,primary,primary
690,CRUK0872,CRUK0872,CRUK0872_SU_T1.R2,primary,primary
691,CRUK0872,CRUK0872,CRUK0872_SU_T1.R3,primary,primary
692,CRUK0872,CRUK0872,CRUK0872_SU_T1.R4,primary,primary


### Load purity and ploidy information (from TRACERx publicly available data)

In [3]:
# Read the tab-separated per-sample purity and ploidy table
purity_ploidy_path = os.path.join(TRACERX_DATA_DIR, "20220808_purityandploidy.txt")
purity_ploidy_df = pd.read_csv(purity_ploidy_path, sep="\t")
purity_ploidy_df

Unnamed: 0,Patient,Sample,final.purity,final.ploidy
0,CRUK0090,CRUK0090_SU_T1.R2,0.54,3.15
1,CRUK0090,CRUK0090_BP_T1.R1,0.74,3.25
2,CRUK0090,CRUK0090_BR_T1.R1,0.20,3.30
3,CRUK0090,CRUK0090_SU_T1.R1,0.58,3.20
4,CRUK0090,CRUK0090_BP_T2.R1,0.23,3.40
...,...,...,...,...
689,CRUK0810,CRUK0810_SU_T1.R4,0.49,3.85
690,CRUK0810,CRUK0810_SU_T1.R1,0.25,3.70
691,CRUK0810,CRUK0810_SU_T1.R8,0.20,3.95
692,CRUK0810,CRUK0810_SU_T1.R6,0.39,3.65


### Load all patient mutation data (from TRACERx publicly available data)

In [4]:
# Read the R data file and pull out the mutation table stored under 'mutTableAll'
mutation_table_path = os.path.join(TRACERX_DATA_DIR, 'mutTableAll.cloneInfo.20220726.rda')
patient_data_all = pyreadr.read_r(mutation_table_path)
patient_data_all_df = patient_data_all['mutTableAll']
print("Patient data columns:\n ", patient_data_all_df.columns)
print(patient_data_all_df['RegionSum'].head(10))

# Keep only the columns used by the input-preparation code further down
kept_columns = [
    'mutation_id', 'patient_id', 'Hugo_Symbol', 'exonic.func', 'NucleotideChange',
    'RegionSum', 'MajorCPN_SC', 'MinorCPN_SC', 'chr', 'start', 'stop', 'ref', 'var',
]
patient_data_all_df = patient_data_all_df[kept_columns]
unique_patients = patient_data_all_df['patient_id'].unique()
print("Number of unique patients:", len(unique_patients))
patient_data_all_df

Patient data columns:
  Index(['mutation_id', 'patient_id', 'tumour_id', 'chr', 'start', 'stop', 'ref',
       'var', 'Hugo_Symbol', 'func', 'exonic.func', 'NucleotideChange',
       'AAChange', 'GL_VAF', 'GL_nAlt', 'GL_depth', 'ITHState', 'RegionSum',
       'Is.present', 'PyCloneClonal_SC', 'PyCloneCluster_SC',
       'cleanCluster_SC', 'PhyloCCF_SC', 'combTiming_SC', 'MutCPN_SC',
       'MajorCPN_SC', 'MinorCPN_SC', 'DriverMut', 'treeClones',
       'seedingClones', 'clonalClones', 'sharedClones', 'primaryClones',
       'metClones'],
      dtype='object')
0              BR_LN1:40/168;BR_LN2:64/276;BR_LN3:47/235;SU_FLN1:96/518;SU_T1.R1:236/317;SU_T1.R2:80/227
1               BR_LN1:14/106;BR_LN2:18/155;BR_LN3:18/138;SU_FLN1:12/102;SU_T1.R1:94/153;SU_T1.R2:27/175
2              BR_LN1:44/191;BR_LN2:55/233;BR_LN3:75/260;SU_FLN1:86/288;SU_T1.R1:148/245;SU_T1.R2:92/270
3             BR_LN1:71/525;BR_LN2:61/694;BR_LN3:60/675;SU_FLN1:57/485;SU_T1.R1:385/647;SU_T1.R2:103/654
4             

Unnamed: 0,mutation_id,patient_id,Hugo_Symbol,exonic.func,NucleotideChange,RegionSum,MajorCPN_SC,MinorCPN_SC,chr,start,stop,ref,var
0,CRUK0010:3:47103798:G,CRUK0010,SETD2,stopgain SNV,c.C6148T,BR_LN1:40/168;BR_LN2:64/276;BR_LN3:47/235;SU_FLN1:96/518;SU_T1.R1:236/317;SU_T1.R2:80/227,SU_T1.R1:3;SU_T1.R2:2;SU_FLN1:2;BR_LN1:2;BR_LN2:2;BR_LN3:2,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr3,47103798,47103798,G,A
1,CRUK0010:6:45459895:C,CRUK0010,RUNX2,,,BR_LN1:14/106;BR_LN2:18/155;BR_LN3:18/138;SU_FLN1:12/102;SU_T1.R1:94/153;SU_T1.R2:27/175,SU_T1.R1:2;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,chr6,45459895,45459895,C,G
2,CRUK0010:1:156105064:C,CRUK0010,LMNA,synonymous SNV,c.C561T,BR_LN1:44/191;BR_LN2:55/233;BR_LN3:75/260;SU_FLN1:86/288;SU_T1.R1:148/245;SU_T1.R2:92/270,SU_T1.R1:2;SU_T1.R2:2;SU_FLN1:3;BR_LN1:2;BR_LN2:2;BR_LN3:3,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr1,156105064,156105064,C,T
3,CRUK0010:6:25850091:C,CRUK0010,SLC17A3,stopgain SNV,c.G979T,BR_LN1:71/525;BR_LN2:61/694;BR_LN3:60/675;SU_FLN1:57/485;SU_T1.R1:385/647;SU_T1.R2:103/654,SU_T1.R1:2;SU_T1.R2:1;SU_FLN1:2;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,chr6,25850091,25850091,C,A
4,CRUK0010:8:16001109:G,CRUK0010,MSR1,nonsynonymous SNV,c.C991G,BR_LN1:16/116;BR_LN2:21/141;BR_LN3:27/180;SU_FLN1:49/281;SU_T1.R1:76/152;SU_T1.R2:31/139,SU_T1.R1:1;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr8,16001109,16001109,G,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136310,CRUK0872:22:40080280:C,CRUK0872,CACNA1I,,,SU_FLN1:18/668;SU_LN1:1/446;SU_T1.R1:0/602;SU_T1.R2:0/501;SU_T1.R3:0/443;SU_T1.R4:0/463,SU_T1.R1:3;SU_T1.R2:3;SU_T1.R3:2;SU_T1.R4:3;SU_FLN1:3,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:3;SU_FLN1:2,chr22,40080280,40080280,C,T
136311,CRUK0872:14:99872776:T,CRUK0872,SETD3,,,SU_FLN1:0/573;SU_LN1:0/518;SU_T1.R1:0/609;SU_T1.R2:0/625;SU_T1.R3:0/581;SU_T1.R4:13/510,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,chr14,99872776,99872776,T,C
136312,CRUK0872:8:110131566:C,CRUK0872,TRHR,stopgain SNV,c.C1079A,SU_FLN1:0/562;SU_LN1:3/616;SU_T1.R1:2/716;SU_T1.R2:0/763;SU_T1.R3:0/722;SU_T1.R4:16/650,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,chr8,110131566,110131566,C,A
136313,CRUK0872:1:51032853:G,CRUK0872,FAF1,nonsynonymous SNV,c.C1164G,SU_FLN1:0/389;SU_LN1:0/394;SU_T1.R1:0/464;SU_T1.R2:0/484;SU_T1.R3:11/449;SU_T1.R4:0/428,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:1,chr1,51032853,51032853,G,C


### Prep data for input into tree building methods (CONIPHER, PairTree, Orchard) and migration history methods (Metient and MACHINA)


In [5]:
'''
For Metient/MACHINA, need a tsv with minimum columns: ['#sample_index', 'sample_label', 'anatomical_site_index',
'anatomical_site_label', 'character_index', 'character_label','ref', 'var']

For CONIPHER, need a tsv with minimum columns: ['CASE_ID', 'SAMPLE', 'CHR', 'POS', 'REF', 'ALT', 
'REF_COUNT', 'VAR_COUNT', 'DEPTH', 'COPY_NUMBER_A', 'COPY_NUMBER_B', 'ACF', 'PLOIDY']
'''

import re
from src.util import pairtree_data_extraction_util as pt_util
import numpy as np

def remove_region_suffix(input_string):
    '''
    Strips a trailing region suffix of the form ".R<digits>" (e.g. ".R1") from input_string
    and returns the result. Strings that do not end in such a suffix are returned unchanged.
    '''
    return re.sub(r'\.R\d+$', '', input_string)

def generate_sample_and_mut_info(row):
    '''
    Given a row (one mutation) in the patient_data_all_df dataframe, yields one item per sample
    listed in the row's RegionSum column. Each item is either None (that sample was skipped) or
    a tuple of two lists:
        (1) the first for MACHINA/Metient input,
        with columns: [mutation_name, sample_name, anatomical_site_label, ref_count, var_count,
        sample_type_detailed, sample_type, purity, ploidy]
        e.g. ["SETD2:3:47103798:G", "CRUK0010_BR_LN1", "metachronousMet_BR_LN1", 128, 40,
        "metachronousMet_BR_LN1", "metastasis", 0.54, 3.19] (purity/ploidy values are illustrative)

        (2) the second for CONIPHER input,
        with columns: [patient_id, sample_name, chr, pos, ref, alt, ref_count, var_count, depth,
        copy_number_a, copy_number_b, purity, ploidy]

    If the mutation has no copy number information at all, a single None is yielded and
    iteration ends. Sample metadata is looked up in the module-level sample_info_df and
    purity_ploidy_df dataframes.
    '''
    def split_per_sample(field):
        # "SU_FLN1:0/562;SU_LN1:3/616" -> {"SU_FLN1": "0/562", "SU_LN1": "3/616"}
        per_sample = {}
        for entry in field.split(";"):
            parts = entry.split(":")
            per_sample[parts[0]] = parts[1]
        return per_sample

    # mutation_id looks like "CRUK0010:3:47103798:G"; drop the leading patient id
    mut_id = ":".join(row['mutation_id'].split(":")[1:])
    mut_name = f"{row['Hugo_Symbol']}:{mut_id}"

    # RegionSum column values need to be split by sample
    # (e.g. "SU_FLN1:0/562;SU_LN1:3/616")
    sample_name_to_mut_counts = split_per_sample(row['RegionSum'])

    # Don't include mutations without copy number information. Missing values are anything
    # that is not a string (NaN floats, but also None / pd.NA), none of which can be split.
    major_cn_field = row['MajorCPN_SC']
    minor_cn_field = row['MinorCPN_SC']
    if not isinstance(major_cn_field, str) or not isinstance(minor_cn_field, str):
        yield None
        return
    sample_name_to_major_cn = split_per_sample(major_cn_field)
    sample_name_to_minor_cn = split_per_sample(minor_cn_field)

    for sample_name, mut_count in sample_name_to_mut_counts.items():
        # Lookup sample info in sample_overview dataframe
        patient_sample_name = row['patient_id']+"_"+sample_name
        sample_info = sample_info_df[sample_info_df['region']==patient_sample_name]
        # Samples absent from the overview table (low purity samples) are skipped
        if len(sample_info) == 0:
            yield None
            continue

        # .item() raises if the region matches more than one overview row
        sample_type = sample_info['sampleType'].item()
        sample_type_detailed = sample_info['sampleTypeDetail'].item()+"_"+sample_name

        # Lookup purity and ploidy info
        purity_ploidy_info = purity_ploidy_df[purity_ploidy_df["Sample"]==patient_sample_name]
        if len(purity_ploidy_info) == 0:
            yield None
            continue
        purity = purity_ploidy_info["final.purity"].item()
        ploidy = purity_ploidy_info["final.ploidy"].item()

        # Get ref and var counts; mut_count is "<variant reads>/<total reads>"
        count_parts = mut_count.split("/")
        var = int(count_parts[0])
        total = int(count_parts[1])
        ref = total - var

        # According to TRACERx: muts w/o copy number calls in this dataset are muts that either do not overlap with
        # copy number values (e.g. on sex chromosomes or mutations from regions with low purity), and are therefore
        # not included in clustering and phylogenetic tree building.
        if (sample_name not in sample_name_to_major_cn) or (sample_name not in sample_name_to_minor_cn):
            yield None
            continue
        cn_a_count = sample_name_to_major_cn[sample_name]
        cn_b_count = sample_name_to_minor_cn[sample_name]

        # remove_region_suffix(sample_type_detailed) means samples from multiple regions
        # of the same primary get mapped to the same anatomical site label, e.g. primary.T1.R1
        # and primary.T1.R2 get mapped to primary.T1. This means they get pooled together below as well.
        mig_hist_input = [mut_name, patient_sample_name, remove_region_suffix(sample_type_detailed),
                          ref, var, sample_type_detailed, sample_type, purity, ploidy]

        conipher_input = [row['patient_id'], patient_sample_name, row['chr'].replace('chr', ''), row['start'],
                          row['ref'], row['var'], ref, var, total, cn_a_count, cn_b_count, purity, ploidy]

        yield mig_hist_input, conipher_input

        
def format_mig_hist_df(data):
    '''
    Builds the migration history input dataframe from a list of rows, where each row is
    [character_label, sample_label, anatomical_site_label, ref, var,
    original_anatomical_site_label, sample_type, purity, ploidy].

    Adds '#sample_index', 'character_index' and 'anatomical_site_index' columns, each a
    0-based integer code numbering the corresponding label column's values in order of first
    appearance, and returns the columns in the order needed for the input format.
    An empty `data` list returns an empty dataframe with the same columns.
    '''
    input_columns = ["character_label", "sample_label", "anatomical_site_label", "ref",
                     "var", "original_anatomical_site_label", "sample_type", "purity", "ploidy"]
    # Naming the columns in the constructor keeps empty input from raising a length mismatch
    sample_mut_df = pd.DataFrame(data, columns=input_columns)

    # Add indices for mutations, samples and anatomical sites as needed for input format.
    # pd.factorize numbers values by first appearance in a single pass, which matches
    # looking each label up in the list of unique values row by row.
    index_to_label_column = (
        ('#sample_index', 'sample_label'),
        ('character_index', 'character_label'),
        ('anatomical_site_index', 'anatomical_site_label'),
    )
    for index_column, label_column in index_to_label_column:
        sample_mut_df[index_column] = pd.factorize(sample_mut_df[label_column])[0]

    output_columns = ['#sample_index', "sample_label", "anatomical_site_index", "anatomical_site_label",
                      'character_index', "character_label", "ref", "var",
                      "original_anatomical_site_label", "sample_type", "purity", "ploidy"]
    return sample_mut_df[output_columns]

def format_conipher_df(data):
    '''
    Builds the CONIPHER input dataframe from a list of rows, where each row is
    [CASE_ID, SAMPLE, CHR, POS, REF, ALT, REF_COUNT, VAR_COUNT, DEPTH,
    COPY_NUMBER_A, COPY_NUMBER_B, ACF, PLOIDY].

    Rows containing any missing value are dropped. The original row index is kept.
    An empty `data` list returns an empty dataframe with the same columns.
    '''
    conipher_columns = ['CASE_ID', 'SAMPLE', 'CHR', 'POS', 'REF', 'ALT', 'REF_COUNT', 'VAR_COUNT',
                        'DEPTH', 'COPY_NUMBER_A', 'COPY_NUMBER_B', 'ACF', 'PLOIDY']
    # Naming the columns in the constructor keeps empty input from raising a length mismatch
    sample_mut_df = pd.DataFrame(data, columns=conipher_columns)
    return sample_mut_df.dropna()


NOTE: Redirects are currently not supported in Windows or MacOs.


CUDA GPU: False


### Write all tsvs/json/ssm files

In [6]:
# Make sure the output directories exist before any per-patient file is written
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(CONIPHER_DIR, exist_ok=True)

for patient_id in unique_patients:
    print(patient_id)
    subset = patient_data_all_df[patient_data_all_df['patient_id'] == patient_id]
    print(len(subset))

    mig_hist_data = []
    conipher_data = []
    for _, row in subset.iterrows(): # each row is one mutation
        for data in generate_sample_and_mut_info(row):
            # The generator yields None for every mutation/sample pair that must be skipped
            if data is None:
                continue
            mig_hist_input, conipher_input = data
            mig_hist_data.append(mig_hist_input)
            conipher_data.append(conipher_input)
    mig_hist_df = format_mig_hist_df(mig_hist_data)
    conipher_df = format_conipher_df(conipher_data)
    # Write migration history inputs
    mig_hist_df.to_csv(os.path.join(OUTPUT_DIR, f"{patient_id}_SNVs.tsv"), sep="\t")
    # Write conipher inputs
    conipher_df.to_csv(os.path.join(CONIPHER_DIR, f"{patient_id}_conipher_SNVs.tsv"), sep="\t")
    # Write pairtree inputs
    pt_util.write_pairtree_inputs(mig_hist_df, patient_id, OUTPUT_DIR)

CRUK0010
242
CRUK0013
246
CRUK0284
1331
CRUK0361
1484
CRUK0497
1654
CRUK0452
525
CRUK0472
988
CRUK0484
1333
CRUK0418
11308
CRUK0721
2856
CRUK0702
646
CRUK0794
366
CRUK0004
550
CRUK0027
1157
CRUK0044
577
CRUK0052
1541
CRUK0100
986
CRUK0036
610
CRUK0083
1125
CRUK0178
944
CRUK0286
875
CRUK0256
1249
CRUK0227
785
CRUK0236
884
CRUK0352
2471
CRUK0388
199
CRUK0343
259
CRUK0422
1075
CRUK0485
1376
CRUK0478
1241
CRUK0598
3231
CRUK0590
889
CRUK0543
1273
CRUK0524
224
CRUK0567
1408
CRUK0620
1679
CRUK0625
606
CRUK0691
1517
CRUK0693
3106
CRUK0609
623
CRUK0617
766
CRUK0698
859
CRUK0799
2330
CRUK0766
1943
CRUK0769
1111
CRUK0714
872
CRUK0722
1103
CRUK0719
1046
CRUK0003
491
CRUK0089
1102
CRUK0290
1489
CRUK0468
1033
CRUK0410
522
CRUK0412
372
CRUK0415
537
CRUK0745
713
CRUK0084
1134
CRUK0009
602
CRUK0296
274
CRUK0097
604
CRUK0087
1122
CRUK0245
606
CRUK0074
1330
CRUK0022
367
CRUK0299
894
CRUK0369
2370
CRUK0301
1086
CRUK0337
2584
CRUK0311
1136
CRUK0451
923
CRUK0453
1951
CRUK0537
458
CRUK0530
902
CRUK0730
349
C

### The steps above all operate on individual mutations. **After clustering mutations using PyClone**, we need to prepare clustered inputs for PairTree/Orchard and Metient

#### Take PyClone clusters and create new params.json files with these clusters so PairTree/Orchard can be run on mutation clusters and not individual mutations

In [7]:
from src.util import pairtree_data_extraction_util as pt_util
# Directory holding the per-patient "<patient>_conipher_SNVstreeTable_cleaned.tsv" files read below
PYCLONE_CLUSTERING_DIR = os.path.join(TRACERX_DATA_DIR, 'conipher_outputs', 'TreeBuilding')
clustered_output_dir = os.path.join(OUTPUT_DIR, "pyclone_clustered")
for patient_id in unique_patients:
    tree_table_path = os.path.join(PYCLONE_CLUSTERING_DIR, f"{patient_id}_conipher_SNVstreeTable_cleaned.tsv")
    ssm_path = os.path.join(OUTPUT_DIR, f"{patient_id}.ssm")
    params_json_path = os.path.join(OUTPUT_DIR, f"{patient_id}.params.json")
    pt_util.write_clustered_params_json_from_pyclone_clusters(tree_table_path, ssm_path, params_json_path,
                                                              clustered_output_dir, patient_id)
    

In [7]:
# import numpy as np
# from src.util import pairtree_data_extraction_util as pt_util

# agg_rules = {'ploidy':np.mean,'purity':np.mean,'sample_type':'first', 'original_anatomical_site_label':lambda x: ';'.join(set(x))}

# for patient_id in unique_patients:
#     pt_util.write_pooled_tsv_from_pairtree_clusters(os.path.join(OUTPUT_DIR, f"{patient_id}_SNVS.tsv"), 
#                                                     os.path.join(OUTPUT_DIR, f"{patient_id}.ssm"),
#                                                     os.path.join(OUTPUT_DIR, f"{patient_id}_clustered.params.json"),
#                                                     agg_rules, os.path.join(OUTPUT_DIR, "pairtree_clustered"), patient_id)
    

FileNotFoundError: [Errno 2] No such file or directory: '/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/jupyter_notebooks/../../src/data/tracerx_nsclc/patient_data/CRUK0010_clustered.params.json'

#### Take PyClone clusters and create pooled tsvs

In [8]:
import numpy as np
from src.util import pairtree_data_extraction_util as pt_util
from src.util import data_extraction_util as dutil

# Rules passed to the pooling helper for the non-count columns. The anatomical site labels are
# de-duplicated and joined in sorted order so the output is identical from run to run
# (joining a bare set gives an arbitrary order that can change between kernel sessions).
agg_rules = {'ploidy':np.mean,'purity':np.mean,'sample_type':'first', 'original_anatomical_site_label':lambda x: ';'.join(sorted(set(x)))}
for patient_id in unique_patients:
    # The per-mutation tsv is written earlier as "<patient>_SNVs.tsv"; the name must match
    # exactly, including case, or this fails on case-sensitive filesystems.
    dutil.write_pooled_tsv_from_pyclone_clusters(os.path.join(OUTPUT_DIR, f"{patient_id}_SNVs.tsv"),
                                                 os.path.join(PYCLONE_CLUSTERING_DIR, f"{patient_id}_conipher_SNVstreeTable_cleaned.tsv"),
                                                 agg_rules, os.path.join(OUTPUT_DIR, "pyclone_clustered"),  patient_id )