In [1]:
'''
“Timing of divergence was performed relative to the last clonal sweep in the primary tumour. 
A summary of how individual mutation clusters were defined as clonal, subclonal and absent in 
individual tumour regions can be found in our accompanying Article17. Briefly, clusters that were 
clonal in all regions of interest (i.e. all primary regions, or all metastatic samples) were 
defined as clonal within the primary or metastases, respectively. Clusters that were subclonal 
or absent from at least one region of interest were defined as subclonal, while clusters that 
were absent from all regions of interest were defined as absent at the tumour level. The total 
number of mutations associated only to clusters defined as clonal across all primary tumour 
regions was calculated. For each metastatic sample, the total number and proportion of primary-clonal 
mutations that were also clonal in the metastasis was computed. If this proportion was less than one, 
meaning that not all primary-clonal mutations were defined as clonal in the metastatic sample, the 
metastasis was classified as early diverging. By contrast, if all primary-clonal mutations were clonal 
within the metastasis, the metastasis was defined as late diverging.”
'''

import os
import glob
import pandas as pd

repo_dir = '/lila/data/morrisq/divyak/projects/metient/metient/'
metient_conipher_output_dir = os.path.join(repo_dir,'data', 'tracerx_nsclc', 'conipher_outputs', 'TreeBuilding')

sample_info_df = pd.read_csv(os.path.join(repo_dir,'data', 'tracerx_nsclc',"sample_overview_original.txt"), sep="\t")
sample_info_df

Unnamed: 0,patient_id,tumour_id,region,sampleType,sampleTypeDetail
0,CRUK0010,CRUK0010,CRUK0010_SU_T1.R1,primary,primary
1,CRUK0010,CRUK0010,CRUK0010_SU_T1.R2,primary,primary
2,CRUK0010,CRUK0010,CRUK0010_SU_FLN1,metastasis,LN
3,CRUK0010,CRUK0010,CRUK0010_BR_LN1,metastasis,metachronousMet
4,CRUK0010,CRUK0010,CRUK0010_BR_LN2,metastasis,metachronousMet
...,...,...,...,...,...
689,CRUK0872,CRUK0872,CRUK0872_SU_T1.R1,primary,primary
690,CRUK0872,CRUK0872,CRUK0872_SU_T1.R2,primary,primary
691,CRUK0872,CRUK0872,CRUK0872_SU_T1.R3,primary,primary
692,CRUK0872,CRUK0872,CRUK0872_SU_T1.R4,primary,primary


In [4]:
# How many patients have a LN metastasis
num_patients_with_LN = sample_info_df[sample_info_df['sampleTypeDetail'] == 'LN']['patient_id'].nunique()

print(f"Number of patients with sampleTypeDetail 'LN': {num_patients_with_LN}")

Number of patients with sampleTypeDetail 'LN': 96


In [47]:
def get_divergence_timing(clonality_df): 
    primary_regions = set(clonality_df[clonality_df['sample type']=='primary']['sample name'])
    primary_clonal_mutations = clonality_df[(clonality_df['sample type']=='primary')&(clonality_df['clonality']=='clonal')]
    primary_clonal_mutations = primary_clonal_mutations.groupby('tree cluster').size().reset_index(name='count')
    # Only keep mutations that are clonal in all primary regions
    primary_clonal_mutations = set(primary_clonal_mutations[primary_clonal_mutations['count'] == len(primary_regions)]['tree cluster'])
    primary_clonal_mutations

    # Find if any of the metastatic samples is not clonal in one of the primary clonal mutations
    metastatic_clonality_df = clonality_df[(clonality_df['sample type']=='metastasis')&(clonality_df['tree cluster'].isin(primary_clonal_mutations))]
    if set(metastatic_clonality_df['clonality']) != set(['clonal']):
        return "early"
    return "late"
    
matching_files = glob.glob(f'{metient_conipher_output_dir}/*_SNVstreeTable_cleaned.tsv')
patients = [m.split("/")[-1].split("_")[0] for m in matching_files]
print(len(matching_files))
patients_and_divergence = []

for pid, fn in zip(patients, matching_files):
    data = []
    tree_info_df = pd.read_csv(fn, sep="\t")
    orig_clust_to_tree_clust = tree_info_df.set_index('originalCLUSTER')['treeCLUSTER'].to_dict()
    if len(orig_clust_to_tree_clust.values()) != len(set(orig_clust_to_tree_clust.values())):
        print(f"Multiple original clusters map to the same tree cluster for {pid}:\n",orig_clust_to_tree_clust)
    cluster_info_fn = os.path.join(metient_conipher_output_dir, f"{pid}_conipher_SNVsclusterInfo.txt")
    clust_info_df = pd.read_csv(cluster_info_fn, sep="\t")
    for i, row in clust_info_df.iterrows():
        if row['clusterID'] in orig_clust_to_tree_clust:
            tree_clust = orig_clust_to_tree_clust[row['clusterID']]
            sample_type = sample_info_df[sample_info_df['region']==row['SAMPLE']]['sampleType'].item()
            data.append([tree_clust, row['SAMPLE'], sample_type, row['clonality']])
    clonality_df = pd.DataFrame(data, columns=['tree cluster', 'sample name', 'sample type', 'clonality'])
    divergence_timing = get_divergence_timing(clonality_df)
    patients_and_divergence.append([pid, divergence_timing])
patient_to_divergence_df = pd.DataFrame(patients_and_divergence, columns=['Patient', 'Divergence time'])
patient_to_divergence_df.to_csv("./output_plots/tracerx_divergence_timing.tsv", sep="\t")
patient_to_divergence_df

126


Unnamed: 0,Patient,Divergence time
0,CRUK0472,early
1,CRUK0052,early
2,CRUK0311,early
3,CRUK0444,late
4,CRUK0036,late
...,...,...
121,CRUK0484,late
122,CRUK0495,late
123,CRUK0497,late
124,CRUK0286,late


In [36]:
clonality_df[clonality_df['tree cluster'].isin([0,13])]

Unnamed: 0,tree cluster,sample name,sample type,clonality
0,0,CRUK0472_BR_T1.R1,metastasis,clonal
1,0,CRUK0472_SU_FLN1,metastasis,clonal
2,0,CRUK0472_SU_T1.R1,primary,clonal
3,0,CRUK0472_SU_T1.R2,primary,clonal
4,0,CRUK0472_SU_T1.R3,primary,clonal
65,13,CRUK0472_BR_T1.R1,metastasis,absent
66,13,CRUK0472_SU_FLN1,metastasis,clonal
67,13,CRUK0472_SU_T1.R1,primary,clonal
68,13,CRUK0472_SU_T1.R2,primary,clonal
69,13,CRUK0472_SU_T1.R3,primary,clonal


In [40]:
# def get_divergence_timing(clonality_df): 
primary_regions = set(clonality_df[clonality_df['sample type']=='primary']['sample name'])
primary_clonal_mutations = clonality_df[(clonality_df['sample type']=='primary')&(clonality_df['clonality']=='clonal')]
primary_clonal_mutations = primary_clonal_mutations.groupby('tree cluster').size().reset_index(name='count')
# Only keep mutations that are clonal in all primary regions
primary_clonal_mutations = set(primary_clonal_mutations[primary_clonal_mutations['count'] == len(primary_regions)]['tree cluster'])
primary_clonal_mutations

# Find if any of the metastatic samples is not clonal in one of the primary clonal mutations
metastatic_clonality_df = clonality_df[(clonality_df['sample type']=='metastasis')&(clonality_df['tree cluster'].isin(primary_clonal_mutations))]
if set(metastatic_clonality_df['clonality']) != set(['clonal']):
    print("early")
    metastatic_clonality_df['']
    

early


Unnamed: 0,tree cluster,sample name,sample type,clonality
0,0,CRUK0472_BR_T1.R1,metastasis,clonal
1,0,CRUK0472_SU_FLN1,metastasis,clonal
65,13,CRUK0472_BR_T1.R1,metastasis,absent
66,13,CRUK0472_SU_FLN1,metastasis,clonal
