In [1]:
import pandas as pd
import os
import importlib

from src.util import pairtree_data_extraction_util as pt_util

# Process Gundem et.al. neuroblastoma dataset with putative oncogenic SNVs from WGS data
REPO_DIR = os.path.join(os.getcwd(), "../../")
os.chdir(REPO_DIR)

GUNDEM_DATA_DIR = os.path.join(REPO_DIR, 'src/data/gundem_neuroblastoma_2023')
OUTPUT_DIR = os.path.join(GUNDEM_DATA_DIR, 'patient_driver_genes')

tumor_info = pd.read_excel(os.path.join(GUNDEM_DATA_DIR, "gundem_neuroblastoma_supp_tables.xlsx"), sheet_name=1)
multimets_patient_info = pd.read_excel(os.path.join(GUNDEM_DATA_DIR, "gundem_neuroblastoma_supp_tables.xlsx"), sheet_name=3)

# Possible values are (1) MSK-IMPACT (2) RNA-seq (3)WGS
wgs_samples = tumor_info[tumor_info['SEQEUNCING_TYPE']=="WGS"]["DNA_SAMPLE"]
wgs_samples


0         E-H-131201-T1-1-D1-1
1         E-H-131202-T1-1-D1-1
2         E-H-131203-T1-1-D1-1
3         E-H-131204-T1-1-D1-1
4         E-H-131205-T1-1-D1-1
                ...           
242    IID_H201574_T01_01_WG01
243    IID_H201574_T02_01_WG01
244    IID_H201601_T01_01_WG01
245    IID_H201601_T02_01_WG01
246    IID_H201603_T01_01_WG01
Name: DNA_SAMPLE, Length: 247, dtype: object

In [2]:
variant_data = pd.read_excel(os.path.join(GUNDEM_DATA_DIR, "gundem_neuroblastoma_supp_tables.xlsx"), sheet_name=2)

tumor_type_short_map = {
    "frelapse":"FR",
    "diagnosis":"D",
    "relapse":"R",
    "t-resection":"TR",
    "reresection":"RR"
}
# Only keep WGS samples
variant_data = variant_data[variant_data["DNA_SAMPLE"].isin(wgs_samples)]
print(len(variant_data))
# Add tumor info for each sample (anatomical site, primary vs. local met vs. distant met)
def add_tumor_info(row, key):
    return tumor_info[tumor_info["DNA_SAMPLE"] == row["DNA_SAMPLE"]][key].item()
def add_site_info(row, site_key, type_key):
    site = tumor_info[tumor_info["DNA_SAMPLE"] == row["DNA_SAMPLE"]][site_key].item()
    tumor_type = tumor_info[tumor_info["DNA_SAMPLE"] == row["DNA_SAMPLE"]][type_key].item()
    return f"{site} - {tumor_type_short_map[tumor_type]}"

variant_data['tumor_type'] = variant_data.apply(lambda row: add_tumor_info(row, "TUMOR_SITE_CATEGORY"), axis=1)
variant_data['anatomical_site_label'] = variant_data.apply(lambda row: add_site_info(row, "TUMOR_SITE_DETAIL", "SAMPLE_TYPE"), axis=1)
variant_data['anatomical_site_broad'] = variant_data.apply(lambda row: add_tumor_info(row, "TUMOR_SITE"), axis=1)

# additional useful patient data
variant_data['sampling_time_months_from_dx'] = variant_data.apply(lambda row: add_tumor_info(row, "SAMPLING_TIME_MONTHS_FROM_DX"), axis=1)
variant_data['sample_type'] = variant_data.apply(lambda row: add_tumor_info(row, "SAMPLE_TYPE"), axis=1)
variant_data['purity'] = variant_data.apply(lambda row: add_tumor_info(row, "TUMOR_PURITY"), axis=1)
variant_data['ploidy'] = variant_data.apply(lambda row: add_tumor_info(row, "TUMOR_PLOIDY"), axis=1)

variant_data = variant_data.dropna(subset=['VAF', 'DEPTH']) # drop any NaNs
variant_data

1690


Unnamed: 0,DNA_SAMPLE,PATIENT,PATIENT_DISEASE_SUBTYPE,SAMPLE_TYPE,VARIANT_TYPE,VARIANT_EFFECT,VARIANT_LOCUS,VARIANT_CHANGE,CHR1,START1,...,SUBCLONE_TYPE,IS_DRIVER,PATHWAY,tumor_type,anatomical_site_label,anatomical_site_broad,sampling_time_months_from_dx,sample_type,purity,ploidy
14,E-H-131203-T1-1-D1-1,H131203,ATRX,diagnosis,SNV,non_synonymous_codon,PTPRT,p.X1398,20,40710659,...,private_met_clonal,clonality,,primary_disease,retroperitoneal - D,retroperitoneal,0.000,diagnosis,0.870,2.800
15,E-H-131203-T1-1-D1-1,H131203,ATRX,diagnosis,SNV,non_synonymous_codon,ARID1A,p.G931V,1,27092771,...,private_met_clonal,clonality,CHROMATIN_MODELING,primary_disease,retroperitoneal - D,retroperitoneal,0.000,diagnosis,0.870,2.800
16,E-H-131203-T1-1-D1-1,H131203,ATRX,diagnosis,Del,inframe_codon_loss,SMARCA4,p.F1234_K1237del,19,11144117,...,shared_met_clonal,yes,CHROMATIN_MODELING,primary_disease,retroperitoneal - D,retroperitoneal,0.000,diagnosis,0.870,2.800
17,E-H-131203-T1-1-D1-1,H131203,ATRX,diagnosis,SNV,non_synonymous_codon,ATRX,p.A1690D,X,76888760,...,trunk,yes,,primary_disease,retroperitoneal - D,retroperitoneal,0.000,diagnosis,0.870,2.800
18,E-H-131203-T1-1-D1-1,H131203,ATRX,diagnosis,SNV,non_synonymous_codon,FAT1,p.D2931N,4,187538949,...,trunk,clonality,,primary_disease,retroperitoneal - D,retroperitoneal,0.000,diagnosis,0.870,2.800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1836,I-H-135463-T1-2-D1-1,H135463,ATRX,frelapse,Del,frameshift_variant,KMT2D,p.P4314Sfs*19,12,49425546,...,private_met_clonal,yes,,local_disease,aortocaval lymph node - R,retroperitoneal,12.867,relapse,0.340,2.550
1837,I-H-135463-T1-2-D1-1,H135463,ATRX,frelapse,SNV,non_synonymous_codon,EPHA5,p.R356S,4,66356429,...,private_met_subclonal,clonality,,local_disease,aortocaval lymph node - R,retroperitoneal,12.867,relapse,0.340,2.550
1838,I-H-135463-T1-2-D1-1,H135463,ATRX,frelapse,SNV,non_synonymous_codon,LATS1,p.K702N,6,150001498,...,private_met_clonal,clonality,,local_disease,aortocaval lymph node - R,retroperitoneal,12.867,relapse,0.340,2.550
1839,I-H-135463-T1-2-D1-1,H135463,ATRX,frelapse,SNV,non_synonymous_codon,ALK,p.F1174S,2,29443696,...,private_met_subclonal,yes,RAS_MAPK,local_disease,aortocaval lymph node - R,retroperitoneal,12.867,relapse,0.340,2.550


In [3]:
variant_data["sample_type"].value_counts()

frelapse       218
diagnosis       77
relapse         74
t-resection     49
reresection      8
Name: sample_type, dtype: int64

## Find patients with at least one met and primary tumor sample

In [4]:
cns_met_patients = set(tumor_info[tumor_info["TUMOR_SITE"] == "CNS"]["PATIENT"].unique())
patients_with_primary_samples = set(variant_data[variant_data["tumor_type"] == "primary_disease"]["PATIENT"].unique())

cns_met_patients = list(cns_met_patients.intersection(patients_with_primary_samples))
print(f"{len(cns_met_patients)} patients with primaries and CNS mets")

distant_met_patients = set(variant_data[(variant_data["tumor_type"] == "distant_metastasis")]["PATIENT"])
distant_met_patients = list(distant_met_patients.intersection(patients_with_primary_samples))
print(f"{len(distant_met_patients)} patients with primaries and distant mets")

met_patients = set(variant_data[(variant_data["tumor_type"] == "distant_metastasis") | (variant_data["tumor_type"] == "local_disease")]["PATIENT"])
met_patients = list(met_patients.intersection(patients_with_primary_samples))

print(f"{len(met_patients)} patients with primaries and mets (local+distant)")
met_patients

7 patients with primaries and CNS mets
10 patients with primaries and distant mets
19 patients with primaries and mets (local+distant)


['H132374',
 'H134819',
 'H133120',
 'H116984',
 'H132387',
 'H132384',
 'H132380',
 'H134821',
 'H132392',
 'H132382',
 'H133121',
 'H132379',
 'H132388',
 'H132396',
 'H103207',
 'H116988',
 'H118706',
 'H112909',
 'H116991']

## Prep data for input into PairTree, Metient and MACHINA

In [5]:
def preprocess_patient_data(patient_id):
    '''
    Output a tsv for each patient with columns: ['#sample_index', 'sample_label', 'anatomical_site_index',
    'anatomical_site_label', 'character_index', 'character_label', 'ref', 'var']
    '''
    
    def label_snv(row):
        label = []
        label += [str(row['VARIANT_LOCUS']), str(row['VARIANT_CHANGE']),str(row['VARIANT_EFFECT']),f"{str(row['CHR1'])}:{str(row['START1'])}"]
        return ("_").join(label)

    patient_data = variant_data[(variant_data["PATIENT"]==patient_id)]
    if patient_data.empty:
        print(f"No variant data for {patient_id}")
        return
        
    patient_data['character_label'] = patient_data.apply(lambda row: label_snv(row), axis=1)
    patient_data['var'] = round(patient_data["VAF"]*patient_data["DEPTH"])
    patient_data['ref'] = patient_data["DEPTH"] - patient_data['var']

    # Find the intersection of variants captured in the data by each sample
    all_samples_variants = []
    for sample in patient_data["DNA_SAMPLE"].unique():   
        subset = patient_data[patient_data["DNA_SAMPLE"]==sample]
        variants = subset["character_label"]
        all_samples_variants.append(set(variants))
    intersecting_variants = list(set.intersection(*all_samples_variants))

    patient_data = patient_data[patient_data["character_label"].isin(intersecting_variants)]

    patient_data = patient_data.rename(columns={"DNA_SAMPLE":"sample_label"})
    num_variants = len(patient_data["character_label"].unique())
    if  (num_variants <= 1):
        print(f"Skipping patient {patient_id} with {num_variants} variant")
        return
    # Add indices for mutations, samples and anatomical sites as needed for input format
    patient_data['#sample_index'] = patient_data.apply(lambda row: list(patient_data['sample_label'].unique()).index(row["sample_label"]), axis=1)    

    # Some of these patients have multiple samples w/ same anatomical site label
    if len(patient_data['anatomical_site_label'].unique()) != len(patient_data['sample_label'].unique()):
        patient_data['anatomical_site_label'] = patient_data.apply(lambda row: row['anatomical_site_label'] + f" {row['#sample_index']}", axis=1)
    
    patient_data['character_index'] = patient_data.apply(lambda row: list(patient_data['character_label'].unique()).index(row["character_label"]), axis=1)
    patient_data['anatomical_site_index'] = patient_data.apply(lambda row: list(patient_data['anatomical_site_label'].unique()).index(row["anatomical_site_label"]), axis=1)
    
    patient_data = patient_data[['#sample_index', "sample_label", "anatomical_site_index", "anatomical_site_label", 
                                 'character_index', "character_label", "ref","var", "tumor_type", "anatomical_site_broad", 
                                 "sample_type", "purity", "ploidy", "sampling_time_months_from_dx"]]

    patient_data.to_csv(os.path.join(OUTPUT_DIR, f"{patient_id}_SNVs.tsv"), sep="\t")
    
    pt_util.write_pairtree_inputs(patient_data, patient_id, OUTPUT_DIR)
    

In [6]:
for patient_id in met_patients:
    output_name = patient_id
    print(patient_id)
    preprocess_patient_data(patient_id)

H132374
H134819
H133120
H116984
H132387
H132384
Skipping patient H132384 with 1 variant
H132380
Skipping patient H132380 with 1 variant
H134821
H132392
Skipping patient H132392 with 1 variant
H132382
Skipping patient H132382 with 1 variant
H133121
H132379
Skipping patient H132379 with 1 variant
H132388
H132396
Skipping patient H132396 with 1 variant
H103207
H116988
Skipping patient H116988 with 1 variant
H118706
H112909
Skipping patient H112909 with 1 variant
H116991


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Above is all on individual mutations. We can also cluster mutations by running PairTree's clustervars, and then pooling together those clustered mutations

In [11]:
import glob
import numpy as np
import json


def get_cluster_id(mut_id, clusters):
    '''
    Given mut_id = 'm1' and clusters= [['m1'], ['m0', 'm2']], returns 1 
    '''
    for i, cluster in enumerate(clusters):
        if mut_id in cluster:
            return i
    
    print(f"mut id {mut_id} not found")
    return None

for patient_id in met_patients:
    patient_tsv = os.path.join(OUTPUT_DIR, f"{patient_id}_SNVs.tsv)
    print(patient_id)
    df = pd.read_csv(patient_tsv, delimiter="\t")
    
    # 1. Get mapping between mutation names and pairtree mutation_ids
    mut_name_to_id = dict()
    with open(os.path.join(OUTPUT_DIR, f"{patient_id}.ssm")) as f:
        for i, line in enumerate(f):
            if i == 0: continue
            items = line.split("\t")
            if items[1] not in mut_name_to_id:
                mut_name_to_id[items[1]] = items[0]
                
    print(mut_name_to_id)
    # 2. Get pairtree cluster assignments
    with open(os.path.join(OUTPUT_DIR, f"{patient_id}_clustered.params.json")) as f:
        cluster_json = json.loads(f.read())
    cluster_assignments = []
    for mut_name in df['character_label']:
        cluster_assignments.append(get_cluster_id(mut_name_to_id[mut_name], cluster_json['clusters']))
    df['cluster'] = cluster_assignments
    
    # 3. Pool reference and variant allele counts from all mutations within a cluster
    pooled_df = df.drop(['character_label', 'character_index', '#sample_index', 'anatomical_site_index'], axis=1)
    # We can use "first" aggregation rules for the string columns since it is information shared by a sample
    pooled_df = pooled_df.groupby(['cluster', 'sample_label'], as_index=False).agg({'ref': np.sum, 'var': np.sum, 
                                                                                    'purity': np.mean, 'ploidy':np.mean, 
                                                                                    'sampling_time_months_from_dx':np.mean, 
                                                                                    'anatomical_site_label':'first', 
                                                                                    'sample_type':'first', 'tumor_type':'first',
                                                                                    'anatomical_site_broad':'first'})
    # 4. Add new names for clustered mutations
    mut_id_to_name = {v:k for k,v in mut_name_to_id.items()}
    cluster_id_to_cluster_name = dict()
    for i, cluster in enumerate(cluster_json['clusters']):
        cluster_comps = []
        for mut in cluster:
            cluster_comps.append(mut_id_to_name[mut])
        cluster_id_to_cluster_name[i] = ";".join(cluster_comps)
    print(cluster_id_to_cluster_name)
    pooled_df['character_label'] = pooled_df.apply(lambda row: cluster_id_to_cluster_name[row['cluster']], axis=1)
    
    # Add indices for mutations, samples and anatomical sites as needed for input format
    pooled_df['character_index'] = pooled_df.apply(lambda row: list(pooled_df['character_label'].unique()).index(row["character_label"]), axis=1)
    pooled_df['anatomical_site_index'] = pooled_df.apply(lambda row: list(pooled_df['anatomical_site_label'].unique()).index(row["anatomical_site_label"]), axis=1)
    pooled_df['#sample_index'] = pooled_df.apply(lambda row: list(pooled_df['sample_label'].unique()).index(row["sample_label"]), axis=1)    
    
    pooled_df = pooled_df[['#sample_index', "sample_label", "anatomical_site_index", "anatomical_site_label", 
                             'character_index', "character_label", "ref","var", "tumor_type", "anatomical_site_broad", 
                             "sample_type", "purity", "ploidy", "sampling_time_months_from_dx"]]
    pooled_df.to_csv(os.path.join(OUTPUT_DIR, f"{patient_id}_clustered_SNVs.tsv"), sep="\t")
    

['/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/jupyter_notebooks/../../src/data/gundem_neuroblastoma_2023/patient_driver_genes/H133120_clustered_SNVs.tsv', '/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/jupyter_notebooks/../../src/data/gundem_neuroblastoma_2023/patient_driver_genes/H134821_SNVs.tsv', '/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/jupyter_notebooks/../../src/data/gundem_neuroblastoma_2023/patient_driver_genes/H116984_SNVs.tsv', '/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/jupyter_notebooks/../../src/data/gundem_neuroblastoma_2023/patient_driver_genes/H132388_clustered_SNVs.tsv', '/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/jupyter_notebooks/../../src/data/gundem_neuroblastoma_2023/patient_driver_genes/H132388_SNVs.tsv', '/Users/divyakoyyalagunta/Desktop/Corne

KeyError: 'ATRX_p.R250*_stop_gained_X:76940000;ALK_p.R1275Q_non_synonymous_codon_2:29432664'