In [1]:
#written by Noah Friedman (a template for scripts to be executed in the spyder environment)
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt'

sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util

**FUNCTIONS**

In [2]:
#CALCULATES THE DIFFERENTIAL EXPECTED oncogenic mut burden as if each signature were dominant alone
def calculate_oncogenic_mut_susceptibility_of_genes_by_signature(oncogenicSDict):
    """Tabulate the expected oncogenic value for each of the 30 signatures taken alone.

    For every signature name 'Signature.1' .. 'Signature.30' a decomposition is
    built in which that signature has weight 1 and the other 29 have weight 0.
    The decomposition is turned into quadnuc fractions, and those fractions are
    scored against ``oncogenicSDict`` for the 'IMPACT_468' panel, both via
    ``mutation_modeling_util``.

    Parameters
    ----------
    oncogenicSDict
        Passed through unchanged to
        ``mutation_modeling_util.get_expected_oncogenic_val_given_quadnuc_fractions``.

    Returns
    -------
    pandas.DataFrame
        One row per signature with columns 'Signature_Name' and
        'ExpectedFracOfMutsOncogenic'.
    """
    # The spectra file location is the same for every signature; it is built from the module-level pathPrefix.
    spectraFile = pathPrefix + '/ifs/work/taylorlab/friedman/noahFirstProject/signature_sig_copy/mutation-signatures/Stratton_signatures30.txt'
    signatureLabels = ['Signature.' + str(n) for n in range(1, 31)]
    records = []
    for dominant in signatureLabels:
        # Hypothetical case whose decomposition is 100% the current signature.
        pureDecomposition = {label: (1 if label == dominant else 0) for label in signatureLabels}
        fractions = mutation_modeling_util.get_quadnuc_fracs_given_decomposition(pureDecomposition, spectraPath = spectraFile)
        expectedVal = mutation_modeling_util.get_expected_oncogenic_val_given_quadnuc_fractions(fractions, oncogenicSDict, 'IMPACT_468')
        records.append({'Signature_Name': dominant, 'ExpectedFracOfMutsOncogenic': expectedVal})
    return pd.DataFrame(records)

In [3]:
def expand_data_for_plot(infoDict, n=1250):
    """Expand a per-signature rate dict into a long-format table for plotting.

    For every integer mutation count ``i`` in ``1 .. n-1`` and every
    ``(key, value)`` pair of ``infoDict``, one row is produced.

    Parameters
    ----------
    infoDict : dict
        Maps a signature label to a numeric per-mutation value.
    n : int, optional
        Exclusive upper bound on the mutation counts that are generated.

    Returns
    -------
    pandas.DataFrame
        Columns 'Signature' (the key), 'Nmut_Expected' (``i * value``) and
        'Nmut_Mb' (``i`` rescaled to mutations per megabase). The frame is
        empty when ``n <= 1`` or ``infoDict`` is empty.
    """
    listOfDicts = []
    for i in range(1, n):
        # Progress indicator every 50 counts. The parenthesised single-argument
        # form prints identically on Python 2 and is valid syntax on Python 3.
        if i % 50 == 0:
            print(i)
        # NOTE(review): 1139322 is presumably the IMPACT panel size in bases (variable is named IM6) -- confirm.
        nmut_mbIM6 = (i*1000000.0)/1139322
        for key, value in infoDict.items():
            listOfDicts.append({'Signature': key, 'Nmut_Expected': i*value, 'Nmut_Mb': nmut_mbIM6})
    return pd.DataFrame(listOfDicts)

In [4]:
def get_indel_frac_by_dominant_signature(sigs, maf):
    """Compute, per dominant signature, the fraction of mutations that are indels.

    Parameters
    ----------
    sigs : pandas.DataFrame
        Needs columns 'Tumor_Sample_Barcode' and 'dominantSig'; dominant
        signatures are matched against the labels 'mean_1' .. 'mean_30'.
    maf : pandas.DataFrame
        Needs columns 'Tumor_Sample_Barcode' and 'Variant_Type'.

    Returns
    -------
    dict
        Maps 'Signature.<i>' to the fraction of that signature's mutations whose
        'Variant_Type' is 'INS' or 'DEL'. Signatures with no mutations in
        ``maf`` are omitted rather than raising ZeroDivisionError, so callers
        should treat a missing key as "no data".
    """
    d = {}
    for i in range(1, 31):
        # Samples whose dominant signature is signature i, then all of their mutations.
        sigIds = set(sigs[sigs['dominantSig'] == 'mean_' + str(i)]['Tumor_Sample_Barcode'])
        sigMuts = maf[maf['Tumor_Sample_Barcode'].isin(sigIds)]
        nMuts = sigMuts.shape[0]
        if nMuts == 0:
            # No cases/mutations for this signature: the fraction is undefined, so leave the key out.
            continue
        isIndel = sigMuts['Variant_Type'].isin(['INS', 'DEL'])
        # int(...) keeps the result a plain Python float, as before.
        d['Signature.' + str(i)] = 1.0*int(isIndel.sum())/nMuts
    return d

In [130]:
def calculate_oncogenicity_including_indel(row, indelFracDict):
    """Blend a signature's non-indel oncogenic fraction with a fixed indel oncogenicity.

    Parameters
    ----------
    row : mapping
        Must provide 'Signature_Name' and 'ExpectedFracOfMutsOncogenic'.
    indelFracDict : dict
        Maps signature name to the fraction of its mutations that are indels;
        0.156 is used for signatures that are not present.

    Returns
    -------
    float
        ``(1 - indelFrac) * ExpectedFracOfMutsOncogenic + indelFrac * indelOncogenicity``
        where indelOncogenicity is .4856 for the listed MMR signatures and
        .3497 otherwise.
    """
    sigName = row['Signature_Name']
    mismatchRepairSigs = set(['Signature.6', 'Signature.15', 'Signature.16', 'Signature.20', 'Signature.21', 'Signature.26'])
    # MMR signatures get a different fixed indel oncogenicity from all other signatures.
    oncogenicityOfIndel = .4856 if sigName in mismatchRepairSigs else .3497
    fracIndel = indelFracDict.get(sigName, 0.156)
    nonIndelPart = (1 - fracIndel)*row['ExpectedFracOfMutsOncogenic']
    return nonIndelPart + fracIndel*oncogenicityOfIndel

In [146]:
#reload(mutation_modeling_util)
# Re-import maf_analysis_utils so that edits made to the module file are picked up by the running kernel.
# NOTE: `reload` is a builtin on Python 2 only; this notebook is Python 2 (it uses print statements).
reload(maf_analysis_utils)

<module 'maf_analysis_utils' from '/Users/friedman/Desktop/mnt/ifs/work/taylorlab/friedman/myUtils/maf_analysis_utils.py'>

**WORK AREA**

In [5]:
# Load the annotated cohort MAF via the project helper, which reports progress while reading.
# NOTE(review): the captured progress output runs to 101.82 percent, so nLinesFile = 275000 underestimates the file's line count.
allImpactMuts = analysis_utils. load_in_df_with_progress(filePath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', nLinesFile = 275000)


  exec(code_obj, self.user_global_ns, self.user_ns)


3.64 percent done
7.27 percent done
10.91 percent done
14.55 percent done
18.18 percent done
21.82 percent done
25.45 percent done
29.09 percent done
32.73 percent done
36.36 percent done
40.0 percent done
43.64 percent done
47.27 percent done
50.91 percent done
54.55 percent done
58.18 percent done
61.82 percent done
65.45 percent done
69.09 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


72.73 percent done
76.36 percent done
80.0 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


83.64 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


87.27 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


90.91 percent done
94.55 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


98.18 percent done
101.82 percent done
performing big concat then returning


In [5]:
# Boolean flag: True exactly where the Consequence annotation equals 'frameshift_variant'.
# The vectorised comparison replaces a per-row lambda and yields the same True/False values
# (missing values compare unequal and so are False).
allImpactMuts['isFrameShiftIndel'] = allImpactMuts['Consequence'] == 'frameshift_variant'


**LOAD in data about hypothetical mutability**

In [131]:
# Read the simulated-mutation summary table (tab-separated) from the project data directory.
simOncogenicitySummary = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/simulatedMutationSummary.tsv')
# Derive the quadnuc-based oncogenic susceptibility dict from that summary via the project helper.
# NOTE(review): the same file is read and the same dict rebuilt again further down the notebook (simDfSummary).
oncogenicSusceptibilityDict = mutation_modeling_util.calculate_quadnuc_based_oncogenic_susceptibility_dict(simOncogenicitySummary)


In [8]:
# One row per signature: the expected oncogenic value when that signature alone makes up the decomposition.
oncogenicFracDf = calculate_oncogenic_mut_susceptibility_of_genes_by_signature(oncogenicSusceptibilityDict)
# Computed by the project helper from the IMPACT repeat-region stats file.
# NOTE(review): the return type/meaning is defined in mutation_modeling_util -- confirm before relying on it.
mmrIndelOncogenicity = mutation_modeling_util.calculate_pan_impact_likelihood_of_oncogenic_mmr_indel(repeatRegionInfo = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/IMPACT_repeat_stats.txt')
# Scalar ExpectedFracOfMutsOncogenic taken from the first row whose Signature_Name is 'Signature.6'.
mmr6OncSusceptibility = float(oncogenicFracDf[oncogenicFracDf['Signature_Name'] == 'Signature.6']['ExpectedFracOfMutsOncogenic'].iloc[0])


**TEMP WORK AREA**

In [11]:
# Load the IMPACT repeat-region stats table directly for inspection
# (the same file path is handed to calculate_pan_impact_likelihood_of_oncogenic_mmr_indel elsewhere in this notebook).
repeatRegionInfo = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/IMPACT_repeat_stats.txt')

In [153]:
# Re-import maf_analysis_utils to pick up edits to the module file (Python 2 builtin `reload`).
reload(maf_analysis_utils)

<module 'maf_analysis_utils' from '/Users/friedman/Desktop/mnt/ifs/work/taylorlab/friedman/myUtils/maf_analysis_utils.py'>

In [155]:
# Load cancer type information from the clinical sample file via the project helper.
# cDict is used below as a lookup (membership test + indexing) keyed by the first 9 characters of Tumor_Sample_Barcode.
cDict = analysis_utils.get_cancer_type_information(cancerTypeDfPath = pathPrefix +'/ifs/work/taylorlab/friedman/mskImpactAsOfMarch2019/dmp/mskimpact/data_clinical_sample.txt')

In [160]:
# Lookup key: the first 9 characters of each sample barcode
# (presumably the patient ID -- confirm against the barcode format).
allImpactMuts['pid'] = allImpactMuts['Tumor_Sample_Barcode'].apply(lambda x: x[:9])
# Attach the cancer type for each key; keys that are not in cDict get None.
allImpactMuts['cancer_type'] = allImpactMuts['pid'].apply(lambda x: cDict[x] if x in cDict else None)

In [226]:
# Re-import the helper module so the latest on-disk version of summarize_oncogenic_mutation_info is used.
reload(maf_analysis_utils)
# Summarise oncogenic mutation info for the cohort MAF.
# The helper's path prefix is derived from the notebook-level pathPrefix (plus the trailing slash that was
# previously hard-coded here), so the mount location is configured in exactly one place.
df = maf_analysis_utils.summarize_oncogenic_mutation_info(allImpactMuts, pathPrefix=pathPrefix + '/')

making dict
Colon Adenocarcinoma
Prostate Cancer
Melanoma
Colorectal Cancer
Head and Neck Cancer
Ovarian Cancer
Hepatobiliary Cancer
Small Cell Lung Cancer
Soft Tissue Sarcoma
Pancreatic Cancer
Germ Cell Tumor
Renal Cell Carcinoma
Lung Adenocarcinoma
Breast Cancer
Breast Invasive Ductal Carcinoma
Thyroid Cancer
Prostate Adenocarcinoma
Endometrial Cancer
Bone Cancer
Glioma
Bladder Cancer
Pancreatic Adenocarcinoma
Non-Small Cell Lung Cancer
Gastrointestinal Stromal Tumor
Esophagogastric Cancer
Colon Adenocarcinoma
Prostate Cancer
Melanoma
Colorectal Cancer
Head and Neck Cancer
Ovarian Cancer
Hepatobiliary Cancer
Small Cell Lung Cancer
Soft Tissue Sarcoma
Pancreatic Cancer
Germ Cell Tumor
Renal Cell Carcinoma
Lung Adenocarcinoma
Breast Cancer
Breast Invasive Ductal Carcinoma
Thyroid Cancer
Prostate Adenocarcinoma
Endometrial Cancer
Bone Cancer
Glioma
Bladder Cancer
Pancreatic Adenocarcinoma
Non-Small Cell Lung Cancer
Gastrointestinal Stromal Tumor
Esophagogastric Cancer


In [228]:
# Load, merge, then save -- in dependency order so the notebook survives Restart & Run All.
# Previously the to_csv call sat above the cells that define observedVsExpectedDf and result,
# which raises NameError on a fresh top-to-bottom run.
observedVsExpectedDf = pd.read_table('~/Desktop/mnt/ifs/work/taylorlab/friedman/myAdjustedDataFiles/observedVsExpectedData.tsv')
# pd.merge defaults to an inner join: only samples present in both frames are kept.
result = pd.merge(df, observedVsExpectedDf, on='Tumor_Sample_Barcode')
# Write the merged table as a tab-separated file (no index column) for local plotting.
result.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/temp.tsv', index=False, sep='\t')

In [231]:
# Count the distinct HGVSp_Short values among rows whose 'oncogenic' column is non-null.
# NOTE(review): built with set(), so any missing HGVSp_Short values are included in the count -- confirm that is intended.
print len(set(allImpactMuts[allImpactMuts['oncogenic'].notnull()]['HGVSp_Short']))

30724


**THIS IS A TEMPORARY BORDER**

In [None]:
#DONT USE THIS FILE ACTUALLY
# This cell has never been executed (prompt is In [None]); it loads the full simulated-mutation MAF.
filepath = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/simulatedMafs/allmut_impact_simulated_muts_mafAnno_trinuc_v2.maf'
#get a realistic time estimate for read table

# Line count handed to the loader as its second argument.
# NOTE(review): presumably used by the loader for progress reporting, as nLinesFile is for the cohort MAF -- confirm.
nPossibleImpactMuts = 5008844
allPossibleMutMaf = analysis_utils.load_in_df_with_progress(filepath, nPossibleImpactMuts)



In [15]:
# Read the simulated-mutation summary table.
# NOTE(review): this is the same file that is loaded into simOncogenicitySummary earlier in the notebook.
simDfSummary = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/simulatedMutationSummary.tsv')

In [22]:
# Every 4-character column key: first nucleotide + two-letter change code + last nucleotide (4 * 6 * 4 = 96 keys).
allPossibleQuadNucs = [firstNuc + change + lastNuc for firstNuc in ['A', 'T', 'C', 'G'] for change in ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] for lastNuc in ['A', 'T', 'C', 'G']]
# Running total over all keys of (column q) minus (column q + '_silent') in simDfSummary,
# i.e. the summed counts excluding those recorded in the '_silent' columns.
s = 0
for q in allPossibleQuadNucs:
    s += sum(simDfSummary[q]) - sum(simDfSummary[q + '_silent'])
print s

3430366


In [26]:
# Rebuild the susceptibility dict from simDfSummary.
# NOTE(review): this rebinds the oncogenicSusceptibilityDict name already assigned earlier in the notebook.
oncogenicSusceptibilityDict = mutation_modeling_util.calculate_quadnuc_based_oncogenic_susceptibility_dict(simDfSummary)

In [30]:
# NOTE(review): listOfDicts is initialised here but never used in this cell.
listOfDicts = []
# Print the value stored under each key; per the captured output each value is a dict keyed by
# panel name ('IMPACT_410', 'IMPACT_368', 'IMPACT_468').
for key, value in oncogenicSusceptibilityDict.items():
    print value


{'IMPACT_410': 0.03146340591369016, 'IMPACT_368': 0.033739756146312215, 'IMPACT_468': 0.029787511406596273}
{'IMPACT_410': 0.05564304461942257, 'IMPACT_368': 0.05762267121490423, 'IMPACT_468': 0.0538769406693935}
{'IMPACT_410': 0.048700109849871845, 'IMPACT_368': 0.05009694164738258, 'IMPACT_468': 0.047226812642016114}
{'IMPACT_410': 0.023026315789473683, 'IMPACT_368': 0.02422247697940165, 'IMPACT_468': 0.02186633815829997}
{'IMPACT_410': 0.1735338531233628, 'IMPACT_368': 0.17342219804134928, 'IMPACT_468': 0.17146429484811457}
{'IMPACT_410': 0.08825452168228372, 'IMPACT_368': 0.08839543076758834, 'IMPACT_468': 0.08616504854368932}
{'IMPACT_410': 0.1606259922885008, 'IMPACT_368': 0.16030348718447546, 'IMPACT_468': 0.15862214708368555}
{'IMPACT_410': 0.059239524359903616, 'IMPACT_368': 0.05973151520284577, 'IMPACT_468': 0.057668002744375835}
{'IMPACT_410': 0.04053857350800582, 'IMPACT_368': 0.04531049250535332, 'IMPACT_468': 0.03911394931151467}
{'IMPACT_410': 0.026258553922868813, 'IMPA