In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re
import scipy.stats as stats
from collections import Counter

sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
import mutationSigUtils
import maf_analysis_utils
import analysis_utils 
import get_gene_and_cohort_list_utils

# Dict returned by configuration_util.get_all_files_path_dict(); the cells below index it by
# string key (e.g. 'IMPACT_BASE_MAF', 'CANCER_TYPE_INFO') to obtain the paths of the input files.
filePathDict = configuration_util.get_all_files_path_dict()

In [144]:
def get_n_consequential_mut_count(mafPath, cases=None):
    """Count per-sample mutations of several classes in a MAF file.

    The MAF at `mafPath` is read with pd.read_table and restricted to the genes
    returned by get_gene_and_cohort_list_utils.get_im3_genes(). All counts are
    keyed by 'Tumor_Sample_Barcode'.

    Parameters
    ----------
    mafPath : str
        Path to a tab-separated MAF with the columns 'Hugo_Symbol',
        'Variant_Type', 'Tumor_Sample_Barcode', 'is-a-hotspot', 'oncogenic'
        and 'Consequence'.
    cases : iterable, optional
        Sample ids that must be present (with count 0 if necessary) in every
        returned dict except nmutDict. Defaults to the 'case' column of the
        notebook-level `expectedDf`, which is what the original code used.

    Returns
    -------
    tuple of six dicts (sample id -> count):
        nmutDict             -- SNPs
        hotspotDict          -- rows with is-a-hotspot == 'Y'
        oncogenicSNPDict     -- SNPs with a non-null 'oncogenic' annotation
        oncogenicINDELDict   -- non-SNPs with a non-null 'oncogenic' annotation
        stopGainDictTSG      -- stop_gained mutations in tumor suppressor genes
        stopGainDictOncogene -- stop_gained mutations in oncogenes
    """
    allImpactMuts = pd.read_table(mafPath)
    im3Genes = get_gene_and_cohort_list_utils.get_im3_genes()
    allImpactMuts341 = allImpactMuts[allImpactMuts['Hugo_Symbol'].isin(im3Genes)]

    # Fetch the gene lists here rather than relying on notebook globals that are
    # only defined in a later cell (hidden state breaks Restart & Run All).
    tsgGenes = get_gene_and_cohort_list_utils.get_tsgs()
    oncogeneGenes = get_gene_and_cohort_list_utils.get_oncogenes()

    isSnp = allImpactMuts341['Variant_Type'] == 'SNP'
    isOncogenic = allImpactMuts341['oncogenic'].notnull()
    isStopGain = allImpactMuts341['Consequence'] == 'stop_gained'

    def count_by_sample(mask):
        # number of rows selected by `mask`, per sample
        return dict(allImpactMuts341[mask]['Tumor_Sample_Barcode'].value_counts())

    nmutDict = count_by_sample(isSnp) #n snps
    hotspotDict = count_by_sample(allImpactMuts341['is-a-hotspot'] == 'Y')
    oncogenicSNPDict = count_by_sample(isOncogenic & isSnp)
    oncogenicINDELDict = count_by_sample(isOncogenic & ~isSnp)
    stopGainDictTSG = count_by_sample(isStopGain & allImpactMuts341['Hugo_Symbol'].isin(tsgGenes))
    stopGainDictOncogene = count_by_sample(isStopGain & allImpactMuts341['Hugo_Symbol'].isin(oncogeneGenes))

    #add entries with the value 0 where necessary
    # (the previous version filled `oncogenicDict`/`stopGainDict`, names that are
    #  never defined in this function, so it raised NameError on a fresh kernel
    #  and never touched the dicts that are actually returned)
    if cases is None:
        cases = set(expectedDf['case'])
    for case in cases:
        for countDict in (hotspotDict, oncogenicSNPDict, oncogenicINDELDict,
                          stopGainDictTSG, stopGainDictOncogene):
            countDict.setdefault(case, 0)
    return nmutDict, hotspotDict, oncogenicSNPDict, oncogenicINDELDict, stopGainDictTSG, stopGainDictOncogene

#returns the number of oncogenic indels expected in a case, given its SNP count and the fraction of its mutations attributed to the MMR signature
def get_expected_tsg_indels(nSnps, mmrFrac):
    """Return the expected number of oncogenic indels for a case.

    nSnps   -- number of SNPs in the case
    mmrFrac -- weight applied to the MSI indel rate (callers pass the MMR
               signature fraction, or 0 for cases not labelled MSI)

    A baseline indel count (nSnps * non-MSI ratio) is always included; the
    MSI-rate indels are added on top, scaled by mmrFrac. The total is then
    multiplied by the fraction of indels considered oncogenic.
    """
    oncogenicFraction = .55*.97 #55% of indels land in TSGs, 97% of indel are frame shifting
    msiIndelsPerSnp = .3 #ratio of indels to snps in MSI cases
    baselineIndelsPerSnp = .05 #in non MSI cases the ratio of indels to snps

    mmrIndels = mmrFrac*(nSnps * msiIndelsPerSnp)
    baselineIndels = nSnps * baselineIndelsPerSnp
    return (mmrIndels + baselineIndels) * oncogenicFraction


In [4]:
def summarize_dominant_signatures_of_cases(mutClassificationDir):
    """Map every hypermutated sample to its dominant signature.

    Each regular, non-hidden file in `mutClassificationDir` is read as a
    tab-separated table with the columns 'Tumor_Sample_Barcode',
    'hypermutantClassification' and 'dominantSignature'. Rows whose
    classification is 'Hypermutated' contribute one entry to the result.

    Returns a dict: Tumor_Sample_Barcode -> dominantSignature. If a barcode
    appears in several files, the file that sorts last by name wins.
    """
    d = {}
    # sorted() makes the duplicate-barcode tie-break deterministic; os.listdir
    # returns entries in arbitrary order.
    for f in sorted(os.listdir(mutClassificationDir)):
        filePath = os.path.join(mutClassificationDir, f)
        # Skip hidden entries (.DS_Store, .ipynb_checkpoints, ...) and
        # sub-directories: they are not classification tables and would make
        # read_table or the column lookup below fail.
        if f.startswith('.') or not os.path.isfile(filePath):
            continue
        df = pd.read_table(filePath)
        hypermutatedDf = df[df['hypermutantClassification'] == 'Hypermutated']
        d.update(zip(hypermutatedDf['Tumor_Sample_Barcode'], hypermutatedDf['dominantSignature']))
    return d

In [143]:
#TODO--run this code on all impact cases
# Expected-mutation table; later cells read its 'case', 'gene', 'hotspotChance',
# 'oncogenicChance' and 'truncatingChance' columns.
expectedDf = pd.read_csv(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'], sep='\t')


330

In [126]:
# Signature decomposition table, and a lookup: sample barcode -> value of its 'mean_MMR' column
signatureDf = pd.read_table(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
mmrSigFracDict = {
    barcode: mmrValue
    for barcode, mmrValue in zip(signatureDf['Tumor_Sample_Barcode'], signatureDf['mean_MMR'])
}

In [156]:
# Per-sample lookups used to annotate the results below.
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(
    filePathDict['CANCER_TYPE_INFO'])
domSigDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(
    impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])

# MSI cases at a score threshold of 10; both names refer to the same object.
impactMsiCases = get_gene_and_cohort_list_utils.get_msi_cases(
    msiInfoFilePath=filePathDict['CASE_TMB_AND_MSI_STATS'],
    msiScoreThresh=10)
msiCases = impactMsiCases

In [146]:
# Build one row per case comparing observed mutation counts with the counts
# expected from the per-gene chances in expectedDf scaled by the case's SNP count.

# The gene lists are fetched here so that this cell does not depend on a later
# cell having been executed first.
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()

nmutDict, hotspotDict, oncogenicSNPDict, oncogenicINDELDict, stopGainDictTSG, stopGainDictOncogene = get_n_consequential_mut_count(filePathDict['IMPACT_BASE_MAF'])

listOfDicts = []
# groupby splits expectedDf once instead of re-filtering the whole frame for
# every case; sort=False keeps cases in order of first appearance.
for cntr, (case, caseExpectation) in enumerate(expectedDf.groupby('case', sort=False)):

    if cntr % 100 == 0: print(cntr) # progress; the parenthesised form also works on Python 3

    # .get avoids a KeyError for cases with no SNP in the panel genes
    nmutCase = nmutDict.get(case, 0)
    signature = None
    if case in impactMsiCases:
        signature = 'MSI'
    elif case in domSigDict:
        signature = domSigDict[case]

    isTsgGene = caseExpectation['gene'].isin(tsgs)
    hotspotExpectation = nmutCase * sum(caseExpectation['hotspotChance'])
    oncogenicSNPExpectation = nmutCase * sum(caseExpectation['oncogenicChance'])
    truncatingExpectationTSG = nmutCase * sum(caseExpectation[isTsgGene]['truncatingChance'])
    # NOTE(review): this sums over every gene that is not a TSG, whereas the
    # observed counterpart (stopGainDictOncogene) only counts genes in `oncogenes`.
    # Left unchanged; confirm whether isin(oncogenes) was intended.
    truncatingExpectationOncogene = nmutCase * sum(caseExpectation[~isTsgGene]['truncatingChance'])

    # the MMR fraction only contributes for cases labelled MSI
    mmrFrac = mmrSigFracDict.get(case, 0) if signature == 'MSI' else 0
    oncogenicIndelExpectation = get_expected_tsg_indels(nmutCase, mmrFrac)
    oncogenicAllExpectation = oncogenicSNPExpectation + oncogenicIndelExpectation #todo flesh out

    observedHotspots = hotspotDict.get(case, 0)
    observedOncogenicSNP = oncogenicSNPDict.get(case, 0)
    observedOncogenicINDEL = oncogenicINDELDict.get(case, 0)
    observedStopGainTSG = stopGainDictTSG.get(case, 0)
    observedStopGainOncogene = stopGainDictOncogene.get(case, 0)

    listOfDicts.append({'obsHotspot':observedHotspots, 'obsOncogenicSNP': observedOncogenicSNP,
                        'obsOncogenicINDEL': observedOncogenicINDEL, 'obsOncogenic': observedOncogenicSNP + observedOncogenicINDEL,
                        'obsStopGainTSG': observedStopGainTSG, 'obsStopGainOncogene': observedStopGainOncogene,
                        'expectedTruncatingTSG': truncatingExpectationTSG, 'expectedTruncatingOncogene': truncatingExpectationOncogene,
                        'expectedHotspot': hotspotExpectation, 'expectedOncogenicSNP': oncogenicSNPExpectation,
                        'expectedOncogenicIndel': oncogenicIndelExpectation, 'expectedOncogenicAll': oncogenicAllExpectation,
                        'nmut': nmutCase, 'Tumor_Sample_Barcode': case})

df = pd.DataFrame(listOfDicts)
    

  This is separate from the ipykernel package so we can avoid doing imports until


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [161]:
# Annotate each case with its raw dominant signature and cancer type (None when unknown).
df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(domSigDict.get)
df['cancerType'] = df['Tumor_Sample_Barcode'].apply(cancerTypeDict.get)

#reduce stuff to other
# Raw signature column names -> display labels; anything not listed becomes 'Other'.
domSigsDict = {
    'mean_1': 'MMR',
    'mean_10': 'POLE',
    'mean_11': 'TMZ',
    'mean_14': 'POLE',
    'mean_7': 'UV',
    'mean_MMR': 'MMR',
    'mean_SMOKING': 'SMOKING',
    'mean_APOBEC': 'APOBEC',
}
df['dominantSignature'] = df['dominantSignature'].apply(
    lambda rawSig: domSigsDict[rawSig] if rawSig in domSigsDict else 'Other')
df['isMsi'] = df['Tumor_Sample_Barcode'].apply(lambda barcode: barcode in msiCases)


In [148]:
# Attach the MSI score and the MMR signature fraction to every case (None when missing).
msiScoreDf = pd.read_csv('/Users/friedman/Desktop/hypermutationProjectFinal/files/infoFiles/mutations_TMB_and_MSI_stats.txt', sep='\t')
msiScoreDict = {
    barcode: score
    for barcode, score in zip(msiScoreDf['Tumor_Sample_Barcode'], msiScoreDf['MSI_SCORE'])
}
df['msiScore'] = df['Tumor_Sample_Barcode'].apply(msiScoreDict.get)
df['mmrFrac'] = df['Tumor_Sample_Barcode'].apply(mmrSigFracDict.get)

  """Entry point for launching an IPython kernel.


In [149]:
# Cancer types kept as their own category; every other type is collapsed to 'Other'.
cancerTypesToFocusOn = {
    'Non-Small Cell Lung Cancer',
    'Colorectal Cancer',
    'Prostate Cancer',
    'Glioma',
    'Endometrial Cancer',
    'Esophagogastric Cancer',
}
df['cancerTypeAdj'] = df['cancerType'].apply(
    lambda cancerType: cancerType if cancerType in cancerTypesToFocusOn else 'Other')

In [162]:
# Write the observed-vs-expected table as a tab-separated file for plotting.
#writeDir = os.path.join(os.getcwd(), 'FIGURE1_PLOTTING_FILES')
#df.to_csv(os.path.join(writeDir, 'figure1e_observedVsExpected.tsv'), index=False, sep='\t')
df.to_csv(
    '/Users/friedman/Desktop/WORK/dataForLocalPlotting/figure1d.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Display the observed oncogenic counts (all oncogenic vs. SNP-only) side by side.
df.loc[:, ['obsOncogenic', 'obsOncogenicSNP']]


#df[(df['obsOncogenic'] < df['expectedOncogenicAll']) & (df['nmut'] > 100)][['Tumor_Sample_Barcode', 'obsOncogenic', 'expectedOncogenicAll',
#                               'expectedOncogenicIndel', 'dominantSignature', 'nmut', 'isMsi', 'msiScore', 'mmrFrac']]

In [8]:
#
####
##########

#MSI indel rate modeling

Counter({'mmr_RECAPTURE': 132262,
         'mmr_TCGA': 955420,
         'pole_RECAPTURE': 3192,
         'pole_TCGA': 213287})

In [24]:
#get msi cases
# MSI cases in the TCGA and IMPACT cohorts, both at an MSI score threshold of 10.
tcgaMsiCases = get_gene_and_cohort_list_utils.get_tcga_msi_cases(
    tcgaMsiScoresPath=filePathDict['TCGA_MSI_SCORES'],
    msiScoreThresh=10,
)
impactMsiCases = get_gene_and_cohort_list_utils.get_msi_cases(
    msiInfoFilePath=filePathDict['CASE_TMB_AND_MSI_STATS'],
    msiScoreThresh=10,
)

  msiInfoDf = pd.read_table(msiInfoFilePath)


In [37]:
# Re-import the utility module so that edits made to it on disk are picked up.
reload(get_gene_and_cohort_list_utils)
idMapping1, idMapping2 = get_gene_and_cohort_list_utils.get_exome_id_mapping()

# Translate exome barcodes through idMapping1; unmapped barcodes get None.
exomeHypermutatorMaf['DMP_ID'] = exomeHypermutatorMaf['Tumor_Sample_Barcode'].apply(idMapping1.get)

# Keep mutations from MSI cases: matched by DMP_ID for IMPACT, by barcode for TCGA.
msiMaf = exomeHypermutatorMaf[
    (exomeHypermutatorMaf['DMP_ID'].isin(impactMsiCases))
    | (exomeHypermutatorMaf['Tumor_Sample_Barcode'].isin(tcgaMsiCases))
]

In [48]:
# Per-case SNP and indel counts in the MSI MAF, ignoring mutations in essential
# genes, tumor suppressors and oncogenes.

# essentialGenes is used below; its definition had been commented out, which made
# this cell fail with a NameError on a fresh kernel.
essentialGenes = get_gene_and_cohort_list_utils.get_essential_genes(depMapPath = filePathDict['DEP_MAP_DATA'], mode='getEssentialGenes')
oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
tsgs = get_gene_and_cohort_list_utils.get_tsgs()

# Build the exclusion set and filter the MAF once, rather than once per case.
excludedGenes = set(essentialGenes) | set(tsgs) | set(oncogenes)
filteredMsiMaf = msiMaf[~msiMaf['Hugo_Symbol'].isin(excludedGenes)]
snpCounts = filteredMsiMaf[filteredMsiMaf['Variant_Type'] == 'SNP']['Tumor_Sample_Barcode'].value_counts()
indelCounts = filteredMsiMaf[filteredMsiMaf['Variant_Type'].isin(['DEL', 'INS'])]['Tumor_Sample_Barcode'].value_counts()

# Iterate over every case in the unfiltered MAF so that cases whose mutations all
# fall in excluded genes still get a row with zero counts.
listOfDicts = []
for case in msiMaf['Tumor_Sample_Barcode'].unique():
    listOfDicts.append({'Tumor_Sample_Barcode': case,
                        'SNP_TMB': int(snpCounts.get(case, 0)),
                        'INDEL_TMB': int(indelCounts.get(case, 0))})
df = pd.DataFrame(listOfDicts)

In [49]:
# Dump the current df as a tab-separated file without the index column.
df.to_csv(
    '/Users/friedman/Desktop/test.tsv',
    sep='\t',
    index=False,
)

In [60]:
# Mutations from cases that are in neither the IMPACT nor the TCGA MSI case list.
notMsiMaf = exomeHypermutatorMaf[
    (~exomeHypermutatorMaf['DMP_ID'].isin(impactMsiCases))
    & (~exomeHypermutatorMaf['Tumor_Sample_Barcode'].isin(tcgaMsiCases))
]

In [129]:
# Tally how many rows of the MSI MAF fall into each Variant_Type.
Counter(msiMaf['Variant_Type'].tolist())

Counter({'DEL': 13727, 'DNP': 98, 'INS': 3093, 'SNP': 56825, 'TNP': 1})