In [1]:
#written by Noah Friedman (a template for scripts to be excuted in the spyder environment
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt'

sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util

**FUNCTIONS**

In [2]:
#CALCULATES THE DIFFERENTIAL EXPECTED oncogenic mut burden as if each signature were dominant alone
def calculate_oncogenic_mut_susceptibility_of_genes_by_signature(oncogenicSDict, suffix='_hotspot_rate'):
    listOfDicts = []
    sigNames = ['Signature.' + str(i) for i in range(1,31)]
    for i in range(1,31):
        curSig = 'Signature.' + str(i)
        d = {}
        for s in sigNames:
            d[s] = 0
        d[curSig] = 1
        #PRETEND we got a case with 100% signature i on the decomposition
        quadNucFractions = mutation_modeling_util.get_quadnuc_fracs_given_decomposition(d, spectraPath = pathPrefix + '/ifs/work/taylorlab/friedman/noahFirstProject/signature_sig_copy/mutation-signatures/Stratton_signatures30.txt')
        #v = mutation_modeling_util.get_expected_oncogenic_val_given_quadnuc_fractions(quadNucFractions, oncogenicSDict, 'IMPACT_468')
        #ALERT NOAH I CHANGED THIS HERE
        v = mutation_modeling_util.get_expected_oncogenic_val_given_quadnuc_fractions_v2(quadNucFractions, oncogenicSDict, suffix)

        listOfDicts.append({'Signature_Name': curSig, 'ExpectedFracOfMutsOncogenic': v})
    return pd.DataFrame(listOfDicts)

In [3]:
def expand_data_for_plot(infoDict, n=1250):
    listOfDicts = []
    for i in range(1,n):
        if i%50==0:print i
        nmut_mbIM6 = (i*1000000.0)/1139322
        for key, value in infoDict.items():
            listOfDicts.append({'Signature': key, 'Nmut_Expected': i*value, 'Nmut_Mb': nmut_mbIM6})
    return pd.DataFrame(listOfDicts)

In [4]:
def get_indel_frac_by_dominant_signature(sigs, maf):
    d = {}
    sigNames = ['mean_' + str(i) for i in range(1,31)]
    for sig in sigNames:
        sigIds = set(sigs[sigs['dominantSig'] == sig]['Tumor_Sample_Barcode'])
        sigMuts = maf[maf['Tumor_Sample_Barcode'].isin(sigIds)]
        sigName = re.sub('mean_', 'Signature.', sig )
        d[sigName] = 1.0*sigMuts[(sigMuts['Variant_Type'] == 'INS') | (sigMuts['Variant_Type'] == 'DEL')].shape[0]/sigMuts.shape[0]
    return d

In [5]:
def calculate_oncogenicity_including_indel(row, indelFracDict):
    mmrSigs = set(['Signature.6', 'Signature.15', 'Signature.16', 'Signature.20', 'Signature.21', 'Signature.26'])
    indelOncogenicity = .3497
    if row['Signature_Name'] in mmrSigs:
        indelOncogenicity = .4856
    indelFrac = 0.156
    if row['Signature_Name'] in indelFracDict: indelFrac = indelFracDict[row['Signature_Name']]
    return (1 - indelFrac)*row['ExpectedFracOfMutsOncogenic'] + indelFrac*indelOncogenicity

In [6]:
#reload(mutation_modeling_util)
reload(maf_analysis_utils)

<module 'maf_analysis_utils' from '/Users/friedman/Desktop/mnt/ifs/work/taylorlab/friedman/myUtils/maf_analysis_utils.pyc'>

**WORK AREA**

In [None]:
allImpactMuts = analysis_utils.load_in_df_with_progress(filePath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', nLinesFile = 275000)


In [7]:
print Counter(allImpactMuts['Consequence']).most_common(10)
print '____________'
print Counter(pik3caMuts['Consequence']).most_common(10)

[('missense_variant', 187649), ('frameshift_variant', 33225), ('stop_gained', 25288), ('missense_variant,splice_region_variant', 5912), ('inframe_deletion', 4517), ('splice_acceptor_variant', 4273), ('upstream_gene_variant', 3890), ('splice_donor_variant', 3486), ('inframe_insertion', 987), ('stop_gained,splice_region_variant', 860)]
____________
[('missense_variant', 6854), ('synonymous_variant', 1936), ('stop_gained', 471), ('missense_variant,splice_region_variant', 257), ('5_prime_UTR_variant', 150), ('splice_region_variant,intron_variant', 105), ('splice_region_variant,synonymous_variant', 68), ('intron_variant', 27), ('splice_acceptor_variant', 18), ('splice_donor_variant', 18)]


In [4]:
pik3caMuts = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/simulatedMafs/geneMutMafs/PIK3CA_all_possible_snps_v2.maf')

  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
#allImpactMuts['posChange'] = allImpactMuts.apply(lambda row: str(row['Start_Position']) + str(allImpactMuts['Tumor_Seq_Allele2']), axis=1)

hotspots = allImpactMuts[allImpactMuts['is-a-hotspot'] == 'Y']
hotspots['posChange'] = hotspots.apply(lambda row: str(row['Start_Position']) + '_' + str(row['Tumor_Seq_Allele2']), axis=1)

print len(set(hotspots['posChange']))
#print len(set(allImpactMuts['Start_Position']))




3569


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [8]:
allImpactMuts['isFrameShiftIndel'] = allImpactMuts['Consequence'].apply(lambda x: True if x == 'frameshift_variant' else False)


**LOAD in data about hypothetical mutatability**

In [11]:
simOncogenicitySummary = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/simulatedMutationSummary.tsv')
oncogenicSusceptibilityDict = mutation_modeling_util.calculate_quadnuc_based_oncogenic_susceptibility_dict(simOncogenicitySummary)


**THIS IS A TEMPORARY BORDER**
Now it hosts noahs work on 9.11.2019

In [13]:
def quantify_quadnuc_hotspot_susceptibility_per_mutation(simMafData, relatedGenes=None):
    allBases = ['A', 'C', 'G', 'T']
    changes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] #format: 'CA' means a change from C>A
    allQuadNucs = [firstBase + change + lastBase for firstBase in allBases for change in changes for lastBase in allBases] #enumerate all 96 quadnucs for signatures
    
    #simulatedMutsRelated = simMafData[simMafData['Hugo_Symbol'].isin(relatedGenes)]
    #simulatedMutsUnrelated = simMafData[~simMafData['Hugo_Symbol'].isin(relatedGenes)]
    
    d = {}
    
    for quadNuc in allQuadNucs:
        nPossibleMuts = sum(simMafData[quadNuc]) - sum(simMafData[quadNuc + '_silent'])
        nPossibleHotspotMuts = sum(simMafData[quadNuc + '_hotspot'])
        #nPossibleMutsRelated = sum(simulatedMutsRelated[quadNuc]) - sum(simulatedMutsRelated[quadNuc + '_silent'])
        #nPossibleMutsUnrelated = sum(simulatedMutsUnrelated[quadNuc]) - sum(simulatedMutsUnrelated[quadNuc + '_silent'])
        #nPossibleHotspotMutRelated = sum(simulatedMutsRelated[quadNuc + '_hotspot'])
        #nPossibleHotspotMutUnrelated = sum(simulatedMutsUnrelated[quadNuc + '_hotspot'])
        #d[quadNuc + '_related_hotspot_rate'] = (1.0*nPossibleHotspotMutRelated)/(1.0*nPossibleMutsRelated)
        #d[quadNuc + '_unrelated_hotspot_rate'] = (1.0*nPossibleHotspotMutUnrelated)/(1.0*nPossibleMutsUnrelated)
        d[quadNuc + '_hotspot_rate'] = (1.0*nPossibleHotspotMuts)/(1.0*nPossibleMuts)
        
    return d

def quantify_truncating_susceptibility_per_mutation(simMafData, relatedGenes=None):
    allBases = ['A', 'C', 'G', 'T']
    changes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] #format: 'CA' means a change from C>A
    allQuadNucs = [firstBase + change + lastBase for firstBase in allBases for change in changes for lastBase in allBases] #enumerate all 96 quadnucs for signatures

    d = {}
    
    for quadNuc in allQuadNucs:
        nPossibleMuts = sum(simMafData[quadNuc]) - sum(simMafData[quadNuc + '_silent'])
        nPossibleHotspotMuts = sum(simMafData[quadNuc + '_nonsenseMutations'])
     
        d[quadNuc + '_nonsense_rate'] = (1.0*nPossibleHotspotMuts)/(1.0*nPossibleMuts)  
    return d

def calculate_quadnuc_based_oncogenic_susceptibility_dict(simMafData):
    d = {}
    allBases = ['A', 'C', 'G', 'T']
    changes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] #format: 'CA' means a change from C>A
    allQuadNucs = [firstBase + change + lastBase for firstBase in allBases for change in changes for lastBase in allBases] #enumerate all 96 quadnucs for signatures
    for quadNuc in allQuadNucs:
        nPossibleMuts = sum(simMafData[quadNuc]) - sum(simMafData[quadNuc + '_silent'])
        nPossibleOncogenicMuts = sum(simMafData[quadNuc + '_oncogenic'])
        d[quadNuc + '_oncogenic_rate'] = (1.0*nPossibleOncogenicMuts)/(1.0*nPossibleMuts)
    return d

In [14]:
simulatedDataSummary = pd.read_table('/Users/friedman/Desktop/workOffline/mutSimulationInfoIncludingHotspots.tsv')

In [24]:
impactSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/signatures_from_unfiltered_maf.txt')
impactSigs = mutationSigUtils.merge_signature_columns(impactSigs)
impactSigs['dominantSignature'] = impactSigs.apply(lambda row: 
        mutationSigUtils.get_dominant_signature(row.to_dict(), cols=None, prefix='mean', notEnoughMuts= True), axis=1)

casesWithMSISignature = set(impactSigs[(impactSigs['dominantSignature'] == 'mean_MMR')
                                      |((impactSigs['Nmut_Mb'] >= 30) & (impactSigs['dominantSignature'].isin(set(['mean_1']))))
                                        ]['Tumor_Sample_Barcode'])

casesWithPOLESignature = set(impactSigs[(impactSigs['dominantSignature'] == 'mean_10') & (impactSigs['Nmut_Mb'] > 30)]['Tumor_Sample_Barcode'])
casesWithTMZSignature = set(impactSigs[(impactSigs['dominantSignature'] == 'mean_11') & (impactSigs['Nmut_Mb'] > 30)]['Tumor_Sample_Barcode'])

allImpactMuts = maf_analysis_utils.fix_mll_genes(allImpactMuts)
allImpactMuts['quadNuc'] = allImpactMuts.apply(lambda row: mutationSigUtils.create_reference_four_nuc(row['Ref_Tri'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)

poleMaf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(casesWithPOLESignature)]
tmzMaf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(casesWithTMZSignature)]
mmrMaf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(casesWithMSISignature)]

In [16]:
impact341Genes = set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
simulatedDataSummary341 = simulatedDataSummary[simulatedDataSummary['Hugo_Symbol'].isin(impact341Genes)]

In [None]:
reload(mutation_modeling_util)
quadNucHotspotSusceptibilityDict = quantify_quadnuc_hotspot_susceptibility_per_mutation(simulatedDataSummary341)
signatureHotspotSusceptibilityDf = calculate_oncogenic_mut_susceptibility_of_genes_by_signature(quadNucHotspotSusceptibilityDict)


In [25]:
poleQuadNucs = set(['TCAT', 'TCAA', 'TCTG', 'TCTT', 'TTGT'])
tmzQuadNucs = set(['ACTC', 'ACTT', 'CCTC', 'CCTT', 'GCTC', 'GCTT', 'TCTC', 'TCTT'])
mmrQuadNucs = set([])
signatureHotspotSusceptibilityDict = dict(zip(signatureHotspotSusceptibilityDf['Signature_Name'], signatureHotspotSusceptibilityDf['ExpectedFracOfMutsOncogenic']))

In [26]:
def calculate_hotspot_stats(maf, quadNucs, sig):
    maf341 = maf[maf['Hugo_Symbol'].isin(impact341Genes)]
    listOfDicts = []
    for case in set(maf341['Tumor_Sample_Barcode']):
        caseMaf = maf341[maf341['Tumor_Sample_Barcode'] == case]
        hotspotMaf = caseMaf[caseMaf['is-a-hotspot'] == 'Y']
        hotspotQuadNucMaf = hotspotMaf[hotspotMaf['quadNuc'].isin(quadNucs)]

        nMutationsIM341 = caseMaf.shape[0]
        nTotalHotspots = hotspotMaf.shape[0]
        nHotspotsAtQuadNuc = hotspotQuadNucMaf.shape[0]
        
        quadnucChangeCounter = Counter(caseMaf['quadNuc'])
        expectedHotspots = 0
        for qChange, count in quadnucChangeCounter.items():
            if isinstance(qChange, basestring):
                expectedHotspots += count*quadNucHotspotSusceptibilityDict[qChange + '_hotspot_rate']

        listOfDicts.append({'Tumor_Sample_Barcode': case, 'nmutIM341': nMutationsIM341,
        'nTotalHotspots': nTotalHotspots, 'nHotspotsInducedBySig': nHotspotsAtQuadNuc,
        'nHotspotsExpected': nMutationsIM341*signatureHotspotSusceptibilityDict[sig],
        'nHotspotsExpected2': expectedHotspots})
    df = pd.DataFrame(listOfDicts) 
    return df

In [36]:
mutsDfTMZ = calculate_hotspot_stats(tmzMaf, tmzQuadNucs, 'Signature.11')
mutsDfPOLE = calculate_hotspot_stats(poleMaf, poleQuadNucs, 'Signature.10')
mutsDfMMR = calculate_hotspot_stats(mmrMaf, mmrQuadNucs, 'Signature.6')

In [40]:
mutsDfTMZ['sigClass'] = 'TMZ'
mutsDfPOLE['sigClass'] = 'POLE'
mutsDfMMR['sigClass'] = 'MMR'
combinedMutsDf = pd.concat([mutsDfTMZ, mutsDfPOLE, mutsDfMMR])

In [41]:
combinedMutsDf['nHotspotsExpected'] = combinedMutsDf.apply(lambda row:
    (row['nHotspotsExpected'] + row['nHotspotsExpected2'])/2.0, axis=1)

In [42]:
combinedMutsDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/hotspotObsExpected.tsv', index=False, sep='\t')

In [23]:
mutsDf[['nHotspotsExpected', 'nHotspotsExpected2']]

Unnamed: 0,nHotspotsExpected,nHotspotsExpected2
0,0.130884,0.135715
1,0.183660,0.205330
2,0.217436,0.219637
3,0.101329,0.115528
4,0.107663,0.159105
5,0.086552,0.106091
6,0.185771,0.161812
7,0.065442,0.090883
8,0.263879,0.256329
9,0.141439,0.168513


**truncating mutations analysis**

In [43]:
tumorSuppresors = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])

In [44]:
possibleMutationsIncludingTruncating = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/simulatedMutationSummaryV3-Sep11.tsv')

In [45]:
mergedDfIncludingTruncating = pd.merge(possibleMutationsIncludingTruncating, simulatedDataSummary)
#I am fixing my error conservatively with a pandas merge

In [46]:
#we create seperate estimates of rates of truncating mutations based on the gene content of TSGs and Oncogenes
mergedDfTSG = mergedDfIncludingTruncating[mergedDfIncludingTruncating['Hugo_Symbol'].isin(tumorSuppresors)]
mergedDfOncogene = mergedDfIncludingTruncating[~mergedDfIncludingTruncating['Hugo_Symbol'].isin(tumorSuppresors)]

truncatingSusceptibilityDictTSG = quantify_truncating_susceptibility_per_mutation(mergedDfTSG)
truncatingSusceptibilityDictOncogene = quantify_truncating_susceptibility_per_mutation(mergedDfOncogene)

In [None]:
#get the rate of nonsense mutations per signature
signatureNonsenseRateTSG = calculate_oncogenic_mut_susceptibility_of_genes_by_signature(
    truncatingSusceptibilityDictTSG, suffix='_nonsense_rate')
signatureNonsenseRateOncogene = calculate_oncogenic_mut_susceptibility_of_genes_by_signature(
    truncatingSusceptibilityDictOncogene, suffix='_nonsense_rate')
#truncatingSusceptibilityDict['TCAG_nonsense_rate']


In [48]:
signatureNonsenseRateDictTSG = dict(zip(signatureNonsenseRateTSG['Signature_Name'], signatureNonsenseRateTSG['ExpectedFracOfMutsOncogenic']))
signatureNonsenseRateDictOncogene = dict(zip(signatureNonsenseRateOncogene['Signature_Name'], signatureNonsenseRateOncogene['ExpectedFracOfMutsOncogenic']))


In [108]:

print signatureNonsenseRateDictTSG['Signature.10'], signatureNonsenseRateDictOncogene['Signature.10']

0.0896027574912 0.0551205564433


In [58]:
poleMaf341SNPs = poleMaf341[poleMaf341['Variant_Type'] == 'SNP']
listOfDicts = []
for case in set(poleMaf341SNPs['Tumor_Sample_Barcode']):
    caseMaf = poleMaf341SNPs[poleMaf341SNPs['Tumor_Sample_Barcode'] == case]
    nmutIM341 = caseMaf.shape[0]
    caseMafOncogene = caseMaf[~caseMaf['Hugo_Symbol'].isin(tumorSuppresors)]
    caseMafTumorSuppressor = caseMaf[caseMaf['Hugo_Symbol'].isin(tumorSuppresors)]
    
    nOncogeneMuts = caseMafOncogene.shape[0]
    nTSGMuts = caseMafTumorSuppressor.shape[0]
    
    nOncogeneTruncating = caseMafOncogene[caseMafOncogene['Consequence'] == 'stop_gained'].shape[0]
    nTsgTruncating = caseMafTumorSuppressor[caseMafTumorSuppressor['Consequence'] == 'stop_gained'].shape[0]
    
    nOncogeneExpected = nOncogeneMuts*signatureNonsenseRateDictOncogene['Signature.10']
    nTSGExpected = nTSGMuts*signatureNonsenseRateDictTSG'Signature.10']
    
    listOfDicts.append({
        'Tumor_Sample_Barcode': case, 'nOncogeneTruncating': nOncogeneTruncating, 'nTsgTruncating': nTsgTruncating,
        'nOncogeneExpected': nOncogeneExpected, 'nTSGExpected': nTSGExpected, 'nmut': nmutIM341
    })
    
df = pd.DataFrame(listOfDicts)

NameError: name 'poleMaf341' is not defined

In [132]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/truncatingObsExpectedPOLE.tsv', index=False, sep='\t')

**POLE-PENTA-nucleotides**

In [50]:
pentaNucPossibleMaf = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/pentaNucMutationSummary.tsv')

In [51]:
pentaNucPossibleMaf341 = pentaNucPossibleMaf[pentaNucPossibleMaf['Hugo_Symbol'].isin(impact341Genes)]

In [52]:
#Get the changes of each mutation class at the penta nucleotide
letters = ['A', 'C', 'G', 'T']
changes = ['(C>A)', '(C>G)', '(C>T)', '(T>A)', '(T>C)', '(T>G)']
truncChanceD = {}
hotspotChanceD = {}
for first in letters:
    for second in letters:
        for third in letters:
            for fourth in letters:
                for change in changes:
                    penta = first + second + change + third + fourth
                    
                    #TRUNCATING MUTATIONS
                    truncKey = penta + '_truncating'
                    if truncKey not in pentaNucPossibleMaf341.columns.values:
                        truncChanceD[penta] = 0
                    else: 
                        truncChanceD[penta] = (1.0*np.nansum(pentaNucPossibleMaf341[truncKey]))/np.nansum(pentaNucPossibleMaf341[penta + '_nonSilent'])
                    
                    #HOTSPOTS
                    hotspotKey = penta + '_hotspot'
                    if hotspotKey not in pentaNucPossibleMaf341.columns.values:
                        hotspotChanceD[penta] = 0
                    else:
                        hotspotChanceD[penta] = (1.0*np.nansum(pentaNucPossibleMaf341[hotspotKey]))/np.nansum(pentaNucPossibleMaf341[penta + '_nonSilent'])

                        
                        


In [53]:
poleMafWithPentaContext = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/poleCaseMafWithPentanucleotideContext.maf')
poleMafWithPentaContext['pentaChange'] = poleMafWithPentaContext.apply(lambda row: 
                                    mutationSigUtils.create_strand_specific_pentanucleotide_change(row['Ref_Tri.1'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)


  interactivity=interactivity, compiler=compiler, result=result)


In [54]:
poleMafWithPentaContext341 = poleMafWithPentaContext[poleMafWithPentaContext['Hugo_Symbol'].isin(set(pentaNucPossibleMaf341['Hugo_Symbol']))]
poleMafWithPentaContext341 = poleMafWithPentaContext341[poleMafWithPentaContext341['pentaChange'].notnull()]

In [None]:
#cdsDict = analysis_utils.get_cds_size_targeted_by_impact(infoFilePath = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/impact_gene_reference_signatures.tsv')
#for gene, count in Counter(poleMafWithPentaContext341[(poleMafWithPentaContext341['Consequence'] == 'stop_gained')
#            & (~poleMafWithPentaContext341['Hugo_Symbol'].isin(tumorSuppresors))]['Hugo_Symbol']).most_common(30):
#    
#    print count, gene, 1.0*count/cdsDict[gene]


print poleMafWithPentaContext341[(poleMafWithPentaContext341['Consequence'] == 'stop_gained') & (poleMafWithPentaContext['Hugo_Symbol'] == 'GNAQ')]

In [89]:
listOfDicts = []
for case in set(poleMafWithPentaContext341['Tumor_Sample_Barcode']):
    caseMaf = poleMafWithPentaContext341[poleMafWithPentaContext341['Tumor_Sample_Barcode'] == case]
    caseMafOncogenes = caseMaf[~caseMaf['Hugo_Symbol'].isin(tumorSuppresors)]
    caseMafTSGs = caseMaf[caseMaf['Hugo_Symbol'].isin(tumorSuppresors)]
    
    pentaChangeCounterAll = Counter(caseMaf['pentaChange'])
    pentaChangeCounterOncogenes = Counter(caseMafOncogenes['pentaChange'])
    pentaChangeCounterTSGs = Counter(caseMafTSGs['pentaChange'])
    
    expectedOncogeneTrunc = 0
    for pentaChange, count in pentaChangeCounterOncogenes.items():
        expectedOncogeneTrunc += count*truncChanceD[pentaChange]
    
    expectedTSGTrunc = 0
    for pentaChange, count in pentaChangeCounterTSGs.items():
        expectedTSGTrunc += count*truncChanceD[pentaChange]
        
    #WE INCLUDE HOTSPOT INFORMATION AS WELL
    expectedHotspots = 0
    for pentaChange, count in pentaChangeCounterAll.items():
        expectedHotspots += count*hotspotChanceD[pentaChange]
        
        
        
    #OTHER METHOD3##################################
    nOncogeneExpected2 = caseMafOncogenes.shape[0]*signatureNonsenseRateDictOncogene['Signature.10']
    nTSGExpected2 = caseMafTSGs.shape[0]*signatureNonsenseRateDictTSG['Signature.10']
    ##################################

    nOncogeneTrunc = caseMafOncogenes[caseMafOncogenes['Consequence'] == 'stop_gained'].shape[0]
    nTSGTrunc = caseMafTSGs[caseMafTSGs['Consequence'] == 'stop_gained'].shape[0]
    nHotspot = caseMaf[caseMaf['is-a-hotspot'] == 'Y'].shape[0]
    nmut = caseMaf.shape[0]
    
    listOfDicts.append({'Tumor_Sample_Barcode': case, 'nOncogeneTrunc': nOncogeneTrunc, 'nTSGTrunc': nTSGTrunc,
                       'expectedOncogeneTrunc1': expectedOncogeneTrunc, 'expectedTSGTrunc1': expectedTSGTrunc,
                        'expectedOncogeneTrunc2': nOncogeneExpected2, 'expectedTSGTrunc2': nTSGExpected2,
                        'nmut': nmut, 'nHotspots': nHotspot, 'nExpectedHotspots': expectedHotspots
                       })
    
    
df = pd.DataFrame(listOfDicts)

In [87]:
df['expectedTruncatingOncogene'] = df.apply(lambda row: (row['expectedOncogeneTrunc1'] + row['expectedOncogeneTrunc2'])/2.0, axis=1)
df['expectedTruncatingTSG'] = df.apply(lambda row: (row['expectedTSGTrunc1'] + row['expectedTSGTrunc2'])/2.0, axis=1)


In [88]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/truncatingObsExpectedPOLEPenta.tsv', index=False, sep='\t')

In [71]:
signatureNonsenseRateDictTSG['Signature.10'], signatureNonsenseRateDictOncogene['Signature.10']

(0.08960275749120644, 0.055120556443320756)

**Mutations at hotspot motifs**

In [46]:
possibleHotspots = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/hotspotContextSummary.tsv')

In [84]:
possibleHotspots['pentaChange'] = possibleHotspots.apply(lambda row: 
                                    mutationSigUtils.create_strand_specific_pentanucleotide_change(row['Ref_Tri.1'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)


In [60]:
poleMotifs = ['TCAT', 'TCTG']

In [103]:
poleMafWithPentaContext = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/poleCaseMafWithPentanucleotideContext.maf')
poleMaf = poleMafWithPentaContext
poleMaf['allele'] = poleMaf['Hugo_Symbol'] + '_' + poleMaf['HGVSp_Short']
poleMaf['pentaChange'] = poleMaf.apply(lambda row: 
                                    mutationSigUtils.create_strand_specific_pentanucleotide_change(row['Ref_Tri.1'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)


In [139]:
poleGreaterThanOnePercentMotifs = [x[0] for x in Counter(poleMaf['pentaChange']).most_common(100) if (1.0*x[1])/poleMaf.shape[0] > .01]

In [140]:
polePentaMotifs = ['TT(C>A)TT', 'TT(C>T)GA', 'TT(C>A)TC', 'TT(C>T)GG', 'TT(C>T)GT']
polePentaMotifs = poleGreaterThanOnePercentMotifs
#polePentaMotifs = ['TT(C>A)TT', 'TT(C>T)GA']

In [141]:
poleMotifHotspotAlleles = set(possibleHotspots[possibleHotspots['pentaChange'].isin(polePentaMotifs)]['allele'])

In [142]:
hotspotAllelesObserved = poleMaf[(poleMaf['is-a-hotspot'] == 'Y') & (poleMaf['pentaChange'].isin(polePentaMotifs))]['allele']


In [143]:
hotspotAllelesNeverObs = poleMotifHotspotAlleles - set(hotspotAllelesObserved)

In [144]:
alleleCounter = Counter(hotspotAllelesObserved)

In [148]:
listOfDs = []
for v in set(hotspotAllelesObserved):
    gene = v.split('_')[0]
    listOfDs.append({'allele': v, 'count': alleleCounter[v], 'gene': gene})
for allele in hotspotAllelesNeverObs:
    gene = allele.split('_')[0]
    listOfDs.append({'allele': allele, 'count': 0, 'gene': gene})

In [149]:
df = pd.DataFrame(listOfDs)

In [150]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/poleAlleleCounts.tsv', index=False, sep='\t')

In [157]:
set(df[df['count'] >= 4]['gene'])

{'ATM',
 'BCL6',
 'CASP8',
 'DIS3',
 'DNMT1',
 'ERBB3',
 'FBXW7',
 'FUBP1',
 'MTOR',
 'PIK3CA',
 'PIK3CB',
 'PRKCI',
 'PTEN',
 'PTPRD',
 'RICTOR',
 'SMAD2',
 'TGFBR1',
 'XPO1'}

In [None]:
tmzMaf['allele'] = tmzMaf['Hugo_Symbol'] + '_' + tmzMaf['HGVSp_Short']

In [163]:
tmzMotifs = set(['ACTC', 'ACTT', 'CCTC', 'CCTT', 'GCTC', 'GCTT', 'TCTT', 'TCTC'])

In [165]:
tmzMotifHotspotAlleles = set(possibleHotspots[possibleHotspots['quadNuc'].isin(tmzMotifs)]['allele'])

In [166]:
hotspotAllelesObserved = tmzMaf[(tmzMaf['is-a-hotspot'] == 'Y') & (tmzMaf['quadNuc'].isin(tmzMotifs))]['allele']


In [167]:
hotspotAllelesNeverObs = poleMotifHotspotAlleles - set(hotspotAllelesObserved)
alleleCounter = Counter(hotspotAllelesObserved)
############TMZ

listOfDs = []
for v in set(hotspotAllelesObserved):
    gene = v.split('_')[0]
    listOfDs.append({'allele': v, 'count': alleleCounter[v], 'gene': gene})
for allele in hotspotAllelesNeverObs:
    gene = allele.split('_')[0]
    listOfDs.append({'allele': allele, 'count': 0, 'gene': gene})
df = pd.DataFrame(listOfDs)

In [174]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/tmzAlleleCounts.tsv', index=False, sep='\t')