In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re
import scipy.stats as stats

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt'

sys.path.append(pathPrefix + '/juno/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util
import get_gene_and_cohort_list_utils

In [None]:
# Load the MC3 MAF through the project helper in analysis_utils.
# NOTE(review): nLinesFile is hard-coded; presumably it is the file's line count used
# by the helper for progress reporting -- confirm against analysis_utils.
mc3maf = analysis_utils.load_in_df_with_progress(pathPrefix + '/ifs/res/taylorlab/ang46/ext/mafs/mc3/mc3.v0.2.8.PUBLIC.LAML_PATCH_prepped_facets_oncokb.maf', nLinesFile= 2699035)



In [None]:
# Per-sample mutation counts restricted to the Variant_Classification values below.
nonsynonMutTypes = [
    "Missense_Mutation",
    "Nonsense_Mutation",
    "Nonstop_Mutation",
    "Frame_Shift_Ins",
    "Frame_Shift_Del",
    "In_Frame_Del",
    "In_Frame_Ins",
    "Translation_Start_Site",
    "Splice_Site",
]
isNonSynonymous = mc3maf['Variant_Classification'].isin(nonsynonMutTypes)
mc3NonSynom = mc3maf[isNonSynonymous]

# SAMPLE_ID -> number of rows kept for that sample.
sampleMutCounts = mc3NonSynom['SAMPLE_ID'].value_counts()
nmutDict = dict(sampleMutCounts)


In [None]:
# Load the per-sample signature table (the cells below read its 'Signature.N' columns).
tcgaSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/pensona/dmp_sigs/tcga/mc3.v0.2.8.PUBLIC.LAML_PATCH.trinuc.30sigs.txt')
# Keep the first 15 characters of 'Sample Name' so it can be looked up in nmutDict
# (whose keys are the MAF's SAMPLE_ID values).
tcgaSigs['sampleNameAdj'] = tcgaSigs['Sample Name'].apply(lambda x: x[:15])
# Count from nmutDict for each sample; None when the sample has no entry.
tcgaSigs['nNonSynom'] = tcgaSigs['sampleNameAdj'].apply(lambda x: nmutDict[x] if x in nmutDict else None)

# Drop samples without a count.
tcgaSigs = tcgaSigs[tcgaSigs['nNonSynom'].notnull()]

In [None]:
# Aggregate signature columns: 'Signature.MMR' is the sum of signatures 6, 15, 20, 21
# and 26; 'Signature.POLE' is the sum of signatures 10 and 14.
tcgaSigs['Signature.MMR'] = tcgaSigs['Signature.6'] + tcgaSigs['Signature.15'] + tcgaSigs['Signature.20']+ tcgaSigs['Signature.21'] + tcgaSigs['Signature.26']
tcgaSigs['Signature.POLE'] = tcgaSigs['Signature.10'] + tcgaSigs['Signature.14']

In [None]:
# Scale each signature column by the sample's nonsynonymous count to get an
# attributed mutation count per aetiology.
for attributedCol, signatureCol in [('mmrAttributed', 'Signature.MMR'),
                                    ('poleAttributed', 'Signature.POLE'),
                                    ('tmzAttributed', 'Signature.11')]:
    tcgaSigs[attributedCol] = tcgaSigs[signatureCol] * tcgaSigs['nNonSynom']

# Samples whose attributed count exceeds 150, per aetiology.
mmrCases = set(tcgaSigs.loc[tcgaSigs['mmrAttributed'] > 150, 'sampleNameAdj'])
poleCases = set(tcgaSigs.loc[tcgaSigs['poleAttributed'] > 150, 'sampleNameAdj'])
tmzCases = set(tcgaSigs.loc[tcgaSigs['tmzAttributed'] > 150, 'sampleNameAdj'])

In [None]:
# Nonsynonymous mutations belonging to the MMR and POLE case sets respectively.
mmrMuts = mc3NonSynom[mc3NonSynom['SAMPLE_ID'].isin(mmrCases)]
poleMuts = mc3NonSynom[mc3NonSynom['SAMPLE_ID'].isin(poleCases)]


In [None]:
# Get gene lengths from Alex G's coding-sequence table.
geneLengthInfo = pd.read_table(pathPrefix + '/ifs/res/taylorlab/ang46/ext/resources/coding_sequences_allgenes.tsv')
# hgnc_symbol -> nt.length lookup; if a symbol appears on several rows the last row wins.
geneLengthDict = dict(zip(geneLengthInfo['hgnc_symbol'], geneLengthInfo['nt.length']))

In [None]:
# Get essential genes from the DepMap file.
# NOTE(review): get_gene_and_cohort_list_utils is bound by a plain `import` at the top
# of the notebook, i.e. it is a module, and calling a module object raises
# "TypeError: 'module' object is not callable". A function inside that module was
# presumably intended here -- confirm its name and call it explicitly.
essentialGenes = get_gene_and_cohort_list_utils(depMapPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/achillesDepMap.csv')


In [None]:
# Universe of gene symbols: every symbol that has a length entry in geneLengthDict.
allGenes = set(geneLengthDict.keys())
impactGenes = allGenes & set(['ABL1', 'ACVR1', 'AGO2', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'ANKRD11', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BABAM1', 'BAP1', 'BARD1', 'BBC3', 'BCL10', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BIRC3', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CALR', 'CARD11', 'CARM1', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79A', 'CD79B', 'CDC42', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CEBPA', 'CENPA', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSDE1', 'CSF1R', 'CSF3R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'CXCR4', 'CYLD', 'CYSLTR2', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNAJB1', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'DROSHA', 'DUSP4', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EIF4A2', 'EIF4E', 'ELF3', 'EP300', 'EPAS1', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHA7', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERF', 'ERG', 'ERRFI1', 'ESR1', 'ETV1', 'ETV6', 'EZH1', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FAM58A', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXO1', 'FOXP1', 'FUBP1', 'FYN', 'GATA1', 'GATA2', 'GATA3', 'GLI1', 'GNA11', 'GNAQ', 'GNAS', 'GPS2', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3A', 'H3F3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3A', 'HIST1H3B', 'HIST1H3C', 'HIST1H3D', 'HIST1H3E', 'HIST1H3F', 'HIST1H3G', 'HIST1H3H', 'HIST1H3I', 'HIST1H3J', 'HIST2H3C', 'HIST2H3D', 'HIST3H3', 'HLA-A', 'HLA-B', 'HNF1A', 'HOXB13', 'HRAS', 'ICOSLG', 'ID3', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INHA', 'INHBA', 'INPP4A', 'INPP4B', 'INPPL1', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 
'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KMT2B', 'KMT5A', 'KNSTRN', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'LYN', 'MALT1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAP3K14', 'MAPK1', 'MAPK3', 'MAPKAP1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MGA', 'MITF', 'MLH1', 'KMT2A', 'KMT2B', 'KMT2C', 'MPL', 'MRE11A', 'MSH2', 'MSH3', 'MSH6', 'MSI1', 'MSI2', 'MST1', 'MST1R', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOA3', 'NCOR1', 'NEGR1', 'NF1', 'NF2', 'NFE2L2', 'NFKBIA', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTHL1', 'NTRK1', 'NTRK2', 'NTRK3', 'NUF2', 'NUP93', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDCD1LG2', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PGR', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLCG2', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLD1', 'POLE', 'PPARG', 'PPM1D', 'PPP2R1A', 'PPP4R2', 'PPP6C', 'PRDM1', 'PRDM14', 'PREX2', 'PRKAR1A', 'PRKCI', 'PRKD1', 'PTCH1', 'PTEN', 'PTP4A1', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAB35', 'RAC1', 'RAC2', 'RAD21', 'RAD50', 'RAD51', 'RAD51C', 'RAD51L1', 'RAD51L3', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHEB', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RRAGC', 'RRAS', 'RRAS2', 'RTEL1', 'RUNX1', 'RXRA', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SESN1', 'SESN2', 'SESN3', 'SETD2', 'SF3B1', 'SH2B3', 'SH2D1A', 'SHOC2', 'SHQ1', 'SLX4', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SMYD3', 'SOCS1', 'SOS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SPRED1', 'SRC', 'SRSF2', 'STAG2', 'STAT3', 'STAT5A', 'STAT5B', 'STK11', 'STK19', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TAP1', 'TAP2', 'TBX3', 'TCEB1', 'TCF3', 'TCF7L2', 'TEK', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 
'TP53', 'TP53BP1', 'TP63', 'TRAF2', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'UPF1', 'VEGFA', 'VHL', 'VTCN1', 'WHSC1', 'WHSC1L1', 'WT1', 'WWTR1', 'XIAP', 'XPO1', 'XRCC2', 'YAP1', 'YES1', 'ZFHX3', 'ZRSR2'])
tsgs = allGenes & set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 
'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])
# Oncogenes are the panel genes (impactGenes) that are not in the TSG set.
oncogenes = allGenes & (impactGenes - tsgs)
# Restrict the essential genes to symbols that have a known length.
essentialGenes = allGenes & essentialGenes
# Everything left once the three classes above are removed.
# NOTE(review): essentialGenes is not made disjoint from tsgs/oncogenes, so a gene
# present in both would be counted in both classes by code that filters on each set
# separately -- confirm this is intended.
allOtherGenes = allGenes - tsgs - oncogenes - essentialGenes


In [None]:
def calculate_mut_info_summaries(muts, geneLengthDf, geneSets=None):
    """Summarise per-sample mutation counts and length-normalised rates by gene class.

    For every sample in ``muts`` and every gene class two rows are produced:

    * ``mutType == 'single'``: number of mutations in genes of that class;
    * ``mutType == 'double'``: number of genes of that class that carry more
      than one mutation in the sample.

    ``rate`` is the count divided by the summed ``nt.length`` of the class's
    genes in ``geneLengthDf``.

    Parameters
    ----------
    muts : pandas.DataFrame
        Mutations; must provide 'SAMPLE_ID' and 'Hugo_Symbol' columns.
    geneLengthDf : pandas.DataFrame
        Gene lengths; must provide 'hgnc_symbol' and 'nt.length' columns.
    geneSets : dict or sequence of (label, genes) pairs, optional
        Gene classes to summarise. When omitted, the notebook-level sets
        ``tsgs``, ``oncogenes``, ``essentialGenes`` and ``allOtherGenes`` are
        used under the labels 'tsg', 'oncogene', 'essential' and 'allOther'.

    Returns
    -------
    pandas.DataFrame
        One row per (sample, mutType, class) with keys 'sampleID', 'mutType',
        'type', 'n' and 'rate'. Empty when ``muts`` has no rows. ``rate`` is
        NaN for a class whose total length is zero.
    """
    if geneSets is None:
        # Looked up at call time so the notebook can rebuild these sets between calls.
        geneSets = [('tsg', tsgs), ('oncogene', oncogenes),
                    ('essential', essentialGenes), ('allOther', allOtherGenes)]
    elif isinstance(geneSets, dict):
        geneSets = sorted(geneSets.items())
    else:
        geneSets = list(geneSets)

    # Correction factors: total coding length of each gene class.
    # pandas' sum skips missing lengths instead of propagating NaN.
    classLengths = {}
    for label, geneSet in geneSets:
        inClass = geneLengthDf['hgnc_symbol'].isin(geneSet)
        classLengths[label] = float(geneLengthDf.loc[inClass, 'nt.length'].sum())

    def _rate(count, length):
        # A class with no measurable length has no defined rate.
        return count / length if length else float('nan')

    listOfDicts = []
    # One pass over the frame instead of re-filtering it for every sample.
    # Rows whose SAMPLE_ID is missing are skipped by groupby.
    for case, caseMuts in muts.groupby('SAMPLE_ID', sort=False):
        doubleRows = []
        for label, geneSet in geneSets:
            classMuts = caseMuts[caseMuts['Hugo_Symbol'].isin(geneSet)]
            nSingle = classMuts.shape[0]
            # Genes hit more than once in this sample.
            hitsPerGene = classMuts['Hugo_Symbol'].value_counts()
            nDouble = int((hitsPerGene > 1).sum())

            listOfDicts.append({
                'sampleID': case, 'mutType': 'single', 'type': label,
                'n': nSingle, 'rate': _rate(nSingle, classLengths[label])})
            doubleRows.append({
                'sampleID': case, 'mutType': 'double', 'type': label,
                'n': nDouble, 'rate': _rate(nDouble, classLengths[label])})
        # Keep the original layout: all 'single' rows of a sample, then its 'double' rows.
        listOfDicts.extend(doubleRows)

    return pd.DataFrame(listOfDicts)
    

In [None]:
# MMR cases: keep frameshift variants, then only the samples carrying more than 100 of them.
isFrameshift = mmrMuts['Consequence'] == 'frameshift_variant'
frameshiftMMR = mmrMuts.loc[isFrameshift]
frameshiftCounts = frameshiftMMR['SAMPLE_ID'].value_counts()
frameshiftCaseIds = set(frameshiftCounts.loc[frameshiftCounts > 100].index)

inSelectedCases = frameshiftMMR['SAMPLE_ID'].isin(frameshiftCaseIds)
frameshiftMMRSelect = frameshiftMMR.loc[inSelectedCases]

mmrSummary = calculate_mut_info_summaries(frameshiftMMRSelect, geneLengthInfo)

# POLE cases: summarise stop-gain variants.
isStopGain = poleMuts['Consequence'] == 'stop_gained'
stopGainPole = poleMuts.loc[isStopGain]
poleSummary = calculate_mut_info_summaries(stopGainPole, geneLengthInfo)


In [None]:
# Tag each summary with its aetiology, then stack the two frames (POLE rows first).
poleSummary['aetiology'] = 'POLE'
mmrSummary['aetiology'] = 'MMR'
fullDf= pd.concat([poleSummary, mmrSummary])

In [None]:
# Write the combined summary as a tab-separated file without the index column.
# NOTE(review): the destination is an absolute path on one user's machine.
fullDf.to_csv('/Users/friedman/Desktop/WORK/tcgaGeneTypeRates.tsv', index=False, sep='\t')
# TODO: per-case implied oncogene / essential-gene (etc.) rate

In [None]:
# Write the per-sample nonsynonymous counts as "<sample>\t<count>" lines.
# The context manager closes (and therefore flushes) the file; the previous version
# never closed the handle, so the file could be left incomplete on disk.
with open(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/tcgaNonSynomCounts.tsv', 'w') as f:
    for i, val in nmutDict.items():
        f.write(str(i) + '\t' + str(val) + '\n')
    

In [None]:
#
# ---- New section: expected mutation data ----
#
# Load the expected gene mutation table; later cells read its 'case', 'gene' and
# 'truncatingChance' columns.


expectedTCGAmutData = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/mutSimulation/expectedMutationTables/tcgaHypermutatorsExpectedGeneMutInfo.tsv')

In [None]:
#TODO: get the observed number of nonsynonymous mutations in the IMPACT-341 genes
#and compare it with the expected mutation numbers I have, to get a good estimate
#TODO: further analysis still to be decided
im341Genes = set(['ABL1', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BAP1', 'BARD1', 'BBC3', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CARD11', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79B', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSF1R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EP300', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERG', 'ESR1', 'ETV1', 'ETV6', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXP1', 'FUBP1', 'GATA1', 'GATA2', 'GATA3', 'GNA11', 'GNAQ', 'GNAS', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3B', 'HNF1A', 'HRAS', 'ICOSLG', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INPP4A', 'INPP4B', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAPK1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MITF', 'MLH1', 'MLL', 'MLL2', 'MLL3', 'MPL', 'MRE11A', 'MSH2', 'MSH6', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOR1', 'NF1', 'NF2', 'NFE2L2', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTRK1', 'NTRK2', 'NTRK3', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 
'PAX5', 'PBRM1', 'PDCD1', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLE', 'PPP2R1A', 'PRDM1', 'PRKAR1A', 'PTCH1', 'PTEN', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAC1', 'RAD50', 'RAD51', 'RAD51B', 'RAD51C', 'RAD51D', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RUNX1', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SETD2', 'SF3B1', 'SH2D1A', 'SHQ1', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SOCS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SRC', 'STAG2', 'STK11', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TBX3', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP63', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'VHL', 'VTCN1', 'WT1', 'XIAP', 'XPO1', 'YAP1', 'YES1'])
# POLE-case mutations restricted to the genes listed in im341Genes.
# NOTE(review): im341Genes contains 'MLL', 'MLL2' and 'MLL3', whereas the impactGenes set
# defined earlier uses 'KMT2A'/'KMT2B'/'KMT2C' -- verify which symbols Hugo_Symbol
# actually carries, otherwise those genes never match.
impact341TcgaPole = poleMuts[poleMuts['Hugo_Symbol'].isin(im341Genes)]

In [None]:
# SAMPLE_ID -> number of mutations the sample has in the im341Genes subset.
impact341NonsynomBurdens = dict(impact341TcgaPole['SAMPLE_ID'].value_counts())

In [None]:
# Per-sample rate for the 'allOther' gene class, single-mutation rows only.
isSingleOther = (poleSummary['mutType'] == 'single') & (poleSummary['type'] == 'allOther')
poleSummaryOther = poleSummary.loc[isSingleOther]
poleRateDict = {sampleId: rate
                for sampleId, rate in zip(poleSummaryOther['sampleID'], poleSummaryOther['rate'])}


In [None]:
# case -> sum of 'truncatingChance' over all of that case's rows.
expectedTCGAD = {}
for case in set(expectedTCGAmutData['case']):
    caseData = expectedTCGAmutData[expectedTCGAmutData['case'] == case]
    # Parenthesised single-argument print behaves the same on Python 2 and parses on Python 3.
    print(caseData.shape)
    expectedTCGAD[case] = sum(caseData['truncatingChance'])

In [None]:
# Only referenced by the commented-out diagnostics that follow this loop.
im3CaptureArea = 896665

impactGeneLengths = analysis_utils.get_cds_size_targeted_by_impact(infoFilePath = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/impact_gene_reference_signatures.tsv')

# For every panel gene, average two per-case expectations over the POLE cases:
#   sigsMean: case burden in the panel genes * the gene's 'truncatingChance' for that case
#   rateMean: the case's 'allOther' rate * the gene's length from impactGeneLengths
listOfDicts = []
for gene in im341Genes:
    # Parenthesised single-argument print behaves the same on Python 2 and parses on Python 3.
    print(gene)
    # Filter the expected-mutation table once per gene (it was filtered twice before).
    geneRows = expectedTCGAmutData[expectedTCGAmutData['gene'] == gene]
    geneDict = dict(zip(geneRows['case'], geneRows['truncatingChance']))

    sigsData = []
    rateData = []
    # The gene-length lookup does not depend on the case, so do it outside the case loop.
    if gene in impactGeneLengths:
        geneLength = impactGeneLengths[gene]
        for case, nonsynomBurden in impact341NonsynomBurdens.items():
            if case in geneDict and case in poleRateDict:
                sigsData.append(nonsynomBurden*geneDict[case])
                rateData.append(poleRateDict[case]*geneLength)

    # np.nanmean([]) also yields nan but emits a RuntimeWarning; report nan directly
    # for genes without any usable case.
    listOfDicts.append({'gene': gene,
                        'sigsMean': np.nanmean(sigsData) if sigsData else np.nan,
                        'rateMean': np.nanmean(rateData) if rateData else np.nan})

df = pd.DataFrame(listOfDicts)
        
    #if case in expectedTCGAD and case in poleRateDict:
        #print 'expectedBySigs', nonsynomBurden*expectedTCGAD[case]
        #print 'expectedByRate', poleRateDict[case]*im3CaptureArea
        #print '_______________________'
        #e2 = (poleRateDict[case] * expectedTCGAD[case])/im3CaptureArea
        #print e1, e2
    
    

In [None]:
# Dump the current `df` as a tab-separated scratch file, without the index column.
# NOTE(review): `df` is rebound by several cells in this notebook, so what gets
# written depends on which cell ran last; the path is an absolute local path.
df.to_csv('/Users/friedman/Desktop/tempTest.tsv', index=False, sep='\t')

In [None]:
# Load the annotated IMPACT cohort MAF through the project helper.
# NOTE(review): nLinesFile is hard-coded; presumably the file's approximate line count
# used for progress reporting -- confirm against analysis_utils.
allImpactMuts = analysis_utils.load_in_df_with_progress(filePath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', nLinesFile = 275000)


In [None]:
# Genes compared in the cells below (simulated "all possible SNPs" vs. observed mutations).
genes = ['TP53', 'PTEN', 'ARID1A', 'KRAS', 'BRCA1', 'BRCA2']

In [None]:
# Load each gene's simulated "all possible SNPs" MAF and tag every row with a
# "<Start_Position>_<Tumor_Seq_Allele2>" key.
mafList = []
for gene in genes:
    # Parenthesised single-argument print behaves the same on Python 2 and parses on Python 3.
    print(gene)
    mafString = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/simulatedMafs/geneMutMafs/' + gene + '_all_possible_snps_v2.maf'
    geneMaf = pd.read_table(mafString)
    # Column-wise string concatenation instead of a Python-level apply over every row;
    # it also works when the frame has no rows.
    geneMaf['posChange'] = (geneMaf['Start_Position'].astype(str) + '_'
                            + geneMaf['Tumor_Seq_Allele2'].astype(str))
    mafList.append(geneMaf)


In [None]:
# Observed mutations in the genes of interest. Take an explicit copy: the boolean
# filter returns a slice of allImpactMuts, and adding a column to that slice triggers
# pandas' SettingWithCopyWarning.
geneMafObserved = allImpactMuts[allImpactMuts['Hugo_Symbol'].isin(genes)].copy()
# Same "<Start_Position>_<Tumor_Seq_Allele2>" key as the simulated MAFs, built
# column-wise instead of with a per-row apply.
geneMafObserved['posChange'] = (geneMafObserved['Start_Position'].astype(str) + '_'
                                + geneMafObserved['Tumor_Seq_Allele2'].astype(str))

observedStartPos = set(geneMafObserved['posChange'])

In [None]:
# Possible changes never observed in the cohort, across ALL genes of interest.
# The previous version read `geneMaf`, the variable left over from the loading loop
# (or from whichever later cell rebound it), so only one gene's possible SNPs were
# compared against observations pooled over every gene, and mafList was never used.
allPossibleMuts = pd.concat(mafList, ignore_index=True)
notObserved = set(allPossibleMuts['posChange']) - observedStartPos
# Ten most common consequences among the unobserved changes.
print(Counter(allPossibleMuts[allPossibleMuts['posChange'].isin(notObserved)]['Consequence']).most_common(10))

In [None]:
# Load the IMPACT-341 possible-mutation summary table.
# NOTE(review): this rebinds `df`, which an earlier cell used for the per-gene means.
df = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/hypermutationAnalysisProj/mutSimulation/expectedMutationTables/quadNucAndGenePossibleMutationSummaryIMPACT341.tsv')

In [None]:
# Disabled experiment kept for reference:
#gene = 'TP53'
#geneMaf = df[df['gene'] == 'TP53']
#sum(geneMaf['nNonSilent'])

# NOTE(review): `gene` is not set in this cell; it is whatever the last loop that used
# `gene` as its loop variable left behind, so the file loaded here depends on execution
# order. The commented-out assignment above suggests 'TP53' may have been intended --
# confirm and set it explicitly.
mafString = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/simulatedMafs/geneMutMafs/' + gene + '_all_possible_snps_v2.maf'
geneMafRaw = pd.read_table(mafString)
   


In [None]:
# Variant classifications treated as nonsynonymous in this section.
# NOTE(review): unlike nonsynonMutTypes near the top of the notebook, this list has no
# "Nonstop_Mutation", and it is only referenced by the commented-out line below --
# confirm whether the difference is intentional.
nonSynoms = ["Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "Translation_Start_Site"]

# Disabled experiment kept for reference:
#geneMafRaw[geneMafRaw['Variant_Classification'].isin(nonSynoms)]
#print sum(geneMaf['nNonSilent'])
# Load the summary of all possible IMPACT mutations (rebinds `df` again).
df = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/allPossibleIMPACTMutationsSummary.tsv')
           

In [None]:
# Add up the positive values of every column whose name contains 'nonSilent',
# taken from the first TP53 row of df.
tp53Row = df[df['Hugo_Symbol'] == 'TP53'].iloc[0]
s = 0
for key, value in dict(tp53Row).items():
    if 'nonSilent' in str(key) and value > 0:
        s += float(value)

In [None]:
# Parenthesised single-argument print behaves the same on Python 2 and parses on Python 3.
print(s)
# Reload the simulated TP53 MAF; this rebinds the notebook-level `geneMaf`.
mafString = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/simulatedMafs/geneMutMafs/' + 'TP53' + '_all_possible_snps_v2.maf'
geneMaf = pd.read_table(mafString)


In [2]:
#
# ---- New section: cBioPortal IMPACT mutations ----
#
# Load the extended mutation MAF from the mounted juno share.
# NOTE(review): in the saved run this read failed with an IOError because the file did
# not exist under pathPrefix -- presumably the share was not mounted; confirm the path.

maf = pd.read_table(pathPrefix + '/juno/work/ccs/resources/impact/cbio_mutations/data_mutations_extended.vep.maf')



IOError: File /Users/friedman/Desktop/mnt/juno/work/ccs/resources/impact/cbio_mutations/data_mutations_extended.vep.maf does not exist