In [2]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import numpy

from collections import Counter

sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils
import configuration_util

filePathDict = configuration_util.get_all_files_path_dict()
writeDir = '/Users/friedman/Desktop/hypermutationProjectFinal/scripts/figure1/FIGURE1_PLOTTING_FILES'

In [None]:
#add information to exome sigs:
#nonSynonymousClassifications = ["Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "Translation_Start_Site"]
#exomeRecaptureMafNonsynonSNP = exomeRecaptureMaf[exomeRecaptureMaf['Variant_Classification'].isin(nonSynonymousClassifications)]
#nsSnpCounts = dict(Counter(exomeRecaptureMafNonsynonSNP['Tumor_Sample_Barcode']))
#exomeRecaptureSigs['nNonSynonymous'] = exomeRecaptureSigs['Sample Name'].apply(lambda x:
#            nsSnpCounts[x] if x in nsSnpCounts else None)

#exomeRecaptureSigs.to_csv(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/exomeRecaptureSignatures.tsv', index=False, sep='\t')

In [2]:
def summarize_observed_and_expected_truncating_burdens_in_hypermutated_cases(maf,
    neutralGenes, essentialGenes, tsgs, oncogenes, geneLengthDict):
    
    nonSynonymousClassifications = ["Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "Translation_Start_Site"]
    nonSynomMaf = maf[maf['Variant_Classification'].isin(nonSynonymousClassifications)]

    nmutDict = dict(nonSynomMaf['Tumor_Sample_Barcode'].value_counts())
    truncatingConsequences = ['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins']
    truncatingMutationsMaf = maf[maf['Variant_Classification'].isin(truncatingConsequences)]

    neutralGenePanelSize = sum([value for key, value in geneLengthDict.items() if key in neutralGenes])
    tsgPanelSize = sum([value for key, value in geneLengthDict.items() if key in tsgs])
    oncogenePanelSize = sum([value for key, value in geneLengthDict.items() if key in oncogenes])
    essentialPanelSize = sum([value for key, value in geneLengthDict.items() if key in essentialGenes])
    listOfDicts = []
    
    cntr = 0
    for case in set(truncatingMutationsMaf['Tumor_Sample_Barcode']):
        if cntr%25==0:print cntr,
        cntr += 1
        caseMaf = truncatingMutationsMaf[truncatingMutationsMaf['Tumor_Sample_Barcode'] == case]
        nNeutralMuts = caseMaf[caseMaf['Hugo_Symbol'].isin(neutralGenes)].shape[0]
        nTsgMuts = caseMaf[caseMaf['Hugo_Symbol'].isin(tsgs)].shape[0]
        nOncogeneMuts = caseMaf[caseMaf['Hugo_Symbol'].isin(oncogenes)].shape[0]
        nEssentialMuts = caseMaf[caseMaf['Hugo_Symbol'].isin(essentialGenes)].shape[0]
        
        neutralRatePerMb = (1.0*1e6*nNeutralMuts)/neutralGenePanelSize
        nEssentialExpected = (1.0*essentialPanelSize/1e6)*neutralRatePerMb
        nTsgExpected = (1.0*tsgPanelSize/1e6)*neutralRatePerMb
        nOncogeneExpected = (1.0*oncogenePanelSize/1e6)*neutralRatePerMb
        
        listOfDicts.append({'nNeutral': nNeutralMuts, 'nTsg': nTsgMuts, 'nOncogene': nOncogeneMuts, 'nEssential': nEssentialMuts,
                           'nTsgExp': nTsgExpected, 'nOncogeneExp': nOncogeneExpected, 'nEssentialExp': nEssentialExpected,
                            'nmut': nmutDict[case], 'tmb': nmutDict[case]/30.0, 'Tumor_Sample_Barcode': case
                           })
        
    df = pd.DataFrame(listOfDicts)
    return df
        
        
        

In [102]:
def get_all_neutral_indel_rates(maf, neutralGenes, neutralGenePanelSize):
    maf = maf[(maf['Hugo_Symbol'].isin(neutralGenes)) & (maf['Variant_Type'].isin(['INS', 'DEL']))]
    counts = dict(maf['Tumor_Sample_Barcode'].value_counts())
    d = {}
    for key, value in counts.items():
        d[key] = (1.0*value)/neutralGenePanelSize
    return d

#TODO fix MLL2, KMT2B etc
def summarize_gene_type_lengths(geneLengthDict, geneTypes):
    d = {}
    for geneType, genes in geneTypes.items():
        print 'summarizing', geneType
        panelSize = sum([geneLengthDict[gene] if gene in geneLengthDict else 0 for gene in genes])
        d[geneType] = panelSize
    return d

#TODO calculate observed and expected based on gene size
def summarize_observed_and_expected_indels(maf, neutralRates, panelSizes, essentialGenes, tsgs, oncogenes):
    
    indelMaf = maf[(maf['Variant_Type'].isin(['INS', 'DEL']))]
    cntr = 0
    listOfDicts = []
    for case in set(maf['Tumor_Sample_Barcode']):
        cntr += 1
        if cntr%50 == 0: print cntr,
            
        if case in neutralRates:
        
            caseMaf = indelMaf[indelMaf['Tumor_Sample_Barcode'] == case]
            nOncogeneMutations = caseMaf[caseMaf['Hugo_Symbol'].isin(oncogenes)].shape[0]
            nTSGMutations = caseMaf[caseMaf['Hugo_Symbol'].isin(tsgs)].shape[0]
            nEssentialMutations = caseMaf[caseMaf['Hugo_Symbol'].isin(essentialGenes)].shape[0]

            #oncogeneRate = (1.0e6*nOncogeneMutations)/panelSizes['Oncogene']
            #tsgRate = (1.0e6*nTSGMutations)/panelSizes['TSG']
            #essentialRate = (1.0e6*nEssentialMutations)/panelSizes['Essential']

            oncogeneExpected = neutralRates[case] * panelSizes['Oncogene']
            tsgExpected = neutralRates[case] * panelSizes['TSG']
            essentialExpected = neutralRates[case] * panelSizes['Essential']

            tmb = maf[maf['Tumor_Sample_Barcode'] == case].shape[0]/30.0

            listOfDicts.append({'Tumor_Sample_Barcode': case, 
                               'OncogeneObs': nOncogeneMutations, 'TSGObs': nTSGMutations, 'EssentialObs': nEssentialMutations ,
                                'OncogeneExp': oncogeneExpected, 'TSGExp': tsgExpected, 'EssentialExp': essentialExpected,
                               'tmb': tmb})
        
    return pd.DataFrame(listOfDicts)

In [14]:
#CODE THAT COMBINES THE MC3 maf and exome recapture maf
#only run to regenerate files

mc3maf = analysis_utils.load_in_df_with_progress(pathPrefix + '/ifs/res/taylorlab/ang46/ext/mafs/mc3/mc3.v0.2.8.PUBLIC.LAML_PATCH_prepped_facets_oncokb.maf', nLinesFile= 2699035)
exomeRecaptureMaf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/exomeRecapture.maf')

mmrCasesTCGA, poleCasesTCGA = get_gene_and_cohort_list_utils.get_tcga_pole_mmr_hypermutator_ids(tcgaSigsPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/tcgaSigsCombined.txt')
mmrCasesRECAPTURE, poleCasesRECAPTURE = get_gene_and_cohort_list_utils.get_exome_recapture_pole_mmr_hypermutator_ids(exomeRecaptureSigsPath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/exomeRecaptureSignatures.tsv')
tcgaHypermutantMaf = mc3maf[mc3maf['SAMPLE_ID'].isin(mmrCasesTCGA | poleCasesTCGA)]
exomeRecaptureHypermutantMaf = exomeRecaptureMaf[exomeRecaptureMaf['Tumor_Sample_Barcode'].isin(mmrCasesRECAPTURE | poleCasesRECAPTURE)]
tcgaHypermutantMaf['Tumor_Sample_Barcode'] = tcgaHypermutantMaf['SAMPLE_ID']
allExomeHypermutantMaf = pd.concat([exomeRecaptureHypermutantMaf, tcgaHypermutantMaf])
allExomeHypermutantMaf['cohort'] = allExomeHypermutantMaf['Tumor_Sample_Barcode'].apply(lambda x:
    'mmr_TCGA' if x in mmrCasesTCGA else 'pole_TCGA' if x in poleCasesTCGA
    else 'mmr_RECAPTURE' if x in mmrCasesRECAPTURE else 'pole_RECAPTURE' if x in poleCasesRECAPTURE else None)
allExomeHypermutantMaf.to_csv(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/allHypermutatorsExome.maf', index=False, sep='\t')


NameError: name 'pathPrefix' is not defined

In [3]:
allExomeMaf = pd.read_table('/Users/friedman/Desktop/hypermutationProjectFinal/files/mafs/allExomeMaf.maf')

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#
###
#takes about 30seconds to load
allExomeHypermutantMaf = (pd.read_table(filePathDict['ALL_EXOME_HYPERMUTATOR_MAF']))


  after removing the cwd from sys.path.
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
#reload(get_gene_and_cohort_list_utils)
neutralGenes = get_gene_and_cohort_list_utils.get_cancer_neutral_genes(depMapPath = filePathDict['DEP_MAP_DATA'])
essentialGenes = get_gene_and_cohort_list_utils.get_essential_genes(depMapPath = filePathDict['DEP_MAP_DATA'], mode='getEssentialGenes')
impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenes = impactGenes - tsgs

In [5]:
geneLengthInfo = pd.read_table(filePathDict['GENE_LENGTH_INFO'])
geneLengthDict = dict(zip(geneLengthInfo['hgnc_symbol'], geneLengthInfo['nt.length']))


  """Entry point for launching an IPython kernel.


In [80]:
reload(get_gene_and_cohort_list_utils)

exomeRecaptureDomSigDict, tcgaDomSigDict = get_gene_and_cohort_list_utils.get_exome_signature_cohorts(
    filePathDict['TCGA_SIGNATURE_DECOMPOSITIONS'], filePathDict['EXOME_RECAPTURE_SIGNATURE_DECOMPOSITIONS'])

#df = summarize_observed_and_expected_truncating_burdens_in_hypermutated_cases(allExomeHypermutantMaf,
#    neutralGenes, essentialGenes, tsgs, oncogenes, geneLengthDict)

In [103]:
#NOAH WORKING HERE NOW
#geneTypeSizeDict = summarize_gene_type_lengths(geneLengthDict, {'Essential': essentialGenes, 'Neutral': neutralGenes, 
#                                            'TSG': tsgs, 'Oncogene': oncogenes})

#~1 min
#neutralIndelRates = get_all_neutral_indel_rates(allExomeMaf, neutralGenes, geneTypeSizeDict['Neutral'])

#nmuts = dict(allExomeMaf['Tumor_Sample_Barcode'].value_counts())
#hyperThresh = 750
#hyperIds = [eid for eid, count in nmuts.items() if count >= hyperThresh]
#hyperExomeMaf = allExomeMaf[allExomeMaf['Tumor_Sample_Barcode'].isin(hyperIds)]
df = summarize_observed_and_expected_indels(hyperExomeMaf, neutralIndelRates, geneTypeSizeDict, essentialGenes, tsgs, oncogenes)

df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x: exomeRecaptureDomSigDict[x] if x in exomeRecaptureDomSigDict
                                                          else tcgaDomSigDict[x] if x in tcgaDomSigDict
                                                          else None)

df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/indelRateInfo.tsv', index=False, sep='\t')

50 100 150 200 250 300 350 400 450 500 550 600 650 700


In [8]:
mmrCasesTCGA, poleCasesTCGA = get_gene_and_cohort_list_utils.get_tcga_pole_mmr_hypermutator_ids(
    tcgaSigsPath = filePathDict['TCGA_SIGNATURE_DECOMPOSITIONS'])
mmrCasesRECAPTURE, poleCasesRECAPTURE = get_gene_and_cohort_list_utils.get_exome_recapture_pole_mmr_hypermutator_ids(
    exomeRecaptureSigsPath = filePathDict['EXOME_RECAPTURE_SIGNATURE_DECOMPOSITIONS'])


  tcgaSigs = pd.read_table(tcgaSigsPath)
  exomeSigs = pd.read_table(exomeRecaptureSigsPath)


In [9]:
df['signature'] = df['Tumor_Sample_Barcode'].apply(lambda x: 'POLE' if x in poleCasesTCGA | poleCasesRECAPTURE
                                                  else 'MMR' if x in mmrCasesTCGA | mmrCasesRECAPTURE
                                                  else None)

In [10]:
poleLower = np.percentile(df[df['signature'] == 'POLE']['tmb'], 5, axis=0)
poleUpper = np.percentile(df[df['signature'] == 'POLE']['tmb'], 95, axis=0)
mmrLower = np.percentile(df[df['signature'] == 'MMR']['tmb'], 5, axis=0)
mmrUpper = np.percentile(df[df['signature'] == 'MMR']['tmb'], 95, axis=0)

df['isOutlier'] = df.apply(lambda row: 
    True if row['signature'] == 'POLE' and (row['tmb'] < poleLower or row['tmb'] > poleUpper)
    else True if row['signature'] == 'MMR' and (row['tmb'] < mmrLower or row['tmb'] > mmrUpper)
    else False, axis=1)


In [11]:
#change for new way of plotting
dfAdj = pd.melt(df, id_vars=['Tumor_Sample_Barcode', 'signature', 'tmb'], value_vars=['nEssential', 'nEssentialExp', 
                                                                 'nOncogene', 'nOncogeneExp',
                                                                 'nTsg', 'nTsgExp'])

In [13]:
#dfAdj.to_csv(os.path.join(writeDir, 'figure1f_truncatingObsExp.tsv'), index=False, sep='\t')

os.path.join(writeDir, 'figure1f_truncatingObsExp.tsv')

'/Users/friedman/Desktop/hypermutationProjectFinal/scripts/figure1/FIGURE1_PLOTTING_FILES/figure1f_truncatingObsExp.tsv'

In [None]:
#
######
############
###################
############################
####################
#############
######
#

#TEMP MOVE TO A NEW SCRIPT I JUST WANT TO NOT RELOAD FILE

#allExomeHypermutantMaf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/allHypermutatorsExome.maf', index=False, sep='\t')


In [None]:
#MOVE THIS IT IS FOR FIGURE 5
def compare_mut_rate_by_dataclass(maf, genes, geneType, thresh=0.05):
    truncatingClassifications = ["Frame_Shift_Del", "Frame_Shift_Ins", 'Nonsense_Mutation']
    maf = maf[maf['Hugo_Symbol'].isin(genes)]
    cntr = 0
    cntr += 1
    listOfDs = []
    for case in set(maf['Tumor_Sample_Barcode']):
        
        if cntr % 25 == 0: print cntr,
        caseMaf = maf[maf['Tumor_Sample_Barcode'] == case]
        
        
        clonalMaf = caseMaf[caseMaf['isClonal'] == True]
        subclonalMaf = caseMaf[caseMaf['isClonal'] == False]

            #ONLY do this analysis on cases with an adequate number of clonal mutations
        
        if (1.0*clonalMaf.shape[0])/caseMaf.shape[0] > thresh and (1.0*subclonalMaf.shape[0])/caseMaf.shape[0] > thresh:

            clonalMafTrunc = clonalMaf[clonalMaf['Variant_Classification'].isin(truncatingClassifications)]
            subclonalMafTrunc = subclonalMaf[subclonalMaf['Variant_Classification'].isin(truncatingClassifications)]

            clonalMafSynonymous = clonalMaf[(~clonalMaf['Variant_Classification'].isin(truncatingClassifications)) & (clonalMaf['oncogenic'].isnull())]
            subclonalMafSynonymous = subclonalMaf[(~subclonalMaf['Variant_Classification'].isin(truncatingClassifications)) & (subclonalMaf['oncogenic'].isnull())]



            clonalTruncating = list(clonalMafTrunc['Hugo_Symbol'])
            doubleTruncating = [tsg for tsg, count in Counter(clonalTruncating).items() if count > 1]
            oneHitTruncating = [tsg for tsg, count in Counter(clonalTruncating).items() if count == 1]
            notMutated = genes - set(clonalTruncating)

            nmutTruncatingBiallelic = clonalMafTrunc[(clonalMafTrunc['Hugo_Symbol'].isin(doubleTruncating))].shape[0]
            nmutTruncatingOneHit = clonalMafTrunc[(clonalMafTrunc['Hugo_Symbol'].isin(oneHitTruncating))].shape[0]
            nmutSynonymousBiallelic = clonalMafSynonymous[(clonalMafSynonymous['Hugo_Symbol'].isin(doubleTruncating))].shape[0]
            nmutSynonymousOneHit = clonalMafSynonymous[(clonalMafSynonymous['Hugo_Symbol'].isin(oneHitTruncating))].shape[0]

            truncatingBiallelic = None
            truncatingOneHit = None
            if nmutSynonymousBiallelic > 0: truncatingBiallelic = (1.0*nmutTruncatingBiallelic)/nmutSynonymousBiallelic                      
            if nmutSynonymousOneHit > 0: truncatingOneHit = (1.0*nmutTruncatingOneHit)/nmutSynonymousOneHit                                  

            #listOfDs.append({
            #    'truncatingBiallelicSynonymous': nmutSynonymousBiallelic, 'nmutTruncatingBiallelic': nmutTruncatingBiallelic,
            #    'truncatingOneHitSynonymous': nmutSynonymousOneHit, 'nmutTruncatingOneHit': nmutTruncatingOneHit,
            #    'noMut': 0,
            #    'timing': 'Earlier', 'Tumor_Sample_Barcode': case, 'timingMethod': 'clonality', 'geneType': geneType
            #})

            ############################################################
            ############################################################

            subclonalTruncating = list(subclonalMafTrunc['Hugo_Symbol'])

            nmutTruncatingBiallelic = subclonalMafTrunc[(subclonalMafTrunc['Hugo_Symbol'].isin(doubleTruncating))].shape[0]
            nmutTruncatingOneHit = subclonalMafTrunc[(subclonalMafTrunc['Hugo_Symbol'].isin(oneHitTruncating))].shape[0]
            nmutTruncatingNoHit = subclonalMafTrunc[(subclonalMafTrunc['Hugo_Symbol'].isin(notMutated))].shape[0]
            nmutSynonymousBiallelic = subclonalMafSynonymous[(subclonalMafSynonymous['Hugo_Symbol'].isin(doubleTruncating))].shape[0]
            nmutSynonymousOneHit = subclonalMafSynonymous[(subclonalMafSynonymous['Hugo_Symbol'].isin(oneHitTruncating))].shape[0]
            nmutSynonymousNoHit = subclonalMafSynonymous[(subclonalMafSynonymous['Hugo_Symbol'].isin(notMutated))].shape[0]

            truncatingBiallelic = None
            truncatingOneHit = None
            truncatingNoHit = None

            if nmutSynonymousBiallelic > 0: truncatingBiallelic = (1.0*nmutTruncatingBiallelic)/nmutSynonymousBiallelic                      
            if nmutSynonymousOneHit > 0: truncatingOneHit = (1.0*nmutTruncatingOneHit)/nmutSynonymousOneHit                                  
            if nmutSynonymousNoHit > 0: truncatingNoHit = (1.0*nmutTruncatingNoHit)/nmutSynonymousNoHit                                  

            listOfDs.append({
                'early_allelic_status': 'truncatingOneHit', 'val': truncatingOneHit,
                'timing': 'Later', 'Tumor_Sample_Barcode': case, 'timingMethod': 'clonality', 'geneType': geneType
            })
            listOfDs.append({
                'early_allelic_status': 'noMut', 'val': truncatingNoHit,
                'timing': 'Later', 'Tumor_Sample_Barcode': case, 'timingMethod': 'clonality', 'geneType': geneType
            })

    return pd.DataFrame(listOfDs)
        

In [None]:
depMapDf = pd.read_csv(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/achillesDepMap.csv')

In [None]:
listOfDicts = []
for col in depMapDf.columns.values:
    gene = col.split(' ')[0]
    if col != 'Unnamed: 0':
        listOfDicts.append({'Hugo_Symbol': gene, 'score': np.nanmean(depMapDf[col])})
depMapDfZ = pd.DataFrame(listOfDicts)

essentialGenes = set(depMapDfZ[depMapDfZ['score'] < -1]['Hugo_Symbol'])
moderatelyEssentialGenes = set(depMapDfZ[(depMapDfZ['score'] >= -1) & (depMapDfZ['score'] < -.5)]['Hugo_Symbol'])
neutralGenes = set(depMapDfZ[(depMapDfZ['score'] >= -.5)]['Hugo_Symbol'])
impactGenes = set(['ABL1', 'ACVR1', 'AGO2', 'AKT1', 'AKT2', 'AKT3', 'ALK', 'ALOX12B', 'ANKRD11', 'APC', 'AR', 'ARAF', 'ARID1A', 'ARID1B', 'ARID2', 'ARID5B', 'ASXL1', 'ASXL2', 'ATM', 'ATR', 'ATRX', 'AURKA', 'AURKB', 'AXIN1', 'AXIN2', 'AXL', 'B2M', 'BABAM1', 'BAP1', 'BARD1', 'BBC3', 'BCL10', 'BCL2', 'BCL2L1', 'BCL2L11', 'BCL6', 'BCOR', 'BIRC3', 'BLM', 'BMPR1A', 'BRAF', 'BRCA1', 'BRCA2', 'BRD4', 'BRIP1', 'BTK', 'CALR', 'CARD11', 'CARM1', 'CASP8', 'CBFB', 'CBL', 'CCND1', 'CCND2', 'CCND3', 'CCNE1', 'CD274', 'CD276', 'CD79A', 'CD79B', 'CDC42', 'CDC73', 'CDH1', 'CDK12', 'CDK4', 'CDK6', 'CDK8', 'CDKN1A', 'CDKN1B', 'CDKN2A', 'CDKN2B', 'CDKN2C', 'CEBPA', 'CENPA', 'CHEK1', 'CHEK2', 'CIC', 'CREBBP', 'CRKL', 'CRLF2', 'CSDE1', 'CSF1R', 'CSF3R', 'CTCF', 'CTLA4', 'CTNNB1', 'CUL3', 'CXCR4', 'CYLD', 'CYSLTR2', 'DAXX', 'DCUN1D1', 'DDR2', 'DICER1', 'DIS3', 'DNAJB1', 'DNMT1', 'DNMT3A', 'DNMT3B', 'DOT1L', 'DROSHA', 'DUSP4', 'E2F3', 'EED', 'EGFL7', 'EGFR', 'EIF1AX', 'EIF4A2', 'EIF4E', 'ELF3', 'EP300', 'EPAS1', 'EPCAM', 'EPHA3', 'EPHA5', 'EPHA7', 'EPHB1', 'ERBB2', 'ERBB3', 'ERBB4', 'ERCC2', 'ERCC3', 'ERCC4', 'ERCC5', 'ERF', 'ERG', 'ERRFI1', 'ESR1', 'ETV1', 'ETV6', 'EZH1', 'EZH2', 'FAM123B', 'FAM175A', 'FAM46C', 'FAM58A', 'FANCA', 'FANCC', 'FAT1', 'FBXW7', 'FGF19', 'FGF3', 'FGF4', 'FGFR1', 'FGFR2', 'FGFR3', 'FGFR4', 'FH', 'FLCN', 'FLT1', 'FLT3', 'FLT4', 'FOXA1', 'FOXL2', 'FOXO1', 'FOXP1', 'FUBP1', 'FYN', 'GATA1', 'GATA2', 'GATA3', 'GLI1', 'GNA11', 'GNAQ', 'GNAS', 'GPS2', 'GREM1', 'GRIN2A', 'GSK3B', 'H3F3A', 'H3F3B', 'H3F3C', 'HGF', 'HIST1H1C', 'HIST1H2BD', 'HIST1H3A', 'HIST1H3B', 'HIST1H3C', 'HIST1H3D', 'HIST1H3E', 'HIST1H3F', 'HIST1H3G', 'HIST1H3H', 'HIST1H3I', 'HIST1H3J', 'HIST2H3C', 'HIST2H3D', 'HIST3H3', 'HLA-A', 'HLA-B', 'HNF1A', 'HOXB13', 'HRAS', 'ICOSLG', 'ID3', 'IDH1', 'IDH2', 'IFNGR1', 'IGF1', 'IGF1R', 'IGF2', 'IKBKE', 'IKZF1', 'IL10', 'IL7R', 'INHA', 'INHBA', 'INPP4A', 'INPP4B', 'INPPL1', 'INSR', 'IRF4', 'IRS1', 'IRS2', 'JAK1', 'JAK2', 'JAK3', 'JUN', 'KDM5A', 'KDM5C', 'KDM6A', 'KDR', 'KEAP1', 'KIT', 'KLF4', 'KMT2B', 'KMT5A', 'KNSTRN', 'KRAS', 'LATS1', 'LATS2', 'LMO1', 'LYN', 'MALT1', 'MAP2K1', 'MAP2K2', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MAP3K14', 'MAPK1', 'MAPK3', 'MAPKAP1', 'MAX', 'MCL1', 'MDC1', 'MDM2', 'MDM4', 'MED12', 'MEF2B', 'MEN1', 'MET', 'MGA', 'MITF', 'MLH1', 'KMT2A', 'KMT2B', 'KMT2C', 'MPL', 'MRE11A', 'MSH2', 'MSH3', 'MSH6', 'MSI1', 'MSI2', 'MST1', 'MST1R', 'MTOR', 'MUTYH', 'MYC', 'MYCL1', 'MYCN', 'MYD88', 'MYOD1', 'NBN', 'NCOA3', 'NCOR1', 'NEGR1', 'NF1', 'NF2', 'NFE2L2', 'NFKBIA', 'NKX2-1', 'NKX3-1', 'NOTCH1', 'NOTCH2', 'NOTCH3', 'NOTCH4', 'NPM1', 'NRAS', 'NSD1', 'NTHL1', 'NTRK1', 'NTRK2', 'NTRK3', 'NUF2', 'NUP93', 'PAK1', 'PAK7', 'PALB2', 'PARK2', 'PARP1', 'PAX5', 'PBRM1', 'PDCD1', 'PDCD1LG2', 'PDGFRA', 'PDGFRB', 'PDPK1', 'PGR', 'PHOX2B', 'PIK3C2G', 'PIK3C3', 'PIK3CA', 'PIK3CB', 'PIK3CD', 'PIK3CG', 'PIK3R1', 'PIK3R2', 'PIK3R3', 'PIM1', 'PLCG2', 'PLK2', 'PMAIP1', 'PMS1', 'PMS2', 'PNRC1', 'POLD1', 'POLE', 'PPARG', 'PPM1D', 'PPP2R1A', 'PPP4R2', 'PPP6C', 'PRDM1', 'PRDM14', 'PREX2', 'PRKAR1A', 'PRKCI', 'PRKD1', 'PTCH1', 'PTEN', 'PTP4A1', 'PTPN11', 'PTPRD', 'PTPRS', 'PTPRT', 'RAB35', 'RAC1', 'RAC2', 'RAD21', 'RAD50', 'RAD51', 'RAD51C', 'RAD51L1', 'RAD51L3', 'RAD52', 'RAD54L', 'RAF1', 'RARA', 'RASA1', 'RB1', 'RBM10', 'RECQL', 'RECQL4', 'REL', 'RET', 'RFWD2', 'RHEB', 'RHOA', 'RICTOR', 'RIT1', 'RNF43', 'ROS1', 'RPS6KA4', 'RPS6KB2', 'RPTOR', 'RRAGC', 'RRAS', 'RRAS2', 'RTEL1', 'RUNX1', 'RXRA', 'RYBP', 'SDHA', 'SDHAF2', 'SDHB', 'SDHC', 'SDHD', 'SESN1', 'SESN2', 'SESN3', 'SETD2', 'SF3B1', 'SH2B3', 'SH2D1A', 'SHOC2', 'SHQ1', 'SLX4', 'SMAD2', 'SMAD3', 'SMAD4', 'SMARCA4', 'SMARCB1', 'SMARCD1', 'SMO', 'SMYD3', 'SOCS1', 'SOS1', 'SOX17', 'SOX2', 'SOX9', 'SPEN', 'SPOP', 'SPRED1', 'SRC', 'SRSF2', 'STAG2', 'STAT3', 'STAT5A', 'STAT5B', 'STK11', 'STK19', 'STK40', 'SUFU', 'SUZ12', 'SYK', 'TAP1', 'TAP2', 'TBX3', 'TCEB1', 'TCF3', 'TCF7L2', 'TEK', 'TERT', 'TET1', 'TET2', 'TGFBR1', 'TGFBR2', 'TMEM127', 'TMPRSS2', 'TNFAIP3', 'TNFRSF14', 'TOP1', 'TP53', 'TP53BP1', 'TP63', 'TRAF2', 'TRAF7', 'TSC1', 'TSC2', 'TSHR', 'U2AF1', 'UPF1', 'VEGFA', 'VHL', 'VTCN1', 'WHSC1', 'WHSC1L1', 'WT1', 'WWTR1', 'XIAP', 'XPO1', 'XRCC2', 'YAP1', 'YES1', 'ZFHX3', 'ZRSR2'])
neutralGenes = neutralGenes - impactGenes
tsgs = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])
oncogenes = impactGenes - tsgs

In [None]:
clonalThresh = .8
allExomeHypermutantMaf['isClonal'] = allExomeHypermutantMaf['ccf_Mcopies_lower'].apply(lambda x: True if x > clonalThresh else False)

In [None]:
#allExomeHypermutantMafFlat = allExomeHypermutantMaf[allExomeHypermutantMaf['Tumor_Sample_Barcode'].isin(flatGenomeHypermutated)]
allExomeHypermutantMafFlat['t_var_freq'] = allExomeHypermutantMaf['t_alt_count']/allExomeHypermutantMaf['t_depth']
clonalityAnnotatedFlatMaf = clonality_analysis_util.assign_clonality_information_for_hypermutated_cases(allExomeHypermutantMafFlat, flatGenomeHypermutated, set([]))


In [None]:
#TEMP METHOD FOR hypermutated
flatGenomeHypermutated = set([tid for tid, count in dict(allExomeHypermutantMaf[allExomeHypermutantMaf['purity'].isnull()]['Tumor_Sample_Barcode'].value_counts()).items() if count > 3000])
exomeMafNotFlat = allExomeHypermutantMaf[~allExomeHypermutantMaf['Tumor_Sample_Barcode'].isin(flatGenomeHypermutated)]

combinedClonalityAnnotatedMaf = pd.concat([exomeMafNotFlat, clonalityAnnotatedFlatMaf])
combinedClonalityAnnotatedMaf = combinedClonalityAnnotatedMaf[combinedClonalityAnnotatedMaf['isClonal'].notnull()]


In [None]:
print 'tsg'
summaryDfTsg = compare_mut_rate_by_dataclass(combinedClonalityAnnotatedMaf, tsgs, 'tsg')
print 'oncogene'
summaryDfOncogene = compare_mut_rate_by_dataclass(combinedClonalityAnnotatedMaf, oncogenes, 'oncogene')
print 'essential'
summaryDfStrongEssential = compare_mut_rate_by_dataclass(combinedClonalityAnnotatedMaf, essentialGenes, 'strong essential')
print 'moderate essential'
summaryDfModerateEssential = compare_mut_rate_by_dataclass(combinedClonalityAnnotatedMaf, moderatelyEssentialGenes, 'moderate essential')
print 'neutral'
summaryDfNeutral = compare_mut_rate_by_dataclass(combinedClonalityAnnotatedMaf, neutralGenes, 'neutral')


In [None]:
combinedDf = pd.concat([summaryDfTsg, summaryDfOncogene, summaryDfStrongEssential, summaryDfModerateEssential, summaryDfNeutral])
combinedDf['geneAndStatus'] = combinedDf['geneType'] + '_' + combinedDf['early_allelic_status']
combinedDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/mutRateByGeneType.tsv', index=False, sep='\t')
