In [8]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re
import scipy.stats as stats
from collections import Counter

# NOTE(review): hard-coded absolute path to a single user's machine. The modules
# imported below are presumably resolved from this directory, so the notebook
# will not import anywhere else until this is made configurable.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
import mutationSigUtils
import maf_analysis_utils
import analysis_utils 
import get_gene_and_cohort_list_utils

# Lookup of project file paths; later cells index it by string key
# (e.g. 'EXPECTED_MUTATION_INFO_BY_GENE', 'CANCER_TYPE_INFO').
filePathDict = configuration_util.get_all_files_path_dict()

In [3]:
def get_n_consequential_mut_count(mafPath, cases=None, returnNmut=False):
    """Count hotspot, oncogenic and stop-gain mutations per sample in a MAF.

    The MAF at ``mafPath`` is read with ``pd.read_table`` and restricted to rows
    whose 'Hugo_Symbol' is in ``get_gene_and_cohort_list_utils.get_im3_genes()``.
    Rows are then counted per 'Tumor_Sample_Barcode' for three subsets:
    'is-a-hotspot' == 'Y', non-null 'oncogenic', and
    'Consequence' == 'stop_gained'.

    Parameters
    ----------
    mafPath : str
        Path to a tab-separated MAF containing the columns named above.
    cases : iterable, optional
        Sample barcodes that must appear in every returned dict; a barcode with
        no qualifying mutation gets the value 0. When omitted, the notebook
        global ``expectedDf['case']`` is used (the original behaviour), so
        ``expectedDf`` must already be defined when the function is called.
    returnNmut : bool, optional
        When True, the total per-sample mutation count (zero-filled for
        ``cases`` as well) is returned as an additional first element. The
        default keeps the original three-element return.

    Returns
    -------
    tuple of dict
        ``(hotspotDict, oncogenicDict, stopGainDict)`` by default, or
        ``(nmutDict, hotspotDict, oncogenicDict, stopGainDict)`` when
        ``returnNmut`` is True. Each dict maps barcode -> count.
    """

    def count_per_sample(muts):
        # value_counts gives one entry per barcode present in `muts`
        return dict(muts['Tumor_Sample_Barcode'].value_counts())

    allImpactMuts = pd.read_table(mafPath)
    im3Genes = get_gene_and_cohort_list_utils.get_im3_genes()
    allImpactMuts341 = allImpactMuts[allImpactMuts['Hugo_Symbol'].isin(im3Genes)]

    nmutDict = count_per_sample(allImpactMuts341)
    hotspotDict = count_per_sample(allImpactMuts341[allImpactMuts341['is-a-hotspot'] == 'Y'])
    oncogenicDict = count_per_sample(allImpactMuts341[allImpactMuts341['oncogenic'].notnull()])
    stopGainDict = count_per_sample(allImpactMuts341[allImpactMuts341['Consequence'] == 'stop_gained'])

    # Fall back to the notebook-level table only when the caller gives no cases;
    # materialise as a set so a one-shot iterable can be walked more than once.
    if cases is None:
        cases = expectedDf['case']
    cases = set(cases)

    #add entries with the value 0 where necessary
    dictsToFill = [hotspotDict, oncogenicDict, stopGainDict]
    if returnNmut:
        dictsToFill.append(nmutDict)
    for countDict in dictsToFill:
        for case in cases:
            countDict.setdefault(case, 0)

    if returnNmut:
        return nmutDict, hotspotDict, oncogenicDict, stopGainDict
    return hotspotDict, oncogenicDict, stopGainDict

In [4]:
def summarize_dominant_signatures_of_cases(mutClassificationDir):
    """Map every hypermutated sample to its dominant signature.

    Each '.tsv' file directly inside ``mutClassificationDir`` is read with
    ``pd.read_table``. Rows whose 'hypermutantClassification' equals
    'Hypermutated' contribute one entry:
    'Tumor_Sample_Barcode' -> 'dominantSignature'.

    Directory entries that are not regular '.tsv' files (for example
    '.DS_Store' or sub-directories) are skipped instead of being parsed.
    Files are visited in sorted name order, so when a barcode appears more
    than once the last occurrence in that order is the one kept.

    Parameters
    ----------
    mutClassificationDir : str
        Directory holding the per-cohort classification tables.

    Returns
    -------
    dict
        barcode -> dominant signature, for hypermutated samples only.
    """
    d = {}
    for f in sorted(os.listdir(mutClassificationDir)):
        filePath = os.path.join(mutClassificationDir, f)
        if not f.endswith('.tsv') or not os.path.isfile(filePath):
            continue
        df = pd.read_table(filePath)
        hypermutatedDf = df[df['hypermutantClassification'] == 'Hypermutated']
        # update() with pairs keeps the same "later entry overwrites" rule as before
        d.update(zip(hypermutatedDf['Tumor_Sample_Barcode'], hypermutatedDf['dominantSignature']))
    return d

In [12]:
# TODO: run this code on all IMPACT cases.
# Expectation table loaded from the configured path; later cells read its
# 'case', 'hotspotChance' and 'oncogenicChance' columns.
expectedDf = pd.read_table(filePathDict['EXPECTED_MUTATION_INFO_BY_GENE'])

In [13]:
# Build one row per case comparing observed counts with the expected counts
# (nmut * summed per-gene chance from expectedDf).
#
# NOTE(review): nmutDict, hotspotDict, oncogenicDict and stopGainDict are not
# assigned by any visible cell, so this cell raises NameError on a fresh
# kernel. They must be built first; get_n_consequential_mut_count produces the
# three observed-count dicts -- TODO add the call with the right MAF path.
listOfDicts = []
# groupby hands over each case's rows in a single pass (instead of re-filtering
# the whole table per case) and visits cases in sorted, reproducible order.
# Rows whose 'case' is NaN are not visited.
for cntr, (case, caseExpectation) in enumerate(expectedDf.groupby('case')):

    # progress marker every 100 cases; print(...) is valid in Python 2 and 3
    if cntr % 100 == 0:
        print(cntr)

    nmutCase = nmutDict[case]
    hotspotExpectation = nmutCase * sum(caseExpectation['hotspotChance'])
    oncogenicExpectation = nmutCase * sum(caseExpectation['oncogenicChance'])

    observedHotspots = hotspotDict[case]
    observedOncogenic = oncogenicDict[case]
    observedStopGain = stopGainDict[case]

    listOfDicts.append({'obsHotspot': observedHotspots, 'obsOncogenic': observedOncogenic, 'obsStopGain': observedStopGain,
                        'expectedHotspot': hotspotExpectation, 'expectedOncogenic': oncogenicExpectation,
                        'nmut': nmutCase, 'Tumor_Sample_Barcode': case})

df = pd.DataFrame(listOfDicts)
    

0


NameError: name 'nmutDict' is not defined

In [5]:
# Per-sample lookups built by project helpers; a later cell indexes both by
# Tumor_Sample_Barcode. Presumably the values are a cancer-type label and a
# dominant-signature label respectively -- TODO confirm against the helpers.
cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(filePathDict['CANCER_TYPE_INFO'])
domSigDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath = filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])


In [6]:
# Annotate each sample; barcodes missing from a lookup get None.
df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(lambda x: domSigDict.get(x))
df['cancerType'] = df['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict.get(x))

#reduce stuff to other
# The collapsing map gets its own name: rebinding domSigDict here would replace
# the per-sample lookup, and re-running this cell would then label every
# sample 'Other'.
signatureGroupDict = {'mean_1': 'MMR', 'mean_10': 'POLE', 'mean_11': 'TMZ',
                      'mean_14': 'POLE', 'mean_7': 'UV', 'mean_MMR': 'MMR',
                      'mean_SMOKING': 'SMOKING', 'mean_APOBEC': 'APOBEC'}
df['dominantSignature'] = df['dominantSignature'].apply(lambda x: signatureGroupDict.get(x, 'Other'))

NameError: name 'df' is not defined

In [None]:
# Cancer types reported individually; every other value of 'cancerType'
# (including missing ones) is relabelled 'Other' in 'cancerTypeAdj'.
cancerTypesToFocusOn = {
    'Non-Small Cell Lung Cancer',
    'Colorectal Cancer',
    'Prostate Cancer',
    'Glioma',
    'Endometrial Cancer',
    'Esophagogastric Cancer',
}
df['cancerTypeAdj'] = [
    cancerType if cancerType in cancerTypesToFocusOn else 'Other'
    for cancerType in df['cancerType']
]

In [None]:
# Write the per-case table as a TSV under the current working directory.
writeDir = os.path.join(os.getcwd(), 'FIGURE1_PLOTTING_FILES')
# to_csv does not create missing parent directories, so make sure it exists
# (isdir + makedirs rather than exist_ok keeps this valid on Python 2).
if not os.path.isdir(writeDir):
    os.makedirs(writeDir)
df.to_csv(os.path.join(writeDir, 'figure1e_observedVsExpected.tsv'), index=False, sep='\t')
