In [2]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
import re
from collections import Counter


# Make the project's utility scripts importable before importing them below.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine without editing it.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
import get_gene_and_cohort_list_utils
# Lookup of input file paths; the rest of the notebook indexes it by string key
# (e.g. filePathDict['IMPACT_BASE_MAF']).
filePathDict = configuration_util.get_all_files_path_dict()

## TABLE 1
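Summary of IMPACT data <br>
Columns: **DMP, TMB, IS_MSI, DOMINANT_SIGNATURE, CANCER_TYPE, HYPERMUTATION_STATUS, N_ONCOGENIC, N_HOTSPOT, N_TRUNCATING, N_TRUNCATING_TSG, N_TRUNCATING_ONCOGENE, N_HOTSPOT_TSG, N_HOTSPOT_ONCOGENE**

Note: `create_table_one` below currently names the count columns `N_<geneType>_<mutType>` (gene types `tsg`, `oncogene`, `all`; mutation types `oncogenic`, `hotspot`, `stopGain`, `frameShiftIndel`), e.g. `N_tsg_hotspot`, rather than the `N_*` names listed above.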

In [33]:
def get_per_case_mut_info(muts, mutType='oncogenic', geneType='all'):
    """Count mutations per case after optional gene-class and mutation-type filters.

    Parameters
    ----------
    muts : pandas.DataFrame
        MAF-style frame. Must contain 'Tumor_Sample_Barcode'. Depending on the
        requested filters it must also contain 'Hugo_Symbol' (geneType 'tsg' or
        'oncogene'), 'oncogenic' (mutType 'oncogenic'), 'is-a-hotspot'
        (mutType 'hotspot') or 'Variant_Classification' (mutType 'stopGain' or
        'frameShiftIndel').
    mutType : str
        'oncogenic'       -> rows whose 'oncogenic' value is not null
        'hotspot'         -> rows whose 'is-a-hotspot' value is 'Y'
        'stopGain'        -> rows classified 'Nonsense_Mutation'
        'frameShiftIndel' -> rows classified 'Frame_Shift_Del' or 'Frame_Shift_Ins'
        Any other value applies no mutation-type filter.
    geneType : str
        'tsg' or 'oncogene' restricts rows to the corresponding gene list from
        get_gene_and_cohort_list_utils; any other value (e.g. 'all') keeps
        every gene.

    Returns
    -------
    dict
        Maps each Tumor_Sample_Barcode that has at least one remaining row to
        its number of remaining rows. Cases with no remaining rows are absent.
    """
    # Load a gene list only when it is actually used for filtering. This
    # function is called once per (mutType, geneType) combination, so loading
    # both lists unconditionally repeats the work for no benefit.
    if geneType == 'tsg':
        tsgs = get_gene_and_cohort_list_utils.get_tsgs()
        muts = muts[muts['Hugo_Symbol'].isin(tsgs)]
    elif geneType == 'oncogene':
        oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
        muts = muts[muts['Hugo_Symbol'].isin(oncogenes)]

    if mutType == 'oncogenic':
        muts = muts[muts['oncogenic'].notnull()]
    elif mutType == 'hotspot':
        muts = muts[muts['is-a-hotspot'] == 'Y']
    elif mutType == 'stopGain':
        muts = muts[muts['Variant_Classification'] == 'Nonsense_Mutation']
    elif mutType == 'frameShiftIndel':
        muts = muts[muts['Variant_Classification'].isin(['Frame_Shift_Del', 'Frame_Shift_Ins'])]

    return dict(muts['Tumor_Sample_Barcode'].value_counts())

def create_table_one(maf):
    """Assemble Table 1: one summary row per case present in the IMPACT MAF.

    Parameters
    ----------
    maf : pandas.DataFrame
        MAF-style frame; every distinct 'Tumor_Sample_Barcode' becomes a row.
        It is also passed to get_per_case_mut_info for the mutation counts.

    Returns
    -------
    pandas.DataFrame
        Columns DMP, TMB, IS_MSI, DOMINANT_SIGNATURE, CANCER_TYPE,
        HYPERMUTATION_STATUS, plus one 'N_<geneType>_<mutType>' count column
        for every combination of gene type and mutation type.

    Raises
    ------
    KeyError
        If a case in the MAF has no entry in the TMB lookup.
    """
    mutTypes = ['oncogenic', 'hotspot', 'stopGain', 'frameShiftIndel']
    geneTypes = ['tsg', 'oncogene', 'all']
    utils = get_gene_and_cohort_list_utils

    # Per-case annotations loaded from the project's configured files.
    hypermutantIds = utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
    normalIds = utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
    tmbDict = utils.get_all_tmb_info(tmbFilePath=filePathDict['CASE_TMB_AND_MSI_STATS'])
    dominantSignatureDict = utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
    msiCases = utils.get_msi_cases(msiInfoFilePath=filePathDict['CASE_TMB_AND_MSI_STATS'])
    cancerTypeDict = utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])

    # (label, {case: count}) pairs, computed once and kept in output-column order.
    labelledCounts = []
    for mutType in mutTypes:
        for geneType in geneTypes:
            label = geneType + '_' + mutType
            labelledCounts.append((label, get_per_case_mut_info(maf, mutType=mutType, geneType=geneType)))

    rows = []
    for case in set(maf['Tumor_Sample_Barcode']):
        if case in hypermutantIds:
            status = 'HYPERMUTATED'
        elif case in normalIds:
            status = 'NORMAL'
        else:
            status = 'INDETERMINATE'

        row = {'DMP': case}
        row['TMB'] = tmbDict[case]
        row['IS_MSI'] = case in msiCases
        row['DOMINANT_SIGNATURE'] = dominantSignatureDict.get(case)
        row['CANCER_TYPE'] = cancerTypeDict.get(case)
        row['HYPERMUTATION_STATUS'] = status

        # Cases missing from a count dict had no matching mutations.
        for label, counts in labelledCounts:
            row['N_' + label] = counts.get(case, 0)

        rows.append(row)

    return pd.DataFrame(rows)
    
    
    

In [34]:
# Load the IMPACT MAF, build Table 1 from it, and write the result as a
# tab-separated file without the index column.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
dfTableOne = create_table_one(allImpactMutsMaf)
# NOTE(review): absolute, user-specific output path; consider deriving it from configuration.
dfTableOne.to_csv('/Users/friedman/Desktop/hypermutationProjectFinal/tables/table1.tsv', index=False, sep='\t')

  """Entry point for launching an IPython kernel.


## TABLE 2
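Summary of EXOME hypermutator data <br>
Columns: **TUMOR_SAMPLE_BARCODE, TMB, CANCER_TYPE, COHORT, N_TRUNCATING_TSG, N_TRUNCATING_ONCOGENE, N_TRUNCATING_ESSENTIAL, N_TRUNCATING_NEUTRAL**

Note: `create_table_two` below currently writes the barcode column as `Tumor_Sample_Barcode` and the count columns as `N_truncating_<geneType>` (`tsg`, `oncogene`, `essential`, `neutral`). It also adds `NEUTRAL_TRUNCATING_RATE`, `ESSENTIAL_TRUNCATING_RATE`, `ONCOGENE_TRUNCATING_RATE` and `TSG_TRUNCATING_RATE`: truncating mutations per megabase of total gene length in each class.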

In [78]:
def get_n_truncating_by_gene_class(maf, genes):
    """Count truncating mutations per case within a set of genes.

    A row counts as truncating when its 'Variant_Classification' is
    'Nonsense_Mutation', 'Frame_Shift_Del' or 'Frame_Shift_Ins'.

    Parameters
    ----------
    maf : pandas.DataFrame
        Needs 'Hugo_Symbol', 'Variant_Classification' and 'Tumor_Sample_Barcode'.
    genes : list-like
        Gene symbols to keep.

    Returns
    -------
    dict
        Tumor_Sample_Barcode -> number of matching rows; cases with no
        matching rows are absent.
    """
    inGeneClass = maf['Hugo_Symbol'].isin(genes)
    isTruncating = maf['Variant_Classification'].isin(
        ['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins'])
    perCaseCounts = maf.loc[inGeneClass & isTruncating, 'Tumor_Sample_Barcode'].value_counts()
    return dict(perCaseCounts)
    
def create_table_two(maf):
    """Assemble Table 2: one summary row per case in the exome hypermutator MAF.

    Parameters
    ----------
    maf : pandas.DataFrame
        MAF-style frame with 'Tumor_Sample_Barcode', 'Variant_Classification',
        'Hugo_Symbol' and 'cohort' columns.

    Returns
    -------
    pandas.DataFrame
        Per case: Tumor_Sample_Barcode, COHORT, TMB (non-synonymous mutation
        count divided by EXOME_TMB_DENOMINATOR), CANCER_TYPE,
        N_truncating_<geneType> for tsg/oncogene/essential/neutral genes, and
        the matching <GENETYPE>_TRUNCATING_RATE columns (truncating mutations
        per 1e6 nucleotides of summed gene length in that class).
    """
    EXOME_TMB_DENOMINATOR = 30.0  # non-synonymous mutation count is divided by this to get TMB; MUST BE A FLOAT
    nonSynonymousClassifications = ["Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "Translation_Start_Site"]
    nonSynomMaf = maf[maf['Variant_Classification'].isin(nonSynonymousClassifications)]
    nmutDict = dict(nonSynomMaf['Tumor_Sample_Barcode'].value_counts())
    cohortDict = dict(zip(maf['Tumor_Sample_Barcode'], maf['cohort']))

    tcgaCancerTypeDict = get_gene_and_cohort_list_utils.get_tcga_cancer_type_info(tcgaInfoPath=filePathDict['TCGA_CANCER_TYPE_INFO'])
    impactCancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])

    tsgs = get_gene_and_cohort_list_utils.get_tsgs()
    oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
    essentialGenes = get_gene_and_cohort_list_utils.get_essential_genes(depMapPath=filePathDict['DEP_MAP_DATA'], mode='getEssentialGenes')
    neutralGenes = get_gene_and_cohort_list_utils.get_cancer_neutral_genes(depMapPath=filePathDict['DEP_MAP_DATA'])
    geneTypes = {'tsg': tsgs, 'oncogene': oncogenes, 'essential': essentialGenes, 'neutral': neutralGenes}

    countDicts = {}
    for geneType, genes in geneTypes.items():
        countDicts[geneType] = get_n_truncating_by_gene_class(nonSynomMaf, genes)

    listOfDicts = []
    for case in set(maf['Tumor_Sample_Barcode']):
        localD = {'Tumor_Sample_Barcode': case}
        localD['COHORT'] = cohortDict[case]
        # Cases are taken from the full MAF but counted in the non-synonymous
        # subset, so a case with only synonymous rows has no entry in nmutDict.
        # Treat that as zero mutations instead of raising KeyError.
        localD['TMB'] = nmutDict.get(case, 0)/EXOME_TMB_DENOMINATOR
        # The TCGA lookup is tried with the first 12 characters of the
        # barcode; the IMPACT lookup uses the full barcode.
        if case[:12] in tcgaCancerTypeDict:
            localD['CANCER_TYPE'] = tcgaCancerTypeDict[case[:12]]
        elif case in impactCancerTypeDict:
            localD['CANCER_TYPE'] = impactCancerTypeDict[case]
        else:
            localD['CANCER_TYPE'] = None
        for geneType, geneDict in countDicts.items():
            localD['N_truncating_' + geneType] = geneDict.get(case, 0)
        listOfDicts.append(localD)

    df = pd.DataFrame(listOfDicts)

    # Summarize per-megabase rates of truncating mutation for each gene class.
    geneLengthInfo = pd.read_table(filePathDict['GENE_LENGTH_INFO'])
    geneLengthDict = dict(zip(geneLengthInfo['hgnc_symbol'], geneLengthInfo['nt.length']))

    # Order matches the original column order of the output table.
    rateColumns = [('NEUTRAL', 'neutral'), ('ESSENTIAL', 'essential'), ('ONCOGENE', 'oncogene'), ('TSG', 'tsg')]
    for columnPrefix, geneType in rateColumns:
        # Set membership keeps the panel-size sum linear in the number of
        # genes (assumes gene symbols are hashable, e.g. strings).
        geneSet = set(geneTypes[geneType])
        panelSize = sum(length for symbol, length in geneLengthDict.items() if symbol in geneSet)
        df[columnPrefix + '_TRUNCATING_RATE'] = (1.0*1e6*df['N_truncating_' + geneType])/panelSize

    return df
        
    

In [31]:
# Load the exome hypermutator MAF. This is done in its own cell so the table
# can be rebuilt below without re-reading the file.
allExomeMaf = pd.read_table(filePathDict['ALL_EXOME_HYPERMUTATOR_MAF'])

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [79]:
# Build Table 2 from the exome MAF loaded in the previous cell and write it as
# a tab-separated file without the index column.
dfTableTwo = create_table_two(allExomeMaf)
# NOTE(review): absolute, user-specific output path; consider deriving it from configuration.
dfTableTwo.to_csv('/Users/friedman/Desktop/hypermutationProjectFinal/tables/table2.tsv', index=False, sep='\t')




## Table 3
Currently, Table 3 is the table of all possible mutations in IMPACT.

## Table 4
Currently, Table 4 gives the chance of an x_type mutation in each gene, based on the signatures present in the case.

## Table 5
Gene mutation information

In [93]:
def create_table_five(maf, clonalInfoMaf):
    """Summarize driver mutations per (cancer type, gene) in hypermutated cases.

    Parameters
    ----------
    maf : pandas.DataFrame
        IMPACT MAF with 'Tumor_Sample_Barcode', 'oncogenic', 'Hugo_Symbol',
        'Variant_Classification', 'Start_Position' and 'Tumor_Seq_Allele2'.
    clonalInfoMaf : pandas.DataFrame
        MAF with clonality annotation: 'Tumor_Sample_Barcode', 'Hugo_Symbol',
        'Start_Position', 'Tumor_Seq_Allele2' and 'isClonal'.

    Neither input frame is modified.

    Returns
    -------
    pandas.DataFrame
        One row per (cancer type seen among the hypermutated driver mutations,
        IMPACT gene) with columns GENE, IS_RELATED (always None here),
        CANCER_TYPE, N_PHASE_TRANS, N_PHASE_CIS, DNDS_NORMAL_SCORE,
        DNDS_HYPER_SCORE, N_MISSENSE_DRIVER, N_TRUNCATING_DRIVER,
        FRACTION_DRIVERS_CLONAL and N_DRIVER. Values are missing where the
        corresponding source has no entry for that cancer type / gene pair.
    """

    def cancer_type_gene_key(frame, geneColumn='Hugo_Symbol'):
        # '<cancerType>_<gene>' per row. str() is applied per element so a
        # missing cancer type yields a string key instead of raising.
        return frame['cancerType'].map(str) + '_' + frame[geneColumn].map(str)

    def variant_uuid(frame):
        # Identifier used to match the same variant across the two MAFs.
        return (frame['Tumor_Sample_Barcode'].map(str) + '_' +
                frame['Start_Position'].map(str) + '_' +
                frame['Tumor_Seq_Allele2'].map(str))

    def get_dnds_summary_information(dndsData):
        # Returns ({key: normal q-value}, {key: hypermutated q-value}).
        # NOTE(review): values are summed per key; that only leaves the
        # q-values unchanged if each cancerType/gene pair occurs once - confirm.
        scoreColumns = ['qglobal_cv.Normal', 'qglobal_cv.Hypermutated']
        keys = cancer_type_gene_key(dndsData, geneColumn='gene_name')
        m_dndsData = dndsData[scoreColumns].groupby(keys).sum()
        return m_dndsData['qglobal_cv.Normal'].to_dict(), m_dndsData['qglobal_cv.Hypermutated'].to_dict()

    def get_maf_summary_information(maf):
        # Returns ({key: n non-truncating drivers}, {key: n truncating drivers}).
        # The "missense" bucket is every driver that is not a nonsense or
        # frameshift mutation, not strictly Missense_Mutation rows.
        truncatingConsequences = ['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins']
        keys = cancer_type_gene_key(maf)
        isTruncating = maf['Variant_Classification'].isin(truncatingConsequences)
        return keys[~isTruncating].value_counts().to_dict(), keys[isTruncating].value_counts().to_dict()

    def get_phasing_summary_information(phasingMaf):
        # Returns ({key: n trans pairs}, {key: n cis pairs}), counting only
        # pairs where both mutations are annotated as oncogenic.
        phasingMaf = phasingMaf[(phasingMaf['oncogenic.1'].notnull()) & (phasingMaf['oncogenic.2'].notnull())]
        keys = cancer_type_gene_key(phasingMaf)
        transKeys = keys[phasingMaf['phase'] == 'trans']
        cisKeys = keys[phasingMaf['phase'] == 'cis']
        return transKeys.value_counts().to_dict(), cisKeys.value_counts().to_dict()

    def summarize_clonality_info(clonalInfoMaf):
        # Returns {key: fraction of mutations called clonal}. Mutations whose
        # 'isClonal' is neither True nor False are ignored by the mean.
        isClonalBinary = clonalInfoMaf['isClonal'].apply(
            lambda x: 1.0 if x == True else 0.0 if x == False else np.nan)
        keys = cancer_type_gene_key(clonalInfoMaf)
        return isClonalBinary.groupby(keys).mean().to_dict()

    # THIS table is only about drivers in hypermutated cases
    hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
    cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])

    def annotate_cancer_type(barcodes):
        # Cancer type per barcode; None when the case is not in the lookup.
        return barcodes.apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)

    # Driver summaries
    print('summarizing drivers')
    maf = maf[maf['Tumor_Sample_Barcode'].isin(hypermutantIds)]
    # .copy() so the column assignments below act on an independent frame
    # rather than a slice of the caller's data.
    maf = maf[maf['oncogenic'].notnull()].copy()
    maf['varUuid'] = variant_uuid(maf)
    driverVariants = set(maf['varUuid'])
    maf['cancerType'] = annotate_cancer_type(maf['Tumor_Sample_Barcode'])
    missenseDict, truncatingDict = get_maf_summary_information(maf)

    # Phasing summaries
    print('summarizing phasing')
    phasingSummary = pd.read_table(filePathDict['PHASING_SUMMARY'])  # note currently we ignore cases called 'trans or separate cells'
    phasingSummary = phasingSummary[phasingSummary['Tumor_Sample_Barcode'].isin(hypermutantIds)].copy()
    phasingSummary['cancerType'] = annotate_cancer_type(phasingSummary['Tumor_Sample_Barcode'])
    transDict, cisDict = get_phasing_summary_information(phasingSummary)

    # DNDS summaries
    print('summarizing dnds data')
    dndsData = pd.read_table(filePathDict['DNDS_RESULTS'])
    # Underscores are replaced with spaces before building the lookup keys.
    dndsData['cancerType'] = dndsData['cancerType'].apply(lambda x: re.sub('_', ' ', x))
    dndsNormalDict, dndsHyperDict = get_dnds_summary_information(dndsData)

    # Clonality summaries, restricted to the driver variants found above
    print('summarizing clonality info')
    clonalInfoMaf = clonalInfoMaf[clonalInfoMaf['Tumor_Sample_Barcode'].isin(hypermutantIds)].copy()
    clonalInfoMaf['varUuid'] = variant_uuid(clonalInfoMaf)
    clonalInfoMaf = clonalInfoMaf[clonalInfoMaf['varUuid'].isin(driverVariants)].copy()
    clonalInfoMaf['cancerType'] = annotate_cancer_type(clonalInfoMaf['Tumor_Sample_Barcode'])
    clonalityDict = summarize_clonality_info(clonalInfoMaf)

    impactGenes = get_gene_and_cohort_list_utils.get_im6_genes()
    listOfDicts = []
    for cancerType in set(maf['cancerType']):
        for gene in impactGenes:
            # str() matches how the summary keys were built and avoids a
            # TypeError when a case has no cancer type (None).
            key = str(cancerType) + '_' + gene
            localD = {'GENE': gene, 'IS_RELATED': None, 'CANCER_TYPE': cancerType}
            localD['N_PHASE_TRANS'] = transDict.get(key)
            localD['N_PHASE_CIS'] = cisDict.get(key)
            localD['DNDS_NORMAL_SCORE'] = dndsNormalDict.get(key)
            localD['DNDS_HYPER_SCORE'] = dndsHyperDict.get(key)
            localD['N_MISSENSE_DRIVER'] = missenseDict.get(key)
            localD['N_TRUNCATING_DRIVER'] = truncatingDict.get(key)
            localD['FRACTION_DRIVERS_CLONAL'] = clonalityDict.get(key)

            listOfDicts.append(localD)
    df = pd.DataFrame(listOfDicts)

    # A plain '+' yields NaN whenever either count is missing, which dropped
    # the total for genes with only one kind of driver. Treat a missing count
    # as zero, but keep N_DRIVER missing when both counts are missing.
    nMissense = pd.to_numeric(df['N_MISSENSE_DRIVER'])
    nTruncating = pd.to_numeric(df['N_TRUNCATING_DRIVER'])
    hasAnyCount = nMissense.notnull() | nTruncating.notnull()
    df['N_DRIVER'] = (nMissense.fillna(0) + nTruncating.fillna(0)).where(hasAnyCount)
    return df


In [97]:
# Reload the IMPACT MAF and the clonality-annotated MAF, build Table 5 and
# write it as a tab-separated file without the index column.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
# NOTE(review): this file is read with read_csv (comma-separated by default)
# while the other MAFs use read_table - confirm the file's delimiter.
clonalityMaf = pd.read_csv(filePathDict['IMPACT_MAF_WITH_ADJUSTED_CLONALITY_ANNOTATION'])
dfTableFive = create_table_five(allImpactMutsMaf, clonalityMaf)
# NOTE(review): absolute, user-specific output path; consider deriving it from configuration.
dfTableFive.to_csv('/Users/friedman/Desktop/hypermutationProjectFinal/tables/table5.tsv', index=False, sep='\t')