In [2]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re
import scipy.stats as stats
from collections import Counter

sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
filePathDict = configuration_util.get_all_files_path_dict()
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
#import mutation_modeling_util
import get_gene_and_cohort_list_utils

In [None]:
# Load the annotated mutation MAF through the project loader; nLinesFile is passed to the loader
# (presumably the expected line count used for its progress display -- TODO confirm).
# NOTE(review): `pathPrefix` is not assigned in any cell visible in this notebook; it must be defined before this cell runs.
allImpactMuts = analysis_utils.load_in_df_with_progress(filePath = pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/data_mutations_extended_annotated_sigContext_nov19_2019.maf', nLinesFile = 331821)

In [None]:

# Simulation summary table. Later cells read the columns '<quadNuc>', '<quadNuc>_silent',
# '<quadNuc>_hotspot' and '<quadNuc>_oncogenic' from it to total the possible mutations per motif.
simDfSummary = pd.read_table('/Users/friedman/Desktop/workOffline/mutSimulationInfoIncludingHotspots.tsv')

In [None]:
def calculate_n_muts_of_category(df, ignoreMotifs = ['TCAT', 'TCAA', 'TCTG', 'TCTT']):
    """Total the per-motif count columns of `df`, leaving out the motifs in `ignoreMotifs`.

    For each of the 96 quadnuc labels (5' base + two-letter change such as 'CA'
    for C>A + 3' base) that is not ignored, the columns '<motif>',
    '<motif>_silent', '<motif>_hotspot' and '<motif>_oncogenic' are summed.

    Parameters:
        df: DataFrame holding the four columns above for every non-ignored motif.
        ignoreMotifs: container of quadnuc labels to exclude from the totals.

    Returns:
        (non-silent total, hotspot total, oncogenic total), where the non-silent
        total is the '<motif>' total minus the '<motif>_silent' total.
    """
    # Enumerate the motifs in the same order as the 96-label list, keeping only the wanted ones.
    keptMotifs = []
    for fivePrime in ['A', 'C', 'G', 'T']:
        for change in ['CA', 'CG', 'CT', 'TA', 'TC', 'TG']:
            for threePrime in ['A', 'C', 'G', 'T']:
                motif = fivePrime + change + threePrime
                if motif not in ignoreMotifs:
                    keptMotifs.append(motif)

    def column_total(suffix):
        # Running total over the kept motifs for one column family.
        running = 0
        for motif in keptMotifs:
            running += sum(df[motif + suffix])
        return running

    nAll = column_total('')
    nSilent = column_total('_silent')
    nHotspot = column_total('_hotspot')
    nOncogenic = column_total('_oncogenic')
    return nAll - nSilent, nHotspot, nOncogenic
        

In [None]:
def calculate_n_possible_muts_by_category_and_sig(sigMotifs):
    """Split the totals in the global `simDfSummary` into signature-motif and other-motif parts.

    Parameters:
        sigMotifs: quadnuc labels belonging to the signature of interest; each
            must have '<motif>', '<motif>_silent', '<motif>_hotspot' and
            '<motif>_oncogenic' columns in `simDfSummary`.

    Returns:
        A 6-tuple: (non-silent, hotspot, oncogenic) totals at the signature
        motifs, followed by the same three totals over all remaining motifs as
        computed by calculate_n_muts_of_category with `sigMotifs` ignored.
    """
    def motif_total(suffix):
        # Sum one column family over the signature motifs only.
        return sum([sum(simDfSummary[motif + suffix]) for motif in sigMotifs])

    nonSilentAtSig = motif_total('') - motif_total('_silent')
    hotspotAtSig = motif_total('_hotspot')
    oncogenicAtSig = motif_total('_oncogenic')

    nonSilentOffSig, hotspotOffSig, oncogenicOffSig = calculate_n_muts_of_category(simDfSummary, ignoreMotifs = sigMotifs)
    return (nonSilentAtSig, hotspotAtSig, oncogenicAtSig,
            nonSilentOffSig, hotspotOffSig, oncogenicOffSig)



In [None]:
# Enumerate the 96 quadnuc labels: 5' base + two-letter change + 3' base.
allBases = ['A', 'C', 'G', 'T']
changes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] #format: 'CA' means a change from C>A
allQuadNucs = [firstBase + change + lastBase for firstBase in allBases for change in changes for lastBase in allBases] #enumerate all 96 quadnucs for signatures

# Motif sets treated as characteristic of each mutational process.
# NOTE(review): poleMotifs is used by the cells below but was not assigned anywhere before them,
# so a fresh-kernel run failed with NameError. The four motifs here mirror the default
# `ignoreMotifs` of calculate_n_muts_of_category -- confirm this is the intended POLE set.
poleMotifs = set(['TCAT', 'TCAA', 'TCTG', 'TCTT'])
tmzMotifs = set(['ACTC', 'ACTT', 'CCTC', 'CCTT', 'GCTC', 'GCTT', 'TCTC', 'TCTT'])
apobecMotifs = set(['TCTA', 'TCTT', 'TCGA', 'TCGT'])

# Column names of the per-motif hotspot / silent / oncogenic counts.
allHotspotCols = [x + '_hotspot' for x in allQuadNucs]
allSilentCols = [x + '_silent' for x in allQuadNucs]
allOncogenicCols = [x + '_oncogenic' for x in allQuadNucs]



In [None]:
# Possible-mutation totals from simDfSummary, split into "at the signature motifs" and
# "at every other motif" (the nNOT* names), once per signature: POLE, TMZ and APOBEC.
nPOLENonSilentPOSSIBLE, nPOLEHotspotPOSSIBLE, nPOLEOncogenicPOSSIBLE, nNOTPOLENonSilentPOSSIBLE, nNOTPOLEHotspotPOSSIBLE, nNOTPOLEOncogenicPOSSIBLE = calculate_n_possible_muts_by_category_and_sig(poleMotifs)

nTMZNonSilentPOSSIBLE, nTMZHotspotPOSSIBLE, nTMZOncogenicPOSSIBLE, nNOTTMZNonSilentPOSSIBLE, nNOTTMZHotspotPOSSIBLE, nNOTTMZOncogenicPOSSIBLE = calculate_n_possible_muts_by_category_and_sig(tmzMotifs)

nAPOBECNonSilentPOSSIBLE, nAPOBECHotspotPOSSIBLE, nAPOBECOncogenicPOSSIBLE, nNOTAPOBECNonSilentPOSSIBLE, nNOTAPOBECHotspotPOSSIBLE, nNOTAPOBECOncogenicPOSSIBLE = calculate_n_possible_muts_by_category_and_sig(apobecMotifs)




In [None]:
# Signature decomposition table; its path comes from the project configuration dict.
# The cells below read its Tumor_Sample_Barcode, Nmut_Mb and mean_<n> columns.
impactSigs = pd.read_table(filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])


In [None]:
# Samples used as the POLE cohort: Nmut_Mb above 50 and mean_10 above 0.33.
poleCases = set(impactSigs[(impactSigs['Nmut_Mb'] > 50) & (impactSigs['mean_10'] > .33)]['Tumor_Sample_Barcode'])

In [None]:
# Samples used as the TMZ cohort: Nmut_Mb above 50 and mean_11 above 0.33.
tmzCases = set(impactSigs[(impactSigs['Nmut_Mb'] > 50) & (impactSigs['mean_11'] > .33)]['Tumor_Sample_Barcode'])

In [None]:
# Samples used as the APOBEC cohort: Nmut_Mb above 75 and mean_2 + mean_13 above 0.5.
apobecCases = set(impactSigs[(impactSigs['Nmut_Mb'] > 75) & (impactSigs['mean_2'] + impactSigs['mean_13'] > .5)]['Tumor_Sample_Barcode'])


Now calculate per-case p-values for the oncogenic and hotspot mutation counts in each cohort. <br/>

In [None]:
def get_pvalues_for_oncogneic_mutations_in_cases(ids, allMutsMaf, sigMotifs, 
                                                 nSigOncogenicPossible, nNotSigOncogenicPossible,
                                                 nSigNotSilentPossible, nNotSigNonSilentPossible):
    """Run per-case Fisher exact tests on oncogenic mutation counts at and away from signature motifs.

    For every case two 2x2 tables are tested with scipy's fisher_exact:
      * at the signature motifs:
        [[oncogenic observed,     nSigOncogenicPossible - oncogenic observed],
         [not-oncogenic observed, nSigNotSilentPossible - not-oncogenic observed]]
      * at all other motifs: the same layout built from the nNotSig* totals.
    A mutation counts as oncogenic when its 'oncogenic' value is not null.

    Parameters:
        ids: iterable of Tumor_Sample_Barcode values to analyse.
        allMutsMaf: DataFrame with 'Tumor_Sample_Barcode', 'quadNuc' and 'oncogenic' columns.
        sigMotifs: collection of quadnuc labels belonging to the signature.
        nSigOncogenicPossible, nNotSigOncogenicPossible: possible oncogenic
            mutation totals at / away from the signature motifs.
        nSigNotSilentPossible, nNotSigNonSilentPossible: possible non-silent
            mutation totals at / away from the signature motifs.

    Returns:
        DataFrame with one row per case that has at least one mutation row:
        'Tumor_Sample_Barcode', 'Nmut', both p-values, both odds ratios and
        'mutOfInterestFrac' (oncogenic mutations at the motifs divided by all
        mutations at the motifs; None when the case has no mutation at the motifs).
    """
    listOfDicts = []
    for case in ids:
        caseMaf = allMutsMaf[allMutsMaf['Tumor_Sample_Barcode'] == case]
        nmut = caseMaf.shape[0]
        if nmut == 0:
            continue  # cases without mutation rows are left out of the result

        # Build each boolean mask once and combine them for the four cells.
        isOncogenic = caseMaf['oncogenic'].notnull()
        atSigMotif = caseMaf['quadNuc'].isin(sigMotifs)

        nmutAtMotif = int(atSigMotif.sum())
        nOncogenicSIG = int((isOncogenic & atSigMotif).sum())
        nOncogenicNOTSIG = int((isOncogenic & ~atSigMotif).sum())
        nNOTOncogenicSIG = int((~isOncogenic & atSigMotif).sum())
        nNOTOncogenicNOTSIG = int((~isOncogenic & ~atSigMotif).sum())

        oddsratioSIGMuts, pvalueSIGMuts = stats.fisher_exact(
            [[nOncogenicSIG, nSigOncogenicPossible - nOncogenicSIG],
             [nNOTOncogenicSIG, nSigNotSilentPossible - nNOTOncogenicSIG]])

        oddsratioNOTSIGMuts, pvalueNOTSIGMuts = stats.fisher_exact(
            [[nOncogenicNOTSIG, nNotSigOncogenicPossible - nOncogenicNOTSIG],
             [nNOTOncogenicNOTSIG, nNotSigNonSilentPossible - nNOTOncogenicNOTSIG]])

        # A case can have mutations but none at the signature motifs; dividing
        # by that zero count used to raise ZeroDivisionError.
        mutOfInterestFrac = None
        if nmutAtMotif > 0:
            mutOfInterestFrac = (1.0*nOncogenicSIG)/nmutAtMotif

        listOfDicts.append({
            'Tumor_Sample_Barcode': case,
            'Nmut': nmut,
            'pvalueSIGMuts': pvalueSIGMuts, 'pvalueNOTSIGMuts': pvalueNOTSIGMuts,
            'oddsratioSIGMuts': oddsratioSIGMuts, 'oddsratioNOTSIGMuts': oddsratioNOTSIGMuts,
            'mutOfInterestFrac': mutOfInterestFrac
        })
    df = pd.DataFrame(listOfDicts)
    return df



In [None]:
def get_pvalues_for_hotspot_mutations_in_cases(ids, allMutsMaf, sigMotifs,
                                               nSigHotspotPossible, nNotSigHotspotPossible,
                                               nSigNotSilentPossible, nNotSigNonSilentPossible):
    """Run per-case Fisher exact tests on hotspot mutation counts at and away from signature motifs.

    For every case two 2x2 tables are tested with scipy's fisher_exact:
      * at the signature motifs:
        [[hotspot observed,     nSigHotspotPossible - hotspot observed],
         [not-hotspot observed, nSigNotSilentPossible - not-hotspot observed]]
      * at all other motifs: the same layout built from the nNotSig* totals.
    A mutation counts as a hotspot when its 'is-a-hotspot' value equals 'Y'.

    Parameters:
        ids: iterable of Tumor_Sample_Barcode values to analyse.
        allMutsMaf: DataFrame with 'Tumor_Sample_Barcode', 'quadNuc' and 'is-a-hotspot' columns.
        sigMotifs: collection of quadnuc labels belonging to the signature.
        nSigHotspotPossible, nNotSigHotspotPossible: possible hotspot mutation
            totals at / away from the signature motifs.
        nSigNotSilentPossible, nNotSigNonSilentPossible: possible non-silent
            mutation totals at / away from the signature motifs.

    Returns:
        DataFrame with one row per case that has at least one mutation row:
        'Tumor_Sample_Barcode', 'Nmut', both p-values, both odds ratios and
        'mutOfInterestFrac' (hotspot mutations at the motifs divided by all
        mutations at the motifs; None when the case has no mutation at the motifs).
    """
    listOfDicts = []
    for case in ids:
        caseMaf = allMutsMaf[allMutsMaf['Tumor_Sample_Barcode'] == case]
        nmut = caseMaf.shape[0]
        if nmut == 0:
            continue  # cases without mutation rows are left out of the result

        # Build each boolean mask once and combine them for the four cells.
        isHotspot = caseMaf['is-a-hotspot'] == 'Y'
        atSigMotif = caseMaf['quadNuc'].isin(sigMotifs)

        nmutAtMotif = int(atSigMotif.sum())
        nHotspotsSIG = int((isHotspot & atSigMotif).sum())
        nHotspotsNOTSIG = int((isHotspot & ~atSigMotif).sum())
        nNOTHotspotsSIG = int((~isHotspot & atSigMotif).sum())
        nNOTHotspotsNOTSIG = int((~isHotspot & ~atSigMotif).sum())

        oddsratioSIGMuts, pvalueSIGMuts = stats.fisher_exact(
            [[nHotspotsSIG, nSigHotspotPossible - nHotspotsSIG],
             [nNOTHotspotsSIG, nSigNotSilentPossible - nNOTHotspotsSIG]])

        oddsratioNOTSIGMuts, pvalueNOTSIGMuts = stats.fisher_exact(
            [[nHotspotsNOTSIG, nNotSigHotspotPossible - nHotspotsNOTSIG],
             [nNOTHotspotsNOTSIG, nNotSigNonSilentPossible - nNOTHotspotsNOTSIG]])

        # A case can have mutations but none at the signature motifs; dividing
        # by that zero count used to raise ZeroDivisionError.
        mutOfInterestFrac = None
        if nmutAtMotif > 0:
            mutOfInterestFrac = (1.0*nHotspotsSIG)/nmutAtMotif

        listOfDicts.append({
            'Tumor_Sample_Barcode': case,
            'Nmut': nmut,
            'pvalueSIGMuts': pvalueSIGMuts, 'pvalueNOTSIGMuts': pvalueNOTSIGMuts,
            'oddsratioSIGMuts': oddsratioSIGMuts, 'oddsratioNOTSIGMuts': oddsratioNOTSIGMuts,
            'mutOfInterestFrac': mutOfInterestFrac
        })
    df = pd.DataFrame(listOfDicts)
    return df
    
    

In [None]:
# Per-case oncogenic and hotspot Fisher tests for the POLE cohort, using the POLE motif set
# and the POLE / not-POLE possible-mutation totals.
dfOncPOLE = get_pvalues_for_oncogneic_mutations_in_cases(poleCases, allImpactMuts, poleMotifs,
                                                       nPOLEOncogenicPOSSIBLE, nNOTPOLEOncogenicPOSSIBLE,
                                                 nPOLENonSilentPOSSIBLE, nNOTPOLENonSilentPOSSIBLE)

dfHotspotPOLE = get_pvalues_for_hotspot_mutations_in_cases(poleCases, allImpactMuts, poleMotifs,
                                                         nPOLEHotspotPOSSIBLE, nNOTPOLEHotspotPOSSIBLE,
                                               nPOLENonSilentPOSSIBLE, nNOTPOLENonSilentPOSSIBLE)

In [None]:
# Per-case oncogenic and hotspot Fisher tests for the TMZ cohort, using the TMZ motif set
# and the TMZ / not-TMZ possible-mutation totals.
dfOncTMZ = get_pvalues_for_oncogneic_mutations_in_cases(tmzCases, allImpactMuts, tmzMotifs,
                                                       nTMZOncogenicPOSSIBLE, nNOTTMZOncogenicPOSSIBLE,
                                                 nTMZNonSilentPOSSIBLE, nNOTTMZNonSilentPOSSIBLE)

dfHotspotTMZ = get_pvalues_for_hotspot_mutations_in_cases(tmzCases, allImpactMuts, tmzMotifs,
                                                         nTMZHotspotPOSSIBLE, nNOTTMZHotspotPOSSIBLE,
                                               nTMZNonSilentPOSSIBLE, nNOTTMZNonSilentPOSSIBLE)

In [None]:
# Per-case oncogenic and hotspot Fisher tests for the APOBEC cohort, using the APOBEC motif set
# and the APOBEC / not-APOBEC possible-mutation totals.
dfOncAPOBEC = get_pvalues_for_oncogneic_mutations_in_cases(apobecCases, allImpactMuts, apobecMotifs,
                                                       nAPOBECOncogenicPOSSIBLE, nNOTAPOBECOncogenicPOSSIBLE,
                                                 nAPOBECNonSilentPOSSIBLE, nNOTAPOBECNonSilentPOSSIBLE)

dfHotspotAPOBEC = get_pvalues_for_hotspot_mutations_in_cases(apobecCases, allImpactMuts, apobecMotifs,
                                                         nAPOBECHotspotPOSSIBLE, nNOTAPOBECHotspotPOSSIBLE,
                                               nAPOBECNonSilentPOSSIBLE, nNOTAPOBECNonSilentPOSSIBLE)

In [None]:
# Write the POLE p-value tables as tab-separated files, without the index column.
dfHotspotPOLE.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/hotspotPValues_pole.tsv', index=False, sep='\t')
dfOncPOLE.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/oncogenicPValues_pole.tsv', index=False, sep='\t')


In [None]:
# Write the TMZ p-value tables as tab-separated files, without the index column.
dfHotspotTMZ.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/hotspotPValues_tmz.tsv', index=False, sep='\t')
dfOncTMZ.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/oncogenicPValues_tmz.tsv', index=False, sep='\t')


In [None]:
# Write the APOBEC p-value tables as tab-separated files, without the index column.
dfHotspotAPOBEC.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/hotspotPValues_apobec.tsv', index=False, sep='\t')
dfOncAPOBEC.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/oncogenicPValues_apobec.tsv', index=False, sep='\t')


**Part II: endometrial POLE vs. non-POLE cases** (other cohorts as well)<br/> <br/> <br/> <br/><br/> <br/> <br/> <br/><br/> <br/> <br/> <br/><br/> <br/> <br/> <br/>

In [None]:
def summarize_hotspots_per_case_by_motif(maf, sigMotifs, cancerTypeRelatedGenes=None, mode='hotspot'):
    """Summarise, per case, the mutations of interest at signature motifs versus other motifs.

    Parameters:
        maf: DataFrame with 'Tumor_Sample_Barcode', 'quadNuc', 'Hugo_Symbol' and,
            depending on `mode`, 'is-a-hotspot' or 'oncogenic' columns.
        sigMotifs: collection of quadnuc labels belonging to the signature.
        cancerTypeRelatedGenes: collection of gene symbols counted as related to
            the cancer type; None is treated as an empty collection.
        mode: 'hotspot' selects rows whose 'is-a-hotspot' equals 'Y';
            'oncogenic' selects rows whose 'oncogenic' value is not null.

    Returns:
        DataFrame with one row per case: 'Tumor_Sample_Barcode', 'Nmut',
        'nHotspotsAtMotif', 'hotspotMutsNotAtSigMotif' (a count, despite the
        name), and 'fracRelatedMotif' / 'fracRelatedNotMotif' -- the fraction of
        selected mutations in related genes, None when there are no selected
        mutations in that group.

    Raises:
        ValueError: if `mode` is neither 'hotspot' nor 'oncogenic'.
    """
    if mode not in ('hotspot', 'oncogenic'):
        # Previously an unknown mode left the selection as None and failed later with an opaque error.
        raise ValueError("mode must be 'hotspot' or 'oncogenic', got %r" % (mode,))

    # Series.isin rejects None, so the documented default needs a real (empty) collection.
    relatedGenes = set() if cancerTypeRelatedGenes is None else cancerTypeRelatedGenes

    listOfDicts = []
    for case in set(maf['Tumor_Sample_Barcode']):
        caseMaf = maf[maf['Tumor_Sample_Barcode'] == case]

        # The variable names say "hotspot", but in 'oncogenic' mode they hold oncogenic mutations.
        if mode == 'hotspot':
            hotspotMuts = caseMaf[caseMaf['is-a-hotspot'] == 'Y']
        else:
            hotspotMuts = caseMaf[caseMaf['oncogenic'].notnull()]

        atSigMotif = hotspotMuts['quadNuc'].isin(sigMotifs)
        hotspotMutsAtSigMotif = hotspotMuts[atSigMotif]
        hotspotMutsNotAtSigMotif = hotspotMuts[~atSigMotif]

        nmut = caseMaf.shape[0]
        nHotspotMutsAtSigMotif = hotspotMutsAtSigMotif.shape[0]
        nHotspotMutsNotAtSigMotif = hotspotMutsNotAtSigMotif.shape[0]

        fracRelatedGenesAtSigMotif = None
        if nHotspotMutsAtSigMotif > 0:
            nRelated = hotspotMutsAtSigMotif[hotspotMutsAtSigMotif['Hugo_Symbol'].isin(relatedGenes)].shape[0]
            fracRelatedGenesAtSigMotif = (1.0*nRelated)/nHotspotMutsAtSigMotif

        fracRelatedGenesAtNotSigMotif = None
        if nHotspotMutsNotAtSigMotif > 0:
            nRelated = hotspotMutsNotAtSigMotif[hotspotMutsNotAtSigMotif['Hugo_Symbol'].isin(relatedGenes)].shape[0]
            fracRelatedGenesAtNotSigMotif = (1.0*nRelated)/nHotspotMutsNotAtSigMotif

        listOfDicts.append({'Tumor_Sample_Barcode': case, 'Nmut': nmut,
                            'fracRelatedMotif': fracRelatedGenesAtSigMotif, 'fracRelatedNotMotif': fracRelatedGenesAtNotSigMotif,
                            'nHotspotsAtMotif': nHotspotMutsAtSigMotif, 'hotspotMutsNotAtSigMotif': nHotspotMutsNotAtSigMotif})
    return pd.DataFrame(listOfDicts)
    

In [None]:
# Endometrial cancer sample ids from the project's hypermutation-status files: all cases, then the 'Hypermutated' subset.
# NOTE(review): relies on `pathPrefix`, which is not assigned in any visible cell.
endometrialIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'all')
endometrialHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Hypermutated')


In [None]:
# Restrict the MAF to endometrial samples, then pass it through the project helper that maps cases to an
# MSI-sensor class (the next cell reads a 'caseMsiClass' column -- presumably added here, TODO confirm).
endometrialMuts = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(endometrialIds)]
endometrialMuts = analysis_utils.map_cases_to_msi_sensor_class(endometrialMuts, msiSensorInfo= pathPrefix + '/ifs/work/taylorlab/friedman/mskImpactAsOfMarch2019/dmp/mskimpact/data_clinical_sample.txt')


In [None]:
# Keep only mutations from cases whose caseMsiClass is 'Stable'.
endometrialMss = endometrialMuts[endometrialMuts['caseMsiClass'] == 'Stable']

In [None]:
# Rebinds impactSigs to the signature table computed from the unfiltered MAF (replacing the table loaded earlier).
impactSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/signatures_from_unfiltered_maf.txt')

In [None]:
# Merge signature columns with the project helper, then label each row with its dominant signature
# (the helper is given the row as a dict and the 'mean' column prefix).
impactSigs = mutationSigUtils.merge_signature_columns(impactSigs)
impactSigs['dominantSignature'] = impactSigs.apply(lambda row: 
        mutationSigUtils.get_dominant_signature(row.to_dict(), cols=None, prefix='mean', notEnoughMuts= True), axis=1)

In [None]:
# Cases treated as carrying an MSI signature: dominant signature is mean_MMR, or the case has
# Nmut_Mb >= 30 and its dominant signature is mean_1 or mean_14. These are filtered out below.
casesWithMSISignature = set(impactSigs[(impactSigs['dominantSignature'] == 'mean_MMR')
                                      |((impactSigs['Nmut_Mb'] >= 30) & (impactSigs['dominantSignature'].isin(set(['mean_1', 'mean_14']))))
                                        ]['Tumor_Sample_Barcode'])

In [None]:
# Re-import the helper module to pick up edits made during development.
# NOTE(review): bare `reload` is a builtin only on Python 2.
reload(analysis_utils)
# Related / unrelated gene sets per cancer type, as enumerated by the project helper; later cells index
# the result by cancer-type name.
relatedGenesDict = analysis_utils.enumerate_related_unrelated_genes_for_hypermutation_analysis(allImpactMuts, cTypes = ['Endometrial Cancer', 'Glioma', 'Bladder Cancer'])

In [None]:
# Drop MSS endometrial cases that carry an MSI signature, then summarise per case in 'oncogenic' mode
# (the commented-out call is the same summary in the default 'hotspot' mode).
endometrialMSSNoMSISig = endometrialMss[~endometrialMss['Tumor_Sample_Barcode'].isin(casesWithMSISignature)]
#df = summarize_hotspots_per_case_by_motif(endometrialMSSNoMSISig, poleMotifs, cancerTypeRelatedGenes=relatedGenesDict['Endometrial Cancer'])
df = summarize_hotspots_per_case_by_motif(endometrialMSSNoMSISig, poleMotifs, cancerTypeRelatedGenes=relatedGenesDict['Endometrial Cancer'], mode='oncogenic')


In [None]:
# Write the endometrial per-case summary as a tab-separated file, without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/poleEndometrialHotspotCharacteristics.tsv', index=False, sep='\t')

In [None]:
# Do the same for gliomas (TMZ) and bladders (APOBEC): fetch all sample ids of each cancer type
# from the hypermutation-status files.
bladderIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Bladder Cancer', hypermutantStatus = 'all')
gliomaIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Glioma', hypermutantStatus = 'all')


In [None]:
# Per-cancer-type subsets of the full MAF, selected by sample id.
bladderMaf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(bladderIds)]
gliomaMaf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(gliomaIds)]

In [None]:
# Remove these five genes from the bladder and glioma related-gene sets.
# The `-` operator implies the dict values are sets; re-running the cell is harmless.
hypermutationGenes = set(['MLH1', 'MSH2', 'MSH6', 'POLE', 'PMS2'])
relatedGenesDict['Bladder Cancer'] = relatedGenesDict['Bladder Cancer'] - hypermutationGenes
relatedGenesDict['Glioma'] = relatedGenesDict['Glioma'] - hypermutationGenes

In [None]:
# Per-case summaries in the default 'hotspot' mode: bladder against the APOBEC motifs, glioma against the TMZ motifs.
dfBladder = summarize_hotspots_per_case_by_motif(bladderMaf, apobecMotifs, cancerTypeRelatedGenes=relatedGenesDict['Bladder Cancer'])
dfGlioma = summarize_hotspots_per_case_by_motif(gliomaMaf, tmzMotifs, cancerTypeRelatedGenes=relatedGenesDict['Glioma'])



In [None]:
# Write the bladder and glioma per-case summaries as tab-separated files, without the index column.
dfBladder.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/apobecBladderHotspotCharacteristics.tsv', index=False, sep='\t')
dfGlioma.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/tmzGliomaHotspotCharacteristics.tsv', index=False, sep='\t')

**PART 3** <br/> <br/> <br/> <br/> <br/> <br/>
<br/> We have 3 groups:  <br/>
1. MSS endometrial (POLE motif vs. not) <br/>
2. MSI and low-mutation-burden colorectal (MSI SNP vs. indel vs. other SNP) <br/>
3. TMZ and low-mutation-burden glioma (TMZ vs. non-TMZ motifs)

In [81]:
# First enumerate the enriched spectra per signature with the project helper.
# The result is indexed by signature name in the next cell (e.g. 'Signature.10').
enrichedSpectra = mutationSigUtils.get_enriched_spectra_for_signatures(spectrumFile='/ifs/work/taylorlab/friedman/noahFirstProject/signature_sig_copy/mutation-signatures/Stratton_signatures30.txt', spectraSignificanceThresh=.025, pathPrefix=pathPrefix, signaturesToIgnore= set([]))

In [82]:
# Take the motif sets from the enriched spectra. This rebinds tmzMotifs, which an earlier cell hard-coded,
# so cells run after this one see the spectra-derived sets.
poleMotifs = enrichedSpectra['Signature.10']
tmzMotifs = enrichedSpectra['Signature.11']
mmrMotifs = enrichedSpectra['Signature.MMR']
# Motif set to use for each cancer type in the Part 3 summary.
motifDict = {'Endometrial Cancer': poleMotifs, 'Colorectal Cancer': mmrMotifs, 'Glioma': tmzMotifs}

In [4]:
# Get the sample ids for each cancer type: all cases, the 'Normal' subset and the 'Hypermutated' subset.
# All nine calls read the same hypermutationStatusIds directory.

endometrialIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'all')
endometrialNormalIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Normal')
endometrialHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Hypermutated')

colorectalIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Colorectal Cancer', hypermutantStatus = 'all')
colorectalNormalIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Colorectal Cancer', hypermutantStatus = 'Normal')
colorectalHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Colorectal Cancer', hypermutantStatus = 'Hypermutated')

gliomaIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Glioma', hypermutantStatus = 'all')
gliomaNormalIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Glioma', hypermutantStatus = 'Normal')
gliomaHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Glioma', hypermutantStatus = 'Hypermutated')


In [18]:
# Load signature information used to rule cases in or out of the analysis: read the table computed from
# the unfiltered MAF, merge signature columns, and label each row with its dominant signature.
impactSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/signatures_from_unfiltered_maf.txt')
impactSigs = mutationSigUtils.merge_signature_columns(impactSigs)
impactSigs['dominantSignature'] = impactSigs.apply(lambda row: 
        mutationSigUtils.get_dominant_signature(row.to_dict(), cols=None, prefix='mean', notEnoughMuts= True), axis=1)

In [35]:
# Enumerate, across all samples, the cases with an MSI, POLE or TMZ dominant signature; used for filtering.
# MSI here means dominant mean_MMR, or Nmut_Mb >= 30 with dominant mean_1
# (unlike the Part II definition, mean_14 is not included).
casesWithMSISignature = set(impactSigs[(impactSigs['dominantSignature'] == 'mean_MMR')
                                      |((impactSigs['Nmut_Mb'] >= 30) & (impactSigs['dominantSignature'].isin(set(['mean_1']))))
                                        ]['Tumor_Sample_Barcode'])

casesWithPOLESignature = set(impactSigs[(impactSigs['dominantSignature'] == 'mean_10')]['Tumor_Sample_Barcode'])
casesWithTMZSignature = set(impactSigs[impactSigs['dominantSignature'] == 'mean_11']['Tumor_Sample_Barcode'])


In [36]:
# Hypermutated cases of each cancer type that also carry the matching dominant signature.
# `&` is used as set intersection, so the *HyperIds values are expected to be sets (TODO confirm the helper's return type).
endometrialHyperPOLEOnly = endometrialHyperIds & casesWithPOLESignature
colorectalHyperMMROnly = colorectalHyperIds & casesWithMSISignature
gliomaHyperTMZOnly = gliomaHyperIds & casesWithTMZSignature

In [None]:
# Rebuild the related-gene dict for the three Part 3 cancer types (colorectal replaces bladder).
relatedGenesDict = analysis_utils.enumerate_related_unrelated_genes_for_hypermutation_analysis(allImpactMuts, cTypes = ['Endometrial Cancer', 'Glioma', 'Colorectal Cancer'])


In [None]:
def _cohort_maf(sampleIds, cancerType, cohortLabel):
    """Return the rows of allImpactMuts for `sampleIds`, labelled with cancer type and cohort.

    The labels are added with DataFrame.assign, which returns a new frame.
    Assigning columns directly onto a filtered slice of allImpactMuts (as this
    cell did before) triggers pandas' SettingWithCopyWarning.
    """
    inCohort = allImpactMuts['Tumor_Sample_Barcode'].isin(sampleIds)
    return allImpactMuts[inCohort].assign(cancer_type=cancerType, cohort=cohortLabel)


# Hypermutated cohorts restricted to the matching dominant signature, and the 'Normal' cohorts.
endometrialPOLEOnlyMaf = _cohort_maf(endometrialHyperPOLEOnly, 'Endometrial Cancer', 'Endometrial_Cancer_Hyper')
endometrialNormalMaf = _cohort_maf(endometrialNormalIds, 'Endometrial Cancer', 'Endometrial_Cancer_Normal')

colorectalMMROnlyMaf = _cohort_maf(colorectalHyperMMROnly, 'Colorectal Cancer', 'Colorectal_Cancer_Hyper')
colorectalNormalMaf = _cohort_maf(colorectalNormalIds, 'Colorectal Cancer', 'Colorectal_Cancer_Normal')

gliomaTMZOnlyMaf = _cohort_maf(gliomaHyperTMZOnly, 'Glioma', 'Glioma_Hyper')
gliomaNormalMaf = _cohort_maf(gliomaNormalIds, 'Glioma', 'Glioma_Normal')

In [41]:
# Stack the six labelled cohort MAFs (three hypermutated, three normal) into one frame.
combinedMaf = pd.concat([endometrialPOLEOnlyMaf, colorectalMMROnlyMaf, gliomaTMZOnlyMaf,
                        endometrialNormalMaf, colorectalNormalMaf, gliomaNormalMaf])

In [72]:
#a function to summarize info for plotting
def summarize_mutation_motif_info(maf, motifDict, relatedGenesD):
    """Summarise each case's oncogenic mutations as motif SNPs, non-motif SNPs and indels.

    Parameters:
        maf: DataFrame with 'Tumor_Sample_Barcode', 'oncogenic', 'cancer_type',
            'cohort', 'Variant_Type', 'quadNuc' and 'Hugo_Symbol' columns. The
            cancer type and cohort of a case are read from its first row.
        motifDict: maps cancer type to the quadnuc motifs of its signature.
        relatedGenesD: maps cancer type to the genes counted as related to it.

    Returns:
        DataFrame with three rows per case (signature-motif SNPs, other SNPs,
        indels), each holding 'Tumor_Sample_Barcode', 'Cancer_Type', 'cohort',
        'NmutCase', 'nOncCase', 'type', 'n' (oncogenic mutations in the group)
        and 'frac' (share of those in related genes, None when n is 0).

    Only rows with a non-null 'oncogenic' value are grouped. A KeyError is
    raised for a cancer type other than 'Endometrial Cancer',
    'Colorectal Cancer' or 'Glioma', or one missing from either dict.
    """
    cancerTypeMotifNames = {'Endometrial Cancer': ['POLE', 'Not POLE'],
                            'Colorectal Cancer': ['MSI', 'Not MSI'],
                           'Glioma': ['TMZ', 'Not TMZ']}#a dictionary that maps cancer types to labels we will use
    indelTypes = set(['DEL', 'INS'])

    def related_fraction(groupMaf, relatedGenes):
        # Count of the group plus the share falling in related genes (None for an empty group).
        nGroup = groupMaf.shape[0]
        if nGroup == 0:
            return nGroup, None
        nRelated = groupMaf[groupMaf['Hugo_Symbol'].isin(relatedGenes)].shape[0]
        return nGroup, (1.0*nRelated)/nGroup

    listOfDicts = []
    cntr = 0
    for case in set(maf['Tumor_Sample_Barcode']):

        cntr += 1
        # Progress output every 50 cases; the call form parses on Python 2 and 3
        # (the former `print cntr` statement is a syntax error on Python 3).
        if cntr%50 == 0: print(cntr)

        #Count n oncogenic, nmut etc
        caseMaf = maf[maf['Tumor_Sample_Barcode'] == case]
        caseMafOncogenic = caseMaf[caseMaf['oncogenic'].notnull()]
        nmut = caseMaf.shape[0]
        nmutOnc = caseMafOncogenic.shape[0]

        cancerType = caseMaf['cancer_type'].iloc[0]
        cohort = caseMaf['cohort'].iloc[0]
        motifs = motifDict[cancerType]
        relatedGenes = relatedGenesD[cancerType]

        #Split the oncogenic mutations into indels, SNPs at motifs and SNPs elsewhere
        isSnp = caseMafOncogenic['Variant_Type'] == 'SNP'
        atMotif = caseMafOncogenic['quadNuc'].isin(motifs)
        mafIndel = caseMafOncogenic[caseMafOncogenic['Variant_Type'].isin(indelTypes)]
        mafMotif = caseMafOncogenic[atMotif & isSnp]
        mafNotMotif = caseMafOncogenic[(~atMotif) & isSnp]

        nMotif, fracRelatedMotif = related_fraction(mafMotif, relatedGenes)
        nNOTMotif, fracRelatedNOTMotif = related_fraction(mafNotMotif, relatedGenes)
        nIndel, fracRelatedIndel = related_fraction(mafIndel, relatedGenes)

        #One output row per group, sharing the per-case fields
        motifLabel, notMotifLabel = cancerTypeMotifNames[cancerType]
        for label, nGroup, fracGroup in [(motifLabel, nMotif, fracRelatedMotif),
                                         (notMotifLabel, nNOTMotif, fracRelatedNOTMotif),
                                         ('indel', nIndel, fracRelatedIndel)]:
            rowD = {'Tumor_Sample_Barcode': case, 'Cancer_Type': cancerType, 'cohort':cohort, 'NmutCase': nmut, 'nOncCase': nmutOnc}
            rowD['type'], rowD['n'], rowD['frac'] = label, nGroup, fracGroup
            listOfDicts.append(rowD)

    return pd.DataFrame(listOfDicts)



In [None]:
# Build the Part 3 plotting table; note this rebinds `df`, which earlier held the endometrial summary.
df = summarize_mutation_motif_info(combinedMaf, motifDict, relatedGenesDict)

In [86]:
# Write the motif summary as a tab-separated file, without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/mutationMotifSummary.tsv', index=False, sep='\t')

**Recurrent hotspots at uncommon trinucleotides** 

In [64]:
#OLD METHOD
def count_hotspot_and_vus_quadnucs(maf):
    """Count hotspot and non-hotspot mutations per quadnuc and list recurrent hotspot alleles at rare motifs.

    Parameters:
        maf: DataFrame with 'quadNuc', 'is-a-hotspot' and 'allele' columns. A row
            is a hotspot when 'is-a-hotspot' equals 'Y'.

    Returns:
        DataFrame with one row for each of the 96 quadnuc labels: 'quadNuc',
        'nNotHotspotMuts', 'nHotspotMuts' and 'alleleNames'. 'alleleNames' is a
        newline-joined list of 'allele:count' entries, filled only for motifs
        that have hotspot mutations and whose non-hotspot count is below 1% of
        all non-hotspot mutations in `maf`. An allele is listed when it occurs
        more than once and its count exceeds 0.1 times the motif's non-hotspot
        count.
    """
    recurrentHotspotThresh = .1
    nTotalNotHotspotMuts = maf[~(maf['is-a-hotspot'] == 'Y')].shape[0]
    onePctCount = .01*nTotalNotHotspotMuts

    allBases = ['A', 'C', 'G', 'T']
    changes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] #format: 'CA' means a change from C>A
    allQuadNucs = [firstBase + change + lastBase for firstBase in allBases for change in changes for lastBase in allBases] #enumerate all 96 quadnucs for signatures

    listOfDicts = []
    for quadNuc in allQuadNucs:
        quadNucMuts = maf[maf['quadNuc'] == quadNuc]
        quadNucHotspotMuts = quadNucMuts[quadNucMuts['is-a-hotspot'] == 'Y']
        nHotspotMuts = quadNucHotspotMuts.shape[0]
        nNotHotspotMuts = quadNucMuts.shape[0] - nHotspotMuts

        recurrentAlleles = []
        if nHotspotMuts > 0 and nNotHotspotMuts < onePctCount:
            for hotspot, count in Counter(quadNucHotspotMuts['allele']).most_common():
                if count <= 1:
                    continue  # only alleles seen more than once can be recurrent
                # With no non-hotspot mutations at the motif the ratio is unbounded,
                # so it passes the threshold; dividing by that zero used to raise
                # ZeroDivisionError.
                if nNotHotspotMuts == 0 or (1.0*count)/nNotHotspotMuts > recurrentHotspotThresh:
                    recurrentAlleles.append(hotspot+':'+str(count))
        alleleNames = '\n'.join(recurrentAlleles)
        listOfDicts.append({'quadNuc': quadNuc, 'nNotHotspotMuts': nNotHotspotMuts,
                           'nHotspotMuts': nHotspotMuts, 'alleleNames': alleleNames})

    df = pd.DataFrame(listOfDicts)
    return df
            

In [80]:
def summarize_hotspots_and_motif_fractions(maf):
    """Tabulate every (hotspot allele, quadnuc) pair with how common that quadnuc is in `maf`.

    Parameters:
        maf: DataFrame with 'quadNuc', 'oncogenic', 'is-a-hotspot' and 'allele'
            columns. Hotspots are rows whose 'is-a-hotspot' equals 'Y'; VUS rows
            are those whose 'oncogenic' value is null.

    Returns:
        DataFrame with one row per (allele, quadNuc) pair seen among hotspot
        rows, sorted by that pair: 'allele', 'nMuts' (hotspot rows for the
        pair), 'quadNuc', 'nQuadnuc' (all rows in `maf` with that quadnuc) and
        'nQuadnucVUS' (VUS rows with that quadnuc, 0 if none). Hotspot rows with
        a missing quadNuc are skipped. Empty DataFrame when there are no hotspots.
    """
    quadNucCountDict = dict(maf['quadNuc'].value_counts())
    vusQuadNucCountDict = dict(maf[maf['oncogenic'].isnull()]['quadNuc'].value_counts())

    hotspotMaf = maf[maf['is-a-hotspot'] == 'Y']
    listOfDicts = []
    if hotspotMaf.shape[0] > 0:
        # One grouped count replaces re-filtering the frame for every allele and motif.
        # groupby drops rows whose key is missing, so hotspots without a quadNuc are
        # skipped; the old `!= None` test let NaN through and then hit a KeyError.
        pairCounts = hotspotMaf.groupby(['allele', 'quadNuc']).size()
        for (allele, quadNuc), nPairMuts in pairCounts.items():
            listOfDicts.append({'allele': allele, 'nMuts': int(nPairMuts),
                                'quadNuc': quadNuc, 'nQuadnuc': quadNucCountDict.get(quadNuc, 0),
                                # .get covers motifs that have no VUS rows and are not among
                                # the 96 canonical labels
                                'nQuadnucVUS': vusQuadNucCountDict.get(quadNuc, 0)})
    df = pd.DataFrame(listOfDicts)
    return df
    

In [13]:
# Add an 'allele' label of the form '<Hugo_Symbol>_<HGVSp_Short>' to every mutation.
# This writes a new column onto allImpactMuts in place; the hotspot summaries below read it.
allImpactMuts['allele'] = allImpactMuts.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)

In [72]:
# Hypermutated sample ids for each cancer type in the list below, keyed by cancer type.
idsDict = {}
#cancerTypes where we have at least 40 hypermutated cases
cancerTypes = ['Endometrial Cancer', 'Colorectal Cancer', 'Glioma', 'Non-Small Cell Lung Cancer',
                  'Prostate Cancer', 'Esophagogastric Cancer', 'Bladder Cancer']
for cancerType in cancerTypes:
    # Note: this cell takes the helper from get_gene_and_cohort_list_utils and reads a /juno path,
    # whereas earlier cells call analysis_utils with an /ifs path.
    idsDict[cancerType] = get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/juno/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType=cancerType, hypermutantStatus = 'Hypermutated')


In [76]:
# One MAF subset per cancer type, holding the mutations of its hypermutated samples.
mafsDict = {}
for cancerType, ids in idsDict.items():
    mafsDict[cancerType] = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(ids)]


In [81]:
# Build the hotspot-allele summary for each cancer type and stack the results.
summaryDfsList = []

for cancerType, maf in mafsDict.items():
    summaryDf = summarize_hotspots_and_motif_fractions(maf)
    summaryDf['cancerType'] = cancerType
    nCases = len(set(maf['Tumor_Sample_Barcode']))
    # nMuts divided by the number of distinct samples in this cancer type's MAF
    summaryDf['fracCasesWithHotspot'] = summaryDf['nMuts'].apply(lambda x: (1.0*x)/nCases)
    summaryDfsList.append(summaryDf)
    
combinedDf = pd.concat(summaryDfsList)
# Label combining allele and cancer type on two lines (newline-separated)
combinedDf['alleleAndCancerType'] = combinedDf['allele'] + '\n' + combinedDf['cancerType']

In [82]:
# Write the combined allele summary as a tab-separated file, without the index column.
combinedDf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/nonSignatureFavoredAlleles.tsv', index=False, sep='\t')

In [121]:

#impactSigsDf = pd.read_table(pathPrefix + '/juno/work/taylorlab/friedman/myAdjustedDataFiles/impactSignatureCalls_Nov20_2019.tsv')

In [131]:
# Rebinds poleCases to the barcodes (a Series, not a set) whose dominant signature is mean_10.
# NOTE(review): `impactSigsDf` is not assigned in any visible cell -- the only line that loads it is
# commented out in the cell above; confirm where it comes from.
poleCases = impactSigsDf[impactSigsDf['dominantSignature'] == 'mean_10']['Tumor_Sample_Barcode']

In [132]:
# All mutations belonging to the POLE-dominant cases.
allPoleMuts = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(poleCases)]


In [133]:
# Per-sample counts: hotspot mutations (is-a-hotspot == 'Y') and all mutations.
# Samples without any hotspot mutation are absent from nHotspotMuts.
nHotspotMuts = dict(allPoleMuts[allPoleMuts['is-a-hotspot'] == 'Y']['Tumor_Sample_Barcode'].value_counts())
nMuts = dict(allPoleMuts['Tumor_Sample_Barcode'].value_counts())

In [138]:
# Per-case hotspot rate (hotspot mutations / all mutations) for the POLE cases.
listOfDicts = []
for case in set(allPoleMuts['Tumor_Sample_Barcode']):
    # Cases missing from nHotspotMuts (no hotspot mutation) are skipped, so no zero rates appear in the table
    if case in nHotspotMuts and case in nMuts:
        listOfDicts.append({'Tumor_Sample_Barcode': case, 'nHotspot': nHotspotMuts[case],
                       'nmut': nMuts[case], 'hotspotRate': (1.0*nHotspotMuts[case])/nMuts[case]})
    
df = pd.DataFrame(listOfDicts)

In [145]:
# Median per-case hotspot rate. As a bare expression that is not the last line of the cell,
# its value is computed but not displayed.
np.nanmedian(df['hotspotRate'])
# List the cases in which more than a quarter of the mutations are hotspots.
for i in df[df['hotspotRate'] > .25]['Tumor_Sample_Barcode']:
    # Call form parses on Python 2 and 3 and prints the same text (`print i` is Python-2-only syntax).
    print(i)

P-0042339-T01-IM6
P-0047608-T01-IM6
P-0014528-T01-IM6
P-0033990-T01-IM6
P-0033990-T02-IM6
