In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re
import scipy.stats as stats

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt'

sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util

In [None]:
# Load the annotated IMPACT cohort MAF (path is rooted at pathPrefix) through the project helper
# analysis_utils.load_in_df_with_progress.
# nLinesFile is forwarded to the helper unchanged -- presumably an approximate line count used for
# its progress display; confirm against the helper.
allImpactMuts = analysis_utils.load_in_df_with_progress(filePath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', nLinesFile = 275000)

In [3]:
# Add a 'quadNuc' column: for every mutation row, mutationSigUtils.create_reference_four_nuc is called with
# the row's Ref_Tri, Reference_Allele, Tumor_Seq_Allele2 and Variant_Type. The later cells match this column
# against four-letter motif strings such as 'TCAT'.
allImpactMuts['quadNuc'] = allImpactMuts.apply(lambda row: mutationSigUtils.create_reference_four_nuc(row['Ref_Tri'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)


In [8]:
# Read the mutation-simulation summary table. Later cells index it by '<quadNuc>', '<quadNuc>_silent',
# '<quadNuc>_hotspot' and '<quadNuc>_oncogenic' columns.
# Note: this is an absolute local path that does not go through pathPrefix.
simDfSummary = pd.read_table('/Users/friedman/Desktop/workOffline/mutSimulationInfoIncludingHotspots.tsv')

In [11]:
def calculate_n_muts_of_category(df, ignoreMotifs = ['TCAT', 'TCAA', 'TCTG', 'TCTT']):
    """Total the per-motif counts in ``df`` over every quadnuc motif that is not ignored.

    The 96 quadnuc motifs are enumerated as <first base><change><last base>, where the
    change is one of 'CA', 'CG', 'CT', 'TA', 'TC', 'TG' ('CA' means a change from C>A).

    Parameters:
        df: table with columns '<motif>', '<motif>_silent', '<motif>_hotspot' and
            '<motif>_oncogenic' for every motif that is not ignored.
        ignoreMotifs: motifs to leave out of all totals.

    Returns:
        A 3-tuple (total of '<motif>' minus total of '<motif>_silent',
        total of '<motif>_hotspot', total of '<motif>_oncogenic').
    """
    bases = 'ACGT'
    substitutions = ('CA', 'CG', 'CT', 'TA', 'TC', 'TG')
    keptMotifs = [
        five + sub + three
        for five in bases
        for sub in substitutions
        for three in bases
        if five + sub + three not in ignoreMotifs
    ]

    def column_total(suffix):
        # Sum every value of the suffixed column for each motif that is kept.
        return sum(sum(df[motif + suffix]) for motif in keptMotifs)

    nonSilentTotal = column_total('') - column_total('_silent')
    return nonSilentTotal, column_total('_hotspot'), column_total('_oncogenic')
        

In [60]:
def calculate_n_possible_muts_by_category_and_sig(sigMotifs):
    """Split the totals in the module-level ``simDfSummary`` into at-motif and off-motif counts.

    Parameters:
        sigMotifs: the quadnuc motifs belonging to the signature of interest.

    Returns:
        A 6-tuple: (non-silent, hotspot, oncogenic) totals over ``sigMotifs``, followed by
        the same three totals over every other quadnuc motif (computed by
        calculate_n_muts_of_category with ``sigMotifs`` as the ignored motifs).
    """
    def total_at_motifs(suffix):
        # Sum the suffixed column of simDfSummary across all signature motifs.
        return sum([sum(simDfSummary[motif + suffix]) for motif in sigMotifs])

    atMotifNonSilent = total_at_motifs('') - total_at_motifs('_silent')
    atMotifHotspot = total_at_motifs('_hotspot')
    atMotifOncogenic = total_at_motifs('_oncogenic')

    offMotifTotals = calculate_n_muts_of_category(simDfSummary, ignoreMotifs = sigMotifs)
    offMotifNonSilent, offMotifHotspot, offMotifOncogenic = offMotifTotals

    return (atMotifNonSilent, atMotifHotspot, atMotifOncogenic,
            offMotifNonSilent, offMotifHotspot, offMotifOncogenic)



In [11]:
# Enumerate the 96 quadnuc motifs: <first base><change><last base>.
allBases = ['A', 'C', 'G', 'T']
changes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG'] #format: 'CA' means a change from C>A
allQuadNucs = [firstBase + change + lastBase for firstBase in allBases for change in changes for lastBase in allBases] #enumerate all 96 quadnucs for signatures

# Motifs attributed to each mutational process in this analysis.
# poleMotifs uses the same four motifs as the default ignoreMotifs of calculate_n_muts_of_category.
poleMotifs = set(['TCAT', 'TCAA', 'TCTG', 'TCTT'])
tmzMotifs = set(['ACTC', 'ACTT', 'CCTC', 'CCTT', 'GCTC', 'GCTT', 'TCTC', 'TCTT'])
apobecMotifs = set(['TCTA', 'TCTT', 'TCGA', 'TCGT'])

# Column names of the per-motif hotspot / silent / oncogenic counts.
allHotspotCols = [x + '_hotspot' for x in allQuadNucs]
allSilentCols = [x + '_silent' for x in allQuadNucs]
allOncogenicCols = [x + '_oncogenic' for x in allQuadNucs]



In [107]:
# Earlier hand-written version of the POLE totals, kept for reference; superseded by the function call below.
#nPoleNonSilentPOSSIBLE = sum(simDfSummary['TCAT']) + sum(simDfSummary['TCAA']) + sum(simDfSummary['TCTG']) + sum(simDfSummary['TCTT']) - sum(simDfSummary['TCAT_silent']) - sum(simDfSummary['TCAA_silent']) - sum(simDfSummary['TCTG_silent']) - sum(simDfSummary['TCTT_silent'])
#nPoleHotspotPOSSIBLE = sum(simDfSummary['TCAT_hotspot']) + sum(simDfSummary['TCAA_hotspot']) + sum(simDfSummary['TCTG_hotspot']) + sum(simDfSummary['TCTT_hotspot'])
#nPoleOncogenicPOSSIBLE = sum(simDfSummary['TCAT_oncogenic']) + sum(simDfSummary['TCAA_oncogenic']) + sum(simDfSummary['TCTG_oncogenic']) + sum(simDfSummary['TCTT_oncogenic'])
#nNOTPoleNonSilentPOSSIBLE, nNOTPoleHotspotPOSSIBLE, nNOTPoleOncogenicPOSSIBLE = calculate_n_muts_of_category(simDfSummary)

# For each signature, unpack the six totals from simDfSummary: (non-silent, hotspot, oncogenic) at the
# signature's motifs, then the same three over all remaining motifs (the nNOT* names).
nPOLENonSilentPOSSIBLE, nPOLEHotspotPOSSIBLE, nPOLEOncogenicPOSSIBLE, nNOTPOLENonSilentPOSSIBLE, nNOTPOLEHotspotPOSSIBLE, nNOTPOLEOncogenicPOSSIBLE = calculate_n_possible_muts_by_category_and_sig(poleMotifs)

nTMZNonSilentPOSSIBLE, nTMZHotspotPOSSIBLE, nTMZOncogenicPOSSIBLE, nNOTTMZNonSilentPOSSIBLE, nNOTTMZHotspotPOSSIBLE, nNOTTMZOncogenicPOSSIBLE = calculate_n_possible_muts_by_category_and_sig(tmzMotifs)

nAPOBECNonSilentPOSSIBLE, nAPOBECHotspotPOSSIBLE, nAPOBECOncogenicPOSSIBLE, nNOTAPOBECNonSilentPOSSIBLE, nNOTAPOBECHotspotPOSSIBLE, nNOTAPOBECOncogenicPOSSIBLE = calculate_n_possible_muts_by_category_and_sig(apobecMotifs)




In [15]:
# Read the per-sample signature table (from the unfiltered MAF). The following cells use its
# 'Tumor_Sample_Barcode', 'Nmut_Mb' and 'mean_<N>' columns.
impactSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/signatures_from_unfiltered_maf.txt')


In [108]:
# POLE cases: samples with Nmut_Mb above 50 and a mean_10 value above .33.
poleCases = set(
    impactSigs.loc[
        (impactSigs['Nmut_Mb'] > 50) & (impactSigs['mean_10'] > .33),
        'Tumor_Sample_Barcode',
    ]
)

In [109]:
# TMZ cases: samples with Nmut_Mb above 50 and a mean_11 value above .33.
tmzCases = set(
    impactSigs.loc[
        (impactSigs['Nmut_Mb'] > 50) & (impactSigs['mean_11'] > .33),
        'Tumor_Sample_Barcode',
    ]
)

In [110]:
# APOBEC cases: samples with Nmut_Mb above 75 whose mean_2 and mean_13 values sum to more than .5.
apobecCases = set(
    impactSigs.loc[
        (impactSigs['Nmut_Mb'] > 75)
        & (impactSigs['mean_2'] + impactSigs['mean_13'] > .5),
        'Tumor_Sample_Barcode',
    ]
)


Now go and calculate the p-values.<br/>

In [67]:
def get_pvalues_for_oncogneic_mutations_in_cases(ids, allMutsMaf, sigMotifs,
                                                 nSigOncogenicPossible, nNotSigOncogenicPossible,
                                                 nSigNotSilentPossible, nNotSigNonSilentPossible):
    """Run per-case Fisher exact tests on oncogenic mutations at and away from a signature's motifs.

    A mutation counts as oncogenic when its 'oncogenic' value is not null, and as "at motif"
    when its 'quadNuc' value is in ``sigMotifs``.

    Parameters:
        ids: iterable of Tumor_Sample_Barcode values to evaluate.
        allMutsMaf: DataFrame with 'Tumor_Sample_Barcode', 'quadNuc' and 'oncogenic' columns.
        sigMotifs: collection of quadnuc motifs belonging to the signature.
        nSigOncogenicPossible / nNotSigOncogenicPossible: number of possible oncogenic
            mutations at / away from the signature motifs.
        nSigNotSilentPossible / nNotSigNonSilentPossible: number of possible non-silent
            mutations at / away from the signature motifs.

    Returns:
        DataFrame with one row per case that has at least one mutation, with columns
        'Tumor_Sample_Barcode', 'Nmut', 'pvalueSIGMuts', 'pvalueNOTSIGMuts',
        'oddsratioSIGMuts', 'oddsratioNOTSIGMuts' and 'mutOfInterestFrac'
        (oncogenic at-motif mutations / all at-motif mutations; NaN when the case has
        no at-motif mutations). Cases with no mutations in ``allMutsMaf`` are skipped.
    """
    listOfDicts = []
    for case in ids:
        caseMaf = allMutsMaf[allMutsMaf['Tumor_Sample_Barcode'] == case]
        nmut = caseMaf.shape[0]
        if nmut == 0:
            continue

        # Build each boolean mask once and count the four combinations from them.
        isOncogenic = caseMaf['oncogenic'].notnull()
        atMotif = caseMaf['quadNuc'].isin(sigMotifs)

        nmutAtMotif = int(atMotif.sum())
        nOncogenicSIG = int((isOncogenic & atMotif).sum())
        nOncogenicNOTSIG = int((isOncogenic & ~atMotif).sum())
        nNOTOncogenicSIG = int((~isOncogenic & atMotif).sum())
        nNOTOncogenicNOTSIG = int((~isOncogenic & ~atMotif).sum())

        # Each table row is [observed, possible - observed].
        oddsratioSIGMuts, pvalueSIGMuts = stats.fisher_exact([[nOncogenicSIG, nSigOncogenicPossible - nOncogenicSIG],
                                                [nNOTOncogenicSIG, nSigNotSilentPossible - nNOTOncogenicSIG]])

        oddsratioNOTSIGMuts, pvalueNOTSIGMuts = stats.fisher_exact([[nOncogenicNOTSIG, nNotSigOncogenicPossible - nOncogenicNOTSIG],
                                                [nNOTOncogenicNOTSIG, nNotSigNonSilentPossible - nNOTOncogenicNOTSIG]])

        # A case can have mutations but none at the signature motifs; the fraction is then
        # undefined, so report NaN instead of dividing by zero.
        if nmutAtMotif > 0:
            mutOfInterestFrac = (1.0*nOncogenicSIG)/nmutAtMotif
        else:
            mutOfInterestFrac = float('nan')

        listOfDicts.append({
            'Tumor_Sample_Barcode': case,
            'Nmut': nmut,
            'pvalueSIGMuts': pvalueSIGMuts, 'pvalueNOTSIGMuts': pvalueNOTSIGMuts,
            'oddsratioSIGMuts': oddsratioSIGMuts, 'oddsratioNOTSIGMuts': oddsratioNOTSIGMuts,
            'mutOfInterestFrac': mutOfInterestFrac
        })
    df = pd.DataFrame(listOfDicts)
    return df



In [77]:
def get_pvalues_for_hotspot_mutations_in_cases(ids, allMutsMaf, sigMotifs,
                                               nSigHotspotPossible, nNotSigHotspotPossible,
                                               nSigNotSilentPossible, nNotSigNonSilentPossible):
    """Run per-case Fisher exact tests on hotspot mutations at and away from a signature's motifs.

    A mutation counts as a hotspot when its 'is-a-hotspot' value equals 'Y', and as
    "at motif" when its 'quadNuc' value is in ``sigMotifs``.

    Parameters:
        ids: iterable of Tumor_Sample_Barcode values to evaluate.
        allMutsMaf: DataFrame with 'Tumor_Sample_Barcode', 'quadNuc' and 'is-a-hotspot' columns.
        sigMotifs: collection of quadnuc motifs belonging to the signature.
        nSigHotspotPossible / nNotSigHotspotPossible: number of possible hotspot
            mutations at / away from the signature motifs.
        nSigNotSilentPossible / nNotSigNonSilentPossible: number of possible non-silent
            mutations at / away from the signature motifs.

    Returns:
        DataFrame with one row per case that has at least one mutation, with columns
        'Tumor_Sample_Barcode', 'Nmut', 'pvalueSIGMuts', 'pvalueNOTSIGMuts',
        'oddsratioSIGMuts', 'oddsratioNOTSIGMuts' and 'mutOfInterestFrac'
        (hotspot at-motif mutations / all at-motif mutations; NaN when the case has
        no at-motif mutations). Cases with no mutations in ``allMutsMaf`` are skipped.
    """
    listOfDicts = []
    for case in ids:
        caseMaf = allMutsMaf[allMutsMaf['Tumor_Sample_Barcode'] == case]
        nmut = caseMaf.shape[0]
        if nmut == 0:
            continue

        # Build each boolean mask once and count the four combinations from them.
        isHotspot = caseMaf['is-a-hotspot'] == 'Y'
        notHotspot = caseMaf['is-a-hotspot'] != 'Y'
        atMotif = caseMaf['quadNuc'].isin(sigMotifs)

        nmutAtMotif = int(atMotif.sum())
        nHotspotsSIG = int((isHotspot & atMotif).sum())
        nHotspotsNOTSIG = int((isHotspot & ~atMotif).sum())
        nNOTHotspotsSIG = int((notHotspot & atMotif).sum())
        nNOTHotspotsNOTSIG = int((notHotspot & ~atMotif).sum())

        # Each table row is [observed, possible - observed].
        oddsratioSIGMuts, pvalueSIGMuts = stats.fisher_exact([[nHotspotsSIG, nSigHotspotPossible - nHotspotsSIG],
                                                [nNOTHotspotsSIG, nSigNotSilentPossible - nNOTHotspotsSIG]])

        oddsratioNOTSIGMuts, pvalueNOTSIGMuts = stats.fisher_exact([[nHotspotsNOTSIG, nNotSigHotspotPossible - nHotspotsNOTSIG],
                                                [nNOTHotspotsNOTSIG, nNotSigNonSilentPossible - nNOTHotspotsNOTSIG]])

        # A case can have mutations but none at the signature motifs; the fraction is then
        # undefined, so report NaN instead of dividing by zero.
        if nmutAtMotif > 0:
            mutOfInterestFrac = (1.0*nHotspotsSIG)/nmutAtMotif
        else:
            mutOfInterestFrac = float('nan')

        listOfDicts.append({
            'Tumor_Sample_Barcode': case,
            'Nmut': nmut,
            'pvalueSIGMuts': pvalueSIGMuts, 'pvalueNOTSIGMuts': pvalueNOTSIGMuts,
            'oddsratioSIGMuts': oddsratioSIGMuts, 'oddsratioNOTSIGMuts': oddsratioNOTSIGMuts,
            'mutOfInterestFrac': mutOfInterestFrac
        })
    df = pd.DataFrame(listOfDicts)
    return df
    
    

In [111]:
# POLE: per-case Fisher tests for oncogenic mutations, using the POLE case ids, the POLE motifs and the
# POLE / not-POLE possible-mutation totals.
dfOncPOLE = get_pvalues_for_oncogneic_mutations_in_cases(poleCases, allImpactMuts, poleMotifs,
                                                       nPOLEOncogenicPOSSIBLE, nNOTPOLEOncogenicPOSSIBLE,
                                                 nPOLENonSilentPOSSIBLE, nNOTPOLENonSilentPOSSIBLE)

# Same cases and motifs, for hotspot mutations.
dfHotspotPOLE = get_pvalues_for_hotspot_mutations_in_cases(poleCases, allImpactMuts, poleMotifs,
                                                         nPOLEHotspotPOSSIBLE, nNOTPOLEHotspotPOSSIBLE,
                                               nPOLENonSilentPOSSIBLE, nNOTPOLENonSilentPOSSIBLE)

In [105]:
# TMZ: per-case Fisher tests for oncogenic mutations, using the TMZ case ids, the TMZ motifs and the
# TMZ / not-TMZ possible-mutation totals.
dfOncTMZ = get_pvalues_for_oncogneic_mutations_in_cases(tmzCases, allImpactMuts, tmzMotifs,
                                                       nTMZOncogenicPOSSIBLE, nNOTTMZOncogenicPOSSIBLE,
                                                 nTMZNonSilentPOSSIBLE, nNOTTMZNonSilentPOSSIBLE)

# Same cases and motifs, for hotspot mutations.
dfHotspotTMZ = get_pvalues_for_hotspot_mutations_in_cases(tmzCases, allImpactMuts, tmzMotifs,
                                                         nTMZHotspotPOSSIBLE, nNOTTMZHotspotPOSSIBLE,
                                               nTMZNonSilentPOSSIBLE, nNOTTMZNonSilentPOSSIBLE)

In [101]:
# APOBEC: per-case Fisher tests for oncogenic mutations, using the APOBEC case ids, the APOBEC motifs and
# the APOBEC / not-APOBEC possible-mutation totals.
dfOncAPOBEC = get_pvalues_for_oncogneic_mutations_in_cases(apobecCases, allImpactMuts, apobecMotifs,
                                                       nAPOBECOncogenicPOSSIBLE, nNOTAPOBECOncogenicPOSSIBLE,
                                                 nAPOBECNonSilentPOSSIBLE, nNOTAPOBECNonSilentPOSSIBLE)

# Same cases and motifs, for hotspot mutations.
dfHotspotAPOBEC = get_pvalues_for_hotspot_mutations_in_cases(apobecCases, allImpactMuts, apobecMotifs,
                                                         nAPOBECHotspotPOSSIBLE, nNOTAPOBECHotspotPOSSIBLE,
                                               nAPOBECNonSilentPOSSIBLE, nNOTAPOBECNonSilentPOSSIBLE)

In [112]:
# Write the POLE p-value tables as tab-separated files (no index column) for local plotting.
dfHotspotPOLE.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/hotspotPValues_pole.tsv', index=False, sep='\t')
dfOncPOLE.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/oncogenicPValues_pole.tsv', index=False, sep='\t')


In [106]:
# Write the TMZ p-value tables as tab-separated files (no index column) for local plotting.
dfHotspotTMZ.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/hotspotPValues_tmz.tsv', index=False, sep='\t')
dfOncTMZ.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/oncogenicPValues_tmz.tsv', index=False, sep='\t')


In [102]:
# Write the APOBEC p-value tables as tab-separated files (no index column) for local plotting.
dfHotspotAPOBEC.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/hotspotPValues_apobec.tsv', index=False, sep='\t')
dfOncAPOBEC.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/oncogenicPValues_apobec.tsv', index=False, sep='\t')


**Part II — endometrial POLE vs. non-POLE cases (and other cancer types as well)**<br/>

In [15]:
def summarize_hotspots_per_case_by_motif(maf, sigMotifs, cancerTypeRelatedGenes=None, mode='hotspot'):
    """Summarise, per case, the hotspot (or oncogenic) mutations at and away from a signature's motifs.

    Parameters:
        maf: DataFrame with 'Tumor_Sample_Barcode', 'quadNuc' and 'Hugo_Symbol' columns, plus
            'is-a-hotspot' (mode='hotspot') or 'oncogenic' (mode='oncogenic').
        sigMotifs: collection of quadnuc motifs belonging to the signature.
        cancerTypeRelatedGenes: collection of gene symbols considered related to the cancer type.
            When None, the two fraction columns are left as None instead of raising.
        mode: 'hotspot' selects rows whose 'is-a-hotspot' equals 'Y'; 'oncogenic' selects rows
            whose 'oncogenic' value is not null. The output column names say "hotspot" in both modes.

    Returns:
        DataFrame with one row per case (in order of first appearance in ``maf``) and columns
        'Tumor_Sample_Barcode', 'Nmut', 'fracRelatedMotif', 'fracRelatedNotMotif',
        'nHotspotsAtMotif' and 'hotspotMutsNotAtSigMotif'. A fraction is None when its
        denominator is zero or no gene collection was supplied.

    Raises:
        ValueError: if ``mode`` is neither 'hotspot' nor 'oncogenic'.
    """
    if mode not in ('hotspot', 'oncogenic'):
        raise ValueError("mode must be 'hotspot' or 'oncogenic', got %r" % (mode,))

    def frac_in_related_genes(muts):
        # Fraction of the given mutations that fall in cancerTypeRelatedGenes; None when undefined.
        nMuts = muts.shape[0]
        if nMuts == 0 or cancerTypeRelatedGenes is None:
            return None
        return (1.0*muts[muts['Hugo_Symbol'].isin(cancerTypeRelatedGenes)].shape[0])/nMuts

    listOfDicts = []
    # One pass over the MAF: groupby hands back each case's rows without rescanning the full frame per case.
    for case, caseMaf in maf.groupby('Tumor_Sample_Barcode', sort=False):
        if mode == 'hotspot':
            hotspotMuts = caseMaf[caseMaf['is-a-hotspot'] == 'Y']
        else:
            hotspotMuts = caseMaf[caseMaf['oncogenic'].notnull()]

        atMotif = hotspotMuts['quadNuc'].isin(sigMotifs)
        hotspotMutsAtSigMotif = hotspotMuts[atMotif]
        hotspotMutsNotAtSigMotif = hotspotMuts[~atMotif]

        listOfDicts.append({'Tumor_Sample_Barcode': case, 'Nmut': caseMaf.shape[0],
                            'fracRelatedMotif': frac_in_related_genes(hotspotMutsAtSigMotif),
                            'fracRelatedNotMotif': frac_in_related_genes(hotspotMutsNotAtSigMotif),
                            'nHotspotsAtMotif': hotspotMutsAtSigMotif.shape[0],
                            'hotspotMutsNotAtSigMotif': hotspotMutsNotAtSigMotif.shape[0]})
    return pd.DataFrame(listOfDicts)
    

In [21]:
# Fetch sample ids for 'Endometrial Cancer' from the hypermutation-status id directory via the project helper:
# once with hypermutantStatus='all' and once with hypermutantStatus='Hypermutated'.
endometrialIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'all')
endometrialHyperIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Endometrial Cancer', hypermutantStatus = 'Hypermutated')


In [22]:
# Restrict the cohort MAF to endometrial samples. Take an explicit copy: the MSI helper assigns a
# 'caseMsiClass' column on the frame it is given, and doing that on a boolean-indexed slice of
# allImpactMuts triggers pandas' SettingWithCopyWarning.
endometrialMuts = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(endometrialIds)].copy()
# Annotate each row with its case's MSI class, looked up from the clinical sample file.
endometrialMuts = analysis_utils.map_cases_to_msi_sensor_class(endometrialMuts, msiSensorInfo= pathPrefix + '/ifs/work/taylorlab/friedman/mskImpactAsOfMarch2019/dmp/mskimpact/data_clinical_sample.txt')


  if self.run_code(code, result):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['caseMsiClass'] = df['Tumor_Sample_Barcode'].apply(lambda x: msiClassDict[x] if x in msiClassDict else None)


In [23]:
# Keep only mutations from endometrial cases whose 'caseMsiClass' is 'Stable'.
endometrialMss = endometrialMuts[endometrialMuts['caseMsiClass'] == 'Stable']

In [16]:
# Re-read the same signatures file that was loaded into impactSigs earlier in the notebook, giving the
# next cell (which overwrites impactSigs with merged columns) a fresh table to start from.
impactSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/signatures_from_unfiltered_maf.txt')

In [17]:
# Replace impactSigs with the output of the project's merge_signature_columns helper (the next cell relies on
# a 'mean_MMR' label, presumably produced by this merge -- confirm against the helper).
impactSigs = mutationSigUtils.merge_signature_columns(impactSigs)
# Add a 'dominantSignature' column: each row is passed as a dict to get_dominant_signature with prefix='mean'.
impactSigs['dominantSignature'] = impactSigs.apply(lambda row: 
        mutationSigUtils.get_dominant_signature(row.to_dict(), cols=None, prefix='mean', notEnoughMuts= True), axis=1)

In [18]:
#also filter out case where mmr is the dominant signature or sig1 or sig14 is the dominant signature in >30mut_mb cases
# Also filter out cases where MMR is the dominant signature, or where sig1 or sig14 is the
# dominant signature in cases with Nmut_Mb >= 30.
casesWithMSISignature = set(
    impactSigs.loc[
        (impactSigs['dominantSignature'] == 'mean_MMR')
        | (
            (impactSigs['Nmut_Mb'] >= 30)
            & (impactSigs['dominantSignature'].isin(set(['mean_1', 'mean_14'])))
        ),
        'Tumor_Sample_Barcode',
    ]
)

In [8]:
# Re-import analysis_utils so that edits to the module are picked up without restarting the kernel.
# NOTE(review): a bare `reload` is only a builtin on Python 2; on Python 3 this needs
# `from importlib import reload` -- confirm which kernel this notebook targets.
reload(analysis_utils)
# Build a dict keyed by cancer type for the three listed types; later cells treat each value as a
# collection of gene symbols (presumably a set -- verify against the helper).
relatedGenesDict = analysis_utils.enumerate_related_unrelated_genes_for_hypermutation_analysis(allImpactMuts, cTypes = ['Endometrial Cancer', 'Glioma', 'Bladder Cancer'])

Endometrial Cancer


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cohortMaf['pid'] = cohortMaf['Tumor_Sample_Barcode'].apply(lambda x: x[:9])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  oncogenicMutations['patientGeneMutated'] = oncogenicMutations.apply(lambda row: row['pid'] + '_' + row['Hugo_Symbol'], axis=1)


Glioma
Bladder Cancer


In [24]:
# Drop the MSS endometrial cases that were flagged as having an MSI-like dominant signature.
endometrialMSSNoMSISig = endometrialMss[~endometrialMss['Tumor_Sample_Barcode'].isin(casesWithMSISignature)]
# Hotspot-mode variant of the summary, kept for reference; the active call below uses mode='oncogenic'.
#df = summarize_hotspots_per_case_by_motif(endometrialMSSNoMSISig, poleMotifs, cancerTypeRelatedGenes=relatedGenesDict['Endometrial Cancer'])
# Per-case summary of oncogenic mutations at / away from the POLE motifs, with endometrial-related genes.
df = summarize_hotspots_per_case_by_motif(endometrialMSSNoMSISig, poleMotifs, cancerTypeRelatedGenes=relatedGenesDict['Endometrial Cancer'], mode='oncogenic')


In [25]:
# Write the endometrial summary as a tab-separated file (no index column) for local plotting.
# Note: the file name says "Hotspot", but df was built with mode='oncogenic' in the preceding cell.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/poleEndometrialHotspotCharacteristics.tsv', index=False, sep='\t')

In [4]:
#Do the same for gliomas (TMZ) and bladders (APOBEC)
# Fetch all sample ids (hypermutantStatus='all') for 'Bladder Cancer' and 'Glioma' from the
# hypermutation-status id directory via the project helper.
bladderIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Bladder Cancer', hypermutantStatus = 'all')
gliomaIds = analysis_utils.get_ids_by_hypermutant_status(hypermutantIdDir=pathPrefix +'/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/hypermutationStatusIds', cancerType='Glioma', hypermutantStatus = 'all')


In [5]:
# Subset the cohort MAF to the bladder and glioma samples respectively.
bladderMaf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(bladderIds)]
gliomaMaf = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(gliomaIds)]

In [10]:
# Remove these five genes from the bladder and glioma related-gene collections before summarising.
# The subtraction assumes each relatedGenesDict value supports `-` with a set -- verify against the helper
# that builds relatedGenesDict.
hypermutationGenes = set(['MLH1', 'MSH2', 'MSH6', 'POLE', 'PMS2'])
relatedGenesDict['Bladder Cancer'] = relatedGenesDict['Bladder Cancer'] - hypermutationGenes
relatedGenesDict['Glioma'] = relatedGenesDict['Glioma'] - hypermutationGenes

In [12]:
# Per-case summaries for bladder (APOBEC motifs) and glioma (TMZ motifs).
# NOTE(review): mode is left at its default 'hotspot' here, whereas the endometrial summary earlier in the
# notebook passes mode='oncogenic' -- confirm the difference is intended.
dfBladder = summarize_hotspots_per_case_by_motif(bladderMaf, apobecMotifs, cancerTypeRelatedGenes=relatedGenesDict['Bladder Cancer'])
dfGlioma = summarize_hotspots_per_case_by_motif(gliomaMaf, tmzMotifs, cancerTypeRelatedGenes=relatedGenesDict['Glioma'])



In [13]:
# Write the bladder and glioma summaries as tab-separated files (no index column) for local plotting.
dfBladder.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/apobecBladderHotspotCharacteristics.tsv', index=False, sep='\t')
dfGlioma.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/tmzGliomaHotspotCharacteristics.tsv', index=False, sep='\t')