In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
import re
from collections import Counter

# Extend the import search path so the two project modules imported next can be found
# (presumably they live in this directory -- TODO confirm).
# NOTE(review): this is an absolute path on one machine; adjust it when running elsewhere.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
import get_gene_and_cohort_list_utils

# Lookup of project file locations; later cells read its 'IMPACT_SIGNATURE_DECOMPOSITIONS'
# and 'HYPERMUTATION_STATUS_IDS' entries.
filePathDict = configuration_util.get_all_files_path_dict()

In [2]:
def summarize_n_hypermutated_and_signatures_of_cases(dominantSignatureDict, mutClassificationDir):
    """Summarize hypermutation status and dominant signatures for each cancer type.

    Every ``<cancerType>.tsv`` file found directly in ``mutClassificationDir`` is
    read as a tab-separated table; the cancer type is the file name without the
    ``.tsv`` suffix. Each table must provide the columns ``Tumor_Sample_Barcode``
    and ``hypermutantClassification``. Hidden files and files without the
    ``.tsv`` suffix (e.g. ``.DS_Store``) are skipped.

    Parameters
    ----------
    dominantSignatureDict : dict
        Maps a Tumor_Sample_Barcode to the label of its dominant signature.
        Barcodes missing from the mapping are treated as having no signature.
    mutClassificationDir : str
        Directory holding one classification table per cancer type.

    Returns
    -------
    (casesSummaryDf, signaturesSummaryDf) : tuple of pandas.DataFrame
        ``casesSummaryDf`` has one row per cancer type with the columns
        ``cancerType``, ``nHypermutated``, ``nTotal`` and ``nHighMutBurden``.
        ``signaturesSummaryDf`` has one row per (cancer type, dominant signature)
        with the columns ``cancerType``, ``signature`` (label with ``'mean_'``
        removed), ``nCases``, ``nHypermutatedCases``,
        ``nHighMutationBurdenCases`` and ``nTotal``. ``nCases`` and ``nTotal``
        both hold the number of rows in the cancer type's table.
        Both frames keep these columns even when no table was found.
    """
    tsvSuffix = '.tsv'
    caseColumns = ['cancerType', 'nHypermutated', 'nTotal', 'nHighMutBurden']
    signatureColumns = ['cancerType', 'signature', 'nCases', 'nHypermutatedCases',
                        'nHighMutationBurdenCases', 'nTotal']
    nCasesSummaryL = []
    signaturesSummaryL = []
    # os.listdir returns entries in arbitrary order; sorting keeps the output reproducible.
    for f in sorted(os.listdir(mutClassificationDir)):
        if f.startswith('.') or not f.endswith(tsvSuffix):
            continue
        # Strip the literal suffix only (a regex '.tsv' would also eat e.g. 'otsv' inside a name).
        cancerType = f[:-len(tsvSuffix)]
        df = pd.read_table(os.path.join(mutClassificationDir, f))
        df['dominantSignature'] = df['Tumor_Sample_Barcode'].apply(dominantSignatureDict.get)

        hypermutatedDf = df[df['hypermutantClassification'] == 'Hypermutated']
        highMutBurdenDf = df[df['hypermutantClassification'] == 'highMutationBurden']
        nTotal = df.shape[0]

        nCasesSummaryL.append({'cancerType': cancerType,
                               'nHypermutated': hypermutatedDf.shape[0],
                               'nTotal': nTotal,
                               'nHighMutBurden': highMutBurdenDf.shape[0]})

        #TODO make the code for plotting the signatures
        hasSignature = df['dominantSignature'].notnull()
        # Sorted so that the row order does not depend on set iteration order.
        for signature in sorted(set(df.loc[hasSignature, 'dominantSignature'])):
            signatureName = re.sub('mean_', '', signature)
            signaturesSummaryL.append({
                'cancerType': cancerType,
                'signature': signatureName,
                'nCases': nTotal,
                'nHypermutatedCases': int((hypermutatedDf['dominantSignature'] == signature).sum()),
                'nHighMutationBurdenCases': int((highMutBurdenDf['dominantSignature'] == signature).sum()),
                'nTotal': nTotal})

    casesSummaryDf = pd.DataFrame(nCasesSummaryL, columns=caseColumns)
    signaturesSummaryDf = pd.DataFrame(signaturesSummaryL, columns=signatureColumns)
    return casesSummaryDf, signaturesSummaryDf
        

In [3]:
def summarize_dominant_signatures_of_cases(mutClassificationDir = '/Users/friedman/Desktop/hypermutationStatusIds/'):
    """Collect the dominant signature of every hypermutated case.

    Each ``.tsv`` file directly inside ``mutClassificationDir`` is read as a
    tab-separated table that must provide the columns ``Tumor_Sample_Barcode``,
    ``hypermutantClassification`` and ``dominantSignature``. Hidden files and
    files without the ``.tsv`` suffix (e.g. ``.DS_Store``) are skipped.

    Parameters
    ----------
    mutClassificationDir : str
        Directory holding the classification tables.
        NOTE(review): the default is an absolute path on one machine.

    Returns
    -------
    dict
        Tumor_Sample_Barcode -> dominantSignature for the rows whose
        ``hypermutantClassification`` equals ``'Hypermutated'``. Files are
        processed in sorted name order, so if a barcode appears in several
        files the entry from the last file wins.
    """
    d = {}
    # Sorted so the winner for a duplicated barcode does not depend on listdir order.
    for f in sorted(os.listdir(mutClassificationDir)):
        if f.startswith('.') or not f.endswith('.tsv'):
            continue
        df = pd.read_table(os.path.join(mutClassificationDir, f))
        hypermutatedDf = df[df['hypermutantClassification'] == 'Hypermutated']
        d.update(zip(hypermutatedDf['Tumor_Sample_Barcode'], hypermutatedDf['dominantSignature']))
    return d

In [43]:
# Build the per-case signature lookup from the IMPACT signature decompositions file.
# NOTE(review): presumably Tumor_Sample_Barcode -> dominant signature label, since that is how
# summarize_n_hypermutated_and_signatures_of_cases consumes it -- confirm against the helper.
dominantSignatureDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
# Per-cancer-type case counts and per-(cancer type, signature) counts that the cells below adjust.
casesSummary, signaturesSummary = summarize_n_hypermutated_and_signatures_of_cases(dominantSignatureDict,
    mutClassificationDir = filePathDict['HYPERMUTATION_STATUS_IDS'])

  import sys


In [44]:
#ADJUST THE CASES SUMMARY INFORMATION
# NOTE: this value was never used by the lumping below; it is kept only in case another
# cell still refers to the name.
minNCasesToDisplay = 250
# Cancer types with fewer hypermutated + high-mutation-burden cases than this are lumped as 'other'.
minNHyperHighCasesToDisplay = 10
# Column-wise replacement (also works on an empty frame, unlike a row-wise apply).
casesSummary['cancerType'] = casesSummary['cancerType'].mask(
    (casesSummary['nHypermutated'] + casesSummary['nHighMutBurden']) < minNHyperHighCasesToDisplay,
    'other')



In [46]:
#ADJUST THE SIGNATURES SUMMARY INFORMATION
# (cancer type, signature) rows with fewer hypermutated + high-mutation-burden cases than this
# are pooled under the cancer type 'other'.
minNHypermutatedCasesToDisplay = 10
signaturesSummary['cancerType'] = signaturesSummary['cancerType'].mask(
    (signaturesSummary['nHypermutatedCases'] + signaturesSummary['nHighMutationBurdenCases'])
    < minNHypermutatedCasesToDisplay,
    'other')
# Ordering key: the hypermutated-case count, with -1 for the pooled 'other' rows.
signaturesSummary['orderingVal'] = signaturesSummary['nHypermutatedCases'].where(
    signaturesSummary['cancerType'] != 'other', -1)

# Signature labels to display names; any label that is not a key becomes 'other'.
signaturesRenameDict = {'1': 'MMR', 'SMOKING': 'SMOKING',
                        'MMR': 'MMR', 'APOBEC': 'APOBEC', '10': 'POLE',
                       '11': 'TMZ', '14': 'POLE&MMR', '7': 'UV'}
# CAUTION: run this cell once per fresh signaturesSummary. Display names such as 'POLE', 'TMZ',
# 'POLE&MMR' and 'UV' are not keys of the dict, so a second run would turn them into 'other'.
signaturesSummary['signature'] = signaturesSummary['signature'].map(
    lambda signatureLabel: signaturesRenameDict.get(signatureLabel, 'other'))

In [47]:
# Hypermutated + high-mutation-burden cases for each (cancer type, signature) row.
signaturesSummary['nHyperHigh'] = signaturesSummary['nHighMutationBurdenCases'] + signaturesSummary['nHypermutatedCases']
# Total of those cases over all rows sharing the row's cancer type, computed in one grouped pass
# instead of re-filtering the whole frame for every row.
signaturesSummary['nCasesHyperHighType'] = signaturesSummary.groupby('cancerType')['nHyperHigh'].transform('sum')
# Share of the cancer type's hypermutated/high-burden cases attributed to this signature.
# Vectorized division yields NaN (not an exception) when a cancer type has no such cases.
signaturesSummary['frac'] = signaturesSummary['nHyperHigh'].astype(float) / signaturesSummary['nCasesHyperHighType']

In [48]:
cancersOfIndeterminateOrigin = set(['Cancer_of_Unknown_Primary', 'Skin_Cancer,_Non-Melanoma'])  #remove cancers of indeterminate origin
signaturesSummary = signaturesSummary[~signaturesSummary['cancerType'].isin(cancersOfIndeterminateOrigin)]
casesSummary = casesSummary[~casesSummary['cancerType'].isin(cancersOfIndeterminateOrigin)]

# Collapse to one row per cancer type (this merges everything that was lumped into 'other').
# Only the count columns are summed, so derived columns from a previous run of this cell
# (fractions, labels) are rebuilt below rather than aggregated.
casesSummary = casesSummary.groupby('cancerType')[['nHypermutated', 'nTotal', 'nHighMutBurden']].sum()
casesSummary['cancerType'] = casesSummary.index
# Drop the index *name* (the index values stay the cancer types): an index level and a column
# that are both called 'cancerType' make groupby('cancerType') ambiguous if this cell is re-run.
casesSummary = casesSummary.rename_axis(None)

# Fraction of all cases that are hypermutated or high mutation burden.
casesSummary['fracHypermutated'] = (
    (casesSummary['nHypermutated'] + casesSummary['nHighMutBurden']).astype(float) / casesSummary['nTotal'])
# Ordering key: the fraction, with -1 for the pooled 'other' row.
casesSummary['fracHypermutatedOrdering'] = casesSummary['fracHypermutated'].where(
    casesSummary['cancerType'] != 'other', -1)
#casesSummary['label'] = casesSummary.apply(lambda row: 
#                                           row['cancerType'] + ' (' + str(100*round(row['fracHypermutated'], 3)) + '%)', axis=1)

# Display label: cancer type plus its number of hypermutated + high-mutation-burden cases.
casesSummary['label'] = (
    casesSummary['cancerType'] + ' (n = '
    + (casesSummary['nHypermutated'] + casesSummary['nHighMutBurden']).astype(str) + ')')


In [23]:
# Output directory for the figure 1 plotting tables.
# NOTE(review): the active value is an absolute path on one machine; the commented-out line
# is a variant relative to the current working directory.
#writeDir = os.path.join(os.getcwd(), 'FIGURE1_PLOTTING_FILES')
writeDir = '/Users/friedman/Desktop/hypermutationProjectFinal/scripts/figure1/FIGURE1_PLOTTING_FILES/'

In [50]:
# Write both summary tables into writeDir as tab-separated files, without the index column.
signaturesSummary.to_csv(os.path.join(writeDir, 'figure1bSignatureSummary.tsv'), index=False, sep='\t')
casesSummary.to_csv(os.path.join(writeDir, 'figure1aCancerTypeSummary.tsv'), index=False, sep='\t')

In [None]:
def get_per_case_nmut_mb_info(nmutDfPath = None):
    """Return a dict mapping Tumor_Sample_Barcode to Nmut_Mb.

    Parameters
    ----------
    nmutDfPath : str, optional
        Tab-separated table with the columns ``Tumor_Sample_Barcode`` and
        ``Nmut_Mb``. When omitted, the project default
        ``pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/nmutInfo_impact_filtered.tsv'``
        is used; the global ``pathPrefix`` must then be defined by the caller's
        session (it is not defined in this notebook as far as visible).

    Returns
    -------
    dict
        Tumor_Sample_Barcode -> Nmut_Mb. If a barcode occurs more than once,
        the value of its last row is kept.
    """
    if nmutDfPath is None:
        # Resolved at call time: building this in the signature made the def itself raise
        # NameError whenever pathPrefix was not defined yet.
        nmutDfPath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/nmutInfo_impact_filtered.tsv'
    df = pd.read_table(nmutDfPath)
    return dict(zip(df['Tumor_Sample_Barcode'], df['Nmut_Mb']))

In [49]:
# Show the hypermutated and high-mutation-burden case counts for each cancer type.
casesSummary[['cancerType', 'nHypermutated', 'nHighMutBurden']]

Unnamed: 0_level_0,cancerType,nHypermutated,nHighMutBurden
cancerType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bladder_Cancer,Bladder_Cancer,134,0
Breast_Cancer,Breast_Cancer,87,0
Colorectal_Cancer,Colorectal_Cancer,353,0
Endometrial_Cancer,Endometrial_Cancer,335,0
Esophagogastric_Cancer,Esophagogastric_Cancer,55,0
Glioma,Glioma,70,0
Head_and_Neck_Cancer,Head_and_Neck_Cancer,14,0
Hepatobiliary_Cancer,Hepatobiliary_Cancer,19,0
Melanoma,Melanoma,348,0
Non-Small_Cell_Lung_Cancer,Non-Small_Cell_Lung_Cancer,358,0
