In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np

from collections import Counter

sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import get_gene_and_cohort_list_utils
import configuration_util
import mutation_modeling_util
import re

# `reload` is a builtin only on Python 2; on Python 3 it must be imported from
# importlib. Fall back so this cell runs under either interpreter.
try:
    reload
except NameError:
    from importlib import reload

# Re-import configuration_util so edits made to it since the kernel started
# are picked up, then fetch the mapping of file-path keys used by later cells
# (indexed below with keys such as 'IMPACT_BASE_MAF').
reload(configuration_util)
filePathDict = configuration_util.get_all_files_path_dict()

In [None]:
#specify where to write files
# Directory that the figure-data .tsv files written by the cells below are joined onto.
# NOTE(review): this is an absolute, user-specific path; the notebook will not run
# unchanged on another machine -- consider deriving it from configuration.
writeDir = '/Users/friedman/Desktop/hypermutationProjectFinal/scripts/figure1/FIGURE1_PLOTTING_FILES/'

## Figure S1(i)
Summaries of TMB distributions and hypermutation classifications

In [None]:
# os.path.join only accepts path components; `index` and `sep` are to_csv
# options and passing them to join raises TypeError, so they are given to
# to_csv only.
# NOTE(review): no cell visible under this heading builds `df`, and the file
# name says "iv" although the heading is Figure S1(i) -- confirm which data
# and which file name are intended before running this cell.
writePath = os.path.join(writeDir, 'figureS1_iv.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure S1(ii)
Signature summaries of hypermutated cases by cancer type

In [None]:
# os.path.join only accepts path components; `index` and `sep` are to_csv
# options and passing them to join raises TypeError, so they are given to
# to_csv only.
# NOTE(review): no cell visible under this heading builds `df`, and the file
# name says "iii" although the heading is Figure S1(ii); the Figure S1(iii)
# cell writes to the same file name -- confirm the intended data and name.
writePath = os.path.join(writeDir, 'figureS1_iii.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure S1(iii)
Fraction of driver mutations by TMB

In [None]:
def count_fraction_drivers(maf):
    """Tabulate, per sample, the share of its mutations annotated as drivers.

    Only rows whose `Hugo_Symbol` is in the gene list returned by
    get_gene_and_cohort_list_utils.get_im3_genes() are considered
    (presumably the IMPACT-341 panel -- confirm). A row counts as a driver
    when its `oncogenic` column is non-null.

    Parameters
    ----------
    maf : DataFrame with at least the columns `Hugo_Symbol`, `oncogenic`
        and `Tumor_Sample_Barcode`.

    Returns
    -------
    DataFrame with one row per distinct `Tumor_Sample_Barcode` in the
    gene-filtered MAF and the fields `nMut`, `nDriver`, `dominantSignature`
    (None when the sample has no entry in the signature-cohort lookup) and
    `fracDriver` (= nDriver / nMut). The barcode itself is not part of the
    output.
    """
    panelGenes = get_gene_and_cohort_list_utils.get_im3_genes()
    panelMaf = maf[maf['Hugo_Symbol'].isin(panelGenes)]
    oncogenicMaf = panelMaf[panelMaf['oncogenic'].notnull()]

    # Per-sample row counts, as plain dicts keyed by barcode.
    nDriverByCase = dict(oncogenicMaf['Tumor_Sample_Barcode'].value_counts())
    nMutByCase = dict(panelMaf['Tumor_Sample_Barcode'].value_counts())
    sigByCase = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])

    records = [
        {
            'nMut': nMutByCase.get(case, 0),
            'nDriver': nDriverByCase.get(case, 0),
            'dominantSignature': sigByCase[case] if case in sigByCase else None
        }
        for case in set(panelMaf['Tumor_Sample_Barcode'])
    ]

    summary = pd.DataFrame(records)
    summary['fracDriver'] = summary['nDriver']/summary['nMut']
    return summary

In [2]:
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
df = count_fraction_drivers(allImpactMutsMaf)

#adjust the labels for plotting
# Dominant-signature codes -> short plotting labels. Any value without an
# entry here (including missing values) is labelled 'other'.
fracDriverSignatureLabels = {
    'insufficientMutBurden': 'low-TMB',
    'mean_APOBEC': 'APOBEC',
    'mean_MMR': 'MMR',
    'mean_SMOKING': 'SMOKING',
    'mean_10': 'POLE',
    'mean_11': 'TMZ',
    'mean_7': 'UV',
}
df['dominantSignature'] = df['dominantSignature'].apply(
    lambda x: fracDriverSignatureLabels.get(x, 'other'))

  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# os.path.join only accepts path components; `index` and `sep` are to_csv
# options and passing them to join raises TypeError, so they are given to
# to_csv only.
writePath = os.path.join(writeDir, 'figureS1_iii.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure S1(iv)
Propensity of each COSMIC signature to cause truncating, oncogenic, and hotspot mutations

In [None]:
def calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, mutType):
    """Estimate, for each of Signature.1 .. Signature.30, the expected chance of a `mutType` mutation.

    For every signature a synthetic decomposition is built in which that
    signature has weight 1 and the other 29 have weight 0. The decomposition
    is turned into quad-nucleotide fractions, which are then combined with the
    mutation-chance summary for `mutType` via the mutation_modeling_util helpers.

    Parameters
    ----------
    dfAllPossibleMutations : table forwarded unchanged to
        mutation_modeling_util.summarize_mutation_chances.
    mutType : label forwarded to summarize_mutation_chances and echoed in the
        output (callers in this notebook pass 'truncating', 'oncogenic', 'hotspot').

    Returns
    -------
    DataFrame with one row per signature and the fields `Signature_Name`,
    `frac` and `mutType`.
    """
    allSignatures = ['Signature.%d' % n for n in range(1, 31)]
    spectra = mutationSigUtils.convert_spectrum_file_to_dict_of_dicts(spectrumFile=filePathDict['SIGNATURE_SPECTRUM'])

    rows = []
    for activeSig in allSignatures:
        # Pretend the case decomposes to 100% of the current signature.
        pureDecomposition = {sig: (1 if sig == activeSig else 0) for sig in allSignatures}
        quadNucFractions = mutation_modeling_util.get_quadnuc_fracs_given_decomposition(pureDecomposition, spectra)

        # NOTE(review): this call does not use the loop variable; if the helper
        # is pure it could be computed once before the loop -- confirm.
        mutChances = mutation_modeling_util.summarize_mutation_chances(dfAllPossibleMutations, mutType)

        expectedChance = mutation_modeling_util.get_expected_mut_chance_given_quadnuc_fractions(quadNucFractions, mutChances)
        rows.append({'Signature_Name': activeSig, 'frac': expectedChance, 'mutType': mutType})

    return pd.DataFrame(rows)

In [None]:
dfAllPossibleMutations = pd.read_table(filePathDict['ALL_POSSIBLE_MUTATION_SUMMARY'])
dfAllPossibleMutations = mutation_modeling_util.add_zero_cols_to_counts_df(dfAllPossibleMutations)

# One susceptibility table per mutation class, stacked below into a single frame.
dfTruncating = calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, 'truncating')
dfOncogneic = calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, 'oncogenic')
dfHotspot = calculate_mut_susceptibility_of_genes_by_signature(dfAllPossibleMutations, 'hotspot')
df = pd.concat([dfTruncating, dfOncogneic, dfHotspot])

#set colors for plotting
# Signature name -> colour group. Signatures not listed fall back to 'OTHER'.
signatureColorNames = {
    'Signature.10': 'POLE',
    'Signature.6': 'MMR',
    'Signature.15': 'MMR',
    'Signature.20': 'MMR',
    'Signature.21': 'MMR',
    'Signature.26': 'MMR',
    'Signature.7': 'UV',
    'Signature.2': 'APOBEC',
    'Signature.13': 'APOBEC',
    'Signature.3': 'BRCA',
    'Signature.4': 'SMOKING',
    'Signature.14': 'POLE_MMR',
    'Signature.1': 'AGING',
}
df['colorName'] = df['Signature_Name'].apply(
    lambda x: signatureColorNames.get(x, 'OTHER'))

In [None]:
# os.path.join only accepts path components; `index` and `sep` are to_csv
# options and passing them to join raises TypeError, so they are given to
# to_csv only.
writePath = os.path.join(writeDir, 'figureS1_iv.tsv')
df.to_csv(writePath, index=False, sep='\t')

## Figure S1(v)
Fraction of SNVs, indels, etc. by aetiology

In [12]:
def summarize_mutation_information_by_signature(maf):
    """Summarise indel and nonsense fractions for each dominant signature.

    Parameters
    ----------
    maf : DataFrame with the columns `dominantSignature`, `Variant_Type`
        and `Variant_Classification`.

    Returns
    -------
    DataFrame with one row per distinct non-missing `dominantSignature`
    value, in sorted order, and the fields:
      * `signature`      -- the signature label
      * `fracIndel`      -- share of rows whose `Variant_Type` is 'INS' or 'DEL'
      * `fracTruncating` -- share of rows whose `Variant_Classification`
                            is 'Nonsense_Mutation'
    An input with no labelled rows yields an empty DataFrame.
    """
    indelTypes = ['INS', 'DEL']
    listOfDicts = []

    # Rows with a missing signature are skipped: comparing the column against
    # None/NaN matches nothing, which previously produced an empty subset and
    # a 0.0/0.0 ZeroDivisionError. Sorting makes the row order reproducible
    # instead of depending on set iteration order.
    signatures = sorted(maf['dominantSignature'].dropna().unique())

    for signature in signatures:
        signatureMaf = maf[maf['dominantSignature'] == signature]
        nMuts = float(len(signatureMaf))
        nIndels = float(signatureMaf['Variant_Type'].isin(indelTypes).sum())
        nTruncating = float((signatureMaf['Variant_Classification'] == 'Nonsense_Mutation').sum())

        listOfDicts.append({'signature': signature, 'fracIndel': nIndels/nMuts,
                           'fracTruncating': nTruncating/nMuts
                           })

    df = pd.DataFrame(listOfDicts)
    return df

In [2]:
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
domSigDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath = filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
hypermutatedIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])

# Take an explicit copy of the filtered rows: columns are added below, and
# assigning onto a boolean-mask selection of allImpactMutsMaf triggers
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
hypermutatedMaf = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(hypermutatedIds)].copy()

#summarize dominant signatures
# Step 1: look up each sample's dominant-signature code (None when the sample
# has no entry in domSigDict).
hypermutatedMaf['dominantSignature'] = hypermutatedMaf['Tumor_Sample_Barcode'].apply(lambda x:
    domSigDict[x] if x in domSigDict else None)

# Step 2: collapse the codes to plotting labels; anything unlisted (including
# None) becomes 'other'.
# NOTE(review): 'mean_1' is grouped with MMR here -- confirm this is intended.
hypermutatedSignatureLabels = {
    'mean_APOBEC': 'APOBEC',
    'mean_MMR': 'MMR',
    'mean_1': 'MMR',
    'mean_SMOKING': 'SMOKING',
    'mean_10': 'POLE',
    'mean_14': 'POLE_MMR',
    'mean_11': 'TMZ',
    'mean_7': 'UV',
}
hypermutatedMaf['dominantSignature'] = hypermutatedMaf['dominantSignature'].apply(
    lambda x: hypermutatedSignatureLabels.get(x, 'other'))

#summarize indels
# 0/1 flags per mutation row; only 'Nonsense_Mutation' is counted as truncating here.
indels = ['INS', 'DEL']
hypermutatedMaf['isTruncating'] = (hypermutatedMaf['Variant_Classification'] == 'Nonsense_Mutation').astype(int)
hypermutatedMaf['isIndel'] = hypermutatedMaf['Variant_Type'].isin(indels).astype(int)


  """Entry point for launching an IPython kernel.
  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
# Keep only the per-mutation flags and the signature label, then write them as a
# tab-separated file without the index.
# NOTE(review): this cell writes to a hard-coded absolute path rather than under
# `writeDir` like the other figure cells -- confirm the destination is intended.
hypermutatedMafWrite = hypermutatedMaf[['isIndel', 'isTruncating', 'dominantSignature']]
hypermutatedMafWrite.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/mutationTypeSummary.tsv', index=False, sep='\t')

## Figure S1(vi)

In [13]:
def find_nearest(array, value):
    """Return the element of `array` that is numerically closest to `value`.

    `array` may be any sequence accepted by np.asarray. When several elements
    are equally close, the one that appears first is returned (np.argmin
    picks the first minimum).
    """
    candidates = np.asarray(array)
    closestIdx = np.argmin(np.abs(candidates - value))
    return candidates[closestIdx]


In [19]:
# Load the nucleosome positions (headerless; the first three columns are named
# below) and strip the 'chr' prefix so chromosome names can be matched against
# str(row['Chromosome']) from the MAF.
nucleosomeBed = pd.read_table(filePathDict['NUCLEOSOME_DYAD_POSITIONS'], header=None)
nucleosomeBed = nucleosomeBed.rename(columns = {0: 'Chromosome', 1: 'Nucleosome_Start_Position', 2: 'Nucleosome_End_Position'})
nucleosomeBed['Chromosome'] = nucleosomeBed['Chromosome'].apply(lambda x: re.sub('chr', '', x))
impactMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])

#Figure out the closest nucleosome
# Per-chromosome arrays of nucleosome start positions. They are stored as numpy
# arrays so find_nearest's np.asarray call is a no-op, rather than converting
# a Python list again for every mutation row.
chromosomeDicts = {}
for chromosome in set(nucleosomeBed['Chromosome']):
    chromosomeBed = nucleosomeBed[nucleosomeBed['Chromosome'] == chromosome]
    chromosomeDicts[chromosome] = np.asarray(chromosomeBed['Nucleosome_Start_Position'])
impactMaf['closestNucleosome'] = impactMaf.apply(lambda row:
        find_nearest(chromosomeDicts[str(row['Chromosome'])], row['Start_Position']), axis = 1)

# Signed distance from the mutation to its nearest nucleosome start, and a 0/1
# flag for being strictly within the threshold (column arithmetic instead of
# row-wise apply).
impactMaf['closestNucleosomeDistance'] = impactMaf['Start_Position'] - impactMaf['closestNucleosome']
nucleosomeCloseThresh = 1000
impactMaf['isCloseToNucleosome'] = (impactMaf['closestNucleosomeDistance'].abs() < nucleosomeCloseThresh).astype(int)

# Restrict to hypermutated samples and label truncating mutations by gene class.
hypermutationIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
# Explicit copy: a column is added below, and assigning onto a boolean-mask
# selection of impactMaf triggers SettingWithCopyWarning.
hypermutationMaf = impactMaf[impactMaf['Tumor_Sample_Barcode'].isin(hypermutationIds)].copy()
tsgs = get_gene_and_cohort_list_utils.get_tsgs()
oncogenes = get_gene_and_cohort_list_utils.get_oncogenes()
truncatingConsequences = set(['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins'])
# 'truncatingTSG' takes precedence if a gene appears in both lists; rows that
# are not truncating, or whose gene is in neither list, get None.
hypermutationMaf['truncatingType'] = hypermutationMaf.apply(lambda row:
        'truncatingTSG' if row['Hugo_Symbol'] in tsgs and row['Variant_Classification'] in truncatingConsequences
        else 'truncatingOncogene' if row['Hugo_Symbol'] in oncogenes and row['Variant_Classification'] in truncatingConsequences
        else None, axis=1)


  """Entry point for launching an IPython kernel.


In [41]:
# os.path.join only accepts path components; `index` and `sep` are to_csv
# options and passing them to join raises TypeError, so they are given to
# to_csv only.
# NOTE(review): the file name says "v" although this cell sits under the
# Figure S1(vi) heading -- confirm the intended name.
writePath = os.path.join(writeDir, 'figureS1_v.tsv')
hypermutationMaf.to_csv(writePath, index=False, sep='\t')
#hypermutationMaf.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/nucleosomeDyadOnfo.tsv', index=False)