In [2]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import re

from collections import Counter

# Root under which the lab file system is mounted locally. Defaults to the
# original mount point; set HYPERMUT_PATH_PREFIX in the environment to run the
# notebook on another machine without editing it.
pathPrefix = os.environ.get('HYPERMUT_PATH_PREFIX', '/Users/friedman/Desktop/mnt')

# Make the project-local helper modules importable. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
myUtilsDir = pathPrefix + '/ifs/work/taylorlab/friedman/myUtils'
if myUtilsDir not in sys.path:
    sys.path.append(myUtilsDir)
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import mutation_modeling_util


**Load in data**

In [5]:
# Directory holding the simulated per-gene MAF files.
# NOTE(review): simMafDir is not referenced by any other cell shown here — confirm it is still needed.
simMafDir = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/simulatedMafs/geneMutMafs'

In [7]:
# Summary table of the simulated mutations. Later cells read its Hugo_Symbol,
# nPossibleMuts, nSilent and totalNOncogenic columns, plus one column per
# quad-nucleotide context with '_silent' and '_oncogenic' variants.
simDfSummary = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/simulatedMutationSummary.tsv')

In [None]:
# Load the annotated cohort MAF through the project helper.
# NOTE(review): nLinesFile is presumably the approximate line count the helper
# uses for progress reporting — confirm against analysis_utils.
allImpactMuts = analysis_utils.load_in_df_with_progress(filePath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', nLinesFile = 275000)

**First, compare the number of distinct mutations observed with the number of possible mutations, per gene**

In [15]:
# Key used to count distinct mutations: gene symbol, start position and
# alternate allele. Built with vectorized string concatenation instead of a
# row-wise apply, which is far slower on a MAF with hundreds of thousands of rows.
allImpactMuts['mutUuid'] = (
    allImpactMuts['Hugo_Symbol'] + '_'
    + allImpactMuts['Start_Position'].astype(str)
    + allImpactMuts['Tumor_Seq_Allele2']
)

In [16]:
# Restrict to single-nucleotide variants. Take an explicit copy so that
# assigning the quadNuc column to this frame does not trigger pandas'
# SettingWithCopyWarning.
allImpactSNPs = allImpactMuts[allImpactMuts['Variant_Type'] == 'SNP'].copy()

In [None]:
# Annotate every SNP with its quad-nucleotide context using the project helper,
# which receives the row's Ref_Tri, Reference_Allele, Tumor_Seq_Allele2 and
# Variant_Type. Later cells compare this column against four-letter strings
# such as 'TCAT'.
# NOTE(review): allImpactSNPs is a boolean-filtered slice of allImpactMuts, so
# this assignment can emit SettingWithCopyWarning unless that slice is an
# independent copy.
allImpactSNPs['quadNuc'] = allImpactSNPs.apply(lambda row: mutationSigUtils.create_reference_four_nuc(row['Ref_Tri'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)


In [36]:
# Possible non-silent mutations = all possible mutations minus the silent ones.
simDfSummary['nNonSilentMuts'] = simDfSummary['nPossibleMuts'] - simDfSummary['nSilent']

In [38]:
# For every gene in the simulation summary, compare the number of distinct SNPs
# observed in the cohort with the number of possible non-silent mutations, and
# do the same for mutations carrying an 'oncogenic' annotation.
# NOTE(review): nPossible excludes silent mutations, but nObserved counts every
# SNP row for the gene regardless of Variant_Classification — confirm that this
# numerator/denominator mismatch is intended.
nOncogenicPossibleDict = dict(zip(simDfSummary['Hugo_Symbol'], simDfSummary['totalNOncogenic']))
nPossibleDict = dict(zip(simDfSummary['Hugo_Symbol'], simDfSummary['nNonSilentMuts']))

# Count distinct mutations per gene once, instead of re-filtering the whole
# SNP table twice for every gene inside the loop.
observedPerGene = allImpactSNPs.groupby('Hugo_Symbol')['mutUuid'].nunique()
oncogenicSNPs = allImpactSNPs[allImpactSNPs['oncogenic'].notnull()]
observedOncPerGene = oncogenicSNPs.groupby('Hugo_Symbol')['mutUuid'].nunique()

listOfDicts = []
# drop_duplicates() keeps the summary's row order, so the output rows come out
# in a reproducible order (iterating a set of strings is not stable across
# Python 3 sessions).
for gene in simDfSummary['Hugo_Symbol'].drop_duplicates():
    nObserved = int(observedPerGene.get(gene, 0))
    nPossible = nPossibleDict[gene]
    # Guard the denominator the same way the oncogenic ratio is guarded below.
    ratio = None
    if nPossible > 0:
        ratio = 1.0*nObserved/nPossible
    nObservedOncogenic = int(observedOncPerGene.get(gene, 0))
    nPossibleOncogenic = nOncogenicPossibleDict[gene]
    ratioOnc = None
    if nPossibleOncogenic > 0:
        ratioOnc = 1.0*nObservedOncogenic/nPossibleOncogenic
    listOfDicts.append({
        'gene': gene,
        'nPossibleOncogenic': nPossibleOncogenic,
        'nObservedOncogenic': nObservedOncogenic,
        'fractionOfPossibleOncMutsObserved': ratioOnc,
        'nPossible': nPossible,
        'nObserved': nObserved,
        'ratio': ratio
    })
df = pd.DataFrame(listOfDicts)

In [39]:
# Only a fixed list of commonly mutated genes (TP53 etc.) gets a display label;
# every other gene gets None.
labelledGenes = set(['TP53', 'PIK3CA', 'KRAS', 'PTEN', 'TERT', 'APC', 'KMT2D', 'ARID1A', 'EGFR', 'KMT2C', 'NF1', 'BRAF'])
df['displayName'] = df['gene'].apply(lambda geneName: geneName if geneName in labelledGenes else None)

# Write the per-gene table as a tab-separated file without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/geneObservedFracs.tsv', sep='\t', index=False)

**Compare Observed Vs Possible At Quadnucs**

In [47]:
# Enumerate every quad-nucleotide label: leading base + two-letter change +
# trailing base (4 * 6 * 4 = 96 labels).
flankingBases = ['A', 'T', 'C', 'G']
changePairs = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG']
allPossibleQuadNucs = []
for firstNuc in flankingBases:
    for change in changePairs:
        for lastNuc in flankingBases:
            allPossibleQuadNucs.append(firstNuc + change + lastNuc)

# For each label, tally the possible mutations from the simulation summary and
# the distinct mutations observed among the cohort SNPs.
hasOncogenicAnnotation = allImpactSNPs['oncogenic'].notnull()
listOfDicts = []
for quadNuc in allPossibleQuadNucs:
    atThisQuadNuc = allImpactSNPs['quadNuc'] == quadNuc
    observedIds = set(allImpactSNPs.loc[atThisQuadNuc, 'mutUuid'])
    observedOncIds = set(allImpactSNPs.loc[atThisQuadNuc & hasOncogenicAnnotation, 'mutUuid'])

    listOfDicts.append({
        'quadNuc': quadNuc,
        # possible non-silent mutations: all possible minus the silent ones
        'nPossibleMuts': sum(simDfSummary[quadNuc]) - sum(simDfSummary[quadNuc + '_silent']),
        'nPossibleOncMuts': sum(simDfSummary[quadNuc + '_oncogenic']),
        'nObservedQuadNuc': len(observedIds),
        'nObservedOncAtQuadNuc': len(observedOncIds)
    })

df = pd.DataFrame(listOfDicts)
    

In [51]:
# Label each quadNuc by its middle two letters; any pair not listed here falls
# through to 'T>G'.
changeTypeLabels = {'CA': 'C>A', 'CG': 'C>G', 'CT': 'C>T', 'TA': 'T>A', 'TC': 'T>C'}
df['changeType'] = df['quadNuc'].apply(lambda quadNuc: changeTypeLabels.get(quadNuc[1:3], 'T>G'))

In [54]:
# Write the per-quadNuc table as a tab-separated file without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/quadNucObservedFracs.tsv', index=False, sep='\t')

**Compare gene length to mutation fraction**

In [56]:
# Per gene: number of possible non-silent mutations versus number of distinct
# SNPs observed in the cohort.
# NOTE(review): the output key is named 'nObservedDistinctNonSilentMuts', but
# the count covers every SNP row for the gene regardless of
# Variant_Classification — confirm whether silent SNPs should be excluded.
nPossibleDict = dict(zip(simDfSummary['Hugo_Symbol'], simDfSummary['nNonSilentMuts']))

# Count distinct mutations per gene once, instead of re-filtering the whole
# SNP table for every gene inside the loop.
observedPerGene = allImpactSNPs.groupby('Hugo_Symbol')['mutUuid'].nunique()

listOfDicts = []
# drop_duplicates() keeps the summary's row order, so the output rows come out
# in a reproducible order (iterating a set of strings is not stable across
# Python 3 sessions).
for gene in simDfSummary['Hugo_Symbol'].drop_duplicates():
    listOfDicts.append({
        'gene': gene,
        'nPossibleNonSilentMuts': nPossibleDict[gene],
        'nObservedDistinctNonSilentMuts': int(observedPerGene.get(gene, 0))
    })
df = pd.DataFrame(listOfDicts)

In [59]:
# Only a fixed list of commonly mutated genes gets a display label; every other
# gene gets None.
labelledGenes = set(['TP53', 'PIK3CA', 'KRAS', 'PTEN', 'TERT', 'APC', 'KMT2D', 'ARID1A', 'EGFR', 'KMT2C', 'NF1', 'BRAF'])
df['displayName'] = df['gene'].apply(lambda geneName: geneName if geneName in labelledGenes else None)

# Write the per-gene table as a tab-separated file without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/mutationObservedVsPossibleMuts.tsv', sep='\t', index=False)

**OBSERVED VS POSSIBLE BY MUTATION TYPE**

In [62]:
# Show which Variant_Classification values in the cohort fall outside the
# listed classes. Written as print(<single expression>) so the cell behaves
# the same under Python 2 and Python 3.
print(set(allImpactMuts['Variant_Classification']) - set(["Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation", "Splice_Site", "Translation_Start_Site"]))

set(['Silent', "5'Flank", 'Splice_Region', 'Nonstop_Mutation', "3'Flank", 'Targeted_Region', 'Intron', "5'UTR", 'IGR'])


In [85]:
# Number of possible mutations of each class.
# NOTE(review): these totals are hard-coded and their source is not shown in
# this notebook — presumably they come from the simulated MAFs; confirm and
# regenerate them if the simulation changes.
varTypePossibleCounts = {'Missense_Mutation': 2404402, 'Nonsense_Mutation': 137057, 'Splice_Site': 21090, 'Translation_Start_Site': 4162}

listOfDicts = []
# Iterate over the dict's own keys in sorted order: this removes the duplicated
# key list and makes the row order of the output table reproducible (iterating
# a set of strings is not stable across Python 3 sessions).
for varType in sorted(varTypePossibleCounts):
    # Distinct mutations of this class observed anywhere in the cohort MAF.
    nObserved = len(set(allImpactMuts[allImpactMuts['Variant_Classification'] == varType]['mutUuid']))
    listOfDicts.append({
        'mutationType': varType,
        'frac': 1.0*nObserved/varTypePossibleCounts[varType]
    })
df = pd.DataFrame(listOfDicts)

In [87]:
# Write the per-mutation-type fractions as a tab-separated file without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/variantTypeObsVsPossible.tsv', sep='\t', index=False)

**SIGNATURE SPECIFIC ANALYSES**

In [89]:
# Per-sample mutational-signature table. Later cells read its
# Tumor_Sample_Barcode and Nmut columns.
impactSigs = pd.read_table(pathPrefix + '/ifs/res/taylorlab/impact_sigs/mixedpact_data_mutations_unfiltered.sigs.tab.txt')

In [90]:
# First nine characters of the sample barcode (presumably the patient ID —
# confirm). The vectorized .str accessor replaces a per-element lambda and
# yields NaN for a missing barcode instead of raising.
impactSigs['pid'] = impactSigs['Tumor_Sample_Barcode'].str[:9]

In [91]:
# Replace impactSigs with the output of the project helper. Later cells compare
# dominantSig against names such as 'mean_MMR' and 'mean_APOBEC'.
# NOTE(review): this overwrites its own input, so re-running the cell feeds
# already-processed columns back into the helper — confirm that is harmless.
impactSigs = mutationSigUtils.merge_signature_columns(impactSigs)

In [97]:
# Ask the project helper for each sample's dominant signature, passing the row
# as a plain dict. Later cells compare the result against names such as
# 'mean_10' and 'mean_MMR'.
# NOTE(review): the effect of notEnoughMuts=False is defined in
# mutationSigUtils and is not visible here.
impactSigs['dominantSig'] = impactSigs.apply(lambda row: mutationSigUtils.get_dominant_signature(row.to_dict(), notEnoughMuts= False), axis=1)


In [124]:
# A sample only counts towards a signature group when its Nmut exceeds this.
MIN_NMUT_FOR_DOMINANT_SIG = 50

def _dominant_sig_sample_ids(sigDf, signatureName, minNmut=MIN_NMUT_FOR_DOMINANT_SIG):
    """Return the set of sample barcodes dominated by one signature.

    Parameters:
        sigDf: DataFrame with 'dominantSig', 'Nmut' and 'Tumor_Sample_Barcode' columns.
        signatureName: value of 'dominantSig' to select (e.g. 'mean_10').
        minNmut: a row is kept only if its 'Nmut' is strictly greater than this.

    Returns:
        set of 'Tumor_Sample_Barcode' values from the selected rows.
    """
    selected = sigDf[(sigDf['dominantSig'] == signatureName) & (sigDf['Nmut'] > minNmut)]
    return set(selected['Tumor_Sample_Barcode'])

# One set of sample barcodes per signature group of interest.
poleDominantSigIds = _dominant_sig_sample_ids(impactSigs, 'mean_10')
mmrDominantSigIds = _dominant_sig_sample_ids(impactSigs, 'mean_MMR')
tmzDominantSigIds = _dominant_sig_sample_ids(impactSigs, 'mean_11')
apobecDominantSigIds = _dominant_sig_sample_ids(impactSigs, 'mean_APOBEC')
uvDominantSigIds = _dominant_sig_sample_ids(impactSigs, 'mean_7')
smokingDominantSigIds = _dominant_sig_sample_ids(impactSigs, 'mean_4')



In [125]:
#FUNCTIONS TO USE
def get_per_case_mut_info(nmutDfPath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/nmutInfo_impact_filtered.tsv'):
    """Read the per-sample mutation-count table and return it as a lookup.

    Parameters:
        nmutDfPath: path to a tab-separated file with 'Tumor_Sample_Barcode'
            and 'Nmut' columns.

    Returns:
        dict mapping each Tumor_Sample_Barcode to its Nmut value; if a barcode
        appears more than once, the last row wins.
    """
    nmutTable = pd.read_table(nmutDfPath)
    barcodes = nmutTable['Tumor_Sample_Barcode']
    mutCounts = nmutTable['Nmut']
    return {barcode: nmut for barcode, nmut in zip(barcodes, mutCounts)}

def get_per_case_oncogenic_mut_info(muts):
    """Count, per sample, the mutations that carry an 'oncogenic' annotation.

    Parameters:
        muts: DataFrame with 'oncogenic' and 'Tumor_Sample_Barcode' columns.

    Returns:
        dict mapping Tumor_Sample_Barcode to the number of rows whose
        'oncogenic' value is not null. Samples with no such rows are absent.
    """
    isAnnotated = muts['oncogenic'].notnull()
    countsPerSample = muts.loc[isAnnotated, 'Tumor_Sample_Barcode'].value_counts()
    return dict(countsPerSample)

In [127]:
# Per-sample lookups: mutation count read from the default nmut file, and the
# number of oncogenic-annotated mutations counted from the cohort MAF.
nMutDict = get_per_case_mut_info()
oncogenicMutDict = get_per_case_oncogenic_mut_info(allImpactMuts)

In [134]:
#summarize mutation information
listOfDicts = []
for name, idList in [['POLE', poleDominantSigIds], ['MMR', mmrDominantSigIds], ['TMZ', tmzDominantSigIds],
                    ['APOBEC', apobecDominantSigIds], ['UV', uvDominantSigIds], ['SMOKING', smokingDominantSigIds]]:
    muts = allImpactMuts[allImpactMuts['Tumor_Sample_Barcode'].isin(idList)]
    for case in set(muts['Tumor_Sample_Barcode']):
        if case not in nMutDict: pass
        elif case not in oncogenicMutDict: pass
        else: 
            listOfDicts.append({'Tumor_Sample_Barcode': case,
                            'Signature_Aetiology': name,
                            'nmut': nMutDict[case],
                            'nOncogenic': oncogenicMutDict[case]
                           })
        
df = pd.DataFrame(listOfDicts)
    

In [136]:
# Write the per-sample signature table as a tab-separated file without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/oncMutFracsBySignatures.tsv', sep='\t', index=False)

**QUAD NUC MUT SUSCEPTIBILITY**

In [137]:
# Computed by the project helper from the simulation summary. The next cell
# iterates it as {quadNuc: mapping} and reads each mapping's 'IMPACT_468' entry.
oncogenicSusceptibilityDict = mutation_modeling_util.calculate_quadnuc_based_oncogenic_susceptibility_dict(simDfSummary)

In [146]:
# One row per quadNuc with the value stored under 'IMPACT_468' in the
# susceptibility dict.
listOfDicts = [
    {'quadNuc': quadNuc, 'probability': susceptibility['IMPACT_468']}
    for quadNuc, susceptibility in oncogenicSusceptibilityDict.items()
]
df = pd.DataFrame(listOfDicts)

# Label each quadNuc by its middle two letters; any pair not listed here falls
# through to 'T>G'.
changeTypeLabels = {'CA': 'C>A', 'CG': 'C>G', 'CT': 'C>T', 'TA': 'T>A', 'TC': 'T>C'}
df['changeType'] = df['quadNuc'].apply(lambda quadNuc: changeTypeLabels.get(quadNuc[1:3], 'T>G'))

# Selected quadNucs get a fixed annotation string; all others get None.
motifLabels = {
    'TCAT': 'POLE-30%',
    'TCTG': 'POLE-20%',
    'GCTG': 'MMR-17%',
    'TCTC': 'TMZ-15%',
    'ACTC': 'TMZ-14%',
    'TCGT': 'APOBEC-36%',
    'TCGA': 'APOBEC-30%',
}
df['motifLabel'] = df['quadNuc'].apply(lambda quadNuc: motifLabels.get(quadNuc))

In [147]:
# Write the per-quadNuc susceptibility table as a tab-separated file without the index column.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/quadNucExpectedFracs.tsv', index=False, sep='\t')

In [148]:
# Display the simulation summary for inspection (now including nNonSilentMuts).
# NOTE(review): this renders the whole frame; simDfSummary.head() would keep the saved output small.
simDfSummary

Unnamed: 0,ACAA,ACAA_oncogenic,ACAA_silent,ACAC,ACAC_oncogenic,ACAC_silent,ACAG,ACAG_oncogenic,ACAG_silent,ACAT,...,TTGG,TTGG_oncogenic,TTGG_silent,TTGT,TTGT_oncogenic,TTGT_silent,nPossibleMuts,nSilent,totalNOncogenic,nNonSilentMuts
0,63,0,11,99,0,5,24,0,4,66,...,47,0,11,46,0,1,7315,1832,0,5483
1,60,2,1,56,4,1,9,0,4,52,...,52,1,11,106,2,7,4960,964,207,3996
2,26,0,3,38,1,0,21,0,3,25,...,23,0,7,22,0,0,3511,807,8,2704
3,37,0,4,42,0,0,12,0,1,30,...,37,0,8,61,0,2,3457,636,0,2821
4,44,7,7,53,12,1,33,5,4,23,...,36,1,1,16,1,0,4120,858,543,3262
5,51,5,11,70,10,0,24,1,3,39,...,47,3,2,21,1,1,5914,1344,618,4570
6,26,2,1,18,2,1,5,1,0,28,...,28,0,3,40,0,2,2233,235,116,1998
7,65,0,9,75,0,2,28,0,3,38,...,52,0,11,56,0,7,6673,1459,0,5214
8,8,0,0,11,1,0,2,0,0,16,...,13,0,0,39,0,0,1345,114,50,1231
9,71,4,1,54,8,1,25,2,5,59,...,80,1,17,154,1,6,5911,1029,416,4882
