In [2]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import numpy

from collections import Counter

pathPrefix = '/Users/friedman/Desktop/mnt/'

sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util
import re

In [2]:
#returns the name of the gene and left most position spanning the msi indel
#uses re stuff
def get_left_aligned_allele_name(hgvsNames):
    
    positions = []
    geneName = hgvsNames[0].split('_p.')[0]
    for entry in hgvsNames:
        if len(entry.split('_p.')) == 2: #ignore weirdly formatted hgvs names
            variantNotation = entry.split('_p.')[1]
            number = variantNotation[1:]
            refAA = variantNotation[0]
            position = re.match('\d*', number).group(0)
            positions.append((position, refAA))
    
    if len(positions) == 0: return None #if all the hgvs names were ill formatted return None
    
    minEntry = sorted(positions)[0] #this is a sorted list of tuples the first thing is the position second is the reference aa
    return geneName + '_p.' + str(minEntry[1]) + str(minEntry[0])

#collapses all indels within 1 bp of each other for a start to be at the same location/name for matching
def standardize_allele_names(msiLengthInfo, observedMuts):
    
    neverObservedSites = set([]) #all the names of sites from criags msi file we cant match with the real maf
    msiSitesToNameMapping = {} #a dictionary mapping each msi site allele from craigs file to its corrected name
    mafMsiSiteToNameMapping = {} #a dictionary mapping each msi site allele from the maf to its corrected name
    
    cntr = 0.0
    for hgvs in set(msiLengthInfo['allele']):
        
        cntr += 1
        if cntr%500 == 0: print 100*(cntr/len(set(msiLengthInfo['allele']))), 'percent done'
        
        startPos = msiLengthInfo[msiLengthInfo['allele'] == hgvs]['Start_Position']
        
        
        if startPos.shape[0] == 1:
            #we want all names given to indels near (within 1position) of the start position of the MSI site in Craig's file
            putativeVariantNames = list(set(observedMuts[(abs(observedMuts['Start_Position'] - int(startPos)) < 2)
                                                    & (observedMuts['Variant_Type'].isin(set(['INS', 'DEL'])))]['allele']))
            
            if len(putativeVariantNames) == 0:
                neverObservedSites.add(hgvs) #if it cant be matched in the MAF we add it to never observed sites
                #note some of there are likely to be actually matched but missed by my method
            else:
                trueVariantName = get_left_aligned_allele_name(putativeVariantNames)
                
                #NOW WE PROPERLY create the mappings
                for putativeVariantName in putativeVariantNames:
                    mafMsiSiteToNameMapping[putativeVariantName] = trueVariantName
                msiSitesToNameMapping[hgvs] = trueVariantName
        else:
            pass #ignore variants with multiple start position in the msi info file
        
    return neverObservedSites, msiSitesToNameMapping, mafMsiSiteToNameMapping

In [3]:
msiSummary = pd.read_table('/Users/friedman/Desktop/impact_microsatellites.txt')


  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
allImpactMuts = analysis_utils.load_in_df_with_progress(filePath = pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/all_impact_mutations_annotated_cohort.maf', nLinesFile = 275000)


  exec(code_obj, self.user_global_ns, self.user_ns)


3.64 percent done
7.27 percent done
10.91 percent done
14.55 percent done
18.18 percent done
21.82 percent done
25.45 percent done
29.09 percent done
32.73 percent done
36.36 percent done
40.0 percent done
43.64 percent done
47.27 percent done
50.91 percent done
54.55 percent done
58.18 percent done
61.82 percent done
65.45 percent done
69.09 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


72.73 percent done
76.36 percent done
80.0 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


83.64 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


87.27 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


90.91 percent done
94.55 percent done


  exec(code_obj, self.user_global_ns, self.user_ns)


98.18 percent done
101.82 percent done
performing big concat then returning


In [5]:
allImpactMuts['allele'] = allImpactMuts.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)
msiSummary['allele'] = msiSummary.apply(lambda row: str(row['Hugo_Symbol']) + '_' + str(row['HGVSp_Short']), axis=1)

In [6]:
neverObservedSites, msiSitesToNameMapping, mafMsiSiteToNameMapping =  standardize_allele_names(msiSummary, allImpactMuts)



5.74382538771 percent done
11.4876507754 percent done
17.2314761631 percent done
22.9753015508 percent done
28.7191269385 percent done
34.4629523262 percent done
40.206777714 percent done
45.9506031017 percent done
51.6944284894 percent done
57.4382538771 percent done
63.1820792648 percent done
68.9259046525 percent done
74.6697300402 percent done
80.4135554279 percent done
86.1573808156 percent done
91.9012062033 percent done
97.645031591 percent done


In [8]:
allImpactMuts['correctedAllele'] = allImpactMuts['allele'].apply(lambda x:
                                                                 mafMsiSiteToNameMapping[x] if x in mafMsiSiteToNameMapping else None)
msiSummary['correctedAllele'] = msiSummary['allele'].apply(lambda x:
                                                           msiSitesToNameMapping[x] if x in msiSitesToNameMapping else None)



In [46]:
def summarize_mutation_counts_and_msi_allele_relationship(impactMuts, msiInfo):
    listOfDicts = []
    for correctedAllele in set(msiInfo['correctedAllele']):
        msiInfoAlleleDf = msiInfo[msiInfo['correctedAllele'] == correctedAllele]
        if msiInfoAlleleDf.shape[0] > 0:
            nRepeats = msiInfoAlleleDf['repeat_times'].iloc[0]
            gene = msiInfoAlleleDf[msiInfoAlleleDf['correctedAllele'] == correctedAllele]['Hugo_Symbol'].iloc[0]
            basePair = msiInfoAlleleDf[msiInfoAlleleDf['correctedAllele'] == correctedAllele]['Tumor_Seq_Allele2'].iloc[0]
            nOccurences = impactMuts[impactMuts['correctedAllele'] == correctedAllele].shape[0]
            listOfDicts.append({'Hugo_Symbol': gene, 'repeat_length': nRepeats,
                                'nOccurences': nOccurences, 'allele': correctedAllele, 'basePair': basePair})
    return pd.DataFrame(listOfDicts)

In [82]:
occurenceSummaryDf = summarize_mutation_counts_and_msi_allele_relationship(allImpactMuts, msiSummary)

In [83]:
occurenceSummaryDf['repeatLengthsLabel'] = occurenceSummaryDf['repeat_length'].apply(lambda x: '_' + str(x) if x < 10 else '>=10')
tumorSuppressors = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])
occurenceSummaryDf['isTumorSuppresor'] = occurenceSummaryDf['Hugo_Symbol'].apply(lambda x: True if x in tumorSuppressors else False)
occurenceSummaryDf['label'] = occurenceSummaryDf.apply(lambda row: row['allele'] if row['nOccurences'] >= 75 else None, axis=1)
occurenceSummaryDf['basePair'] = occurenceSummaryDf['basePair'].apply(lambda x: x if x in set(['A', 'C', 'T', 'G']) else '_other')

In [84]:
#orderingValDict = {'A': .1, 'T':.2, 'C':.3, 'G':.4, '_other': None}
#occurenceSummaryDf['basePairAndLength'] = occurenceSummaryDf.apply(lambda row: row['repeatLengthsLabel'] + '_' + row['basePair'], axis=1)
#occurenceSummaryDf['orderingVal'] = occurenceSummaryDf.apply(lambda row: row['repeat_length'] + orderingValDict[row['basePair']] if orderingValDict[row['basePair']] != None else None, axis=1)

occurenceSummaryDf = occurenceSummaryDf[occurenceSummaryDf['nOccurences'].notnull()]


In [86]:
occurenceSummaryDf.to_csv('/Users/friedman/Desktop/WORK/msiSiteOccurenceInfo.tsv', index=False, sep='\t')

**SUMMARIZE DOUBLE HIT MUTATIONS** <br/>



In [71]:
def summarize_double_mut_msi_character(allelesAndCounts, msiInfo):
    listOfDicts = []
    for correctedAllele, count in allelesAndCounts.items():
        msiInfoAlleleDf = msiInfo[msiInfo['correctedAllele'] == correctedAllele]
        if msiInfoAlleleDf.shape[0] > 0:
            nRepeats = msiInfoAlleleDf['repeat_times'].iloc[0]
            gene = msiInfoAlleleDf[msiInfoAlleleDf['correctedAllele'] == correctedAllele]['Hugo_Symbol'].iloc[0]
            basePair = msiInfoAlleleDf[msiInfoAlleleDf['correctedAllele'] == correctedAllele]['Tumor_Seq_Allele2'].iloc[0]
        listOfDicts.append({'Hugo_Symbol': gene, 'repeat_length': nRepeats,
                            'allele': correctedAllele, 'basePair': basePair, 'nOccurences': count})
    df = pd.DataFrame(listOfDicts)
    return df
        
        
        
        

In [67]:
msiAlleleNames = {}
observedDoubleMutsEndoColoHyper = Counter({'RNF43_p.G659Vfs*41': 44, 'TCF7L2_p.K468Sfs*23': 14, 'MSH3_p.K383Rfs*32': 12, 'JAK1_p.K860Nfs*16': 10, 'PTEN_p.K267Rfs*9': 6, 'TGFBR2_p.K153Afs*3': 5, 'INPPL1_p.R1156Gfs*46': 4, 'ASXL1_p.G645Vfs*58': 3, 'KMT2C_p.X2481_splice': 2, 'TP53BP1_p.X1769_splice': 2, 'B2M_p.V69Wfs*34': 2, 'APC_p.T1556Nfs*3': 2, 'KMT2D_p.X2683_splice': 2, 'FLCN_p.H429Tfs*39': 2, 'ERF_p.G299Efs*12': 1, 'HLA-A_p.W157*': 1, 'EP300_p.H2324Tfs*29': 1, 'TGFBR2_p.K153Sfs*35': 1, 'APC_p.R856Nfs*6': 1, 'NF1_p.R1362*': 1, 'FUBP1_p.S11Lfs*43': 1, 'CYLD_p.N719Mfs*13': 1, 'ASXL2_p.X135_splice': 1, 'SOCS1_p.P36Vfs*75': 1, 'CIC_p.P1248Hfs*54': 1, 'NF1_p.N78Ifs*7': 1, 'ARID1A_p.P224Rfs*8': 1, 'ATM_p.K2811Sfs*46': 1, 'TSC1_p.R420Gfs*20': 1, 'KMT2C_p.K2797Rfs*26': 1, 'CREBBP_p.X608_splice': 1, 'MSH6_p.F1088Lfs*5': 1, 'APC_p.R2204*': 1, 'TP53_p.V73Wfs*50': 1, 'PIK3R1_p.R348*': 1, 'RAD50_p.K722Rfs*14': 1, 'PTEN_p.R130Q': 1, 'MSH6_p.E1322*': 1, 'MGA_p.E1249Rfs*42': 1, 'HNF1A_p.P291Qfs*51': 1, 'HLA-B_p.X25_splice': 1, 'NF1_p.I679Dfs*21': 1, 'ARID1A_p.D1850Tfs*33': 1, 'SMAD4_p.R361C': 1, 'CTNNB1_p.S45F': 1, 'B2M_p.L15Ffs*41': 1, 'PTEN_p.X267_splice': 1, 'DICER1_p.X504_splice': 1, 'PTEN_p.R173C': 1, 'DICER1_p.X301_splice': 1, 'JAK2_*505*': 1, 'B2M_p.T93Lfs*10': 1, 'NF1_p.Y628Tfs*3': 1, 'PTPRD_p.X1751_splice': 1, 'PTEN_p.D268Gfs*30': 1, 'TOP1_p.R140*': 1})
for val in observedDoubleMutsEndoColoHyper.keys():
    if val in mafMsiSiteToNameMapping:
        msiAlleleNames[mafMsiSiteToNameMapping[val]] = observedDoubleMutsEndoColoHyper[val]
        


In [80]:
df = summarize_double_mut_msi_character(msiAlleleNames, msiSummary)
df['label'] = df.apply(lambda row: row['allele'] if row['nOccurences'] > 2 else None, axis=1)
df['basePair'] = df['basePair'].apply(lambda x: x if x in set(['A', 'C', 'T', 'G']) else 'other')

In [81]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/doubleMutationMSIInfo.tsv', index=False, sep='\t')

**Look for MSI Signatures** <br/> <br/> <br/> <br/> 

In [175]:
def summarize_msi_repeat_info(impactMsiMuts, msiInfo, impactSigs, mmrGeneSummary):
    listOfDicts = []
    cntr = 0
    for case in set(impactMsiMuts['Tumor_Sample_Barcode']):
        if cntr%100 == 0: print cntr
        cntr +=1
        
        caseMuts = impactMsiMuts[impactMsiMuts['Tumor_Sample_Barcode'] == case]
        alleles = set(caseMuts[caseMuts['correctedAllele'].notnull()]['correctedAllele'])
        msiInfoAllelesInCase = msiInfo[msiInfo['correctedAllele'].isin(alleles)] #get msi site info about the case
        basePairCounter = Counter(msiInfoAllelesInCase['Tumor_Seq_Allele2'])
        nCG = basePairCounter['C'] + basePairCounter['G']
        nAT = basePairCounter['A'] + basePairCounter['T']
        nOther = sum(basePairCounter.values()) - nCG - nAT
        
        #cg ratio and indel length
        cg_atRATIO = None
        if nAT > 0: cg_atRATIO = (1.0*nCG)/nAT
        caseAverageIndelLength = np.nanmean(msiInfoAllelesInCase['repeat_times'])    
        
        #gene info
        mmrGene = 'None'
        pid = caseMuts['pid'].iloc[0]
        if pid in set(mmrGeneSummary['Patient_ID']):
            mmrGene = mmrGeneSummary[mmrGeneSummary['Patient_ID'] == pid]['Hugo_Symbol'].iloc[0]
        
        dominantSig = impactSigs[impactSigs['Tumor_Sample_Barcode'] == case]['dominantSiganture'].iloc[0]
        
        listOfDicts.append({'Tumor_Sample_Barcode': case,'dominantSig': dominantSig, 'mmrGene': mmrGene,
                            'nCG': nCG, 'nAT': nAT,
                            'nC': basePairCounter['C'], 'nG': basePairCounter['G'],
                            'nA': basePairCounter['A'], 'nT': basePairCounter['T'],
                            'cg_atRATIO': cg_atRATIO, 'averageIndelLen': caseAverageIndelLength, 'nOtherBP': nOther
                           })
    return pd.DataFrame(listOfDicts)
        

In [88]:
allImpactMuts = analysis_utils.map_cases_to_msi_sensor_class(allImpactMuts, msiSensorInfo=pathPrefix + '/ifs/work/taylorlab/friedman/mskImpactAsOfMarch2019/dmp/mskimpact/data_clinical_sample.txt')


  if self.run_code(code, result):


In [170]:
allImpactMuts['pid'] = allImpactMuts['Tumor_Sample_Barcode'].apply(lambda x: x[:9])
msiOnlyMuts = allImpactMuts[allImpactMuts['caseMsiClass'] == 'Instable']

In [None]:
preethiMMRSummary = pd.read_table('/Users/friedman/Downloads/unique_MMR-4genes.txt')

In [176]:
df = summarize_msi_repeat_info(msiOnlyMuts, msiSummary, impactSigs, preethiMMRSummary)



0
100
200
300
400
500
600
700
800


In [177]:
df['dominantSig'] = df['dominantSig'].apply(lambda x: x if x in set(['mean_6', 'mean_1', 'mean_14', 'mean_15', 'mean_20', 
                                                                'mean_21', 'mean_26']) else 'other')
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/msiIndelFeatures.tsv', index=False, sep='\t')

**New section find out info about msi mut co-occurence** <br/><br/><br/><br/>

In [15]:
msiMuts = allImpactMuts[allImpactMuts['correctedAllele'].notnull()]

In [40]:
tumorSuppressors = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])


In [97]:
msiMuts['geneType'] = msiMuts['Hugo_Symbol'].apply(lambda x: 'Tumor Suppressor' if x in tumorSuppressors else 'Oncogene')
msiMuts['isFrameshift'] = msiMuts['Consequence'].apply(lambda x: True if x == 'frameshift_variant' else False)
msiSummary['geneType'] = msiSummary['Hugo_Symbol'].apply(lambda x: 'Tumor Suppressor' if x in tumorSuppressors else 'Oncogene')
msiSummary['isFrameshift'] = msiSummary['Consequence'].apply(lambda x: True if x == 'frameshift_variant' else False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [100]:
import scipy.stats as stats

def do_fisher_test_for_genes_indels(obsMuts, possibleMuts):
    listOfDicts = []
    for gene in set(obsMuts['Hugo_Symbol']):
        obsGene = obsMuts[obsMuts['Hugo_Symbol'] == gene]
        possgene = possibleMuts[possibleMuts['Hugo_Symbol'] == gene]
        
        nObsFrameshift = obsGene[obsGene['isFrameshift'] == True].shape[0]
        nObsNotFrameshift = obsGene[obsGene['isFrameshift'] == False].shape[0]
        nPossFrameshift = possibleMuts[possibleMuts['isFrameshift'] == True].shape[0]
        nPossNotFrameshift = possibleMuts[possibleMuts['isFrameshift'] == False].shape[0]
        
        oddsRatio, pValue = stats.fisher_exact([[nPossFrameshift, nPossNotFrameshift],
                                                                    [nObsFrameshift, nObsNotFrameshift]])
        
        listOfDicts.append({'Hugo_Symbol': gene, 'oddsRatio': oddsRatio, 'pValue': pValue})
    return pd.DataFrame(listOfDicts)
        

In [101]:
df = do_fisher_test_for_genes_indels(msiMuts, msiSummary)

In [118]:
df['displayLabel'] = df.apply(lambda row: row['Hugo_Symbol'] if row['pValue'] < 1e-8 else None, axis=1)
df['isTumorS'] = df['Hugo_Symbol'].apply(lambda x: True if x in tumorSuppressors else False)

In [119]:
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/frameShiftTestMsi.tsv', index=False, sep='\t')

In [17]:
allImpactMuts['quadNuc'] = allImpactMuts.apply(lambda row: mutationSigUtils.create_reference_four_nuc(row['Ref_Tri'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)


In [19]:
impactSigs = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/hypermutationAnalysisProj/projectDataAndConfigFiles/signatures_from_unfiltered_maf.txt')

#print allImpactMuts[allImpactMuts['HGVSp_Short'] == 'p.Y27C']['quadNuc']