In [1]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
from collections import Counter
pathPrefix = '/Users/friedman/Desktop/mnt'

sys.path.append(pathPrefix + '/ifs/work/taylorlab/friedman/myUtils')
import analysis_utils 
import mutationSigUtils 
import maf_analysis_utils
import clonality_analysis_util

**Note: First you need to generate gene level calls for CNA from facets**

In [57]:
cases = set(gliomaDf['Tumor_Sample_Barcode'])
facetsPath = '/ifs/res/taylorlab/impact_facets/facets_0.5.6/'
rScriptPath = '/home/friedman/friedman/facets-suite/geneLevel.R'
listOfCommands = []
for case in cases:
    target = None
    if 'IM3' in case:
        target = 'IMPACT341'
    elif 'IM5' in case:
        target = 'IMPACT410'
    elif 'IM6' in case:
        target = 'IMPACT468'
    caseSplit = case.split('-')
    caseSplit[2] = 'N01'
    nString = '-'.join(caseSplit)
    fullString = case + '_' + nString
    
    innerDir = 'facets_R0.5.6s100n25c50p100/'
    cncfPath = fullString + '_hisens.cncf.txt'
    fullPath = facetsPath + fullString + '/' + innerDir + cncfPath
    outPath = '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/facetsGeneLevelCalls/' + case + '_geneLevel.seg'
    
    command = 'Rscript ' + rScriptPath + ' -f ' + fullPath + ' -o ' + outPath + ' -t ' + target
    listOfCommands.append(command)
    

In [59]:
#write the commands to a file on the desktop which I will then scp to luna
f= open("/Users/friedman/Desktop/geneLevelCommands.txt","w+")
for command in listOfCommands:
    f.write(command+ '\n')
f.close()

In [60]:
listOfCommands

['Rscript /home/friedman/friedman/facets-suite/geneLevel.R -f /ifs/res/taylorlab/impact_facets/facets_0.5.6/P-0003822-T03-IM6_P-0003822-N01-IM6/facets_R0.5.6s100n25c50p100/P-0003822-T03-IM6_P-0003822-N01-IM6_hisens.cncf.txt -o /ifs/work/taylorlab/friedman/myAdjustedDataFiles/facetsGeneLevelCalls/P-0003822-T03-IM6_geneLevel.seg -t IMPACT468',
 'Rscript /home/friedman/friedman/facets-suite/geneLevel.R -f /ifs/res/taylorlab/impact_facets/facets_0.5.6/P-0020686-T01-IM6_P-0020686-N01-IM6/facets_R0.5.6s100n25c50p100/P-0020686-T01-IM6_P-0020686-N01-IM6_hisens.cncf.txt -o /ifs/work/taylorlab/friedman/myAdjustedDataFiles/facetsGeneLevelCalls/P-0020686-T01-IM6_geneLevel.seg -t IMPACT468',
 'Rscript /home/friedman/friedman/facets-suite/geneLevel.R -f /ifs/res/taylorlab/impact_facets/facets_0.5.6/P-0000500-T01-IM3_P-0000500-N01-IM3/facets_R0.5.6s100n25c50p100/P-0000500-T01-IM3_P-0000500-N01-IM3_hisens.cncf.txt -o /ifs/work/taylorlab/friedman/myAdjustedDataFiles/facetsGeneLevelCalls/P-0000500-T01-I

**Actual Data parsing**

In [2]:
gliomaDf = pd.read_table(pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/subsettedMafs/Glioma_HypermutantCaseMuts_MAF_ANNO_trinuc.maf')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#We say TMZ motifs are the top 8 tmz mutations (note this will miss ~10% of tmz muts)
tmzMotifs = set(['ACTC', 'ACTT', 'CCTC', 'CCTT', 'GCTC', 'GCTT', 'TCTC', 'TCTT'])
gliomaDf['quadNuc'] = gliomaDf.apply(lambda row: mutationSigUtils.create_reference_four_nuc(row['Ref_Tri'], row['Reference_Allele'], row['Tumor_Seq_Allele2'], row['Variant_Type']), axis=1)


In [5]:
gliomaDfTMZOnly = gliomaDf[gliomaDf['quadNuc'].isin(tmzMotifs)]
#gliomaDfTMZOnly = gliomaDfTMZOnly[~gliomaDfTMZOnly['Chromosome'].isin(set(['X', 'Y']))]

In [4]:
mutSimulationSummary = pd.read_table('/Users/friedman/Desktop/workOffline/mutSimulationInfoIncludingHotspots.tsv')


In [6]:
#adds a column to a maf telling you which genes are double mutated
def mark_double_mutants_in_maf(maf, mode = 'oncogenic'):
    countMaf = maf
    if mode == 'oncogenic':
        countMaf = maf[maf['oncogenic'].notnull()]
    counts = countMaf['Hugo_Symbol'].value_counts()
    doubles = dict(counts[counts > 1]).keys() #convoluted way to get genes mutated 2+ times
    maf['isDouble'] = maf['Hugo_Symbol'].apply(lambda x: True if x in doubles else False)
    return maf
    
#assumes we are taking a maf of TMZ motif mutations to start
def calculate_fraction_of_tumor_suppressors_balanced(cases, geneSizeDict, maf):
    
    listOfDicts = []
    tumorSuppressors = set(['ERRFI1', 'ASXL2', 'PMAIP1', 'ACTG1', 'SUFU', 'FBXO11', 'MEN1', 'FAM58A', 'B2M', 'RB1', 'DUSP22', 'SESN1', 'GPS2', 'RAD51D', 'SMG1', 'CDC73', 'MAP3K1', 'SMARCB1', 'INPP4B', 'PARK2', 'SMAD4', 'CBFB', 'CDH1', 'PPP6C', 'SETDB1', 'SETDB2', 'NF2', 'CDKN2B', 'CDKN2C', 'CDKN2A', 'DDX3X', 'PIK3R1', 'BARD1', 'PDS5B', 'KLF4', 'SPRED1', 'VHL', 'SMAD2', 'PMS1', 'PMS2', 'SETD2', 'GATA3', 'TBL1XR1', 'MUTYH', 'SOCS1', 'FAM175A', 'ROBO1', 'ARID1B', 'ARID1A', 'TCF7L2', 'STK11', 'FOXA1', 'PTEN', 'FAT1', 'FAS', 'CYLD', 'MAX', 'SH2D1A', 'APC', 'NTHL1', 'CTCF', 'KDM5C', 'KMT2C', 'ZFHX3', 'FOXP1', 'PIGA', 'CDKN1B', 'CDKN1A', 'FUBP1', 'MSH2', 'ID3', 'TNFRSF14', 'TRAF3', 'EP400', 'BRIP1', 'ARID4A', 'ARID4B', 'XRCC2', 'DAXX', 'SDHAF2', 'ASXL1', 'AMER1', 'RASA1', 'EGR1', 'MST1', 'SOX17', 'RUNX1', 'PIK3R3', 'NCOR1', 'NF1', 'JAK1', 'PTPRD', 'CHEK2', 'CHEK1', 'SMC1A', 'TMEM127', 'STAG1', 'RAD51', 'TCF3', 'STAG2', 'ARID2', 'RAD50', 'RNF43', 'PARP1', 'BLM', 'CUX1', 'RECQL', 'RAD21', 'PTPN2', 'PTPN1', 'SLX4', 'INHA', 'PAX5', 'IRF1', 'TP53', 'HLA-A', 'IRF8', 'CBL', 'TOP1', 'SHQ1', 'PRDM1', 'NSD1', 'ATXN2', 'CREBBP', 'HDAC4', 'SESN2', 'PPP2R1A', 'EPHA7', 'ATM', 'EPHA3', 'POT1', 'SMAD3', 'MOB3B', 'TBX3', 'POLE', 'ATR', 'FANCD2', 'FH', 'BCORL1', 'SOX9', 'IKZF3', 'TSC1', 'TP63', 'MRE11A', 'SDHC', 'BTG1', 'POLD1', 'CIITA', 'SMC3', 'SAMHD1', 'RTEL1', 'ECT2L', 'PIK3R2', 'CRBN', 'FANCC', 'NBN', 'FANCA', 'HLA-B', 'RECQL4', 'DUSP4', 'ERCC2', 'FBXW7', 'TGFBR2', 'TGFBR1', 'MSH3', 'RBM15', 'TET1', 'TET3', 'SESN3', 'MGA', 'LTB', 'FOXL2', 'SH2B3', 'BCOR', 'HIST1H1D', 'ATRX', 'EP300', 'RAD51C', 'RAD51B', 'HIST1H1B', 'TNFAIP3', 'DICER1', 'ARID5B', 'LATS2', 'FOXO1', 'KEAP1', 'EZH2', 'SP140', 'NKX3-1', 'PBRM1', 'PALB2', 'CIC', 'BRCA1', 'DTX1', 'FLCN', 'SPEN', 'CD58', 'ERCC3', 'ERCC4', 'MSH6', 'BCL11B', 'BMPR1A', 'ERF', 'BRCA2', 'NOTCH2', 'EED', 'MITF', 'ELF3', 'SMARCA4', 'BBC3', 'ANKRD11', 'CEBPA', 'BCL2L11', 'AXIN2', 'AXIN1', 'CDK12', 'ESCO2', 'MLH1', 'SDHB', 'MED12', 'HNF1A', 'RYBP', 'ATP6V1B2', 'DNMT3B', 'KMT2B', 'KMT2A', 'DNMT3A', 'NFKBIA', 'TRAF5', 'KMT2D', 'SPOP', 'RBM10', 'P2RY8', 'TP53BP1', 'TSC2', 'KDM6A', 'EPCAM', 'PHOX2B', 'NPM1', 'BCL10', 'LATS1', 'HOXB13', 'ARID3A', 'PTPRT', 'PTPRS', 'INPPL1', 'NOTCH4', 'TET2', 'NOTCH1', 'CASP8', 'NOTCH3', 'GRIN2A', 'MAP2K4', 'WT1', 'BACH2', 'SDHA', 'BAP1', 'PTCH1', 'SDHD'])
    pathToGeneLevelCalls = pathPrefix + '/ifs/work/taylorlab/friedman/myAdjustedDataFiles/facetsGeneLevelCalls/'
    cntr = 0
    for case in cases:
        
        cntr += 1
        if cntr%10 == 0: print cntr, len(cases)
        
        geneLevelPath = pathToGeneLevelCalls + case + '_geneLevel.seg'
        if os.path.exists(geneLevelPath):
            caseMaf = maf[maf['Tumor_Sample_Barcode'] == case]
            caseMaf = mark_double_mutants_in_maf(caseMaf, mode = 'oncogenic')
            
            geneLevelDf = pd.read_table(geneLevelPath)
            geneLevelDf['size'] = geneLevelDf['Hugo_Symbol'].apply(lambda x: geneSizeDict[x] if x in geneSizeDict else None)
            geneLevelDf['balanced'] = geneLevelDf.apply(lambda row: True if row['tcn']/2.0 == row['lcn'] else False, axis=1)
            geneLevelDfTSGs = geneLevelDf[geneLevelDf['Hugo_Symbol'].isin(tumorSuppressors)]

            balancedDf = geneLevelDfTSGs[geneLevelDfTSGs['balanced'] == True]
            unbalancedDf = geneLevelDfTSGs[geneLevelDfTSGs['balanced'] == False]
            balancedGenes = set(balancedDf['Hugo_Symbol'])
            unbalancedGenes = set(unbalancedDf['Hugo_Symbol'])
            balancedSize = np.nansum(balancedDf['size'])
            unbalancedSize = np.nansum(unbalancedDf['size'])

            #Rate of mutations per balanced/unbalanced region
            balancedMaf = caseMaf[caseMaf['Hugo_Symbol'].isin(balancedGenes)]
            unbalancedMaf = caseMaf[caseMaf['Hugo_Symbol'].isin(unbalancedGenes)]
            nOncBalanced = balancedMaf[balancedMaf['oncogenic'].notnull()].shape[0]
            nOncUnbalanced = unbalancedMaf[unbalancedMaf['oncogenic'].notnull()].shape[0]
            nBalancedVus = balancedMaf[balancedMaf['oncogenic'].isnull()].shape[0]
            nUnbalancedVus = unbalancedMaf[unbalancedMaf['oncogenic'].isnull()].shape[0]
            
            #double stuff
            nDoubleBalanced = balancedMaf[(~balancedMaf['oncogenic'].notnull()) &(balancedMaf['isDouble'] == True)].shape[0]
            nDoubleUnbalanced = unbalancedMaf[(~unbalancedMaf['oncogenic'].notnull()) &(unbalancedMaf['isDouble'] == True)].shape[0]
            nDoubleOncBalanced = balancedMaf[(balancedMaf['oncogenic'].notnull()) & (balancedMaf['isDouble'] == True)].shape[0]
            nDoubleOncUnbalanced = unbalancedMaf[(unbalancedMaf['oncogenic'].notnull()) & (unbalancedMaf['isDouble'] == True)].shape[0]

            listOfDicts.append({
                'Tumor_Sample_Barcode': case, 'nBalanced': nBalancedVus, 'nOncBalanced': nOncBalanced,
                'nUnbalanced': nUnbalancedVus, 'nOncUnbalanced': nOncUnbalanced,
                'oncBalancedRate': (1.0*nOncBalanced)/balancedSize, 'oncUnbalancedRate': (1.0*nOncUnbalanced)/unbalancedSize,
                'balancedRate': (1.0*nBalancedVus)/balancedSize, 'unbalancedRate': (1.0*nUnbalancedVus)/unbalancedSize,
                'balancedOncDoubleRate': (1.0*nDoubleOncBalanced)/balancedSize, 'unbalancedOncDoubleRate': (1.0*nDoubleOncUnbalanced)/unbalancedSize,
                'balancedDoubleRate': (1.0*nDoubleBalanced)/balancedSize, 'unbalancedDoubleRate': (1.0*nDoubleUnbalanced)/unbalancedSize
            })
            
            
         
    return pd.DataFrame(listOfDicts)
        
        

In [None]:
#mark_double_mutants_in_maf(gliomaDf, mode = 'oncogenic')

In [9]:
geneSizeDict = dict(zip(mutSimulationSummary['Hugo_Symbol'], mutSimulationSummary['nPossibleMuts']/3))

In [11]:
df = calculate_fraction_of_tumor_suppressors_balanced(set(gliomaDf['Tumor_Sample_Barcode']), geneSizeDict, gliomaDfTMZOnly)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


10 59
20 59




30 59
40 59
50 59


In [12]:
print np.nanmean(df['oncBalancedRate']), np.nanmean(df['oncUnbalancedRate'])
print np.nanmean(df['balancedRate']), np.nanmean(df['unbalancedRate'])
print np.nanmean(df['balancedOncDoubleRate']), np.nanmean(df['unbalancedOncDoubleRate'])
print np.nanmean(df['balancedDoubleRate']), np.nanmean(df['unbalancedDoubleRate'])

1.447374654720212e-05 1.2862728038604819e-05
0.00017360854949382084 0.000150449377631688
2.156136289741331e-06 1.6246426273908706e-06
2.3901541999333142e-06 2.0127642152944027e-06


In [17]:
#TODO should we consider mutations that occur at a 2-0 regime and are double as different (likewise for 4 - 0)
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/gliomaMutBalance.tsv', index=False, sep='\t')

