In [2]:
import sys
import argparse
import os
import pandas as pd
import numpy as np
import math
import re
from collections import Counter

# Put the project's utility-script directory on the import path. This has to run
# before the two project-local imports below, which are resolved through it.
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run on another machine without editing it.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import configuration_util
import get_gene_and_cohort_list_utils
# Mapping from logical file names (e.g. 'IMPACT_BASE_MAF',
# 'HYPERMUTATION_STATUS_IDS') to paths; the cells below index into it.
filePathDict = configuration_util.get_all_files_path_dict()

## TABLE 1
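Summary of IMPACT data <br>
Columns: **DMP, TMB, IS_MSI, DOMINANT_SIGNATURE, CANCER_TYPE, HYPERMUTATION_STATUS**, plus one per-case mutation-count column for every gene type / mutation type pair, named `N_<geneType>_<mutType>` (gene types: `tsg`, `oncogene`, `all`; mutation types: `oncogenic`, `hotspot`, `stopGain`, `frameShiftIndel`) — for example `N_all_oncogenic`, `N_tsg_hotspot`, `N_oncogene_stopGain`.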

In [21]:
def get_per_case_mut_info(muts, mutType='oncogenic', geneType='all'):
    """Count mutations per tumor sample after optional gene and mutation filters.

    Parameters
    ----------
    muts : pandas.DataFrame
        MAF-style table. 'Tumor_Sample_Barcode' is always required; the
        requested filters additionally read 'Hugo_Symbol', 'oncogenic',
        'is-a-hotspot' and/or 'Variant_Classification'.
    mutType : str
        'oncogenic'       -> rows whose 'oncogenic' value is not null
        'hotspot'         -> rows whose 'is-a-hotspot' value equals 'Y'
        'stopGain'        -> rows classified 'Nonsense_Mutation'
        'frameShiftIndel' -> rows classified 'Frame_Shift_Del' or 'Frame_Shift_Ins'
        Any other value applies no mutation-type filter.
    geneType : str
        'tsg' keeps genes returned by get_gene_and_cohort_list_utils.get_tsgs(),
        'oncogene' keeps genes returned by
        get_gene_and_cohort_list_utils.get_oncogenes(); any other value
        (e.g. 'all') keeps every gene.

    Returns
    -------
    dict
        Tumor_Sample_Barcode -> number of rows that survived the filters.
        Samples with no surviving rows are absent from the dict.
    """
    # Fetch a gene list only in the branch that uses it: this function is called
    # once per (mutType, geneType) combination, and 'all' needs neither list.
    if geneType == 'tsg':
        muts = muts[muts['Hugo_Symbol'].isin(get_gene_and_cohort_list_utils.get_tsgs())]
    elif geneType == 'oncogene':
        muts = muts[muts['Hugo_Symbol'].isin(get_gene_and_cohort_list_utils.get_oncogenes())]

    if mutType == 'oncogenic':
        # Any non-null annotation counts, whatever its text.
        muts = muts[muts['oncogenic'].notnull()]
    elif mutType == 'hotspot':
        muts = muts[muts['is-a-hotspot'] == 'Y']
    elif mutType == 'stopGain':
        muts = muts[muts['Variant_Classification'] == 'Nonsense_Mutation']
    elif mutType == 'frameShiftIndel':
        muts = muts[muts['Variant_Classification'].isin(set(['Frame_Shift_Del', 'Frame_Shift_Ins']))]

    return dict(muts['Tumor_Sample_Barcode'].value_counts())

def create_table_one(maf):
    """Build the per-case summary table (Table 1) from a MAF DataFrame.

    Parameters
    ----------
    maf : pandas.DataFrame
        Mutation table; must contain 'Tumor_Sample_Barcode' plus whatever
        columns get_per_case_mut_info reads for its filters.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-null Tumor_Sample_Barcode, in order of first
        appearance in `maf`, with columns:
          DMP                  the sample barcode
          TMB                  value from the TMB lookup, None if the case is absent
          IS_MSI               True when the case is in the MSI case collection
          DOMINANT_SIGNATURE   value from the signature lookup, None if absent
          CANCER_TYPE          value from the cancer-type lookup, None if absent
          HYPERMUTATION_STATUS 'HYPERMUTATED', 'NORMAL' or 'INDETERMINATE'
          N_<geneType>_<mutType>  mutation count for each combination of
              geneType in (tsg, oncogene, all) and mutType in
              (oncogenic, hotspot, stopGain, frameShiftIndel); None (not 0)
              when the case has no mutation of that kind.
    """
    hypermutantIds = get_gene_and_cohort_list_utils.get_all_hypermutant_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
    normalIds = get_gene_and_cohort_list_utils.get_all_normal_ids(hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'])
    tmbDict = get_gene_and_cohort_list_utils.get_all_tmb_info(tmbFilePath=filePathDict['CASE_TMB_AND_MSI_STATS'])
    dominantSignatureDict = get_gene_and_cohort_list_utils.get_hypermutator_signature_cohorts(impactSigsPath=filePathDict['IMPACT_SIGNATURE_DECOMPOSITIONS'])
    msiCases = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath=filePathDict['CASE_TMB_AND_MSI_STATS'])
    cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath=filePathDict['CANCER_TYPE_INFO'])

    # Single source of truth for the count columns; the order here fixes the
    # column order of the output (mutation type outer, gene type inner).
    geneMutTypes = []
    nMutTypeDicts = {}
    for mutType in ['oncogenic', 'hotspot', 'stopGain', 'frameShiftIndel']:
        for geneType in ['tsg', 'oncogene', 'all']:
            geneMutType = geneType + '_' + mutType
            geneMutTypes.append(geneMutType)
            nMutTypeDicts[geneMutType] = get_per_case_mut_info(maf, mutType=mutType, geneType=geneType)

    listOfDicts = []
    # unique() keeps first-appearance order, so the table is identical from run
    # to run (iterating a set of strings is not); dropna() skips rows that have
    # no barcode instead of emitting a row for NaN.
    for case in maf['Tumor_Sample_Barcode'].dropna().unique():
        if case in hypermutantIds:
            hypermutationStatus = 'HYPERMUTATED'
        elif case in normalIds:
            hypermutationStatus = 'NORMAL'
        else:
            hypermutationStatus = 'INDETERMINATE'

        localD = {'DMP': case}
        # A case missing from the TMB lookup gets None, like the other
        # annotations, rather than aborting the whole table with a KeyError.
        localD['TMB'] = tmbDict[case] if case in tmbDict else None
        localD['IS_MSI'] = case in msiCases
        localD['DOMINANT_SIGNATURE'] = dominantSignatureDict[case] if case in dominantSignatureDict else None
        localD['CANCER_TYPE'] = cancerTypeDict[case] if case in cancerTypeDict else None
        localD['HYPERMUTATION_STATUS'] = hypermutationStatus

        for geneMutType in geneMutTypes:
            # .get() leaves None for cases with no mutation of this kind.
            localD['N_' + geneMutType] = nMutTypeDicts[geneMutType].get(case)

        listOfDicts.append(localD)

    return pd.DataFrame(listOfDicts)
    
    
    

In [30]:
# Load the base IMPACT MAF, summarise it per case, and write Table 1 as a TSV.
# NOTE(review): the output location is an absolute, machine-specific path.
tableOneOutputPath = '/Users/friedman/Desktop/hypermutationProjectFinal/tables/table1.tsv'

allImpactMutsMaf = pd.read_table(
    filePathDict['IMPACT_BASE_MAF']
)
dfTableOne = create_table_one(allImpactMutsMaf)
dfTableOne.to_csv(
    tableOneOutputPath,
    sep='\t',
    index=False,
)