In [1]:
import pandas as pd
from collections import Counter
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import svm
import itertools
import sys

# Make the project's utility modules importable.
# NOTE(review): this is an absolute, user-specific path; the imports below only
# resolve on a machine where this directory exists.
sys.path.append('/Users/friedman/Desktop/hypermutationProjectFinal/scripts/utilityScripts')
import analysis_utils
import mutationSigUtils
import maf_analysis_utils
import get_gene_and_cohort_list_utils
import configuration_util

# Lookup of input-file paths, indexed by string key further down in this notebook
# (e.g. 'IMPACT_BASE_MAF', 'CANCER_TYPE_INFO').
filePathDict = configuration_util.get_all_files_path_dict()

In [2]:
def create_gene_mutation_matrix(maf, genes=None):
    """Build a per-case binary gene-mutation matrix from a MAF-style DataFrame.

    Parameters
    ----------
    maf : pandas.DataFrame
        Must contain the columns 'Tumor_Sample_Barcode', 'Hugo_Symbol' and
        'cancer_type'.
    genes : iterable of str, optional
        Genes that become the feature columns. Defaults to the panel returned
        by get_gene_and_cohort_list_utils.get_im3_genes().

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'Tumor_Sample_Barcode'. For every gene there is a
        column holding 1 if the case has at least one row for that gene in
        `maf` and 0 otherwise, plus 'cancerType' (the 'cancer_type' of the
        case's first row) and 'Tumor_Sample_Barcode'.

    Notes
    -----
    Rows whose barcode is missing are skipped (groupby drops null keys).
    Cases appear in order of first occurrence in `maf`.
    """
    if genes is None:
        genes = get_gene_and_cohort_list_utils.get_im3_genes()
    genes = list(genes)  # materialise once; it is iterated for every case

    listOfDicts = []
    # A single groupby pass replaces re-filtering the whole MAF for each case.
    for case, caseMaf in maf.groupby('Tumor_Sample_Barcode', sort=False):
        caseGenes = set(caseMaf['Hugo_Symbol'])
        geneMutDict = dict((gene, 1 if gene in caseGenes else 0) for gene in genes)
        geneMutDict['cancerType'] = caseMaf['cancer_type'].iloc[0]
        geneMutDict['Tumor_Sample_Barcode'] = case
        listOfDicts.append(geneMutDict)
    return pd.DataFrame(listOfDicts)

In [3]:
def run_n_fold_cross_validation(X, y, n = 100, testSize=1.0/3, mode='SVM', random_state=None):
    """Score a classifier on `n` independent random train/test splits of (X, y).

    Despite the name this is repeated random sub-sampling rather than k-fold
    cross-validation: every round draws a new split with train_test_split,
    fits a fresh classifier on the training part and records clf.score on the
    held-out part.

    Parameters
    ----------
    X, y : array-like
        Feature matrix and labels, passed straight to train_test_split.
    n : int
        Number of random splits to evaluate.
    testSize : float
        Forwarded to train_test_split as test_size.
    mode : str
        'SVM' (linear-kernel SVC with C=1) or 'logisticRegression'
        (LogisticRegression with default settings).
    random_state : None, int or numpy.random.RandomState, optional
        Seed for the splits. None (the default) keeps the previous behaviour
        of drawing from NumPy's global random state.

    Returns
    -------
    list
        The `n` test-set scores, in the order they were computed.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported values. (Previously a message
        was printed on every round and the result was a list of None.)
    """
    if mode not in ('SVM', 'logisticRegression'):
        raise ValueError("improper mode %r; expected 'SVM' or 'logisticRegression'" % (mode,))

    def new_classifier():
        # Built lazily so each split gets an unfitted estimator.
        if mode == 'SVM':
            return svm.SVC(kernel='linear', C=1)
        return LogisticRegression()

    # One shared generator is consumed across rounds so the splits differ from
    # each other while the whole sequence stays reproducible for a given seed.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        splitRng = random_state
    else:
        splitRng = np.random.RandomState(random_state)

    scores = []
    for _ in range(n):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=testSize, random_state=splitRng)
        clf = new_classifier().fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
    return scores
        

In [399]:
def fit_and_score_logistic_regression(X, y):
    """Fit a default LogisticRegression on (X, y) and score it on that same data.

    Returns a one-element list containing the value of clf.score(X, y); no
    data is held out, so this is a training-set score.
    """
    model = LogisticRegression()
    model.fit(X, y)
    trainingScore = model.score(X, y)
    return [trainingScore]

In [4]:
def analyze_performance_by_removing_genes(maf, nGenesToRemove=50, nFold=100, mode='SVM'):
    """Track classifier score as the most frequently mutated genes are removed.

    A binary gene-mutation matrix is built from `maf`, the classifier selected
    by `mode` is scored on it, and then genes are dropped one at a time
    (most frequently mutated first) with the score recomputed after each drop.

    Parameters
    ----------
    maf : pandas.DataFrame
        Needs 'Tumor_Sample_Barcode', 'Hugo_Symbol' and 'cancer_type'.
    nGenesToRemove : int
        Maximum number of genes to remove.
    nFold : int
        Number of random splits per step (the `n` argument of
        run_n_fold_cross_validation).
    mode : str
        Classifier selector forwarded to run_n_fold_cross_validation.

    Returns
    -------
    pandas.DataFrame
        Columns 'step' (0 for the full matrix, then 1..k), 'score' (nan-mean of
        the split scores) and 'removed' (the gene dropped at that step;
        'Begining' for step 0).

    Side effects: prints the per-cancer-type case counts and a progress
    counter every 10 steps.
    """
    metaColumns = set(['Tumor_Sample_Barcode', 'cancerType'])

    #returns X and y for classification from mutMatrix
    def get_vals_and_labels(mutMatrix):
        featureColumns = [c for c in mutMatrix.columns.values if c not in metaColumns]
        return mutMatrix[featureColumns].values, mutMatrix['cancerType'].values

    #genes ordered by their highest per-cancer-type mutation rate, each gene listed once
    def get_ordered_gene_removal_list(cohortMaf):
        rates = []
        for cancerType in set(cohortMaf['cancer_type']):
            cTypeMaf = cohortMaf[cohortMaf['cancer_type'] == cancerType]
            nCases = len(set(cTypeMaf['Tumor_Sample_Barcode']))
            valueCountsDict = dict(cTypeMaf['Hugo_Symbol'].value_counts())
            for gene, nRows in valueCountsDict.items():
                # rows for this gene per case of this cancer type (can exceed 1)
                rates.append((gene, (1.0*nRows)/nCases))

        alreadySeen = set()  # set membership keeps de-duplication linear
        ordering = []
        for gene, _ in sorted(rates, key=lambda x: x[1], reverse=True):
            if gene not in alreadySeen:
                alreadySeen.add(gene)
                ordering.append(gene)
        return ordering

    print(Counter(maf.drop_duplicates(subset=['Tumor_Sample_Barcode'])['cancer_type']))
    matrix = create_gene_mutation_matrix(maf)

    # Only genes that are actual feature columns can be dropped; a gene present in
    # the MAF but absent from the matrix would otherwise make DataFrame.drop raise.
    removable = [gene for gene in get_ordered_gene_removal_list(maf)
                 if gene in matrix.columns and gene not in metaColumns]
    genesToRemove = removable[:nGenesToRemove]

    listOfDicts = []
    X, y = get_vals_and_labels(matrix)
    scores = run_n_fold_cross_validation(X, y, n = nFold, mode=mode)
    listOfDicts.append({'step': 0, 'score': np.nanmean(scores), 'removed': 'Begining'})

    for step, gene in enumerate(genesToRemove, 1):
        if (step - 1) % 10 == 0:
            # progress marker on one line; same text as the old `print i,`
            sys.stdout.write('%d ' % (step - 1))
        matrix = matrix.drop([gene], axis=1)
        X, y = get_vals_and_labels(matrix)

        scores = run_n_fold_cross_validation(X, y, n = nFold, mode=mode)
        listOfDicts.append({'step': step, 'score': np.nanmean(scores), 'removed': gene})

    return pd.DataFrame(listOfDicts)



In [16]:
# Load the two mutation tables whose paths are registered in filePathDict.
# NOTE(review): the first file is parsed with read_table (tab-delimited by default)
# and the second with read_csv (comma-delimited by default) - confirm the two
# files really use different delimiters.
allImpactMutsMaf = pd.read_table(filePathDict['IMPACT_BASE_MAF'])
unfilteredMaf = pd.read_csv(filePathDict['IMPACT_BASE_MAF_WITH_SYNONYMOUS'])

  """Entry point for launching an IPython kernel.


In [9]:
cancerTypes = ['Endometrial Cancer', 'Colorectal Cancer', 'Bladder Cancer', 'Prostate Cancer', 'Esophagogastric Cancer']

# Gather the ids flagged 'Hypermutated' for each cancer type of interest.
ids = []
for ct in cancerTypes:
    ids.extend(get_gene_and_cohort_list_utils.get_ids_by_hypermutant_status(
        hypermutantIdDir=filePathDict['HYPERMUTATION_STATUS_IDS'],
        cancerType=ct,
        hypermutantStatus = 'Hypermutated'))

# Cases returned by get_msi_cases with a score threshold of 10.
mmrIds = get_gene_and_cohort_list_utils.get_msi_cases(msiInfoFilePath = filePathDict['CASE_TMB_AND_MSI_STATS'], msiScoreThresh=10)

# Study cohort: ids present in both lists.
caseIds = set(ids).intersection(mmrIds)


In [None]:
# Restrict both tables to the cohort in caseIds. .copy() makes each subset an
# independent frame, so adding the 'cancer_type' column below does not write
# into a slice of the parent table (pandas SettingWithCopyWarning).
hypermutantFiltered = allImpactMutsMaf[allImpactMutsMaf['Tumor_Sample_Barcode'].isin(caseIds)].copy()
silentMutsDf = unfilteredMaf[unfilteredMaf['Variant_Classification'] == 'Silent']
hypermutantSilent = silentMutsDf[silentMutsDf['Tumor_Sample_Barcode'].isin(caseIds)].copy()

cancerTypeDict = get_gene_and_cohort_list_utils.get_impact_cancer_type_info(impactCancerTypeInfoPath = filePathDict['CANCER_TYPE_INFO'])

# Annotate each row with its case's cancer type (None when the barcode is unknown).
# The membership test must use cancerTypeDict, the name bound just above; the
# previously referenced `cTypeDict` is not defined anywhere in this notebook.
hypermutantFiltered['cancer_type'] = hypermutantFiltered['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)
hypermutantSilent['cancer_type'] = hypermutantSilent['Tumor_Sample_Barcode'].apply(lambda x: cancerTypeDict[x] if x in cancerTypeDict else None)


In [377]:
#ALERT I switched MLL genes to not MLL genes
impact341Genes = get_gene_and_cohort_list_utils.get_im3_genes()

# Keep only rows whose cancer type is one of `cancerTypes` and whose gene is in
# the panel returned above; both conditions are applied as one mask per table.
hypermutantSilent = hypermutantSilent[
    hypermutantSilent['cancer_type'].isin(cancerTypes)
    & hypermutantSilent['Hugo_Symbol'].isin(impact341Genes)
]
hypermutantFiltered = hypermutantFiltered[
    hypermutantFiltered['cancer_type'].isin(cancerTypes)
    & hypermutantFiltered['Hugo_Symbol'].isin(impact341Genes)
]

In [407]:
#classifier used for every run below: 'SVM' or 'logisticRegression'
m = 'logisticRegression'

#BIG code block to run everything
#split the non-silent table by whether the 'oncogenic' column is populated
notOncogenicMafOnly = hypermutantFiltered[hypermutantFiltered['oncogenic'].isnull()]
oncogenicMafOnly = hypermutantFiltered[hypermutantFiltered['oncogenic'].notnull()]

#cancer types compared at once; for all pairwise comparisons as well, build combos with
#  combos = list(itertools.combinations(allCancerTypes, 2)); combos.append(allCancerTypes)
allCancerTypes = ['Colorectal Cancer', 'Endometrial Cancer', 'Esophagogastric Cancer', 'Prostate Cancer']

nRemove = 200   #genes removed per run
nFold = 50      #random train/test splits per removal step
combos = [allCancerTypes]
listOfDfs = []

#one entry per mutation class: (mutType label, source table, separator printed after the run)
mutationClasses = [
    ('onc', oncogenicMafOnly, '********************\n'),
    ('not-onc', notOncogenicMafOnly, '********************\n'),
    ('silent', hypermutantSilent, '_________________________________\n'),
]

for cntr, cts in enumerate(combos):
    print('analyzing the  %s th out of  %s : %s' % (cntr, len(combos), cts))
    ctSet = set(cts)

    for mutType, sourceMaf, separator in mutationClasses:
        focusMaf = sourceMaf[sourceMaf['cancer_type'].isin(ctSet)]
        resultDf = analyze_performance_by_removing_genes(focusMaf, mode=m, nGenesToRemove=nRemove, nFold=nFold)
        resultDf['cType'] = '|'.join(cts)
        resultDf['mutType'] = mutType
        listOfDfs.append(resultDf)
        print(separator)

df = pd.concat(listOfDfs)
    

analyzing the  0 th out of  1 : ['Colorectal Cancer', 'Endometrial Cancer', 'Esophagogastric Cancer', 'Prostate Cancer']
Counter({'Colorectal Cancer': 231, 'Endometrial Cancer': 181, 'Prostate Cancer': 47, 'Esophagogastric Cancer': 32})
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 ********************

Counter({'Colorectal Cancer': 231, 'Endometrial Cancer': 181, 'Prostate Cancer': 47, 'Esophagogastric Cancer': 32})
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 ********************

Counter({'Colorectal Cancer': 229, 'Endometrial Cancer': 178, 'Prostate Cancer': 47, 'Esophagogastric Cancer': 32})
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 _________________________________



In [408]:
# Combined key built from the short mutation-type code and the cancer-type label.
# It must be computed before the codes are replaced by display names below.
df['mutAndComp'] = df['mutType'].str.cat(df['cType'], sep='_')

# Display names for the short codes; any other value is left unchanged.
df['mutType'] = df['mutType'].map(
    lambda code: {'onc': 'Oncogenic', 'not-onc': 'VUS'}.get(code, code))

In [409]:
# Write the results as a tab-separated file, without the index column.
# NOTE(review): absolute, user-specific output path - will fail on any machine
# where this directory does not exist.
df.to_csv('/Users/friedman/Desktop/WORK/dataForLocalPlotting/geneDifferencesAnalysis_lr.tsv', index=False, sep='\t')