In [1]:
import shutil
import gzip
import os
import pandas as pd
import io
import sys
from __future__ import print_function
from scipy import stats

In [2]:
#Unpack the .tar archive downloaded from the GDC website into ./data_extracted_once
shutil.unpack_archive(
    filename='gdc_tcpa_luad_fpkm_uq.tar',
    extract_dir='data_extracted_once',
)

In [3]:
#Create directory to accept extracted FPKM-UQ text files
#workingDir is the absolute path of the current working directory
workingDir = os.path.abspath('')
#NOTE: the '\\' separator is Windows-specific; it is kept because the other cells
#build the very same path string from workingDir
newDirectory = workingDir+'\\annotations'
#exist_ok=True makes the cell safe to re-run and avoids the check-then-create race;
#it still raises if the path exists but is not a directory
os.makedirs(newDirectory, exist_ok=True)

In [4]:
#Iterate over the sub-directories created by unpacking the archive and gunzip their contents
#NOTE: the '\\' separators are Windows-specific and match the paths built in the other cells
extractedRoot = workingDir+'\\data_extracted_once'
annotationsDir = workingDir+'\\annotations'

for entry in os.scandir(extractedRoot):
    if not entry.is_dir():
        continue
    for subEntry in os.scandir(entry.path):
        #Only regular '.gz' files are unpacked. Annotation .txt files, nested folders
        #and anything else are skipped instead of being handed to gzip (which would fail)
        if not (subEntry.is_file() and subEntry.name.endswith('.gz')):
            continue
        #Dropping the trailing '.gz' gives the name of the decompressed text file
        extractedPath = subEntry.path[:-3]
        #Unpack the .gz next to the original archive member
        with gzip.open(subEntry.path, 'rb') as f_in, open(extractedPath, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        #Copy the decompressed text file into the annotations folder
        shutil.copy2(extractedPath, annotationsDir)
            

In [5]:
def createGeneResultsList(geneInListId):
    """Collect every line that mentions a gene ID from the files in the annotations folder.

    Parameters
    ----------
    geneInListId : str
        Text to search for (an Ensembl gene ID in this notebook). A line is kept
        when this text occurs anywhere in it (plain substring match).

    Returns
    -------
    list of str
        The matching lines, unmodified (trailing newline included). Files are
        visited in file-name order; lines keep their order within each file.

    Notes
    -----
    Reads the module-level ``workingDir`` to locate the annotations folder.
    """
    #NOTE: the '\\' separator is Windows-specific; it matches the folder created earlier
    annotationsDir = workingDir+'\\annotations'
    #os.scandir yields entries in arbitrary order. Sorting by name makes the row order
    #deterministic, so the i-th result of every gene comes from the same file
    textFiles = sorted(
        (candidate for candidate in os.scandir(annotationsDir) if candidate.is_file()),
        key=lambda candidate: candidate.name,
    )
    results = []
    for textFile in textFiles:
        #DirEntry.path is already the full path, so no separator has to be hand-built
        with open(textFile.path) as f:
            #Iterate the file lazily instead of loading it whole with readlines()
            results.extend(line for line in f if geneInListId in line)
    return results

In [6]:
#Ensembl gene IDs, one constant per gene of interest
STAT3_ID = 'ENSG00000168610'
JAK2_ID = 'ENSG00000096968'

#Genes compared against JAK2
NRF2 = 'ENSG00000116044'
LCN2A = 'ENSG00000148346'
PI3KP85 = 'ENSG00000145675'
TUBERIN = 'ENSG00000103197'
STATHMIN = 'ENSG00000117632'
STAT5ALPHA = 'ENSG00000126561'
MTOR = 'ENSG00000198793'
MRE11 = 'ENSG00000020922'
ATM = 'ENSG00000149311'

#Genes compared against STAT3 (MTOR_pS2448 uses the same Ensembl ID as MTOR)
THYMIDILATESYNTHASE = 'ENSG00000176890'
SHP2_pY542 = 'ENSG00000179295'
GSK3_pS9 = 'ENSG00000105723'
MTOR_pS2448 = 'ENSG00000198793'
GSK3ALPHABETA_pS21S9 = 'ENSG00000082701'
CAVEOLIN1 = 'ENSG00000105974'
AKT_pS473 = 'ENSG00000142208'
X4EBP1 = 'ENSG00000187840'

#Name -> Ensembl ID lookups that the later cells iterate over.
#Insertion order matters: the reference gene (JAK2 / STAT3) comes first
jak2GeneList = dict(
    JAK2_ID=JAK2_ID,
    NRF2=NRF2,
    LCN2A=LCN2A,
    PI3KP85=PI3KP85,
    TUBERIN=TUBERIN,
    STATHMIN=STATHMIN,
    STAT5ALPHA=STAT5ALPHA,
    MTOR=MTOR,
    MRE11=MRE11,
    ATM=ATM,
)
stat3GeneList = dict(
    STAT3_ID=STAT3_ID,
    THYMIDILATESYNTHASE=THYMIDILATESYNTHASE,
    SHP2_pY542=SHP2_pY542,
    GSK3_pS9=GSK3_pS9,
    MTOR_pS2448=MTOR_pS2448,
    GSK3ALPHABETA_pS21S9=GSK3ALPHABETA_pS21S9,
    CAVEOLIN1=CAVEOLIN1,
    AKT_pS473=AKT_pS473,
    X4EBP1=X4EBP1,
)

#Empty accumulators
results = []
jak2results = []
stat3results = []

In [7]:
#Build one table holding the FPKM_UQ values of every gene in jak2GeneList
completeJak2FpkmDf = pd.DataFrame()

for geneName, ensemblId in jak2GeneList.items():
    fpkmLabel = geneName+' FPKM'
    #Splitting each matching line on the tab is expected to give two fields: the Ensembl ID and the value
    splitLines = [matchedLine.split('\t') for matchedLine in createGeneResultsList(ensemblId)]
    geneFrame = pd.DataFrame(splitLines, columns=[geneName, fpkmLabel])
    #Keep only the value column (the Ensembl ID is dropped) and strip the trailing '\n'
    fpkmColumn = geneFrame[fpkmLabel].str.rstrip('\n').to_frame()
    #Each new gene is placed in front, so the first gene of the dict ends up as the last column
    completeJak2FpkmDf = pd.concat([fpkmColumn, completeJak2FpkmDf], axis=1)
    



In [8]:
#Build one table holding the FPKM_UQ values of every gene in stat3GeneList
completeStat3FpkmDf = pd.DataFrame()

for geneName, ensemblId in stat3GeneList.items():
    fpkmLabel = geneName+' FPKM'
    #Splitting each matching line on the tab is expected to give two fields: the Ensembl ID and the value
    splitLines = [matchedLine.split('\t') for matchedLine in createGeneResultsList(ensemblId)]
    geneFrame = pd.DataFrame(splitLines, columns=[geneName, fpkmLabel])
    #Keep only the value column (the Ensembl ID is dropped) and strip the trailing '\n'
    fpkmColumn = geneFrame[fpkmLabel].str.rstrip('\n').to_frame()
    #Each new gene is placed in front, so the first gene of the dict ends up as the last column
    completeStat3FpkmDf = pd.concat([fpkmColumn, completeStat3FpkmDf], axis=1)

In [9]:
#Preview the assembled JAK2 table (the FPKM values are still strings at this point)
completeJak2FpkmDf

Unnamed: 0,ATM FPKM,MRE11 FPKM,MTOR FPKM,STAT5ALPHA FPKM,STATHMIN FPKM,TUBERIN FPKM,PI3KP85 FPKM,LCN2A FPKM,NRF2 FPKM,JAK2_ID FPKM
0,79465.0033294,76928.5040934,163547.004191,156422.395939,339837.615644,258030.157082,218001.506661,260401.754917,635166.073268,89539.3251076
1,86555.605234,73385.7878637,117435.225032,303346.288748,762945.616968,204645.451755,169506.675017,757216.112617,556073.739803,262729.90219
2,99884.6714878,179384.578735,134694.484656,184478.392968,485566.115611,225702.037808,88728.5043025,17568886.6747,234694.137139,99757.2536258
3,39681.1535808,79876.8667377,144845.331911,143503.959815,251274.292678,140095.78719,128759.329993,1821502.60003,584987.087352,85616.6710607
4,41197.1831058,50574.006065,118616.809594,234282.281276,299697.088123,174537.495951,152637.733745,129326.422976,583450.8716,91629.121991
...,...,...,...,...,...,...,...,...,...,...
414,68012.9746308,83131.138992,127279.880103,173904.923404,693393.514501,214384.149437,96070.7022509,9053334.83857,348505.080349,70672.137952
415,63642.3787141,74522.4027599,147949.078629,113467.598537,241558.328416,294637.418506,51902.5823618,1478629.08131,441332.147759,63032.5417723
416,43425.0776106,80799.700584,123426.065766,58297.5070899,592893.708262,175076.124361,77430.9876821,5835890.28863,393244.383079,49242.005411
417,109577.098121,89829.5732185,117789.3978,213299.461745,110738.43369,266607.164068,126036.689611,6255556.0706,404158.037356,128982.58593


In [10]:
#Preview the assembled STAT3 table (the FPKM values are still strings at this point)
completeStat3FpkmDf

Unnamed: 0,X4EBP1 FPKM,AKT_pS473 FPKM,CAVEOLIN1 FPKM,GSK3ALPHABETA_pS21S9 FPKM,MTOR_pS2448 FPKM,GSK3_pS9 FPKM,SHP2_pY542 FPKM,THYMIDILATESYNTHASE FPKM,STAT3_ID FPKM
0,355578.702435,354552.619591,591029.323701,171406.031533,163547.004191,226590.609314,425819.111076,106727.351309,1158303.64021
1,552168.209719,338811.877963,350495.101209,136339.751456,117435.225032,324420.260235,252809.039111,408665.744419,855985.781714
2,1650394.95471,294863.119147,259248.512294,118519.524596,134694.484656,347468.967021,422525.308035,216972.603278,532518.90993
3,751933.035398,304369.765999,1106580.19781,238862.847615,144845.331911,318701.07366,490719.106081,318573.538632,1007037.6473
4,429444.695171,425221.813169,7533379.50383,202039.251027,118616.809594,370705.258125,410745.241657,50999.9501087,1390452.7932
...,...,...,...,...,...,...,...,...,...
414,758999.282126,489516.612229,494566.131488,147277.6015,127279.880103,499672.389805,211671.896317,236977.408329,664835.839748
415,614709.078659,346688.436148,194266.262593,159856.494878,147949.078629,452587.263498,850938.828484,409319.548076,706971.26365
416,1332378.44606,361692.717286,186637.809135,234802.577483,123426.065766,411135.045273,459384.207955,241481.575259,1417794.35143
417,414820.951219,355763.19715,557299.637271,153629.633742,117789.3978,302727.459511,298159.795015,74373.9691108,626904.202836


In [11]:
#Calculating r-values and p-values for genes relative to JAK2
tTestJak2, rValueJak2 = [], []

#Preprocessing: the FPKM data was read in as strings, so it must be converted to float for ttest
completeJak2FpkmDf = completeJak2FpkmDf.apply(pd.to_numeric)

#The reference gene is the last column of the table; look it up once instead of per iteration
jak2Reference = completeJak2FpkmDf[completeJak2FpkmDf.columns[-1]]

for fpkmValue in completeJak2FpkmDf.columns:
    #Correlation of the reference column with each column (1.0 against itself)
    rValueJak2.append(jak2Reference.corr(completeJak2FpkmDf[fpkmValue]))
    #NOTE(review): ttest_ind tests whether the two columns have different means; it is
    #not the significance of the r-value above. If a correlation p-value is intended,
    #scipy.stats.pearsonr returns it together with r - confirm which one is wanted
    tTestJak2.append(stats.ttest_ind(jak2Reference, completeJak2FpkmDf[fpkmValue]).pvalue)

#Show the table with the correlation coefficients and p-values added as two extra rows.
#DataFrame.append was removed in pandas 2.0, so the rows are attached with pd.concat.
#The second-to-last row is the r-value, the last row is the p-value
jak2StatRows = pd.DataFrame([rValueJak2, tTestJak2], columns=completeJak2FpkmDf.columns)
pd.concat([completeJak2FpkmDf, jak2StatRows], ignore_index=True)


Unnamed: 0,ATM FPKM,MRE11 FPKM,MTOR FPKM,STAT5ALPHA FPKM,STATHMIN FPKM,TUBERIN FPKM,PI3KP85 FPKM,LCN2A FPKM,NRF2 FPKM,JAK2_ID FPKM
0,7.946500e+04,76928.504093,1.635470e+05,1.564224e+05,3.398376e+05,2.580302e+05,2.180015e+05,2.604018e+05,6.351661e+05,89539.325108
1,8.655561e+04,73385.787864,1.174352e+05,3.033463e+05,7.629456e+05,2.046455e+05,1.695067e+05,7.572161e+05,5.560737e+05,262729.902190
2,9.988467e+04,179384.578735,1.346945e+05,1.844784e+05,4.855661e+05,2.257020e+05,8.872850e+04,1.756889e+07,2.346941e+05,99757.253626
3,3.968115e+04,79876.866738,1.448453e+05,1.435040e+05,2.512743e+05,1.400958e+05,1.287593e+05,1.821503e+06,5.849871e+05,85616.671061
4,4.119718e+04,50574.006065,1.186168e+05,2.342823e+05,2.996971e+05,1.745375e+05,1.526377e+05,1.293264e+05,5.834509e+05,91629.121991
...,...,...,...,...,...,...,...,...,...,...
416,4.342508e+04,80799.700584,1.234261e+05,5.829751e+04,5.928937e+05,1.750761e+05,7.743099e+04,5.835890e+06,3.932444e+05,49242.005411
417,1.095771e+05,89829.573218,1.177894e+05,2.132995e+05,1.107384e+05,2.666072e+05,1.260367e+05,6.255556e+06,4.041580e+05,128982.585930
418,4.170072e+04,95338.559154,1.434099e+05,9.537628e+04,1.019173e+06,1.456007e+05,4.597449e+04,1.752357e+07,2.876530e+05,26195.614973
419,6.837148e-01,0.261374,1.079426e-01,3.617124e-01,-2.352907e-01,-3.279575e-01,6.723333e-01,-1.123300e-01,1.465323e-01,1.000000


In [12]:
#Calculating r-values and p-values for genes relative to STAT3
tTestStat3, rValuestat3  = [], []

#Preprocessing: the FPKM data was read in as strings, so it must be converted to float for ttest
completeStat3FpkmDf = completeStat3FpkmDf.apply(pd.to_numeric)

#The reference gene is the last column of the table; look it up once instead of per iteration
stat3Reference = completeStat3FpkmDf[completeStat3FpkmDf.columns[-1]]

for fpkmValue in completeStat3FpkmDf.columns:
    #Correlation of the reference column with each column (1.0 against itself)
    rValuestat3.append(stat3Reference.corr(completeStat3FpkmDf[fpkmValue]))
    #NOTE(review): ttest_ind tests whether the two columns have different means; it is
    #not the significance of the r-value above. If a correlation p-value is intended,
    #scipy.stats.pearsonr returns it together with r - confirm which one is wanted
    tTestStat3.append(stats.ttest_ind(stat3Reference, completeStat3FpkmDf[fpkmValue]).pvalue)

#Show the table with the correlation coefficients and p-values added as two extra rows.
#DataFrame.append was removed in pandas 2.0, so the rows are attached with pd.concat.
#The second-to-last row is the r-value, the last row is the p-value
stat3StatRows = pd.DataFrame([rValuestat3, tTestStat3], columns=completeStat3FpkmDf.columns)
pd.concat([completeStat3FpkmDf, stat3StatRows], ignore_index=True)

Unnamed: 0,X4EBP1 FPKM,AKT_pS473 FPKM,CAVEOLIN1 FPKM,GSK3ALPHABETA_pS21S9 FPKM,MTOR_pS2448 FPKM,GSK3_pS9 FPKM,SHP2_pY542 FPKM,THYMIDILATESYNTHASE FPKM,STAT3_ID FPKM
0,3.555787e+05,3.545526e+05,5.910293e+05,1.714060e+05,1.635470e+05,2.265906e+05,4.258191e+05,1.067274e+05,1.158304e+06
1,5.521682e+05,3.388119e+05,3.504951e+05,1.363398e+05,1.174352e+05,3.244203e+05,2.528090e+05,4.086657e+05,8.559858e+05
2,1.650395e+06,2.948631e+05,2.592485e+05,1.185195e+05,1.346945e+05,3.474690e+05,4.225253e+05,2.169726e+05,5.325189e+05
3,7.519330e+05,3.043698e+05,1.106580e+06,2.388628e+05,1.448453e+05,3.187011e+05,4.907191e+05,3.185735e+05,1.007038e+06
4,4.294447e+05,4.252218e+05,7.533380e+06,2.020393e+05,1.186168e+05,3.707053e+05,4.107452e+05,5.099995e+04,1.390453e+06
...,...,...,...,...,...,...,...,...,...
416,1.332378e+06,3.616927e+05,1.866378e+05,2.348026e+05,1.234261e+05,4.111350e+05,4.593842e+05,2.414816e+05,1.417794e+06
417,4.148210e+05,3.557632e+05,5.572996e+05,1.536296e+05,1.177894e+05,3.027275e+05,2.981598e+05,7.437397e+04,6.269042e+05
418,2.228729e+06,2.521775e+05,1.192437e+05,1.668965e+05,1.434099e+05,3.533354e+05,4.651459e+05,4.904009e+05,7.244330e+05
419,-7.161271e-02,7.565460e-02,1.076065e-01,1.416902e-01,1.166128e-01,1.870986e-04,2.429991e-01,-2.489876e-01,1.000000e+00
