In [19]:
import glob
import pandas as pd
import numpy as np
import time

# Process TCGA files

    The FPKM files from TCGA contain the 'Gene stable ID version' of each gene and its expression, but this IDs is not found in the metabolic model. For this reason, we added a "Gene_ID" column to the file containing the 'Gene_name' present in the metabolic model. 

    Unlike the validation files, adding the IDs of the genes present in the metabolic model is confusing and dificult, because in the model the same gene, e.g. the gene "SULT1A1" , contains five IDs ("6817_AT1", ..., "6817_AT5"). Therefore, the gene names were saved instead of the IDs. This is corrected in the PheFlux script using reloadFPKMHsapiens() function.  

    *The name of "Gene_ID" column is kept even though it contain the name of the genes.

### Add "Gene_ID" column to FPKM files

In [20]:
# Load Data
##########################################################
# load dict Gencode -> Gene_ID
dictFile = 'Hsapiens_ENSGtonames.csv'
dic = pd.read_csv(dictFile, sep='\t', lineterminator='\n', index_col=0).to_dict()['Gene_name']

##########################################################
# load FPKM file
files = glob.glob("/home/bguzman/jupyter/UI/pheflux/data/lineas2022/*.txt") #glo.glob recopila todos los archivos, en este caso con la extension .txt

In [21]:
files

['/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_PC9_ENCFF877GJA.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_PC3_ENCFF200VPR.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_MEL_ENCFF199TJO.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_MCF7_ENCFF721BRA.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_Karpas422_ENCFF257LZM.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_K562_ENCFF928NYA.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_IMR90_ENCFF019KLP.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_HepG2ENCFF773JNC.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_HCT116_ENCFF435PHM.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_H9_ENCFF216CFE.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_H7_ENCFF199UNP.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_H1_ENCFF816ERP.txt',
 '/home/bguzman/jupyter/UI/pheflux/data/lineas2022/2_GM23248_ENCFF185CZL.txt',
 '/home/bg

In [26]:
print ('Hello world!')
processStart = time.time()
times = []
for file in files:
    fileStart = time.time()
    print ('File ID:', file.split('/')[-2])
    #########################################################################################
    # load fpkm data from TCGA file
    data = pd.read_csv(file, sep='\t', lineterminator='\n', header=None, prefix='Column_')
    #print(data)
    fpkmFile = pd.DataFrame(columns=['Gencode', 'Gene_ID', 'Expression'])
    #########################################################################################
    # add gene name ('Gene_ID') to the TCGA file
    for i in range(len(data)):
        gencode = data['Column_0'][i]          # ENSGXXXXXXXXXXXX.X
        gencodesplit = gencode.split('.')[0]   # ENSGXXXXXXXXXXXX
        fpkm = data['Column_1'][i]
        if gencodesplit in dic:
            gene_id = dic[gencodesplit]
        else:continue
#             gene_id = 'NA'
        #############################################################
        # load info in new file
        values = [gencode, gene_id, fpkm]
        fpkmFile.loc[fpkmFile.shape[0]] = values
    #########################################################################################  
    # export new file
    fpkmFile.to_csv("file_fpkm.csv", sep='\t',index=None)
    print ('\t...is processed.') 
    #########################################################################################
    # time for file
    fileFinish = time.time()
    fileTime = fileFinish - fileStart
    times.append(fileTime)
    break
#########################################################################################
processFinish = time.time()
processTime = processFinish - processStart
print ('')
print ('Average time per file:', np.mean(times), 's')
print ('Total process time:', processTime/60, 'min')

Hello world!
File ID: lineas2022
	...is processed.

Average time per file: 122.74015688896179 s
Total process time: 2.0456824978192647 min


In [24]:
data[:5]

Unnamed: 0,Column_0,Column_1
0,gene_id,FPKM\r
1,10904,0.00\r
2,12954,0.00\r
3,12956,0.00\r
4,12958,0.00\r


In [25]:
fpkmFile[:5]

Unnamed: 0,Gencode,Gene_ID,Expression
0,ENSG00000000003.14,TSPAN6,24.12\r
1,ENSG00000000005.5,TNMD,0.00\r
2,ENSG00000000419.12,DPM1,32.65\r
3,ENSG00000000457.13,SCYL3,3.61\r
4,ENSG00000000460.16,C1orf112,11.27\r
