# Loading Data Set

In [1]:
import pandas as pd

# Formatting Gene Expressions

In [2]:
# Load the gene matrix: rows are indexed by GENE_ID, one column per sample.
# NOTE(review): despite the variable name, the input file is named
# 'gene_count' and the values shown in the output look like raw counts,
# not FPKM -- confirm.
gene_fpkm = pd.read_csv('data/gene_count.txt', sep='\t', index_col='GENE_ID')

# Keep only the sample columns whose name contains '_1_' (presumably the
# first visit/trial of each patient -- confirm against the sample naming
# scheme). One boolean selection replaces deleting columns one at a time
# inside a loop over the very index being modified.
first_visit_mask = ['_1_' in sample for sample in gene_fpkm.columns]
gene_fpkm = gene_fpkm.loc[:, first_visit_mask]

# Transpose so rows are samples and columns are genes, drop samples and
# genes that are entirely NaN, then replace the remaining missing values
# with zero.
gene_fpkm = (
    gene_fpkm.T
    .dropna(how='all', axis=0)
    .dropna(how='all', axis=1)
    .fillna(0)
)

# Shorten each sample label to its first two underscore-separated tokens
# joined together (e.g. a label starting 'MMRF_2801_...' becomes
# 'MMRF2801'). The ids stay strings. Assigning a plain list resets the
# index name, so the name is set once, right before the frame is written.
gene_fpkm.index = [''.join(sample.split('_')[:2]) for sample in gene_fpkm.index]

# Restrict the genes to those whose biotype is listed in the selection file.
gene_details = pd.read_csv('data/gene_details.tsv', sep='\t')
selected_biotypes = pd.read_csv('data/gene_selected_class.tsv', sep='\t')
genes_of_selected_biotype = gene_details.merge(selected_biotypes, on='gene_biotype')

# Ensembl ids of the selected biotypes that are present in the matrix,
# in merge order (repeated ids are kept here and resolved further below).
gene_selected_class = [
    gene
    for gene in genes_of_selected_biotype['ensembl_gene_id']
    if gene in gene_fpkm.columns
]

gene_fpkm = gene_fpkm[gene_selected_class]

# Drop genes whose total over all samples is not positive. The totals are
# computed once and applied as a positional boolean mask, so columns that
# were repeated by the selection above are not multiplied again.
gene_totals = gene_fpkm.sum()
gene_fpkm = gene_fpkm.loc[:, (gene_totals > 0).values]

# Several samples can collapse to the same shortened id: keep the first.
gene_fpkm = gene_fpkm.loc[~gene_fpkm.index.duplicated(keep='first')]

# Keep the first occurrence of every repeated gene column.
gene_fpkm = gene_fpkm.loc[:, ~gene_fpkm.columns.duplicated()]

gene_fpkm.index.name = 'ID'

gene_fpkm.to_csv('data/gene_count.tsv', sep='\t')

print(gene_fpkm.shape)

gene_fpkm.iloc[:8, :8]

(769, 27777)


GENE_ID,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMRF2801,17,1,1495,231,87,318,347,1087
MMRF2718,999,0,3271,809,319,105,1093,1122
MMRF2815,85,0,2303,203,31,119,374,1638
MMRF1656,9,2,1234,1208,183,190,140,1301
MMRF2461,801,3,1896,425,85,2064,287,2763
MMRF2057,12,0,2013,710,174,12,16,1505
MMRF1910,16,0,1745,922,292,351,238,1332
MMRF2341,43,1,2403,1203,478,555,35,1339
