In [1]:
import pandas as pd 
import numpy as np 

In [2]:
LIHC = pd.read_csv('raw_data/lihc-rsem-fpkm-tcga-t.txt.gz', compression='gzip', sep='\t')
BRCA = pd.read_csv('raw_data/brca-rsem-fpkm-tcga-t.txt.gz', compression='gzip', sep='\t')
STAD = pd.read_csv('raw_data/stad-rsem-fpkm-tcga-t.txt.gz', compression='gzip', sep='\t')
LIHC_Normal = pd.read_csv('raw_data/liver-rsem-fpkm-gtex.txt.gz', compression='gzip', sep='\t')
BRCA_Normal = pd.read_csv('raw_data/breast-rsem-fpkm-gtex.txt.gz', compression='gzip', sep='\t')
STAD_Normal = pd.read_csv('raw_data/stomach-rsem-fpkm-gtex.txt.gz', compression='gzip', sep='\t')

In [3]:
LIHC = LIHC.drop(['Entrez_Gene_Id'], axis=1)
BRCA = BRCA.drop(['Entrez_Gene_Id'], axis=1)
STAD = STAD.drop(['Entrez_Gene_Id'], axis=1)
LIHC_Normal = LIHC_Normal.drop(['Entrez_Gene_Id'], axis=1)
BRCA_Normal = BRCA_Normal.drop(['Entrez_Gene_Id'], axis=1)
STAD_Normal = STAD_Normal.drop(['Entrez_Gene_Id'], axis=1)

In [4]:
LIHC.rename(columns={'Hugo_Symbol':'Gene_ID'}, inplace=True)
BRCA.rename(columns={'Hugo_Symbol':'Gene_ID'}, inplace=True)
STAD.rename(columns={'Hugo_Symbol':'Gene_ID'}, inplace=True)
LIHC_Normal.rename(columns={'Hugo_Symbol':'Gene_ID'}, inplace=True)
BRCA_Normal.rename(columns={'Hugo_Symbol':'Gene_ID'}, inplace=True)
STAD_Normal.rename(columns={'Hugo_Symbol':'Gene_ID'}, inplace=True)

In [5]:
TCGA_genes = set(STAD['Gene_ID']).intersection(set(LIHC['Gene_ID']).intersection(set(BRCA['Gene_ID'])))

In [6]:
Normal_genes = set(STAD_Normal['Gene_ID']).intersection(set(BRCA_Normal['Gene_ID']).intersection(set(LIHC_Normal['Gene_ID'])))

In [7]:
All_genes = TCGA_genes.intersection(Normal_genes)

In [8]:
LIHC_patients = np.array(LIHC.columns[1:])
BRCA_patients = np.array(BRCA.columns[1:])
STAD_patients = np.array(STAD.columns[1:])

In [9]:
def keep_mutual_genes(df, genes):
    return df[df['Gene_ID'].isin(genes)]

In [10]:
def rename_header(df):
    new_header = df.iloc[0] #grab the first row for the header
    df = df[1:] #take the data less the header row
    df.columns = new_header #set the header row as the df header
    return df 

In [11]:
def reorder_genes(df, genes):
    df = df[genes]
    return df 

In [12]:
LIHC = keep_mutual_genes(LIHC, All_genes)
BRCA = keep_mutual_genes(BRCA, All_genes)
STAD = keep_mutual_genes(STAD, All_genes)
LIHC_Normal = keep_mutual_genes(LIHC_Normal, All_genes)
BRCA_Normal = keep_mutual_genes(BRCA_Normal, All_genes)
STAD_Normal = keep_mutual_genes(STAD_Normal, All_genes)

In [13]:
LIHC = LIHC.transpose()
BRCA = BRCA.transpose()
STAD = STAD.transpose()
LIHC_Normal = LIHC_Normal.transpose()
BRCA_Normal = BRCA_Normal.transpose()
STAD_Normal = STAD_Normal.transpose()

In [14]:
LIHC = rename_header(LIHC)
BRCA = rename_header(BRCA)
STAD = rename_header(STAD)
LIHC_Normal = rename_header(LIHC_Normal)
BRCA_Normal = rename_header(BRCA_Normal)
STAD_Normal = rename_header(STAD_Normal)

In [15]:
LIHC = reorder_genes(LIHC, All_genes)
BRCA = reorder_genes(BRCA, All_genes)
STAD = reorder_genes(STAD, All_genes)
LIHC_Normal = reorder_genes(LIHC_Normal, All_genes)
BRCA_Normal = reorder_genes(BRCA_Normal, All_genes)
STAD_Normal = reorder_genes(STAD_Normal, All_genes)

In [16]:
LIHC = LIHC.astype(float)
BRCA = BRCA.astype(float)
STAD = STAD.astype(float)
LIHC_Normal = LIHC_Normal.astype(float)
BRCA_Normal = BRCA_Normal.astype(float)
STAD_Normal = STAD_Normal.astype(float)

In [17]:
LIHC = np.log10(LIHC)
BRCA = np.log10(BRCA)
STAD = np.log10(STAD)
LIHC_Normal = np.log10(LIHC_Normal)
BRCA_Normal = np.log10(BRCA_Normal)
STAD_Normal = np.log10(STAD_Normal)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


In [18]:
LIHC = LIHC.apply(lambda x: np.where(x < 0, 0, x))
BRCA = BRCA.apply(lambda x: np.where(x < 0, 0, x))
STAD = STAD.apply(lambda x: np.where(x < 0, 0, x))
LIHC_Normal = LIHC_Normal.apply(lambda x: np.where(x < 0, 0, x))
BRCA_Normal = BRCA_Normal.apply(lambda x: np.where(x < 0, 0, x))
STAD_Normal = STAD_Normal.apply(lambda x: np.where(x < 0, 0, x))

In [19]:
LIHC.to_csv('data/LIHC.csv') 
BRCA.to_csv('data/BRCA.csv') 
STAD.to_csv('data/STAD.csv') 
LIHC_Normal.to_csv('data/LIHC_Normal.csv') 
BRCA_Normal.to_csv('data/BRCA_Normal.csv') 
STAD_Normal.to_csv('data/STAD_Normal.csv') 

In [20]:
LIHC = pd.read_csv('data/LIHC.csv')
BRCA = pd.read_csv('data/BRCA.csv')
STAD = pd.read_csv('data/STAD.csv')
LIHC_Normal = pd.read_csv('data/LIHC_Normal.csv')
BRCA_Normal = pd.read_csv('data/BRCA_Normal.csv')
STAD_Normal = pd.read_csv('data/STAD_Normal.csv')

In [21]:
# LIHC.head()

In [22]:
# BRCA.head()

In [23]:
# STAD.head()

In [24]:
# LIHC_Normal.head()

In [25]:
# BRCA_Normal.head()

In [26]:
# STAD_Normal.head()