# TCGA RNA-Seq: Differential expression

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Push the code form the previous notebook to a python file we can import
from tcga_rna_load_filter_transform import *

# Imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sb
import sklearn as sk
import statsmodels as sm
import sys
import os

from statsmodels.stats.weightstats import ttest_ind
from statsmodels.stats.multitest import multipletests

### Load datasets

In [3]:
df_lusc = pd.read_csv("data/tcga/LUSC.filtered_transformed.csv", index_col=0)
df_luad = pd.read_csv("data/tcga/LUAD.filtered_transformed.csv", index_col=0)

### Tumor and normals

In [4]:
df_lusc_t, df_lusc_n = get_tumors(df_lusc), get_normals(df_lusc)
df_lusc_t.shape, df_lusc_n.shape

((13225, 501), (13225, 51))

In [5]:
df_luad_t, df_luad_n = get_tumors(df_luad), get_normals(df_luad)
df_luad_t.shape, df_luad_n.shape

((11711, 517), (11711, 59))

# Statistics: t-test

In [16]:
def t_test(df1, df2, gene):
    (tstat, pval, degfree) = ttest_ind(df1.loc[gene], df2.loc[gene])
    return pval

def t_test_all(df1, df2):
    genes_intersect = list(set(df1.index).intersection(df2.index))
    genes_intersect = np.array(genes_intersect)
    pvals = list()
    for g in genes_intersect:
        pval = t_test(df1, df2, g)
        pvals.append(pval)
    pvals_raw = np.array(pvals)
    reject, pvals_corrected = multipletests(pvals_raw)[0:2]
    dfpvals = pd.DataFrame({'pvals_corrected':pvals_corrected, 'pvals_raw':pvals_raw, 'reject': reject}, index=genes_intersect)
    dfpvals.sort_values(['pvals_corrected', 'pvals_raw'], inplace=True)
    return dfpvals

### Sanity check: Normals should not be statistically different

In [17]:
df_pvals =  t_test_all(df_luad_n, df_lusc_n)
df_pvals.reject.sum()

0

### Difference between tumors and normals

In [27]:
df_pvals_luad_tn = t_test_all(df_luad_t, df_luad_n)
df_pvals_luad_tn.reject.sum(), (~df_pvals_luad_tn.reject).sum()

(6089, 5622)

In [29]:
df_pvals_luad_tn.head(30)

Unnamed: 0,pvals_corrected,pvals_raw,reject
PYCR1,0.0,8.44365e-110,True
RTKN2,0.0,2.58122e-98,True
OTUD1,0.0,4.314357e-94,True
RS1,0.0,6.080452e-94,True
SLC6A4,0.0,6.301427e-93,True
SGCG,0.0,2.0486289999999998e-91,True
PECAM1,0.0,7.084435e-90,True
STX11,0.0,1.6328259999999998e-88,True
EPAS1,0.0,2.8894289999999997e-88,True
C16orf59,0.0,2.145034e-87,True


In [20]:
df_pvals_lusc_tn = t_test_all(df_lusc_t, df_lusc_n)
df_pvals_lusc_tn.reject.sum(), (~df_pvals_lusc_tn.reject).sum()

(7912, 5313)

In [21]:
df_pvals_lusc_tn.head(20)

Unnamed: 0,pvals_corrected,pvals_raw,reject
KIF4A,0.0,2.953531e-164,True
TPX2,0.0,6.251027e-160,True
KIF2C,0.0,1.411952e-158,True
UBE2C,0.0,1.995707e-158,True
CENPA,0.0,1.686133e-157,True
HJURP,0.0,2.462296e-157,True
BUB1B,0.0,2.4894140000000003e-157,True
PLK1,0.0,7.409413e-157,True
TROAP,0.0,3.398176e-155,True
CDC20,0.0,1.032954e-154,True


In [26]:
df_pvals_lusc_tn.loc[['TP53', 'CDKN2A', 'PTEN', 'PIK3CA', 'KEAP1', 'FOXP1', 'NOTCH1', 'NOTCH2'
                      , 'ASCL4', 'FAM123B', 'HRAS', 'FBXW7', 'SMARCA4', 'NF1', 'SMAD4', 'EGFR']]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0,pvals_corrected,pvals_raw,reject
TP53,1.0,0.06859452,False
CDKN2A,1.108985e-05,1.57393e-09,True
PTEN,0.0005486696,8.649647e-08,True
PIK3CA,6.074027e-07,8.106272e-11,True
KEAP1,,,
FOXP1,0.0,8.32964e-34,True
NOTCH1,,,
NOTCH2,,,
ASCL4,,,
FAM123B,0.0,4.044902e-18,True
