### This notebook first restricts the meta_file to the samples that are present in the expression_file, then filters it again, keeping only the samples that satisfy the "controls" (so that the samples GSEA compares differ only in the class_name).

### The dataframe samp_to_class (the class labels, i.e. the cls input for GSEA) is then built according to the class_name specified by the user.

### Finally, the expression_file is filtered so that its samples match those remaining in the meta_file.

### Benefits:
### 1. Correctly matches the samples in the expression_file and the meta_file;
### 2. Keeps all other conditions the same (by specifying controls) when comparing results for a certain class;
### 3. Lets the user specify the class name and the controls.


In [1]:
import pandas as pd
import numpy as np
import gseapy as gp

# Input files and output directory for the GSEA run.
# NOTE(review): these are absolute paths on one machine; adjust them before running elsewhere.
gmt_file = '/Users/Miko/Desktop/CCBB/Network/gmt/c2.cp.v6.1.symbols.gmt'
expression_file = '/Users/Miko/Desktop/CCBB/Network/mouse_liver/RNA_cpm_all_samples.tsv'
meta_file = "/Users/Miko/Desktop/CCBB/Network/mouse_liver/dHEP_metadata.csv"
output_dir = '/Users/Miko/Desktop/CCBB/Network/output/by_tissue_w_0'

# ONLY compare samples that differ in the metadata column `class_name`,
# while every other attribute is held constant through `controls` below.
class_name = 'Tissue'  #'Treatment'
class_A = 'Liver'  #'DEN_HFD_alcohol'
class_B = 'liver_tumor'  #'DEN_only'

# Controls chosen by the user: metadata column -> value that every kept sample must have.
# Values are compared exactly, so whitespace matters.
controls = {
    'mouse genotype': 'Alb-Cre;IL-17RA-flox/flox',
    'Model': 'DEN ',  ### There is a space at the end
    'Treatment': 'DEN_HFD_alcohol'}
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print controls` statement is a syntax error on Python 3.
print(controls)

{'mouse genotype': 'Alb-Cre;IL-17RA-flox/flox', 'Model': 'DEN ', 'Treatment': 'DEN_HFD_alcohol'}


In [2]:
# Load the tab-separated expression table; the first column, whose header is
# read as 'Unnamed: 0', becomes the row index.
df_expression = pd.read_csv(expression_file, sep='\t', index_col='Unnamed: 0')
df_expression.head()

Unnamed: 0,dHEP_549_A_T,dHEP_520_A_NT,dHEP_178_NI,F_F_167_NC_T,dHEP_550_A_NT,F_F_158_NC_NT,dHEP_549_A_NT,dHEP_520_A_T,F_F_169_NC_T,F_F_503_A_T1,...,dHEP_164_NC_NT,F_F_523_A_T1,F_F_184_NI,dHEP_155_NC_NT,F_F_159_NC_NT,dHEP_165_NC_T,dHEP_155_NC_T,F_F_168_NC_T,F_F_595_A_T,F_F_581_A_NT
Gnai3,121.51956,89.035979,94.557437,113.223446,98.230254,111.136707,97.474351,132.836892,109.32884,114.096305,...,113.769663,132.89166,109.7287,106.096154,103.304758,192.471416,153.176144,183.460817,152.620839,108.366033
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cdc45,1.958903,1.673062,1.19693,2.630385,1.766455,1.086912,0.898758,2.300206,1.993288,1.984284,...,2.222064,1.612585,0.69351,1.687041,1.777031,1.688346,4.976438,4.1231,1.531535,1.943865
,0.067548,0.072742,0.108812,0.233812,0.153605,0.0,0.0,0.287526,0.241611,0.631363,...,0.0,0.293197,0.077057,0.0,0.0,0.135068,0.238221,0.056104,0.0,0.065916
Apoh,1484.443494,1381.221548,1823.576617,1170.81344,1223.00122,1827.370055,1292.41348,886.585618,1267.127294,1078.638506,...,1497.374679,1145.37511,1898.753434,1567.636285,1727.57068,1453.395562,627.533836,1003.480178,1232.619699,1349.697628


In [3]:
# The column labels of the expression table are the sample names.
focal_samples = df_expression.columns.tolist()
len(focal_samples)

30

In [4]:
# Load the per-sample metadata table and preview its first rows.
df_meta = pd.read_csv(meta_file)
df_meta.head(5)

Unnamed: 0,Sample_name,Sample_Name2,mouse genotype,short_genotype,Model,Treatment,Tissue
0,dHEP_518_A_NT,dHEP-518-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
1,dHEP_520_A_NT,dHEP-520-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
2,dHEP_549_A_NT,dHEP-549-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
3,dHEP_550_A_NT,dHEP-550-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
4,F_F_503_A_NT,F/F-503-A-NT,IL-17RA-flox/flox,IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver


In [5]:
# Restrict the metadata to the samples that also appear as columns of the expression table.
in_expression = df_meta['Sample_name'].isin(focal_samples)
df_meta = df_meta[in_expression]
df_meta.shape

(30, 7)

In [6]:
#df_meta['Model'] == 'DEN'  # this would evaluate to False, because the value is actually 'DEN ' with a trailing space


In [7]:
# Filter the samples: ONLY keep the rows whose metadata equals every control value.
# The comparison is exact, so a control such as 'DEN ' must include its trailing space.
# dict.items() is used because it exists on both Python 2 and Python 3;
# dict.iteritems() is Python 2 only and raises AttributeError on Python 3.
for column, required_value in controls.items():
    df_meta = df_meta[df_meta[column] == required_value]

df_meta


Unnamed: 0,Sample_name,Sample_Name2,mouse genotype,short_genotype,Model,Treatment,Tissue
0,dHEP_518_A_NT,dHEP-518-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
1,dHEP_520_A_NT,dHEP-520-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
2,dHEP_549_A_NT,dHEP-549-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
3,dHEP_550_A_NT,dHEP-550-A-NT,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,Liver
14,dHEP_518_A_T1,dHEP-518-A-T1,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,liver_tumor
15,dHEP_520_A_T,dHEP-520-A-T,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,liver_tumor
16,dHEP_549_A_T,dHEP-549-A-T,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,liver_tumor
17,dHEP_550_A_T,dHEP-550-A-T,Alb-Cre;IL-17RA-flox/flox,AlbCre_IL17RA_floxflox,DEN,DEN_HFD_alcohol,liver_tumor


In [8]:
# Reduce the metadata to the two columns needed downstream: sample name and class label.
samp_to_class = df_meta[['Sample_name', class_name]]

# Drop every sample whose label is neither class_A nor class_B.
labels = samp_to_class[class_name]
in_class_a = labels == class_A
in_class_b = labels == class_B
samp_to_class = samp_to_class[in_class_a | in_class_b]

print(len(samp_to_class))
samp_to_class

8


Unnamed: 0,Sample_name,Tissue
0,dHEP_518_A_NT,Liver
1,dHEP_520_A_NT,Liver
2,dHEP_549_A_NT,Liver
3,dHEP_550_A_NT,Liver
14,dHEP_518_A_T1,liver_tumor
15,dHEP_520_A_T,liver_tumor
16,dHEP_549_A_T,liver_tumor
17,dHEP_550_A_T,liver_tumor


In [9]:
# Keep only the expression columns of the retained samples, in samp_to_class row order.
sample_names = samp_to_class['Sample_name']
real_focal_samples = sample_names.tolist()
df_expression = df_expression[real_focal_samples]
df_expression.shape

(22187, 8)

In [10]:
# Reshape the expression table into the layout handed to gp.gsea:
# a leading 'Name' column of upper-cased gene symbols, then the sample columns,
# on a plain 0..n-1 row index.
#
# The guard makes this cell safe to re-run: once the index has been replaced by row
# numbers, rebuilding 'Name' from it would overwrite the gene symbols with '0', '1', ...
# NOTE(review): rows whose gene symbol is missing are labelled through str(), so they
# do not stay empty (presumably they become 'NAN') -- confirm whether to drop them.
if 'Name' not in df_expression.columns:
    cap_gene = [str(g).upper() for g in df_expression.index.tolist()]  # upper-case the gene symbols
    # assign() returns a new frame, so the column selection made in the previous cell
    # is not mutated in place; the selection then puts 'Name' in front of the samples.
    df_expression = df_expression.assign(Name=cap_gene)[['Name'] + real_focal_samples]
    df_expression.index = range(0, len(df_expression))  # number the rows
df_expression.shape

(22187, 9)

In [11]:
# Class label of each retained sample, in samp_to_class row order.
cls_list = samp_to_class.loc[:, class_name].tolist()
cls_list

['Liver',
 'Liver',
 'Liver',
 'Liver',
 'liver_tumor',
 'liver_tumor',
 'liver_tumor',
 'liver_tumor']

In [12]:
# Run GSEA on the filtered expression table.
# Only the class_name column is passed as labels: the sample names are already the
# column headers of df_expression, in the same order as the rows of samp_to_class.
class_labels = samp_to_class[class_name].tolist()
gs_res = gp.gsea(
    data=df_expression,
    gene_sets=gmt_file,
    cls=class_labels,
    outdir=output_dir,
    method='log2_ratio_of_classes',
    weighted_score_type=1,  # default: 1
    permutation_num=100,    # reduced to speed up test runs
    processes=4,            # default: 1
    format='png',
)



In [2]:
# Access the results dataframe through the res2d attribute and preview its first rows.
gs_res.res2d.head(5)

NameError: name 'gs_res' is not defined

In [1]:
#gp.gsea?

Object `gp.gsea` not found.
