In [None]:
import itertools
import os
import sys
import warnings

import gseapy as gp
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

# ---- Input / output locations ------------------------------------------
# Gene-set definitions (GMT), expression matrix (TSV), sample metadata (CSV)
# and the root folder under which per-run result folders are created.
gmt_file = '/Users/Miko/Desktop/CCBB/Network/gmt/c2.cp.v6.1.symbols.gmt'
expression_file = '/Users/Miko/Desktop/CCBB/Network/mouse_liver/RNA_cpm_all_samples.tsv'
meta_file = "/Users/Miko/Desktop/CCBB/Network/mouse_liver/dHEP_metadata.csv"
general_output_dir = '/Users/Miko/Desktop/CCBB/Network/output/rep_perm'

# ---- Comparison definition ---------------------------------------------
# ONLY compare the samples that differ by class_name,
# while keeping all other parameters the same.
class_name = 'Tissue'  #'Treatment'
class_A = 'Liver'  #'DEN_HFD_alcohol'
class_B = 'liver_tumor'  #'DEN_only'

# User-supplied controls: metadata column -> value that every retained
# sample must have (attributes that remain constant across both classes).
controls = {'mouse genotype': 'Alb-Cre;IL-17RA-flox/flox',
            'Model': 'DEN ',  ### There is a space at the end
            'Treatment': 'DEN_HFD_alcohol'}
# Function-call form prints the same text on Python 2 and Python 3; the
# bare `print controls` statement is a SyntaxError on Python 3.
print(controls)

In [None]:
def run_gsea(perm_num, i):
    """Run one GSEA replicate and write its output to its own folder.

    The expression table (``expression_file``) and the sample metadata
    (``meta_file``) are read from disk on every call.  Samples are kept only
    if they appear in both files, match every key/value pair in ``controls``
    and have ``class_A`` or ``class_B`` in the ``class_name`` metadata column.
    The filtered matrix is handed to ``gp.gsea`` together with ``gmt_file``.

    Parameters
    ----------
    perm_num : int
        Forwarded to ``gp.gsea`` as ``permutation_num``; also names the
        first-level output sub-directory.
    i : int
        Replicate index; only used to name the second-level output
        sub-directory.

    Returns
    -------
    object
        Whatever ``gp.gsea`` returns for this run.

    Raises
    ------
    ValueError
        If no sample survives the metadata filters.
    """
    # Results go to <general_output_dir>/<perm_num>/<i>.  makedirs creates any
    # missing parent; the isdir guard makes re-runs harmless and works on both
    # Python 2 and Python 3 (no exist_ok needed).
    output_dir = os.path.join(general_output_dir, str(perm_num), str(i))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    df_expression = pd.read_table(expression_file, index_col='Unnamed: 0')
    focal_samples = list(df_expression)  # column labels of the expression table

    df_meta = pd.read_csv(meta_file)

    # Keep only the metadata rows whose sample is in the expression file.
    df_meta = df_meta[df_meta['Sample_name'].isin(focal_samples)]

    # Keep only the samples that match every control exactly.  Values are
    # compared verbatim, so e.g. 'DEN ' (trailing space) is not 'DEN'.
    for key, value in controls.items():
        df_meta = df_meta[df_meta[key] == value]

    # Sample name + class label, restricted to the two classes being compared.
    samp_to_class = df_meta[['Sample_name', class_name]]
    samp_to_class = samp_to_class[(samp_to_class[class_name] == class_A) |
                                  (samp_to_class[class_name] == class_B)]

    # Both lists come from the same frame, so labels stay aligned with the
    # order of the expression columns selected below.
    real_focal_samples = samp_to_class['Sample_name'].tolist()
    cls_list = samp_to_class[class_name].tolist()

    if not real_focal_samples:
        raise ValueError(
            "No samples left after filtering on controls %r and classes %r / %r"
            % (controls, class_A, class_B))

    # Copy the column subset so adding 'Name' never writes into a view of the
    # table that was read from disk.
    df_expression = df_expression[real_focal_samples].copy()
    df_expression['Name'] = [str(g).upper() for g in df_expression.index.tolist()]  # upper-case gene IDs
    df_expression = df_expression[['Name'] + real_focal_samples]  # 'Name' column first
    df_expression.index = range(0, len(df_expression))  # plain 0..n-1 row labels

    gs_res = gp.gsea(data=df_expression,
                     gene_sets=gmt_file,
                     cls=cls_list,  # class labels only; sample names are the data columns
                     permutation_num=perm_num,  # reduce number to speed up test
                     weighted_score_type=1,  # default: 1
                     outdir=output_dir,
                     method='log2_ratio_of_classes',
                     processes=4,  ## 1 is default
                     format='png')
    return gs_res

In [None]:
# Permutation counts to try.
# Alternative list kept for reference: [5, 10, 50, 100, 500, 600, 1000]
perm_num_all = [8, 12, 15]

# Replicate indices run for each permutation count.
i_all = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Visit every (perm_num, i) pair, permutation count outermost, so each
# count is run for all replicates before moving on to the next one.
for perm_num in perm_num_all:
    for i in i_all:
        print(perm_num, i)
        print("===========")
        run_gsea(perm_num, i)

In [None]:
#access the dataframe results through the res2d attribute
#gs_res.res2d.head()

In [None]:
#gp.gsea?