In [21]:
import pandas as pd
import gseapy as gp

gmt_file = '/Users/Miko/Desktop/CCBB/Network/geneset.gmt'
csv_file = '/Users/Miko/Desktop/CCBB/Network/DE_ALCvsCHOWinNTinAlbCre_30_Liver.csv'
output_dir = '/Users/Miko/Desktop/CCBB/Network/ssgsea_weight=1.5'


### Read in csv file

In [2]:
# read in csv file
DE_df = pd.read_csv(csv_file)
DE_df.head()

Unnamed: 0.1,Unnamed: 0,ENSEMBL,ENTREZID,SYMBOL,logFC,AveExpr,t,P.Value,adj.P.Val,B
0,ENSMUSG00000020572,ENSMUSG00000020572,59027.0,Nampt,1.125802,6.422679,9.23413,2.848587e-07,0.002271,7.163066
1,ENSMUSG00000039157,ENSMUSG00000039157,98952.0,Fam102a,1.24693,5.07999,8.650189,6.184195e-07,0.002271,6.391317
2,ENSMUSG00000025277,ENSMUSG00000025277,66082.0,Abhd6,0.926458,5.17392,8.577569,6.827854e-07,0.002271,6.312529
3,ENSMUSG00000028327,ENSMUSG00000028327,74152.0,Stra6l,-0.94438,7.534265,-8.528602,7.301721e-07,0.002271,6.280035
4,ENSMUSG00000059743,ENSMUSG00000059743,110196.0,Fdps,-2.847604,6.29879,-8.342241,9.449679e-07,0.002271,6.023082


### Get column (yield a series)

In [3]:
DE_df['logFC'].head()

0    1.125802
1    1.246930
2    0.926458
3   -0.944380
4   -2.847604
Name: logFC, dtype: float64

### Set the symbols as indices

In [4]:
DE_df.index = DE_df['SYMBOL'].tolist() # convert the series to a list
DE_df.head()

Unnamed: 0.1,Unnamed: 0,ENSEMBL,ENTREZID,SYMBOL,logFC,AveExpr,t,P.Value,adj.P.Val,B
Nampt,ENSMUSG00000020572,ENSMUSG00000020572,59027.0,Nampt,1.125802,6.422679,9.23413,2.848587e-07,0.002271,7.163066
Fam102a,ENSMUSG00000039157,ENSMUSG00000039157,98952.0,Fam102a,1.24693,5.07999,8.650189,6.184195e-07,0.002271,6.391317
Abhd6,ENSMUSG00000025277,ENSMUSG00000025277,66082.0,Abhd6,0.926458,5.17392,8.577569,6.827854e-07,0.002271,6.312529
Stra6l,ENSMUSG00000028327,ENSMUSG00000028327,74152.0,Stra6l,-0.94438,7.534265,-8.528602,7.301721e-07,0.002271,6.280035
Fdps,ENSMUSG00000059743,ENSMUSG00000059743,110196.0,Fdps,-2.847604,6.29879,-8.342241,9.449679e-07,0.002271,6.023082


### Get row

In [5]:
DE_df.loc['Fdps']

Unnamed: 0    ENSMUSG00000059743
ENSEMBL       ENSMUSG00000059743
ENTREZID                  110196
SYMBOL                      Fdps
logFC                    -2.8476
AveExpr                  6.29879
t                       -8.34224
P.Value              9.44968e-07
adj.P.Val             0.00227149
B                        6.02308
Name: Fdps, dtype: object

### Extract a sub dataframe rank_df from DE_df

In [6]:
rank_df = DE_df[['SYMBOL','logFC']]
rank_df.head()

Unnamed: 0,SYMBOL,logFC
Nampt,Nampt,1.125802
Fam102a,Fam102a,1.24693
Abhd6,Abhd6,0.926458
Stra6l,Stra6l,-0.94438
Fdps,Fdps,-2.847604


In [7]:
rank_df = rank_df.sort_values('logFC')
rank_df.head()

Unnamed: 0,SYMBOL,logFC
,,-4.984858
,,-4.678138
Chrna4,Chrna4,-4.580266
,,-3.993394
Hist1h4n,Hist1h4n,-3.742476


In [8]:
rank_df = rank_df.dropna()
rank_df.head()

Unnamed: 0,SYMBOL,logFC
Chrna4,Chrna4,-4.580266
Hist1h4n,Hist1h4n,-3.742476
Gm14434,Gm14434,-3.676823
Idi1,Idi1,-3.316795
Zfp968,Zfp968,-3.223627


### Turn the indices back to numbers in rank_df

In [9]:
rank_df.index = range(len(rank_df))  # get number of rows in rank_df
rank_df.head()

Unnamed: 0,SYMBOL,logFC
0,Chrna4,-4.580266
1,Hist1h4n,-3.742476
2,Gm14434,-3.676823
3,Idi1,-3.316795
4,Zfp968,-3.223627


In [10]:
# list conprehension: set symbols to UPPER CASE 
UC_symbols = [gene.upper() for gene in rank_df['SYMBOL'].tolist()]  # cast to a list
UC_symbols[0:5]


['CHRNA4', 'HIST1H4N', 'GM14434', 'IDI1', 'ZFP968']

In [11]:
rank_df['SYMBOL'] = UC_symbols
rank_df.head()

Unnamed: 0,SYMBOL,logFC
0,CHRNA4,-4.580266
1,HIST1H4N,-3.742476
2,GM14434,-3.676823
3,IDI1,-3.316795
4,ZFP968,-3.223627


In [14]:
pre_res = gp.prerank(rnk=rank_df,
                     gene_sets=gmt_file,
                     permutation_num=100, # reduce number to speed up test
                     weighted_score_type=0, # default was 1
                     outdir=output_dir,
                     format='png')

2018-01-30 10:23:27,753 Input gene rankings contains duplicated IDs, Only use the duplicated ID with highest value!


In [19]:
print(pre_res)

<gseapy.gsea.Prerank object at 0x10c7c6a10>


In [22]:
ssgsea_1 = gp.ssgsea(data=rank_df,
                     gene_sets=gmt_file,
                     weighted_score_type=1.5, # default was 0.25
                     outdir=output_dir,
                     format='png')

