In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import yaml

sns.set_style("darkgrid")
palette = sns.color_palette()

# Define params

- `input_file`: Path (relative to project root) to a csv file containing a table with gene names and ranking metric(s). Input files should be put in the `resources` folder.
  
- `project_name`: A string to tag output files. Results will be saved in `results/{project_name}/some_filename.{project_name}.csv`

- `metrics`: A list of strings specifying the columns in the input table that are used to rank the genes.

In [14]:
#### User-defined variables ####
# Parameter set 1. These names are collected into the configuration dictionary
# that is written to config.yaml further down.

# CSV table with gene names and ranking metric(s); path is relative to the project root.
input_file = "resources/Chiara/edger.lrt.lfc0.KO_WT.p1.csv"
# Tag for output files: results/{project_name}/some_filename.{project_name}.csv
project_name = "test2"

# Columns of the input table used to rank the genes.
metrics = ['logFC', 'neg_signed_logpval']#, 'signed_LR']
# Gene-set libraries and enrichment tools, forwarded to the config unchanged.
libraries = ["KEGG","GO"]
tools = ["clusterProfiler","gseapy"]


# ClusterProfiler

# Identifier type held in the input table's index (it is used as the merge key
# against the gene converter table later in this notebook).
keytype = "ENSEMBL"
# presumably the KEGG organism code -- TODO confirm against the Snakemake rules
organismKEGG = "hsa"

In [13]:
#### User-defined variables ####
# Parameter set 2 (alternative to the parameter cell directly above).
# NOTE(review): this cell assigns the same names as the previous parameter cell.
# On a top-to-bottom run (Restart & Run All) these values win; run only ONE of
# the two parameter cells before creating config.yaml.

# CSV table with gene names and ranking metric(s); path is relative to the project root.
input_file = "resources/Liana/deg.edger.lrt.batch.unm_0.6.clean.clExc7_DL.thresh.0.2.2024-01-22-17-42.P90.p19rc.csv"
# Tag for output files: results/{project_name}/some_filename.{project_name}.csv
project_name = "met.Exc7_DL.P90.p19rc"

# Columns of the input table used to rank the genes.
metrics = ['logFC', 'neg_signed_logpval']
# Gene-set libraries and enrichment tools, forwarded to the config unchanged.
libraries = ["KEGG","GO"]
tools = ["clusterProfiler"]


# ClusterProfiler

# Identifier type held in the input table's index.
# NOTE(review): the R test cells at the end of this notebook hard-code ENSEMBL
# row names and org.Hs.eg.db, so they do not follow these two settings.
keytype = "SYMBOL"
# presumably the KEGG organism code -- TODO confirm against the Snakemake rules
organismKEGG = "mmu"

## Create config.yaml file

In [15]:
# Gather the user-defined parameters under the key names stored in config.yaml
config_data = dict(
    input_file=input_file,
    project_name=project_name,
    metrics=metrics,
    keytype=keytype,
    organismKEGG=organismKEGG,
    libraries=libraries,
    tools=tools,
)

# Serialise the parameters in block style (one key per line) to the project config
config_filename = "../../config/config.yaml"
with open(config_filename, 'w') as file:
    yaml.dump(config_data, stream=file, default_flow_style=False)

# Folder holding this project's results, relative to the notebook
savepath = f"../../results/{project_name}/"

print(f"Configuration file '{config_filename}' created successfully!")

Configuration file '../../config/config.yaml' created successfully!


# Inspect/modify input

This space can be used to calculate further ranking metrics that are missing in the input table, such as $-\mathrm{sign}(\log_2\mathrm{FC})\times\log_{10}(p\mathrm{-value})$

In [16]:
# Load the ranking table (first column becomes the gene index) and derive two
# signed ranking metrics from the logFC direction.
df = pd.read_csv(f"../../{input_file}", index_col=0)

lfc_sign = np.sign(df["logFC"])
# -sign(logFC) * log10(p-value): large positive = strongly up, large negative = strongly down
df["neg_signed_logpval"] = -lfc_sign * np.log10(df["PValue"])
# likelihood-ratio statistic carrying the sign of the fold change
df["signed_LR"] = lfc_sign * df["LR"]

display(df.head())
# Uncomment to persist the added columns back to the input file:
#df.to_csv(f"../../{input_file}")

Unnamed: 0,logFC,logCPM,LR,PValue,FDR,neg_signed_logpval,signed_LR
ENSG00000044524,-3.322779,1.44094,127.429106,1.496523e-29,2.753153e-25,-28.824917,-127.429106
ENSG00000000971,-1.100446,5.60421,113.631694,1.569063e-26,1.443302e-22,-25.80436,-113.631694
ENSG00000154162,-1.168942,3.551626,97.522212,5.325551e-23,3.2658049999999995e-19,-22.273635,-97.522212
ENSG00000134363,1.101229,6.585811,91.486803,1.123389e-21,5.166746e-18,20.94947,91.486803
ENSG00000106278,1.032081,4.456309,84.07941,4.7529499999999995e-20,1.7488e-16,19.323037,84.07941


# Run Snakemake

Run the following command in project root directory:

`snakemake --use-conda --cores 1` (adjust number of cores as needed)

In [17]:
# import subprocess

# cores = 1
# command = f"snakemake -s ../Snakefile --configfile ../../config/config.yaml --use-conda --cores {cores}"
# subprocess.run(command, shell=True, check=True)

# Inspect results

In [18]:
sns.set_style("darkgrid")
palette = sns.color_palette()

# Collect the synthesis tables written by the pipeline for this project.
# - os.path.join avoids a doubled separator (savepath already ends with "/").
# - "syn.*.csv" requires a real ".csv" extension (the old "syn.*csv" pattern
#   matched any name that merely ended in the letters "csv").
# - sorted() makes the listing deterministic; glob returns files in arbitrary order.
output_files = sorted(glob.glob(os.path.join(savepath, "syn.*.csv")))

print(f"Found {len(output_files)} output files:")
print(*output_files, sep="\n")

Found 10 output files:
 ../../results/test2/syn.gseapy.logFC.GO.test2.csv
 ../../results/test2/syn.clusterProfiler.logFC.KEGG.test2.csv
 ../../results/test2/syn.gseapy.neg_signed_logpval.GO.test2.csv
 ../../results/test2/syn.clusterProfiler.logFC.GO.test2.csv
 ../../results/test2/syn.clusterProfiler.neg_signed_logpval.GO.test2.csv
 ../../results/test2/syn.combined.GO.test2.csv
 ../../results/test2/syn.gseapy.neg_signed_logpval.KEGG.test2.csv
 ../../results/test2/syn.clusterProfiler.neg_signed_logpval.KEGG.test2.csv
 ../../results/test2/syn.combined.KEGG.test2.csv
 ../../results/test2/syn.gseapy.logFC.KEGG.test2.csv



## Correlations

In [None]:
# Compare the two ranking metrics side by side:
# left panel  -> raw values with the Pearson correlation,
# right panel -> ranks with the Spearman correlation.
fig, ax = plt.subplots(1, 2, figsize=(10,5))

### Pearson correlation

correlation = df["logFC"].corr(df["neg_signed_logpval"], method='pearson')
sns.regplot(x=df["logFC"], y=df["neg_signed_logpval"], ax=ax[0], scatter_kws={'alpha':0.1}, line_kws={"color":palette[3]})
ax[0].set_title(f"Pearson: {correlation:.2f}")
ax[0].set(xlabel="logFC", ylabel="-sign(logFC)*log10(p-value)")

### Spearman rank correlation

# Ties receive the average of the ranks they span
df['rank_lfc'] = df['logFC'].rank(method='average')
df['rank_nslp'] = df['neg_signed_logpval'].rank(method='average')
rank_correlation = df['rank_lfc'].corr(df['rank_nslp'], method='spearman')

sns.regplot(x=df['rank_lfc'] ,y=df['rank_nslp'], ax=ax[1], scatter_kws={'alpha':0.01}, line_kws={"color":palette[3]})
ax[1].set_title(f"Spearman: {rank_correlation:.2f}")
ax[1].set(xlabel="logFC [Rank]", ylabel="-sign(logFC)*log10(p-value) [Rank]")

# Lay out the figure only after every title and axis label is in place, so the
# spacing accounts for them. It was previously called before the right-hand
# labels were set. Ending on this call also avoids echoing a list repr.
fig.tight_layout()

## Venn diagrams

In [62]:
# Load the combined synthesis table of the first library. The CSV carries a
# three-row column header, hence header=[0,1,2].
lib = libraries[0]

summary_df = pd.read_csv(f"{savepath}/syn.combined.{lib}.{project_name}.csv", index_col=0, header=[0,1,2])
# sort_values returns a new frame; the result must be assigned back, otherwise
# the table stays in file order (the previous code discarded the sorted copy).
summary_df = summary_df.sort_values(by=("Combined","nan","Stouffer FDR"))
summary_df.head()

Tool,clusterProfiler,clusterProfiler,clusterProfiler,clusterProfiler,gseapy,gseapy,gseapy,gseapy,Combined,Combined,Combined,Combined,nan
Metric,logFC,logFC,neg_signed_logpval,neg_signed_logpval,neg_signed_logpval,neg_signed_logpval,logFC,logFC,nan,nan,nan,nan,nan
Value,enrichmentScore,pvalue,enrichmentScore,pvalue,enrichmentScore,pvalue,enrichmentScore,pvalue,enrichmentScore Mean,enrichmentScore SD,Stouffer pvalue,Stouffer FDR,Description
ECM-receptor interaction,0.63086,5.8e-05,0.762691,1e-06,0.762691,0.0,0.63086,0.0,0.696775,0.076113,0.0,0.0,ECM-receptor interaction
AGE-RAGE signaling pathway in diabetic complications,0.584807,3.7e-05,0.640283,2.8e-05,0.640283,0.0,0.584807,0.0,0.612545,0.032029,0.0,0.0,AGE-RAGE signaling pathway in diabetic complic...
Wnt signaling pathway,0.527954,5.6e-05,0.583503,4.2e-05,0.563348,0.0,0.510086,0.0,0.546223,0.03328,0.0,0.0,Wnt signaling pathway
Calcium signaling pathway,0.525658,3.3e-05,0.490488,0.004372,0.490889,0.0,0.519936,0.0,0.506743,0.018685,0.0,0.0,Calcium signaling pathway
Osteoclast differentiation,0.531374,0.000245,0.48497,0.017393,0.471665,0.033898,0.529426,0.0,0.504359,0.030567,0.0,0.0,Osteoclast differentiation


In [41]:
# Normalise the column index of `summary_df` to three named levels
# (Tool, Metric, Value), with "" wherever a level does not apply.

def _blank_placeholder(label):
    """Return "" for a header cell that was empty in the CSV, else the label as text.

    Empty header cells appear either as the text "nan" or as pandas'
    auto-generated "Unnamed: ..." labels.
    """
    text = str(label)
    return "" if text == "nan" or text.startswith("Unnamed:") else text

if summary_df.columns.nlevels == 3:
    # The table was read with a three-row header, so the levels already are
    # (Tool, Metric, Value); only the placeholder labels need blanking.
    # Re-deriving the levels by splitting level 0 on "." (as done for the
    # two-level layout below) would blank every Metric and drop the Value level.
    tuples = [tuple(_blank_placeholder(part) for part in col) for col in summary_df.columns]
else:
    # Two-level header -- presumably an older output format in which level 0
    # is "Tool.Metric" and level 1 is the value name (TODO confirm).
    lvl00 = list(summary_df.columns.get_level_values(level = 0))
    lvl0 = [val.split(".")[0] for val in lvl00]
    lvl1 = [val.split(".")[1] if len(val.split("."))>1 else "" for val in lvl00]
    lvl2 = list(summary_df.columns.get_level_values(level = 1))
    # The last column is the free-text description and belongs to no tool
    lvl0[-1] = ""
    lvl2[-1] = "Description"
    tuples = list(zip(*[lvl0,lvl1,lvl2]))

summary_df.columns = pd.MultiIndex.from_tuples(tuples, names=["Tool","Metric","Value"])

Tool,clusterProfiler,clusterProfiler,clusterProfiler,clusterProfiler,gseapy,gseapy,gseapy,gseapy,Combined,Combined,Combined,Combined,Unnamed: 13_level_0
Metric,logFC,logFC,neg_signed_logpval,neg_signed_logpval,logFC,logFC,neg_signed_logpval,neg_signed_logpval,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Value,enrichmentScore,pvalue,enrichmentScore,pvalue,enrichmentScore,pvalue,enrichmentScore,pvalue,enrichmentScore Mean,enrichmentScore SD,Stouffer pvalue,Stouffer FDR,Description
GO:0030198,0.554510,5.202451e-09,0.649073,1.449042e-11,0.586376,0.000000,0.684476,0.000000,0.618609,0.058919,0.000000e+00,0.000000e+00,"extracellular matrix organization, Extracellul..."
GO:0043062,0.554510,5.202451e-09,0.649073,1.449042e-11,0.652101,0.000000,0.699761,0.000000,0.638861,0.060837,0.000000e+00,0.000000e+00,"extracellular structure organization, Extracel..."
GO:0045229,0.552204,8.286496e-09,0.647655,1.667461e-11,0.652255,0.000000,0.692329,0.000000,0.636111,0.059427,0.000000e+00,0.000000e+00,"external encapsulating structure organization,..."
GO:0062023,0.502956,2.758063e-07,0.559671,1.884857e-07,0.500824,0.000000,0.534577,0.000000,0.524507,0.028067,0.000000e+00,0.000000e+00,"collagen-containing extracellular matrix, Coll..."
GO:0007264,0.440365,5.694347e-07,0.509698,7.819481e-08,0.356630,0.183521,0.476284,0.030948,0.445744,0.065811,5.974988e-11,2.613504e-09,"small GTPase mediated signal transduction, Sma..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
GO:2001185,-0.296376,8.506224e-01,-0.339316,7.430407e-01,,,,,-0.317846,0.030363,,,"regulation of CD8-positive, alpha-beta T cell ..."
GO:2001212,0.692975,6.903353e-02,0.624721,1.988528e-01,,,,,0.658848,0.048263,,,regulation of vasculogenesis
GO:2001251,-0.141052,1.000000e+00,-0.159495,1.000000e+00,,,,,-0.150274,0.013041,,,negative regulation of chromosome organization
GO:2001256,0.359955,6.908397e-01,0.514752,3.532710e-01,,,,,0.437354,0.109459,,,regulation of store-operated calcium entry


In [9]:
# TO DO
from matplotlib_venn import venn2, venn3

def plot_venn(summary_df, metrics):
    """Placeholder for the Venn diagram plot: not implemented yet, always returns None."""
    return None

plot_venn(summary_df, metrics)

NameError: name 'summary_df' is not defined

In [None]:
# Regress the combined Stouffer FDR on the p-values obtained with the
# neg_signed_logpval metric, on log-log axes.
# NOTE(review): this cell has never been executed. Both keys name two labels,
# while the summary columns have three levels (Tool, Metric, Value); the
# selections presumably need an explicit `level=` argument, and the first one
# may match one column per tool -- TODO confirm before relying on this plot.
x=summary_df.xs(("neg_signed_logpval","pvalue"), axis=1)
y=summary_df.xs(("Combined","Stouffer FDR"), axis=1)
sns.regplot(x=x,y=y)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Histogram of the enrichment-score standard deviation in the Combined columns.
# NOTE(review): never executed; with three column levels this two-label key
# presumably also needs the middle (Metric) label -- TODO confirm.
summary_df[("Combined","enrichmentScore SD")].hist()

# Test stuff

## GSEApy

In [54]:

# Build the gene table for GSEApy: attach the converter's identifier columns,
# drop incomplete rows and index the result by gene symbol.
gene_converter = pd.read_csv(f"{savepath}/gene_converter.csv", index_col=0)

tab = (
    pd.read_csv(f"../../{input_file}", index_col=0)
    # expose the index as a regular column so it can serve as the merge key
    .assign(**{keytype: lambda frame: frame.index})
    .merge(gene_converter, how='left', on=keytype)
    # removes every row with a missing value in ANY column
    .dropna(axis=0)
    .set_index("SYMBOL")
)
tab

Unnamed: 0_level_0,logFC,logCPM,LR,PValue,FDR,neg_signed_logpval,signed_LR,ENSEMBL,ENTREZID
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EPHA3,-3.322779,1.440940,1.274291e+02,1.496523e-29,2.753153e-25,-28.824917,-1.274291e+02,ENSG00000044524,2042.0
CFH,-1.100446,5.604210,1.136317e+02,1.569063e-26,1.443302e-22,-25.804360,-1.136317e+02,ENSG00000000971,3075.0
CDH12,-1.168942,3.551626,9.752221e+01,5.325551e-23,3.265805e-19,-22.273635,-9.752221e+01,ENSG00000154162,1010.0
FST,1.101229,6.585811,9.148680e+01,1.123389e-21,5.166746e-18,20.949470,9.148680e+01,ENSG00000134363,10468.0
PTPRZ1,1.032081,4.456309,8.407941e+01,4.752950e-20,1.748800e-16,19.323037,8.407941e+01,ENSG00000106278,5803.0
...,...,...,...,...,...,...,...,...,...
YIPF2,-0.000072,5.417572,2.973946e-07,9.995649e-01,9.997277e-01,-0.000189,-2.973946e-07,ENSG00000130733,78992.0
B3GNT4,0.000092,1.602074,2.302643e-07,9.996171e-01,9.997277e-01,0.000166,2.302643e-07,ENSG00000176383,79369.0
APH1A,0.000040,6.914367,2.114103e-07,9.996331e-01,9.997277e-01,0.000159,2.114103e-07,ENSG00000117362,51107.0
HDAC5,-0.000081,4.369544,1.676334e-07,9.996733e-01,9.997277e-01,-0.000142,-1.676334e-07,ENSG00000108840,10014.0


In [63]:
# Count how often each value of the first ranking metric occurs, to spot ties
# (identical values end up sharing a rank). In the saved output a single logFC
# value occurs 1662 times.
print(metrics[0])
tab[metrics[0]].value_counts()

logFC


logFC
-0.112030    1662
-0.582420      11
-0.106530       7
-0.520863       5
-1.709634       5
             ... 
 0.510178       1
-0.500169       1
 0.454181       1
-0.089897       1
-0.114244       1
Name: count, Length: 15067, dtype: int64

## ClusterProfiler

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
source("../../.Rprofile")
.libPaths()

In [None]:
%%R -i input_file
library(clusterProfiler)
library(org.Hs.eg.db)

In [None]:
%%R

# Read the ranking table (first column -> row names) and make sure that every
# requested metric exists as a column, deriving it when possible.
filepath <- paste0("../../",input_file)

metrics = c("neg_signed_logpval","logFC")

df <- read.csv(filepath, row.names = 1)

for (metric in metrics) {
    # Nothing to do when the column is already present
    if (metric %in% colnames(df)) next

    # Only neg_signed_logpval can be derived from the other columns
    if (metric != "neg_signed_logpval") {
        stop(paste("Metric", metric, "not in columns!"))
    }

    message(paste("Adding", metric, "to df"))
    df$neg_signed_logpval <- -sign(df$logFC) * log10(df$PValue)
}

head(df)

In [None]:
%%R

# Run clusterProfiler GSEA (GO and KEGG) on one ranking metric and write the
# results to CSV files in `savepath`.
#
# Arguments:
#   df            data frame with a column named `metric` and an ENTREZID column
#   savepath      output directory
#   paramset      tag that becomes part of the output file names
#   metric        name of the column in `df` used to rank the genes
#   cluster       not used in the function body; kept so the signature is unchanged
#   overwrite     if FALSE, an existing output file is not recomputed
#   organism.KEGG organism passed to gseKEGG
#   organism.GO   OrgDb annotation object passed to gseGO
#   seed          value passed to set.seed() before the analyses
#
# Returns NULL (invisibly) when both output files already exist and
# overwrite is FALSE; otherwise the elapsed time that is printed at the end.
run_clusterProfiler <- function(df, savepath, paramset,
                                metric, cluster, overwrite=FALSE, 
                                organism.KEGG="hsa",
                                organism.GO = org.Hs.eg.db, seed=123) 
{
  set.seed(seed)

  outfile_go <- paste0(savepath,"/cluster.gseGO.",metric,".",paramset,".csv")
  outfile_kegg <- paste0(savepath,"/cluster.gseKEGG.",metric,".",paramset,".csv")
  print(outfile_go)
  print(outfile_kegg)

  if (file.exists(outfile_go) && file.exists(outfile_kegg) && !overwrite) {
    print("Existing files not overwritte, skipping")
    # `return` is a function in R and has to be CALLED to leave the function.
    # A bare `return` only evaluates the symbol, so execution used to fall
    # through to the code below.
    return(invisible(NULL))
  }

  start_time <- Sys.time()

  # Named numeric vector (names = ENTREZ IDs), sorted in decreasing order
  geneList <- df[[metric]]
  names(geneList) <- df$ENTREZID
  geneList = sort(geneList, decreasing = TRUE)

  if (!file.exists(outfile_go) || overwrite) {

    # pvalueCutoff = 1 keeps every tested gene set in the output
    ego3 <- gseGO(geneList     = geneList,
                  OrgDb        = organism.GO,
                  ont          = "ALL", ## CC MF BP
                  minGSSize    = 10,
                  maxGSSize    = 500,
                  pvalueCutoff = 1,
                  eps = 0,
                  seed = TRUE,
                  verbose = FALSE)
    write.csv(ego3,outfile_go)
  }

  if (!file.exists(outfile_kegg) || overwrite) {

    kegg <- gseKEGG(geneList     = geneList,
                  organism        =  organism.KEGG,
                  minGSSize    = 10,
                  maxGSSize    = 500,
                  pvalueCutoff = 1,
                  eps = 0,
                  seed = TRUE,
                  verbose = FALSE)
    write.csv(kegg,outfile_kegg)
  }

  end_time <- Sys.time()
  print(end_time - start_time)
}

# Add an ENTREZID column to `df` by mapping its row names (treated as ENSEMBL
# IDs) through `OrgDb`. A data frame that already has ENTREZID is returned
# unchanged. Rows containing any NA after the merge are dropped, so genes
# without a mapping are lost; the row counts before/after are printed.
convert_df <- function(df, OrgDb=org.Hs.eg.db) {

  # Already converted: nothing to do
  if (is.element("ENTREZID", names(df))) {
    return(df)
  }

  ensembl_ids <- row.names(df)
  df$ENSEMBL <- ensembl_ids

  # ENSEMBL -> ENTREZ lookup table
  id_map <- bitr(ensembl_ids, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb=OrgDb)

  # Left join keeps every input gene; unmapped ones get NA as ENTREZID
  merged <- merge(df, id_map, by = "ENSEMBL", all.x = TRUE)
  print(paste("Before",nrow(merged)))

  complete <- na.omit(merged)
  print(paste("After",nrow(complete)))

  return(complete)
}

In [None]:
%%R
# Map the table's ENSEMBL row names to ENTREZ IDs with the human annotation
# database, then preview the result.
df <- convert_df(df, OrgDb=org.Hs.eg.db)
head(df)

In [None]:
%%R -i project_name
# Trial run of the clusterProfiler wrapper on one metric; outputs are tagged
# "test" and existing result files are kept.
savepath <- paste0("../../results/",project_name)
paramset <- "test"
metric <- "neg_signed_logpval"

# Returns df unchanged when the ENTREZID column is already present
df <- convert_df(df, OrgDb=org.Hs.eg.db)

run_clusterProfiler(df            = df,
                    savepath      = savepath,
                    paramset      = paramset,
                    metric        = metric,
                    overwrite     = FALSE,
                    organism.KEGG = "hsa",
                    organism.GO   = org.Hs.eg.db)