In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import yaml

# Plot styling shared by the figures in this notebook.
sns.set_style("darkgrid")
# Current seaborn colour palette; palette[3] is used for the regression lines further down.
palette = sns.color_palette()
# Switch on pandas' copy-on-write mode for the whole session.
pd.options.mode.copy_on_write = True

# Define params

- `input_file`: Path (relative to project root) to a csv file containing a table with gene names and ranking metric(s). Input files should be put in the `resources` folder.
  
- `project_name`: A string to tag output files. Results will be saved in `results/{project_name}/some_filename.{project_name}.csv`

- `metrics`: A list of strings specifying the columns in the input table that are used to rank the genes.

In [None]:
#### User-defined variables ####
# NOTE: when the notebook is run top to bottom, every variable assigned in this
# cell is reassigned by the following "User-defined variables" cell, so the
# values below only take effect if that cell is skipped.

# Input table (path relative to the project root) and the tag used in output paths.
input_file = "resources/Chiara/edger.lrt.lfc0.KO_WT.p1.csv"
project_name = "test2"

# Ranking columns, plus the libraries and tools that are written to the config file.
metrics = ['logFC', 'neg_signed_logpval']#, 'signed_LR']
libraries = ["KEGG","GO"]
tools = ["clusterProfiler","gseapy"]


# ClusterProfiler

# presumably the gene-ID type of the input table and the KEGG organism code — confirm against the workflow rules
keytype = "ENSEMBL"
organismKEGG = "hsa"

In [None]:
#### User-defined variables ####
# NOTE: this cell assigns the same variables as the preceding
# "User-defined variables" cell; whichever of the two runs last wins.

# Input table (path relative to the project root) and the tag used in output paths.
input_file = "resources/Liana/deg.edger.lrt.batch.unm_0.6.clean.clExc7_DL.thresh.0.2.2024-01-22-17-42.P90.p19rc.csv"
project_name = "met.Exc7_DL.P90.p19rc"

# Ranking columns, plus the libraries and tools that are written to the config file.
metrics = ['logFC', 'neg_signed_logpval']
libraries = ["KEGG","GO"]
tools = ["clusterProfiler","gseapy","string"]


# ClusterProfiler

# presumably the gene-ID type of the input table and the KEGG organism code — confirm against the workflow rules
keytype = "SYMBOL"
organismKEGG = "mmu"

## Create config.yaml file

In [None]:
# Gather the user-defined variables into the mapping that becomes config.yaml.
config_data = dict(
    input_file=input_file,
    project_name=project_name,
    metrics=metrics,
    keytype=keytype,
    organismKEGG=organismKEGG,
    libraries=libraries,
    tools=tools,
)

# Serialise to block-style YAML and write it to the workflow's config file.
config_filename = "../../config/config.yaml"
with open(config_filename, 'w') as file:
    file.write(yaml.dump(config_data, default_flow_style=False))

print(f"Configuration file '{config_filename}' created successfully!")

# Results folder for this project; used by the inspection cells below.
savepath = f"../../results/{project_name}/"

# Inspect/modify input

This space can be used to calculate further ranking metrics that are missing in the input table, such as $-\mathrm{sign}(\log_2\mathrm{FC})\times\log_{10}(p\mathrm{-value})$

In [None]:
# Load the input table; input_file is prefixed with "../../" to reach it from here.
df = pd.read_csv(f"../../{input_file}", index_col=0)
# Ranking metric -sign(logFC) * log10(p-value).
# NOTE(review): a PValue of exactly 0 would give an infinite score — confirm the input contains none.
df["neg_signed_logpval"] = -np.sign(df["logFC"]) * np.log10(df["PValue"])
# "LR" column multiplied by the sign of logFC; assumes the input table has an "LR" column — TODO confirm
df["signed_LR"] = np.sign(df["logFC"]) * df["LR"]
display(df.head())
# Uncomment to write the added columns back to the input file.
#df.to_csv(f"../../{input_file}")

# Run Snakemake

Run the following command in project root directory:

`snakemake --use-conda --cores 1` (adjust number of cores as needed)

In [None]:
# import subprocess

# cores = 1
# command = f"snakemake -s ../Snakefile --configfile ../../config/config.yaml --use-conda --cores {cores}"
# subprocess.run(command, shell=True, check=True)

# Inspect results

In [None]:
# Same plot styling as in the first cell of the notebook.
sns.set_style("darkgrid")
palette = sns.color_palette()

# Collect every file in the results folder whose name starts with "syn." and ends in "tsv" or "csv".
output_files = glob.glob(f"{savepath}/syn.*[tc]sv")
# One file per line below the count.
print(f"Found {len(output_files)} output files:\n",*[o+"\n" for o in output_files])

## Correlations

In [None]:
# Two panels: raw values (Pearson) on the left, ranks (Spearman) on the right.
fig, ax = plt.subplots(1, 2, figsize=(10,5))

### Pearson correlation

correlation = df["logFC"].corr(df["neg_signed_logpval"], method='pearson')
sns.regplot(x=df["logFC"], y=df["neg_signed_logpval"], ax=ax[0], scatter_kws={'alpha':0.1}, line_kws={"color":palette[3]})
ax[0].set_title(f"Pearson: {correlation:.2f}")
ax[0].set(xlabel="logFC", ylabel="-sign(logFC)*log10(p-value)")

### Spearman rank correlation

# The rank columns are stored on df so the scatter plot can show ranks directly.
df['rank_lfc'] = df['logFC'].rank(method='average')
df['rank_nslp'] = df['neg_signed_logpval'].rank(method='average')
rank_correlation = df['rank_lfc'].corr(df['rank_nslp'], method='spearman')

sns.regplot(x=df['rank_lfc'] ,y=df['rank_nslp'], ax=ax[1], scatter_kws={'alpha':0.01}, line_kws={"color":palette[3]})
ax[1].set_title(f"Spearman: {rank_correlation:.2f}")
ax[1].set(xlabel="logFC [Rank]", ylabel="-sign(logFC)*log10(p-value) [Rank]")

# Lay out the figure only after all titles and axis labels exist, so that the
# spacing accounts for them (previously this ran before the right-hand labels were set).
fig.tight_layout()

## Venn diagrams

In [None]:
# Pick the library whose combined summary table should be inspected.
lib = libraries[1]
print(lib)

# The combined table is read with a three-row column header (MultiIndex columns).
summary_df = pd.read_csv(f"{savepath}/syn.combined.{lib}.{project_name}.csv", index_col=0, header=[0,1,2])
# sort_values returns a new frame; assign it back so the table really is sorted
# (the result used to be discarded).
summary_df = summary_df.sort_values(by=("Combined","nan","Stouffer FDR"))
print(len(summary_df))
summary_df.head()

In [None]:
# TO DO
from matplotlib_venn import venn2, venn3

def plot_venn(summary_df, metrics):
    """Placeholder for the Venn-diagram plot; not implemented yet, returns None."""
    return None

# plot_venn is still a stub, so this call currently draws nothing.
plot_venn(summary_df, metrics)

In [None]:
# Regression plot of the neg_signed_logpval p-values against the combined Stouffer FDR, on log-log axes.
# NOTE(review): summary_df is read with a three-level column header, but the keys
# below have only two elements — confirm they select the intended columns.
x=summary_df.xs(("neg_signed_logpval","pvalue"), axis=1)
y=summary_df.xs(("Combined","Stouffer FDR"), axis=1)
sns.regplot(x=x,y=y)
plt.xscale("log")
plt.yscale("log")

In [None]:
# Histogram of the "enrichmentScore SD" values under "Combined".
# NOTE(review): two-element key on a three-level column header — confirm it selects the intended column(s).
summary_df[("Combined","enrichmentScore SD")].hist()

# Format STRING table

Users can (optionally) manually add STRING functional scoring output tables to the results folder, and they will be combined with the output from SynEnrich. For this, the STRING tables have to be formatted first.

In [None]:
def format_string_table(df: pd.DataFrame, library: str) -> pd.DataFrame:
    """
    Format a table of STRING database functional scoring results (proteins with
    values/ranks) so that it looks closer to a ClusterProfiler table.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Raw STRING table whose index holds the term category (rows are selected
        by index prefix "GO " or "KEGG"). Must contain the columns
        "term ID", "term description", "enrichment score",
        "false discovery rate" and "direction".
    library : str
        Which library to extract: "GO" or "KEGG".

    Returns
    -------
    pd.DataFrame
        Rows of the requested library, indexed by "ID", with the columns renamed
        to "enrichmentScore", "Description" and "qvalue", plus an "ONTOLOGY"
        column (BP/MF/CC for GO, the original category otherwise) and a
        "pvalue" column copied from "qvalue".

    Raises
    ------
    ValueError
        If `library` is neither "GO" nor "KEGG".
    """
    go_ontology_codes = {"GO Process": "BP", "GO Function": "MF", "GO Component": "CC"}
    # STRING sorts values from negative to positive, hence "top" means
    # downregulated; flip the sign accordingly. Any other direction maps to 0.
    direction_sign = {"top": -1, "bottom": 1}

    if library == "GO":
        prefix = "GO "
    elif library == "KEGG":
        prefix = "KEGG"
    else:
        raise ValueError(f"Unknown library: {library}")

    # Work on a filtered copy so the caller's frame stays untouched and the
    # function can be called repeatedly on the same input.
    out = df[df.index.str.startswith(prefix)].copy()
    out["ONTOLOGY"] = out.index
    if library == "GO":
        out["ONTOLOGY"] = out["ONTOLOGY"].replace(go_ontology_codes)

    out = out.rename(columns={"enrichment score": "enrichmentScore",
                              "term description": "Description",
                              "term ID": "ID",
                              "false discovery rate": "qvalue"}).set_index("ID")

    out["pvalue"] = out["qvalue"] # dubious but STRING doesn't save pvalues...

    sign = out["direction"].map(direction_sign).fillna(0)
    out["enrichmentScore"] = out["enrichmentScore"] * sign

    return out

In [None]:
# Raw STRING export for the logFC ranking of this project.
string_file = f"{savepath}/syn.string.logFC.{project_name}.tsv"
string = pd.read_csv(string_file, sep="\t", index_col=0)

# Write one formatted CSV per library next to the raw table, with the library
# name inserted in front of the project tag.
tsv_suffix = f"{project_name}.tsv"
for library in ("GO", "KEGG"):
    string_formatted = format_string_table(string, library)
    display(string_formatted.head())
    formatted_file = string_file.replace(tsv_suffix, f"{library}.{project_name}.csv")
    string_formatted.to_csv(formatted_file)

# Test stuff

## ClusterProfiler

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
# Source the project's .Rprofile, then show the library paths R is using.
source("../../.Rprofile")
.libPaths()

In [None]:
%%R -i input_file
library(clusterProfiler)
library(org.Hs.eg.db)

In [None]:
%%R

# Read the input table; input_file was passed in from Python by an earlier cell.
filepath <- paste0("../../", input_file)

metrics <- c("neg_signed_logpval", "logFC")

df <- read.csv(filepath, row.names = 1)

# Make sure every ranking metric is available as a column. Only
# neg_signed_logpval can be derived here; any other missing metric is an error.
for (metric in metrics) {
    if (metric %in% colnames(df)) {
        next
    }
    if (metric != "neg_signed_logpval") {
        stop(paste("Metric", metric, "not in columns!"))
    }
    message(paste("Adding", metric, "to df"))
    df$neg_signed_logpval <- -sign(df$logFC) * log10(df$PValue)
}

head(df)

In [None]:
%%R

# Run clusterProfiler GSEA (gseGO and gseKEGG) on one ranking metric and write
# the results to CSV.
#
# Arguments:
#   df            data frame with an ENTREZID column and a column named `metric`
#   savepath      output directory (created if it does not exist)
#   paramset      tag appended to the output file names
#   metric        name of the column in `df` used to rank the genes
#   cluster       unused in this function; kept so existing calls keep working
#   overwrite     if FALSE, result files that already exist are not recomputed
#   organism.KEGG organism passed to gseKEGG
#   organism.GO   OrgDb annotation object passed to gseGO
#   seed          value passed to set.seed() before running
#
# Output files: <savepath>/cluster.gseGO.<metric>.<paramset>.csv and
#               <savepath>/cluster.gseKEGG.<metric>.<paramset>.csv
run_clusterProfiler <- function(df, savepath, paramset,
                                metric, cluster, overwrite=FALSE,
                                organism.KEGG="hsa",
                                organism.GO = org.Hs.eg.db, seed=123)
{
  set.seed(seed)

  outfile_go <- paste0(savepath,"/cluster.gseGO.",metric,".",paramset,".csv")
  outfile_kegg <- paste0(savepath,"/cluster.gseKEGG.",metric,".",paramset,".csv")
  print(outfile_go)
  print(outfile_kegg)

  if (file.exists(outfile_go) && file.exists(outfile_kegg) && !overwrite) {
    print("Existing files not overwritte, skipping")
    # In R, `return` must be called: a bare `return` only references the
    # function object and execution would continue past this point.
    return(invisible(NULL))
  }

  # Create the output directory up front so the long-running analysis below
  # cannot fail at the very end when writing its results.
  dir.create(savepath, recursive = TRUE, showWarnings = FALSE)

  start_time <- Sys.time()

  # Named, decreasingly sorted vector of scores keyed by ENTREZ ID.
  geneList <- df[[metric]]
  names(geneList) <- df$ENTREZID
  geneList <- sort(geneList, decreasing = TRUE)

  if (!file.exists(outfile_go) || overwrite) {

    ego3 <- gseGO(geneList     = geneList,
                  OrgDb        = organism.GO,
                  ont          = "ALL", ## CC MF BP
                  minGSSize    = 10,
                  maxGSSize    = 500,
                  pvalueCutoff = 1,  # keep every term, no significance filter
                  eps = 0,
                  seed = TRUE,
                  verbose = FALSE)
    write.csv(ego3,outfile_go)
  }

  if (!file.exists(outfile_kegg) || overwrite) {

    kegg <- gseKEGG(geneList     = geneList,
                  organism        =  organism.KEGG,
                  minGSSize    = 10,
                  maxGSSize    = 500,
                  pvalueCutoff = 1,  # keep every pathway, no significance filter
                  eps = 0,
                  seed = TRUE,
                  verbose = FALSE)
    write.csv(kegg,outfile_kegg)
  }

  end_time <- Sys.time()
  print(end_time - start_time)
}

# Add an ENTREZID column to `df` by translating its row names (ENSEMBL IDs)
# with bitr(). A frame that already has an ENTREZID column is returned as is.
# Prints the row count before and after dropping incomplete rows.
convert_df <- function(df, OrgDb=org.Hs.eg.db) {

  already_converted <- "ENTREZID" %in% names(df)
  if (already_converted) {
    return(df)
  }

  ensembl_ids <- row.names(df)
  df$ENSEMBL <- ensembl_ids

  # Convert to ENTREZ ID.
  # Some genes are lost here because not every ID can be converted.
  id_map <- bitr(ensembl_ids, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = OrgDb)
  merged <- merge(df, id_map, by = "ENSEMBL", all.x = TRUE)

  print(paste("Before", nrow(merged)))
  merged <- na.omit(merged)
  print(paste("After", nrow(merged)))

  return(merged)
}

In [None]:
%%R
# Add ENTREZ IDs using the human annotation package loaded above, then preview the result.
df <- convert_df(df, OrgDb=org.Hs.eg.db)
head(df)

In [None]:
%%R -i project_name
# Manual test run for a single metric. The organism settings are hard-coded
# here and are not taken from the Python configuration cells.
savepath <- paste0("../../results/", project_name)
paramset <- "test"
metric <- "neg_signed_logpval"

df <- convert_df(df, OrgDb = org.Hs.eg.db)

run_clusterProfiler(df, savepath, paramset, metric,
                    overwrite = FALSE,
                    organism.KEGG = "hsa",
                    organism.GO = org.Hs.eg.db)