# Benchmark 2.0 

- This time, we are using a new database, SLKB, which provides higher-quality, more uniform data
- We are integrating the Wilcoxon displacement estimate into the labeling of our positive results (this should already have been done in the previous benchmark)
- We will use no negative samples, instead relying on positive vs unlabeled samples
- Positive samples will be benchmarked against interactions found in at least n different cell lines
- SLKB uses several different scoring systems, whose results overlap only slightly.
  
- We will evaluate the overlap with each scoring system separately and at different thresholds: we draw a heatmap with one column per scoring system and one row per score threshold, where each cell is colored according to the precision of the model.
- This heatmap can be redrawn for different p-value thresholds and numbers of cell lines

In [1]:
import os
import gdown
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter

In [2]:
# This cell downloads the SLKB files needed by the rest of the notebook.
# The download links come from slkb.osubmi.org.
# NOTE(review): the URL below contains a "/session/<id>/" segment, so it looks
# tied to one browsing session and may stop working - confirm, and refresh the
# link from the SLKB website if the download fails.

file_urls = {
    # SLKB files, obtained from slkb.osubmi.org
    "inputs/SLKB_predictions.xlsx": "https://slkb.osubmi.org/session/09c5c4738990db9810156682c87de6b8/download/download_data-predSL?w="
}

# Create the directory of every target file if it doesn't exist
# (derived from the paths above rather than hard-coding "inputs")
for filepath in file_urls:
    os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)

# Download missing files; files already on disk are left untouched
for filepath, file_url in file_urls.items():
    if not os.path.exists(filepath):
        print(f"Downloading {filepath}...")
        gdown.download(file_url, filepath, quiet=False)
        # Stop here with an actionable message instead of failing later,
        # when a downstream cell tries to read the missing file
        if not os.path.exists(filepath):
            raise FileNotFoundError(
                f"Download of {filepath} failed; fetch it manually from "
                "https://slkb.osubmi.org/ and place it at that path."
            )

In [None]:
# Load the output of the model we want to benchmark.
# With sheet_name=None, pandas returns a dict holding one DataFrame per sheet.
model_sheets = pd.read_excel(
    "inputs/Supplementary_Table_11_CRISPR_DepMap_analysis.xlsx",
    sheet_name=None,
)

# The sheet (tab) names are the "mutant" genes included in our model; they are used later
mutant_genes = model_sheets.keys()
mutant_genes_list = [*mutant_genes]

# The "ko" genes are the genes that a mutant was tested against.
# They are read from the ARID1A sheet only, and ARID1A itself is added when that sheet does not list it.
# NOTE(review): this presumes every sheet covers the same set of genes - confirm.
ko_genes_list = model_sheets["ARID1A"]["gene"].tolist()
if "ARID1A" not in ko_genes_list:
    ko_genes_list += ["ARID1A"]

# Stack all sheets into a single dataframe with a fresh 0..n-1 index.
# NOTE(review): the sheet name is not carried over as a column; presumably each sheet
# already identifies its mutant in one of its own columns - verify.
model_predictions = pd.concat(list(model_sheets.values()), ignore_index=True)

print(model_predictions.shape)
model_predictions.head()

In [None]:
# This table, from SLKB, shows which SL pairs scored within the top 10% of 5 different scoring systems, among 22 different cell lines.
#
# Downloaded on 04/04/2025 from https://slkb.osubmi.org/

# Thresholds defining the positive set, named so the benchmark can be re-run with other values.
# Minimum "total_count" for a pair to be considered SLi in one sheet (3 according to the SLKB paper)
MIN_SCORING_SYSTEMS = 3
# Minimum number of distinct sheets (cell lines) in which a pair must be SLi to be kept
MIN_CELL_LINES = 2

# With sheet_name=None, pandas returns a dict holding one DataFrame per sheet
slkb_sheets = pd.read_excel("inputs/SLKB_predictions.xlsx", sheet_name=None)

# Here, we also have to concatenate the different sheets; the sheet name is stored in "cell_line"
# NOTE(review): the recorded output shows names such as "30033366_JURKAT", i.e. a numeric prefix
# followed by the cell line; the same cell line under two prefixes would count as two - confirm intended.
slkb_predictions = pd.concat(
    [df.assign(cell_line=name) for name, df in slkb_sheets.items()],
    ignore_index=True,
)

# We separate the "gene_pair" column into two separate columns, to match the format of our first table
slkb_predictions[["gene1", "gene2"]] = slkb_predictions["gene_pair"].str.split('|', expand=True)
slkb_predictions = slkb_predictions.drop(columns="gene_pair")

# According to the SLKB paper, genes are considered "SLi" if they are in the top 10% of at least
# 3 different scoring systems, so we keep only the rows reaching that count
slkb_sli_rows = slkb_predictions[slkb_predictions["total_count"] >= MIN_SCORING_SYSTEMS]

# One row per gene pair, with the sorted, de-duplicated cell lines joined by ";"
# NOTE(review): (A, B) and (B, A) would be grouped separately; this presumes SLKB always
# writes a given pair in the same order - confirm.
slkb_sli_pairs = slkb_sli_rows.groupby(['gene1', 'gene2']).agg({
    'cell_line': lambda x: ';'.join(sorted(set(x))),
}).reset_index()

print("initial number of SLis: ", slkb_sli_pairs.shape[0])

# Keep only the pairs whose two genes both belong to the ko gene list of our model
both_genes_in_model = (
    slkb_sli_pairs['gene1'].isin(ko_genes_list) & slkb_sli_pairs['gene2'].isin(ko_genes_list)
)
slkb_sli_in_model = slkb_sli_pairs[both_genes_in_model]
print("number of SLis after removing non-reactome genes: ", slkb_sli_in_model.shape[0])

# Keep only the pairs found in at least MIN_CELL_LINES different cell lines
# (the message below is worded for the default value of 2)
n_cell_lines = slkb_sli_in_model['cell_line'].str.split(';').apply(len)
slkb_predictions_sli = slkb_sli_in_model[n_cell_lines >= MIN_CELL_LINES]
print("number of SLis after removing row for interactions found in only one cell line: ", slkb_predictions_sli.shape[0])

slkb_predictions_sli.head()

initial number of SLis:  10455
number of SLis after removing non-reactome genes:  5464
number of SLis after removing row for interactions found in only one cell line:  791


Unnamed: 0,gene1,gene2,cell_line
5,AARS2,PTTG1,30033366_JURKAT;30033366_K562
29,ABCB7,HSCB,30033366_JURKAT;30033366_K562
37,ABCB7,OPA1,30033366_JURKAT;30033366_K562
39,ABCB7,PITRM1,30033366_JURKAT;30033366_K562
61,ACO1,IREB2,34469736_HELA;34469736_PC9


In [None]:
# Now we export the list of "unique" mutants to feed our R model so that our screening is more complete
# NOTE(review): only the gene1 column is used; a gene appearing only as gene2 is not exported - confirm intended.
unique_mutants_sli = slkb_predictions_sli["gene1"].unique()

print("number of unique mutants: ", len(unique_mutants_sli))

# Make sure the output directory exists: np.savetxt does not create missing directories
os.makedirs("outputs", exist_ok=True)

# Saving it to a csv, one gene name per line
np.savetxt("outputs/unique_mutants_sli_slkb.csv", unique_mutants_sli, delimiter=",", fmt="%s")

number of unique mutants:  330
