# Filtering SynLethDB data

Here, we filter the SynLethDB data to obtain a list of mutant genes to run through our model.

In [4]:
import os
import gdown
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter

In [5]:
# Fetch the SynLethDB 3.0 tables this notebook needs.
# The Google Drive links are the ones published on SynLethDB.com.

file_urls = {
    # SynLethDB 3.0 files, obtained from SynLethDB.com (local path -> download URL)
    "inputs/synlethdb3_sli.tsv": "https://drive.google.com/uc?export=download&id=1yDVv789aRbY3eBJz7qetrWQMn1zHoR6X",
    "inputs/synlethdb3_nonsli.tsv": "https://drive.google.com/uc?export=download&id=1SQRINp58iL5mN6EOYvQ5I9g9WvpD_F2T"
}

# The download target directory must exist before anything is written into it
os.makedirs("inputs", exist_ok=True)

# Files already on disk are left untouched, so re-running this cell is cheap
for filepath, file_url in file_urls.items():
    if os.path.exists(filepath):
        continue
    print(f"Downloading {filepath}...")
    gdown.download(file_url, filepath, quiet=False)


In [6]:
# Predictions of the model we want to benchmark (generated by an R script).
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet name.
model_predictions = pd.read_excel(
    "inputs/Supplementary_Table_11_CRISPR_DepMap_analysis.xlsx",
    sheet_name=None,
)

# The sheet (tab) names are the "mutant" genes covered by our model; kept for later use
mutant_genes = model_predictions.keys()
mutant_genes_list = [*mutant_genes]

# The "ko" genes are all the genes a mutant was tested against. We read them from the
# ARID1A sheet and make sure ARID1A itself is part of the list as well.
ko_genes_list = model_predictions['ARID1A']['gene'].to_list()
if 'ARID1A' not in ko_genes_list:
    ko_genes_list.append('ARID1A')

# Stack the per-mutant sheets into one dataframe with a fresh 0..n-1 index
sheet_frames = list(model_predictions.values())
model_predictions = pd.concat(sheet_frames, ignore_index=True)

print(model_predictions.shape)
model_predictions.head()

(130676, 6)


Unnamed: 0,mutant,gene,num_lines_high,num_lines_low,estimate,p_adj
0,ARID1A,EP300,98,98,-0.368858,8.644689e-08
1,ARID1A,MICOS10,98,98,-0.241643,7.014478e-07
2,ARID1A,PDCD10,98,98,-0.185685,2.418385e-06
3,ARID1A,SDHD,98,98,-0.236701,6.698955e-06
4,ARID1A,ROCK1,98,98,-0.115831,2.065706e-05


In [7]:
# These tables, from SynLethDB 3.0, contain the positive (SLI) and negative (non-SLI)
# gene pairs we want to benchmark our model against.
#
# Downloaded on 25/04/2025 from SynLethDB.com, the sources are available at the top of the notebook

# Evidence sources we do not want in the benchmark.
# Used as a regex alternation against the "rel_source" column.
sources_to_exclude = "Text Mining|Computational Prediction|Drug Inhibition|Drug Screen|GenomeRNAi|Decipher|Synlethality"


def filter_synlethdb_table(table, ko_genes, sli_label, excluded_sources=sources_to_exclude, verbose=False):
    """Filter one SynLethDB table down to the gene pairs usable for benchmarking.

    Both the SLI and the non-SLI table go through exactly the same steps:
      1. rename "x_name"/"y_name" to "gene1"/"gene2";
      2. keep only pairs where both genes were tested by our model;
      3. drop rows whose "rel_source" matches one of the excluded sources;
      4. drop rows without cell line information;
      5. keep only rows listing at least two ';'-separated cell lines;
      6. add the "sli" label column.

    Parameters
    ----------
    table : pd.DataFrame
        Raw SynLethDB table; must contain the columns "x_name", "y_name",
        "rel_source" and "cell_line". It is not modified.
    ko_genes : list-like of str
        Genes that were tested by our model.
    sli_label : int
        Value written to the new "sli" column (1 for SLI pairs, 0 for non-SLI pairs).
    excluded_sources : str
        Regex matched against "rel_source"; matching rows are removed.
    verbose : bool
        When True, print the number of remaining rows after every step.

    Returns
    -------
    pd.DataFrame
        A new, filtered dataframe that keeps the original row index.
    """

    def report(message, frame):
        # Progress report, only emitted for the table the caller asked about
        if verbose:
            print(message, frame.shape[0])

    # rename() returns a new frame, so the caller's raw table stays untouched
    table = table.rename(columns={"x_name": "gene1", "y_name": "gene2"})

    # First, we keep only the gene pairs for which BOTH genes were tested by our model
    both_genes_tested = table["gene1"].isin(ko_genes) & table["gene2"].isin(ko_genes)
    table = table[both_genes_tested]
    report("number of SLis after removing non-reactome genes: ", table)

    # na=False: a missing source never matches an excluded source. It also keeps the mask
    # strictly boolean; on object-dtype columns a NaN in the mask would make `~` fail.
    from_excluded_source = table["rel_source"].str.contains(excluded_sources, regex=True, na=False)
    table = table[~from_excluded_source]
    report("number of SLis after removing undesired data sources: ", table)

    # We drop the rows with no cell line information
    table = table.dropna(subset=["cell_line"])
    report("number of SLis after removing rows with no cell line information: ", table)

    # Keep interactions reported in at least two cell lines.
    # NOTE(review): this counts ';'-separated entries, not distinct cell lines
    # ("K562;Jurkat" and "JURKAT;K562" both appear in the data) - confirm that a cell line
    # is never listed twice for the same pair.
    table = table[table["cell_line"].str.split(";").str.len() >= 2]
    report("number of SLis after removing row for interactions found in only one cell line: ", table)

    # assign() builds a new frame; setting the column on the filtered slice directly can
    # trigger SettingWithCopyWarning on pandas versions without copy-on-write
    return table.assign(sli=sli_label)


synlethdb_sli_raw = pd.read_csv("inputs/synlethdb3_sli.tsv", sep="\t")
synlethdb_nonsli_raw = pd.read_csv("inputs/synlethdb3_nonsli.tsv", sep="\t")

print("initial number of SLis: ", synlethdb_sli_raw.shape[0])

# Positive pairs are labelled sli=1, negative pairs sli=0; only the SLI table is reported on
synlethdb_predictions_sli = filter_synlethdb_table(synlethdb_sli_raw, ko_genes_list, sli_label=1, verbose=True)
synlethdb_predictions_nonsli = filter_synlethdb_table(synlethdb_nonsli_raw, ko_genes_list, sli_label=0)

# We concatenate the two tables to obtain a single dataframe with only the columns we need
synlethdb_predictions = pd.concat(
    [synlethdb_predictions_sli, synlethdb_predictions_nonsli],
    ignore_index=True,
)[["gene1", "gene2", "sli", "cell_line", "rel_source"]]

synlethdb_predictions.head()

initial number of SLis:  37943
number of SLis after removing non-reactome genes:  24213
number of SLis after removing undesired data sources:  7074
number of SLis after removing rows with no cell line information:  6840
number of SLis after removing row for interactions found in only one cell line:  428






Unnamed: 0,gene1,gene2,sli,cell_line,rel_source
0,CENPP,RNGTT,1,K562;Jurkat,CRISPR/CRISPRi
1,MED28,RNF20,1,K562;Jurkat,CRISPR/CRISPRi
2,CTNNBL1,GNPAT,1,K562;Jurkat,CRISPR/CRISPRi
3,CDC23,LEO1,1,JURKAT;K562,High Throughput
4,INTS5,PDCD7,1,K562;Jurkat,CRISPR/CRISPRi


In [8]:
# Sanity check: list the distinct data sources that survived the filtering of the SLI table
retained_sources = synlethdb_predictions_sli["rel_source"].unique()
print(retained_sources)

['CRISPR/CRISPRi' 'High Throughput' 'CRISPR/CRISPRi;High Throughput'
 'High Throughput;CRISPR/CRISPRi'
 'Low Throughput;High Throughput;RNAi Screen'
 'CRISPR/CRISPRi;Low Throughput']


In [None]:
# Now we export the list of unique "mutant" genes (the gene1 column of the retained SLI pairs)
# to feed our R model so that our screening is more complete
unique_mutants_sli = synlethdb_predictions_sli["gene1"].unique()

print("number of unique mutants: ", len(unique_mutants_sli))

# np.savetxt does not create missing directories, so make sure the target one exists
# (otherwise this cell fails on a fresh checkout)
os.makedirs("outputs", exist_ok=True)

# Saving it to a csv, one gene name per line
np.savetxt("outputs/unique_mutants_sli_synlethdb.csv", unique_mutants_sli, delimiter=",", fmt="%s")

number of unique mutants:  166


In [8]:
# Splitting the unique mutants into smaller csv files so they can be processed in parallel.
#
# This cell only reads `unique_mutants_sli`: it neither consumes it nor overwrites
# `mutant_genes_list` (the mutants of our model), so it can safely be re-run.
N_SPLITS = 10
split_dir = "outputs/unique_mutants_split"

# np.savetxt does not create missing directories
os.makedirs(split_dir, exist_ok=True)

# Ceiling division, so every mutant lands in a file whatever the total is
# (166 mutants -> 17 per file, the last file gets the remaining 13)
chunk_size = -(-len(unique_mutants_sli) // N_SPLITS)

for split_index in range(N_SPLITS):
    # Consecutive, non-overlapping slices; slices past the end are simply empty
    start = split_index * chunk_size
    mutants_chunk = unique_mutants_sli[start:start + chunk_size]
    # Files are numbered from 1 to N_SPLITS
    np.savetxt(f"{split_dir}/unique_mutants_sli_{split_index + 1}.csv", mutants_chunk, delimiter=",", fmt="%s")