# Generating a negative sample dataset

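Here, we generate a dataset of high-confidence ("sure") negative samples, i.e. gene pairs that were never found to be synthetic lethal and that score poorly in SLKB, so that we can properly benchmark our model.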

In [17]:
import os
import pandas as pd
import gdown
import numpy as np

In [18]:
# Fetch the SLKB input files that are not already present on disk.
# The URLs below point at slkb.osubmi.org (they are not Google Drive links);
# they are passed to gdown.download as plain download URLs.
# NOTE(review): the "/session/<id>/" part of each URL looks session-specific and
# presumably expires -- confirm the links are still valid before relying on them.

file_urls = {
    # SLKB files, obtained from slkb.osubmi.org
    "inputs/SLKB_predictions.xlsx": "https://slkb.osubmi.org/session/09c5c4738990db9810156682c87de6b8/download/download_data-predSL?w=",
    "inputs/SLKB_calculated_scores.csv": "https://slkb.osubmi.org/session/ba83f6fcd5e34694819567d3091d7dce/download/download_data-calcSL?w=",
}

# The target directory must exist before anything is written into it
os.makedirs("inputs", exist_ok=True)

# Files that already exist locally are left untouched
for filepath, file_url in file_urls.items():
    if os.path.exists(filepath):
        continue
    print(f"Downloading {filepath}...")
    gdown.download(file_url, filepath, quiet=False)

In [19]:
# Load the SLKB calculated-scores table: scores computed with 5 different scoring
# systems, among 22 different cell lines.
#
# Downloaded on 04/04/2025 from https://slkb.osubmi.org/

slkb_calculated_scores = pd.read_csv('inputs/SLKB_calculated_scores.csv')

# Combined B/NB scores, following the SLKB paper, quoting :
# "Normally, Median-B and sgRNA-B scores are used in the Venn diagram, but Median-NB and sgRNA-NB scores are used if a study has no available dual-targeting controls."
# In practice: take the B score and fall back on the NB score where the B score is missing.
slkb_calculated_scores = slkb_calculated_scores.assign(
    median_b_nb_score=lambda scores: scores["median_b_score_Z_SL_score"].fillna(
        scores["median_nb_score_Z_SL_score"]
    ),
    sgrna_derived_b_nb_score=lambda scores: scores["sgrna_derived_b_score_SL_score"].fillna(
        scores["sgrna_derived_nb_score_SL_score"]
    ),
)

# Columns that are not used in the rest of the notebook (standard errors,
# non-Z variants of the scores and the two extra GEMINI variants).
unused_columns = [
    "median_b_score_standard_error",
    "median_nb_score_standard_error",
    "mageck_score_standard_error",
    "horlbeck_score_standard_error",
    "median_nb_score_SL_score",
    "median_b_score_SL_score",
    "mageck_score_SL_score",
    "gemini_score_SL_score_SensitiveLethality",
    "gemini_score_SL_score_SensitiveRecovery",
]
slkb_calculated_scores = slkb_calculated_scores.drop(columns=unused_columns)

print("Total number of gene pairs in SLKB database: ",slkb_calculated_scores.shape[0])

# Preview: first four CREBBP rows, ordered by partner gene
crebbp_rows = slkb_calculated_scores[slkb_calculated_scores["gene_1"] == "CREBBP"]
crebbp_rows.sort_values("gene_2").head(4)

Total number of gene pairs in SLKB database:  261958


Unnamed: 0.1,Unnamed: 0,gene_1,gene_2,study_origin,cell_line_origin,gemini_score_SL_score_Strong,horlbeck_score_SL_score,mageck_score_Z_SL_score,median_b_score_Z_SL_score,median_nb_score_Z_SL_score,sgrna_derived_b_score_SL_score,sgrna_derived_nb_score_SL_score,median_b_nb_score,sgrna_derived_b_nb_score
258188,258189,CREBBP,DNMT1,26864203,OVCAR8,-0.227918,-2.074034,1.313669,0.212457,0.214889,0.622403,1.100861,0.212457,0.622403
258236,258237,CREBBP,DNMT3A,26864203,OVCAR8,-0.20583,-1.100728,-1.29844,-0.350839,-0.347878,-2.77521,-2.491147,-0.350839,-2.77521
258283,258284,CREBBP,DNMT3B,26864203,OVCAR8,-0.119662,-2.172712,-0.617945,-1.793505,-1.791938,-1.374846,-1.275087,-1.793505,-1.374846
258329,258330,CREBBP,DNMT3L,26864203,OVCAR8,-0.263856,0.755723,-0.981394,-0.396679,-0.394356,-1.527794,-1.271348,-0.396679,-1.527794


In [5]:
# Here, we calculate the worst scoring pairs on each scoring system.
#
# For every (scoring system, study) combination, the rows whose score falls in the
# extreme TAIL_PERCENT % of that study's distribution are flagged with 1, the other
# scored rows with 0, and rows without a score stay NaN. `total_count` then counts,
# per row, how many of the retained scoring systems flagged it.

# Percentage of each study's score distribution that gets flagged
TAIL_PERCENT = 10

# Scoring systems for which the LOWEST values are flagged (<= TAIL_PERCENT-th percentile).
# Every other scoring system has its HIGHEST values flagged (>= (100 - TAIL_PERCENT)-th percentile).
# NOTE(review): presumably because the GEMINI scores are oriented the opposite way
# from the other scores -- confirm against the SLKB documentation.
high_percentile_scores = [
    'gemini_score_SL_score_SensitiveLethality',
    'gemini_score_SL_score_SensitiveRecovery', 
    'gemini_score_SL_score_Strong'
]

# List of scoring systems: every column after the first five.
# NOTE(review): positional selection -- assumes the first five columns are identifiers; verify if the CSV layout changes.
scoring_systems_list = slkb_calculated_scores.columns[5:].to_list()
print(scoring_systems_list)

study_origin_list = slkb_calculated_scores["study_origin"].unique().tolist()
print(study_origin_list)

# Initialize the result frame with the gene identifiers
calc_slkb_predictions = slkb_calculated_scores[["gene_1", "gene_2", "cell_line_origin"]].copy()
calc_slkb_predictions.rename(columns={"gene_1":"gene1", "gene_2":"gene2"}, inplace=True)

# Initialize flag columns with NaN (not zero) so that "no score" stays distinguishable from "not flagged"
for scoring_system in scoring_systems_list:
    calc_slkb_predictions[scoring_system] = np.nan

# The per-study row masks do not depend on the scoring system: build them once
study_masks = {
    study_origin: slkb_calculated_scores["study_origin"] == study_origin
    for study_origin in study_origin_list
}

# Flag the rows of each scoring system, study by study
for scoring_system in scoring_systems_list:
    scores = slkb_calculated_scores[scoring_system]
    has_score = scores.notna()
    flag_low_tail = scoring_system in high_percentile_scores

    for study_origin in study_origin_list:
        # Rows of this study that actually have a value for this scoring system
        non_nan_mask = study_masks[study_origin] & has_score

        # Skip if no valid values
        if not non_nan_mask.any():
            print(f"Warning: No valid values for {scoring_system} in study {study_origin}")
            continue

        valid_values = scores[non_nan_mask].values

        # The cutoff is computed within the study, on non-missing values only
        if flag_low_tail:
            sl_cutoff = np.percentile(valid_values, TAIL_PERCENT)
            score_mask = non_nan_mask & (scores <= sl_cutoff)
        else:
            sl_cutoff = np.percentile(valid_values, 100 - TAIL_PERCENT)
            score_mask = non_nan_mask & (scores >= sl_cutoff)

        # 1 for rows in the flagged tail, 0 for the other scored rows (missing scores stay NaN)
        calc_slkb_predictions.loc[score_mask, scoring_system] = 1
        calc_slkb_predictions.loc[non_nan_mask & (~score_mask), scoring_system] = 0

# Remove the intermediate columns: only their combined B/NB versions are kept
calc_slkb_predictions = calc_slkb_predictions.drop(columns=[
    "median_b_score_Z_SL_score",
    "median_nb_score_Z_SL_score",
    "sgrna_derived_b_score_SL_score",
    "sgrna_derived_nb_score_SL_score"
])

# Add a total_count column that sums the number of flags across all remaining scoring systems
scoring_columns = [col for col in calc_slkb_predictions.columns if col not in ['gene1', 'gene2', 'cell_line_origin']]
calc_slkb_predictions['total_count'] = calc_slkb_predictions[scoring_columns].fillna(0).sum(axis=1).astype(int)

# Preview results for a specific gene
calc_slkb_predictions[(calc_slkb_predictions["gene1"] == "SMARCA2")].sort_values(["gene2", "total_count"], ascending=[True, False]).iloc[:20]

['gemini_score_SL_score_Strong', 'horlbeck_score_SL_score', 'mageck_score_Z_SL_score', 'median_b_score_Z_SL_score', 'median_nb_score_Z_SL_score', 'sgrna_derived_b_score_SL_score', 'sgrna_derived_nb_score_SL_score', 'median_b_nb_score', 'sgrna_derived_b_nb_score']
[33956155, 30033366, 34857952, 29251726, 34469736, 36060092, 28319113, 33637726, 26864203, 29452643]


Unnamed: 0,gene1,gene2,cell_line_origin,gemini_score_SL_score_Strong,horlbeck_score_SL_score,mageck_score_Z_SL_score,median_b_nb_score,sgrna_derived_b_nb_score,total_count
200404,SMARCA2,SMARCA4,HS936T,1.0,0.0,1.0,1.0,1.0,4
195339,SMARCA2,SMARCA4,GI1,1.0,0.0,1.0,1.0,0.0,3
225729,SMARCA2,SMARCA4,MELJUSO,0.0,0.0,1.0,0.0,1.0,2
230794,SMARCA2,SMARCA4,MEWO,1.0,0.0,1.0,0.0,0.0,2
190274,SMARCA2,SMARCA4,A549,1.0,0.0,0.0,0.0,0.0,1
220664,SMARCA2,SMARCA4,MEL202,1.0,0.0,0.0,0.0,0.0,1
235859,SMARCA2,SMARCA4,PATU8988S,1.0,0.0,0.0,0.0,0.0,1
205469,SMARCA2,SMARCA4,HS944T,0.0,0.0,0.0,0.0,0.0,0
210534,SMARCA2,SMARCA4,HSC5,0.0,0.0,0.0,0.0,0.0,0
215599,SMARCA2,SMARCA4,IPC298,0.0,0.0,0.0,0.0,0.0,0


In [6]:
# Genes to include, read from the "gene_name" column of the Excel sheet
reactome_genes = pd.read_excel("inputs/reactome genes list (to include).xlsx")
reactome_genes_list = reactome_genes["gene_name"].to_list()

In [None]:
# Load the SLKB predictions workbook: one sheet per entry, every sheet is read
slkb_predictions = pd.read_excel("inputs/SLKB_predictions.xlsx", sheet_name=None)

# Concatenate the sheets, keeping the sheet name in a "cell_line" column
slkb_predictions = pd.concat(
  [df.assign(cell_line=name) for name, df in slkb_predictions.items()],
  ignore_index=True
)

# We separate the "gene_pair" column ("GENE1|GENE2") into two separate columns
slkb_predictions[["gene1", "gene2"]] = slkb_predictions["gene_pair"].str.split('|', expand=True)
slkb_predictions.drop(columns="gene_pair", inplace=True)

# Unique, order-independent gene pairs: each pair is stored as a sorted tuple so
# that (A, B) and (B, A) collapse into the same element.

# ... pairs listed in the SLKB predictions
slkb_predictions_set = {
    tuple(sorted(gene_pair))
    for gene_pair in zip(slkb_predictions["gene1"], slkb_predictions["gene2"])
}

# ... pairs present in the SLKB calculated scores table
slkb_calculated_scores_set = {
    tuple(sorted(gene_pair))
    for gene_pair in zip(slkb_calculated_scores["gene_1"], slkb_calculated_scores["gene_2"])
}

# ... pairs flagged as worst-scoring by at least 3 scoring systems in some row
worst_scoring_rows = calc_slkb_predictions[calc_slkb_predictions["total_count"] >= 3]
slkb_worst_scores_set = {
    tuple(sorted(gene_pair))
    for gene_pair in zip(worst_scoring_rows["gene1"], worst_scoring_rows["gene2"])
}

# Sanity check : printing the total number of unique pairs
print("Number of unique pairs in the SLKB database : ", len(slkb_calculated_scores_set))

# Unique pairs flagged by at least 3 scoring systems (total_count >= 3)
print("Number of unique pairs flagged as worst-scoring by at least 3 scoring systems in any study and cell line : ", len(slkb_worst_scores_set))

# Row count (one row per pair and sheet) and unique pair count of the SLKB predictions
print("Number of rows in the SLKB predictions (all sheets) : ", slkb_predictions.shape[0])
print("Number of unique pairs in the SLKB predictions : ", len(slkb_predictions_set))

slkb_predictions.head()

Number of unique pairs in the SLKB database :  127688
Number of unique pairs found SLi in the top 10% of any score in any study and cell line and in the reactome genes list :  42772


Unnamed: 0,GEMINI-Score,HORLBECK-Score,MAGECK-Score,MEDIAN-B/NB Score,sgRNA-Derived B/NB Score,total_count,cell_line,gene1,gene2
0,1,1,1,1,1,5,26864203_OVCAR8,KDM1B,MBD1
1,1,1,1,1,1,5,26864203_OVCAR8,KDM4A,KDM5A
2,1,1,1,1,1,5,26864203_OVCAR8,KDM4C,KDM5A
3,1,1,1,0,1,4,26864203_OVCAR8,BMI1,KDM1B
4,0,1,1,1,1,4,26864203_OVCAR8,CREBBP,ING5


In [9]:
# Negative samples = gene pairs flagged as worst-scoring by at least 3 scoring
# systems (total_count >= 3) that never appear in the SLKB predictions.
# Pairs are stored as sorted tuples so that gene order does not matter.

# Every pair listed in the SLKB predictions
slkb_predictions_set = {
    tuple(sorted(gene_pair))
    for gene_pair in zip(slkb_predictions["gene1"], slkb_predictions["gene2"])
}

# Every pair flagged by at least 3 scoring systems in some row
worst_scoring_rows = calc_slkb_predictions[calc_slkb_predictions["total_count"] >= 3]
slkb_calculated_scores_set = {
    tuple(sorted(gene_pair))
    for gene_pair in zip(worst_scoring_rows["gene1"], worst_scoring_rows["gene2"])
}

# Keep the flagged pairs that are absent from the predictions
negative_samples = slkb_calculated_scores_set.difference(slkb_predictions_set)

print(f"Gene pairs in the bottom 10% for their study: {len(slkb_calculated_scores_set)}")
print(f"Gene pairs in the bottom 10% for their study, which were never found in the top 10%: {len(negative_samples)}")

Gene pairs in the bottom 10% for their study: 10935
Gene pairs in the bottom 10% for their study, which were never found in the top 10%: 8075


In [11]:
# Keep only the negative pairs whose two genes both belong to the reactome gene list.

# Membership is tested once per gene: use a set so each lookup is O(1) instead of
# scanning the whole list.
reactome_genes_set = set(reactome_genes_list)

# Pairs with at least one gene outside the reactome list
genes_to_remove = [
    pair for pair in negative_samples
    if (pair[0] not in reactome_genes_set) or (pair[1] not in reactome_genes_set)
]

negative_samples_filtered = set(negative_samples) - set(genes_to_remove)

print(f"Negative sample pairs after filtering: {len(negative_samples_filtered)}")

Negative sample pairs after filtering: 5050


In [12]:
# Write the filtered negative pairs to CSV, one "gene1,gene2" row per pair.
# The pairs live in a set, whose iteration order is not stable between kernel
# runs; sorting makes the output file reproducible.
with open("inputs/slkb_negative_samples_strict.csv", "w") as f:
    f.write("gene1,gene2\n")
    for pair in sorted(negative_samples_filtered):
        f.write(f"{pair[0]},{pair[1]}\n")

# Re-running the pipeline with stricter criteria

In [13]:
import os
import pandas as pd
import gdown
import numpy as np

In [14]:
# Fetch the SLKB input files that are not already present on disk.
# The URLs below point at slkb.osubmi.org (they are not Google Drive links);
# they are passed to gdown.download as plain download URLs.
# NOTE(review): the "/session/<id>/" part of each URL looks session-specific and
# presumably expires -- confirm the links are still valid before relying on them.

file_urls = {
    # SLKB files, obtained from slkb.osubmi.org
    "inputs/SLKB_predictions.xlsx": "https://slkb.osubmi.org/session/09c5c4738990db9810156682c87de6b8/download/download_data-predSL?w=",
    "inputs/SLKB_calculated_scores.csv": "https://slkb.osubmi.org/session/ba83f6fcd5e34694819567d3091d7dce/download/download_data-calcSL?w=",
}

# The target directory must exist before anything is written into it
os.makedirs("inputs", exist_ok=True)

# Files that already exist locally are left untouched
for filepath, file_url in file_urls.items():
    if os.path.exists(filepath):
        continue
    print(f"Downloading {filepath}...")
    gdown.download(file_url, filepath, quiet=False)

In [15]:
# Load the SLKB calculated-scores table: scores computed with 5 different scoring
# systems, among 22 different cell lines.
#
# Downloaded on 04/04/2025 from https://slkb.osubmi.org/

slkb_calculated_scores = pd.read_csv('inputs/SLKB_calculated_scores.csv')

# Combined B/NB scores, following the SLKB paper, quoting :
# "Normally, Median-B and sgRNA-B scores are used in the Venn diagram, but Median-NB and sgRNA-NB scores are used if a study has no available dual-targeting controls."
# In practice: take the B score and fall back on the NB score where the B score is missing.
slkb_calculated_scores = slkb_calculated_scores.assign(
    median_b_nb_score=lambda scores: scores["median_b_score_Z_SL_score"].fillna(
        scores["median_nb_score_Z_SL_score"]
    ),
    sgrna_derived_b_nb_score=lambda scores: scores["sgrna_derived_b_score_SL_score"].fillna(
        scores["sgrna_derived_nb_score_SL_score"]
    ),
)

# Columns that are not used in the rest of the notebook (standard errors,
# non-Z variants of the scores and the two extra GEMINI variants).
unused_columns = [
    "median_b_score_standard_error",
    "median_nb_score_standard_error",
    "mageck_score_standard_error",
    "horlbeck_score_standard_error",
    "median_nb_score_SL_score",
    "median_b_score_SL_score",
    "mageck_score_SL_score",
    "gemini_score_SL_score_SensitiveLethality",
    "gemini_score_SL_score_SensitiveRecovery",
]
slkb_calculated_scores = slkb_calculated_scores.drop(columns=unused_columns)

print(slkb_calculated_scores.shape)

# Preview: first four CREBBP rows, ordered by partner gene
crebbp_rows = slkb_calculated_scores[slkb_calculated_scores["gene_1"] == "CREBBP"]
crebbp_rows.sort_values("gene_2").head(4)

(261958, 14)


Unnamed: 0.1,Unnamed: 0,gene_1,gene_2,study_origin,cell_line_origin,gemini_score_SL_score_Strong,horlbeck_score_SL_score,mageck_score_Z_SL_score,median_b_score_Z_SL_score,median_nb_score_Z_SL_score,sgrna_derived_b_score_SL_score,sgrna_derived_nb_score_SL_score,median_b_nb_score,sgrna_derived_b_nb_score
258188,258189,CREBBP,DNMT1,26864203,OVCAR8,-0.227918,-2.074034,1.313669,0.212457,0.214889,0.622403,1.100861,0.212457,0.622403
258236,258237,CREBBP,DNMT3A,26864203,OVCAR8,-0.20583,-1.100728,-1.29844,-0.350839,-0.347878,-2.77521,-2.491147,-0.350839,-2.77521
258283,258284,CREBBP,DNMT3B,26864203,OVCAR8,-0.119662,-2.172712,-0.617945,-1.793505,-1.791938,-1.374846,-1.275087,-1.793505,-1.374846
258329,258330,CREBBP,DNMT3L,26864203,OVCAR8,-0.263856,0.755723,-0.981394,-0.396679,-0.394356,-1.527794,-1.271348,-0.396679,-1.527794


In [16]:
# Here, we calculate the worst scoring pairs on each scoring system (stricter run).
#
# For every (scoring system, study) combination, the rows whose score falls in the
# extreme TAIL_PERCENT % of that study's distribution are flagged with 1, the other
# scored rows with 0, and rows without a score stay NaN. `total_count` then counts,
# per row, how many of the retained scoring systems flagged it.

# Percentage of each study's score distribution that gets flagged
TAIL_PERCENT = 5

# Scoring systems for which the LOWEST values are flagged (<= TAIL_PERCENT-th percentile).
# Every other scoring system has its HIGHEST values flagged (>= (100 - TAIL_PERCENT)-th percentile).
# NOTE(review): presumably because the GEMINI scores are oriented the opposite way
# from the other scores -- confirm against the SLKB documentation.
high_percentile_scores = [
    'gemini_score_SL_score_SensitiveLethality',
    'gemini_score_SL_score_SensitiveRecovery', 
    'gemini_score_SL_score_Strong'
]

# List of scoring systems: every column after the first five.
# NOTE(review): positional selection -- assumes the first five columns are identifiers; verify if the CSV layout changes.
scoring_systems_list = slkb_calculated_scores.columns[5:].to_list()
print(scoring_systems_list)

study_origin_list = slkb_calculated_scores["study_origin"].unique().tolist()
print(study_origin_list)

# Initialize the result frame with the gene identifiers
calc_slkb_predictions = slkb_calculated_scores[["gene_1", "gene_2", "cell_line_origin"]].copy()
calc_slkb_predictions.rename(columns={"gene_1":"gene1", "gene_2":"gene2"}, inplace=True)

# Initialize flag columns with NaN (not zero) so that "no score" stays distinguishable from "not flagged"
for scoring_system in scoring_systems_list:
    calc_slkb_predictions[scoring_system] = np.nan

# The per-study row masks do not depend on the scoring system: build them once
study_masks = {
    study_origin: slkb_calculated_scores["study_origin"] == study_origin
    for study_origin in study_origin_list
}

# Flag the rows of each scoring system, study by study
for scoring_system in scoring_systems_list:
    scores = slkb_calculated_scores[scoring_system]
    has_score = scores.notna()
    flag_low_tail = scoring_system in high_percentile_scores

    for study_origin in study_origin_list:
        # Rows of this study that actually have a value for this scoring system
        non_nan_mask = study_masks[study_origin] & has_score

        # Skip if no valid values
        if not non_nan_mask.any():
            print(f"Warning: No valid values for {scoring_system} in study {study_origin}")
            continue

        valid_values = scores[non_nan_mask].values

        # The cutoff is computed within the study, on non-missing values only
        if flag_low_tail:
            sl_cutoff = np.percentile(valid_values, TAIL_PERCENT)
            score_mask = non_nan_mask & (scores <= sl_cutoff)
        else:
            sl_cutoff = np.percentile(valid_values, 100 - TAIL_PERCENT)
            score_mask = non_nan_mask & (scores >= sl_cutoff)

        # 1 for rows in the flagged tail, 0 for the other scored rows (missing scores stay NaN)
        calc_slkb_predictions.loc[score_mask, scoring_system] = 1
        calc_slkb_predictions.loc[non_nan_mask & (~score_mask), scoring_system] = 0

# Remove the intermediate columns: only their combined B/NB versions are kept
calc_slkb_predictions = calc_slkb_predictions.drop(columns=[
    "median_b_score_Z_SL_score",
    "median_nb_score_Z_SL_score",
    "sgrna_derived_b_score_SL_score",
    "sgrna_derived_nb_score_SL_score"
])

# Add a total_count column that sums the number of flags across all remaining scoring systems
scoring_columns = [col for col in calc_slkb_predictions.columns if col not in ['gene1', 'gene2', 'cell_line_origin']]
calc_slkb_predictions['total_count'] = calc_slkb_predictions[scoring_columns].fillna(0).sum(axis=1).astype(int)

# Preview results for a specific gene
calc_slkb_predictions[(calc_slkb_predictions["gene1"] == "SMARCA2")].sort_values(["gene2", "total_count"], ascending=[True, False]).iloc[:20]

['gemini_score_SL_score_Strong', 'horlbeck_score_SL_score', 'mageck_score_Z_SL_score', 'median_b_score_Z_SL_score', 'median_nb_score_Z_SL_score', 'sgrna_derived_b_score_SL_score', 'sgrna_derived_nb_score_SL_score', 'median_b_nb_score', 'sgrna_derived_b_nb_score']
[33956155, 30033366, 34857952, 29251726, 34469736, 36060092, 28319113, 33637726, 26864203, 29452643]


Unnamed: 0,gene1,gene2,cell_line_origin,gemini_score_SL_score_Strong,horlbeck_score_SL_score,mageck_score_Z_SL_score,median_b_nb_score,sgrna_derived_b_nb_score,total_count
200404,SMARCA2,SMARCA4,HS936T,1.0,0.0,1.0,1.0,0.0,3
195339,SMARCA2,SMARCA4,GI1,0.0,0.0,1.0,1.0,0.0,2
225729,SMARCA2,SMARCA4,MELJUSO,0.0,0.0,1.0,0.0,1.0,2
190274,SMARCA2,SMARCA4,A549,1.0,0.0,0.0,0.0,0.0,1
205469,SMARCA2,SMARCA4,HS944T,0.0,0.0,0.0,0.0,0.0,0
210534,SMARCA2,SMARCA4,HSC5,0.0,0.0,0.0,0.0,0.0,0
215599,SMARCA2,SMARCA4,IPC298,0.0,0.0,0.0,0.0,0.0,0
220664,SMARCA2,SMARCA4,MEL202,0.0,0.0,0.0,0.0,0.0,0
230794,SMARCA2,SMARCA4,MEWO,0.0,0.0,0.0,0.0,0.0,0
235859,SMARCA2,SMARCA4,PATU8988S,0.0,0.0,0.0,0.0,0.0,0


In [17]:
# Load the SLKB predictions workbook; sheet_name=None reads every sheet into a dict
prediction_sheets = pd.read_excel("inputs/SLKB_predictions.xlsx", sheet_name=None)

# Stack the sheets into one frame, recording the sheet name in a "cell_line" column
slkb_predictions = pd.concat(
    [sheet.assign(cell_line=sheet_name) for sheet_name, sheet in prediction_sheets.items()],
    ignore_index=True,
)

print("Number of pairs found SLi in the top 10% of any score in any study and cell line : ",slkb_predictions.shape[0])

# Keep a single row per "gene_pair" string (the first occurrence is retained)
slkb_predictions = slkb_predictions.drop_duplicates(subset=["gene_pair"])

print("Number of unique pairs found SLi in the top 10% of any score in any study and cell line : ",slkb_predictions.shape[0])

# Split "GENE1|GENE2" into two gene columns, then discard the combined column
slkb_predictions[["gene1", "gene2"]] = slkb_predictions["gene_pair"].str.split('|', expand=True)
slkb_predictions = slkb_predictions.drop(columns="gene_pair")

slkb_predictions.head()

Number of pairs found SLi in the top 10% of any score in any study and cell line :  78803
Number of unique pairs found SLi in the top 10% of any score in any study and cell line :  55555


Unnamed: 0,GEMINI-Score,HORLBECK-Score,MAGECK-Score,MEDIAN-B/NB Score,sgRNA-Derived B/NB Score,total_count,cell_line,gene1,gene2
0,1,1,1,1,1,5,26864203_OVCAR8,KDM1B,MBD1
1,1,1,1,1,1,5,26864203_OVCAR8,KDM4A,KDM5A
2,1,1,1,1,1,5,26864203_OVCAR8,KDM4C,KDM5A
3,1,1,1,0,1,4,26864203_OVCAR8,BMI1,KDM1B
4,0,1,1,1,1,4,26864203_OVCAR8,CREBBP,ING5


In [18]:
# Next, negative samples will be generated. To do so, we select gene pairs that were
# never found to be SLi in any cell line, and that were flagged as worst-scoring
# (extreme 5% of their study in this stricter run) by at least 3 scoring systems.
# Pairs are stored as sorted tuples so that gene order does not matter.

# Every pair listed in the SLKB predictions
slkb_predictions_set = {
    tuple(sorted(gene_pair))
    for gene_pair in zip(slkb_predictions["gene1"], slkb_predictions["gene2"])
}

# Every pair flagged by at least 3 scoring systems in some row
worst_scoring_rows = calc_slkb_predictions[calc_slkb_predictions["total_count"] >= 3]
slkb_calculated_scores_set = {
    tuple(sorted(gene_pair))
    for gene_pair in zip(worst_scoring_rows["gene1"], worst_scoring_rows["gene2"])
}

negative_samples = slkb_calculated_scores_set - slkb_predictions_set

# The labels say 5% because this run flags at the 95th / 5th percentile
print(f"Gene pairs in the bottom 5% for their study: {len(slkb_calculated_scores_set)}")
print(f"Gene pairs in the bottom 5% for their study, which were never found in the top 10%: {len(negative_samples)}")

Gene pairs in the bottom 10% for their study: 4022
Gene pairs in the bottom 10% for their study, which were never found in the top 10%: 2882


In [19]:
# Genes to include, read from the "gene_name" column of the Excel sheet
reactome_genes = pd.read_excel("inputs/reactome genes list (to include).xlsx")
reactome_genes_list = reactome_genes["gene_name"].to_list()

In [20]:
# Keep only the negative pairs whose two genes both belong to the reactome gene list.

# Membership is tested once per gene: use a set so each lookup is O(1) instead of
# scanning the whole list.
reactome_genes_set = set(reactome_genes_list)

# Pairs with at least one gene outside the reactome list
genes_to_remove = [
    pair for pair in negative_samples
    if (pair[0] not in reactome_genes_set) or (pair[1] not in reactome_genes_set)
]

negative_samples_filtered = set(negative_samples) - set(genes_to_remove)

print(f"Negative sample pairs after filtering: {len(negative_samples_filtered)}")

Negative sample pairs after filtering: 1785


In [21]:
# Write the filtered negative pairs to CSV, one "gene1,gene2" row per pair.
# The pairs live in a set, whose iteration order is not stable between kernel
# runs; sorting makes the output file reproducible.
with open("inputs/slkb_negative_samples_stricter.csv", "w") as f:
    f.write("gene1,gene2\n")
    for pair in sorted(negative_samples_filtered):
        f.write(f"{pair[0]},{pair[1]}\n")