# Generating a negative sample dataset

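Here, we generate a dataset of high-confidence ("sure") negative samples, i.e. gene pairs that consistently score as non-synthetic-lethal across several SLKB scoring systems, so that we can properly benchmark our model.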

In [2]:
import os
import pandas as pd
import gdown
import numpy as np

In [3]:
# Fetch the SLKB input files this notebook reads, skipping any that are already on disk.
# NOTE(review): these are slkb.osubmi.org session download URLs, not Google Drive links;
# the session id embedded in each URL looks session-specific and may expire — confirm
# before relying on a fresh download.

file_urls = {
    # SLKB files, obtained from slkb.osubmi.org
    "inputs/SLKB_predictions.xlsx": "https://slkb.osubmi.org/session/09c5c4738990db9810156682c87de6b8/download/download_data-predSL?w=",
    "inputs/SLKB_calculated_scores.csv": "https://slkb.osubmi.org/session/ba83f6fcd5e34694819567d3091d7dce/download/download_data-calcSL?w=",
}

# The target directory must exist before anything is downloaded into it
os.makedirs("inputs", exist_ok=True)

# Only fetch what is not present locally
for filepath, file_url in file_urls.items():
    if os.path.exists(filepath):
        continue
    print(f"Downloading {filepath}...")
    gdown.download(file_url, filepath, quiet=False)

In [4]:
# SLKB calculated-scores table: scores from 5 different scoring systems, among 22 different cell lines.
#
# Downloaded on 04/04/2025 from https://slkb.osubmi.org/
#
# The combined b/nb columns follow the SLKB paper, quoting:
# "Normally, Median-B and sgRNA-B scores are used in the Venn diagram, but Median-NB and sgRNA-NB
#  scores are used if a study has no available dual-targeting controls."
# i.e. the "-B" score is used when present and the "-NB" score fills the gaps.
slkb_calculated_scores = (
    pd.read_csv('inputs/SLKB_calculated_scores.csv')
    .assign(
        median_b_nb_score=lambda df: df["median_b_score_Z_SL_score"].fillna(
            df["median_nb_score_Z_SL_score"]
        ),
        sgrna_derived_b_nb_score=lambda df: df["sgrna_derived_b_score_SL_score"].fillna(
            df["sgrna_derived_nb_score_SL_score"]
        ),
    )
    .drop(columns=[
        # Standard errors are not used downstream
        "median_b_score_standard_error",
        "median_nb_score_standard_error",
        "mageck_score_standard_error",
        "horlbeck_score_standard_error",
        # Score variants that are not used downstream
        "median_nb_score_SL_score",
        "median_b_score_SL_score",
        "mageck_score_SL_score",
        "gemini_score_SL_score_SensitiveLethality",
        "gemini_score_SL_score_SensitiveRecovery",
    ])
)

print("Total number of gene pairs in SLKB database: ",len(slkb_calculated_scores))

# Preview: first four CREBBP pairs, ordered by partner gene
slkb_calculated_scores.loc[slkb_calculated_scores["gene_1"] == "CREBBP"].sort_values("gene_2").head(4)

Total number of gene pairs in SLKB database:  261958


Unnamed: 0.1,Unnamed: 0,gene_1,gene_2,study_origin,cell_line_origin,gemini_score_SL_score_Strong,horlbeck_score_SL_score,mageck_score_Z_SL_score,median_b_score_Z_SL_score,median_nb_score_Z_SL_score,sgrna_derived_b_score_SL_score,sgrna_derived_nb_score_SL_score,median_b_nb_score,sgrna_derived_b_nb_score
258188,258189,CREBBP,DNMT1,26864203,OVCAR8,-0.227918,-2.074034,1.313669,0.212457,0.214889,0.622403,1.100861,0.212457,0.622403
258236,258237,CREBBP,DNMT3A,26864203,OVCAR8,-0.20583,-1.100728,-1.29844,-0.350839,-0.347878,-2.77521,-2.491147,-0.350839,-2.77521
258283,258284,CREBBP,DNMT3B,26864203,OVCAR8,-0.119662,-2.172712,-0.617945,-1.793505,-1.791938,-1.374846,-1.275087,-1.793505,-1.374846
258329,258330,CREBBP,DNMT3L,26864203,OVCAR8,-0.263856,0.755723,-0.981394,-0.396679,-0.394356,-1.527794,-1.271348,-0.396679,-1.527794


In [None]:
def get_pairs_worse_than_threshold(threshold=10, scores=None):
    """Flag, per study and per scoring system, the gene pairs in the "worst" tail.

    For every scoring column and every ``study_origin`` value, a percentile cutoff
    is computed from that study's non-missing scores. Each row then receives:

    * ``1``   if its score lies in the flagged tail,
    * ``0``   if it has a score that is outside the tail,
    * ``NaN`` if it has no score for that system.

    Which tail is flagged depends on the scoring system: for the GEMINI columns
    the lowest ``threshold`` percent is flagged (score <= cutoff); for every other
    column the highest ``threshold`` percent is flagged (score >= cutoff).
    NOTE(review): presumably this is because a higher GEMINI score means a stronger
    SL signal whereas the other systems use lower-is-stronger — confirm against the
    SLKB documentation.

    Parameters
    ----------
    threshold : float, default 10
        Size of the flagged tail, in percent (0-100).
    scores : pandas.DataFrame, optional
        Table with ``gene_1``, ``gene_2``, ``study_origin``, ``cell_line_origin``
        and one column per scoring system. Defaults to the notebook-level
        ``slkb_calculated_scores``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene1``, ``gene2``, ``cell_line_origin``, one 0/1/NaN column per
        retained scoring system, and ``total_count`` (int): the number of scoring
        systems that flagged the row.
    """
    if scores is None:
        scores = slkb_calculated_scores

    # Scoring systems for which the LOW tail is flagged; all others use the HIGH tail.
    low_tail_systems = {
        "gemini_score_SL_score_SensitiveLethality",
        "gemini_score_SL_score_SensitiveRecovery",
        "gemini_score_SL_score_Strong",
    }

    # Select scoring columns by name rather than by position: a positional slice
    # silently shifts when the CSV carries a different number of leftover
    # "Unnamed: N" index columns, pulling identifier columns into the score list.
    identifier_columns = {"gene_1", "gene_2", "study_origin", "cell_line_origin"}
    scoring_systems_list = [
        column
        for column in scores.columns
        if column not in identifier_columns and not str(column).startswith("Unnamed")
    ]

    calc_slkb_predictions = (
        scores[["gene_1", "gene_2", "cell_line_origin"]]
        .rename(columns={"gene_1": "gene1", "gene_2": "gene2"})
        .copy()
    )

    # Row membership of each study does not depend on the scoring system,
    # so build the masks once.
    study_column = scores["study_origin"]
    study_masks = [(study_column == study).to_numpy() for study in study_column.unique()]

    for scoring_system in scoring_systems_list:
        values = scores[scoring_system].to_numpy(dtype=float)
        has_value = ~np.isnan(values)
        flags = np.full(len(scores), np.nan)  # NaN = no score available

        for in_study in study_masks:
            rows = in_study & has_value
            if not rows.any():
                # This study has no data for this scoring system
                continue

            study_values = values[rows]
            if scoring_system in low_tail_systems:
                cutoff = np.percentile(study_values, threshold)
                flags[rows] = (study_values <= cutoff).astype(float)
            else:
                cutoff = np.percentile(study_values, 100 - threshold)
                flags[rows] = (study_values >= cutoff).astype(float)

        calc_slkb_predictions[scoring_system] = flags

    # The separate b / nb columns are superseded by the combined b_nb columns,
    # so they must not be counted a second time in total_count.
    calc_slkb_predictions = calc_slkb_predictions.drop(
        columns=[
            "median_b_score_Z_SL_score",
            "median_nb_score_Z_SL_score",
            "sgrna_derived_b_score_SL_score",
            "sgrna_derived_nb_score_SL_score",
        ],
        errors="ignore",
    )

    # Number of scoring systems that flagged each row (missing scores count as 0)
    scoring_columns = [
        col for col in calc_slkb_predictions.columns
        if col not in ("gene1", "gene2", "cell_line_origin")
    ]
    calc_slkb_predictions["total_count"] = (
        calc_slkb_predictions[scoring_columns].fillna(0).sum(axis=1).astype(int)
    )

    return calc_slkb_predictions

# Flag, per study and scoring system, the pairs in the worst-scoring 10% tail
calc_slkb_predictions = get_pairs_worse_than_threshold(threshold=10)

# Preview results for a specific gene: AKT1 pairs ordered by partner gene, most-flagged first
(
    calc_slkb_predictions
    .loc[lambda df: df["gene1"] == "AKT1"]
    .sort_values(["gene2", "total_count"], ascending=[True, False])
    .head(20)
)

In [None]:
def get_filtered_nonsli_list(calc_slkb_predictions, threshold):
    """Reduce the flagged-pair table to the final list of negative (non-SLi) pairs.

    Steps, each followed by a printed row count:

    1. keep rows flagged by at least three scoring systems (``total_count >= 3``);
    2. collapse to one row per ``(gene1, gene2)``;
    3. keep pairs whose two genes both appear in the reactome gene list
       (``inputs/reactome genes list (to include).xlsx``, column ``gene_name``);
    4. drop pairs that appear, in either gene order, in any sheet of
       ``inputs/SLKB_predictions.xlsx``.

    Parameters
    ----------
    calc_slkb_predictions : pandas.DataFrame
        Needs ``gene1``, ``gene2`` and ``total_count`` columns.
    threshold : float
        Only used in the first printed message; it does not affect the filtering.

    Returns
    -------
    list of pandas.Series
        One row per retained pair; positions 0 and 1 hold ``gene1`` and ``gene2``.
    """
    # Step 1: rows flagged by three or more scoring systems
    flagged = calc_slkb_predictions.loc[calc_slkb_predictions["total_count"] >= 3]
    print(f"Number of unique pairs found in the bottom {threshold}% of three scoring systems, for all studies and cell lines : ",len(flagged))

    # Step 2: one row per (gene1, gene2).
    # NOTE(review): this de-duplication is order-sensitive, so (A, B) and (B, A) stay
    # separate here even though step 4 compares pairs order-insensitively — confirm
    # that this is intended.
    unique_pairs = flagged.groupby(["gene1", "gene2"]).first().reset_index()
    print("Number of unique nonSLi pairs meeting the previous criteria : ", len(unique_pairs))

    # Step 3: both genes must belong to the reactome gene list
    reactome_genes_list = pd.read_excel("inputs/reactome genes list (to include).xlsx")["gene_name"].tolist()
    both_in_reactome = (
        unique_pairs["gene1"].isin(reactome_genes_list)
        & unique_pairs["gene2"].isin(reactome_genes_list)
    )
    reactome_pairs = unique_pairs[both_in_reactome]
    print("Number of unique pairs after removing non-reactome genes ",len(reactome_pairs))

    # Step 4: remove gene pairs that were found in the top 10% of any score in any
    # study and cell line (every sheet of the SLKB predictions workbook)
    sheets = pd.read_excel("inputs/SLKB_predictions.xlsx", sheet_name=None)
    slkb_predictions = pd.concat(
        [sheet.assign(cell_line=sheet_name) for sheet_name, sheet in sheets.items()],
        ignore_index=True,
    )
    slkb_predictions[["gene1", "gene2"]] = slkb_predictions["gene_pair"].str.split('|', expand=True)
    slkb_predictions.drop(columns="gene_pair", inplace=True)

    # Order-insensitive keys of every predicted pair
    slkb_predictions_set = {
        tuple(sorted(pair))
        for pair in zip(slkb_predictions["gene1"], slkb_predictions["gene2"])
    }

    # Keep the rows whose order-insensitive key was never predicted
    filtered_calc_slkb_predictions_list = [
        row
        for _, row in reactome_pairs.iterrows()
        if tuple(sorted((row["gene1"], row["gene2"]))) not in slkb_predictions_set
    ]

    print(f"Number of pairs after removing genes that appeared in the top 10% of any score in any study: {len(filtered_calc_slkb_predictions_list)}")

    return filtered_calc_slkb_predictions_list

# Apply the non-SLi filters to the 10% run; the second argument only labels the printed report
filtered_calc_slkb_predictions_list = get_filtered_nonsli_list(
    calc_slkb_predictions,
    threshold=10,
)

In [34]:
# Persist the retained pairs as a two-column CSV.
# Each list entry is a row whose first two positions are gene1 and gene2.
with open("inputs/slkb_negative_samples_strict.csv", "w") as f:
    pair_lines = [
        f"{pair.iloc[0]},{pair.iloc[1]}\n"
        for pair in filtered_calc_slkb_predictions_list
    ]
    f.writelines(["gene1,gene2\n", *pair_lines])

# Re-running the pipeline with stricter criteria

In [None]:
# Use a single value for both the computation and the printed report.
# Previously the pairs were computed with threshold 5 while the report was labelled
# with 4, so the printed "bottom 4%" did not describe what was actually computed.
# NOTE(review): 5 is kept because it is the value that drives the computation;
# if 4% was the intended cutoff, change it here.
stricter_threshold = 5

calc_slkb_predictions_stricter = get_pairs_worse_than_threshold(stricter_threshold)
filtered_calc_slkb_predictions_list = get_filtered_nonsli_list(calc_slkb_predictions_stricter, stricter_threshold)

Number of unique pairs found in the bottom 4% of three scoring systems, for all studies and cell lines :  785
Number of unique nonSLi pairs meeting the previous criteria :  755
Number of unique pairs after removing non-reactome genes  452
Number of pairs after removing genes that appeared in the top 10% of any score in any study: 317


In [38]:
# Persist the pairs retained by the stricter run as a two-column CSV.
# Each list entry is a row whose first two positions are gene1 and gene2.
with open("inputs/slkb_negative_samples_stricter.csv", "w") as f:
    pair_lines = [
        f"{pair.iloc[0]},{pair.iloc[1]}\n"
        for pair in filtered_calc_slkb_predictions_list
    ]
    f.writelines(["gene1,gene2\n", *pair_lines])

# Re-running with criteria for 1:1 ratio

In [23]:
# Tail size (in percent) used for the 1:1 run.
# NOTE(review): presumably tuned by hand so that the number of negative pairs matches
# the number of positive pairs — confirm and record how 1.6775 was derived.
thresh = 1.6775

calc_slkb_predictions_stricter = get_pairs_worse_than_threshold(threshold=thresh)
filtered_calc_slkb_predictions_list = get_filtered_nonsli_list(
    calc_slkb_predictions=calc_slkb_predictions_stricter,
    threshold=thresh,
)

Number of unique pairs found in the bottom 1.6775% of three scoring systems, for all studies and cell lines :  781
Number of unique nonSLi pairs meeting the previous criteria :  752
Number of unique pairs after removing non-reactome genes  449
Number of pairs after removing genes that appeared in the top 10% of any score in any study: 314


In [24]:
# Persist the pairs retained by the 1:1 run as a two-column CSV.
# Each list entry is a row whose first two positions are gene1 and gene2.
with open("inputs/slkb_negative_samples_1to1.csv", "w") as f:
    pair_lines = [
        f"{pair.iloc[0]},{pair.iloc[1]}\n"
        for pair in filtered_calc_slkb_predictions_list
    ]
    f.writelines(["gene1,gene2\n", *pair_lines])