In [258]:
import pandas as pd

# Excel workbook with the CRISPR DepMap analysis results (path relative to the working directory).
hits_excel_path = 'Supplementary_Table_11_CRISPR_DepMap_analysis.xlsx'

# sheet_name=None reads every sheet at once and returns a dict of {sheet name: DataFrame}.
prediction_data = pd.read_excel(io=hits_excel_path, sheet_name=None)

In [259]:
# Structure of prediction_data: it is a dictionary with sheet names as keys and the corresponding DataFrames as values.
# Here we preview the first rows of the 'SMARCA4' sheet.

prediction_data['SMARCA4'].head()

Unnamed: 0,mutant,gene,num_lines_high,num_lines_low,estimate,p_adj
0,SMARCA4,CDAN1,97,98,-0.48575,4.039878e-09
1,SMARCA4,CDIN1,97,98,-0.224043,1.315044e-08
2,SMARCA4,MICOS10,97,98,-0.255113,1.518594e-06
3,SMARCA4,CBFB,97,98,-0.196786,1.982028e-06
4,SMARCA4,SDHD,97,98,-0.254965,3.671643e-06


In [278]:
# Shape of the 'SMARCA4' DataFrame: it contains thousands of tested pairs.
# We want to keep only the pairs with a sufficiently low (i.e. significant) p_adj value.

prediction_data['SMARCA4'].shape

(10052, 6)

In [262]:
# We first get the full list of genes that were tested for synthetic lethality in our study.
# The genes are the keys (sheet names) of prediction_data. They must be read from there and not
# from hits_data, which is only filled in a later cell and does not exist yet on a fresh kernel.

tested_genes = prediction_data.keys()
tested_genes_list = list(tested_genes)

print(tested_genes_list)

['ARID1A', 'ARID1B', 'ARID2', 'PBRM1', 'SMARCA2', 'SMARCA4', 'SMARCB1', 'BAP1', 'CREBBP', 'EED', 'KMT2C', 'KMT2D', 'SETD2']


In [277]:
# Now we keep, for every tested gene, only the gene pairs whose adjusted p-value is low enough
# to count as a hit. Rows that do not satisfy p_adj < P_ADJ_THRESHOLD are dropped.

P_ADJ_THRESHOLD = 0.001

# hits_data mirrors prediction_data: {gene: DataFrame restricted to the significant pairs}.
# The dictionary is created explicitly here so the cell does not rely on leftover kernel state.
hits_data = {}
for gene, gene_predictions in prediction_data.items():
    hits_data[gene] = gene_predictions[gene_predictions['p_adj'] < P_ADJ_THRESHOLD]

# We get a much smaller proportion of the initial data (only 105 lines), which is expected.

hits_data['SMARCA4'].shape

(105, 6)

In [267]:
# Load the gene pairs listed in the SynLethDB dataset: this is the benchmark our predictions are compared to.
gene_sl_gene_df = pd.read_csv('gene_sl_gene.tsv', sep='\t')

# Only the two gene-name columns and the rel_source column are used afterwards.
gene_sl_gene_df2 = gene_sl_gene_df[['x_name', 'y_name', 'rel_source']]

# Keep a pair as soon as one of its two genes belongs to the tested genes.
x_is_tested = gene_sl_gene_df['x_name'].isin(tested_genes_list)
y_is_tested = gene_sl_gene_df['y_name'].isin(tested_genes_list)
positive_bm_df = gene_sl_gene_df2[x_is_tested | y_is_tested]

positive_bm_df.head()

Unnamed: 0,x_name,y_name,rel_source
173,PTTG1,CREBBP,Text Mining
346,ARID1A,MMP19,Computational Prediction
422,CDH1,SMARCA4,High Throughput
536,ARID2,CDK9,High Throughput
759,KMT2D,MYH4,Computational Prediction


In [268]:
# Now, we demonstrate our pipeline with SMARCA4 as the gene of interest

tested_gene = 'SMARCA4'

# We will store all SLI pairs as pandas DataFrames within dictionaries keyed by gene,
# similar to how our hits_data is structured.
# The two dictionaries must be separate objects: a chained assignment such as
# `known_sli = identified_sli = {}` binds both names to the SAME dict, so storing the
# identified pairs for a gene would silently overwrite the known pairs of that gene.
known_sli = {}
identified_sli = {}

# First, we get the subset of known SLIs from SynLethDB where either 'x_name' or 'y_name' is the tested_gene
known_sli[tested_gene] = positive_bm_df[
    (positive_bm_df['x_name'] == tested_gene) | (positive_bm_df['y_name'] == tested_gene)
].copy()  # Independent copy, so later edits never write into a slice of positive_bm_df

known_sli[tested_gene]

Unnamed: 0,x_name,y_name,rel_source
422,CDH1,SMARCA4,High Throughput
1420,SMARCA4,CDK6,Text Mining
2018,HDAC2,SMARCA4,High Throughput;CRISPR/CRISPRi
2643,SETD2,SMARCA4,CRISPR/CRISPRi
3602,SMARCA2,SMARCA4,Low Throughput;High Throughput;RNAi Screen
4639,SMARCA4,VHL,High Throughput
5074,CDK6,SMARCA4,High Throughput
7634,RRM2,SMARCA4,High Throughput
8328,MAX,SMARCA4,Computational Prediction
8435,SMARCA4,TSC1,High Throughput


In [269]:
# Then, we make sure the database is properly structured for our analysis, distinguishing between the "mutant" gene of interest and the other gene in the pair

# known_sli_tidy maps each gene to its tidied table. It is created here explicitly so that
# the item assignment below does not depend on leftover kernel state.
known_sli_tidy = {}

# Work on a copy so that known_sli[tested_gene] keeps its original x_name / y_name layout
known_sli_tidy[tested_gene] = known_sli[tested_gene].copy()

# Rows in which tested_gene appears in the y_name column
mask = known_sli_tidy[tested_gene]['y_name'] == tested_gene

if mask.any():
    # Swap x_name and y_name on those rows so that tested_gene always ends up in the first column.
    # .values strips the column labels, so the assignment is positional rather than label-aligned.
    known_sli_tidy[tested_gene].loc[mask, ['x_name', 'y_name']] = known_sli_tidy[tested_gene].loc[mask, ['y_name', 'x_name']].values

# Rename the columns to match the naming used in the prediction tables
known_sli_tidy[tested_gene].columns = ['mutant', 'gene', 'source']

known_sli_tidy[tested_gene]

Unnamed: 0,mutant,gene,source
422,SMARCA4,CDH1,High Throughput
1420,SMARCA4,CDK6,Text Mining
2018,SMARCA4,HDAC2,High Throughput;CRISPR/CRISPRi
2643,SMARCA4,SETD2,CRISPR/CRISPRi
3602,SMARCA4,SMARCA2,Low Throughput;High Throughput;RNAi Screen
4639,SMARCA4,VHL,High Throughput
5074,SMARCA4,CDK6,High Throughput
7634,SMARCA4,RRM2,High Throughput
8328,SMARCA4,MAX,Computational Prediction
8435,SMARCA4,TSC1,High Throughput


In [270]:
# Finally, we measure which share of the known SLIs extracted from SynLethDB also appears among our predicted hits

known_for_gene = known_sli_tidy[tested_gene]

# Partner genes predicted as hits for the gene of interest
pred_sli_list = hits_data[tested_gene]['gene'].tolist()

# Known pairs whose partner gene is one of our predictions
is_predicted = known_for_gene['gene'].isin(pred_sli_list)
identified_sli[tested_gene] = known_for_gene[is_predicted]

identified_ratio = len(identified_sli[tested_gene]) / len(known_for_gene)
print(f'{tested_gene} identified ratio: {identified_ratio}')

SMARCA4 identified ratio: 0.0


In [287]:
# For reference, these are the proportions of SLI pairs included in SynLethDB that were tested by our predictor

# Every result gets its own dictionary keyed by gene. A chained assignment such as
# `known_sli = studied_sli_list = {}` would make both names share ONE dict, so storing the list of
# studied genes would overwrite the known-SLI DataFrame of the same gene.
known_sli = {}
known_sli_tidy = {}
studied_sli_list = {}
included_sli = {}

for tested_gene in tested_genes_list:
    # Get the subset of the DataFrame where either 'x_name' or 'y_name' is the tested_gene
    known_sli[tested_gene] = positive_bm_df[
        (positive_bm_df['x_name'] == tested_gene) | (positive_bm_df['y_name'] == tested_gene)
    ].copy()  # Independent copy, so later edits never write into a slice of positive_bm_df

    # Work on a copy so that known_sli[tested_gene] keeps its original x_name / y_name layout
    known_sli_tidy[tested_gene] = known_sli[tested_gene].copy()

    # Rows in which tested_gene appears in the y_name column
    mask = known_sli_tidy[tested_gene]['y_name'] == tested_gene

    if mask.any():
        # Swap x_name and y_name on those rows so that tested_gene always ends up in the first column.
        # .values strips the column labels, so the assignment is positional rather than label-aligned.
        known_sli_tidy[tested_gene].loc[mask, ['x_name', 'y_name']] = known_sli_tidy[tested_gene].loc[mask, ['y_name', 'x_name']].values

    # Rename the columns to match the naming used in the prediction tables
    known_sli_tidy[tested_gene].columns = ['mutant', 'gene', 'source']

    # All partner genes that were tested for this gene, significant or not
    studied_sli_list[tested_gene] = prediction_data[tested_gene]['gene'].to_list()
    included_sli[tested_gene] = known_sli_tidy[tested_gene][known_sli_tidy[tested_gene]['gene'].isin(studied_sli_list[tested_gene])]

    # A gene without any known pair has no meaningful ratio: report NaN instead of dividing by zero
    n_known = known_sli_tidy[tested_gene].shape[0]
    included_ratio = included_sli[tested_gene].shape[0] / n_known if n_known else float('nan')
    print(f'{tested_gene} studied ratio: {included_ratio}')

ARID1A studied ratio: 0.88
ARID1B studied ratio: 0.75
ARID2 studied ratio: 0.9411764705882353
PBRM1 studied ratio: 0.9166666666666666
SMARCA2 studied ratio: 1.0
SMARCA4 studied ratio: 0.9393939393939394
SMARCB1 studied ratio: 0.9615384615384616
BAP1 studied ratio: 0.75
CREBBP studied ratio: 0.8461538461538461
EED studied ratio: 1.0
KMT2C studied ratio: 0.9
KMT2D studied ratio: 0.9333333333333333
SETD2 studied ratio: 1.0


In [291]:
# Putting it all together in a loop for all tested genes

# Two separate dictionaries keyed by gene. With `pred_sli_list = identified_sli = {}` both names
# would point to the same dict, and each prediction list would be overwritten by the DataFrame
# of identified pairs stored right after it.
pred_sli_list = {}
identified_sli = {}

for tested_gene in tested_genes_list:
    # Partner genes predicted as hits for this gene
    pred_sli_list[tested_gene] = hits_data[tested_gene]['gene'].to_list()

    # Known pairs that were tested AND predicted as hits
    included_for_gene = included_sli[tested_gene]
    identified_sli[tested_gene] = included_for_gene[included_for_gene['gene'].isin(pred_sli_list[tested_gene])]

    # A gene without any tested known pair has no meaningful ratio: report NaN instead of dividing by zero
    n_included = included_for_gene.shape[0]
    identified_ratio = identified_sli[tested_gene].shape[0] / n_included if n_included else float('nan')
    print(f'{tested_gene} identified ratio: {identified_ratio}')


ARID1A identified ratio: 0.0
ARID1B identified ratio: 0.0
ARID2 identified ratio: 0.0
PBRM1 identified ratio: 0.09090909090909091
SMARCA2 identified ratio: 0.125
SMARCA4 identified ratio: 0.0
SMARCB1 identified ratio: 0.0
BAP1 identified ratio: 0.0
CREBBP identified ratio: 0.0
EED identified ratio: 0.0
KMT2C identified ratio: 0.0
KMT2D identified ratio: 0.0
SETD2 identified ratio: 0.0
