In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, accuracy_score, recall_score, precision_score

# Let wide DataFrames render up to 200 columns before pandas truncates the display.
pd.set_option('display.max_columns', 200)

In [2]:
# Supplementary Table 6 from https://www.nature.com/articles/s41564-021-00958-0#Sec24 Baggen et al. 2021
# Direct download: https://static-content.springer.com/esm/art%3A10.1038%2Fs41564-021-00958-0/MediaObjects/41564_2021_958_MOESM3_ESM.xlsx
#
# Build the per-screen hit table in a single pipeline:
#   1. read columns F:AT, taking the header from the fourth spreadsheet row (header=3);
#   2. strip stray whitespace from the reference names so they group cleanly;
#   3. keep only rows that are not 'Functional validation' assays and that
#      carry a reference name.
screens = (
    pd.read_excel('../../data/41564_2021_958_MOESM3_ESM.xlsx', header=3, usecols="F:AT")
    .assign(Reference=lambda table: table['Reference'].str.strip())
    .loc[lambda table: (table['Assay type'] != 'Functional validation') & table['Reference'].notna()]
    .copy()
)

In [3]:
# Load the same supplementary table again to derive per-gene labels, keeping one
# row per (Reference, Assay type, Gene name, Uniprot accession) combination.
# NOTE(review): unlike `screens`, rows with a missing Reference or with assay type
# 'Functional validation' are NOT filtered out here - confirm this is intended,
# because every remaining row contributes an entry to a gene's reference list.
labels = (
    pd.read_excel('../../data/41564_2021_958_MOESM3_ESM.xlsx', header=3, usecols="F:AT")
    .assign(Reference=lambda table: table['Reference'].str.strip())
    .drop_duplicates(subset=['Reference', 'Assay type', 'Gene name', 'Uniprot accession'])
)

labels_by_gene = labels.groupby('Gene name')

# One row per gene with the list of references of all its (deduplicated) rows.
list_of_references = labels_by_gene['Reference'].apply(list).reset_index()

# One row per gene with the number of non-null 'Functionally validated by authors' entries.
validated = labels_by_gene['Functionally validated by authors'].count().reset_index()

In [4]:
# Load the table stored in the pickle; later cells rely on its 'gene' column.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
data = pd.read_pickle('../../data/all_with_candidates.pickle')

In [5]:
# Add one 0/1 indicator column per reference (named after the reference):
# 1 when the row's gene appears among that reference's 'Gene name' values.
hit_genes_by_reference = screens.groupby('Reference')['Gene name']
for reference_name, hit_genes in hit_genes_by_reference:
    is_hit = data['gene'].isin(hit_genes)
    data[str(reference_name)] = is_hit.astype(int)

In [6]:
# Attach the per-gene reference lists and validation counts to `data`.
# Both are left joins, so genes absent from the label table keep their rows
# and receive NaN in the merged columns.
data = (
    data
    .merge(list_of_references, how='left', left_on='gene', right_on='Gene name')
    .merge(validated, how='left', on='Gene name')
)

In [7]:
# Leave-one-reference-out evaluation.
#
# For each reference in `screens`, that reference's 0/1 hit column is scored as a
# prediction against labels built from everything *except* that reference:
#   * positive  - the gene keeps at least MIN_SUPPORTING_REFS reference entries after
#                 removing the evaluated reference, or has at least one
#                 'Functionally validated by authors' entry;
#   * candidate - the gene keeps between 1 and MIN_SUPPORTING_REFS - 1 entries and is
#                 not validated; these rows are excluded from scoring;
#   * negative  - every other row.
MIN_SUPPORTING_REFS = 3

# Number of label rows per reference. Not used in this cell; kept in case later cells read it.
ref_to_counts = labels['Reference'].value_counts().to_dict()

# Loop-invariant preparation: genes without any label row get NaN from the left
# merge, which is treated as "zero validations". Done once instead of per reference.
eval_base = data.copy()
eval_base['Functionally validated by authors'] = eval_base['Functionally validated by authors'].fillna(0)

per_reference_metrics = []
for ref in screens['Reference'].unique():
    test_data = eval_base.copy()

    # Drop the evaluated reference from each gene's list (non-list values, i.e. genes
    # without label rows, are passed through unchanged and counted as 0).
    test_data['Reference_without_ref'] = test_data['Reference'].apply(
        lambda refs: [item for item in refs if item != ref] if isinstance(refs, list) else refs
    )
    test_data['count_without_ref'] = test_data['Reference_without_ref'].apply(
        lambda refs: len(refs) if isinstance(refs, list) else 0
    )

    is_validated = test_data['Functionally validated by authors'] >= 1
    has_enough_refs = test_data['count_without_ref'] >= MIN_SUPPORTING_REFS
    test_data['label_without_ref'] = (has_enough_refs | is_validated).astype(float)

    test_data['candidates'] = (
        (test_data['count_without_ref'] > 0)
        & (test_data['count_without_ref'] < MIN_SUPPORTING_REFS)
        & (test_data['Functionally validated by authors'] < 1)
    )

    # .copy() makes the filtered frame independent, so the column assignment below
    # does not write into a slice (avoids pandas' SettingWithCopyWarning).
    test_data = test_data[~test_data['candidates']].copy()
    test_data[ref] = test_data[ref].astype(float)

    y_true = test_data['label_without_ref'].values
    y_pred = test_data[ref].values
    per_reference_metrics.append({'Reference': ref,
                                  'F1-score': f1_score(y_true, y_pred),
                                  'Precision': precision_score(y_true, y_pred),
                                  'Recall': recall_score(y_true, y_pred),
                                  })

results = pd.DataFrame(per_reference_metrics).sort_values('F1-score')
display(results.round(2))
results.describe().round(2)

Unnamed: 0,Reference,F1-score,Precision,Recall
17,Baggen et al.,0.02,0.41,0.01
13,Wang et al.,0.02,0.6,0.01
14,Wei et al.,0.04,0.78,0.02
9,Biering et al.,0.04,0.2,0.02
11,Hoffman et al.,0.04,1.0,0.02
15,Zhu et al.,0.05,0.96,0.02
16,Daniloski et al.,0.06,0.53,0.03
10,Rebendenne et al.,0.06,0.33,0.03
12,Schneider et al.,0.06,0.44,0.04
18,Schmidt et al.,0.11,0.88,0.06


Unnamed: 0,F1-score,Precision,Recall
count,21.0,21.0,21.0
mean,0.13,0.6,0.11
std,0.11,0.25,0.13
min,0.02,0.2,0.01
25%,0.05,0.41,0.02
50%,0.11,0.6,0.06
75%,0.21,0.79,0.13
max,0.4,1.0,0.52
