In [59]:
import numpy as np
import pandas as pd
from ast import literal_eval

In [60]:
# dataset_path = '../../data/benchmark/lt2_windows__cvs_gt2.csv'
dataset_path = '../../data/benchmark/multiple_windows.csv'

n_preds_path = '../../data/benchmark/preds/n_bilstm_mw_.npy'
c_preds_path = '../../data/benchmark/preds/c_bilstm_mw.npy'

In [61]:
data_df = pd.read_csv(dataset_path)
data_df.head()

Unnamed: 0,protein,cleavages
0,AAADSAQWLSVKEETIFLHDGLIRVTDLAELPSEILGAPEAADTDL...,"[(52, 63), (17, 25), (52, 62)]"
1,ACAWLEAQEEEEVGFPVRPQVPLRPMTYKAAVDLSHFLKEKGGLEG...,"[(29, 37), (141, 150), (141, 149)]"
2,AFAQRALSDSLSRRLRRHVPTHQSRRRSLGHLSPTARRACEDAIRC...,"[(108, 116), (92, 100), (103, 112), (74, 82), ..."
3,AWRRRRSGTSGKATWWCSGLRRASPTPSRRVQSWATAVMWKPSPSS...,"[(224, 232), (225, 233), (219, 232), (65, 73)]"
4,DHVASCGVNLYQFYGPSGQFTHEFDGDEQFYVDLEKKETAWRWPEF...,"[(38, 46), (22, 31), (22, 32), (41, 49), (22, ..."


In [62]:
n_preds = np.load(n_preds_path, allow_pickle=True)
c_preds = np.load(c_preds_path, allow_pickle=True)

n_preds.shape, c_preds.shape

((867,), (867,))

In [63]:
cleavages = data_df['cleavages'].apply(literal_eval).values
seq_lens = data_df['protein'].apply(len).values

In [64]:
n_targets = [np.zeros(seq_len + 1) for seq_len in seq_lens]
c_targets = [np.zeros(seq_len + 1) for seq_len in seq_lens]

for i, cleavage in enumerate(cleavages):
    for n, c in cleavage:
        n_targets[i][n - 1] = 1
        c_targets[i][c - 1] = 1

In [65]:
n_preds = [pred > 0 for pred in n_preds]
c_preds = [pred > 0 for pred in c_preds]

In [66]:
n_preds[0], n_targets[0]

(array([False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False,  True, False, False, False, False, False,
        False, False, False, False, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [67]:
n_preds[0] - n_targets[0]

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.])

In [68]:
c_preds_concat = np.concatenate(c_preds)
n_preds_concat = np.concatenate(n_preds)

c_targets_concat = np.concatenate(c_targets)
n_targets_concat = np.concatenate(n_targets)

c_preds_concat.shape, n_preds_concat.shape, c_targets_concat.shape, n_targets_concat.shape

((397783,), (397783,), (397783,), (397783,))

In [69]:
from sklearn import metrics

In [70]:
fpr_c, tpr_c, thresholds_c = metrics.roc_curve(c_targets_concat, c_preds_concat)
fpr_n, tpr_n, thresholds_n = metrics.roc_curve(n_targets_concat, n_preds_concat)

roc_auc_c = metrics.auc(fpr_c, tpr_c)
roc_auc_n = metrics.auc(fpr_n, tpr_n)

print(f"C - Terminus ROC AUC: {roc_auc_c}")
print(f"N - Terminus ROC AUC: {roc_auc_n}")

C - Terminus ROC AUC: 0.48646361675011546
N - Terminus ROC AUC: 0.5681448335807696


In [71]:
print(f"C Terminus Precission: {metrics.precision_score(c_targets_concat, c_preds_concat)}")
print(f"N Terminus Precission: {metrics.precision_score(n_targets_concat, n_preds_concat)}")

C Terminus Precission: 0.006300027391423441
N Terminus Precission: 0.030858369098712448


In [72]:
print(f"C Terminus Recall: {metrics.recall_score(c_targets_concat, c_preds_concat)}")
print(f"N Terminus Recall: {metrics.recall_score(n_targets_concat, n_preds_concat)}")

C Terminus Recall: 0.05578011317704123
N Terminus Recall: 0.19359181475498116


In [73]:
print(f"C Terminus F1: {metrics.f1_score(c_targets_concat, c_preds_concat)}")
print(f"N Terminus F1: {metrics.f1_score(n_targets_concat, n_preds_concat)}")

C Terminus F1: 0.011321373878801137
N Terminus F1: 0.053231657658991634


In [74]:
for i in range(10):
    pos_n_preds_idx = np.where(n_preds[i])[0]
    pos_c_preds_idx = np.where(c_preds[i])[0]
    pos_n_targets_idx = np.where(n_targets[i])[0]
    pos_c_targets_idx = np.where(c_targets[i])[0]

    print(f"Protein {i}")
    print(f"Positive N preds: {pos_n_preds_idx}")
    print(f"Positive N targets: {pos_n_targets_idx}")
    print()
    print(f"Positive C preds: {pos_c_preds_idx}")
    print(f"Positive C targets: {pos_c_targets_idx}")
    print()

Protein 0
Positive N preds: [ 1 28 75 86]
Positive N targets: [16 51]

Positive C preds: [ 8 18 25 28 51 59 63 74 80 89 97]
Positive C targets: [24 61 62]

Protein 1
Positive N preds: [ 10  14  51  60  74  79  90 105 129]
Positive N targets: [ 28 140]

Positive C preds: [ 15  21  28  38  47  52  60  67  74  82  88 103 111 117 128 130 138 145
 150]
Positive C targets: [ 36 148 149]

Protein 2
Positive N preds: [  5  15  41  70 158 172 195 216 230 245 263 268 289 309 349 403 407 412
 415 433 447 458 465 478 485 500 518 534 538 540 560 575]
Positive N targets: [ 73  91 102 107 309]

Positive C preds: [  2  11  25  58  70  81  92 102 112 121 142 152 177 184 191 203 248 260
 281 284 289 297 311 338 362 372 379 388 391 415 426 432 445 450 460 466
 476 488 503 527 537 573 584]
Positive C targets: [ 81  99 111 115 319]

Protein 3
Positive N preds: [ 20  23  46  74  87 103 119 135 151 201 218 223 250]
Positive N targets: [ 64 218 223 224]

Positive C preds: [  2  20  34  40  51  63  73  88 104 