In [13]:
import pandas as pd
import numpy as np
import math

In [14]:
def calc_metrics(tp, tn, fp, fn):
    """Compute binary-classification metrics from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : int
        Number of true positives, true negatives, false positives and
        false negatives. Python ints and NumPy integer scalars are accepted.

    Returns
    -------
    list
        ``[bacc, p, se, sp, f1, mcc]``: balanced accuracy, precision,
        sensitivity (recall), specificity, F1 score and Matthews correlation
        coefficient. A ratio whose denominator is zero is reported as 0,
        except MCC, which is reported as ``np.nan``.
    """
    # NumPy fixed-width integers wrap around silently on overflow, and the
    # four-term product in the MCC denominator exceeds the int64 range once
    # the counts reach roughly 1e5. Python ints are arbitrary precision, so
    # convert integer inputs up front (non-integer inputs are left untouched).
    tp, tn, fp, fn = (
        int(value) if isinstance(value, (int, np.integer)) else value
        for value in (tp, tn, fp, fn)
    )

    se_total = tp + fn  # all actual positives
    sp_total = fp + tn  # all actual negatives

    # Sensitivity / recall / TPR
    se = 0
    if se_total > 0:
        se = tp / se_total

    # Specificity / TNR
    sp = 0
    if sp_total > 0:
        sp = tn / sp_total

    # Balanced accuracy: (sensitivity + specificity) / 2
    bacc = (se + sp) / 2

    # Precision / PPV
    p_total = tp + fp
    p = 0
    if p_total > 0:
        p = tp / p_total

    # F1 score: harmonic mean of precision and sensitivity
    f1 = 0
    if (p + se) > 0:
        f1 = 2 * ((p * se) / (p + se))

    # MCC is undefined (NaN) when any marginal total is zero
    mcc = np.nan
    d = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    if d > 0:
        mcc = ((tp * tn) - (fp * fn)) / d

    return [bacc, p, se, sp, f1, mcc]

In [15]:
# Dataset: read the two residue tables and stack them into a single frame
df_pos = pd.read_csv('data/binary_pdbs_classes.csv')
df_neg = pd.read_csv('data/binary_pdb_negatives.csv')
df = pd.concat([df_pos, df_neg])

# Drop rows that have no PDB residue id (missing residues)
has_residue_id = df['pdb_residue_id'].notna()
df = df[has_residue_id]
df

  df_pos = pd.read_csv('data/binary_pdbs_classes.csv')
  df_neg = pd.read_csv('data/binary_pdb_negatives.csv')


Unnamed: 0,pdb_id,pdb_chain,seqres_index,pdb_residue_id,REGION,CURATED,RDB1,RDB2,CATEGORY,UNIT_CURATED,UNIT_RDB1,UNIT_RDB2,PDB,classes,classes_names,topologies,topologies_names
2,3u3w,B,3,3.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
3,3u3w,B,4,4.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
4,3u3w,B,5,5.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
5,3u3w,B,6,6.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
6,3u3w,B,7,7.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733045,6b9i,A,284,284.0,,0,0,0,0,0,0,0,6b9iA,,,,
733046,6b9i,A,285,285.0,,0,0,0,0,0,0,0,6b9iA,,,,
733047,6b9i,A,286,286.0,,0,0,0,0,0,0,0,6b9iA,,,,
733048,6b9i,A,287,287.0,,0,0,0,0,0,0,0,6b9iA,,,,


In [16]:
# REPEATSDB-LITE EVALUATION (generate tables)
# Flag every row with its confusion-matrix outcome (TP/TN/FP/FN as 1/0)
# for each predictor column, using CURATED as the reference annotation.
curated_is_pos = df['CURATED'] == 1
curated_is_neg = df['CURATED'] == 0

for predictor, tp_col, tn_col, fp_col, fn_col in (
    ('RDB1', 'TP_1', 'TN_1', 'FP_1', 'FN_1'),
    ('RDB2', 'TP_2', 'TN_2', 'FP_2', 'FN_2'),
):
    agrees = df[predictor] == df['CURATED']
    df[tp_col] = np.where(agrees & curated_is_pos, 1, 0)
    df[tn_col] = np.where(agrees & curated_is_neg, 1, 0)
    df[fp_col] = np.where(~agrees & curated_is_neg, 1, 0)
    df[fn_col] = np.where(~agrees & curated_is_pos, 1, 0)

df

(672384, 1314520, 1284181, 702723)

In [17]:
# Collapse to one row per (pdb_id, pdb_chain): keep the first CATEGORY,
# count the seqres_index entries, and sum the confusion-matrix flags.
outcome_columns = ['TP_1', 'TN_1', 'FP_1', 'FN_1', 'TP_2', 'TN_2', 'FP_2', 'FN_2']
chain_aggregations = {
    'CATEGORY': 'first',
    'seqres_index': 'count',
    **{column: 'sum' for column in outcome_columns},
}
df_prot = df.groupby(['pdb_id', 'pdb_chain']).agg(chain_aggregations).reset_index()
df_prot

Unnamed: 0,pdb_id,pdb_chain,seqres_index,pdb_residue_id,REGION,CURATED,RDB1,RDB2,CATEGORY,UNIT_CURATED,...,topologies,topologies_names,TP_1,TN_1,FP_1,FN_1,TP_2,TN_2,FP_2,FN_2
2,3u3w,B,3,3.0,0,0,1,0,1,0,...,3.3,Alpha-solenoid,0,0,1,0,0,1,0,0
3,3u3w,B,4,4.0,0,0,1,0,1,0,...,3.3,Alpha-solenoid,0,0,1,0,0,1,0,0
4,3u3w,B,5,5.0,0,0,1,0,1,0,...,3.3,Alpha-solenoid,0,0,1,0,0,1,0,0
5,3u3w,B,6,6.0,0,0,1,0,1,0,...,3.3,Alpha-solenoid,0,0,1,0,0,1,0,0
6,3u3w,B,7,7.0,0,0,1,0,1,0,...,3.3,Alpha-solenoid,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733045,6b9i,A,284,284.0,,0,0,0,0,0,...,,,0,1,0,0,0,1,0,0
733046,6b9i,A,285,285.0,,0,0,0,0,0,...,,,0,1,0,0,0,1,0,0
733047,6b9i,A,286,286.0,,0,0,0,0,0,...,,,0,1,0,0,0,1,0,0
733048,6b9i,A,287,287.0,,0,0,0,0,0,...,,,0,1,0,0,0,1,0,0


In [70]:
# Move the chain identifiers (and the residue count) into the index, drop
# CATEGORY, and relabel the eight count columns as (method, outcome) pairs.
df_prot = (
    df_prot
    .set_index(['pdb_id', 'pdb_chain', 'seqres_index'])
    .drop(columns=['CATEGORY'])
)
df_prot.columns = pd.MultiIndex.from_product([['RDB1', 'RDB2'], ['TP', 'TN', 'FP', 'FN']])
df_prot

Unnamed: 0,pdb_id,pdb_chain,CATEGORY,seqres_index,TP_1,TN_1,FP_1,FN_1,TP_2,TN_2,FP_2,FN_2
0,102l,A,0,163,0,71,92,0,0,49,114,0
1,13pk,A,0,415,0,346,69,0,0,217,198,0
2,1a0t,P,1,413,0,2,0,411,353,2,0,58
3,1a0t,Q,1,413,256,0,2,155,192,2,0,219
4,1a12,A,1,401,357,18,0,26,357,6,12,26
...,...,...,...,...,...,...,...,...,...,...,...,...
5252,7w3h,a,1,373,0,161,0,212,155,70,91,57
5253,7w3i,a,1,373,0,161,0,212,152,58,103,60
5254,7w3j,a,1,373,0,161,0,212,194,156,5,18
5255,7w3k,a,1,373,0,161,0,212,204,104,57,8


In [76]:
# Dataset-wide totals per method, printed as:
#   tp tn fp fn [bacc, p, se, sp, f1, mcc]
for method in ('RDB1', 'RDB2'):
    totals = df_prot[method].sum()
    print(*totals, calc_metrics(*totals))

In [20]:
# Build one flat record per row of df_prot: the three index fields, then for
# each method its four counts followed by the six values from calc_metrics.
data = [
    [
        *chain_key,
        *row.iloc[:4], *calc_metrics(*row['RDB1']),
        *row.iloc[4:], *calc_metrics(*row['RDB2']),
    ]
    for chain_key, row in df_prot.iterrows()
]
df_ = pd.DataFrame(data)
df_

451973 756711 220411 557809 [0.6110115008186754, 0.672194757757472, 0.4475946293358368, 0.7744283723015141, 0.5373702714238665, 0.2345815727683996]
768967 461908 515214 240815 [0.6171203941721346, 0.5987995461698935, 0.7615178325618797, 0.4727229557823895, 0.6704266808139451, 0.2449319979510994]


In [21]:
# Two-level column labels: three unlabelled identifier columns, then the
# same ten columns (four counts + six metrics) for RDB1 and again for RDB2.
id_labels = ['pdb_id', 'pdb_chain', 'length']
per_method_labels = ['TP', 'TN', 'FP', 'FN', 'bacc', 'p', 'se', 'sp', 'f1', 'mcc']

top_level = (
    [None] * len(id_labels)
    + ['RDB1'] * len(per_method_labels)
    + ['RDB2'] * len(per_method_labels)
)
bottom_level = id_labels + per_method_labels + per_method_labels

df_.columns = pd.MultiIndex.from_arrays([top_level, bottom_level])
df_

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,102l,A,163,0,71,92,0,0.217791,0.000000,0.000000,...,0,49,114,0,0.150307,0.000000,0.000000,0.300613,0.000000,
1,13pk,A,415,0,346,69,0,0.416867,0.000000,0.000000,...,0,217,198,0,0.261446,0.000000,0.000000,0.522892,0.000000,
2,1a0t,P,413,0,2,0,411,0.500000,0.000000,0.000000,...,353,2,0,58,0.929440,1.000000,0.858881,1.000000,0.924084,0.169202
3,1a0t,Q,413,256,0,2,155,0.311436,0.992248,0.622871,...,192,2,0,219,0.733577,1.000000,0.467153,1.000000,0.636816,0.065020
4,1a12,A,401,357,18,0,26,0.966057,1.000000,0.932115,...,357,6,12,26,0.632724,0.967480,0.932115,0.333333,0.949468,0.202828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5252,7w3h,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,155,70,91,57,0.582957,0.630081,0.731132,0.434783,0.676856,0.173419
5253,7w3i,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,152,58,103,60,0.538615,0.596078,0.716981,0.360248,0.650964,0.082253
5254,7w3j,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,194,156,5,18,0.942019,0.974874,0.915094,0.968944,0.944039,0.877710
5255,7w3k,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,204,104,57,8,0.804113,0.781609,0.962264,0.645963,0.862579,0.657230


In [22]:
# Show only the six metric columns computed for RDB1
rdb1_metric_columns = ('RDB1', ['bacc', 'p', 'se', 'sp', 'f1', 'mcc'])
df_.loc[:, rdb1_metric_columns]

Unnamed: 0_level_0,NaN,NaN,NaN,RDB1,RDB1,RDB1,RDB1,RDB1,RDB1,RDB1,...,RDB2,RDB2,RDB2,RDB2,RDB2,RDB2,RDB2,RDB2,RDB2,RDB2
Unnamed: 0_level_1,pdb_id,pdb_chain,length,TP,TN,FP,FN,bacc,p,se,...,TP,TN,FP,FN,bacc,p,se,sp,f1,mcc
0,102l,A,163,0,71,92,0,0.217791,0.000000,0.000000,...,0,49,114,0,0.150307,0.000000,0.000000,0.300613,0.000000,
1,13pk,A,415,0,346,69,0,0.416867,0.000000,0.000000,...,0,217,198,0,0.261446,0.000000,0.000000,0.522892,0.000000,
2,1a0t,P,413,0,2,0,411,0.500000,0.000000,0.000000,...,353,2,0,58,0.929440,1.000000,0.858881,1.000000,0.924084,0.169202
3,1a0t,Q,413,256,0,2,155,0.311436,0.992248,0.622871,...,192,2,0,219,0.733577,1.000000,0.467153,1.000000,0.636816,0.065020
4,1a12,A,401,357,18,0,26,0.966057,1.000000,0.932115,...,357,6,12,26,0.632724,0.967480,0.932115,0.333333,0.949468,0.202828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5252,7w3h,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,155,70,91,57,0.582957,0.630081,0.731132,0.434783,0.676856,0.173419
5253,7w3i,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,152,58,103,60,0.538615,0.596078,0.716981,0.360248,0.650964,0.082253
5254,7w3j,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,194,156,5,18,0.942019,0.974874,0.915094,0.968944,0.944039,0.877710
5255,7w3k,a,373,0,161,0,212,0.500000,0.000000,0.000000,...,204,104,57,8,0.804113,0.781609,0.962264,0.645963,0.862579,0.657230


In [23]:
# Box plot of the per-row distribution of each RDB1 metric
rdb1_metrics = df_.loc[:, ('RDB1', ['bacc', 'p', 'se', 'sp', 'f1', 'mcc'])]
rdb1_metrics.boxplot(figsize=(20, 10))




Unnamed: 0_level_0,RDB1,RDB1,RDB1,RDB1,RDB1,RDB1
Unnamed: 0_level_1,bacc,p,se,sp,f1,mcc
0,0.217791,0.000000,0.000000,0.435583,0.000000,
1,0.416867,0.000000,0.000000,0.833735,0.000000,
2,0.500000,0.000000,0.000000,1.000000,0.000000,
3,0.311436,0.992248,0.622871,0.000000,0.765321,-0.054069
4,0.966057,1.000000,0.932115,1.000000,0.964865,0.617511
...,...,...,...,...,...,...
5252,0.500000,0.000000,0.000000,1.000000,0.000000,
5253,0.500000,0.000000,0.000000,1.000000,0.000000,
5254,0.500000,0.000000,0.000000,1.000000,0.000000,
5255,0.500000,0.000000,0.000000,1.000000,0.000000,


In [24]:
# df_.loc[:,(slice(None), ['bacc', 'p', 'se', 'sp', 'f1', 'mcc'])].boxplot(figsize=(20,10))

<Axes: >

In [24]:
#