In [105]:
import pandas as pd
import numpy as np
import math

# dataset
df_pos = pd.read_csv('data/binary_pdbs_classes.csv')
df_neg = pd.read_csv('data/binary_pdb_negatives.csv')
df = pd.concat([df_pos, df_neg])
df = df.loc[df['pdb_residue_id'].notnull()] # filter missing residues
df

  df_pos = pd.read_csv('data/binary_pdbs_classes.csv')
  df_neg = pd.read_csv('data/binary_pdb_negatives.csv')


Unnamed: 0,pdb_id,pdb_chain,seqres_index,pdb_residue_id,REGION,CURATED,RDB1,RDB2,CATEGORY,UNIT_CURATED,UNIT_RDB1,UNIT_RDB2,PDB,classes,classes_names,topologies,topologies_names
2,3u3w,B,3,3.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
3,3u3w,B,4,4.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
4,3u3w,B,5,5.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
5,3u3w,B,6,6.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
6,3u3w,B,7,7.0,0,0,1,0,1,0,3u3wB_3_21,0,3u3wB,3,Elongated repeats,3.3,Alpha-solenoid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733045,6b9i,A,284,284.0,,0,0,0,0,0,0,0,6b9iA,,,,
733046,6b9i,A,285,285.0,,0,0,0,0,0,0,0,6b9iA,,,,
733047,6b9i,A,286,286.0,,0,0,0,0,0,0,0,6b9iA,,,,
733048,6b9i,A,287,287.0,,0,0,0,0,0,0,0,6b9iA,,,,


In [106]:
# REPEATSDB-LITE EVALUATION (generate tables)
# Repeats detection at the protein level (PDB chains)
df['curat'] = df.groupby('PDB')['CURATED'].transform('max')
df['pred1'] = df.groupby('PDB')['RDB1'].transform('max')
df['pred2'] = df.groupby('PDB')['RDB2'].transform('max')

# rdb1
df['TP_1'] = np.where((df['pred1'] == df['curat']) & (df['curat'] == 1), '1', '0')
df['TN_1'] = np.where((df['pred1'] == df['curat']) & (df['curat'] == 0), '1', '0')
df['FP_1'] = np.where((df['pred1'] != df['curat']) & (df['curat'] == 0), '1', '0')
df['FN_1'] = np.where((df['pred1'] == df['curat']) & (df['curat'] == 1), '1', '0')

# rdb2
df['TP_2'] = np.where((df['pred2'] == df['curat']) & (df['curat'] == 1), '1', '0')
df['TN_2'] = np.where((df['pred2'] == df['curat']) & (df['curat'] == 0), '1', '0')
df['FP_2'] = np.where((df['pred2'] != df['curat']) & (df['curat'] == 0), '1', '0')
df['FN_2'] = np.where((df['pred2'] == df['curat']) & (df['curat'] == 1), '1', '0')

In [107]:
tp1 = len(df.loc[df['TP_1'] == '1'].groupby('PDB'))
tn1 = len(df.loc[df['TN_1'] == '1'].groupby('PDB'))
fp1 = len(df.loc[df['FP_1'] == '1'].groupby('PDB'))
fn1 = len(df.loc[df['FN_1'] == '1'].groupby('PDB'))
total1 = len(list(df.loc[(df['pred1'] == 1)]['PDB'].drop_duplicates()))

tp2 = len(df.loc[df['TP_2'] == '1'].groupby('PDB'))
tn2 = len(df.loc[df['TN_2'] == '1'].groupby('PDB'))
fp2 = len(df.loc[df['FP_2'] == '1'].groupby('PDB'))
fn2 = len(df.loc[df['FN_2'] == '1'].groupby('PDB'))
total2 = len(list(df.loc[(df['pred2'] == 1)]['PDB'].drop_duplicates()))
total2

5068

In [108]:
# accuracy1, accuracy2, precision1, precision2, recall1, recall2, f_score1, f_score2, mcc1, mcc2, tnr1, tnr2
def calc_metrics(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2):
    # balanced accuracy
    se1_total = tp1 + fn1
    se2_total = tp2 + fn2
    sp1_total = fp1 + tn1
    sp2_total = fp2 + tn2
    if se1_total == 0 or sp1_total == 0:
        accuracy1 = 0
    else:
        accuracy1 = round(((tp1 / se1_total) + (tn1 / sp1_total)) / 2, 2)

    if se2_total == 0 or sp2_total == 0:
        accuracy2 = 0
    else:
        accuracy2 = round(((tp2 / se2_total) + (tn2 / sp2_total)) / 2,2)


    p1_total = tp1 + fp1
    p2_total = tp2 + fp2
    r1_total = tp2 + fn2
    r2_total = tp2 + fn2

    if p1_total == 0:
        precision1 = 0
    else:
        precision1 = round(tp1 / p1_total, 2)
    if p2_total == 0:
        precision2 = 0
    else:
        precision2 = round(tp2 / p2_total, 2)
    if r1_total == 0:
        recall1 = 0
    else:
        recall1 = round(tp1 / r1_total, 2)
    if r2_total == 0:
        recall2 = 0
    else:
        recall2 = round(tp2 / r2_total, 2)

    if precision1 == 0 and recall1 == 0:
        f_score1 = 0
    else:
        f_score1 = round(2 * ((precision1 * recall1)/(precision1+recall1)),2)
    if precision2 == 0 and recall2 == 0:
        f_score2 = 0
    else:
        f_score2 = round(2 * ((precision2 * recall2)/(precision2+recall2)), 2)

    d1 = math.sqrt((tp1 + fp1)*(tp1 + fn1)*(tn1 + fp1)*(tn1 + fn1))
    d2 = math.sqrt((tp2 + fp2)*(tp2 + fn2)*(tn2 + fp2)*(tn2 + fn2))
    if d1 == 0:
        mcc1 = 'n.a.'
    else:
        mcc1 = round(((tp1 * tn1) - (fp1 * fn1)) / d1, 2)
    if d1 == 0:
        mcc2 = 'n.a.'
    else:
        mcc2 = round(((tp2 * tn2) - (fp2 * fn2)) / d2, 2)

    tnr1 = round(tn1 / (tn1 + fp1),2)
    tnr2 = round(tn2 / (tn2 + fp2),2)
    return [accuracy1, accuracy2, precision1, precision2, recall1, recall2, f_score1, f_score2, mcc1, mcc2, tnr1, tnr2]

In [109]:
# write table protein level
metrics = calc_metrics(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2)
data = []
data.append(['RDBLITE1', tp1, fp1, tn1, fn1, total1, metrics[0], metrics[2], metrics[4], metrics[6], metrics[8], metrics[10]])
data.append(['RDBLITE2',  tp2, fp2, tn2, fn2, total2, metrics[1], metrics[3], metrics[5], metrics[7], metrics[9], metrics[11]])
table = pd.DataFrame(data, columns=['TOOL', 'TP', 'FP', 'TN', 'FN', 'TOTAL', 'BALANCED ACCURACY', 'PRECISION', 'RECALL', 'F-SCORE', 'MCC', 'TNR1'])
table.to_csv('tables/table_protein_level.csv', index=False)
table

Unnamed: 0,TOOL,TP,FP,TN,FN,TOTAL,BALANCED ACCURACY,PRECISION,RECALL,F-SCORE,MCC,TNR1
0,RDBLITE1,1841,1045,774,1841,2886,0.46,0.64,0.27,0.38,-0.07,0.43
1,RDBLITE2,3377,1691,128,3377,5068,0.29,0.67,0.5,0.57,-0.36,0.07


In [110]:
# Region overlap at the residue level

tp1 = len(df.loc[(df['CURATED'] == 1) & (df['RDB1'] == 1)])
tn1 = len(df.loc[(df['CURATED'] == 0) & (df['RDB1'] == 0)])
fp1 = len(df.loc[(df['CURATED'] == 0) & (df['RDB1'] == 1)])
fn1 = len(df.loc[(df['CURATED'] == 1) & (df['RDB1'] == 0)])
total1 = len(df.loc[(df['RDB1'] == 1)])

tp2 = len(df.loc[(df['CURATED'] == 1) & (df['RDB2'] == 1)])
tn2 = len(df.loc[(df['CURATED'] == 0) & (df['RDB2'] == 0)])
fp2 = len(df.loc[(df['CURATED'] == 0) & (df['RDB2'] == 1)])
fn2 = len(df.loc[(df['CURATED'] == 1) & (df['RDB2'] == 0)])
total2 = len(df.loc[(df['RDB2'] == 1)])

# write table residue level
metrics = calc_metrics(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2)
data = []
data.append(['RDBLITE1', tp1, fp1, tn1, fn1, total1, metrics[0], metrics[2], metrics[4], metrics[6], metrics[8], metrics[10]])
data.append(['RDBLITE2', tp2, fp2, tn2, fn2, total2,  metrics[1], metrics[3], metrics[5], metrics[7], metrics[9], metrics[11]])
table = pd.DataFrame(data, columns=['TOOL', 'TP', 'FP', 'TN', 'FN', 'TOTAL', 'BALANCED ACCURACY', 'PRECISION', 'RECALL', 'F-SCORE', 'MCC', 'TNR1'])
table.to_csv('tables/table_positives_negatives.csv', index=False)


In [114]:
# Region overlap at the residue level for the positive class (category = 1)

tp1 = len(df.loc[(df['CURATED'] == 1) & (df['RDB1'] == 1) & (df['CATEGORY'] == 1)])
tn1 = len(df.loc[(df['CURATED'] == 0) & (df['RDB1'] == 0) & (df['CATEGORY'] == 1)])
fp1 = len(df.loc[(df['CURATED'] == 0) & (df['RDB1'] == 1) & (df['CATEGORY'] == 1)])
fn1 = len(df.loc[(df['CURATED'] == 1) & (df['RDB1'] == 0) & (df['CATEGORY'] == 1)])
total1 = len(df.loc[(df['RDB1'] == 1) & (df['CATEGORY'] == 1)])

tp2 = len(df.loc[(df['CURATED'] == 1) & (df['RDB2'] == 1) & (df['CATEGORY'] == 1)])
tn2 = len(df.loc[(df['CURATED'] == 0) & (df['RDB2'] == 0) & (df['CATEGORY'] == 1)])
fp2 = len(df.loc[(df['CURATED'] == 0) & (df['RDB2'] == 1) & (df['CATEGORY'] == 1)])
fn2 = len(df.loc[(df['CURATED'] == 1) & (df['RDB2'] == 0) & (df['CATEGORY'] == 1)])
total2 = len(df.loc[(df['RDB2'] == 1) & (df['CATEGORY'] == 1)])

# write table residue level
metrics = calc_metrics(tp1, fp1, tn1, fn1, tp2, fp2, tn2, fn2)
data = []
data.append(['RDBLITE1', tp1, fp1, tn1, fn1, total1, metrics[0], metrics[2], metrics[4], metrics[6], metrics[8], metrics[10]])
data.append(['RDBLITE2', tp2, fp2, tn2, fn2, total2,  metrics[1], metrics[3], metrics[5], metrics[7], metrics[9], metrics[11]])
table = pd.DataFrame(data, columns=['TOOL', 'TP', 'FP', 'TN', 'FN', 'TOTAL', 'BALANCED ACCURACY', 'PRECISION', 'RECALL', 'F-SCORE', 'MCC', 'TNR1'])
table.to_csv('tables/table_positives.csv', index=False)
table


Unnamed: 0,TOOL,TP,FP,TN,FN,TOTAL,BALANCED ACCURACY,PRECISION,RECALL,F-SCORE,MCC,TNR1
0,RDBLITE1,451973,33835,259493,557809,485808,0.67,0.93,0.45,0.61,0.29,0.88
1,RDBLITE2,768967,138330,154998,240815,907297,0.64,0.85,0.76,0.8,0.26,0.53


In [111]:
# REPEATSDB-LITE EVALUATION (generate plots)
# Region overlap
# rdb1
df['TRUE_PRED'] = np.where((df['pred1'] == df['curat']), '1', '0')