In [118]:
import pandas as pd
import numpy as np
import math
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import Counter
import plotly.express as px


In [119]:
def calc_metrics(tp, tn, fp, fn):
    """Compute binary-classification metrics from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : int
        Counts of true positives, true negatives, false positives and false
        negatives. NumPy scalars are accepted and converted to native Python
        numbers before any arithmetic.

    Returns
    -------
    list
        ``[bacc, p, se, sp, f1, mcc]``, each rounded to 2 decimals: balanced
        accuracy, precision, sensitivity (recall), specificity, F1 score and
        Matthews correlation coefficient. A ratio whose denominator is zero is
        reported as 0, except MCC, which is reported as ``np.nan``.
    """
    # Fixed-width NumPy integers wrap around silently when multiplied, and the
    # MCC denominator multiplies four marginals together. Native Python ints
    # have arbitrary precision, so convert NumPy scalars first.
    tp, tn, fp, fn = (
        v.item() if isinstance(v, np.generic) else v for v in (tp, tn, fp, fn)
    )

    # Marginal totals of the curated positives and curated negatives
    se_total = tp + fn
    sp_total = fp + tn

    # Sensitivity / recall / TPR
    se = 0
    if se_total > 0:
        se = tp / se_total

    # Specificity / TNR
    sp = 0
    if sp_total > 0:
        sp = tn / sp_total

    # Balanced accuracy (Sensitivity + specificity) / 2
    bacc = (se + sp) / 2

    # Precision / PPV
    p_total = tp + fp
    p = 0
    if p_total > 0:
        p = tp / p_total

    # F1 score (harmonic mean of precision and recall)
    f1 = 0
    if (p + se) > 0:
        f1 = 2 * ((p * se)/(p+se))

    # MCC: left undefined (NaN) when any marginal total is zero
    mcc = np.nan
    d = math.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
    if d > 0:
        mcc = ((tp * tn) - (fp * fn)) / d

    return [round(bacc, 2), round(p, 2), round(se, 2), round(sp, 2), round(f1,2), round(mcc, 2)]

Repository GitHub
https://github.com/BioComputingUP/repeatsdb-lite-analysis

Datasets
- RepeatsDB curated: All manually curated proteins in RepeatsDB + negatives (1821 PDB chains / 1821 UniProt proteins manually curated as not repeats)
- 2cn2C was removed from the dataset because RDB2 lite tool could not process the pdb file
- Date: 23/08/2022
- SRUL RepeatsDB-lite 2 version: 22/02/2023 with PDBs annotated from 01 March until 23/08/2022.

In [153]:
# dataset: curated positives and curated negatives, one CSV each
df_pos, df_neg = (
    pd.read_csv(csv_path)
    for csv_path in ('data/binary_pdbs_classes.csv', 'data/binary_pdb_negatives.csv')
)
df = pd.concat([df_pos, df_neg])
# df = df_pos # uncomment and comment previous line when only positives wanted
# keep only the rows that have a pdb_residue_id (filter missing residues)
df = df[df['pdb_residue_id'].notna()]
df

In [154]:
# REPEATSDB-LITE EVALUATION

# For each tool, flag every row as TP / TN / FP / FN by comparing the tool's
# column with the CURATED column. Column suffix 1 -> RDB1, suffix 2 -> RDB2.
curated_is_repeat = df['CURATED'] == 1
curated_is_not_repeat = df['CURATED'] == 0
for suffix, tool_col in (('1', 'RDB1'), ('2', 'RDB2')):
    agrees = df[tool_col] == df['CURATED']
    differs = df[tool_col] != df['CURATED']
    df['TP_' + suffix] = np.where(agrees & curated_is_repeat, 1, 0)
    df['TN_' + suffix] = np.where(agrees & curated_is_not_repeat, 1, 0)
    df['FP_' + suffix] = np.where(differs & curated_is_not_repeat, 1, 0)
    df['FN_' + suffix] = np.where(differs & curated_is_repeat, 1, 0)

df

In [155]:
# One row per (pdb_id, pdb_chain): first CATEGORY value, number of seqres_index
# entries, and the summed per-row TP/TN/FP/FN flags of both tools.
flag_cols = ['TP_1', 'TN_1', 'FP_1', 'FN_1', 'TP_2', 'TN_2', 'FP_2', 'FN_2']
chain_aggregations = {'CATEGORY': 'first', 'seqres_index': 'count'}
chain_aggregations.update({col: 'sum' for col in flag_cols})
df_prot = df.groupby(['pdb_id', 'pdb_chain']).agg(chain_aggregations).reset_index()
df_prot

In [156]:
# protein level matrix
# A chain counts as "predicted" by a tool when at least one of its rows was
# flagged as repeated by that tool (summed TP or FP > 0); otherwise it is
# "not predicted". Combined with CATEGORY (1 or 0) this puts every such chain
# in exactly one of TP / TN / FP / FN per tool.
df_prot_lvl = pd.DataFrame()
category_is_one = df_prot['CATEGORY'] == 1
category_is_zero = df_prot['CATEGORY'] == 0
for suffix in ('1', '2'):
    predicted = (df_prot['TP_' + suffix] > 0) | (df_prot['FP_' + suffix] > 0)
    # Both counts must be zero for "not predicted": joining the two zero tests
    # with OR would also match chains that were predicted.
    not_predicted = (df_prot['TP_' + suffix] == 0) & (df_prot['FP_' + suffix] == 0)
    df_prot_lvl['TP_' + suffix] = np.where(predicted & category_is_one, 1, 0)
    df_prot_lvl['TN_' + suffix] = np.where(not_predicted & category_is_zero, 1, 0)
    df_prot_lvl['FP_' + suffix] = np.where(predicted & category_is_zero, 1, 0)
    df_prot_lvl['FN_' + suffix] = np.where(not_predicted & category_is_one, 1, 0)



In [157]:
# Two-level header: tool on top, confusion-matrix cell below.
cell_names = ['TP', 'TN', 'FP', 'FN']
df_prot_lvl.columns = pd.MultiIndex.from_arrays([
    ['RDB1'] * len(cell_names) + ['RDB2'] * len(cell_names),
    cell_names * 2,
])

# Each matrix is the tuple (tp, tn, fp, fn, [bacc, p, se, sp, f1, mcc])
rdb1_totals = df_prot_lvl['RDB1'].sum()
rdb2_totals = df_prot_lvl['RDB2'].sum()
matrix1 = (*rdb1_totals, calc_metrics(*rdb1_totals))
matrix2 = (*rdb2_totals, calc_metrics(*rdb2_totals))

In [158]:
# Protein-level summary table, one row per tool.
# matrix1 / matrix2 are (tp, tn, fp, fn, [bacc, p, se, sp, f1, mcc]); unpack
# them by name so that every value is written under the header describing it
# (the header order is TP, FP, TN, FN and ends with F-SCORE, MCC, TNR1).
# The metrics are already rounded to 2 decimals by calc_metrics.
data = []
for tool_name, tool_matrix in (('RDBLITE1', matrix1), ('RDBLITE2', matrix2)):
    tp, tn, fp, fn, (bacc, p, se, sp, f1, mcc) = tool_matrix
    data.append([tool_name, tp, fp, tn, fn, len(df_prot_lvl), bacc, p, se, f1, mcc, sp])
table = pd.DataFrame(data, columns=['TOOL', 'TP', 'FP', 'TN', 'FN', 'TOTAL', 'BALANCED ACCURACY', 'PRECISION', 'RECALL', 'F-SCORE', 'MCC', 'TNR1'])
table.to_csv('tables/table_protein_level.csv', index=False)

Table 3. Repeats detection at the protein level (PDB chains)
Region overlap is not considered; only whether the chain is predicted as repeated is evaluated. Both positive and negative proteins are considered. Manually curated examples are 5261 PDB chains, of which 1821 (1821 UniProt proteins) are negatives and 3440 are positives (1623 UniProt). TP is assigned when a repeated region is predicted, irrespective of whether the predicted region overlaps the curated one.
TOTAL PDBs (positives, negatives) predicted by RDB1: 2887
TOTAL PDBs (positives, negatives) predicted by RDB2: 5069


In [159]:
# Region overlap
# Move the chain identifiers and the seqres_index count into the index, drop
# CATEGORY, and give the eight count columns a (tool, cell) two-level header.
df_prot = (
    df_prot
    .set_index(['pdb_id', 'pdb_chain', 'seqres_index'])
    .drop(columns=['CATEGORY'])
)
cell_names = ['TP', 'TN', 'FP', 'FN']
df_prot.columns = pd.MultiIndex.from_arrays([
    ['RDB1'] * len(cell_names) + ['RDB2'] * len(cell_names),
    cell_names * 2,
])
df_prot

In [160]:
# Per-chain rows: the three index values, then for RDB1 and RDB2 in turn the
# four counts followed by the six calc_metrics values.
data = [
    [
        pdb_id, pdb_chain, length,
        *row[:4], *calc_metrics(*row['RDB1']),
        *row[4:], *calc_metrics(*row['RDB2']),
    ]
    for (pdb_id, pdb_chain, length), row in df_prot.iterrows()
]
df_ = pd.DataFrame(data)
df_

In [161]:
# Header: three identifier columns with no tool label, then the same ten
# count/metric columns under RDB1 and again under RDB2.
per_tool_cols = ['TP', 'TN', 'FP', 'FN', 'bacc', 'p', 'se', 'sp', 'f1', 'mcc']
df_.columns = pd.MultiIndex.from_arrays([
    [None] * 3 + ['RDB1'] * len(per_tool_cols) + ['RDB2'] * len(per_tool_cols),
    ['pdb_id', 'pdb_chain', 'length'] + per_tool_cols * 2,
])
df_

In [162]:
# Show only the six RDB1 metric columns
rdb1_metric_cols = ('RDB1', ['bacc', 'p', 'se', 'sp', 'f1', 'mcc'])
df_.loc[:, rdb1_metric_cols]

In [163]:
# Box plot of the six RDB1 metric columns
rdb1_metric_cols = ('RDB1', ['bacc', 'p', 'se', 'sp', 'f1', 'mcc'])
df_.loc[:, rdb1_metric_cols].boxplot(figsize=(20,10))

In [164]:
# df_.loc[:,(slice(None), ['bacc', 'p', 'se', 'sp', 'f1', 'mcc'])].boxplot(figsize=(20,10))

In [165]:
# table region overlap level (positives + negatives)
# metrics*: column means of the six metrics, rounded to 2 decimals
# matrix*:  column sums of the four counts
metric_names = ['bacc', 'p', 'se', 'sp', 'f1', 'mcc']
cell_names = ['TP', 'TN', 'FP', 'FN']

metrics1 = df_.loc[:, ('RDB1', metric_names)].mean().round(2)
matrix1 = df_.loc[:, ('RDB1', cell_names)].sum()

metrics2 = df_.loc[:, ('RDB2', metric_names)].mean().round(2)
matrix2 = df_.loc[:, ('RDB2', cell_names)].sum()

In [166]:
# Residue-level summary table, one row per tool.
# matrix* holds the sums in the order TP, TN, FP, FN and metrics* the means in
# the order bacc, p, se, sp, f1, mcc. Unpack them by name so each value is
# written under the header describing it (the header order is TP, FP, TN, FN
# and ends with F-SCORE, MCC, TNR1). tolist() reads the values by position
# without integer-key lookups on a label-indexed Series.
data = []
for tool_name, tool_matrix, tool_metrics in (
    ('RDBLITE1', matrix1, metrics1),
    ('RDBLITE2', matrix2, metrics2),
):
    tp, tn, fp, fn = tool_matrix.tolist()
    bacc, p, se, sp, f1, mcc = tool_metrics.tolist()
    data.append([tool_name, tp, fp, tn, fn, len(df), bacc, p, se, f1, mcc, sp])
table = pd.DataFrame(data, columns=['TOOL', 'TP', 'FP', 'TN', 'FN', 'TOTAL', 'BALANCED ACCURACY', 'PRECISION', 'RECALL', 'F-SCORE', 'MCC', 'TNR1'])
# table.to_csv('tables/table_residue_overlap.csv', index=False)

table

Table 4. Region overlap at the residue level
Both positive and negative proteins are considered.
TOTAL residues predicted by RDB1: 672384
TOTAL residues non-repeat predicted by RDB1: 1314520
TOTAL residues predicted by RDB2: 1284181
TOTAL residues non-repeat predicted by RDB2: 702723

TN: residues correctly predicted as not repeated
TP: residues correctly predicted as repeated
FP: residues incorrectly predicted as repeated
FN: residues incorrectly predicted as not repeated

In [167]:
# to do the positive table I filter the negatives in the dataframe at the beginning
# I don't know how to do it better without repeating the same code
# table.to_csv('tables/table_residue_overlap_positives.csv', index=False)

Table 5. Region overlap at the residue level for the positive class
TOTAL residues predicted by RDB1: 485808
TOTAL residues non-repeat predicted by RDB1: 817302
TOTAL residues predicted by RDB2: 907297
TOTAL residues non-repeat predicted by RDB2: 395813

TN: residues correctly predicted as not repeated
TP: residues correctly predicted as repeated
FP: residues incorrectly predicted as repeated
FN: residues incorrectly predicted as not repeated

All the following plots have been calculated filtering PDBs with duplicate class or topology.

In [168]:
# PLOTS: GROUP BY TOPOLOGY, CALC METRICS AND PLOT

# Keep only rows whose 'topologies' value is present and holds a single
# topology (values containing ';' are dropped for simplification), then store
# the column as strings.
topology_labels = df['topologies']
has_single_topology = ~topology_labels.astype(str).str.contains(';')
df_topologies = df[has_single_topology & topology_labels.notna()].assign(
    topologies=lambda frame: frame['topologies'].astype(str)
)
df_topologies


In [169]:
# Sum the per-row flags of both tools within each topology
flag_cols = ['TP_1', 'TN_1', 'FP_1', 'FN_1', 'TP_2', 'TN_2', 'FP_2', 'FN_2']
df_top = df_topologies.groupby(['topologies']).agg({col: 'sum' for col in flag_cols})

# Two-level header: tool on top, confusion-matrix cell below
cell_names = ['TP', 'TN', 'FP', 'FN']
df_top.columns = pd.MultiIndex.from_arrays([
    ['RDB1'] * len(cell_names) + ['RDB2'] * len(cell_names),
    cell_names * 2,
])
df_top

In [170]:
# Per-topology rows: the topology label, then for RDB1 and RDB2 in turn the
# four counts followed by the six calc_metrics values.
data = [
    [
        topology,
        *row[:4], *calc_metrics(*row['RDB1']),
        *row[4:], *calc_metrics(*row['RDB2']),
    ]
    for topology, row in df_top.iterrows()
]
df_top = pd.DataFrame(data)
df_top

In [171]:
# # # plot
# Name the label column, split it at '.' into 'c' and 't', and sort by 'c'
# and then by 't' as a number (so that e.g. 10 sorts after 9).
df_top = df_top.rename(columns={0: 'topologies'})
label_parts = df_top['topologies'].str.split('.', expand=True)
df_top[['c', 't']] = label_parts
df_top = df_top.assign(t=pd.to_numeric(df_top['t'])).sort_values(['c', 't'])
df_top

In [172]:
# Header: identifier columns under 'id', and the same ten count/metric columns
# under RDB1 and again under RDB2.
per_tool_cols = ['TP', 'TN', 'FP', 'FN', 'bacc', 'p', 'se', 'sp', 'f1', 'mcc']
df_top.columns = pd.MultiIndex.from_arrays([
    ['id'] + ['RDB1'] * len(per_tool_cols) + ['RDB2'] * len(per_tool_cols) + ['id', 'id'],
    ['topologies'] + per_tool_cols * 2 + ['c', 't'],
])
df_top

In [173]:
# table topologies
# Build a long-format table with one row per (topology, tool) and write it to CSV.

# RDB1 half: replace the two-level header with a single level of names, then
# keep columns 0-10 (the topology label plus the first TP..mcc group).
df_1 = df_top.copy()
df_1.columns = pd.MultiIndex.from_arrays([['topologies', 'TP', 'TN', 'FP', 'FN', 'bacc', 'p', 'se', 'sp', 'f1', 'mcc', 'TP', 'TN', 'FP', 'FN', 'bacc', 'p', 'se', 'sp', 'f1', 'mcc', 'c', 't']])
df_1 = df_1.iloc[:,0:11]
df_1['tool'] = 'RDBLITE1'

# RDB2 half: same renaming, then keep columns 11-20 (the second TP..mcc group).
# That slice has no topology label, so it is copied over from df_1.
df_2 = df_top.copy()
df_2.columns = pd.MultiIndex.from_arrays([['topologies', 'TP', 'TN', 'FP', 'FN', 'bacc', 'p', 'se', 'sp', 'f1', 'mcc', 'TP', 'TN', 'FP', 'FN', 'bacc', 'p', 'se', 'sp', 'f1', 'mcc', 'c', 't']])
df_2 = df_2.iloc[:,11:21]
df_2['tool'] = 'RDBLITE2'
df_2['topologies'] = df_1['topologies']

# Move the last column ('topologies', added just above) to the front of df_2.
cols = list(df_2.columns)
cols = [cols[-1]] + cols[:-1]
df_2 = df_2[cols]
# Stack both halves; sorting by index puts rows that share an index label
# (one from each half) next to each other. The index itself is not written.
df_merge_top = pd.concat([df_1, df_2]).sort_index()
df_merge_top.to_csv('tables/table_topologies.csv', index=False)

Table 6. Region overlap at the residue level for the different topologies for RepeatsDB-lite2. Class and topology are associated only with repeat residues; non-repeated residues do not have a class-topology association. To be able to consider non-repeated residues in this table, class and topology have been associated with the PDB rather than with the single residue. PDBs with 2 different classes were removed, since they were a small part of the total (~100) and removing them makes it possible to include negative residues in the analysis. E.g. this is why the only PDB of class 2.2 (5yfpH) disappears in this analysis (because it had 2 topologies: 2.2 and 3.3).

In [319]:
# plot
# Bar labels shared by the topology bar charts, one entry per bar.
x = [
    '3.1 β-solenoid',
    '3.2 α/β solenoid',
    '3.3 Alpha-solenoid',
    '3.4 β hairpins',
    '3.6 Box',
    '4.1 TIM-barrel',
    '4.2 β-barrel/β-hairpins',
    '4.3 Beta-trefoil',
    '4.4 Propeller',
    '4.5 α/β prism',
    '4.6 α-barrel',
    '4.7 α/β barrel',
    '4.9 α/β trefoil',
    '4.10 Aligned prism',
    '5.1 α-beads',
    '5.2 β-beads',
    '5.3 α/β-beads',
    '5.4 β sandwich beads',
    '5.5 α/β sandwich beads',
]
# Figure with two stacked panels (2 rows, 1 column)
fig = make_subplots(rows=2, cols=1)

In [320]:
# RDBL 1
# One bar trace per metric. Every trace of this figure, including the
# f-score, reads its values from the RDB1 columns of df_top.

fig1 = make_subplots(rows=1, cols=1)
for metric_col, trace_name, bar_colour in (
    ('bacc', 'accuracy', '#EDAE49'),
    ('p', 'precision', '#D1495B'),
    ('se', 'recall', '#00798C'),
    ('f1', 'f-score', '#003D5B'),
):
    fig1.add_trace(go.Bar(
        y=df_top[('RDB1', metric_col)].values.tolist(),
        x=x,
        name=trace_name,
        marker_color=bar_colour
    ), row=1, col=1)
# fig1.write_image("plots/a_p_r_m_dataset_rdb1.png")
fig1

Figure 1. Region overlap evaluation of RepeatsDB-lite1 & 2 against curated annotations. Class 3.6 is always predicted as not repeated by RDB1. For this reason accuracy is > 0, but not precision and recall.

In [321]:
# RDBL 2
# One bar trace per metric, all read from the RDB2 columns of df_top.

fig2 = make_subplots(rows=1, cols=1)
for metric_col, trace_name, bar_colour in (
    ('bacc', 'accuracy', '#EDAE49'),
    ('p', 'precision', '#D1495B'),
    ('se', 'recall', '#00798C'),
    ('f1', 'f-score', '#003D5B'),
):
    fig2.add_trace(go.Bar(
        y=df_top[('RDB2', metric_col)].values.tolist(),
        x=x,
        name=trace_name,
        marker_color=bar_colour
    ), row=1, col=1)
# fig2.write_image("plots/a_p_r_m_dataset_rdb2.png")
fig2

Figure 2. Region overlap evaluation of RepeatsDB-lite1 & 2 against curated annotations at the target level (distribution).

In [322]:
# RDBL 3
# Top panel of `fig`: per-metric difference RDB2 - RDB1, one bar trace each.
for metric_col, trace_name, bar_colour in (
    ('bacc', 'accuracy', '#EDAE49'),
    ('p', 'precision', '#D1495B'),
    ('se', 'recall', '#00798C'),
    ('f1', 'f-score', '#003D5B'),
):
    rdb2_values = df_top[('RDB2', metric_col)].to_numpy()
    rdb1_values = df_top[('RDB1', metric_col)].to_numpy()
    fig.add_trace(go.Bar(
        y=rdb2_values - rdb1_values,
        x=x,
        name=trace_name,
        marker_color=bar_colour
    ), row=1, col=1)
fig

In [323]:
# PDBS distribution
# df_topologies is a row filter of the per-residue frame, so counting its rows
# would count residues. Count distinct (pdb_id, pdb_chain) pairs per topology
# instead.
chains_per_topology = (
    df_topologies[['pdb_id', 'pdb_chain', 'topologies']]
    .drop_duplicates()
    .groupby('topologies')
    .size()
)


def _topology_sort_key(label):
    """Sort key for a '<class>.<topology>' label: class text, then topology as a number."""
    class_part, topology_part = label.split('.')
    return (class_part, int(topology_part))


# Order the counts by class and then numeric topology so that the bars are
# plotted in a defined order rather than in order of first appearance.
# NOTE(review): assumes `x` lists its labels in this same order - confirm.
ordered_topologies = sorted(chains_per_topology.index, key=_topology_sort_key)
df_pdbs = chains_per_topology.reindex(ordered_topologies).to_frame('#PDBs')

# Bottom panel of `fig`
fig.add_trace(go.Bar(
    y=df_pdbs['#PDBs'].values.tolist(),
    x=x,
    name='PDBs',
    marker_color='#003D5F'
), row=2, col=1)
fig.update_layout(width=1300, height=920, plot_bgcolor = "white", paper_bgcolor = "white",  font_color="black", font=dict(
        size=20))

fig.write_image("plots/a_p_r_m_dataset_rdb3.png")

Figure 3.
Region overlap evaluation of delta RepeatsDB-lite2 - RepeatsDB-lite1 against curated annotations at the target level (distribution).
PDBs distribution over the different topologies. The plot shows PDBs distribution by topology, highlighting less represented (e.g. 4.3, 3.7, 5.1) and more represented (e.g. 3.3, 4.4) topologies.

The following plots are calculated at the region level for all PDBs. PDB residues can be predicted as repeated by one method and not the other and still be considered for the analysis. If a PDB residue is not predicted as repeated then it will be considered as TN or FN depending on the comparison with the curation.

In [313]:
# boxplots PDB - topologies
# Per-chain rows: the three index values, then for RDB1 and RDB2 in turn the
# four counts followed by the six calc_metrics values. Resulting positional
# columns: 3-6 RDB1 counts, 7-12 RDB1 bacc/p/se/sp/f1/mcc, 13-16 RDB2 counts,
# 17-22 RDB2 bacc/p/se/sp/f1/mcc.
data = [
    [
        pdb_id, pdb_chain, length,
        *row['RDB1'], *calc_metrics(*row['RDB1']),
        *row['RDB2'], *calc_metrics(*row['RDB2']),
    ]
    for (pdb_id, pdb_chain, length), row in df_prot.iterrows()
]
df_classes = pd.DataFrame(data).rename(columns={0: 'pdb_id', 1: 'pdb_chain'})

# df_topologies has one row per residue. Merging against it directly would
# repeat each chain's metrics once per residue, weighting the box plots by
# chain length, so reduce it to one row per distinct
# (pdb_id, pdb_chain, topologies) combination first.
chain_topologies = df_topologies[['pdb_id', 'pdb_chain', 'topologies']].drop_duplicates()
df_boxplots = df_classes.merge(chain_topologies, how='right', on=['pdb_id', 'pdb_chain'])
df_boxplots

In [235]:
# accuracy
# Columns 7 and 17 of df_boxplots become a1 / a2 and are plotted as "accuracy",
# one long frame with a 'tool' label per source column.
df_boxplots = df_boxplots.rename(columns={7: 'a1', 17: 'a2'})
df_plot1 = (
    df_boxplots[['a1', 'topologies']]
    .rename(columns={"a1": "accuracy"})
    .assign(tool='RepeatsDB-lite1')
)
df_plot2 = (
    df_boxplots[['a2', 'topologies']]
    .rename(columns={"a2": "accuracy"})
    .assign(tool='RepeatsDB-lite2')
)

df_final = pd.concat([df_plot1, df_plot2]).sort_values(by=['topologies'], ascending=True)
# Split the label at '.' into its two parts
label_parts = df_final['topologies'].str.split('.',expand=True)
df_final[['class','subclass']] = label_parts

fig = px.box(df_final, x="topologies", y="accuracy", color='tool').update_layout(width=1000)
fig.write_image("plots/accuracy_pdbs.png")

Figure 3. Accuracy evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB classes.

In [83]:
# Same accuracy values, grouped by the 'class' column instead of 'topologies'
fig = px.box(df_final, x="class", y="accuracy", color='tool').update_layout(width=1000)
fig.write_image("plots/accuracy_pdbs_classes.png")

Figure 4. Accuracy evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB topologies.  Class 4.7 results are explained by the fact that it has only one PDB.

In [84]:
# precision
# Columns 8 and 18 of df_boxplots become p1 / p2 and are plotted as "precision",
# one long frame with a 'tool' label per source column.
df_boxplots = df_boxplots.rename(columns={8: 'p1', 18: 'p2'})
df_plot1 = (
    df_boxplots[['p1', 'topologies']]
    .rename(columns={'p1': "precision"})
    .assign(tool='RepeatsDB-lite1')
)
df_plot2 = (
    df_boxplots[['p2', 'topologies']]
    .rename(columns={'p2': "precision"})
    .assign(tool='RepeatsDB-lite2')
)
df_final = pd.concat([df_plot1, df_plot2]).sort_values(by=['topologies'], ascending=True)
# Split the label at '.' into its two parts
label_parts = df_final['topologies'].str.split('.',expand=True)
df_final[['class','subclass']] = label_parts


fig = px.box(df_final, x="topologies", y="precision", color='tool').update_layout(width=1000)
fig.write_image("plots/precision_pdbs.png")

Figure 5. Precision evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB classes.

In [85]:
# Same precision values, grouped by the 'class' column instead of 'topologies'
fig = px.box(df_final, x="class", y="precision", color='tool').update_layout(width=1000)
fig.write_image("plots/precision_pdbs_classes.png")

Figure 6. Precision evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB topologies.

In [86]:
# recall
# Columns 9 and 19 of df_boxplots become r1 / r2 and are plotted as "recall",
# one long frame with a 'tool' label per source column.
df_boxplots = df_boxplots.rename(columns={9: 'r1', 19: 'r2'})
df_plot1 = (
    df_boxplots[['r1', 'topologies']]
    .rename(columns={"r1": "recall"})
    .assign(tool='RepeatsDB-lite1')
)
df_plot2 = (
    df_boxplots[['r2', 'topologies']]
    .rename(columns={"r2": "recall"})
    .assign(tool='RepeatsDB-lite2')
)
df_final = pd.concat([df_plot1, df_plot2]).sort_values(by=['topologies'], ascending=True)
# Split the label at '.' into its two parts
label_parts = df_final['topologies'].str.split('.',expand=True)
df_final[['class','subclass']] = label_parts


fig = px.box(df_final, x="topologies", y="recall", color='tool').update_layout(width=1000)
fig.write_image("plots/recall_pdbs.png")

Figure 7. Recall evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB classes.

In [87]:
# Same recall values, grouped by the 'class' column instead of 'topologies'
fig = px.box(df_final, x="class", y="recall", color='tool').update_layout(width=1000)
fig.write_image("plots/recall_pdbs_classes.png")

Figure 8. Recall evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB topologies.

In [88]:
# F score
# calc_metrics returns [bacc, p, se, sp, f1, mcc], and the per-chain rows place
# those values at positional columns 7-12 (RDB1) and 17-22 (RDB2). F1 is
# therefore at columns 11 and 21; columns 10 and 20 hold specificity.
df_boxplots = df_boxplots.rename(columns={11: 'f1', 21: 'f2'})
df_f1 = (
    df_boxplots[['f1', 'topologies']]
    .rename(columns={"f1": "f-score"})
    .assign(tool='RepeatsDB-lite1')
)
df_f2 = (
    df_boxplots[['f2', 'topologies']]
    .rename(columns={"f2": "f-score"})
    .assign(tool='RepeatsDB-lite2')
)
df_final = pd.concat([df_f1, df_f2]).sort_values(by=['topologies'], ascending=True)
# Split the label at '.' into its two parts
df_final[['class','subclass']] = df_final['topologies'].str.split('.',expand=True)

fig = px.box(df_final, x="topologies", y="f-score", color='tool')
fig.update_layout(width=1000, plot_bgcolor = "white", paper_bgcolor = "white",  font_color="black")
fig.write_image("plots/f-score_pdbs.png")

Figure 9. F-score evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB classes.

In [89]:
# Same f-score values, grouped by the 'class' column instead of 'topologies'
white_theme = dict(plot_bgcolor = "white", paper_bgcolor = "white",  font_color="black")
fig = px.box(df_final, x="class", y="f-score", color='tool').update_layout(width=1000, **white_theme)
fig.write_image("plots/f-score_pdbs_classes.png")

Figure 10. F-score evaluation of RepeatsDB-lite1 & 2 at target level for RepeatsDB topologies.

In [None]:
# f1 by method
# One box per value of the 'tool' column, over all rows of df_final
fscore_by_tool = px.box(df_final, x="tool", y="f-score")
fig = fscore_by_tool
fig.write_image("plots/f-score_by_method.png")

Figure 11. Boxplot distribution of F-score values of RepeatsDB-lite1 & 2 by method.