In [None]:
%load_ext autoreload
%autoreload 2

import json
import os
import pickle
from collections import defaultdict
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import rcParams
from matplotlib.lines import Line2D

from plot_utils import (median_score_compound, median_score_single, npflatten,
                        plot_wrapper, ratio_active_compound,
                        ratio_active_single)

# Global matplotlib settings applied to every figure drawn after this cell.
# Only the font size and the LaTeX-rendering switch are overridden; other keys
# that were considered ('axes.labelsize', 'legend.fontsize', 'xtick.labelsize',
# 'ytick.labelsize', 'figure.figsize') are intentionally left at their defaults.
params = {'font.size': 14, 'text.usetex': False}
rcParams.update(params)


In [None]:
# Folder structure is as follows:
# /results/{algorithm}/{chid}/{date}/
# algorithm: the specific optimization algorithm
# chid: a ChEMBL assay ID
# date: one folder per run; the date/time is when the run finished

from plot_utils import load_chid
# Root folder of the stored runs, relative to the notebook's working directory.
results_dir = Path('results')

# The names in `order` key the colour and legend-label lookups below.
order = ('Split1', 'Split2', 'Split1_alt')
col_dict = {name: colour for name, colour in zip(order, ('blue', 'red', 'green'))}
legend_dict = {name: label for name, label in zip(order, ('Optimizer', 'Control', 'Control 2'))}

In [None]:
# One record per assay used in this notebook: (assay id, target name, target id).
columns = ['AssayID', 'TargetName', 'TargetID']
rows = [
    ('CHEMBL3888429', 'JAK2', 'CHEMBL2971'),
    ('CHEMBL1909203', 'EGFR', 'CHEMBL203'),
    ('CHEMBL1909140', 'DRD2', 'CHEMBL217'),
]

assay_info = pd.DataFrame.from_records(rows, columns=columns)

# Lookup from assay id to its target name.
chid_name = assay_info.set_index('AssayID')['TargetName'].to_dict()

In [None]:
# Class balance of each assay: number of active / inactive compounds and the
# total size, read from the processed assay CSVs and appended to `assay_info`
# as the columns 'active', 'inactive' and 'size'.
assay_stats = []

for chid in assay_info.AssayID:
    assay_data = pd.read_csv(f'./assays/processed/{chid}.csv')
    label = assay_data['label']

    # Assumes `label` is a 0/1 (or boolean) column, so its sum is the number
    # of actives -- TODO confirm against the preprocessing step.
    n_active = label.sum()
    assay_stats.append({
        'active': n_active,
        'inactive': len(label) - n_active,
        'size': len(label),
    })

assay_stats_df = pd.DataFrame(assay_stats)

# `axis` must be passed by keyword: the positional form is deprecated and is
# rejected by pandas >= 2.0. Columns left over from a previous execution of
# this cell are dropped first so that re-running it does not duplicate them.
assay_info = pd.concat(
    [assay_info.drop(columns=assay_stats_df.columns, errors='ignore'), assay_stats_df],
    axis=1,
)

In [None]:
# Pool, per assay, all AUC values returned by `load_chid` for both optimizers
# and summarise them as (mean, std).
algorithm = 'lstm_hc'

# Assay ids are taken from the sub-folders of one optimizer's result folder.
# Hidden entries and plain files (e.g. '.DS_Store', '.ipynb_checkpoints') are
# skipped because they are not assay folders and would break the loading
# below. Assumes every optimizer has results for the same assays -- TODO confirm.
chids = [
    name for name in os.listdir(results_dir/algorithm)
    if not name.startswith('.') and (results_dir/algorithm/name).is_dir()
]

aucs = defaultdict(list)
for optimizer in ['graph_ga', 'lstm_hc']:
    for chid in chids:
        # Only the third return value is used here; it is treated as a mapping
        # whose values are iterables of AUC numbers.
        _, _, auc = load_chid(results_dir/optimizer/chid, order)
        aucs[chid].extend(v for x in auc.values() for v in x)

# Mean and standard deviation (numpy default, ddof=0) of the pooled AUCs.
auc_ms = {}
for chid, pooled in aucs.items():
    a = np.array(pooled)
    auc_ms[chid] = (a.mean(), a.std())

for chid, (m,s) in auc_ms.items():
    print(f'{chid_name[chid]}: {m:.2f} ± {s:.2f}')

JAK2: 0.78 ± 0.03
EGFR: 0.76 ± 0.05
DRD2: 0.86 ± 0.03


# Calculate AUCs with stds

In [None]:
# Turn the {assay id: (mean, std)} mapping into a table with one row per assay.
auc_df = pd.DataFrame(auc_ms).transpose()
auc_df.columns = ['AUC_mean', 'AUC_std']
auc_df['AssayID'] = auc_df.index

# Attach the AUC statistics to the assay table. AUC columns left over from a
# previous execution of this cell are dropped first; otherwise a second merge
# would produce 'AUC_mean_x' / 'AUC_mean_y' and break the cells below.
# The merge is an inner join (pandas default), so assays without an entry in
# `auc_ms` are dropped from `assay_info`.
assay_info = (
    assay_info
    .drop(columns=['AUC_mean', 'AUC_std'], errors='ignore')
    .merge(auc_df, on='AssayID')
)
assay_info

Unnamed: 0,AssayID,TargetName,TargetID,active,inactive,size,AUC_mean,AUC_std
0,CHEMBL3888429,JAK2,CHEMBL2971,140,527,667,0.781237,0.025167
1,CHEMBL1909203,EGFR,CHEMBL203,40,802,842,0.759297,0.052238
2,CHEMBL1909140,DRD2,CHEMBL217,59,783,842,0.860297,0.025578


In [None]:
# LaTeX "mean \pm std" cell for each assay, rounded to two decimals.
# A raw f-string is used so that the backslash of "\pm" is a literal character
# rather than an invalid escape sequence (which triggers a DeprecationWarning /
# SyntaxWarning on recent Python versions); the resulting text is unchanged.
pm = [rf"${auc:.2f}\pm{std:.2f}$" for auc, std in zip(assay_info['AUC_mean'], assay_info['AUC_std'])]
assay_info['AUC'] = pm
assay_info

Unnamed: 0,AssayID,TargetName,TargetID,active,inactive,size,AUC_mean,AUC_std,AUC
0,CHEMBL3888429,JAK2,CHEMBL2971,140,527,667,0.781237,0.025167,$0.78\pm0.03$
1,CHEMBL1909203,EGFR,CHEMBL203,40,802,842,0.759297,0.052238,$0.76\pm0.05$
2,CHEMBL1909140,DRD2,CHEMBL217,59,783,842,0.860297,0.025578,$0.86\pm0.03$


In [None]:
# Export the summary table as LaTeX: choose the columns to publish, give them
# display headers, and print the tabular source so it can be copied.
columns = ['TargetName', 'AssayID', 'active', 'inactive', 'AUC']
names = ['Target', 'ChEMBL ID', 'Active', 'Inactive', 'AUC']
header_map = dict(zip(columns, names))
df_fil = assay_info[columns].rename(columns=header_map)
# escape=False keeps the "$...\pm...$" math in the AUC column intact.
lt = df_fil.to_latex(float_format="%.2f", escape=False, index=False)
print(lt)

\begin{tabular}{llrrl}
\toprule
Target &      ChEMBL ID &  Active &  Inactive &            AUC \\
\midrule
  JAK2 &  CHEMBL3888429 &     140 &       527 &  $0.78\pm0.03$ \\
  EGFR &  CHEMBL1909203 &      40 &       802 &  $0.76\pm0.05$ \\
  DRD2 &  CHEMBL1909140 &      59 &       783 &  $0.86\pm0.03$ \\
\bottomrule
\end{tabular}

