In [1]:
import json
import os
import types

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns  # needed by the layer-size boxplot cell (sns.boxplot)
from scipy.sparse import csr_matrix, load_npz, save_npz, coo_matrix
from scipy.stats import entropy
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm import tqdm

In [2]:
from sparsechem import load_results

In [3]:
# PPV/NPV threshold tags as they appear in the prediction and model file names
# (presumably '0_95' encodes 0.95 - confirm against the file naming scheme).
# The '0_99' setting is intentionally disabled.
pvs = [
    '0', '0_2', '0_4', '0_5', '0_6',
    '0_7', '0_8', '0_9', '0_95',
]

In [4]:
# Root directory of the baseline + auxiliary data set; the label matrix, the fold
# vector and the T8c task table are all read from below this directory.
datapath = '../aux_data_preperation/baseline_plus_aux_data/'


In [5]:
# Labels: restrict to the test fold right here so that less data is carried along.
fold_te = 0

# Fold assignment vector; its positions are matched against the rows of the label matrix.
path = os.path.join(datapath, 'matrices/cls/cls_T11_fold_vector.npy')
folds = np.load(path)

# Sparse label matrix.
path = os.path.join(datapath, 'matrices/cls/cls_T10_y.npz')
y = load_npz(path)

# Row indices that belong to the test fold (assumes `folds` is a 1-D vector).
fte_indx = np.flatnonzero(folds == fold_te)

# Convert to COO once and reuse it, instead of rebuilding the matrix for every column.
y_coo = coo_matrix(y)
y_df = pd.DataFrame({
    'label': y_coo.data,
    'row': y_coo.row,
    'col': y_coo.col,
}).query('row in @fte_indx')

In [6]:
# Select the labels of the main tasks that have a pseudolabel counterpart.
path = './files/mapping/baseline_image_model_baselineaux_task_mapping.csv'
df_matching = pd.read_csv(path)

# cont_classification_task_id holds the corresponding main task in the baseline+aux setup.
main_task_cols = df_matching['cont_classification_task_id'].tolist()
y_true_df = y_df[y_df['col'].isin(main_task_cols)]


In [7]:
# Select the labels of the auxiliary tasks, i.e. the pseudolabels.
path = os.path.join(datapath, 'results_tmp/classification/T8c.csv')
t8c = pd.read_csv(path)

# Auxiliary tasks are recognised by assay_type == 'AUX_HTS'. This relies on the
# baseline data set not containing HTS data; if it does, another approach is needed.
is_aux_hts = t8c['assay_type'] == 'AUX_HTS'
aux_task_cols = t8c.loc[is_aux_hts, 'cont_classification_task_id'].tolist()
y_pseudolabels_df = y_df[y_df['col'].isin(aux_task_cols)]

In [8]:
# Build the mapping: auxiliary (pseudolabel) task column -> main task column.
# The two tables are linked through the input assay id; the author's stated
# assumption is that every auxiliary task has a unique input assay id.
# After the merge the task id coming from T8c (baseline+aux model) carries the
# suffix '_baselineaux' and the one coming from the matching table '_matching'.
t8c_assay_tasks = t8c[['input_assay_id', 'cont_classification_task_id']]
matching_assay_tasks = df_matching[
    ['cont_classification_task_id', 'baseline_compliant_input_assay_id_image']
]

df_pseudolabel_to_maintasks = t8c_assay_tasks.merge(
    matching_assay_tasks,
    how='inner',
    left_on='input_assay_id',
    right_on='baseline_compliant_input_assay_id_image',
    suffixes=('_baselineaux', '_matching'),
)

In [9]:
# Attach the matching main-task id to every pseudolabel entry; pseudolabel columns
# without an entry in the mapping are dropped by the inner join.
y_pseudolabels_df_mapped = y_pseudolabels_df.merge(
    df_pseudolabel_to_maintasks,
    how='inner',
    left_on='col',
    right_on='cont_classification_task_id_baselineaux',
)

In [10]:
## now start picking up the predictions

In [11]:
# baseline

In [12]:
# Predictions of the baseline model (trained without auxiliary data).
path = '../predictions/preds/pred_inferencemodel_step3_fold0_baseline_plus_aux_baseline_noaux_-class.npy'
# NOTE: allow_pickle=True unpickles the stored object; only load files from a trusted source.
preds_baseline = np.load(path, allow_pickle=True).item()

# Convert to COO once and reuse it for all three columns.
preds_baseline_coo = coo_matrix(preds_baseline)
preds_baseline_df = pd.DataFrame({
    'pred': preds_baseline_coo.data,
    'row': preds_baseline_coo.row,
    'col': preds_baseline_coo.col,
})

FileNotFoundError: [Errno 2] No such file or directory: '../predictions/preds/pred_inferencemodel_step3_fold0_baseline_plus_aux_baseline_noaux_-class.npy'

In [13]:
# Pair each baseline prediction with its pseudolabel: a prediction at (row, col)
# is joined to the pseudolabel whose matching main-task id equals that col.
# Both inputs carry a 'col' column, so the result has 'col_x' (prediction column)
# and 'col_y' (pseudolabel column).
preds_labels_baseline_df = preds_baseline_df.merge(
    y_pseudolabels_df_mapped,
    left_on=['row', 'col'],
    right_on=['row', 'cont_classification_task_id_matching'],
)

NameError: name 'preds_baseline_df' is not defined

In [None]:
# baseline + aux data 

In [None]:
# Predictions of the models trained with auxiliary data, one data frame per
# PPV/NPV threshold tag.
preds_ppvnpv_df = {}
for pv in tqdm(pvs) :
    path = '../predictions/preds/pred_inferencemodel_step3_fold0_baseline_plus_aux_ppv{}_npv{}_-class.npy'.format(pv,pv)
    # NOTE: allow_pickle=True unpickles the stored object; only load files from a trusted source.
    preds_ppvnpv = np.load(path, allow_pickle=True).item()

    # Convert to COO once per file instead of once per column.
    preds_ppvnpv_coo = coo_matrix(preds_ppvnpv)
    preds_ppvnpv_df[pv] = pd.DataFrame({
        'pred': preds_ppvnpv_coo.data,
        'row': preds_ppvnpv_coo.row,
        'col': preds_ppvnpv_coo.col,
    })
    

In [None]:
# Pair the predictions of every auxiliary-data model with the pseudolabels,
# using the same (row, main-task id) join as for the baseline predictions.
preds_labels_ppvnpv_df = {
    pv: preds_ppvnpv_df[pv].merge(
        y_pseudolabels_df_mapped,
        left_on=['row', 'col'],
        right_on=['row', 'cont_classification_task_id_matching'],
    )
    for pv in tqdm(pvs)
}

In [None]:
# metrics calculation

In [None]:
def _roc_and_avgpr(frame):
    """Return (ROC AUC, average precision) of frame['pred'] scored against frame['label'].

    Errors raised by the sklearn metrics (e.g. when the labels hold a single
    class) are propagated unchanged.
    """
    roc = roc_auc_score(frame['label'], frame['pred'])
    avg_pr = average_precision_score(frame['label'], frame['pred'])
    return roc, avg_pr


# Per threshold tag: one metric value per task column, in the order of cols_to_consider[pv].
rocs = {}
avgprs = {}
rocs_baseline = {}
avgprs_baseline = {}
cols_to_consider = {}

for pv in tqdm(pvs) :

    rocs[pv] = []
    avgprs[pv] = []
    rocs_baseline[pv] = []
    avgprs_baseline[pv] = []

    # Task columns for which the auxiliary-data model has predictions matched to pseudolabels.
    cols_to_consider[pv] = preds_labels_ppvnpv_df[pv]['col_x'].drop_duplicates()

    for col in cols_to_consider[pv] :
        # baseline, evaluated on the same task column
        roc, avg_pr = _roc_and_avgpr(preds_labels_baseline_df.query('col_x == @col'))
        rocs_baseline[pv].append(roc)
        avgprs_baseline[pv].append(avg_pr)

        # with aux data
        roc, avg_pr = _roc_and_avgpr(preds_labels_ppvnpv_df[pv].query('col_x == @col'))
        rocs[pv].append(roc)
        avgprs[pv].append(avg_pr)

### Main plots

In [None]:
# Interpret each ROC AUC as a probability p, convert it to odds p / (1 - p) and
# plot, per threshold tag, the distribution over tasks of the odds ratio
# (with auxiliary data) / (without auxiliary data).
odds_ratios = []
for tag in rocs.keys():
    auc_aux = np.array(rocs[tag])
    auc_base = np.array(rocs_baseline[tag])
    odds_ratios.append((auc_aux / (1 - auc_aux)) / (auc_base / (1 - auc_base)))

plt.boxplot(odds_ratios, showfliers=False)
# boxplot places the boxes at x = 1..n
plt.xticks(list(range(1, len(pvs) + 1)), pvs, rotation=90)

# dashed reference line at an odds ratio of 1 (no change)
plt.plot([.5, len(pvs) + .5], [1, 1], c='blue', ls='--')

plt.ylabel('cross-AUC ROC odds (aux/noaux)')
plt.xlabel('task quality criterium (NPV and PPV > threshold)\nused for task selection')
plt.grid(ls='--', axis='y')
plt.savefig('./results/odds.png', bbox_inches='tight', pad_inches=0.2)

In [None]:
# Median cross-AUC ROC over tasks per threshold tag, with and without auxiliary data.
median_aux = [np.median(rocs[k]) for k in rocs.keys()]
median_baseline = [np.median(rocs_baseline[k]) for k in rocs.keys()]

plt.plot(median_aux)
plt.plot(median_baseline)
plt.xticks(list(range(len(pvs))), pvs, rotation=90)
plt.grid(axis='y', ls='--')
# Both curves are absolute medians (not differences), so the axis label must not say "delta".
plt.ylabel('median cross-AUC ROC')
plt.legend(['auxiliary data', 'no auxiliary data'])
plt.xlabel('task quality criterium (NPV and PPV > threshold)\nused for task selection')
plt.savefig('./results/cross_auc_roc.png'
            ,bbox_inches='tight'
            ,pad_inches=0.2
           )

### Layer size

In [None]:
# Per-task difference in primary ROC AUC (auxiliary model minus baseline) for
# every hidden-layer size and every threshold tag, collected in the long frame `ccd`.
sizes = [4000,6000,8000]

# The baseline file name does not depend on `size`, so it is loaded once.
# NOTE(review): the baseline is always the h4000 run, also when compared with the
# h6000 / h8000 auxiliary models - confirm that this is intended.
path = '../modelling/models/sc_baseline_plus_aux_baseline_noaux_2010_h4000_ldo0.8_wd1e-06_lr0.001_lrsteps10_ep20_fva0_fte-1.json'
prim_perf_baseline = load_results(path)

delta_frames = []

for size in tqdm(sizes) :

    # Reset per size: after the loop `prim_perf` holds the results of the last size only.
    prim_perf = {}

    for pv in pvs :
        path = '../modelling/models/sc_baseline_plus_aux_ppv{}_npv{}_2010_h{}_ldo0.8_wd1e-06_lr0.001_lrsteps10_ep20_fva0_fte-1.json'.format(pv,pv,size)
        prim_perf[pv] = load_results(path)
        delta_df = pd.DataFrame(
            prim_perf[pv]['validation']['classification']['roc_auc_score'][cols_to_consider[pv]]
            -prim_perf_baseline['validation']['classification']['roc_auc_score'][cols_to_consider[pv]]
        )
        delta_df['pv'] = pv
        delta_df['size'] = size
        delta_frames.append(delta_df.reset_index())

ccd = pd.concat(delta_frames)

In [None]:
# Box plot of the `roc_auc_score` column of `ccd`, grouped by threshold tag and
# coloured by hidden-layer size.
sns.boxplot(
    data=ccd,
    x="pv",
    y="roc_auc_score",
    hue="size",
    showfliers=False,
)

# dashed reference line at zero
plt.plot([-.5, len(pvs) - .5], [0, 0], c='blue', ls='--')

plt.ylabel('(primary) AUC ROC ')
plt.xlabel('task quality criterium (NPV and PPV > threshold)\nused for task selection')
plt.grid(ls='--', axis='y')
plt.savefig('./results/layer_size.png', bbox_inches='tight', pad_inches=0.2)

In [None]:
### Primary performance

In [None]:
# Best aggregate primary performance per threshold tag (maximum over the layer
# sizes), followed by the baseline as the last entry of each list.
# Note: the best ROC AUC and the best AUC PR are picked independently, so they
# may come from different layer sizes.
sizes = [4000,6000,8000]
p_roc = []
p_pr = []

for pv in tqdm(pvs) :
    perf_roc = 0
    perf_pr = 0
    for size in sizes :
        # `prim_perf` is deliberately not touched here: a later cell still indexes
        # the per-task results stored in it.
        path = '../modelling/models/sc_baseline_plus_aux_ppv{}_npv{}_2010_h{}_ldo0.8_wd1e-06_lr0.001_lrsteps10_ep20_fva0_fte-1.json'.format(pv,pv,size)

        # Load each results file once and read both aggregate metrics from it.
        agg_perf = load_results(path)['validation']['classification_agg']
        perf_roc = max(perf_roc, agg_perf['roc_auc_score'])
        perf_pr = max(perf_pr, agg_perf['auc_pr'])

    p_roc.append(perf_roc)
    p_pr.append(perf_pr)
p_roc.append(prim_perf_baseline['validation']['classification_agg']['roc_auc_score'])
p_pr.append(prim_perf_baseline['validation']['classification_agg']['auc_pr'])

In [None]:
# Best aggregate ROC AUC per threshold tag; the last point is the baseline.
tick_labels = pvs + ['baseline']
plt.plot(p_roc)
plt.ylabel('AUC ROC')
plt.xticks(list(range(len(pvs) + 1)), tick_labels, rotation=90)
plt.savefig('./results/global_auc_roc.png')

In [None]:
# Best aggregate AUC PR per threshold tag; the last point is the baseline.
tick_labels = pvs + ['baseline']
plt.plot(p_pr)
plt.ylabel('AUC PR')
plt.xticks(list(range(len(pvs) + 1)), tick_labels, rotation=90)
plt.savefig('./results/global_auc_pr.png')

### Data volumes 

In [None]:
# Number of task columns evaluated per threshold tag.
n_tasks = [len(cols_to_consider[tag]) for tag in rocs.keys()]
plt.plot(n_tasks)
plt.xticks(list(range(len(pvs))), pvs, rotation=90)
plt.xlabel('task quality criterium (NPV and PPV > threshold)\nused for task selection')
plt.ylabel('number of tasks')
plt.grid(ls='--', axis='y')

In [None]:
# Number of rows in each prediction frame (stored prediction entries) per threshold tag.
n_predictions = [len(preds_ppvnpv_df[tag]) for tag in rocs.keys()]
plt.plot(n_predictions)
plt.xticks(list(range(len(pvs))), pvs, rotation=90)
plt.xlabel('task quality criterium (NPV and PPV > threshold)\nused for task selection')
plt.ylabel('number of auxiliary datapoints')
plt.grid(ls='--', axis='y')

In [None]:

# Number of prediction/pseudolabel pairs (rows of the merged frame) per threshold tag.
n_matched = np.array([len(preds_labels_ppvnpv_df[tag]) for tag in rocs.keys()])
plt.plot(n_matched)
plt.xticks(list(range(len(pvs))), pvs, rotation=90)
plt.xlabel('task quality criterium (NPV and PPV > threshold)\nused for task selection')
plt.ylabel('number of auxiliary datapoints')
plt.grid(ls='--', axis='y')

### Primary vs secondary

In [None]:
# Per-task cross-AUC ROC at threshold tag '0_6': baseline on x, auxiliary-data model on y.
plt.scatter(rocs_baseline['0_6'], rocs['0_6'], alpha=0.2)
# diagonal y = x: points above it are tasks where the auxiliary-data model scores higher
plt.plot([0.3, 1], [0.3, 1.])
plt.xlabel('baseline')
plt.ylabel('with aux data')
plt.grid()
plt.title('cross-AUC ROC')

In [None]:
# One scatter panel per threshold tag: change in primary ROC AUC (x) against the
# relative change in cross-AUC ROC (y), one point per task column.
# squeeze=False keeps the axes in an array even when there is a single threshold tag.
fig, ax = plt.subplots(len(pvs), figsize=(10,55), squeeze=False)
ax = ax.ravel()

# Wrap the list (not the enumerate object) so tqdm knows the total.
for i,pv in enumerate(tqdm(pvs)) :

    # `prim_perf` holds whatever the layer-size cell left behind (its last size).
    delta_primary = (
        prim_perf[pv]['validation']['classification']['roc_auc_score'][cols_to_consider[pv]]
        -prim_perf_baseline['validation']['classification']['roc_auc_score'][cols_to_consider[pv]]
    )
    # NOTE(review): the delta is normalised by the auxiliary-model AUC, not by the
    # baseline AUC - confirm this is the intended definition of "relative".
    rel_delta_cross = (np.array(rocs[pv])-np.array(rocs_baseline[pv]))/np.array(rocs[pv])

    ax[i].scatter(delta_primary, rel_delta_cross, alpha=.3)
    # The line style has to be passed by keyword: the first positional argument of
    # grid() is the on/off flag, so grid('--') does not produce a dashed grid.
    ax[i].grid(ls='--')
    ax[i].set_xlabel('delta primary AUC ROC')
    ax[i].set_ylim([-.05,.25])
    ax[i].set_xlim([-.06,.03])
    # NOTE(review): the plotted values are fractions, not percentages, although
    # the label says '(%)' - confirm which one is wanted.
    ax[i].set_ylabel('relative delta cross-AUC ROC (%)')
    ax[i].set_title('NPV/PPV threshold {}'.format(pv))
    # dashed red reference lines through the origin (vertical, then horizontal)
    ax[i].plot([0,0], [-.05,0.25], c='red', ls='--')
    ax[i].plot([-.06,0.03], [0,0], c='red', ls='--')