In [1]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm

In [2]:
def prob_ncm(scores, labels):
    """
    Converts Neural Network scores into Nonconformity Measures for CP.
    Assumes that scores are directly related to the probability of being active
    """
    return np.where( labels > 0, -scores, scores )

### p-Values calculation
def p_values(calibration_alphas, test_alphas, randomized=False):
    sorted_cal_alphas = sorted(calibration_alphas)
    if randomized:
        # for each test alpha, tieBreaker is the (number of calibration alphas with the same value)*(uniform RV between 0 and 1)
        tie_counts = np.searchsorted(sorted_cal_alphas,test_alphas,side='right')-np.searchsorted(sorted_cal_alphas,test_alphas)
        tie_breaker = np.random.uniform(size=len(np.atleast_1d(test_alphas)))*tie_counts
        return  (len(calibration_alphas)-(np.searchsorted(sorted_cal_alphas,test_alphas,side='right')-tie_breaker)+1)/(len(calibration_alphas)+1)
    else:
        return  (len(calibration_alphas)-np.searchsorted(sorted_cal_alphas,test_alphas)+1)/(len(calibration_alphas)+1)

# Mondrian Inductive Conformal Predictor
def micp(calibration_alphas,calibration_labels,test_alphas_0,test_alphas_1,randomized=False):
    """
    Mondrian Inductive Conformal Predictor
    Parameters:
    calibration_alphas: 1d array of Nonconformity Measures for the calibration examples
    calibration_labels: 1d array of labels for the calibration examples - ideally 0/1 or -1/+1,
                        but negative/positive values also accepted
    test_alpha_0: 1d array of NCMs for the test examples, assuming 0 as label
    test_alpha_1: 1d array of NCMs for the test examples, assuming 1 as label
    Returns:
    p0,p1 : pair of arrays containing the p-values for label 0 and label 1
    """
    if not len(calibration_labels)==len(calibration_alphas):
        raise ValueError("calibration_labels and calibration alphas must have the same size")
    
    if not len(np.atleast_1d(test_alphas_0))==len(np.atleast_1d(test_alphas_1)):
        raise ValueError("test_alphas_0 and test_alphas_1 must have the same size")
    
    p_0 = p_values(calibration_alphas[calibration_labels<=0],
                   test_alphas_0,
                   randomized)
    p_1 = p_values(calibration_alphas[calibration_labels>0],
                   test_alphas_1,
                   randomized)
    return p_0,p_1

# function to predict label from p0 and p1
def cp_label_predictor(p0, p1, eps):
    # Active: p1 > ϵ and p0 ≤ ϵ
    # Inactive: p0 > ϵ and p1 ≤ ϵ
    # Uncertain (Both): p1 > ϵ and p0 > ϵ
    # Empty (None): p1 ≤ ϵ and p0 ≤ ϵ
    if p1 > eps and p0 <= eps:
        return 1
    elif p0 > eps and p1 <= eps:
        return 0
    elif p0 > eps and p1 > eps:
        return 'uncertain both'
    elif p0 <= eps and p1 <= eps:
        # return 'empty'
        # it should actually return 'empty', but to avoid a confusion for people
        return 'uncertain none'

In [145]:
# CONFIG 

eps = 0.05
fold_va = 2
    
fva_preds = '../predictions/preds/pred_cpmodel_step1_main_tasks_fold2-class.npy'
path_folds = '../data/y2_ext_with_image/matrices/cls/cls_T11_fold_vector.npy'
path_labels = '../data/y2_ext_with_image/matrices/cls/cls_T10_y.npy'
path_sn = '../data/y2_ext_with_image/results_tmp/folding/T2_folds.csv'
path_t5 = '../data/y2_ext_with_image/mapping_table/T5.csv' 
path_t6_cont = '../data/y2_ext_with_image/results/T10c_cont.csv'

In [None]:
folds = np.load(path_folds,allow_pickle=True)
labels = np.load(path_labels,allow_pickle=True).item()
preds_fva = np.load(fva_preds,allow_pickle=True).item()

In [None]:
labels.shape

In [None]:
sn = pd.read_csv(path_sn)

In [None]:
sn_fold2 = sn.query('fold_id == 2')
sn_scaffolds = sn_fold2.groupby(by='sn_smiles').count()['input_compound_id'].sort_values(ascending=False)
sn_scaffolds

In [43]:
sn_map = sn_scaffolds.reset_index().drop(columns='input_compound_id')
sn_map['fold_split'] = np.tile([0,1],reps=len(sn_scaffolds)//2) # ensuring similar size of both groups

In [56]:
sn_mgd = pd.merge(
    sn_fold2
    ,sn_map
    ,how='inner'
    ,on='sn_smiles'
)

In [65]:
assert len(sn_mgd) == len(sn_fold2)

In [None]:
len(sn_mgd)

In [None]:
# link to the cdi 

In [None]:
# t5 
input_compound_id,descriptor_vector_id
# t6_cont
descriptor_vector_id,cont_descriptor_vector_id

In [93]:
t5 = pd.read_csv(path_t5)
t6_cont = pd.read_csv(path_t6_cont)

In [94]:
df_mgd = pd.merge(
        pd.merge(
            t5
            ,t6_cont
            ,how='inner'
            ,on='descriptor_vector_id'
        ), 
        sn_mgd
        ,how='inner'
        ,on='input_compound_id'
)

In [None]:
df_mgd.shape

In [None]:
df_mgd.columns

In [None]:
# half of fold2 will be used to fit the CP, 
# half of fold2 will be used to evaluate the CP

In [None]:
real_cdvi = pd.DataFrame(
    sorted(t6_cont['cont_descriptor_vector_id'].drop_duplicates())
)[0].to_dict()# .reset_index()
real_cdvi = {v:k for k,v in real_cdvi.items()}
real_cdvi

In [None]:
df_mgd['real_cont_descriptor_vector_id'] = df_mgd['cont_descriptor_vector_id'].map(real_cdvi)

In [None]:
cdvi_fit = np.array(list(set(df_mgd.query('fold_split == 0')['real_cont_descriptor_vector_id'])))
cdvi_eval = np.array(list(set(df_mgd.query('fold_split == 1')['real_cont_descriptor_vector_id'])))

In [None]:
cdvi_fit.shape

In [None]:
cdvi_eval.shape

In [None]:
######## CP stuff ########

In [192]:
e_inacts = []
e_acts = []
val_inacts = []
val_acts = []
lit_val_inacts = []
lit_val_acts = []
unis = []
idxs = []
n_acts = []
n_inacts = []
ncms_fva_fit_dict = {}
labels_fva_fit_dict = {}


for col in tqdm(list(np.unique(preds_fva.nonzero()[1]))):
    try: 
        row_idx_preds_fit = np.intersect1d(
            preds_fva[:,col].nonzero()[0]
            ,cdvi_fit
        )
        row_idx_preds_eval = np.intersect1d(
            preds_fva[:,col].nonzero()[0]
            ,cdvi_eval
        )
        preds_fva_col = preds_fva[row_idx_preds_fit,col].toarray().squeeze()
        preds_fte_col = preds_fva[row_idx_preds_eval,col].toarray().squeeze()
        
        row_idx_labels_fit = np.intersect1d(
            labels[:,col].nonzero()[0]
            ,cdvi_fit
        )
        row_idx_labels_eval = np.intersect1d(
            labels[:,col].nonzero()[0]
            ,cdvi_eval
        )
        
        labels_fva_col = labels[row_idx_labels_fit,col].toarray().squeeze()
        labels_fva_col = np.where(labels_fva_col == -1,0,1)
        labels_fte_col = labels[row_idx_labels_eval,col].toarray().squeeze()
        labels_fte_col = np.where(labels_fte_col == -1,0,1)

        ncms_fva = prob_ncm(preds_fva_col, labels_fva_col)
        ncms_fva_fit_dict[str(col)] = ncms_fva.tolist()  # use tolist() to avoid difficulties with the serialisation
        labels_fva_fit_dict[str(col)] = labels_fva_col.tolist() # use tolist() to avoid difficulties with the serialisation
        #ncms_test_0 = prob_ncm(preds_fte_col, labels_fte_col)
        #ncms_test_1 = prob_ncm(preds_fte_col, labels_fte_col)
        ncms_test_0 = prob_ncm(preds_fte_col, np.repeat(0.,len(preds_fte_col)))
        ncms_test_1 = prob_ncm(preds_fte_col, np.repeat(1.,len(preds_fte_col)))

        p0, p1 = micp(ncms_fva,labels_fva_col,ncms_test_0,ncms_test_1,randomized=False)

        cp_test = [cp_label_predictor(pe0, pe1, eps) for pe0, pe1 in zip(p0,p1)]
        certain_idcs = np.where((np.array(cp_test) == '0') | (np.array(cp_test) == '1'))[0]
        idx_uncertain_none = np.where([e == 'uncertain none' for e in cp_test])[0]
        idx_uncertain_both = np.where([e == 'uncertain both' for e in cp_test])[0]
        idx_inact = np.where(labels_fte_col == 0)[0]
        idx_inact_certain = np.intersect1d(idx_inact,certain_idcs)
        idx_inact_both = np.intersect1d(idx_inact,idx_uncertain_both)
        idx_act = np.where(labels_fte_col == 1)[0]
        idx_act_certain = np.intersect1d(idx_act,certain_idcs)
        idx_act_both = np.intersect1d(idx_act,idx_uncertain_both)

        # efficiency 
        efficiency_inact = len(idx_inact_certain) / len(idx_inact)
        efficiency_act = len(idx_act_certain) / len(idx_act)

        # validity 
        validity_inact = \
             np.sum(np.array(cp_test)[idx_inact_certain] == labels_fte_col[idx_inact_certain].astype(str)) / \
             len(np.array(cp_test)[idx_inact_certain])
        validity_act = \
            np.sum(np.array(cp_test)[idx_act_certain] == labels_fte_col[idx_act_certain].astype(str)) / \
            len(np.array(cp_test)[idx_act_certain])

        # literature validity 
        literature_validity_inact = \
             (np.sum(np.array(cp_test)[idx_inact_certain] == labels_fte_col[idx_inact_certain].astype(str)) \
             + len(idx_inact_both)) / \
             len(idx_inact)
        literature_validity_act = \
            (np.sum(np.array(cp_test)[idx_act_certain] == labels_fte_col[idx_act_certain].astype(str)) \
            + len(idx_act_both)) / \
            len(idx_act)


        uni = np.unique(cp_test)

        e_inacts.append(efficiency_inact)
        e_acts.append(efficiency_act)
        val_inacts.append(validity_inact)
        val_acts.append(validity_act)
        lit_val_inacts.append(literature_validity_inact)
        lit_val_acts.append(literature_validity_act)
        unis.append(str(list(uni)))
        idxs.append(col)
        n_acts.append(len(idx_act))
        n_inacts.append(len(idx_inact))

    except Exception as e:
        print(e)
        


 63%|██████▎   | 525/834 [00:24<00:14, 21.33it/s]

division by zero


100%|██████████| 834/834 [00:38<00:00, 21.87it/s]


In [193]:
# storing the inputs to the micp() function in order to obtain the CP labels for the inference predictions
import json 
with open('./cp/ncms_fva_fit_dict.json', 'w') as fp:
	json.dump(ncms_fva_fit_dict, fp)
with open('./cp/labels_fva_dict.json', 'w') as fp:
	json.dump(labels_fva_fit_dict, fp)


In [176]:
df = pd.DataFrame({
    'n_inactives_eval':n_inacts
    ,'n_actives_eval':n_acts
    ,'efficiency_0' : e_inacts
    ,'efficiency_1':e_acts
    ,'validity_0':val_inacts
    ,'validity_1':val_acts
    ,'literature_validity_0':lit_val_inacts
    ,'literature_validity_1':lit_val_acts
    ,'valuess':unis
    ,'index':idxs
}).to_csv('./cp/summary_eps_' + str(eps) + '.csv', index=False)