In [1]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import json 
import os

In [2]:
def prob_ncm(scores, labels):
    """
    Turn model scores into nonconformity measures (NCMs) for conformal prediction.

    The score is taken to grow with the probability of the active class, so a
    high score is "conforming" for a positive label and "strange" for a
    non-positive one.

    Parameters:
    scores: array of model scores
    labels: array of hypothesised labels; values > 0 are treated as active

    Returns:
    array with -score where the label is positive and +score everywhere else
    """
    is_active = labels > 0
    negated_scores = -scores
    return np.where(is_active, negated_scores, scores)

### p-Values calculation
def p_values(calibration_alphas, test_alphas, randomized=False, rng=None):
    """
    Conformal p-values of test nonconformity scores against a calibration set.

    With n calibration scores:
      deterministic: p = (#{cal >= test} + 1) / (n + 1)
      randomized:    p = (#{cal > test} + u * #{cal == test} + 1) / (n + 1),
                     with u drawn uniformly from [0, 1) per test score

    Parameters:
    calibration_alphas: 1d array-like of calibration NCMs
    test_alphas: scalar or 1d array-like of test NCMs
    randomized: if True, break ties with the calibration scores at random
    rng: optional numpy Generator / RandomState used for the tie-breaker;
         None keeps the previous behaviour of drawing from the global
         np.random state

    Returns:
    p-value(s) for test_alphas (the randomized path always returns an array
    with at least one dimension)
    """
    # np.sort keeps the scores in a NumPy array; the built-in sorted() would
    # build a Python list that every searchsorted call has to convert back.
    sorted_cal_alphas = np.sort(np.asarray(calibration_alphas))
    n_cal = len(sorted_cal_alphas)

    # Number of calibration scores strictly below each test score.
    n_below = np.searchsorted(sorted_cal_alphas, test_alphas, side='left')
    if not randomized:
        return (n_cal - n_below + 1) / (n_cal + 1)

    # Number of calibration scores below or equal to each test score.
    n_below_or_equal = np.searchsorted(sorted_cal_alphas, test_alphas, side='right')
    tie_counts = n_below_or_equal - n_below

    # NOTE(review): the textbook smoothed p-value applies the uniform factor to
    # (ties + 1); the original formula is kept unchanged here - confirm intent.
    uniform = np.random.uniform if rng is None else rng.uniform
    tie_breaker = uniform(size=len(np.atleast_1d(test_alphas))) * tie_counts
    return (n_cal - (n_below_or_equal - tie_breaker) + 1) / (n_cal + 1)

# Mondrian Inductive Conformal Predictor
def micp(calibration_alphas,calibration_labels,test_alphas_0,test_alphas_1,randomized=False):
    """
    Mondrian Inductive Conformal Predictor

    The calibration set is split by label and each test hypothesis is compared
    only against the calibration examples of the matching class.

    Parameters:
    calibration_alphas: 1d array-like of Nonconformity Measures for the calibration examples
    calibration_labels: 1d array-like of labels for the calibration examples - ideally 0/1 or -1/+1,
                        but negative/positive values also accepted
                        (values <= 0 form class 0, values > 0 form class 1)
    test_alphas_0: 1d array of NCMs for the test examples, assuming 0 as label
    test_alphas_1: 1d array of NCMs for the test examples, assuming 1 as label
    randomized: forwarded to p_values; if True ties are broken at random

    Returns:
    p0,p1 : pair of arrays containing the p-values for label 0 and label 1

    Raises:
    ValueError: if the calibration arrays, or the two test arrays, differ in length
    """
    # Coerce up front so plain lists work with the boolean masks below.
    calibration_alphas = np.asarray(calibration_alphas)
    calibration_labels = np.asarray(calibration_labels)

    if len(calibration_labels) != len(calibration_alphas):
        raise ValueError("calibration_labels and calibration alphas must have the same size")

    if len(np.atleast_1d(test_alphas_0)) != len(np.atleast_1d(test_alphas_1)):
        raise ValueError("test_alphas_0 and test_alphas_1 must have the same size")

    is_class_1 = calibration_labels > 0
    is_class_0 = calibration_labels <= 0

    p_0 = p_values(calibration_alphas[is_class_0],
                   test_alphas_0,
                   randomized)
    p_1 = p_values(calibration_alphas[is_class_1],
                   test_alphas_1,
                   randomized)
    return p_0,p_1

# function to predict label from p0 and p1
def cp_label_predictor(p0, p1, eps):
    """
    Turn the p-values of both labels into a prediction at significance eps.

    A label is kept in the prediction set when its p-value is strictly
    greater than eps.

    Returns:
    1                 only label 1 is kept (active)
    0                 only label 0 is kept (inactive)
    'uncertain both'  both labels are kept
    'uncertain none'  no label is kept (the empty set; deliberately not
                      called 'empty' to avoid confusing readers)
    None              when no rule matches, e.g. a p-value is NaN
    """
    keeps_0, drops_0 = bool(p0 > eps), bool(p0 <= eps)
    keeps_1, drops_1 = bool(p1 > eps), bool(p1 <= eps)

    # (condition, outcome) pairs, checked in order.
    rules = (
        (keeps_1 and drops_0, 1),
        (keeps_0 and drops_1, 0),
        (keeps_0 and keeps_1, 'uncertain both'),
        (drops_0 and drops_1, 'uncertain none'),
    )
    for matched, outcome in rules:
        if matched:
            return outcome
    return None

In [3]:
# CONFIG

# Significance level passed to cp_label_predictor: a label stays in the
# prediction set only when its p-value is strictly greater than eps.
eps = 0.05

In [4]:
# Location of the saved predictions, relative to the working directory.
path_preds = '../image_predictions/all_cmpds/preds/pred_cpmodel_step2_inference_allcmpds-class.npy'
# NOTE(security): allow_pickle=True unpickles the file content, which can run
# arbitrary code - only load files from a trusted source.
# .item() pulls the single stored Python object out of the loaded array
# (presumably a scipy.sparse matrix, given the .nonzero() / .data usage
# further down - TODO confirm).
preds = np.load(path_preds,allow_pickle=True).item()

In [5]:
# Sanity check: dimensions of the loaded prediction object.
preds.shape

(30408, 284)

In [6]:
# Calibration data for the conformal predictors. Both dictionaries are looked
# up with str(column index) in the loop below: one holds the calibration
# labels, the other the matching calibration NCMs.
with open('../analysis/cp/labels_fva_dict.json') as fp:
    labels_fva_dict = json.load(fp)
with open('../analysis/cp/ncms_fva_fit_dict.json') as fp:
    ncms_fva_fit_dict = json.load(fp)

In [7]:
# Sorted, de-duplicated column indices of preds that contain at least one
# nonzero entry; these are the tasks processed in the loop below.
cols = list(np.unique(preds.nonzero()[1]))

In [8]:
# Per-task accumulators; rebuilt on every run so the cell can be re-executed.
indxs = []
n_active_preds = []
n_inactive_preds = []
n_uncertain_preds = []
cp_values = {}

for col in tqdm(cols):
    # Calibration NCMs and labels of this task (JSON keys are strings).
    ncms_fva_col = np.array(ncms_fva_fit_dict[str(col)])
    labels_fva_col = np.array(labels_fva_dict[str(col)])

    # NOTE(review): `.data` presumably holds only the entries stored for this
    # column (scipy.sparse) - the values are then NOT aligned with all rows of
    # preds, only with the rows that have a stored prediction. TODO confirm.
    preds_all_col = preds[:,col].data

    # NCM of every prediction under each label hypothesis.
    ncms_all_0 = prob_ncm(preds_all_col, np.repeat(0.,len(preds_all_col)))
    ncms_all_1 = prob_ncm(preds_all_col, np.repeat(1.,len(preds_all_col)))

    p0, p1 = micp(ncms_fva_col,labels_fva_col,ncms_all_0,ncms_all_1,randomized=False)
    cp_all = [cp_label_predictor(pe0, pe1, eps) for pe0, pe1 in zip(p0,p1)]

    cp_values[col] = cp_all

    # list.count gives plain integer tallies without building a temporary
    # boolean array per category. 'uncertain none' is not tallied.
    indxs.append(col)
    n_active_preds.append(cp_all.count(1))
    n_inactive_preds.append(cp_all.count(0))
    n_uncertain_preds.append(cp_all.count('uncertain both'))
    
    

0it [00:00, ?it/s]


In [9]:
# One row per task: how many predictions were active, inactive or
# 'uncertain both', plus the task's column index in preds.
df_stats = pd.DataFrame(
    {
        'n_active_pred': n_active_preds,
        'n_inactive_pred': n_inactive_preds,
        'n_uncertain_pred': n_uncertain_preds,
        'col_indx': indxs,
    }
)

In [10]:
# Show the per-task prediction counts.
df_stats

Unnamed: 0,n_active_pred,n_inactive_pred,n_uncertain_pred,col_indx


In [11]:
# Keep only tasks that have at least one active and one inactive prediction.
tasks_for_aux = df_stats.query('n_active_pred>0 and n_inactive_pred>0')['col_indx']

In [12]:
# Identifier for every row of `preds`. The row position is used as a stand-in.
# TODO: replace with the real compound ids if available (previously
# compound_map['compound_id']). They do not need to match the MELLODDY ids;
# those are mapped in step2_7.
compound_ids = np.arange(preds.shape[0])

aux_columns = ['standard_value', 'input_compound_id', 'standard_qualifier', 'input_assay_id']

arrs = []
for task in tqdm(tasks_for_aux):
    # cp_values[task] was computed from preds[:, task].data, so it has one
    # entry per stored prediction. `.tocoo().row` lists the row of each stored
    # entry in that same order.
    # Assumes preds is a scipy.sparse CSR/CSC matrix - TODO confirm.
    rows = preds[:, task].tocoo().row
    if len(rows) != len(cp_values[task]):
        raise ValueError(
            "task %s: %d conformal predictions but %d stored rows in preds"
            % (task, len(cp_values[task]), len(rows))
        )
    arrs.append(pd.DataFrame({
        'standard_value': cp_values[task],
        'input_compound_id': compound_ids[rows],
        'standard_qualifier': '=',
        'input_assay_id': task,
    }).query('standard_value == 0 or standard_value == 1'))  # drop 'uncertain ...' predictions

# pd.concat raises on an empty list; fall back to an empty table instead.
arr = pd.concat(arrs) if arrs else pd.DataFrame(columns=aux_columns)

In [None]:
# Sanity check: (rows, columns) of the auxiliary-task table.
arr.shape

In [None]:
# Create the output folder; exist_ok=True makes re-running the cell harmless.
os.makedirs('./files/image_pseudolabel_aux_nolabels', exist_ok=True)

In [None]:
# Write the auxiliary-task table as CSV, without the DataFrame index column.
arr.to_csv('./files/image_pseudolabel_aux_nolabels/T1_image_pseudolabel_aux_nolabels.csv',index=False)