## Libraries

In [29]:
import pandas as pd
import numpy as np
from crepes import ConformalClassifier, ConformalPredictiveSystem
from crepes.extras import hinge, margin, binning, DifficultyEstimator

## Data

### Calibration 

In [30]:
data_dir = "calibration_and_test.xlsx" #directory
df_cal=pd.read_excel(data_dir,sheet_name='conf_score_25') #cal dataset with conf=0.25
df_cal_tp = df_cal[df_cal['TP']==1] # True Positive Predictions Only

# 43 samples from each class
sample_size = 43
df_sampled = df_cal_tp.groupby('pred_obj_id').apply(lambda x: x.sample(sample_size))
df_sampled = df_sampled.reset_index(drop=True)

df_test=pd.read_excel(data_dir,sheet_name='test_conf_score_25') #test dataset with conf=0.25
df_test_filtered = df_test[df_test['pred_obj_id'] != '-']
df_calibration=df_sampled
df_testing=df_test_filtered

In [31]:
from my_functions import (softmax, 
                          softmax_df, 
                          calculate_logit, 
                          calculate_sigmoid, 
                          calculate_softmax,
                          calculate_hinge_scores,
                          calculate_p_values,
                          calculate_prediction_sets)

## Calibration stage

In [32]:
alphas_cal = hinge(
    df_calibration[[
        'ascaris_conf_score',
        'trichuris_conf_score',
        'hookworm_conf_score',
        'schistosoma_conf_score']].values,
    [0,1,2,3],df_calibration[['pred_obj_id']].values
)
cc_std = ConformalClassifier()
cc_std.fit(alphas_cal)
display(cc_std)

ConformalClassifier(fitted=True, mondrian=False)

## Testing stage

### Applying the hinge on the testing dataset

In [33]:
conf_score_columns = [
    'ascaris_conf_score',
    'trichuris_conf_score',
    'hookworm_conf_score',
    'schistosoma_conf_score']
hinge_score_cols = [
    'ascaris_hinge_score', 
    'trichuris_hinge_score', 
    'hookworm_hinge_score', 
    'schistosoma_hinge_score']
df_testing = calculate_hinge_scores(
    df_testing, 
    conf_score_columns, 
    hinge_score_cols)

In [34]:
p_value_cols = [
    'ascaris_p_value', 
    'trichuris_p_value', 
    'hookworm_p_value', 
    'schistosoma_p_value']
df_testing = calculate_p_values(
    df_testing, 
    hinge_score_cols, 
    p_value_cols, 
    cc_std)

In [35]:
p_set_cols = [
    'ascaris_p_set', 
    'trichuris_p_set', 
    'hookworm_p_set', 
    'schistosoma_p_set']
df_testing = calculate_prediction_sets(
    df_testing, 
    hinge_score_cols, 
    p_set_cols, 
    cc_std,
    confidence=0.99)

In [117]:
non_confirmity_scores = df_testing[df_testing['gt_obj_id']!='-'][['ascaris_hinge_score', 'trichuris_hinge_score', 'hookworm_hinge_score', 'schistosoma_hinge_score']].values
classes = [0,1,2,3]
y_test = df_testing[df_testing['gt_obj_id']!= '-']['gt_obj_id'].values
# y_test = df_testing['gt_obj_id'].values
# y_test = y_test.

In [123]:
y_test

array([3, 0, 3, ..., 0, 0, 0], dtype=object)

In [122]:
cc_std.evaluate(non_confirmity_scores,classes,y_test)

IndexError: index 0 is out of bounds for axis 0 with size 0

### 1. Singleton TPs

In [52]:
# Count the number of instances for each ground truth class
gt_counts = df_testing['gt_obj_id'].value_counts()

# Convert the counts to a DataFrame and rename columns
gt_counts_df = gt_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Ground Truth Counts'})

# Count the number of instances for each predicted class, excluding '-'
pred_counts = df_testing[df_testing['pred_obj_id'] != '-']['pred_obj_id'].value_counts()

# Convert the counts to a DataFrame and rename columns
pred_counts_df = pred_counts.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Predicted Counts'})

# Replace numeric class labels with string labels, if applicable
if isinstance(gt_counts_df['Class'].iloc[0], int) and isinstance(pred_counts_df['Class'].iloc[0], int):
    gt_counts_df['Class'] = gt_counts_df['Class'].map(class_mapping)
    pred_counts_df['Class'] = pred_counts_df['Class'].map(class_mapping)

# Merge the DataFrames on the 'Class' column
counts_df = pd.merge(gt_counts_df, pred_counts_df, on='Class', how='outer')

# Fill NaN values with 0
counts_df.fillna(0, inplace=True)

# Convert counts to integers
counts_df['Ground Truth Counts'] = counts_df['Ground Truth Counts'].astype(int)
counts_df['Predicted Counts'] = counts_df['Predicted Counts'].astype(int)

# Display the DataFrame
counts_df

Unnamed: 0,Class,Ground Truth Counts,Predicted Counts
0,0,8915,9999
1,-,1468,0
2,1,523,563
3,3,133,410
4,2,51,118


In [36]:
p_set_cols = ['ascaris_p_set', 'trichuris_p_set', 'hookworm_p_set', 'schistosoma_p_set']

# Create a mask where each value is True if the corresponding value in p_set_cols is 1
mask = (df_testing[p_set_cols] == 1)

# Sum the True values along the rows. This gives the number of 1s in each row.
one_counts = mask.sum(axis=1)

# Create a mask where each value is True if the corresponding value in one_counts is 1 (singleton) and gt_obj_id matches pred_obj_id
mask_singleton_tp = (one_counts == 1) & (df_testing['gt_obj_id'] == df_testing['pred_obj_id'])

# Use the mask to filter the DataFrame
df_singleton_tp = df_testing[mask_singleton_tp]

In [37]:
# Filter out '-' class
df_testing_filtered = df_testing[df_testing['gt_obj_id'] != '-']
df_singleton_tp_filtered = df_singleton_tp[df_singleton_tp['gt_obj_id'] != '-']

# Calculate counts
gt_counts = df_testing_filtered['gt_obj_id'].value_counts()
pred_counts = df_testing_filtered['pred_obj_id'].value_counts()
singleton_counts = df_singleton_tp_filtered['gt_obj_id'].value_counts()

# Convert to DataFrame and merge
gt_counts_df = gt_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'GT Counts'})
pred_counts_df = pred_counts.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Pred Counts'})
singleton_counts_df = singleton_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Singleton Counts'})

counts_df = pd.merge(gt_counts_df, pred_counts_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, singleton_counts_df, on='Class', how='outer')

# Create a mapping dictionary
class_mapping = {0: 'ascaris', 1: 'trichuris', 2: 'hookworm', 3: 'schistosoma'}

# Replace numeric class labels with string labels
counts_df['Class'] = counts_df['Class'].replace(class_mapping)

counts_df

Unnamed: 0,Class,GT Counts,Pred Counts,Singleton Counts
0,ascaris,8915,8970,8847
1,trichuris,523,468,454
2,schistosoma,133,136,129
3,hookworm,51,48,17


In [38]:
# Create a mask where each value is True if the corresponding value in p_set_cols is 1
mask = (df_testing[p_set_cols] == 1)

# Sum the True values along the rows. This gives the number of 1s in each row.
one_counts = mask.sum(axis=1)

# Create a mask where each value is True if the corresponding value in one_counts is more than 1 (not singleton) and gt_obj_id matches pred_obj_id
mask_not_singleton_tp = (one_counts > 1) & (df_testing['gt_obj_id'] == df_testing['pred_obj_id'])

# Use the mask to filter the DataFrame
df_not_singleton_tp = df_testing[mask_not_singleton_tp]
not_singleton_counts = df_not_singleton_tp['gt_obj_id'].value_counts()
not_singleton_counts_df = not_singleton_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Not Singleton Counts'})

# Replace numeric class labels with string labels
not_singleton_counts_df['Class'] = not_singleton_counts_df['Class'].replace(class_mapping)

not_singleton_counts_df

Unnamed: 0,Class,Not Singleton Counts
0,trichuris,6
1,hookworm,1
2,ascaris,1


In [42]:
# Create a mask for false positives
mask_fp = ((df_testing['gt_obj_id'] != df_testing['pred_obj_id']) & (df_testing[p_set_cols].sum(axis=1) > 0)) | (df_testing['gt_obj_id'] == '-')

# Filter the DataFrame for false positives
df_fp = df_testing[mask_fp]

# Count the number of false positives for each class
fp_counts = df_fp['gt_obj_id'].value_counts()

# Convert the counts to a DataFrame and rename columns
fp_counts_df = fp_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'False Positives'})

# Replace numeric class labels with string labels, if applicable
if isinstance(fp_counts_df['Class'].iloc[0], int):
    fp_counts_df['Class'] = fp_counts_df['Class'].map(class_mapping)

# Display the DataFrame
fp_counts_df

Unnamed: 0,Class,False Positives
0,-,1468
1,1,54
2,2,32
3,0,30
4,3,2


In [43]:
# Create a mask for singleton false positives
mask_singleton_fp = df_fp[p_set_cols].sum(axis=1) == 1

# Filter the DataFrame for singleton false positives
df_singleton_fp = df_fp[mask_singleton_fp]

# Count the number of singleton false positives for each class
singleton_fp_counts = df_singleton_fp['gt_obj_id'].value_counts()

# Convert the counts to a DataFrame and rename columns
singleton_fp_counts_df = singleton_fp_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Singleton False Positives'})

# Replace numeric class labels with string labels, if applicable
if isinstance(singleton_fp_counts_df['Class'].iloc[0], int):
    singleton_fp_counts_df['Class'] = singleton_fp_counts_df['Class'].map(class_mapping)

# Display the DataFrame
singleton_fp_counts_df

Unnamed: 0,Class,Singleton False Positives
0,-,1327
1,1,50
2,2,32
3,0,25
4,3,2


In [44]:
# Create a mask for non-singleton false positives
mask_non_singleton_fp = df_fp[p_set_cols].sum(axis=1) > 1

# Filter the DataFrame for non-singleton false positives
df_non_singleton_fp = df_fp[mask_non_singleton_fp]

# Count the number of non-singleton false positives for each class
non_singleton_fp_counts = df_non_singleton_fp['gt_obj_id'].value_counts()

# Convert the counts to a DataFrame and rename columns
non_singleton_fp_counts_df = non_singleton_fp_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Non-Singleton False Positives'})

# Replace numeric class labels with string labels, if applicable
if isinstance(non_singleton_fp_counts_df['Class'].iloc[0], int):
    non_singleton_fp_counts_df['Class'] = non_singleton_fp_counts_df['Class'].map(class_mapping)

# Display the DataFrame
non_singleton_fp_counts_df

Unnamed: 0,Class,Non-Singleton False Positives
0,-,12
1,0,5
2,1,4


In [45]:
# Create a mask for empty false positives
mask_empty_fp = df_fp[p_set_cols].sum(axis=1) == 0

# Filter the DataFrame for empty false positives
df_empty_fp = df_fp[mask_empty_fp]

# Count the number of empty false positives for each class
empty_fp_counts = df_empty_fp['gt_obj_id'].value_counts()

# Convert the counts to a DataFrame and rename columns
empty_fp_counts_df = empty_fp_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Empty False Positives'})

# Replace numeric class labels with string labels, if applicable
if isinstance(empty_fp_counts_df['Class'].iloc[0], int):
    empty_fp_counts_df['Class'] = empty_fp_counts_df['Class'].map(class_mapping)

# Display the DataFrame
empty_fp_counts_df

Unnamed: 0,Class,Empty False Positives
0,-,129


In [54]:
# Create a mask for false negatives
mask_fn = ((df_testing['gt_obj_id'] != df_testing['pred_obj_id']) & (df_testing['pred_obj_id'] == '-'))

# Filter the DataFrame for false negatives
df_fn = df_testing[mask_fn]

# Count the number of false negatives for each class
fn_counts = df_fn['gt_obj_id'].value_counts()

# Convert the counts to a DataFrame and rename columns
fn_counts_df = fn_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'False Negatives'})

# Replace numeric class labels with string labels, if applicable
if not fn_counts_df.empty and isinstance(fn_counts_df['Class'].iloc[0], int):
    fn_counts_df['Class'] = fn_counts_df['Class'].map(class_mapping)

# Display the DataFrame
fn_counts_df

Unnamed: 0,Class,False Negatives


In [62]:
# Create a mask for singleton predictions across all p_set_cols
mask_singleton = df_testing[p_set_cols].applymap(lambda x: sum(x) == 1 if isinstance(x, set) else False)

# Calculate OneC: proportion of all predictions that are singleton
one_c = (mask_singleton.sum().sum()) / (mask_singleton.count().sum())

one_c

0.0

In [61]:
mask_singleton

Unnamed: 0,ascaris_p_set,trichuris_p_set,hookworm_p_set,schistosoma_p_set
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,1,1,1
4,1,1,1,1
...,...,...,...,...
11869,1,1,1,1
11870,1,1,1,1
11871,1,1,1,1
11873,1,1,1,1


In [62]:
df_testing.to_csv('hinge_conf_score.csv',index=False)