## Libraries

In [1]:
import pandas as pd
import numpy as np
from crepes import ConformalClassifier, ConformalPredictiveSystem
from crepes.extras import hinge, margin, binning, DifficultyEstimator

## Data

### Calibration 

In [46]:
data_dir = "calibration_and_test.xlsx" #directory
df_cal=pd.read_excel(data_dir,sheet_name='conf_score_25') #cal dataset with conf=0.25
# df_cal_tp = df_cal[df_cal['TP']==1] # True Positive Predictions Only

# Create a mask where 'gt_obj_id' is '-'
mask = df_cal['gt_obj_id'] == '-'

# Replace the 'gt_obj_id' and 'pred_obj_id' values with 4 where the mask is True
df_cal.loc[mask, 'gt_obj_id'] = 4
df_cal.loc[mask, 'pred_obj_id'] = 4
df_equal = df_cal[df_cal['gt_obj_id'] == df_cal['pred_obj_id']]

sample_size = 43

# Set a seed for the random number generator
seed = 100

df_sampled = df_equal.groupby('pred_obj_id').apply(lambda x: x.sample(sample_size, replace=True, random_state=seed))
df_sampled = df_sampled.reset_index(drop=True)

In [58]:
df_sampled[df_sampled['gt_obj_id'] == 3].shape

(43, 22)

In [29]:


# 43 samples from each class


df_test=pd.read_excel(data_dir,sheet_name='test_conf_score_25') #test dataset with conf=0.25
df_test_filtered = df_test[df_test['pred_obj_id'] != '-']
df_calibration=df_sampled
df_testing=df_test_filtered

In [30]:
from my_functions import (softmax, 
                          softmax_df, 
                          calculate_logit, 
                          calculate_sigmoid, 
                          calculate_softmax,
                          calculate_hinge_scores,
                          calculate_p_values,
                          calculate_prediction_sets)

## Calibration stage

In [31]:
alphas_cal = hinge(
    df_calibration[[
        'ascaris_conf_score',
        'trichuris_conf_score',
        'hookworm_conf_score',
        'schistosoma_conf_score']].values,
    [0,1,2,3],df_calibration[['pred_obj_id']].values
)
cc_std = ConformalClassifier()
cc_std.fit(alphas_cal)
display(cc_std)

ConformalClassifier(fitted=True, mondrian=False)

## Testing stage

### Applying the hinge on the testing dataset

In [32]:
conf_score_columns = [
    'ascaris_conf_score',
    'trichuris_conf_score',
    'hookworm_conf_score',
    'schistosoma_conf_score']
hinge_score_cols = [
    'ascaris_hinge_score', 
    'trichuris_hinge_score', 
    'hookworm_hinge_score', 
    'schistosoma_hinge_score']
df_testing = calculate_hinge_scores(
    df_testing, 
    conf_score_columns, 
    hinge_score_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [33]:
p_value_cols = [
    'ascaris_p_value', 
    'trichuris_p_value', 
    'hookworm_p_value', 
    'schistosoma_p_value']
df_testing = calculate_p_values(
    df_testing, 
    hinge_score_cols, 
    p_value_cols, 
    cc_std)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [34]:
p_set_cols = [
    'ascaris_p_set', 
    'trichuris_p_set', 
    'hookworm_p_set', 
    'schistosoma_p_set']
df_testing = calculate_prediction_sets(
    df_testing, 
    hinge_score_cols, 
    p_set_cols, 
    cc_std,
    confidence=0.99)

In [35]:
non_confirmity_scores = df_testing[df_testing['gt_obj_id']!='-'][['ascaris_hinge_score', 'trichuris_hinge_score', 'hookworm_hinge_score', 'schistosoma_hinge_score']].values
non_confirmity_scores = non_confirmity_scores.astype(np.float64)
classes = [0,1,2,3]
y_test = df_testing[df_testing['gt_obj_id']!= '-']['gt_obj_id'].values
y_test = y_test.astype(np.int64)
cc_std.evaluate(non_confirmity_scores,classes, y_test, metrics=['error','avg_c','one_c','empty'])

{'error': 0.0392849719393058,
 'avg_c': 0.9701725213053419,
 'one_c': 0.9697568073165662,
 'empty': 0.030035335689045935}

### 1. Singleton TPs

In [36]:
# Filter out '-' class
df_testing_filtered = df_testing[df_testing['gt_obj_id'] != '-']

# Calculate counts
gt_counts = df_testing_filtered['gt_obj_id'].value_counts()
pred_counts = df_testing_filtered['pred_obj_id'].value_counts()

# Create a mask where each value is True if the corresponding value in p_set_cols is 1
mask = (df_testing[p_set_cols] == 1)

# Sum the True values along the rows. This gives the number of 1s in each row.
one_counts = mask.sum(axis=1)

# Create a mask where each value is True if the corresponding value in one_counts is 1 (singleton) and gt_obj_id matches pred_obj_id
mask_singleton_tp = (one_counts == 1) & (df_testing['gt_obj_id'] == df_testing['pred_obj_id'])

# Use the mask to filter the DataFrame
df_singleton_tp = df_testing[mask_singleton_tp]
singleton_counts = df_singleton_tp['gt_obj_id'].value_counts()

# For non-singleton sets, we want rows where the sum of 1s is more than 1
mask_non_singleton = (one_counts > 1) & (df_testing['gt_obj_id'] == df_testing['pred_obj_id'])
df_non_singleton_tp = df_testing[mask_non_singleton]
non_singleton_counts = df_non_singleton_tp['gt_obj_id'].value_counts()

# For empty sets, we want rows where the sum of 1s is 0
mask_empty_set = (one_counts == 0) & (df_testing['gt_obj_id'] == df_testing['pred_obj_id'])
df_empty_set_tp = df_testing[mask_empty_set]
empty_set_counts = df_empty_set_tp['gt_obj_id'].value_counts()

# Convert to DataFrame and merge
gt_counts_df = gt_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'GT Counts'})
pred_counts_df = pred_counts.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Pred Counts'})
singleton_counts_df = singleton_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Singleton Counts'})
non_singleton_counts_df = non_singleton_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Non-Singleton Counts'})
empty_set_counts_df = empty_set_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Empty Set Counts'})

counts_df = pd.merge(gt_counts_df, pred_counts_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, singleton_counts_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, non_singleton_counts_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, empty_set_counts_df, on='Class', how='outer')

# Create a mapping dictionary
class_mapping = {0: 'ascaris', 1: 'trichuris', 2: 'hookworm', 3: 'schistosoma'}

# Replace numeric class labels with string labels
counts_df['Class'] = counts_df['Class'].replace(class_mapping)

counts_df

Unnamed: 0,Class,GT Counts,Pred Counts,Singleton Counts,Non-Singleton Counts,Empty Set Counts
0,ascaris,8915,8970,8847,1.0,36
1,trichuris,523,468,454,6.0,6
2,schistosoma,133,136,129,,2
3,hookworm,51,48,17,1.0,1


## FNs

In [37]:
# Filter out '-' class and create a new DataFrame
df_filtered = df_testing[df_testing['gt_obj_id'] != '-']

# Calculate counts
gt_counts = df_filtered['gt_obj_id'].value_counts()
pred_counts = df_filtered['pred_obj_id'].value_counts()

# Create a mask where each value is True if the corresponding value in p_set_cols is 1
mask = (df_filtered[p_set_cols] == 1)

# Sum the True values along the rows. This gives the number of 1s in each row.
one_counts = mask.sum(axis=1)

# Create a mask where each value is True if the corresponding value in one_counts is 1 (singleton) and gt_obj_id does not match pred_obj_id
mask_singleton_fn = (one_counts == 1) & (df_filtered['gt_obj_id'] != df_filtered['pred_obj_id'])

# Use the mask to filter the DataFrame
df_singleton_fn = df_filtered[mask_singleton_fn]
singleton_counts_fn = df_singleton_fn['gt_obj_id'].value_counts()

# For non-singleton sets, we want rows where the sum of 1s is more than 1 and the corresponding class in the set is 0
mask_non_singleton = (one_counts > 1) & ~mask
df_non_singleton_fn = df_filtered[mask_non_singleton]
non_singleton_counts_fn = df_non_singleton_fn['gt_obj_id'].value_counts()

# For empty sets, we want rows where the sum of 1s is 0
mask_empty_set = (one_counts == 0) & (df_filtered['gt_obj_id'] != df_filtered['pred_obj_id'])
df_empty_set_fn = df_filtered[mask_empty_set]
empty_set_counts_fn = df_empty_set_fn['gt_obj_id'].value_counts()

# Convert to DataFrame and merge
gt_counts_df = gt_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'GT Counts'})
pred_counts_df = pred_counts.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Pred Counts'})
singleton_counts_fn_df = singleton_counts_fn.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Singleton FN Counts'})
non_singleton_counts_fn_df = non_singleton_counts_fn.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Non-Singleton FN Counts'})
empty_set_counts_fn_df = empty_set_counts_fn.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'Empty Set FN Counts'})

counts_df = pd.merge(gt_counts_df, pred_counts_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, singleton_counts_fn_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, non_singleton_counts_fn_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, empty_set_counts_fn_df, on='Class', how='outer')

# Create a mapping dictionary
class_mapping = {0: 'ascaris', 1: 'trichuris', 2: 'hookworm', 3: 'schistosoma'}

# Replace numeric class labels with string labels
counts_df['Class'] = counts_df['Class'].replace(class_mapping)

counts_df

Unnamed: 0,Class,GT Counts,Pred Counts,Singleton FN Counts,Non-Singleton FN Counts,Empty Set FN Counts
0,ascaris,8915,8970,25,,1.0
1,trichuris,523,468,50,,3.0
2,schistosoma,133,136,2,,
3,hookworm,51,48,32,,


## FPs


In [38]:
# Filter out '-' class
df_filtered = df_testing[df_testing['gt_obj_id'] != '-']

# Calculate counts
gt_counts = df_filtered['gt_obj_id'].value_counts()
pred_counts = df_filtered['pred_obj_id'].value_counts()

# Create a mask where each value is True if the corresponding value in p_set_cols is 1
mask = (df_filtered[p_set_cols] == 1)

# Sum the True values along the rows. This gives the number of 1s in each row.
one_counts = mask.sum(axis=1)

# Create a mask where each value is True if the corresponding value in one_counts is 1 (singleton) and gt_obj_id does not match pred_obj_id
mask_singleton_fp = (one_counts == 1) & (df_filtered['gt_obj_id'] != df_filtered['pred_obj_id'])

# Use the mask to filter the DataFrame
df_singleton_fp = df_filtered[mask_singleton_fp]
singleton_counts_fp = df_singleton_fp['pred_obj_id'].value_counts()

# For non-singleton sets, we want rows where the sum of 1s is more than 1 and the corresponding class in the set is 0
mask_non_singleton = (one_counts > 1) & ~mask
df_non_singleton_fp = df_filtered[mask_non_singleton]
non_singleton_counts_fp = df_non_singleton_fp['pred_obj_id'].value_counts()

# For empty sets, we want rows where the sum of 1s is 0
mask_empty_set = (one_counts == 0) & (df_filtered['gt_obj_id'] != df_filtered['pred_obj_id'])
df_empty_set_fp = df_filtered[mask_empty_set]
empty_set_counts_fp = df_empty_set_fp['pred_obj_id'].value_counts()

# Convert to DataFrame and merge
gt_counts_df = gt_counts.reset_index().rename(columns={'index': 'Class', 'gt_obj_id': 'GT Counts'})
pred_counts_df = pred_counts.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Pred Counts'})
singleton_counts_fp_df = singleton_counts_fp.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Singleton FP Counts'})
non_singleton_counts_fp_df = non_singleton_counts_fp.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Non-Singleton FP Counts'})
empty_set_counts_fp_df = empty_set_counts_fp.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': 'Empty Set FP Counts'})

counts_df = pd.merge(gt_counts_df, pred_counts_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, singleton_counts_fp_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, non_singleton_counts_fp_df, on='Class', how='outer')
counts_df = pd.merge(counts_df, empty_set_counts_fp_df, on='Class', how='outer')

# Create a mapping dictionary
class_mapping = {0: 'ascaris', 1: 'trichuris', 2: 'hookworm', 3: 'schistosoma'}

# Replace numeric class labels with string labels
counts_df['Class'] = counts_df['Class'].replace(class_mapping)

counts_df

Unnamed: 0,Class,GT Counts,Pred Counts,Singleton FP Counts,Non-Singleton FP Counts,Empty Set FP Counts
0,ascaris,8915,8970,79,,3.0
1,trichuris,523,468,1,,
2,schistosoma,133,136,4,,1.0
3,hookworm,51,48,25,,


In [39]:
# Create a mask where each value is True if gt_obj_id is "-"
mask = (df_testing['gt_obj_id'] == '-')

# Use the mask to filter the DataFrame
df_filtered = df_testing[mask]

def count_sets(df_filtered, p_set_cols, condition, column_name):
    # Create a mask where each value is True if the corresponding value in p_set_cols is 1
    mask = (df_filtered[p_set_cols] == 1)

    # Sum the True values along the rows. This gives the number of 1s in each row.
    one_counts = mask.sum(axis=1)

    # Create a mask based on the condition
    mask_condition = condition(one_counts) & (df_filtered['gt_obj_id'] == '-')

    # Use the mask to filter the DataFrame
    df_condition = df_filtered[mask_condition]

    # Count the sets per class
    condition_counts = df_condition['pred_obj_id'].value_counts()

    # Convert to DataFrame
    condition_counts_df = condition_counts.reset_index().rename(columns={'index': 'Class', 'pred_obj_id': column_name})

    # Create a mapping dictionary
    class_mapping = {0: 'ascaris', 1: 'trichuris', 2: 'hookworm', 3: 'schistosoma'}

    # Replace numeric class labels with string labels
    condition_counts_df['Class'] = condition_counts_df['Class'].replace(class_mapping)

    return condition_counts_df

# Count the singletons, non-singletons, and empty sets per class
singleton_counts_fp_df = count_sets(df_filtered, p_set_cols, lambda x: x == 1, 'Singleton FP Counts')
non_singleton_counts_fp_df = count_sets(df_filtered, p_set_cols, lambda x: x > 1, 'Non-Singleton FP Counts')
empty_set_counts_fp_df = count_sets(df_filtered, p_set_cols, lambda x: x == 0, 'Empty Set FP Counts')

# Merge the three dataframes on 'Class'
merged_df = pd.merge(singleton_counts_fp_df, non_singleton_counts_fp_df, on='Class', how='outer')
merged_df = pd.merge(merged_df, empty_set_counts_fp_df, on='Class', how='outer')

# Fill NaN values with 0
merged_df = merged_df.fillna(0)

# Convert counts to integers
merged_df['Singleton FP Counts'] = merged_df['Singleton FP Counts'].astype(int)
merged_df['Non-Singleton FP Counts'] = merged_df['Non-Singleton FP Counts'].astype(int)
merged_df['Empty Set FP Counts'] = merged_df['Empty Set FP Counts'].astype(int)

merged_df

Unnamed: 0,Class,Singleton FP Counts,Non-Singleton FP Counts,Empty Set FP Counts
0,ascaris,941,3,85
1,schistosoma,246,0,28
2,trichuris,83,3,9
3,hookworm,57,6,7


In [40]:
df_testing.to_csv('hinge_seed_100.csv',index=False)