# Measuring Calibration Error with Adaptive Binning

In [1]:
import os
import math

from collections import Counter

import numpy as np
import pandas as pd

confidence_path = "../confidence_results/validation"

### Finding Invalid Confidence Scores

In [2]:
df = pd.read_csv(f"{confidence_path}/0.csv")

# 6 metadata columns; rest are tags
num_tags = len(df.columns) - 6

In [3]:
def print_invalid_scores():
    """Prints any confidence scores greater than 1. There shouldn't be any."""
    for filename in os.listdir(confidence_path):
        if filename.endswith(".csv"):
            df = pd.read_csv(f"{confidence_path}/{filename}")
            for t in range(0, num_tags):
                for j, x in enumerate(df[f"{t}"]):
                    if x > 1:
                        print(f"Sentence {i}, tag {t}, word {j}, score {x}, \t {df.iloc[j][3]} \t {df.iloc[j][5]} {t}")

print_invalid_scores()

In [4]:
# Merge the CSVs for all sentences
dfs = []
for filename in os.listdir(confidence_path):
    if filename.endswith(".csv"):
        df = pd.read_csv(f"{confidence_path}/{filename}")
        dfs.append(df)
df = pd.concat(dfs, ignore_index=True)

In [5]:
def single_label_calibration_error(confidence_scores, labels, num_bins):
    """Calculates calibration error using adaptive binning."""
    
    bins = []
    for i in range(num_bins):
        bins.append([])
    b = 0
    # TODO: handle last bin by merging last bin if it's not items_per_bin-sized
    items_per_bin = len(confidence_scores) / num_bins
    sorted_data = sorted(zip(confidence_scores, labels), key=lambda x: x[0])
    for (score, label) in sorted_data:
        if len(bins[b]) < items_per_bin:
            bins[b].append((score, label))
        else:
            b += 1
            bins[b].append((score, label))
    
    bin_square_errors = []
    
    for i, b in enumerate(bins):
        average_score = sum([x for (x, y) in b]) / len(b)
        average_label = sum([y for (x, y) in b]) / len(b)
        square_error = (average_score - average_label) ** 2
        bin_square_errors.append(len(b) * square_error)
        
    return math.sqrt(np.mean(bin_square_errors))

In [6]:
def calibration_error(df, tag_num):
    ground = list(df["Ground Truth Indexes"])
    tag_confidence = list(df[f"{tag_num}"])
    binary_ground = [0 if i != tag_num else 1 for i in ground]
    return single_label_calibration_error(tag_confidence, binary_ground, 3)

In [7]:
tag_counts = Counter()

for x in df["Ground Truth Indexes"]:
    tag_counts[x] += 1

In [8]:
tag_errors = [calibration_error(df, i) for i in range(0, num_tags)]

In [9]:
min(tag_errors)

4.237171889088204e-08

In [10]:
max(tag_errors)

0.3651041775302343