# Fleiss' Kappa 
Fleiss' Kappa measures how much your judges agree with each other. It is meant to be used with more than two judges.

Read https://www.datanovia.com/en/blog/kappa-coefficient-interpretation/ to learn more.

Please copy this example and customize it for your own purposes!

## Imports

In [1]:
import io
import json
from collections import defaultdict

import pandas as pd
from IPython.display import display, Markdown
from js import fetch
from statsmodels.stats.inter_rater import aggregate_raters
from statsmodels.stats.inter_rater import fleiss_kappa

## Step 0: Configuration

In [2]:
# ID of the Quepid book to analyze; Step 1 inserts it into the download URL.
QUEPID_BOOK_NUM = 25

## Step 1: Download the Quepid Book

In [3]:
# Generic GET call to a JSON endpoint
async def get_json(url):
    """Fetch ``url`` and return its body decoded from JSON.

    Args:
        url: Address handed to the ``fetch`` function imported from ``js``.

    Returns:
        The Python object produced by ``json.loads`` on the response text.

    Raises:
        RuntimeError: If the response status is outside the 2xx range.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    resp = await fetch(url)
    # Fail loudly on HTTP errors; otherwise an error page would reach
    # json.loads and surface as a misleading decode error.
    if not resp.ok:
        raise RuntimeError(f'GET {url} failed with HTTP status {resp.status}')
    resp_text = await resp.text()
    return json.loads(resp_text)

async def get_text(url):
    """Fetch ``url`` and return its body as an unparsed string.

    Args:
        url: Address handed to the ``fetch`` function imported from ``js``.

    Returns:
        The response body text.

    Raises:
        RuntimeError: If the response status is outside the 2xx range.
    """
    resp = await fetch(url)
    # Fail loudly on HTTP errors; otherwise an error page would be returned
    # as if it were the requested payload.
    if not resp.ok:
        raise RuntimeError(f'GET {url} failed with HTTP status {resp.status}')
    resp_text = await resp.text()
    return resp_text


In [None]:
# Fetch the book's CSV export; get_text returns the response body as an unparsed string.
data = await get_text(f'/api/books/{QUEPID_BOOK_NUM}.csv')

## Step 2: Extract and Prepare Data

In [None]:
# Step 1 downloads the book as CSV *text*, so it has to be parsed before use
# (indexing the raw string with a key such as 'query_doc_pairs' raises TypeError).
# NOTE(review): assumes the export has one row per query/doc pair, identified by
# the columns listed in ID_COLUMNS, and that every remaining column holds one
# judge's rating (blank when that judge did not rate the pair) - confirm against
# the CSV your Quepid version produces and adjust ID_COLUMNS if needed.
ID_COLUMNS = ['query', 'docid']

judgements_wide = pd.read_csv(io.StringIO(data))

# Reshape to one row per individual judgement and discard the blank cells
judgements_long = (
    judgements_wide
    .melt(id_vars=ID_COLUMNS, var_name='judge', value_name='rating')
    .dropna(subset=['rating'])
)

# Count how many judges gave each rating to each query/doc pair
rating_counts = judgements_long.groupby(ID_COLUMNS + ['rating']).size()

# Build the list of (doc_id, rating, count) tuples. The first element combines
# every ID column ("query|docid") so that the same document judged under two
# different queries stays two separate subjects.
ratings_data = [
    ('|'.join(str(part) for part in key[:-1]), key[-1], int(count))
    for key, count in rating_counts.items()
]


## Step 3: Aggregate Raters' Data

In [None]:
# Convert ratings_data to a DataFrame with one row per (subject, rating) count
df = pd.DataFrame(ratings_data, columns=['doc_id', 'rating', 'count'])
if df.empty:
    raise ValueError('No ratings found - nothing to compute agreement on.')

# Use crosstab to build the subjects x rating-categories count table.
# A rating that nobody gave to a subject is a count of zero, not missing data,
# so fill the gaps instead of dropping the row (dropping would discard every
# subject on which the judges agreed unanimously).
# NOTE(review): assumes each 'doc_id' value identifies exactly one subject.
data_crosstab = (
    pd.crosstab(index=df['doc_id'], columns=df['rating'], values=df['count'], aggfunc='sum')
    .fillna(0)
    .astype(int)
)

# Drop any rows missing judgements: fleiss_kappa assumes every subject received
# the same number of ratings, so keep only the subjects rated by the full panel.
ratings_per_subject = data_crosstab.sum(axis=1)
data_crosstab = data_crosstab[ratings_per_subject == ratings_per_subject.max()]

# The crosstab already holds per-category counts, which is the table layout
# fleiss_kappa consumes. It must not be passed through aggregate_raters, which
# expects raw ratings (one column per rater) and would treat the counts as labels.
table = data_crosstab.to_numpy()

## Step 4: Compute Fleiss' Kappa

In [None]:
# Score agreement over the aggregated table, then render the value
# (four decimal places) as a Markdown heading.
kappa = fleiss_kappa(table, method='fleiss')
kappa_heading = f"## Fleiss' Kappa: {kappa:.4f}"
display(Markdown(kappa_heading))

_This notebook was last updated 16 January 2025._