In [None]:
from statsmodels.stats.inter_rater import fleiss_kappa
import numpy as np
import pandas as pd
import scipy.stats

# Wish list - add agreement per category...
# Fleiss, J.L., Levin, B., Paik, M.C.: Statistical methods for rates and proportions. Wiley, Hoboken (2003) - check out another formula for the variance
# Direct users to the raters package to derive bootstrap CIs
# Quatto (2010) and Randolph provide the same results as Randolph's kappa. Suggest that users use the raters package to derive confidence intervals for this measure

# This code uses the format of subjects in rows and values in columns, as in Fleiss


# Example count table as a plain list of lists: 10 rows x 5 columns, and every
# row sums to 14. Following the layout described above, each row is a subject
# and each column a rating value, so each subject received 14 ratings.
# NOTE(review): the name suggests this is the worked example from the Wikipedia
# article on Fleiss' kappa - confirm the source.
Wiki_Data = [
    [0, 0, 0, 0, 14],
    [0, 2, 6, 4, 2],
    [0, 0, 3, 5, 6],
    [0, 3, 9, 2, 0],
    [2, 2, 8, 1, 1],
    [7, 7, 0, 0, 0],
    [3, 2, 6, 3, 0],
    [2, 5, 3, 2, 2],
    [6, 5, 2, 1, 0],
    [0, 2, 2, 3, 7]
]

# Example count table as a NumPy array: 30 rows x 5 columns, and every row sums
# to 6 (six ratings per subject spread over five rating values).
# NOTE(review): presumably the data set from Fleiss' original paper - confirm.
Fleiss_Data = np.array([
    [0, 0, 0, 6, 0],
    [0, 3, 0, 0, 3],
    [0, 1, 4, 0, 1],
    [0, 0, 0, 0, 6],
    [0, 3, 0, 3, 0],
    [2, 0, 4, 0, 0],
    [0, 0, 4, 0, 2],
    [2, 0, 3, 1, 0],
    [2, 0, 0, 4, 0],
    [0, 0, 0, 0, 6],
    [1, 0, 0, 5, 0],
    [1, 1, 0, 4, 0],
    [0, 3, 3, 0, 0],
    [1, 0, 0, 5, 0],
    [0, 2, 0, 3, 1],
    [0, 0, 5, 0, 1],
    [3, 0, 0, 1, 2],
    [5, 1, 0, 0, 0],
    [0, 2, 0, 4, 0],
    [1, 0, 2, 0, 3],
    [0, 0, 0, 0, 6],
    [0, 1, 0, 5, 0],
    [0, 2, 0, 1, 3],
    [2, 0, 0, 4, 0],
    [1, 0, 0, 4, 1],
    [0, 5, 0, 1, 0],
    [4, 0, 0, 0, 2],
    [0, 2, 0, 4, 0],
    [1, 0, 5, 0, 0],
    [0, 0, 0, 0, 6]
])

def calculate_p_value_from_z_score(score):
    """Return the two-sided p-value of a z statistic.

    Args:
        score: The z statistic; only its absolute value is used.

    Returns:
        The two-sided tail probability under the standard normal
        distribution as a Python float, capped at 0.99999 (so a score of 0
        reports 0.99999 rather than 1.0).
    """
    # A z statistic is referred to the standard normal directly instead of
    # approximating it with a t-distribution on a huge number of degrees of
    # freedom.
    p_value = scipy.stats.norm.sf(abs(score)) * 2
    return min(float(p_value), 0.99999)


confidence_level = 0.95


def kappa_fleiss(Subjects_In_rows_Values_in_coloumns_Matrix, confidence_level=0.95):
    """Compute Fleiss' kappa and Randolph's kappa for a table of rating counts.

    Args:
        Subjects_In_rows_Values_in_coloumns_Matrix: 2-D array-like of counts
            with one row per subject and one column per category; cell
            (i, j) is the number of raters who put subject i in category j.
            Every row must sum to the same number of raters.
        confidence_level: Coverage of the confidence interval for Fleiss'
            kappa, strictly between 0 and 1.

    Returns:
        A newline-separated "key: value" report containing Fleiss' kappa,
        Randolph's kappa, the z statistic, its two-sided p-value, the
        standard error and the confidence interval (rounded to 4 decimals).

    Raises:
        ValueError: If the table is not a non-empty 2-D array, if subjects
            have differing numbers of ratings, if there are fewer than two
            raters, or if confidence_level is outside (0, 1).
    """
    counts = np.asarray(Subjects_In_rows_Values_in_coloumns_Matrix, dtype=float)
    if counts.ndim != 2 or counts.size == 0:
        raise ValueError("Expected a non-empty 2-D table of subjects x categories.")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be strictly between 0 and 1.")

    sample_size, number_of_levels = counts.shape

    # The number of raters comes from the row totals, not from the number of
    # categories: the two are unrelated in general.
    ratings_per_subject = counts.sum(axis=1)
    number_of_raters = ratings_per_subject[0]
    if not np.all(ratings_per_subject == number_of_raters):
        raise ValueError("Every subject must be rated by the same number of raters.")
    if number_of_raters < 2:
        raise ValueError("At least two raters per subject are required.")

    total_ratings = sample_size * number_of_raters
    rater_pairs = total_ratings * (number_of_raters - 1)

    # Observed agreement: share of agreeing rater pairs, pooled over subjects.
    overall_agreement = (np.sum(counts ** 2) - total_ratings) / rater_pairs

    # Chance agreement from the marginal category proportions (Fleiss) and
    # from a uniform distribution over categories (Randolph).
    # If every rating falls in a single category the Fleiss chance agreement
    # is 1 and the ratios below are 0/0; that case is not handled specially.
    category_shares = counts.sum(axis=0) / total_ratings
    random_agreement = np.sum(category_shares ** 2)
    random_agreement3 = np.sum(category_shares ** 3)
    random_agreement_randolph = 1 / number_of_levels

    kappa_value = (overall_agreement - random_agreement) / (1 - random_agreement)
    randolphs_kappa = (overall_agreement - random_agreement_randolph) / (
        1 - random_agreement_randolph
    )

    # Large-sample variance of kappa under the hypothesis of chance agreement
    # (Fleiss, 1971), written in terms of the number of raters.
    variance_term1 = 2 / rater_pairs
    variance_term2 = (
        random_agreement
        - (2 * number_of_raters - 3) * random_agreement ** 2
        + 2 * (number_of_raters - 2) * random_agreement3
    )
    variance_term3 = (1 - random_agreement) ** 2
    standard_error = np.sqrt(variance_term1 * (variance_term2 / variance_term3))

    # Significance
    statistic = kappa_value / standard_error
    p_value = calculate_p_value_from_z_score(statistic)

    # Confidence interval (reuses the standard error computed above).
    zcrit = scipy.stats.norm.ppf(1 - (1 - confidence_level) / 2)
    lower_bound = float(kappa_value - standard_error * zcrit)
    upper_bound = float(kappa_value + standard_error * zcrit)

    results = {
        "Fleiss' Kappa": float(kappa_value),
        "Randolph's Kappa": float(randolphs_kappa),
        "Statistic": float(statistic),
        "p_value": p_value,
        "Standard_Error": float(standard_error),
        "Confidence Intervals Fleiss Kappa": (
            f"({round(lower_bound, 4)}, {round(upper_bound, 4)})"
        ),
    }
    return "\n".join(f"{key}: {value}" for key, value in results.items())

# Run the analysis on the Fleiss example table at a 95% confidence level and
# print the formatted text report that kappa_fleiss returns.
a = kappa_fleiss(Fleiss_Data, 0.95) 

print(a)