Nous allons utiliser la méthode des kappa de Fleiss pour évaluer l'accord entre les 3 évaluateurs de notre dataset. Nous ne pouvons pas utiliser les kappa de Cohen car ils ne s'appliquent qu'à 2 évaluateurs.

In [9]:
import pandas as pd 
import numpy as np 
from scipy.stats import norm
from statsmodels.stats.inter_rater import fleiss_kappa

In [10]:
clement = pd.read_csv("Data.csv")
romain = pd.read_csv("Romain.csv")
tania = pd.read_csv("Tania.csv")
evaluateurs = [clement, romain, tania]

In [11]:
for df in evaluateurs:
    df.rename(columns={'Score esthéthique (0-10)':'Score esthétique (0-10)'}, inplace=True)

# Only positive values
for i,df in enumerate(evaluateurs):
    df.loc[df['Score esthétique (0-10)'] < 0, 'Score esthétique (0-10)'] = 0
    assert (df['Score esthétique (0-10)']>=0).all(), "esthétique " + str(i)
    assert (df['Score business (0-10)']>=0).all(), "business " + str(i)

In [12]:
# Buckets (or they will always disagree)
# 3 buckets (0-3, 4-6, 7-10)
bins = {0:0, 1:0, 2:0, 3:0, 4:1, 5:1, 6:1, 7:2, 8:2, 9:2, 10:2}
for i,df in enumerate(evaluateurs):
    df["Score business (0-10)"] = df["Score business (0-10)"].replace(bins)
    df["Score esthétique (0-10)"] = df["Score esthétique (0-10)"].replace(bins)

In [13]:
score_business = [[df.loc[i,"Score business (0-10)"] for df in [clement, romain, tania]] for i in range(len(clement))]
score_esthetique = [[df.loc[i,"Score esthétique (0-10)"] for df in [clement, romain, tania]] for i in range(len(clement))]

def convert(score, categories=[0,1,2]):
    """
    Counts the number of votes for each category.
    """
    new_score = []
    for item in score :
        new_score.append([len([i for i in item if i==cat]) for cat in categories])
    return(np.array(new_score))

score_business = convert(score_business)
score_esthetique = convert(score_esthetique)

In [14]:
business_kappa = fleiss_kappa(score_business)
esthetique_kappa = fleiss_kappa(score_esthetique)

def affichage(score):
    n = len(score) 
    m = len(evaluateurs) 
    
    kappa = fleiss_kappa(score)
    
    # Proportion of ratings in each category
    pi = np.sum(score, axis=0) / (n * m)
    
    # Expected agreement by chance
    Pe = np.sum(pi ** 2)
    
    variance = (2 / (n * m * (m - 1))) * (
        (Pe + Pe**2) - 
        np.sum(pi * np.sum((score * (score - 1)), axis=0) / (n * m * (m - 1)))
    )
    
    if variance > 0:
        z_value = kappa / np.sqrt(variance)
        p_value = 2 * (1 - norm.cdf(np.abs(z_value)))
        z_critical = norm.ppf(0.975)  # For 95% CI
        margin_of_error = z_critical * np.sqrt(variance)
        lower_bound = max(-1, kappa - margin_of_error)
        upper_bound = min(1, kappa + margin_of_error)

        print("Fleiss' kappa:", kappa)
        print("Z-value:", z_value)
        print("P-value:", p_value)
        print("Confidence interval (95%):", (lower_bound, upper_bound))
    else:
        print("Fleiss' kappa:", kappa)
        print("Variance calculation error: Non-positive variance", variance)
    
print("--- Score business ---")
affichage(score_business)
print("\n--- Score esthétique ---")
affichage(score_esthetique)

--- Score business ---
Fleiss' kappa: 0.289425560941627
Z-value: 8.962744774981424
P-value: 0.0
Confidence interval (95%): (0.22613427128732944, 0.3527168505959246)

--- Score esthétique ---
Fleiss' kappa: 0.22693887713183125
Z-value: 6.773226363099416
P-value: 1.259414794674285e-11
Confidence interval (95%): (0.16126972597314862, 0.2926080282905139)


In [15]:
score_logo = np.array([[df.loc[i,"Is it a logo?"] for df in [clement, romain, tania]] for i in range(len(clement))])
score_logo = (score_logo=='Oui').astype(int)
score_logo = convert(score_logo,categories=[0,1])
print("\n--- Score Logo ---")
affichage(score_logo)


--- Score Logo ---
Fleiss' kappa: 0.9525925925925928
Z-value: 33.371407852262365
P-value: 0.0
Confidence interval (95%): (0.8966450826257771, 1)
