# AGREEMENT ANALYSIS

### Initialise packages

In [1]:
import pandas as pd
import statistics

from statsmodels.stats.inter_rater import fleiss_kappa

### Import experimental results

In [2]:
# Load one annotation table per annotator group, keyed by the group name.
annotations = {
    group: pd.read_csv(f"./0_data/annotations_{group}.csv")
    for group in ("g1", "g2", "g3")
}

# Build one merged table for every pair of groups: the label columns of each
# group are prefixed with the group name, then the two tables are joined on
# the shared "id" and "text" columns. The pair tuple is used as the dict key.
def _with_group_prefix(group):
    """Return a copy of ``annotations[group]`` whose column names have
    "label" replaced by "<group>_label"."""
    prefixed = annotations[group].copy()
    prefixed.columns = prefixed.columns.str.replace("label", f"{group}_label")
    return prefixed


for key in [("g1","g2"), ("g2","g3"), ("g1","g3")]:
    first, second = key
    annotations[key] = _with_group_prefix(first).merge(
        _with_group_prefix(second), on=["id", "text"]
    )

### Compute inter-annotator agreement metrics

**Percentage agreement**

In [3]:
def count_votes(row, label):
    """Count the entries of ``row`` that equal ``label``.

    The first two entries of the row are skipped.
    NOTE(review): presumably these are the ``id`` and ``text`` columns —
    confirm against the column order of the annotation CSVs.

    Args:
        row: A pandas Series (one row of an annotation table).
        label: The label value to count, e.g. "Hateful".

    Returns:
        Number of entries after the first two that equal ``label``.
    """
    votes = row.values[2:].tolist()
    return votes.count(label)

def maj_label(row):
    """Return the majority label of a row of vote counts.

    Args:
        row: Object exposing ``n_hateful`` and ``n_nonhateful`` attributes.

    Returns:
        "Hateful" when ``n_hateful`` is at least ``n_nonhateful`` (so ties
        resolve to "Hateful"), otherwise "Non-hateful".
    """
    return "Hateful" if row.n_hateful >= row.n_nonhateful else "Non-hateful"

def pct_agreement(row):
    """Return the share of votes that went to the more frequent label.

    Args:
        row: Object exposing ``n_hateful`` and ``n_nonhateful`` attributes.

    Returns:
        The larger of the two vote counts divided by their sum.
    """
    hateful, nonhateful = row.n_hateful, row.n_nonhateful
    total_votes = hateful + nonhateful
    return max(hateful, nonhateful) / total_votes

# Columns this cell adds to every table. They are excluded from the rows
# handed to count_votes: otherwise re-running the cell would count the string
# stored in "label_maj" as one extra vote per row and inflate the counts.
derived_columns = ["n_hateful", "n_nonhateful", "label_maj", "pct_agreement"]

for g in annotations:
    frame = annotations[g]
    # Only the original columns; errors="ignore" makes this a no-op on the
    # first run, when none of the derived columns exist yet.
    vote_columns = frame.drop(columns=derived_columns, errors="ignore")

    frame["n_hateful"] = vote_columns.apply(count_votes, axis=1, args=("Hateful",))
    frame["n_nonhateful"] = vote_columns.apply(count_votes, axis=1, args=("Non-hateful",))
    frame["label_maj"] = frame.apply(maj_label, axis=1)
    frame["pct_agreement"] = frame.apply(pct_agreement, axis=1)
    print('Average percentage agreement in {}: {:.2%}'.format(g, frame.pct_agreement.mean()))

Average percentage agreement in g1: 73.90%
Average percentage agreement in g2: 93.72%
Average percentage agreement in g3: 72.50%
Average percentage agreement in ('g1', 'g2'): 79.04%
Average percentage agreement in ('g2', 'g3'): 78.61%
Average percentage agreement in ('g1', 'g3'): 72.90%


**Fleiss' Kappa**

In [4]:
# Report Fleiss' kappa for every table, computed from the per-row vote counts
# (one row per item, one column per label).
for g, frame in annotations.items():
    kappa = fleiss_kappa(frame[['n_hateful', 'n_nonhateful']].to_numpy())
    print("Fleiss' Kappa in {}: {:.2f}".format(g, kappa))

Fleiss' Kappa in g1: 0.20
Fleiss' Kappa in g2: 0.78
Fleiss' Kappa in g3: 0.15
Fleiss' Kappa in ('g1', 'g2'): 0.36
Fleiss' Kappa in ('g2', 'g3'): 0.34
Fleiss' Kappa in ('g1', 'g3'): 0.18


### Compute bootstrap standard errors and 99% percentile intervals for agreement metrics

In [5]:
def bootstrap_se(df, runs):
    """Estimate bootstrap uncertainty for raw agreement and Fleiss' kappa.

    For each metric, the rows of ``df`` are resampled with replacement
    ``runs`` times (every resample has as many rows as ``df``) and the metric
    is recomputed on each resample. The resamples are not seeded.

    Args:
        df: DataFrame with the columns ``pct_agreement``, ``n_hateful`` and
            ``n_nonhateful``.
        runs: Number of bootstrap resamples drawn per metric.

    Returns:
        Dict with the keys ``'mean_raw_agreement'`` and ``'fleiss_kappa'``.
        Each maps to a dict with:
          * ``'abs value'``: the metric computed on the full ``df``;
          * ``'bootstrap_sd'``: population standard deviation of the metric
            over the resamples;
          * ``'bootstrap_ci_995'`` / ``'bootstrap_ci_005'``: the sorted
            resampled values at rank ``round(runs * 0.995)`` and
            ``round(runs * 0.005)``, i.e. the bounds of an approximate 99%
            percentile interval.

    Raises:
        statistics.StatisticsError: If ``runs`` is 0, because there are no
            resampled values to summarise.
    """

    def mean_agreement(frame):
        return frame['pct_agreement'].mean()

    def kappa(frame):
        return fleiss_kappa(frame[['n_hateful', 'n_nonhateful']].to_numpy())

    def resample(metric):
        # One full-size resample with replacement per run.
        return [metric(df.sample(frac=1, replace=True)) for _ in range(runs)]

    def summarise(values):
        ordered = sorted(values)  # sort once, reuse for both bounds
        last = len(ordered) - 1
        # round(runs * 0.995) equals `runs` for small run counts (e.g.
        # round(100 * 0.995) == 100), which is one past the end of the list;
        # clamp to the last valid index instead of raising IndexError.
        return {
            'bootstrap_sd': statistics.pstdev(values),
            'bootstrap_ci_995': ordered[min(round(runs * 0.995), last)],
            'bootstrap_ci_005': ordered[min(round(runs * 0.005), last)],
        }

    # Point estimates on the full data.
    results_dict = {
        'mean_raw_agreement': {'abs value': mean_agreement(df)},
        'fleiss_kappa': {'abs value': kappa(df)},
    }

    # Each metric gets its own set of resamples, agreement first.
    results_dict['mean_raw_agreement'].update(summarise(resample(mean_agreement)))
    results_dict['fleiss_kappa'].update(summarise(resample(kappa)))

    return results_dict

In [6]:
# Number of bootstrap resamples drawn per metric.
runs = 1000

# Print the bootstrap summary of both metrics for each individual group.
for g in ("g1", "g2", "g3"):
    print(g)
    results = bootstrap_se(annotations[g], runs=runs)
    for heading, metric in (
        ("mean_raw_agreement:", "mean_raw_agreement"),
        ("fleiss kappa:", "fleiss_kappa"),
    ):
        print(heading, results[metric])
    print()

g1
mean_raw_agreement: {'abs value': 0.7389999999999998, 'bootstrap_sd': 0.010207648798817512, 'bootstrap_ci_995': 0.7662500000000001, 'bootstrap_ci_005': 0.7137499999999997}
fleiss kappa: {'abs value': 0.19762597248876537, 'bootstrap_sd': 0.020369067917837692, 'bootstrap_ci_995': 0.25010525011233514, 'bootstrap_ci_005': 0.1472697246577277}

g2
mean_raw_agreement: {'abs value': 0.9372499999999997, 'bootstrap_sd': 0.005853340392459645, 'bootstrap_ci_995': 0.9530000000000004, 'bootstrap_ci_005': 0.9219999999999989}
fleiss kappa: {'abs value': 0.7780866026522338, 'bootstrap_sd': 0.01951185692610804, 'bootstrap_ci_995': 0.8308421515853782, 'bootstrap_ci_005': 0.726547514064108}

g3
mean_raw_agreement: {'abs value': 0.7249999999999999, 'bootstrap_sd': 0.008634983899087503, 'bootstrap_ci_995': 0.74625, 'bootstrap_ci_005': 0.7030000000000001}
fleiss kappa: {'abs value': 0.1476217606352577, 'bootstrap_sd': 0.020746850245156562, 'bootstrap_ci_995': 0.2090009339654267, 'bootstrap_ci_005': 0.0968

### Compute bootstrap CIs for differences between groups

In [7]:
def diff_bootstrap_se(df1, df2, runs):
    """Estimate bootstrap uncertainty for between-group metric differences.

    Differences are always computed as ``df1`` minus ``df2``. In run ``i``
    both frames are resampled with replacement (full size) using
    ``random_state=i``, so results are reproducible across calls.
    NOTE(review): sharing the seed presumably pairs the resamples (same row
    positions in both frames when they have equal length) — confirm that the
    rows of ``df1`` and ``df2`` are aligned item by item.

    Args:
        df1: DataFrame with the columns ``pct_agreement``, ``n_hateful`` and
            ``n_nonhateful``.
        df2: DataFrame with the same columns as ``df1``.
        runs: Number of bootstrap resamples.

    Returns:
        Dict with the keys ``'diff_mean_raw_agreement'`` and
        ``'diff_fleiss_kappa'``. Each maps to a dict with:
          * ``'abs value'``: the difference computed on the full frames;
          * ``'bootstrap_sd'``: population standard deviation of the
            difference over the resamples;
          * ``'bootstrap_ci_995'`` / ``'bootstrap_ci_005'``: the sorted
            resampled differences at rank ``round(runs * 0.995)`` and
            ``round(runs * 0.005)``, i.e. the bounds of an approximate 99%
            percentile interval.

    Raises:
        statistics.StatisticsError: If ``runs`` is 0, because there are no
            resampled values to summarise.
    """

    def mean_agreement(frame):
        return frame['pct_agreement'].mean()

    def kappa(frame):
        return fleiss_kappa(frame[['n_hateful', 'n_nonhateful']].to_numpy())

    def summarise(values):
        ordered = sorted(values)  # sort once, reuse for both bounds
        last = len(ordered) - 1
        # round(runs * 0.995) equals `runs` for small run counts (e.g.
        # round(100 * 0.995) == 100), which is one past the end of the list;
        # clamp to the last valid index instead of raising IndexError.
        return {
            'bootstrap_sd': statistics.pstdev(values),
            'bootstrap_ci_995': ordered[min(round(runs * 0.995), last)],
            'bootstrap_ci_005': ordered[min(round(runs * 0.005), last)],
        }

    # Point estimates on the full data.
    results_dict = {
        'diff_mean_raw_agreement': {'abs value': mean_agreement(df1) - mean_agreement(df2)},
        'diff_fleiss_kappa': {'abs value': kappa(df1) - kappa(df2)},
    }

    # The resamples are fully determined by the per-run seed, so both metrics
    # are evaluated on the same draw rather than drawing identical samples
    # twice in two separate loops.
    agreement_diffs = []
    kappa_diffs = []
    for i in range(runs):
        b_df1 = df1.sample(frac=1, replace=True, random_state=i)
        b_df2 = df2.sample(frac=1, replace=True, random_state=i)
        agreement_diffs.append(mean_agreement(b_df1) - mean_agreement(b_df2))
        kappa_diffs.append(kappa(b_df1) - kappa(b_df2))

    results_dict['diff_mean_raw_agreement'].update(summarise(agreement_diffs))
    results_dict['diff_fleiss_kappa'].update(summarise(kappa_diffs))

    return results_dict

In [8]:
# Number of bootstrap resamples passed to diff_bootstrap_se in the cells below.
runs = 1000

In [9]:
# Difference g1 - g2 in mean percentage agreement and Fleiss' kappa, with bootstrap spread.
diff_bootstrap_se(annotations["g1"],annotations["g2"], runs = runs)

{'diff_mean_raw_agreement': {'abs value': -0.19824999999999993,
  'bootstrap_sd': 0.011069228563906346,
  'bootstrap_ci_995': -0.169249999999999,
  'bootstrap_ci_005': -0.22649999999999926},
 'diff_fleiss_kappa': {'abs value': -0.5804606301634685,
  'bootstrap_sd': 0.02444507553644793,
  'bootstrap_ci_995': -0.5121998744546794,
  'bootstrap_ci_005': -0.6417281912716728}}

In [10]:
# Difference g2 - g3 in mean percentage agreement and Fleiss' kappa, with bootstrap spread.
diff_bootstrap_se(annotations["g2"],annotations["g3"], runs = runs)

{'diff_mean_raw_agreement': {'abs value': 0.21224999999999983,
  'bootstrap_sd': 0.010280445163026774,
  'bootstrap_ci_995': 0.23650000000000027,
  'bootstrap_ci_005': 0.18449999999999966},
 'diff_fleiss_kappa': {'abs value': 0.6304648420169762,
  'bootstrap_sd': 0.024782199760685585,
  'bootstrap_ci_995': 0.6936047678263629,
  'bootstrap_ci_005': 0.5697894532376929}}

In [11]:
# Difference g1 - g3 in mean percentage agreement and Fleiss' kappa, with bootstrap spread.
diff_bootstrap_se(annotations["g1"],annotations["g3"], runs = runs)

{'diff_mean_raw_agreement': {'abs value': 0.013999999999999901,
  'bootstrap_sd': 0.0081724041597317,
  'bootstrap_ci_995': 0.03599999999999948,
  'bootstrap_ci_005': -0.006499999999999284},
 'diff_fleiss_kappa': {'abs value': 0.050004211853507674,
  'bootstrap_sd': 0.015458239712014649,
  'bootstrap_ci_995': 0.08974109044085335,
  'bootstrap_ci_005': 0.011126210618792848}}