In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from nltk.metrics.agreement import AnnotationTask
from sklearn.metrics import cohen_kappa_score as kappa
from sklearn.metrics import accuracy_score as accuracy
from scipy.stats import pearsonr as pearson
from scipy.stats import spearmanr as spearman
from math import isnan
from collections import Counter

In [6]:
# MTurk results export to analyse (swap in the commented line for the train split).
data_file = "pred_devtest.csv"
# data_file = "pred_train.csv"
raw_data_file = pd.read_csv(data_file)
# Column names are later read as attributes of itertuples() rows, so dots
# are replaced with underscores.
raw_data_file.columns = raw_data_file.columns.map(lambda name: name.replace('.', '_'))

In [7]:
def extract_dataframe(data):
    '''
    Input: Pandas csv dataframe obtained from MTurk. Each row must provide
           Input_var_arrays (a JSON list of item dicts), WorkerId, HITId,
           AssignmentStatus and, for the i-th item (1-based), the columns
           Answer_pred_part<i>, Answer_part_conf<i>, Answer_pred_dyn<i>,
           Answer_dyn_conf<i>, Answer_pred_hyp<i> and Answer_hyp_conf<i>.
    
    Output: Pandas dataframe levelled by (User x Sentence_ID): one row per
            item of every input row, holding the item's own keys plus
            part, part_conf, dyn, dyn_conf, hyp, hyp_conf, worker_id,
            hit_id and status.

    The input dataframe is left unmodified.
    '''
    # (output field, column-name prefix in the export); the order fixes the
    # column order of the result.
    answer_fields = [
        ('part', 'Answer_pred_part'),
        ('part_conf', 'Answer_part_conf'),
        ('dyn', 'Answer_pred_dyn'),
        ('dyn_conf', 'Answer_dyn_conf'),
        ('hyp', 'Answer_pred_hyp'),
        ('hyp_conf', 'Answer_hyp_conf'),
    ]
    global_list = []
    
    for row in data.itertuples():
        # Parse per row rather than storing a helper column on the caller's frame.
        items = json.loads(row.Input_var_arrays)
        for position, local_dict in enumerate(items, start=1):
            temp_dict = local_dict.copy()
            for field, prefix in answer_fields:
                temp_dict[field] = getattr(row, prefix + str(position))
            temp_dict['worker_id'] = row.WorkerId
            temp_dict['hit_id'] = row.HITId
            temp_dict['status'] = row.AssignmentStatus
            global_list.append(temp_dict)
    
    return pd.DataFrame(global_list)

In [8]:
raw_data = extract_dataframe(raw_data_file)
# Drop rejected assignments. The explicit copy makes the column assignment
# below write to an independent frame instead of a slice of the unfiltered one.
raw_data = raw_data[raw_data['status'] != 'Rejected'].copy()
# Item key: sentence id and predicate token joined by an underscore.
raw_data['sent_pred'] = raw_data['sent_id'] + "_" + raw_data['pred_token'].map(str)
# Rearrange the columns
cols = ['hit_id', 'worker_id','sent_pred','sent_id','pred_token','part','part_conf',
        'dyn','dyn_conf','hyp','hyp_conf']
data = raw_data[cols]

# Number of annotated items per worker; show the 15 most active workers.
x = Counter(data['worker_id'].values)
print(x.most_common()[:15])
print(data.shape)
# ann_data = data[data['worker_id']=='A323RBOHUTRW53']
# xdata = data[data['worker_id']!='A323RBOHUTRW53']
# print(ann_data.shape)
# print(xdata.shape)

[('A2UF2FRGVW4T89', 290), ('A3OPHHRV96Y2UH', 280), ('A215S35FQFQYJ1', 260), ('A1JLQNPXN3NHVP', 260), ('A2HHOH92TCP7WA', 260), ('A14OPFM8OFA4WF', 250), ('A30GPAEVFFIAIW', 250), ('A3MQ8BS6EYO2WW', 250), ('A30UE6DWFNUCWX', 250), ('A1U5YW7RKHBOFP', 250), ('A2BVVBUTQ4AFH1', 250), ('A1BPXI87NVUA0R', 250), ('A3M01W8KKZF99X', 250), ('AOUZBLGN7NVT0', 250), ('AN4D1WRTKLUYZ', 250)]
(16920, 11)


In [None]:
# Responses of a single worker, plotted in the right-hand column next to the
# full pool in the left-hand column. Defined here so the cell runs on a fresh
# kernel.
# NOTE(review): the worker id is the one used in the commented-out ann_data
# definition of the loading cell - confirm it is present in the current data file.
ann_data = data[data['worker_id'] == 'A323RBOHUTRW53']

# One row of axes per plotted column: responses first, then confidences.
plot_columns = ['part', 'dyn', 'hyp', 'part_conf', 'dyn_conf', 'hyp_conf']
fig, axs = plt.subplots(ncols=2, nrows=len(plot_columns), figsize=(15, 15))
for axis_row, plot_column in zip(axs, plot_columns):
    sns.countplot(x=plot_column, data=data, ax=axis_row[0])
    sns.countplot(x=plot_column, data=ann_data, ax=axis_row[1])

plt.show()

In [None]:
%matplotlib inline

# Number of responses for every (hyp, part, dyn) combination, split by
# dyn_conf level. Built once and reused for both the printout and the heatmap.
response_counts = data.pivot_table(index=['hyp', 'part', 'dyn'],
                                   columns='dyn_conf',
                                   values='worker_id', aggfunc=len)
print(response_counts)
# Combinations that never occur are NaN in the table; draw them as 0.
sns.heatmap(response_counts.fillna(0))

### Response distribution

In [None]:
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(15, 10))
# One row per property: responses on the left, their confidence on the right.
for axis_row, prop in zip(axs, ['part', 'dyn', 'hyp']):
    sns.countplot(x=prop, data=data, ax=axis_row[0])
    sns.countplot(x=prop + '_conf', data=data, ax=axis_row[1])

plt.show()

## Inter Annotator agreement

## Raw agreement

In [None]:
def calc_raw_agreement(data, key_var, check_var):
    '''
    Percentage of items on which every response is identical.

    Input: 
    1. data: Pandas dataframe
    2. key_var: variable based on which raw agreement is to be calculated
    3. check_var: variable on which raw agreement is calculated

    Output: (agreement, keys)
    1. agreement: percentage of unique key_var values whose check_var
       responses are all equal; NaN when data holds no key_var values
    2. keys: those key_var values, in order of first appearance in data
    
    '''
    print("####### Raw Count for {} ###########".format(check_var))

    total_count = 0
    keys = []

    # groupby visits each item once instead of re-filtering the whole frame
    # per item; sort=False keeps the items in order of first appearance.
    for iden, responses in data.groupby(key_var, sort=False)[check_var]:
        total_count += 1
        values = responses.tolist()
        if values.count(values[0]) == len(values):
            keys.append(iden)

    raw_count = len(keys)
    # No items means the agreement is undefined rather than a division error.
    agreement = (raw_count / total_count) * 100 if total_count else float('nan')
    
    print("Total count of unique {} is {}".format(key_var, total_count))
    print("Raw count of matched for {} is {}".format(check_var, raw_count))
    print("Inter-annotator agreement for {} is {}%".format(check_var, agreement))
    print("\n")
    
    return agreement, keys
# Keep the result for each property under its own name; previously the hyp
# result was overwritten by the dyn one.
hyp_agreement, key_hyp = calc_raw_agreement(data, 'sent_pred', 'hyp')
part_agreement, key_part = calc_raw_agreement(data, 'sent_pred', 'part')
dyn_agreement, key_dyn = calc_raw_agreement(data, 'sent_pred', 'dyn')
# Legacy names, bound to the same values they ended up with before.
instant_agreement, key_inst = part_agreement, key_part
start_agreement, key_start = dyn_agreement, key_dyn

## Average of accuracy and kappa for each pair

In [None]:
def extract_pairs_of_workers(data, worker_id):
    '''
    Build every unordered pair of distinct annotators.

    data: pandas dataframe
    worker_id: name of the column holding the annotator ids

    Returns a list of 2-tuples of worker ids.
    '''
    unique_workers = set(data[worker_id].values)
    all_pairs = itertools.combinations(unique_workers, 2)
    return list(all_pairs)

def extract_worker_sent_dict(data, worker_id, sent_id):
    '''
    Map each annotator to the set of items they annotated.

    data: pandas dataframe
    worker_id: name of the column holding the annotator ids
    sent_id: name of the column holding the item (sentence) ids

    Returns a dict {worker id: set of sent_id values in that worker's rows}.
    '''
    return {
        worker: set(data.loc[data[worker_id] == worker, sent_id].values)
        for worker in set(data[worker_id].values)
    }

def average_kappa_acc(data, worker_id, key_var, check_var):
    '''
    Input: 1. data: pandas dataframe
           2. worker_id: Annotator id variable
           3. key_var: level of the data (sentence-predicate id)
           4. check_var: variable to be checked for kappa score
    
    Output: (kappas, accuracies) - two parallel lists holding Cohen's kappa
            and the raw agreement (accuracy) for every pair of annotators
            that share at least one key_var item. Pairs without common
            items are skipped.

    '''
    worker_pairs = extract_pairs_of_workers(data, worker_id)
    
    worker_key_dict = extract_worker_sent_dict(data, worker_id, key_var)

    # First response of every (worker, item), collected in one pass so the
    # pair loop does not have to re-filter the whole frame per item.
    first_response = {}
    for worker, key, value in zip(data[worker_id].values,
                                  data[key_var].values,
                                  data[check_var].values):
        first_response.setdefault((worker, key), value)
    
    kappas = []
    accuracies = []
    for (w1, w2) in worker_pairs:
        
        common_set = worker_key_dict[w1].intersection(worker_key_dict[w2])
        
        if not common_set:
            continue

        # Both lists walk common_set in the same order, so they stay aligned.
        labels1 = [first_response[(w1, key)] for key in common_set]
        labels2 = [first_response[(w2, key)] for key in common_set]

        kappas.append(kappa(labels1, labels2))
        accuracies.append(accuracy(labels1, labels2))
    return kappas, accuracies

def rank_correlation(data, worker_id, key_var, check_var):
    '''
    Input: 1. data: pandas dataframe
           2. worker_id: Annotator id variable
           3. key_var: level of the data (sentence-predicate id)
           4. check_var: variable to be checked for rank correlation
    
    Output: (corrs, accuracies) - two parallel lists holding the Spearman
            rank correlation coefficient and the raw agreement (accuracy)
            for every pair of annotators that share at least one key_var
            item. Pairs without common items are skipped.

    '''
    worker_pairs = extract_pairs_of_workers(data, worker_id)
    
    worker_key_dict = extract_worker_sent_dict(data, worker_id, key_var)

    # First response of every (worker, item), collected in one pass so the
    # pair loop does not have to re-filter the whole frame per item.
    first_response = {}
    for worker, key, value in zip(data[worker_id].values,
                                  data[key_var].values,
                                  data[check_var].values):
        first_response.setdefault((worker, key), value)
    
    corrs = []
    accuracies = []
    for (w1, w2) in worker_pairs:
        
        common_set = worker_key_dict[w1].intersection(worker_key_dict[w2])
        
        if not common_set:
            continue

        # Both lists walk common_set in the same order, so they stay aligned.
        ratings1 = [first_response[(w1, key)] for key in common_set]
        ratings2 = [first_response[(w2, key)] for key in common_set]

        # spearman returns (coefficient, p-value); only the coefficient is kept.
        corrs.append(spearman(ratings1, ratings2)[0])
        accuracies.append(accuracy(ratings1, ratings2))
        
    return corrs, accuracies

In [None]:
# Per-variable lists of pairwise scores, keyed by the variable name.
kappas, corrs, accs = {}, {}, {}
variables = ['hyp', 'part', 'dyn']
variables_ord = ['hyp_conf', 'part_conf', 'dyn_conf']

# Kappa and accuracy for the property responses.
for var in variables:
    pair_scores = average_kappa_acc(data, 'worker_id', 'sent_pred', var)
    kappas[var], accs[var] = pair_scores

# Rank correlation and accuracy for the confidence ratings.
for var in variables_ord:
    pair_scores = rank_correlation(data, 'worker_id', 'sent_pred', var)
    corrs[var], accs[var] = pair_scores

In [None]:
# One column per variable, one row per annotator pair.
kappa_data = pd.DataFrame.from_dict(kappas)
acc_data = pd.DataFrame.from_dict(accs)
corr_data = pd.DataFrame.from_dict(corrs)

ax = sns.boxplot(data=kappa_data)
ax.set(ylabel='Kappa Score using builtin function', title="Kappa")
plt.show()
# DataFrame.mean() gives one mean per column; np.mean(DataFrame) reduces
# over both axes to a single number on recent pandas versions.
print(kappa_data.mean())

ax = sns.boxplot(data=acc_data)
ax.set(ylabel='Accuracy(raw agreement)', title="Accuracy")
plt.show()
print(acc_data.mean())

ax = sns.boxplot(data=corr_data)
ax.set(ylabel='Corr coeff', title="Pearsons/Spearman Rank Correlation")
plt.show()
print(corr_data.mean())

# Run mixed effects model in R

## Hyp

In [None]:
# Enables the %%R magic 
%load_ext rpy2.ipython
%R require(ggplot2); require(tidyr); require(lme4)

In [None]:
%%R -i data -o df_hyp

#Convert to factors
data$part = as.factor(data$part)
data$dyn = as.factor(data$dyn)
data$hyp = as.factor(data$hyp)

#Mixed Effects Model: intercept-only binomial model of hyp with random
#intercepts for annotator, item and HIT
model = glmer(hyp ~ 1 + (1|worker_id) + (1|sent_pred) + (1|hit_id), data=data,  family=binomial)

#Model intercepts:
df_hyp = ranef(model)$worker_id
colnames(df_hyp) <- c('intercept')

#Per-annotator intercept = random effect + fixed intercept of this fit
#(read from the model instead of being copied in by hand)
df_hyp$glmer_intercept_hyp = df_hyp$intercept + fixef(model)[["(Intercept)"]]
df_hyp$worker_id <- rownames(df_hyp)
print(summary(model))

In [None]:
# Apply the logistic function to the per-annotator intercepts.
# The column is overwritten in place, so run this cell once per model fit.
hyp_intercepts = df_hyp['glmer_intercept_hyp']
df_hyp['glmer_intercept_hyp'] = 1 / (1 + np.exp(-hyp_intercepts))

In [None]:
# Distribution over annotators of the hyp column computed above.
df_hyp.glmer_intercept_hyp.plot(kind='density')
plt.title("Annotator probability density of saying hyp = true")
plt.show()

## Part

In [None]:
%%R -i data -o df_part

#Convert to factors
data$part = as.factor(data$part)
data$dyn = as.factor(data$dyn)
data$hyp = as.factor(data$hyp)

#Mixed Effects Model: intercept-only binomial model of part with random
#intercepts for annotator, item and HIT
model = glmer(part ~ 1 + (1|worker_id) + (1|sent_pred) + (1|hit_id), data=data,  family=binomial)

#Model intercepts:
df_part = ranef(model)$worker_id
colnames(df_part) <- c('intercept')

#Per-annotator intercept = random effect + fixed intercept of this fit
#(read from the model instead of being copied in by hand)
df_part$glmer_intercept_part = df_part$intercept + fixef(model)[["(Intercept)"]]
df_part$worker_id <- rownames(df_part)
print(summary(model))

In [None]:
# Apply the logistic function to the per-annotator intercepts.
# The column is overwritten in place, so run this cell once per model fit.
part_intercepts = df_part['glmer_intercept_part']
df_part['glmer_intercept_part'] = 1 / (1 + np.exp(-part_intercepts))

In [None]:
# Density of the per-annotator part column across annotators.
part_density_ax = df_part['glmer_intercept_part'].plot(kind='density')
part_density_ax.set_title("Annotator probability density of saying part = true")
plt.show()

## Dynamic

In [None]:
%%R -i data -o df_dyn

#Convert to factors
data$part = as.factor(data$part)
data$dyn = as.factor(data$dyn)
data$hyp = as.factor(data$hyp)
#Mixed Effects Model: intercept-only binomial model of dyn with random
#intercepts for annotator, item and HIT
model = glmer(dyn ~ 1 + (1|worker_id) + (1|sent_pred) + (1|hit_id), data=data,  family="binomial")

#Model intercepts:
df_dyn = ranef(model)$worker_id
colnames(df_dyn) <- c('intercept')

#Per-annotator intercept = random effect + fixed intercept of this fit
#(read from the model instead of being copied in by hand)
df_dyn$glmer_intercept_dyn = df_dyn$intercept + fixef(model)[["(Intercept)"]]
df_dyn$worker_id <- rownames(df_dyn)

print(summary(model))

In [None]:
# Apply the logistic function to the per-annotator intercepts.
# The column is overwritten in place, so run this cell once per model fit.
dyn_intercepts = df_dyn['glmer_intercept_dyn']
df_dyn['glmer_intercept_dyn'] = 1 / (1 + np.exp(-dyn_intercepts))

In [None]:
# Density of the per-annotator dyn column across annotators.
dyn_density_ax = df_dyn['glmer_intercept_dyn'].plot(kind='density')
dyn_density_ax.set_title("Annotator probability density of saying dynamic = true")
plt.show()

In [None]:
def extract_pairs_of_workers1(data, worker_id):
    '''
    Given a pandas dataframe, and worker_id variable,
    extracts a list of pairs of worker_ids

    Kept under this name for existing callers; the work is done by
    extract_pairs_of_workers, of which this used to be a verbatim copy.
    '''
    return extract_pairs_of_workers(data, worker_id)

def extract_worker_sent_dict1(data, worker_id, sent_id):
    '''
    Given a pandas dataframe, worker_id variable, and sentence_id variable,
    extracts a dict where key is worker_id and value is set(sentences_ids annotated by that worker)

    Kept under this name for existing callers; the work is done by
    extract_worker_sent_dict, of which this used to be a verbatim copy.
    '''
    return extract_worker_sent_dict(data, worker_id, sent_id)

def average_kappa_acc1(data, worker_id, key_var, check_var):
    '''
    Input: 1. data: pandas dataframe
           2. worker_id: Annotator id variable
           3. key_var: level of the data (sentence-predicate id)
           4. check_var: variable to be checked for kappa score
    
    Output: list of modified kappa scores, one per pair of annotators that
            share at least one key_var item. The observed agreement is the
            pair's accuracy; the expected agreement p_e is built from the
            two annotators' values in the notebook-level frame for
            check_var (df_part for "part", df_dyn for "dyn", df_hyp
            otherwise).

    '''
    worker_pairs = extract_pairs_of_workers1(data, worker_id)
    
    worker_key_dict = extract_worker_sent_dict1(data, worker_id, key_var)
    if check_var == "part":
        df = df_part
        int_prob = "glmer_intercept_part"
    elif check_var == "dyn":
        df = df_dyn
        int_prob = "glmer_intercept_dyn"
    else:
        df = df_hyp
        int_prob = "glmer_intercept_hyp"

    # worker -> value of the first matching row, built once. This avoids
    # position-style [0] indexing on a label-indexed Series and four frame
    # scans per pair.
    prob_true = {}
    for worker, prob in zip(df[worker_id].values, df[int_prob].values):
        prob_true.setdefault(worker, prob)

    # First response of every (worker, item), collected in one pass so the
    # pair loop does not have to re-filter the whole frame per item.
    first_response = {}
    for worker, key, value in zip(data[worker_id].values,
                                  data[key_var].values,
                                  data[check_var].values):
        first_response.setdefault((worker, key), value)

    kappas = []
    accuracies = []
    for (w1, w2) in worker_pairs:
        
        common_set = worker_key_dict[w1].intersection(worker_key_dict[w2])
        
        if not common_set:
            continue

        # Both lists walk common_set in the same order, so they stay aligned.
        labels1 = [first_response[(w1, key)] for key in common_set]
        labels2 = [first_response[(w2, key)] for key in common_set]
        accuracies.append(accuracy(labels1, labels2))
        # Now for modified kappa calculation: expected agreement is the
        # chance that both say true plus the chance that both say false.
        p1 = prob_true[w1]
        p2 = prob_true[w2]
        p_e = (p1 * p2) + ((1 - p1) * (1 - p2))
        kappas.append((accuracies[-1] - p_e) / (1 - p_e))
        
    return kappas

In [None]:
# Start from empty result dicts; only kappas is filled in this cell.
kappas, corrs, accs = {}, {}, {}
variables = ['hyp', 'part', 'dyn']
# variables_ord = ['part_conf', 'kind_conf', 'abs_conf']
for var in variables:
    pair_kappas = average_kappa_acc1(data, 'worker_id', 'sent_pred', var)
    kappas[var] = pair_kappas

In [None]:
# One column per variable, one row per annotator pair.
kappa_data = pd.DataFrame.from_dict(kappas)

ax = sns.boxplot(data=kappa_data)
ax.set(ylabel='Kappa Score', title="Kappa from mixed effects mode")
plt.show()

# DataFrame.mean() gives one mean per column; np.mean(DataFrame) reduces
# over both axes to a single number on recent pandas versions.
print("KAPPA", kappa_data.mean())

## Check if average confidence correlates with agreement

In [None]:
# Unique sentence-predicate ids among the non-rejected responses.
unique_sent_preds = set(raw_data['sent_pred'].values)
xyz = list(unique_sent_preds)
# Show every response recorded for one specific item.
raw_data.loc[raw_data['sent_pred'] == "en-ud-train.conllu sent_5978_3"]

In [None]:
all_list = []

# groupby visits each item once instead of re-filtering the frame four times
# per item.
for question, responses in data.groupby('sent_pred', sort=False):
    temp = {}
    temp['sent_pred'] = question
    temp['part_avg_confidence'] = np.mean(responses['part_conf'].values)
    temp['dyn_avg_confidence'] = np.mean(responses['dyn_conf'].values)
    # Share of "true" responses, taken over the number of responses the item
    # actually has (rejected assignments were dropped, so it is not always 5).
    p = np.mean(responses['part'].astype(int).values)
    q = np.mean(responses['dyn'].astype(int).values)
    # Chance that two responses drawn with these shares agree with each other.
    temp['part_agreement'] = p * p + (1 - p) * (1 - p)
    temp['dyn_agreement'] = q * q + (1 - q) * (1 - q)
    all_list.append(temp)
analyse_data = pd.DataFrame(all_list)
print(spearman(list(analyse_data['dyn_agreement']), list(analyse_data['dyn_avg_confidence'])))
print(spearman(list(analyse_data['part_agreement']), list(analyse_data['part_avg_confidence'])))

## Random Checking

In [None]:
cols = ['raw_sentence', 'part', 'part_conf', 'dyn','dyn_conf','hyp','hyp_conf']
# check_data = raw_data[cols]
for sen in xyz:
#     print(check_data[raw_data['sent_pred'] == sen])
    # Filter once per item and reuse the rows for every statistic.
    rows = raw_data[raw_data['sent_pred'] == sen]
    # Average over the responses the item actually has instead of assuming 5.
    n_responses = len(rows)
    print(str(set(rows['raw_sentence'].values)), str(set(rows['pred'].values)))
    print("PART", sum(list(rows['part'].astype(int)))/n_responses, "CONF", sum(list(rows['part_conf'].values))/n_responses, "\t\t",
          "HYP", sum(list(rows['hyp'].astype(int)))/n_responses, "CONF", sum(list(rows['hyp_conf'].values))/n_responses, "\t\t",
          "DYN", sum(list(rows['dyn'].astype(int)))/n_responses, "CONF", sum(list(rows['dyn_conf'].values))/n_responses,"\n\n")