# Combining Annotations and Majority Voting

## Setup and Data Loading

In [5]:
import pandas as pd
import numpy as np
import os
from collections import Counter

In [None]:
# print current working directory
print(os.getcwd())

# set working directory if necessary
#os.chdir('/your/path/here/')


In [8]:
results_dir = '../results/EN/'
annotations_dir = '../results/annotations/'

# need the actual file to get ante_gender
original_file = 'next_token_results_EN_cohtemplates_hifr_PL_qwen32B-4bit.csv'

In [9]:
#   read in the original file
og_df = pd.read_csv(os.path.join(results_dir,original_file))
og_df.index = np.arange(1, len(og_df) + 1)
og_df.index.name = 'item_no'

In [10]:
og_df

Unnamed: 0_level_0,phrase1,phrase2_cut,ante_noun,ante_gender,phrases_cut,next_5
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,The grandparents were waiting on a bench.,"Because of the cloudy weather, one of the",grandparents,n,The grandparents were waiting on a bench. Beca...,"grandparents said, ""It"
2,The grandmothers were waiting on a bench.,"Because of the cloudy weather, one of the",grandmothers,f,The grandmothers were waiting on a bench. Beca...,"grandmothers said,"
3,The grandfathers were waiting on a bench.,"Because of the cloudy weather, one of the",grandfathers,m,The grandfathers were waiting on a bench. Beca...,"grandfathers said,"
4,The monarchs were waiting on a bench.,"Because of the cloudy weather, one of the",monarchs,n,The monarchs were waiting on a bench. Because ...,monarchs was not happy
5,The queens were waiting on a bench.,"Because of the cloudy weather, one of the",queens,f,The queens were waiting on a bench. Because of...,queens was wearing a hat
...,...,...,...,...,...,...
626,The daughters were getting off the plane.,"After such a long day, most of the",daughters,f,The daughters were getting off the plane. Afte...,passengers were tired and ready
627,The sons were getting off the plane.,"After such a long day, most of the",sons,m,The sons were getting off the plane. After suc...,passengers were tired and just
628,The spouses were getting off the plane.,"After such a long day, most of the",spouses,n,The spouses were getting off the plane. After ...,passengers were just happy to
629,The wives were getting off the plane.,"After such a long day, most of the",wives,f,The wives were getting off the plane. After su...,wives were tired and just


## Collect Annotator Data

In [11]:
# get annotator data
ann_1 = pd.read_csv(os.path.join(annotations_dir,'annotator_1.csv'), index_col=0)
ann_2 = pd.read_csv(os.path.join(annotations_dir,'annotator_2.csv'), index_col=0)
ann_3 = pd.read_csv(os.path.join(annotations_dir,'annotator_3.csv'), index_col=0)

# shorten column names
ann_1.rename(columns={'mentioned gender':'gender_a1', 'refers back?':'ref_a1'}, inplace=True)
ann_2.rename(columns={'mentioned gender':'gender_a2', 'refers back?':'ref_a2'}, inplace=True)
ann_3.rename(columns={'mentioned gender':'gender_a3', 'refers back?':'ref_a3'}, inplace=True)

# add two new coplumns to the data frame
annotations = pd.concat([og_df.iloc[:,2:4],
                         ann_1, 
                         ann_2.loc[:,'gender_a2':'ref_a2'],
                         ann_3.loc[:,'gender_a3':'ref_a3']], axis=1)

In [12]:
annotations.head()

Unnamed: 0_level_0,ante_noun,ante_gender,sentences,continuation,gender_a1,ref_a1,gender_a2,ref_a2,gender_a3,ref_a3
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,grandparents,n,The grandparents were waiting on a bench. Beca...,"grandparents said, ""It",n,yes,n,yes,n,yes
2,grandmothers,f,The grandmothers were waiting on a bench. Beca...,"grandmothers said,",f,yes,f,yes,f,yes
3,grandfathers,m,The grandfathers were waiting on a bench. Beca...,"grandfathers said,",m,yes,m,yes,m,yes
4,monarchs,n,The monarchs were waiting on a bench. Because ...,monarchs was not happy,n,yes,n,yes,n,yes
5,queens,f,The queens were waiting on a bench. Because of...,queens was wearing a hat,f,yes,f,yes,f,yes


## Majority Voting

In [13]:
def majority_vote(lst):
    """Function to get the majority vote of a list of 3 elements"""
    # count elements
    c = Counter(lst)
    # with three elements, majority is 2
    if c.most_common(1)[0][1] >= 2:
        return c.most_common(1)[0][0]
    # if there is no majority, return nan
    else:
        return np.nan

In [14]:
# collect majority votes
mv_gender = []
mv_ref = []
for index, row in annotations.iterrows():
    mv_gender.append(majority_vote([row[4], row[6], row[8]]))
    mv_ref.append(majority_vote([row[5], row[7], row[9]]))
    

  mv_gender.append(majority_vote([row[4], row[6], row[8]]))
  mv_ref.append(majority_vote([row[5], row[7], row[9]]))


In [15]:
# add majority votes to the data frame
annotations['gender_mv'] = mv_gender
annotations['ref_mv'] = mv_ref

In [16]:
annotations

Unnamed: 0_level_0,ante_noun,ante_gender,sentences,continuation,gender_a1,ref_a1,gender_a2,ref_a2,gender_a3,ref_a3,gender_mv,ref_mv
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,grandparents,n,The grandparents were waiting on a bench. Beca...,"grandparents said, ""It",n,yes,n,yes,n,yes,n,yes
2,grandmothers,f,The grandmothers were waiting on a bench. Beca...,"grandmothers said,",f,yes,f,yes,f,yes,f,yes
3,grandfathers,m,The grandfathers were waiting on a bench. Beca...,"grandfathers said,",m,yes,m,yes,m,yes,m,yes
4,monarchs,n,The monarchs were waiting on a bench. Because ...,monarchs was not happy,n,yes,n,yes,n,yes,n,yes
5,queens,f,The queens were waiting on a bench. Because of...,queens was wearing a hat,f,yes,f,yes,f,yes,f,yes
...,...,...,...,...,...,...,...,...,...,...,...,...
626,daughters,f,The daughters were getting off the plane. Afte...,passengers were tired and ready,f,no,n,no,n,no,n,no
627,sons,m,The sons were getting off the plane. After suc...,passengers were tired and just,m,no,n,no,n,no,n,no
628,spouses,n,The spouses were getting off the plane. After ...,passengers were just happy to,n,no,n,no,n,no,n,no
629,wives,f,The wives were getting off the plane. After su...,wives were tired and just,f,yes,f,yes,f,yes,f,yes


## Save Data

In [94]:
# save the annotations
annotations.to_csv(os.path.join(annotations_dir, 'annotations_EN_majorityvotes.csv'),
                   na_rep='NULL')

## Data Exploration

In [17]:
# show all rows where three annotators disagreed in gender

In [18]:
filtered_ann_gender = annotations[
    (annotations['gender_a1'] != annotations['gender_a2']) &
    (annotations['gender_a2'] != annotations['gender_a3'])
]

In [19]:
len(filtered_ann_gender)

32

In [20]:
filtered_ann_ref = annotations[
    (annotations['ref_a1'] != annotations['ref_a2']) &
    (annotations['ref_a2'] != annotations['ref_a3'])
]
len(filtered_ann_ref)

86

In [21]:
print(annotations['gender_mv'].value_counts(dropna=False),
      annotations['ref_mv'].value_counts(dropna=False))

gender_mv
n      289
m      160
f      159
NaN     22
Name: count, dtype: int64 ref_mv
yes    396
no     226
NaN      8
Name: count, dtype: int64
