# Rater Reliability Notebook

In [13]:
import pandas as pd
import numpy as np
from collections import defaultdict
import operator

In [14]:
#Fill in Participant Number Here
participant_number = '1079' 
df = pd.read_csv("summer_2019/1079/output/responses.csv")

In [15]:
df['count'] = df.groupby(['file_name']).file_name.transform('count')

In [16]:
df_an1 = df[df['count'] == 1]
val_counts = df['count'].value_counts()
for k, v in val_counts.items():
    val_counts[k] = int(v)/int(k)
    print(int(k), ": ", val_counts[k], "clips")

1 :  93 clips
2 :  2 clips
3 :  1 clips


In [17]:
df_an1['Language'].value_counts()

Spanish                32
Mixed                  19
No speech              14
Unsure                 12
Categorize language     8
Quechua                 8
Name: Language, dtype: int64

In [18]:
unsure = df_an1[df_an1['Language'] == 'Unsure']

In [19]:
languages = set(['Spanish', 'Quechua', 'Mixed'])
lang_ratio = df_an1[df_an1['Language'].isin(languages)]

### Clips Annotated Twice

In [20]:
df_an2 = df[df['count'] == 2]

In [21]:
grouped = df_an2.groupby('file_name')

In [22]:
intrarater_dict = defaultdict(list)
interrater_dict = defaultdict(list)
for n, group in grouped:
    annotators = list(group['annotator'])
    group = pd.DataFrame(group).reset_index()
    langs = list(group['Language'])
    if annotators[0] == annotators[1]:  #SAME
        annotator = annotators[0]
        if langs[0] == langs[1]:
            intrarater_dict[annotator].append(1)
            if langs[0] == 'Unsure':
                unsure = unsure.append(group[0:1], ignore_index=False, verify_integrity=False, sort=None)
            else:
                lang_ratio = lang_ratio.append(group[0:1])
        else:
            if any(elem in languages for elem in langs):
                intrarater_dict[annotator].append(0)
                if 'Unsure' in langs:
                    unsure = unsure.append(group[0:2])
    else: #Different
        if annotators[0] != annotators[1] and langs[0] == langs[1]:
            interrater_dict[annotators[0]].append(1)
            interrater_dict[annotators[1]].append(1)
            lang_ratio = lang_ratio.append(group[0:1])
        else:
            if annotators[0] != annotators[1] and any(elem in languages for elem in langs):
                interrater_dict[annotators[0]].append(0)
                interrater_dict[annotators[1]].append(0)
                if any(elem == 'Unsure' for elem in langs):
                    unsure = unsure.append(group[0:2])


## Clips Annotated Thrice+

In [23]:
valCount = {'a':2, 'b':1}
annToLangs = [('k', 'a'), ('l', 'b'), ('l', 'a')]
anns = ['k', 'l', 'l']
majorityLang, majorityCount = max(valCount.items(), key=operator.itemgetter(1))
allMajorityLangs = [key for key in valCount.keys() if valCount[key]==majorityCount]
d1 = defaultdict(list)
for ann, lang in annToLangs:
    if lang == majorityLang:
           d1[ann].append(1)
    else:
           d1[ann].append(0)

In [24]:
df_an3 = df[df['count'] >= 3]
grouped3 = df_an3.groupby('file_name')
for n, group in grouped3:
    annotators = list(group['annotator'])
    langs = list(group['Language'])
    annToLangs = zip(annotators, langs)
    uniqueAnns = set(annotators)
    valCount = group['Language'].value_counts()
    majorityLang, majorityCount = max(valCount.items(), key=operator.itemgetter(1))
    allMajorityLangs = [key for key in valCount.keys() if valCount[key]==majorityCount]
    if len(allMajorityLangs) == 1 and majorityLang in languages:
        lang_ratio = lang_ratio.append(group[0:1])
        if len(uniqueAnns) == 1:
            intrarater_dict[uniqueAnns[0]].append(1)
        else:
            for ann, lang in annToLangs:
                if lang == majorityLang:
                    interrater_dict[ann].append(1)
                else:
                    interrater_dict[ann].append(0)


TypeError: 'set' object is not subscriptable

## Interrater Ratio

In [30]:
interrater_df = pd.DataFrame(columns=['Annotator', 'InterRatio'])
for ann in interrater_dict:
    inter_ratio = sum(interrater_dict[ann])/len(interrater_dict[ann])
    print(ann, "Ratio: " , inter_ratio)
    print("Number of clips: ", len(interrater_dict[ann]))
    interrater_df = interrater_df.append({'Annotator': ann, 'IntraRatio':inter_ratio}, ignore_index=True)

## Intrarater Ratio

In [27]:
intrarater_df = pd.DataFrame(columns=['Annotator', 'IntraRatio'])
for ann in intrarater_dict:
    intra_ratio = sum(intrarater_dict[ann])/len(intrarater_dict[ann])
    print(ann, "Ratio: " , intra_ratio)
    print("Number of clips: ", len(intrarater_dict[ann]))
    intrarater_df = intrarater_df.append({'Annotator': ann, 'IntraRatio':intra_ratio}, ignore_index=True)

Meg Ratio:  1.0
Number of clips:  1
Adela Ratio:  0.0
Number of clips:  1


Unnamed: 0,Annotator,IntraRatio
0,Meg,1.0
1,Adela,0.0


## Convert to CSV

In [None]:
interrater_df.to_csv(participant_number + '_interrater.csv')
intrarater_df.to_csv(participant_number  + '_intrarater.csv')
lang_ratio.to_csv(participant_number + '_bilingualratio.csv')
unsure.to_csv(participant_number  + '_relisten.csv')