In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from scipy.stats import pearsonr as pearson
from scipy.stats import spearmanr as spearman
from math import isnan
from collections import Counter

In [2]:
def _read_results(path):
    """Read one results CSV and replace '.' with '_' in every column name.

    After the rename the column names are valid Python identifiers, which is
    what allows later attribute-style access (e.g. ``row.WorkerId`` on the
    tuples produced by ``DataFrame.itertuples``).
    """
    frame = pd.read_csv(path)
    frame.columns = [c.replace('.', '_') for c in frame.columns]
    return frame


# First batch of results (jun25) and second batch (aug09, "_c" suffix),
# each delivered as a train file and a dev/test file.
train_file = "results_train_jun25.csv"
raw_train_file = _read_results(train_file)

train_file_c = "results_train_aug09.csv"
raw_train_file_c = _read_results(train_file_c)

devte_file = "results_devte_jun25.csv"
raw_devte_file = _read_results(devte_file)

devte_file_c = "results_devte_aug09.csv"
raw_devte_file_c = _read_results(devte_file_c)

print(len(raw_train_file), len(raw_train_file_c), len(raw_devte_file), len(raw_devte_file_c))

# Stack train + dev/test per batch. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
raw_data_file = pd.concat([raw_train_file, raw_devte_file], ignore_index=True)
raw_data_file_c = pd.concat([raw_train_file_c, raw_devte_file_c], ignore_index=True)

# Side table read here and used later to look up the 'pos' value for each
# (sent_id, pred_token) pair.
tmp_file = 'pred_root_token.tsv'
tmp = pd.read_csv(tmp_file, sep="\t")

2394 320 1772 228


In [3]:
def extract_dataframe(data):
    '''
    Flatten an MTurk results frame into one row per annotated item.

    Input: Pandas csv dataframe obtained from MTurk. It must provide:
        - "Input_var_arrays": a JSON string encoding a list of dicts, one
          dict per item shown in the HIT;
        - for the k-th item (k starting at 1) the answer columns
          "Answer_pred_part<k>", "Answer_part_conf<k>", "Answer_pred_dyn<k>",
          "Answer_dyn_conf<k>", "Answer_pred_hyp<k>", "Answer_hyp_conf<k>";
        - "WorkerId", "HITId" and "AssignmentStatus".
      The input frame is not modified.

    Output: Pandas dataframe levelled by (User x Sentence_ID): one row per
      (input row, item). Each row holds the keys of the item's dict followed
      by 'part', 'part_conf', 'dyn', 'dyn_conf', 'hyp', 'hyp_conf',
      'worker_id', 'hit_id' and 'status'.
    '''
    # (answer-column prefix, output key); the order fixes the output column order.
    answer_fields = (
        ("Answer_pred_part", "part"),
        ("Answer_part_conf", "part_conf"),
        ("Answer_pred_dyn", "dyn"),
        ("Answer_dyn_conf", "dyn_conf"),
        ("Answer_pred_hyp", "hyp"),
        ("Answer_hyp_conf", "hyp_conf"),
    )

    # Parse into a local Series instead of writing a "dicts" column onto the
    # caller's frame, so calling this function has no side effect on `data`.
    parsed_items = data["Input_var_arrays"].map(json.loads)
    global_list = []

    for items, row in zip(parsed_items, data.itertuples()):
        # Answer columns are numbered from 1, hence start=1.
        for item_no, local_dict in enumerate(items, start=1):
            temp_dict = local_dict.copy()
            for prefix, key in answer_fields:
                temp_dict[key] = getattr(row, prefix + str(item_no))
            temp_dict['worker_id'] = row.WorkerId
            temp_dict['hit_id'] = row.HITId
            temp_dict['status'] = row.AssignmentStatus
            global_list.append(temp_dict)

    return pd.DataFrame(global_list)

In [4]:
def _sent_key(frame, token_col):
    """Return a Series of '<sent_id>_<token>' strings built from `frame`."""
    return frame['sent_id'] + "_" + frame[token_col].map(str)


raw_data = extract_dataframe(raw_data_file)
# Drop rejected assignments and renumber the rows from 0.
raw_data = raw_data[raw_data['status'] != 'Rejected'].reset_index(drop=True)
raw_data['sent_pred'] = _sent_key(raw_data, 'pred_token')
tmp['sent_pred'] = _sent_key(tmp, 'pred_token')

# predicate root token information was missing from initial data
# Look it up in `tmp` by the (sent_id, pred_token) key. One indexed lookup
# replaces a per-row scan of `tmp`; keeping the first row per key matches
# "take the first match".
root_by_key = tmp.drop_duplicates(subset='sent_pred').set_index('sent_pred')['pos']
unmatched = ~raw_data['sent_pred'].isin(root_by_key.index)
if unmatched.any():
    # Fail loudly and name the offending keys instead of an opaque IndexError.
    raise KeyError("no entry in %s for: %s"
                   % (tmp_file, sorted(set(raw_data.loc[unmatched, 'sent_pred']))[:10]))
raw_data['pred_root_token'] = raw_data['sent_pred'].map(root_by_key)

raw_data['sent_pred_root'] = _sent_key(raw_data, 'pred_root_token')

# Rearrange the columns
cols = ['hit_id', 'worker_id','sent_id', 'sent_pred', 'predicate', 'pred_token', 
        'pred_root_token','part','part_conf', 'dyn','dyn_conf','hyp','hyp_conf']
data = raw_data[cols]

# Incorporate new annotations
raw_data_c = extract_dataframe(raw_data_file_c)
raw_data_c = raw_data_c[raw_data_c['status'] != 'Rejected'].reset_index(drop=True)
raw_data_c['sent_pred'] = _sent_key(raw_data_c, 'pred_token')
# The new batch already carries the root token, under the name 'pred_root_pos'.
raw_data_c = raw_data_c.rename(columns={'pred_root_pos': 'pred_root_token'})

raw_data_c['sent_pred_root'] = _sent_key(raw_data_c, 'pred_root_token')
data_c = raw_data_c[cols]

print(len(data_c), len(data))

5480 40460


In [5]:
# Combine the first-batch annotations with the new batch.
# Built from raw_data / raw_data_c (the same frames `data` and `data_c` were
# sliced from) rather than from `data` itself, so re-running this cell cannot
# append the new batch a second time. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
data = pd.concat([raw_data[cols], raw_data_c[cols]], ignore_index=True)

In [6]:
# Create long form dataset
long_cols = ['Split', 'Annotator.ID','Sentence.ID','Pred.Root.Token', 'Pred.Tokens','Predicate',
             'Is.Particular', 'Part.Confidence', 'Is.Dynamic','Dyn.Confidence',
             'Is.Hypothetical','Hyp.Confidence']

# rename() returns a new frame, so `data` itself is left untouched.
long_data = data.rename(columns={'worker_id':'Annotator.ID', 'sent_id':'Sentence.ID',
                                 'pred_root_token':'Pred.Root.Token', 'pred_token':'Pred.Tokens',
                                 'predicate':'Predicate', 'part':'Is.Particular',
                                 'part_conf':'Part.Confidence', 'dyn':'Is.Dynamic',
                                 'dyn_conf':'Dyn.Confidence', 'hyp':'Is.Hypothetical',
                                 'hyp_conf':'Hyp.Confidence'})

# The split name is read from characters 6..10 of the sentence id.
# NOTE: rstrip('.c') removes any trailing '.' / 'c' *characters* (it is not a
# suffix match). Presumably this trims the start of a file extension after
# short split names such as 'dev' / 'test' -- TODO confirm against the id format.
long_data['Split'] = long_data['Sentence.ID'].str[6:11].str.rstrip('.c')

# Replace worker ids by integers 1..N. Ids are numbered in sorted order so the
# mapping (and the exported file) is identical on every run; iterating a set
# of strings would give a different numbering per kernel session.
ann_hash = {ann: number
            for number, ann in enumerate(sorted(long_data['Annotator.ID'].dropna().unique()), start=1)}
annid = len(ann_hash)  # number of distinct annotators
long_data['Annotator.ID'] = long_data['Annotator.ID'].map(ann_hash)
long_data = long_data[long_cols]
long_data.to_csv('long_data.tsv', sep="\t", index=False)

26740
9


# Majority vote in devtest with custom ridit score

In [None]:
from statistics import mode

attributes = ["part", "dyn", "hyp"]
attr_map = {"part": "Is.Particular", "dyn": "Is.Dynamic", "hyp": "Is.Hypothetical"}
attr_conf = {"part": "Part.Confidence", "dyn": "Dyn.Confidence",
             "hyp": "Hyp.Confidence"}

response = ["Is.Particular", "Is.Hypothetical", "Is.Dynamic"]
response_conf = ["Part.Confidence", "Hyp.Confidence", "Dyn.Confidence"]

# Everything except the test split. .copy() makes the column assignments below
# write to an independent frame instead of a slice of long_data.
data = long_data[long_data['Split'] != 'test'].copy()

# Convert responses to 1s and 0s
for resp in response:
    data[resp] = data[resp].astype(int)

# Ridit-style score per annotator: rank of each confidence rating among that
# annotator's ratings, divided by (number of ratings + 1). transform() returns
# a result aligned to data's own index, so the assignment is well defined on
# every pandas version.
for resp in response_conf:
    data[resp] = data.groupby('Annotator.ID')[resp].transform(lambda x: x.rank() / (len(x) + 1.))

data_dev = data[data['Split'] == 'dev']
col = data_dev['Sentence.ID'] + "_" + data_dev['Pred.Root.Token'].map(str)
data_dev = data_dev.assign(SentenceIDToken=col.values)
sent_ids = data_dev['SentenceIDToken'].unique().tolist()
print(len(sent_ids))

# One output row per (sentence, predicate root token): the majority answer for
# each attribute plus the mean confidence in that answer.
reduced_rows = []
for sent_id, new_df in data_dev.groupby('SentenceIDToken', sort=False):
    new_df = new_df.copy()           # private copy: confidences are edited below
    sample = new_df.iloc[0].copy()   # first annotation supplies the non-aggregated fields

    for attr in attributes:
        ans_col = attr_map[attr]
        conf_col = attr_conf[attr]

        mode_ans = mode(new_df[ans_col].tolist())

        # An annotator who chose the other answer with confidence c is counted
        # as supporting the majority answer with confidence 1 - c. This must go
        # through .loc: chained indexing (df[mask][col] = ...) assigns to a
        # temporary copy and leaves new_df unchanged.
        disagrees = new_df[ans_col] != mode_ans
        new_df.loc[disagrees, conf_col] = 1 - new_df.loc[disagrees, conf_col]

        # Average over the annotations actually present for this item, so the
        # result is a true mean whether or not there are exactly three.
        confidences = new_df[conf_col].tolist()
        new_conf = sum(confidences) / len(confidences)

        sample[ans_col] = mode_ans
        sample[conf_col] = new_conf

    reduced_rows.append(sample)

# Build the frame once from the collected rows (DataFrame.append was removed in
# pandas 2.0 and was quadratic when used inside a loop).
data_dev_reduced = pd.DataFrame(reduced_rows)
data_dev_reduced.to_csv('pred_data_dev.tsv', sep='\t', index=False)