In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from scipy.stats import pearsonr as pearson
from scipy.stats import spearmanr as spearman
from math import isnan
from collections import Counter

In [None]:
# Load the raw annotation exports (described as MTurk output by
# extract_dataframe's docstring).  Dots in the column names are replaced with
# underscores; extract_dataframe later reads the columns by attribute name
# (e.g. `row.WorkerId`, `Answer_pred_dyn1`), which needs identifier-safe names.
train_file = "pred_train.csv"
raw_train_file = pd.read_csv(train_file)
raw_train_file.columns = [c.replace('.', '_') for c in raw_train_file.columns]

devte_file = "pred_devtest.csv"
raw_devte_file = pd.read_csv(devte_file)
raw_devte_file.columns = [c.replace('.', '_') for c in raw_devte_file.columns]
print(len(raw_train_file), len(raw_devte_file))

# Stack both batches into one frame with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and yields the same result.
raw_data_file = pd.concat([raw_train_file, raw_devte_file], ignore_index=True)

# Tab-separated "it-happened" annotations, compared against the crowd answers
# further down.
happ_file = "it-happened_eng_ud1.2_07092017.tsv"
happ = pd.read_csv(happ_file, sep="\t")

# Tab-separated lookup table; later cells read its `sent_id`, `pred_token`
# and `pos` columns to resolve each predicate's `pos` value.
tmp_file = 'check_data.csv'
tmp = pd.read_csv(tmp_file, sep="\t")

In [None]:
def extract_dataframe(data):
    '''
    Flatten an MTurk results frame into one row per (assignment, item).

    Input: Pandas dataframe read from the MTurk csv. Each row must provide
           `Input_var_arrays` (a JSON-encoded list of dicts), the numbered
           answer columns `Answer_pred_{part,dyn,hyp}<k>` and
           `Answer_{part,dyn,hyp}_conf<k>` (k starting at 1), plus
           `WorkerId`, `HITId` and `AssignmentStatus`.

    Output: Pandas dataframe with one row per dict found in
            `Input_var_arrays`, holding that dict's own keys followed by
            part, part_conf, dyn, dyn_conf, hyp, hyp_conf, worker_id,
            hit_id and status.

    Side effect: a `dicts` column with the decoded JSON is added to `data`.
    '''
    data["dicts"] = data["Input_var_arrays"].map(json.loads)

    # (output key, answer-column prefix) pairs, listed in the order in which
    # the keys must be inserted so the output column order stays stable.
    answer_fields = (
        ('part', 'Answer_pred_part'),
        ('part_conf', 'Answer_part_conf'),
        ('dyn', 'Answer_pred_dyn'),
        ('dyn_conf', 'Answer_dyn_conf'),
        ('hyp', 'Answer_pred_hyp'),
        ('hyp_conf', 'Answer_hyp_conf'),
    )

    records = []
    for assignment in data.itertuples():
        # The k-th dict of an assignment pairs with the answer columns
        # carrying the suffix k (1-based).
        for slot, item in enumerate(assignment.dicts, start=1):
            record = dict(item)
            for key, prefix in answer_fields:
                record[key] = getattr(assignment, prefix + str(slot))
            record['worker_id'] = assignment.WorkerId
            record['hit_id'] = assignment.HITId
            record['status'] = assignment.AssignmentStatus
            records.append(record)

    return pd.DataFrame(records)

In [None]:
# Flatten the raw assignments to one row per (assignment, item) and drop the
# rows whose assignment was rejected.
raw_data = extract_dataframe(raw_data_file)
raw_data = raw_data[raw_data['status'] != 'Rejected'].reset_index(drop=True)

# Join key "<sent_id>_<pred_token>", built identically on the annotations and
# on the `tmp` lookup table.
raw_data['sent_pred'] = raw_data['sent_id'] + "_" + raw_data['pred_token'].map(str)
tmp['sent_pred'] = tmp['sent_id'] + "_" + tmp['pred_token'].map(str)

# Resolve every annotation's `pos` from `tmp` with one lookup instead of
# rescanning `tmp` for each row.  As before, the first matching `tmp` row wins.
pos_by_key = tmp.drop_duplicates('sent_pred').set_index('sent_pred')['pos']
unmatched = raw_data.loc[~raw_data['sent_pred'].isin(pos_by_key.index), 'sent_pred'].unique()
if len(unmatched):
    # The row-wise version died here with an opaque IndexError; name the keys.
    raise KeyError("no 'pos' entry in tmp for sent_pred keys: " + ", ".join(map(str, unmatched[:10])))
raw_data['pred_pos'] = raw_data['sent_pred'].map(pos_by_key)

# Second key "<sent_id>_<pos>"; this is the one matched against
# happ['sent_pred'] in the correlation cell.
raw_data['sent_predpos'] = raw_data['sent_id'] + "_" + raw_data['pred_pos'].map(str)

# Shift the token index by one before building happ's key (presumably 1-based
# -> 0-based; confirm against the data description).
# NOTE: this mutates `happ` in place, so re-running this cell without first
# reloading `happ` shifts the index again.
happ['Pred.Token'] -= 1
happ['sent_pred'] = happ['Sentence.ID'] + "_" + happ['Pred.Token'].map(str)

# Rearrange the columns
cols = ['hit_id', 'worker_id', 'sent_pred', 'predicate', 'pred_pos', 'sent_predpos', 'sent_id',
        'pred_token', 'part', 'part_conf', 'dyn', 'dyn_conf', 'hyp', 'hyp_conf']
# .copy() so the assignment to data['hyp'] below writes to an independent
# frame rather than to a slice of raw_data (SettingWithCopyWarning).
data = raw_data[cols].copy()

# Keep only items with a definite label; .copy() for the same reason as above.
happ = happ[happ['Happened'] != 'na'].copy()

# Encode both label columns as 1 / 0 so they can be correlated.
happ['Happened'] = happ['Happened'].map({'true': 1, 'false': 0})
data['hyp'] = data['hyp'].map({True: 1, False: 0})

In [None]:
# Correlate the crowd-sourced `hyp` answers with the `Happened` labels on the
# predicates present in both data sets.
print(len(data))
comm = set(data['sent_predpos'].values).intersection(set(happ['sent_pred'].values))
print(len(comm))

# Index each frame once by its key (first occurrence wins) instead of running
# a full boolean-mask scan of both frames for every common key.
# NOTE(review): only the first annotation per predicate enters the
# correlation, exactly as before - confirm that this is intended.
hyp_by_key = data.drop_duplicates('sent_predpos').set_index('sent_predpos')['hyp']
happened_by_key = happ.drop_duplicates('sent_pred').set_index('sent_pred')['Happened']

# x1 / x2 are paired element-wise: position i in both lists is the same key.
x1 = [hyp_by_key.at[m] for m in comm]
x2 = [happened_by_key.at[m] for m in comm]
print(pearson(x1, x2))

In [None]:
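# NOTE: disabled cell, kept for reference only. When uncommented it tallies a
# 2x2 confusion matrix over the paired lists x1 (treated as the prediction)
# and x2 (treated as the reference), derives the Matthews correlation
# coefficient from the four counts and draws the matrix as a seaborn heatmap.
# from math import sqrt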
# cm = {'tp':0, 'tn':0, 'fp':0, 'fn':0}
# for i in range(len(x1)):
#     if x1[i] == x2[i]:
#         if x1[i] == 1:
#             cm['tp'] += 1
#         else:
#             cm['tn'] += 1
#     else:
#         if x1[i] == 1:
#             cm['fp'] += 1
#         else:
#             cm['fn'] += 1
# print(cm)
# matthew = (cm['tp'] * cm['tn'] - cm['fp'] * cm['fn']) / sqrt((cm['tp'] + cm['fp'])*(cm['tp'] + cm['fn'])*(cm['tn'] + cm['fp'])*(cm['tn'] + cm['fn']))
# print(matthew)
# conf_mat = [[cm['tn'], cm['fp']], [cm['fn'], cm['tp']]]
# df_cm = pd.DataFrame(conf_mat, index=['False', 'True'], columns=['False', 'True'])
# plt.figure(figsize = (10,7))
# sns.heatmap(df_cm, annot=True)
# Counter(x2)

In [None]:
# Create long form dataset: one row per annotation, with the published column
# names, written to long_data.tsv.
long_cols = ['Split', 'Annotator.ID', 'Sentence.ID', 'Pred.Root.Token', 'Pred.Tokens', 'Predicate',
             'Is.Particular', 'Part.Confidence', 'Is.Dynamic', 'Dyn.Confidence',
             'Is.Hypothetical', 'Hyp.Confidence']

# rename() returns a new frame, so `data` itself is left untouched.
long_data = data.rename(columns={'worker_id': 'Annotator.ID', 'sent_id': 'Sentence.ID',
                                 'pred_pos': 'Pred.Root.Token', 'pred_token': 'Pred.Tokens',
                                 'predicate': 'Predicate', 'part': 'Is.Particular',
                                 'part_conf': 'Part.Confidence', 'dyn': 'Is.Dynamic',
                                 'dyn_conf': 'Dyn.Confidence', 'hyp': 'Is.Hypothetical',
                                 'hyp_conf': 'Hyp.Confidence'})

# Characters 6..10 of the sentence id, minus any trailing '.' / 'c' characters.
# NOTE(review): presumably ids look like "en-ud-train.conllu ..." so this
# yields train / dev / test - confirm against the actual id format.
long_data['Split'] = long_data['Sentence.ID'].str[6:11].str.rstrip('.c')

# Replace worker ids with small integers numbered by order of first
# appearance.  Numbering by iterating a set of strings is not stable between
# kernel sessions (string hash randomisation), which made the exported ids
# change from run to run.
ann_hash = {ann: annid
            for annid, ann in enumerate(pd.unique(long_data['Annotator.ID']), start=1)}
long_data['Annotator.ID'] = long_data['Annotator.ID'].map(ann_hash)

# .copy() so the assignment below targets an independent frame, not a slice.
long_data = long_data[long_cols].copy()
# Convert the 1 / 0 encoding back to booleans for the export.
long_data['Is.Hypothetical'] = long_data['Is.Hypothetical'].map({1: True, 0: False})
long_data.to_csv('long_data.tsv', sep="\t", index=False)