In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from scipy.stats import pearsonr as pearson
from scipy.stats import spearmanr as spearman
from math import isnan
from collections import Counter
pd.set_option('chained_assignment',None)         # Turn off those dumb annoying warnings
pd.set_option('display.max_columns', None)       # Turns off pandas truncating data

In [2]:
# Read all the various csv files
train_file = "../../../data/pred_results_train_jun25.csv"
raw_train_file = pd.read_csv(train_file)
raw_train_file.columns = [c.replace('.', '_') for c in raw_train_file.columns]

train_file_c = "../../../data/pred_results_train_aug09.csv"
raw_train_file_c = pd.read_csv(train_file_c)
raw_train_file_c.columns = [c.replace('.', '_') for c in raw_train_file_c.columns]

devte_file = "../../../data/pred_results_devte_jun25.csv"
raw_devte_file = pd.read_csv(devte_file)
raw_devte_file.columns = [c.replace('.', '_') for c in raw_devte_file.columns]

devte_file_c = "../../../data/pred_results_devte_aug09.csv"
raw_devte_file_c = pd.read_csv(devte_file_c)
raw_devte_file_c.columns = [c.replace('.', '_') for c in raw_devte_file_c.columns]

print(len(raw_train_file), len(raw_train_file_c), len(raw_devte_file), len(raw_devte_file_c))


raw_data_file = raw_train_file.append(raw_devte_file, ignore_index=True)
raw_data_file_c = raw_train_file_c.append(raw_devte_file_c, ignore_index=True)


tmp_file = '../../../data/pred_root_token.tsv'
tmp = pd.read_csv(tmp_file, sep = "\t")
print(raw_data_file.columns.values.tolist().sort() == raw_data_file_c.columns.values.tolist().sort())

rerun_file = "../../../data/pred_results_sep18.csv"
rerun_data_file = pd.read_csv(rerun_file)
rerun_data_file.columns = [c.replace('.', '_') for c in rerun_data_file.columns]

rerun_file_2 = "../../../data/pred_results_oct2.csv"
rerun_data_file_2 = pd.read_csv(rerun_file_2)
rerun_data_file_2.columns = [c.replace('.', '_') for c in rerun_data_file.columns]

rerun_data_file = rerun_data_file.append(rerun_data_file_2, ignore_index=True)

2394 320 1772 228
True


In [3]:
def extract_dataframe(data):
    '''
    Input: Pandas csv dataframe obtained from MTurk
    
    Output: Pandas dataframe levelled by (User x Sentenced_ID)
    '''
    data["dicts"] = data["Input_var_arrays"].map(lambda x: json.loads(x))
    global_list = []
    
    for row in data.itertuples():
        for idx, local_dict in enumerate(row.dicts):
            temp_dict = local_dict.copy()
            var_dyn = "Answer_pred_dyn" + str(idx + 1)
            var_dyn_c = "Answer_dyn_conf" + str(idx + 1)
            var_part = "Answer_pred_part" + str(idx + 1)
            var_part_c = "Answer_part_conf" + str(idx + 1)
            var_hyp = "Answer_pred_hyp" + str(idx + 1)
            var_hyp_c = "Answer_hyp_conf" + str(idx + 1)
            temp_dict['part'] = getattr(row, var_part)
            temp_dict['part_conf'] = getattr(row, var_part_c)
            temp_dict['dyn'] = getattr(row, var_dyn)
            temp_dict['dyn_conf'] = getattr(row, var_dyn_c)
            temp_dict['hyp'] = getattr(row, var_hyp)
            temp_dict['hyp_conf'] = getattr(row, var_hyp_c)
            temp_dict['worker_id'] = row.WorkerId
            temp_dict['hit_id'] = row.HITId
            temp_dict['status'] = row.AssignmentStatus
            global_list.append(temp_dict)
    
    return pd.DataFrame(global_list)

def extract_dataframe_1(data):
    '''
    Input: Pandas csv dataframe obtained from MTurk
    
    Output: Pandas dataframe levelled by (User x Sentenced_ID)
    '''
    data["dicts"] = data["var_arrays"].map(lambda x: json.loads(x))
    global_list = []
    
    for row in data.itertuples():
        for idx, local_dict in enumerate(row.dicts):
            temp_dict = local_dict.copy()
            global_list.append(temp_dict)
    
    return pd.DataFrame(global_list)

In [4]:
# Remove rejected HITS and bad HITS
rerun_data_file = rerun_data_file[rerun_data_file.AssignmentStatus != "Rejected"]
rerun_data = extract_dataframe(rerun_data_file)
print(rerun_data.columns.values)
rerun_data_1 = rerun_data[pd.isna(rerun_data.pred_root_pos)]
rerun_data_2 = rerun_data[~pd.isna(rerun_data.pred_root_pos)]
print(len(rerun_data), len(rerun_data_1), len(rerun_data_2))

['dyn' 'dyn_conf' 'hit_id' 'hyp' 'hyp_conf' 'id' 'part' 'part_conf'
 'pred_root_pos' 'pred_sentence' 'pred_token' 'predicate' 'raw_sentence'
 'sent_id' 'status' 'worker_id']
6430 4320 2110


In [5]:
raw_data = extract_dataframe(raw_data_file)
hits = pd.read_csv('pred_hits_rerun.tsv', sep="\t")
raw_data = raw_data[~raw_data.worker_id.isin(hits.annotator.values.tolist())]
raw_data = raw_data.append(rerun_data_1, ignore_index=True)
raw_data = raw_data[raw_data['status']!='Rejected']
raw_data = raw_data.reset_index(drop=True)
raw_data['sent_pred'] = raw_data['sent_id'].map(lambda x : x) + "_" +\
                           raw_data['pred_token'].map(lambda x: str(x))
raw_data['pred_root_token'] = None
tmp['sent_pred'] = tmp['sent_id'].map(lambda x : x) + "_" +\
                           tmp['pred_token'].map(lambda x: str(x))

# predicate root token information was missing from initial data
for i, _ in raw_data.iterrows():
    raw_data.at[i, 'pred_root_token'] = tmp.loc[tmp['sent_pred'] == \
                                              raw_data.at[i, 'sent_pred'], 'pos'].values[0]
    
raw_data['sent_pred_root'] = raw_data['sent_id'].map(lambda x : x) + "_" +\
                           raw_data['pred_root_token'].map(lambda x: str(x))

# Rearrange the columns
cols = ['hit_id', 'worker_id','sent_id', 'sent_pred', 'sent_pred_root', 'predicate', 'pred_token', 
        'pred_root_token','part','part_conf', 'dyn','dyn_conf','hyp','hyp_conf']
data = raw_data[cols]

# Incorporate new annotations
raw_data_c = extract_dataframe(raw_data_file_c)
raw_data_c = raw_data_c[~raw_data_c.worker_id.isin(hits.annotator.values.tolist())]
raw_data_c = raw_data_c.append(rerun_data_2, ignore_index=True)
raw_data_c = raw_data_c[raw_data_c['status']!='Rejected']
raw_data_c = raw_data_c.reset_index(drop=True)
raw_data_c['sent_pred'] = raw_data_c['sent_id'].map(lambda x : x) + "_" +\
                           raw_data_c['pred_token'].map(lambda x: str(x))
raw_data_c.rename(columns={'pred_root_pos':'pred_root_token'}, inplace=True)

raw_data_c['sent_pred_root'] = raw_data_c['sent_id'].map(lambda x : x) + "_" +\
                               raw_data_c['pred_root_token'].map(lambda x: str(x))
data_c = raw_data_c[cols]

print(len(data_c), len(data))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


5470 40460


In [6]:
# train_data_c = data_c[data_c['sent_id'][6:11] == "train"]
data = data.append(data_c, ignore_index=True)

import re
ud_path = "/Users/venkat/Downloads/UD_English-r1.2/"

files = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
lemmas = {}
for file in files:
    with open(ud_path + file, 'r') as f:
        iden = 0
        a = ""
        words = []
        for line in f:
            if line != "\n":
                words.append(line.split("\t")[2])
            else:
                iden += 1
                sent_id = file + " sent_" + str(iden)
                lemmas[sent_id] = words
                words = []

data.loc[:, 'lemma'] = data.apply(lambda x: lemmas[x.loc['sent_id']][int(x.loc['pred_root_token'])], axis=1)

In [7]:
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
from os.path import expanduser

files = ['/UD_English-r1.2/en-ud-train.conllu',
         '/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']
home = expanduser("~/Downloads/")
parsed = []

options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=False)  # Resolve relative clause
options_ = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)  # Resolve relative clause
prons_incl = ["you", "they", "yourself", "themselves", "them", "themself",
              "theirself", "theirselves"]
for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data_in = infile.read()
        parsed += [(file[17:] + " " + sent_id, PredPatt(ud_parse, opts=options)) for
            sent_id, ud_parse in load_conllu(data_in)]
        parsed += [(file[17:] + " " + sent_id, PredPatt(ud_parse, opts=options_)) for
            sent_id, ud_parse in load_conllu(data_in)]
id_to_span = {}
for sent_id, parse_sen in parsed:
    sent_preds = []
    for predicate in parse_sen.instances:
        sent_check = [pr.position for pr in sent_preds]
        if predicate.position not in sent_check:
            sent_preds.append(predicate)

    for predicate in sent_preds:
        if predicate.root.tag not in ["ADJ", "NOUN", "NUM", "DET", "PROPN", "PRON", "VERB", "AUX"]:
            continue
        if predicate.root.tag not in ["VERB", "AUX"]:
            gov_rels = [tok.gov_rel for tok in predicate.tokens]
            all_pred = [t for t in predicate.tokens]
            if 'cop' in gov_rels:
                cop_pos = gov_rels.index('cop')
                pred = [x.text for x in all_pred[cop_pos:]]
                pred_token = [x.position for x in all_pred[cop_pos:]]
            else:
                if predicate.root.tag == "ADJ":
                    pred = [predicate.root.text]
                    pred_token = [predicate.root.position]
                else:
                    continue
        else:
            pred = [predicate.root.text]
            pred_token = [predicate.root.position]
        arguments = predicate.arguments
        predpatt_id = sent_id + "_" + ",".join(map(str, pred_token))
        pred_span = ",".join(map(str, [t.position for t in predicate.tokens]))
        args_context = ",".join(map(str, [t.root.position for t in arguments]))
        id_to_span[predpatt_id] = (pred_span, args_context)

data.loc[:, 'pred_span'] = data.apply(lambda x: id_to_span.get(x['sent_pred'], ("null", "null"))[0], axis=1)
data.loc[:, 'arg_context'] = data.apply(lambda x: id_to_span.get(x['sent_pred'], ("null", "null"))[1], axis=1)

In [9]:
data[data['arg_context']=="null"]
# options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)  # Resolve relative clause
# prons_incl = ["you", "they", "yourself", "themselves", "them", "themself",
#               "theirself", "theirselves"]
# for file in files:
#     path = home + file
#     with open(path, 'r') as infile:
#         data_in = infile.read()
#         parsed += [(file[17:] + " " + sent_id, PredPatt(ud_parse, opts=options)) for
#             sent_id, ud_parse in load_conllu(data_in)]

# for sent_id, parse_sen in parsed:
#     sent_preds = []
#     for predicate in parse_sen.instances:
#         sent_check = [pr.position for pr in sent_preds]
#         if predicate.position not in sent_check:
#             sent_preds.append(predicate)

#     for predicate in sent_preds:
#         if predicate.root.tag not in ["ADJ", "NOUN", "NUM", "DET", "PROPN", "PRON", "VERB", "AUX"]:
#             continue
#         if predicate.root.tag not in ["VERB", "AUX"]:
#             gov_rels = [tok.gov_rel for tok in predicate.tokens]
#             all_pred = [t for t in predicate.tokens]
#             if 'cop' in gov_rels:
#                 cop_pos = gov_rels.index('cop')
#                 pred = [x.text for x in all_pred[cop_pos:]]
#                 pred_token = [x.position for x in all_pred[cop_pos:]]
#             else:
#                 if predicate.root.tag == "ADJ":
#                     pred = [predicate.root.text]
#                     pred_token = [predicate.root.position]
#                 else:
#                     continue
#         else:
#             pred = [predicate.root.text]
#             pred_token = [predicate.root.position]
#         arguments = predicate.arguments
#         predpatt_id = sent_id + "_" + ",".join(map(str, pred_token))
#         pred_span = ",".join(map(str, [t.position for t in predicate.tokens]))
#         args_context = ",".join(map(str, [t.root.position for t in arguments]))
#         id_to_span[predpatt_id] = (pred_span, args_context)

# data.loc[:, 'pred_span'] = data.apply(lambda x: id_to_span.get(x['sent_pred'], ("null", "null"))[0], axis=1)
# data.loc[:, 'arg_context'] = data.apply(lambda x: id_to_span.get(x['sent_pred'], ("null", "null"))[1], axis=1)

Unnamed: 0,hit_id,worker_id,sent_id,sent_pred,sent_pred_root,predicate,pred_token,pred_root_token,part,part_conf,dyn,dyn_conf,hyp,hyp_conf,lemma,pred_span,arg_context


In [11]:
data = data.rename(columns={'hit_id': 'HIT.ID', 'worker_id':'Annotator.ID', 'sent_id':'Sentence.ID', 'sent_pred': 'Sentence.Pred.Tokens', 'sent_pred_root': 'Sentence.Pred.Root.Token',
                            'pred_root_token':'Pred.Root.Token', 'pred_token':'Pred.Tokens', 'pred_span': 'Pred.Span', 'arg_context': 'Argument.Context',
                            'predicate':'Predicate', 'lemma':'Predicate.Lemma',
                            'part':'Is.Particular', 'part_conf':'Part.Confidence',
                            'dyn':'Is.Dynamic', 'dyn_conf':'Dyn.Confidence',
                            'hyp':'Is.Hypothetical', 'hyp_conf':'Hyp.Confidence'})
data.loc[:, 'Split'] = data.loc[:, 'Sentence.ID'].str[6:11]
data['Split'] = data['Split'].map(lambda x: x.rstrip('.c'))
cols = ['Split', 'HIT.ID', 'Annotator.ID','Sentence.ID', 'Pred.Root.Token','Pred.Tokens', 'Pred.Span', 'Argument.Context' ,'Sentence.Pred.Tokens', 'Sentence.Pred.Root.Token',
        'Predicate', 'Predicate.Lemma', 'Is.Particular', 'Part.Confidence', 'Is.Dynamic','Dyn.Confidence',
        'Is.Hypothetical','Hyp.Confidence']
data = data[cols]
data.to_csv('../../../data/pred_raw_data.tsv', sep="\t", index=False)

In [12]:
# Create long form dataset
import re
long_cols = ['Split', 'Annotator.ID','Sentence.ID','Pred.Root.Token', 'Pred.Tokens', 'Pred.Span', 'Argument.Context', 'Predicate',
             'Predicate.Lemma', 'Is.Particular', 'Part.Confidence', 'Is.Dynamic','Dyn.Confidence',
             'Is.Hypothetical','Hyp.Confidence']

long_data = data.copy()
# long_data = long_data.rename(columns={'worker_id':'Annotator.ID', 'sent_id':'Sentence.ID',
#                                       'pred_root_token':'Pred.Root.Token', 'pred_token':'Pred.Span',
#                                       'predicate':'Predicate', 'lemma':'Predicate.Lemma',
#                                       'part':'Is.Particular', 'part_conf':'Part.Confidence',
#                                       'dyn':'Is.Dynamic', 'dyn_conf':'Dyn.Confidence',
#                                       'hyp':'Is.Hypothetical', 'hyp_conf':'Hyp.Confidence'})


long_data['Sentence.ID'] = data['Sentence.ID'].map(lambda x: re.findall(r'\d+', x)[0])

ann_hash = {}
annid = 0
for ann in set(long_data['Annotator.ID'].values):
    annid += 1
    ann_hash[ann] = annid
long_data['Annotator.ID'] = long_data['Annotator.ID'].map(ann_hash)
long_data = long_data[long_cols]
print(len(long_data))
long_data.to_csv('../../../data/pred_long_data.tsv', sep="\t", index=False)

45930
