In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import Counter
from scipy.stats import pearsonr as pearson
from scipy.stats import spearmanr as spearman
from math import isnan
from collections import Counter
from os.path import expanduser
# Pandas configuration for the whole notebook.
pd.options.mode.chained_assignment = None     # do not warn on chained assignment
pd.options.display.max_columns = None         # never truncate columns when displaying frames

# Home directory of the current user; the data paths in later cells are built from it.
home = expanduser('~')

In [2]:
# Read all the various csv files
def _read_results(path):
    '''
    Read one results csv and replace every '.' in its column names with '_'.

    Later cells refer to the columns by the underscored names
    (e.g. Input_var_arrays, Answer_noun_part1).
    '''
    frame = pd.read_csv(path)
    frame.columns = [c.replace('.', '_') for c in frame.columns]
    return frame


train_file = home + "/Research/protocols/data/noun_results_train_jun25.csv"
raw_train_file = _read_results(train_file)

train_file_c = home + "/Research/protocols/data/noun_results_train_aug09.csv"
raw_train_file_c = _read_results(train_file_c)

devte_file = home + "/Research/protocols/data/noun_results_devte_jun25.csv"
raw_devte_file = _read_results(devte_file)

devte_file_c = home + "/Research/protocols/data/noun_results_devte_aug09.csv"
raw_devte_file_c = _read_results(devte_file_c)

print(len(raw_train_file), len(raw_train_file_c), len(raw_devte_file), len(raw_devte_file_c))

# Stack train + dev/test for each batch, then stack the two batches.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
raw_data_file = pd.concat([raw_train_file, raw_devte_file], ignore_index=True)
raw_data_file_c = pd.concat([raw_train_file_c, raw_devte_file_c], ignore_index=True)
raw_data_file = pd.concat([raw_data_file, raw_data_file_c], ignore_index=True)
print(len(raw_data_file))

# Annotators listed in the rerun file; their rows are collected here and are
# filtered out of raw_data_file in a later cell.
hits = pd.read_csv(home + "/Research/protocols/data/noun_hits_rerun.tsv", sep="\t")
hits_to_rerun = raw_data_file[raw_data_file.WorkerId.isin(hits.annotator.values.tolist())]

# Results of the two rerun batches, stacked into a single frame.
rerun_file = home + "/Research/protocols/data/noun_results_sep18.csv"
rerun_data = _read_results(rerun_file)

rerun_file_2 = home + "/Research/protocols/data/noun_results_oct2.csv"
rerun_data_2 = _read_results(rerun_file_2)

rerun_data = pd.concat([rerun_data, rerun_data_2], ignore_index=True)

2985 26 2189 18
5218


In [3]:
def extract_dataframe(data):
    '''
    Flatten an MTurk results frame into one row per annotated item.

    Input: Pandas csv dataframe obtained from MTurk. It must provide the
           columns Input_var_arrays (a JSON list of dicts per row), WorkerId,
           HITId, AssignmentStatus and, for the k-th dict of a row (k starting
           at 1), the six Answer_noun_* columns suffixed with k.

    Output: Pandas dataframe with one row per (input row, list item), holding
            the keys of the item's dict plus part/part_conf, kind/kind_conf,
            abs/abs_conf, worker_id, hit_id and status.

    Side effect: a "dicts" column with the parsed JSON is added to `data`.
    '''
    data["dicts"] = data["Input_var_arrays"].map(json.loads)

    # (output key, answer-column prefix); the 1-based item number is appended
    # to the prefix to obtain the column that holds the answer.
    answer_fields = (
        ('part', "Answer_noun_part"),
        ('part_conf', "Answer_noun_part_certainty"),
        ('kind', "Answer_noun_class"),
        ('kind_conf', "Answer_noun_class_certainty"),
        ('abs', "Answer_noun_abs"),
        ('abs_conf', "Answer_noun_abs_certainty"),
    )

    records = []
    for assignment in data.itertuples():
        for item_number, item in enumerate(assignment.dicts, start=1):
            record = dict(item)
            for key, prefix in answer_fields:
                record[key] = getattr(assignment, prefix + str(item_number))
            record['worker_id'] = assignment.WorkerId
            record['hit_id'] = assignment.HITId
            record['status'] = assignment.AssignmentStatus
            records.append(record)

    return pd.DataFrame(records)

In [4]:
# Remove rejected HITS and bad HITS
# NOTE: raw_data_file is rebound here, so re-running this cell on its own adds
# rerun_data a second time; re-run from the loading cell instead.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
raw_data_file = pd.concat([raw_data_file, rerun_data], ignore_index=True)
raw_data_file = raw_data_file[raw_data_file.AssignmentStatus != "Rejected"]
# Discard every assignment done by an annotator listed in the rerun file.
raw_data_file = raw_data_file[~raw_data_file.WorkerId.isin(hits.annotator.values.tolist())]
raw_data = extract_dataframe(raw_data_file)
raw_data = raw_data.reset_index(drop=True)

raw_data['noun_token'] = pd.to_numeric(raw_data['noun_token'])
raw_data['noun_token'] += 1                 # Changing from 0-index to 1-index

# Add a column for unique identification of each annotation:
# "<sent_id>_<1-indexed noun token>"
raw_data['sent_noun'] = raw_data['sent_id'] + "_" + raw_data['noun_token'].astype(str)

### Remove duplicate annotations (due to padding)

In [5]:
# Count the annotations per (sentence, token) id and print every id whose count
# is not the one this check expects: 1 for ids starting with 'en-ud-train',
# 3 for all other ids.
sid_counts = Counter(raw_data['sent_noun'])
for sent_noun_id, n_annotations in sid_counts.items():
    expected = 1 if sent_noun_id[:11] == 'en-ud-train' else 3
    if n_annotations != expected:
        print(sent_noun_id, n_annotations)

en-ud-train.conllu sent_12541_2 2
en-ud-train.conllu sent_12541_10 2
en-ud-train.conllu sent_12541_22 2
en-ud-train.conllu sent_12542_2 2
en-ud-train.conllu sent_12542_17 2
en-ud-train.conllu sent_12542_19 2
en-ud-train.conllu sent_12543_13 2
en-ud-train.conllu sent_12543_18 2
en-ud-train.conllu sent_12543_19 2
en-ud-test.conllu sent_2075_3 6
en-ud-train.conllu sent_12104_20 2
en-ud-train.conllu sent_12200_21 2
en-ud-train.conllu sent_12217_8 2
en-ud-train.conllu sent_12253_10 2
en-ud-train.conllu sent_12281_27 2
en-ud-train.conllu sent_12365_20 2
en-ud-test.conllu sent_919_28 6
en-ud-test.conllu sent_989_19 6
en-ud-test.conllu sent_1004_20 6
en-ud-test.conllu sent_1022_27 6
en-ud-test.conllu sent_1025_21 6
en-ud-test.conllu sent_1060_24 6
en-ud-test.conllu sent_1339_12 6
en-ud-test.conllu sent_1464_57 6


In [6]:
# sent_noun ids (token part is 1-indexed) that carry surplus annotations.
duplicate_train_sents = ['en-ud-train.conllu sent_12541_2', 'en-ud-train.conllu sent_12541_10', 
                         'en-ud-train.conllu sent_12541_22', 'en-ud-train.conllu sent_12542_2',
                         'en-ud-train.conllu sent_12542_17', 'en-ud-train.conllu sent_12542_19', 
                         'en-ud-train.conllu sent_12543_13', 'en-ud-train.conllu sent_12543_18', 
                         'en-ud-train.conllu sent_12543_19', 'en-ud-train.conllu sent_12104_20', 
                         'en-ud-train.conllu sent_12200_21', 'en-ud-train.conllu sent_12217_8',
                         'en-ud-train.conllu sent_12253_10', 'en-ud-train.conllu sent_12281_27', 
                         'en-ud-train.conllu sent_12365_20']
duplicate_dev_sents = ['en-ud-test.conllu sent_2075_3', 'en-ud-test.conllu sent_919_28',
                       'en-ud-test.conllu sent_989_19', 'en-ud-test.conllu sent_1004_20',
                       'en-ud-test.conllu sent_1022_27', 'en-ud-test.conllu sent_1025_21',
                       'en-ud-test.conllu sent_1060_24', 'en-ud-test.conllu sent_1339_12',
                       'en-ud-test.conllu sent_1464_57']

print(raw_data.shape)
# Collect the first three rows of every listed dev/test id and the first row of
# every listed train id, then drop them in one go. Slicing ([:1]) instead of
# indexing ([0]) keeps this from raising when an id has no rows left.
surplus_rows = []
for sent_noun_id in duplicate_dev_sents:
    surplus_rows.extend(raw_data[raw_data['sent_noun'] == sent_noun_id].index[:3])
for sent_noun_id in duplicate_train_sents:
    surplus_rows.extend(raw_data[raw_data['sent_noun'] == sent_noun_id].index[:1])
raw_data = raw_data.drop(index=surplus_rows)

# Rearrange the columns
cols = ['hit_id', 'worker_id','sent_noun', 'noun', 'sent_id','noun_token','part','part_conf',
        'kind','kind_conf','abs','abs_conf']
# .copy() so the columns added below go onto an independent frame rather than
# onto a slice of raw_data.
data = raw_data[cols].copy()

# Lemma extraction
import re
# Same directory the PredPatt cell reads from (home + '/Downloads' + '/UD_English-r1.2/...').
ud_path = home + "/Downloads/UD_English-r1.2/"

files = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
# "<file name> sent_<n>" -> list of the third tab-separated field (the lemma
# column) of each line of the n-th blank-line-terminated block of that file.
lemmas = {}
for file in files:
    with open(ud_path + file, 'r', encoding='utf-8') as f:
        iden = 0
        words = []
        for line in f:
            if line != "\n":
                words.append(line.split("\t")[2])
            else:
                iden += 1
                sent_id = file + " sent_" + str(iden)
                lemmas[sent_id] = words
                words = []
        if words:
            # Last block was not followed by a blank line; keep it as well.
            iden += 1
            sent_id = file + " sent_" + str(iden)
            lemmas[sent_id] = words
            words = []

# noun_token is 1-indexed, the per-sentence lemma lists are 0-indexed.
data['lemma'] = [lemmas[sid][int(tok) - 1]
                 for sid, tok in zip(data['sent_id'], data['noun_token'])]
print(data.shape)

(51410, 18)
(51368, 13)


# Find the Argument and Predicate Span & Root

In [7]:
# Parse the three UD English files with PredPatt and record, for every
# extracted argument, its token span plus the root and span of the predicate it
# was collected under. The result is joined onto `data` via the 'sent_noun' key.
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
from os.path import expanduser
# Paths relative to home + '/Downloads'.
files = ['/UD_English-r1.2/en-ud-train.conllu',
         '/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']

# List of ("<file name> <sentence id>", PredPatt instance) pairs.
parsed = []

options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)  # Resolve relative clause
# Lower-cased pronouns that are kept; any other PRON-rooted argument is skipped below.
prons_incl = ["you", "they", "yourself", "themselves", "them", "themself",
              "theirself", "theirselves"]
for file in files:
    path = home + '/Downloads' + file
    with open(path, 'r') as infile:
        data_in = infile.read()
        # file[17:] drops the 17-character '/UD_English-r1.2/' prefix, leaving the
        # bare file name, so ids look like "en-ud-train.conllu <sent_id>".
        parsed += [(file[17:] + " " + sent_id, PredPatt(ud_parse, opts=options)) for
            sent_id, ud_parse in load_conllu(data_in)]
# "<file name> <sent_id>_<1-indexed argument root position>" ->
# (argument span, predicate root position, predicate span); positions are
# 1-indexed and spans are comma-joined strings.
id_to_span = {}
for sent_id, parse_sen in parsed:
    # Parallel lists: sent_pred_poss[i] describes the predicate under which
    # sent_args[i] was collected.
    sent_args = []
    sent_pred_poss = []
    for predicate in parse_sen.instances:
        # A predicate whose root is tagged NOUN is itself treated as an argument.
        if predicate.root.tag in ["NOUN"]:
            all_args = predicate.arguments + [predicate]
        else:
            all_args = predicate.arguments
        # Positions already collected from earlier predicates of this sentence.
        # This snapshot is taken once per predicate and is not updated while the
        # current predicate's own arguments are appended.
        sent_check = [x.position for x in sent_args]
        for each_arg in all_args:
            if each_arg.position not in sent_check:
                sent_args.append(each_arg)
                sent_pred_poss.append((str(predicate.root.position + 1), ",".join(map(str, [(t.position + 1) for t in predicate.tokens]))))
    for ij, argument in enumerate(sent_args):
        # NOTE(review): this `if` only guards the pronoun filter. Arguments whose
        # root tag is outside the list are NOT skipped; they fall through to the
        # code below like every other argument. Confirm that this is intended.
        if argument.root.tag in ["DET", "NUM", "NOUN", "PROPN", "PRON"]:
            if argument.root.tag == "PRON":
                if argument.root.text.lower() not in prons_incl:
                    continue
        noun_span = [(t.position + 1) for t in argument.tokens]      # 0-index to 1-index
        pred_context_root = sent_pred_poss[ij][0]
        pred_context_span = sent_pred_poss[ij][1]
        predpatt_id = sent_id + "_" + str(argument.root.position + 1)
        # A later argument with the same root position overwrites an earlier entry.
        id_to_span[predpatt_id] = (",".join(map(str, noun_span)), pred_context_root, pred_context_span)

# Attach the three span fields to every annotation; a 'sent_noun' value that is
# missing from id_to_span raises KeyError here.
data.loc[:, 'noun_span'] = data.apply(lambda x: id_to_span[x['sent_noun']][0], axis=1)
data.loc[:, 'pred_context_root'] = data.apply(lambda x: id_to_span[x['sent_noun']][1], axis=1)
data.loc[:, 'pred_context_span'] = data.apply(lambda x: id_to_span[x['sent_noun']][2], axis=1)

In [8]:
# Internal column name -> column name written to the output file.
column_renames = {
    'hit_id': 'HIT.ID',
    'worker_id': 'Annotator.ID',
    'sent_id': 'Sentence.ID',
    'sent_noun': 'Sentence.Arg.Token',
    'noun_token': 'Arg.Token',
    'noun_span': 'Arg.Span',
    'lemma': 'Arg.Lemma',
    'pred_context_root': 'Pred.Token',
    'pred_context_span': 'Pred.Span',
    'noun': 'Arg.Word',
    'part': 'Is.Particular',
    'part_conf': 'Part.Confidence',
    'kind': 'Is.Kind',
    'kind_conf': 'Kind.Confidence',
    'abs': 'Is.Abstract',
    'abs_conf': 'Abs.Confidence',
}
data = data.rename(columns=column_renames)

# Sentence ids begin with the file name ("en-ud-<split>.conllu ..."), so
# characters 6-10 are 'train', 'dev.c' or 'test.'; stripping trailing '.' and
# 'c' characters leaves the bare split name.
split_names = data['Sentence.ID'].str[6:11].str.rstrip('.c')
data = data.assign(Split=split_names)

# Final column order of the raw output.
cols = ['Split', 'HIT.ID', 'Annotator.ID','Sentence.ID', 'Arg.Token', 'Arg.Span',
        'Sentence.Arg.Token', 'Pred.Token', 'Pred.Span', 'Arg.Word', 'Arg.Lemma',
        'Is.Particular','Part.Confidence', 'Is.Kind','Kind.Confidence', 'Is.Abstract',
        'Abs.Confidence']
data = data[cols]

raw_output_path = home + '/Research/protocols/data/FINAL_arg_raw_data.tsv'
data.to_csv(raw_output_path, sep="\t", index=False)
print(len(data))

51368


## Convert to long form

In [9]:
import re
# Columns of the long-form output (no HIT.ID and no Sentence.Arg.Token).
long_cols = ['Split', 'Annotator.ID','Sentence.ID', 'Arg.Token', 'Arg.Span', 'Pred.Token',
             'Pred.Span', 'Arg.Word', 'Arg.Lemma', 'Is.Particular', 'Part.Confidence', 'Is.Kind',
             'Kind.Confidence', 'Is.Abstract','Abs.Confidence']

long_data = data.copy()

# Replace each annotator id by an integer 1..N. The ids are numbered in sorted
# order so that the mapping, and therefore the written file, is the same on
# every run; iterating a set of strings gives a different order per kernel
# because of string hash randomization.
ann_hash = {}
annid = 0
for annid, ann in enumerate(sorted(long_data['Annotator.ID'].unique()), start=1):
    ann_hash[ann] = annid
long_data['Annotator.ID'] = long_data['Annotator.ID'].map(ann_hash)
long_data = long_data[long_cols]
print(len(long_data))
long_data.to_csv(home + '/Research/protocols/data/FINAL_arg_long_data.tsv', sep="\t", index=False)

51368


In [10]:
# Key every annotation by "<Sentence.ID>_<Arg.Token>" and report how many
# distinct keys (annotated items) there are.
long_data['Unique.ID'] = [
    str(sentence_id) + "_" + str(arg_token)
    for sentence_id, arg_token in zip(long_data['Sentence.ID'], long_data['Arg.Token'])
]
print(long_data['Unique.ID'].nunique())

37146
