In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from scipy.stats import pearsonr as pearson
from scipy.stats import spearmanr as spearman
from math import isnan
from collections import Counter
pd.set_option('chained_assignment',None)         # Turn off those dumb annoying warnings
pd.set_option('display.max_columns', None)       # Turns off pandas truncating data

In [2]:
# Read all the various csv files 
train_file = "../../../data/noun_results_train_jun25.csv"
raw_train_file = pd.read_csv(train_file)
raw_train_file.columns = [c.replace('.', '_') for c in raw_train_file.columns]

train_file_c = "../../../data/noun_results_train_aug09.csv"
raw_train_file_c = pd.read_csv(train_file_c)
raw_train_file_c.columns = [c.replace('.', '_') for c in raw_train_file_c.columns]

devte_file = "../../../data/noun_results_devte_jun25.csv"
raw_devte_file = pd.read_csv(devte_file)
raw_devte_file.columns = [c.replace('.', '_') for c in raw_devte_file.columns]

devte_file_c = "../../../data/noun_results_devte_aug09.csv"
raw_devte_file_c = pd.read_csv(devte_file_c)
raw_devte_file_c.columns = [c.replace('.', '_') for c in raw_devte_file_c.columns]

print(len(raw_train_file), len(raw_train_file_c), len(raw_devte_file), len(raw_devte_file_c))


raw_data_file = raw_train_file.append(raw_devte_file, ignore_index=True)
raw_data_file_c = raw_train_file_c.append(raw_devte_file_c, ignore_index=True)
raw_data_file = raw_data_file.append(raw_data_file_c, ignore_index=True)
print(len(raw_data_file))

hits = pd.read_csv('noun_hits_rerun.tsv', sep="\t")
hits_to_rerun = raw_data_file[raw_data_file.WorkerId.isin(hits.annotator.values.tolist())]

rerun_file = "../../../data/noun_results_sep18.csv"
rerun_data = pd.read_csv(rerun_file)
rerun_data.columns = [c.replace('.', '_') for c in rerun_data.columns]

rerun_file_2 = "../../../data/noun_results_oct2.csv"
rerun_data_2 = pd.read_csv(rerun_file_2)
rerun_data_2.columns = [c.replace('.', '_') for c in rerun_data_2.columns]

rerun_data = rerun_data.append(rerun_data_2, ignore_index=True)

2985 26 2189 18
5218


In [3]:
def extract_dataframe(data):
    '''
    Input: Pandas csv dataframe obtained from MTurk
    
    Output: Pandas dataframe levelled by (User x Sentenced_ID)
    '''
    data["dicts"] = data["Input_var_arrays"].map(lambda x: json.loads(x))
    global_list = []
    
    for row in data.itertuples():
        for idx, local_dict in enumerate(row.dicts):
            temp_dict = local_dict.copy()
            var_part = "Answer_noun_part" + str(idx + 1)
            var_part_c = "Answer_noun_part_certainty" + str(idx + 1)
            var_kind = "Answer_noun_class" + str(idx + 1)
            var_kind_c = "Answer_noun_class_certainty" + str(idx + 1)
            var_abs = "Answer_noun_abs" + str(idx + 1)
            var_abs_c = "Answer_noun_abs_certainty" + str(idx + 1)
            temp_dict['part'] = getattr(row, var_part)
            temp_dict['part_conf'] = getattr(row, var_part_c)
            temp_dict['kind'] = getattr(row, var_kind)
            temp_dict['kind_conf'] = getattr(row, var_kind_c)
            temp_dict['abs'] = getattr(row, var_abs)
            temp_dict['abs_conf'] = getattr(row, var_abs_c)
            temp_dict['worker_id'] = row.WorkerId
            temp_dict['hit_id'] = row.HITId
            temp_dict['status'] = row.AssignmentStatus
            global_list.append(temp_dict)
    
    return pd.DataFrame(global_list)

In [4]:
# Remove rejected HITS and bad HITS
raw_data_file = raw_data_file.append(rerun_data, ignore_index=True)
raw_data_file = raw_data_file[raw_data_file.AssignmentStatus != "Rejected"]
raw_data_file = raw_data_file[~raw_data_file.WorkerId.isin(hits.annotator.values.tolist())]
raw_data = extract_dataframe(raw_data_file)
raw_data = raw_data.reset_index(drop=True)

In [5]:
# Add a column for unique identification of each annotation
raw_data.loc[:, 'sent_noun'] = raw_data['sent_id'].map(lambda x : x) + "_" +\
                               raw_data['noun_token'].map(lambda x: str(x))

# Rearrange the columns
cols = ['hit_id', 'worker_id','sent_noun', 'noun', 'sent_id','noun_token','part','part_conf',
        'kind','kind_conf','abs','abs_conf']
data = raw_data[cols]

# Lemma extraction
import re
ud_path = "/Users/venkat/Downloads/UD_English-r1.2/"

files = ['en-ud-train.conllu', 'en-ud-dev.conllu', 'en-ud-test.conllu']
lemmas = {}
for file in files:
    with open(ud_path + file, 'r') as f:
        iden = 0
        a = ""
        words = []
        for line in f:
            if line != "\n":
                words.append(line.split("\t")[2])
            else:
                iden += 1
                sent_id = file + " sent_" + str(iden)
                lemmas[sent_id] = words
                words = []

data.loc[:, 'lemma'] = data.apply(lambda x: lemmas[x.loc['sent_id']][int(x.loc['noun_token'])], axis=1)


# Find the Argument Span

In [6]:
from predpatt import load_conllu
from predpatt import PredPatt
from predpatt import PredPattOpts
from os.path import expanduser
files = ['/UD_English-r1.2/en-ud-train.conllu',
         '/UD_English-r1.2/en-ud-dev.conllu',
         '/UD_English-r1.2/en-ud-test.conllu']
home = expanduser("~/Downloads/")
parsed = []

options = PredPattOpts(resolve_relcl=True, borrow_arg_for_relcl=True, resolve_conj=False, cut=True)  # Resolve relative clause
prons_incl = ["you", "they", "yourself", "themselves", "them", "themself",
              "theirself", "theirselves"]
for file in files:
    path = home + file
    with open(path, 'r') as infile:
        data_in = infile.read()
        parsed += [(file[17:] + " " + sent_id, PredPatt(ud_parse, opts=options)) for
            sent_id, ud_parse in load_conllu(data_in)]
id_to_span = {}
for sent_id, parse_sen in parsed:
    sent_args = []
    sent_pred_poss = []
    for predicate in parse_sen.instances:
        if predicate.root.tag in ["NOUN"]:
            all_args = predicate.arguments + [predicate]
        else:
            all_args = predicate.arguments
        sent_check = [x.position for x in sent_args]
        for each_arg in all_args:
            if each_arg.position not in sent_check:
                sent_args.append(each_arg)
                sent_pred_poss.append(predicate.root.position)
    for ij, argument in enumerate(sent_args):
        if argument.root.tag in ["DET", "NUM", "NOUN", "PROPN", "PRON"]:
            if argument.root.tag == "PRON":
                if argument.root.text.lower() not in prons_incl:
                    continue
        noun_span = [t.position for t in argument.tokens]
        pred_pos = str(sent_pred_poss[ij])
        predpatt_id = sent_id + "_" + str(argument.root.position)
#         from IPython.core.debugger import Tracer
#         Tracer()()
        id_to_span[predpatt_id] = (",".join(map(str, noun_span)), pred_pos)

data.loc[:, 'noun_span'] = data.apply(lambda x: id_to_span[x['sent_noun']][0], axis=1)
data.loc[:, 'pred_pos'] = data.apply(lambda x: id_to_span[x['sent_noun']][1], axis=1)

In [7]:
data = data.rename(columns={'hit_id': 'HIT.ID', 'worker_id':'Annotator.ID', 
                            'sent_id':'Sentence.ID', 'sent_noun': 'Sentence.Arg.Token',
                            'noun_token':'Arg.Token', 'noun_span': 'Arg.Span', 'lemma': 'Arg.Lemma', 'pred_pos': 'Pred.Context',
                            'noun':'Argument', 'part':'Is.Particular', 
                            'part_conf':'Part.Confidence', 'kind':'Is.Kind', 
                            'kind_conf':'Kind.Confidence', 'abs':'Is.Abstract', 
                            'abs_conf':'Abs.Confidence'})

data.loc[:, 'Split'] = data.loc[:, 'Sentence.ID'].str[6:11]
data['Split'] = data['Split'].map(lambda x: x.rstrip('.c'))
cols = ['Split', 'HIT.ID', 'Annotator.ID','Sentence.ID', 'Arg.Token', 'Arg.Span', 'Sentence.Arg.Token', 'Predicate.Context',
        'Argument', 'Arg.Lemma', 'Is.Particular', 'Part.Confidence', 'Is.Kind','Kind.Confidence',
        'Is.Abstract','Abs.Confidence']
data = data[cols]
data.to_csv('../../../data/arg_raw_data.tsv', sep="\t", index=False)
print(len(data))

KeyError: "['Predicate.Context'] not in index"

## Convert to long form

In [None]:
import re
long_cols = ['Split', 'Annotator.ID','Sentence.ID', 'Arg.Token', 'Arg.Span', 'Pred.Context', 'Argument', 'Arg.Lemma',
             'Is.Particular', 'Part.Confidence', 'Is.Kind','Kind.Confidence',
             'Is.Abstract','Abs.Confidence']

long_data = data.copy()
# long_data = long_data.rename(columns={'worker_id':'Annotator.ID', 'sent_id':'Sentence.ID',
#                                       'noun_token':'Noun.Root.Token', 'noun_tokens': 'Noun.Span', 'lemma': 'Noun.Lemma',
#                                       'noun':'Noun', 'part':'Is.Particular', 
#                                       'part_conf':'Part.Confidence', 'kind':'Is.Kind', 
#                                       'kind_conf':'Kind.Confidence', 'abs':'Is.Abstract', 
#                                       'abs_conf':'Abs.Confidence'})

# long_data['Split'] = long_data['Sentence.ID'].str[6:11]
# long_data['Split'] = long_data['Split'].map(lambda x: x.rstrip('.c'))
long_data['Sentence.ID'] = data['Sentence.ID'].map(lambda x: re.findall(r'\d+', x)[0])

ann_hash = {}
annid = 0
for ann in set(long_data['Annotator.ID'].values):
    annid += 1
    ann_hash[ann] = annid
long_data['Annotator.ID'] = long_data['Annotator.ID'].map(ann_hash)
long_data = long_data[long_cols]
print(len(long_data))
long_data.to_csv('../../../data/arg_long_data.tsv', sep="\t", index=False)