In [2]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from scipy.stats import pearsonr as pearson
from scipy.stats import spearmanr as spearman
from math import isnan
from collections import Counter

In [3]:
train_file = "results_train_jun25.csv"
raw_train_file = pd.read_csv(train_file)
raw_train_file.columns = [c.replace('.', '_') for c in raw_train_file.columns]

train_file_c = "results_train_aug09.csv"
raw_train_file_c = pd.read_csv(train_file_c)
raw_train_file_c.columns = [c.replace('.', '_') for c in raw_train_file_c.columns]

devte_file = "results_devte_jun25.csv"
raw_devte_file = pd.read_csv(devte_file)
raw_devte_file.columns = [c.replace('.', '_') for c in raw_devte_file.columns]

devte_file_c = "results_devte_aug09.csv"
raw_devte_file_c = pd.read_csv(devte_file_c)
raw_devte_file_c.columns = [c.replace('.', '_') for c in raw_devte_file_c.columns]

print(len(raw_train_file), len(raw_train_file_c), len(raw_devte_file), len(raw_devte_file_c))


raw_data_file = raw_train_file.append(raw_devte_file, ignore_index=True)
raw_data_file_c = raw_train_file_c.append(raw_devte_file_c, ignore_index=True)
raw_data_file = raw_data_file.append(raw_data_file_c, ignore_index=True)
print(len(raw_data_file))

2985 26 2189 18
5218


In [4]:
def extract_dataframe(data):
    '''
    Input: Pandas csv dataframe obtained from MTurk
    
    Output: Pandas dataframe levelled by (User x Sentenced_ID)
    '''
    data["dicts"] = data["Input_var_arrays"].map(lambda x: json.loads(x))
    global_list = []
    
    for row in data.itertuples():
        for idx, local_dict in enumerate(row.dicts):
            temp_dict = local_dict.copy()
            var_part = "Answer_noun_part" + str(idx + 1)
            var_part_c = "Answer_noun_part_certainty" + str(idx + 1)
            var_kind = "Answer_noun_class" + str(idx + 1)
            var_kind_c = "Answer_noun_class_certainty" + str(idx + 1)
            var_abs = "Answer_noun_abs" + str(idx + 1)
            var_abs_c = "Answer_noun_abs_certainty" + str(idx + 1)
            temp_dict['part'] = getattr(row, var_part)
            temp_dict['part_conf'] = getattr(row, var_part_c)
            temp_dict['kind'] = getattr(row, var_kind)
            temp_dict['kind_conf'] = getattr(row, var_kind_c)
            temp_dict['abs'] = getattr(row, var_abs)
            temp_dict['abs_conf'] = getattr(row, var_abs_c)
            temp_dict['worker_id'] = row.WorkerId
            temp_dict['hit_id'] = row.HITId
            temp_dict['status'] = row.AssignmentStatus
            global_list.append(temp_dict)
    
    return pd.DataFrame(global_list)

In [5]:
raw_data = extract_dataframe(raw_data_file)
raw_data = raw_data[raw_data['status']!='Rejected']
raw_data = raw_data.reset_index(drop=True)
raw_data['sent_noun'] = raw_data['sent_id'].map(lambda x : x) + "_" +\
                           raw_data['noun_token'].map(lambda x: str(x))
# Rearrange the columns
cols = ['hit_id', 'worker_id','sent_noun', 'noun', 'sent_id','noun_token','part','part_conf',
        'kind','kind_conf','abs','abs_conf']
data = raw_data[cols]

long_cols = ['Split', 'Annotator.ID','Sentence.ID', 'Noun.Token','Noun',
             'Is.Particular', 'Part.Confidence', 'Is.Kind','Kind.Confidence',
             'Is.Abstract','Abs.Confidence']

long_data = data.copy()
long_data = long_data.rename(columns={'worker_id':'Annotator.ID', 'sent_id':'Sentence.ID',
                                      'noun_token':'Noun.Token',
                                      'noun':'Noun', 'part':'Is.Particular', 
                                      'part_conf':'Part.Confidence', 'kind':'Is.Kind', 
                                      'kind_conf':'Kind.Confidence', 'abs':'Is.Abstract', 
                                      'abs_conf':'Abs.Confidence'})

long_data['Split'] = long_data['Sentence.ID'].str[6:11]
long_data['Split'] = long_data['Split'].map(lambda x: x.rstrip('.c'))

ann_hash = {}
annid = 0
for ann in set(long_data['Annotator.ID'].values):
    annid += 1
    ann_hash[ann] = annid
long_data['Annotator.ID'] = long_data['Annotator.ID'].map(ann_hash)
long_data = long_data[long_cols]
print(len(long_data))
# long_data.to_csv('noun_long_data.tsv', sep="\t", index=False)

51410


# Majority scoring on dev

In [5]:
attributes = ["part", "kind", "abs"]
attr_map = {"part": "Is.Particular", "kind": "Is.Kind", "abs": "Is.Abstract"}
attr_conf = {"part": "Part.Confidence", "kind": "Kind.Confidence",
             "abs": "Abs.Confidence"}
from statistics import mode
data = long_data[long_data['Split'] != 'test']

response = ["Is.Particular", "Is.Kind", "Is.Abstract"]
response_conf = ["Part.Confidence", "Kind.Confidence", "Abs.Confidence"]

# Convert responses to 1s and 0s
for resp in response:
    data[resp] = data[resp].astype(int)

for resp in response_conf:
    data[resp] = data.groupby('Annotator.ID')[resp].apply(lambda x: x.rank() / (len(x) + 1.))
data_dev = data[data['Split'] == 'dev']
col = data_dev['Sentence.ID'] + "_" + data_dev['Noun.Token'].map(str)
data_dev = data_dev.assign(SentenceIDToken=col.values)
sent_ids = list(set(data_dev['SentenceIDToken'].tolist()))
data_dev_reduced = pd.DataFrame()
i = 0
print(len(sent_ids))
for sent_id in sent_ids:
    i += 1
    print(i, end="\r")
    new_df = data_dev[data_dev['SentenceIDToken'] == sent_id]
    sample = new_df.iloc[0]
    
    for attr in attributes:
        answers = new_df[attr_map[attr]].tolist()
        if all(x == answers[0] for x in answers):
            mode_ans = answers[0]
            new_conf = sum(new_df[attr_conf[attr]].tolist()) / 3
        else:
            mode_ans = mode(answers)
            new_df[new_df[attr_map[attr]] != mode_ans][attr_conf[attr]] = 1 - new_df[new_df[attr_map[attr]] != mode_ans][attr_conf[attr]]
            new_conf = sum(new_df[attr_conf[attr]].tolist()) / 3

        sample[attr_map[attr]] = mode_ans
        sample[attr_conf[attr]] = new_conf

    data_dev_reduced = data_dev_reduced.append(sample)
data_dev_reduced.to_csv('noun_data_dev.tsv', sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


3584
1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


3584

In [8]:
len(data_dev_reduced)

3584