## Preprocessing of NP elicitation data 

The data produced by participants in the NP elicitation experiment is cleaned and preprocessed fot the comparison class inference and adjective endorsment experiments.

- comparison class inference: experiments/class-elicitation-free-3.html
    - trial information stored in experiments/js/noun_elicitation_pilot.js -- see for appropriate format
- adjective endorsement task: experiments/adj-endorsements-exp
    - trial information stored in 04_trials.js -- see for appropriate format

In [22]:
import csv
import json
import pandas as pd

In [25]:
# read np production data
file_path = "../data/np-elicitation/np-elicitation-wide-n50.csv"

# drop rows which are in the list of common invalid responses 
raw_data = pd.read_csv(file_path)

raw_data

Unnamed: 0,workerid,trial_num,degree,pos_adj,neg_adj,superordinate,positive,neither_nor,negative
0,s1,1,darkness,light,dark,humans,Europeans,Americans,Africans
1,s1,2,temperature,warm,cold,seasons,Summer days,Spring,Winter days
2,s1,3,size,big,small,fruit,Watermelons,Tomatoes,Kumquats
3,s1,4,speed,fast,slow,motor vehicles,Dragsters,Passenger cars,Scooters
4,s1,5,weight,heavy,light,furniture,Dinner tables,Beds,Foot stools
5,s1,6,hardness,hard,soft,Sports balls,Golf balls,Squash balls,Squash balls
6,s1,7,loudness,loud,quiet,Animals,Elephants,Birds,Snakes
7,s1,8,height,tall,short,Athletes,Basketball players,Golfers,Jockeys
8,s1,9,price,expensive,cheap,Cigars,Arturo Fuente Opus X,Macanudo,Tuscanos
9,s1,10,width,wide,narrow,waterways,Oceans,Waterfalls,Creeks


In [71]:
# drop data with responses that are numeric (column superordinate, positive, neither_nor or negative) or length < 3 characters 
# typical responses to be excluded 
resps_exclude = ['yes', 'no', 'agree', 'like', 'both']

filter_data = raw_data[~raw_data.positive.str.lower().isin(resps_exclude)]
filter_data = filter_data[~filter_data.superordinate.str.lower().isin(resps_exclude)]
filter_data = filter_data[~filter_data.negative.str.lower().isin(resps_exclude)]
filter_data = filter_data[~filter_data.neither_nor.str.lower().isin(resps_exclude)]
filter_data = filter_data[~filter_data.positive.apply(lambda x: len(x) < 3)]
filter_data = filter_data[~filter_data.negative.apply(lambda x: len(x) < 3)]
filter_data = filter_data[~filter_data.neither_nor.apply(lambda x: len(x) < 3)]
filter_data = filter_data[~filter_data.superordinate.apply(lambda x: len(x) < 3)]


Unnamed: 0,workerid,trial_num,degree,pos_adj,neg_adj,superordinate,positive,neither_nor,negative
0,s1,1,darkness,light,dark,humans,Europeans,Americans,Africans
1,s1,2,temperature,warm,cold,seasons,Summer days,Spring,Winter days
2,s1,3,size,big,small,fruit,Watermelons,Tomatoes,Kumquats
3,s1,4,speed,fast,slow,motor vehicles,Dragsters,Passenger cars,Scooters
4,s1,5,weight,heavy,light,furniture,Dinner tables,Beds,Foot stools
5,s1,6,hardness,hard,soft,Sports balls,Golf balls,Squash balls,Squash balls
6,s1,7,loudness,loud,quiet,Animals,Elephants,Birds,Snakes
7,s1,8,height,tall,short,Athletes,Basketball players,Golfers,Jockeys
8,s1,9,price,expensive,cheap,Cigars,Arturo Fuente Opus X,Macanudo,Tuscanos
9,s1,10,width,wide,narrow,waterways,Oceans,Waterfalls,Creeks


In [100]:
# remove rows with responses equal to the adjectives or with equal positive and negative responses 

filter_data = filter_data[~filter_data.apply(lambda x: ((x['pos_adj'] == x['negative']) | (x['pos_adj'] == x['positive'])), axis = 1)]
filter_data = filter_data[~filter_data.apply(lambda x: ((x['neg_adj'] == x['negative']) | (x['neg_adj'] == x['positive'])), axis = 1)]
filter_data = filter_data[~filter_data.apply(lambda x: x['positive'] == x['negative'], axis = 1)]
filter_data = filter_data.sort_values('degree')

In [102]:
# put data into comparison class inference format
filter_data.to_csv("cleaned-np-elicitation-wide-n50.csv", index=False)

In [103]:
# create a json file, to be converted into javascript objects
# context sentences to be added manually

cleaned_data = open('cleaned-np-elicitation-wide-n50.csv')
json_file = open('cleaned-np-elicitation-wide-n50.json', 'w')
header_names = ("worker_id", "stim_id", "degree", "adj_positive", "adj_negative", "superordinate", "positive", "neither_nor", "negative")
reader = csv.DictReader( cleaned_data, header_names)

out = json.dumps([row for row in reader], indent = 4, sort_keys = False)
json_file.write(out)



192898

In [109]:
file = json.load(open('cleaned-np-elicitation-wide-n50.json'))
print(json.dumps(file, indent = 4))

[
    {
        "worker_id": "workerid",
        "stim_id": "trial_num",
        "degree": "degree",
        "adj_positive": "pos_adj",
        "adj_negative": "neg_adj",
        "superordinate": "superordinate",
        "positive": "positive",
        "neither_nor": "neither_nor",
        "negative": "negative"
    },
    {
        "worker_id": "s1",
        "stim_id": "1",
        "degree": "darkness",
        "adj_positive": "light",
        "adj_negative": "dark",
        "superordinate": "humans",
        "positive": "Europeans",
        "neither_nor": "Americans",
        "negative": "Africans "
    },
    {
        "worker_id": "s11",
        "stim_id": "4",
        "degree": "darkness",
        "adj_positive": "light",
        "adj_negative": "dark",
        "superordinate": "Space objects",
        "positive": "Stars",
        "neither_nor": "Planets",
        "negative": "Moons"
    },
    {
        "worker_id": "s44",
        "stim_id": "10",
        "degree": "darkness",
  