This notebook processes results from the norming experiment. More information about how the data file is organized can be found at http://spellout.net/latest_ibex_manual.pdf
The columns most relevant for us are `Field` (the image identifier) and `Value` (the label the participant provided for that image).

In [38]:
import pandas as pd
import numpy as np
import matplotlib as plt
from nltk.metrics import edit_distance
from collections import Counter
import json

In [26]:
# Read the raw results file: it has no header row, so column names are supplied
# explicitly, and text following '#' is treated as a comment by the parser.
column_names = ['Time', 'IPHash', 'Controller', 'Item', 'Element',
                'Type', 'Group', 'Field', 'Value']
data = pd.read_csv("../Data/Main/norming_results.txt",
                   comment='#',
                   header=None,
                   names=column_names)

# Normalize responses: lower-case first, then trim surrounding whitespace.
data['Value'] = data['Value'].str.lower().str.strip()
data.head()

Unnamed: 0,Time,IPHash,Controller,Item,Element,Type,Group,Field,Value
0,1512400181,db217f722ff04706309805a1bed8ff84,Form,0,0,intro,,_REACTION_TIME_,3503
1,1512400181,db217f722ff04706309805a1bed8ff84,Form,24,0,image,,imageLabel3680685,tamborine
2,1512400181,db217f722ff04706309805a1bed8ff84,Form,24,0,image,,_REACTION_TIME_,6931
3,1512400181,db217f722ff04706309805a1bed8ff84,Form,149,0,image,,imageLabel3682968,lawn mower
4,1512400181,db217f722ff04706309805a1bed8ff84,Form,149,0,image,,_REACTION_TIME_,5846


In [27]:
# Verify that every participant is a native speaker: the expression below
# evaluates to True as soon as one 'language' response is not 'native'.
is_language_row = data['Field'] == 'language'
lang_responses = data['Value'][is_language_row].tolist()
any(response != 'native' for response in lang_responses)
# TODO: exclude non-native speakers if present

False

In [28]:
# Optional: check comments
#data['Value'][data['Field'] == 'comments'].tolist() 

In [29]:
def ignore_spelling(word_counter, distance_threshold=3):
    """Fold likely misspellings of the most frequent label into that label.

    Args:
        word_counter: a collections.Counter mapping label -> number of responses.
        distance_threshold: exclusive upper bound on the edit distance. A label
            whose distance d to the most frequent label satisfies
            0 < d < distance_threshold is treated as a misspelling of it.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        A new Counter in which the counts of the misspelled labels have been
        added to the most frequent label and the misspelled labels removed.
        The input counter is left untouched. An empty input gives an empty
        Counter instead of raising IndexError.
    """
    merged = word_counter.copy()
    if not word_counter:
        # Nothing to merge; most_common(1)[0] would fail on an empty counter.
        return merged

    best_label = word_counter.most_common(1)[0][0]
    # Iterate over the original counter so that deleting from `merged` is safe.
    for label, count in word_counter.items():
        if label == best_label:
            # Distance 0 is never merged, so skip the distance computation.
            continue
        if 0 < edit_distance(label, best_label) < distance_threshold:
            merged[best_label] += count
            del merged[label]
    return merged

In [30]:
# Sanity check: run ignore_spelling on the responses given for one image
is_test_image = data['Field'] == 'imageLabel3680685'
labels = data['Value'][is_test_image].tolist()
labels_count = Counter(labels)
ignore_spelling(labels_count)

Counter({'bangle': 1,
         'cymbal': 2,
         'instrument': 3,
         'none': 2,
         'tambaray': 1,
         'tamberin': 1,
         'tambourine': 26})

In [31]:
# ACTUAL ANALYSIS

# Minimum share of responses that must agree on one label for an image to be kept
QUALITY_THRESHOLD = 0.8

# Every distinct Field value is treated as an image identifier,
# except for these bookkeeping fields.
images = data['Field'].unique().tolist()
for non_image_field in ('_REACTION_TIME_', 'comments', 'language', 'easiness'):
    images.remove(non_image_field)
total_images = len(images)

good_images = []
for image in images:
    # all labels participants gave for this image
    labels = data['Value'][data['Field'] == image].tolist()
    num_labels = len(labels)
    # tally the labels, then fold spelling variants into the most frequent one
    labels_count = ignore_spelling(Counter(labels))
    best_label, best_count = labels_count.most_common(1)[0]
    # explicit float so the ratio is a true division on any Python version
    if float(best_count) / num_labels >= QUALITY_THRESHOLD:
        good_images.append([image, best_label])

print(len(good_images))
print(total_images)

164
294


In [32]:
def change_to_index(label_list):
    """Convert [image_id, label] pairs into index -> label dictionaries.

    Args:
        label_list: list of [image_id, label] pairs, where image_id is a string
            of the form 'imageLabel<number>'.

    Returns:
        A two-element list [match_dict, object_dict]. Each dict maps an integer
        index to the label. Numbers at or above the object start number go to
        object_dict, all others go to match_dict.
    """
    # NOTE(review): presumably the numbers of the first 'match' and first
    # 'object' image, with consecutive images 3 apart - confirm against the
    # experiment definition.
    match_start_index = 3680562
    object_start_index = 3682968
    match_dict = {}
    object_dict = {}

    for image_id, label in label_list:
        # strip the textual prefix to get the numeric part of the identifier
        number = int(image_id.replace('imageLabel', ''))
        # Floor division keeps the index an int on Python 3 as well; with '/'
        # the keys become floats (e.g. 107.0) and are serialized that way.
        if number >= object_start_index:
            index = (number - object_start_index) // 3
            object_dict[index] = label
        else:
            index = (number - match_start_index) // 3
            match_dict[index] = label
    return [match_dict, object_dict]

def leave_only_pairs(dict1, dict2):
    """Keep only the entries whose key occurs in both dictionaries.

    Returns a two-element list [filtered_dict1, filtered_dict2]; the inputs
    themselves are not modified.
    """
    shared_from_first = {key: value for key, value in dict1.items() if key in dict2}
    shared_from_second = {key: value for key, value in dict2.items() if key in dict1}
    return [shared_from_first, shared_from_second]

def merge_dicts(dict1, dict2):
    """Combine two dicts into one mapping key -> [dict1 value, dict2 value].

    Pre: dicts have the same set of keys (a key of dict1 that is missing
    from dict2 raises KeyError).
    """
    return {key: [value, dict2[key]] for key, value in dict1.items()}

In [33]:
# Split the accepted images into match/object dicts keyed by index,
# drop indices that lack either half, then join the two labels per index.
match_dict, object_dict = change_to_index(good_images)
match_dict, object_dict = leave_only_pairs(match_dict, object_dict)
label_dict = merge_dicts(match_dict, object_dict)

In [34]:
# exclusion indices (mostly based on unequal object size, e.g. "motorcycle and helmet")
to_exclude = [319, 168, 303, 177, 53, 340, 2, 356, 385, 19, 398, 391, 312, 111, 124, 18]

# pop() with a default rather than `del`, so the cell can be re-run (or handle an
# index that is already absent from label_dict) without raising KeyError.
for index in to_exclude:
    label_dict.pop(index, None)

In [35]:
# Show the remaining mapping: index -> [match label, object label]
print(label_dict)

{107: ['pipe', 'cigar'], 135: ['book', 'typewriter'], 392: ['shark', 'dolphin'], 394: ['skateboard', 'scooter'], 11: ['starfish', 'fish'], 272: ['slingshot', 'bow'], 145: ['strawberry', 'raspberry'], 156: ['toaster', 'microwave'], 31: ['football', 'basketball'], 161: ['chair', 'pillow'], 162: ['door', 'window'], 36: ['toaster', 'bread'], 46: ['starfish', 'seahorse'], 180: ['lion', 'tiger'], 54: ['screwdriver', 'hammer'], 287: ['snowflake', 'ornament'], 188: ['flashlight', 'lamp'], 194: ['bowl', 'plate'], 71: ['turtle', 'fish'], 336: ['typewriter', 'keyboard'], 211: ['screw', 'hinge'], 213: ['pipe', 'hookah'], 344: ['leash', 'dog'], 89: ['desk', 'chair'], 348: ['train', 'bridge'], 97: ['keyboard', 'mouse'], 358: ['donkey', 'camel'], 361: ['lock', 'key'], 363: ['lemon', 'kiwi'], 371: ['bat', 'hat'], 116: ['duck', 'feather'], 117: ['medal', 'trophy'], 118: ['eagle', 'owl'], 105: ['guitar', 'microphone'], 122: ['monitor', 'printer'], 254: ['spoon', 'knife'], 127: ['lobster', 'shrimp']}


In [39]:
# Persist the final label pairs as JSON (JSON object keys are always written as strings).
with open('../Data/Processed/label_dict.txt', 'w') as output_file:
    json.dump(label_dict, output_file)