In [4]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
from collections import Counter, defaultdict
%load_ext autoreload

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20.0, 10.0)

import re
from copy import deepcopy
import json
from tqdm import tqdm
from boto.mturk.qualification import PercentAssignmentsApprovedRequirement, Qualifications, Requirement

from mturk_utils.mturk import pickle_this, unpickle_this

# load and basic processing

In [16]:
def create_result(assmt):
    """Flatten one MTurk assignment into a plain dict of annotation fields.

    The first answer field of ``assmt`` is parsed as JSON and must provide an
    ``image_url`` entry and a ``description`` list of selected-word tokens.
    Tokens are split on ``'_'``: the first piece is treated as the word and
    the remaining pieces as its location (presumably sentence index and word
    index -- confirm against the annotation UI). A bare ``'None'`` token has
    no location and is padded to ``'None_0_0'`` so it splits like the others.

    Args:
        assmt: assignment object exposing ``answers``, ``AssignmentId``,
            ``HITId`` and ``WorkerId`` (presumably a boto MTurk assignment).

    Returns:
        dict with keys ``image_id``, ``object_words_raw`` (tokens ordered by
        location), ``object_words``, ``object_locs`` (lists of index
        strings), ``asgmt_id``, ``hit_id`` and ``worker_id``.
    """
    def _location_key(token):
        # Compare indices as numbers: joining the index strings and comparing
        # them as text would put "0_10" before "0_2" and make "1_10" tie with
        # "11_0". Non-numeric pieces sort after numeric ones instead of raising.
        return [(0, int(part), '') if part.isdigit() else (1, 0, part)
                for part in token.split('_')[1:]]

    raw_result = json.loads(assmt.answers[0][0].fields[0])

    # Pad only the bare 'None' marker; a substring replace would also corrupt
    # tokens that already carry a location (e.g. 'None_1_3').
    tokens = ['None_0_0' if w == 'None' else w for w in raw_result['description']]
    tokens.sort(key=_location_key)

    # Key order is kept stable because it determines DataFrame column order.
    result = {}
    result['image_id'] = raw_result['image_url']
    result['object_words_raw'] = tokens
    result['object_words'] = [token.split('_')[0] for token in tokens]
    result['object_locs'] = [token.split('_')[1:] for token in tokens]
    result['asgmt_id'] = assmt.AssignmentId
    result['hit_id'] = assmt.HITId
    result['worker_id'] = assmt.WorkerId
    return result

In [17]:
# Pickled MTurk results for this batch. The code below calls .values() on the
# loaded object and iterates each value, so it is expected to be a mapping of
# lists of assignments.
# NOTE(review): absolute, user-specific path -- the notebook only runs on the
# original author's machine; consider a configurable data directory.
# NOTE(review): unpickle_this presumably wraps pickle.load; only open files
# from a trusted source.
turk_data_path = '/Users/schwenk/wrk/animation_gan/turk_annotation_task/stage_4/pickled_data/additional_batch_1_4a_update_10_20.pkl'
turk_data = unpickle_this(turk_data_path)

In [42]:
# Flatten the per-key lists stored in turk_data into one list of assignments
# (keys are presumably HIT ids -- confirm), then convert every assignment into
# a flat result dict.
assignments = [assmt for batch in turk_data.values() for assmt in batch]
assignment_results = list(map(create_result, assignments))

# analysis

In [49]:
def overlap(row):
    """Return the tokens selected by at least two annotators of one image.

    ``row`` holds one set of tokens per annotator. For three annotators the
    result is the union of the three pairwise intersections, which is exactly
    the set of tokens that occur in two or more of the sets; counting
    occurrences extends that rule to any number of annotators.

    Missing annotators are ignored: ``unstack()`` pads shorter groups with
    NaN (not ``None``), so any entry that is not a set contributes nothing
    instead of raising. The input row is not modified.

    Args:
        row: iterable (e.g. a pandas Series) of sets, possibly containing
            ``None`` / NaN placeholders.

    Returns:
        list of agreed tokens (empty when no two annotators agree). The order
        of the list is not significant.
    """
    votes = Counter()
    for annotation in row:
        if isinstance(annotation, (set, frozenset)):
            # Each annotator contributes at most one vote per token.
            votes.update(annotation)
    return [token for token, n_votes in votes.items() if n_votes >= 2]

def distill_objects(obj_idx_list):
    """Merge runs of adjacent selected words into multi-word descriptors.

    Each token is split on ``'_'`` into a word followed by two integer
    indices (sentence number, word number). Tokens are ordered by those
    indices, and a token whose word number directly follows the previous one
    in the same sentence is appended to the previous descriptor.

    Args:
        obj_idx_list: iterable of ``'<word>_<sentence>_<word index>'`` tokens.
            May be empty (e.g. when no annotators agreed on anything).

    Returns:
        ``(spans, descriptors)``: ``spans`` is a list of
        ``(sentence, word index)`` tuples holding the position of the LAST
        word of each descriptor, ``descriptors`` the matching space-joined
        words. Both lists are empty for empty input.

    Raises:
        ValueError / IndexError: if a token does not carry two integer
            indices after the word.
    """
    object_coords = []
    for word_position in obj_idx_list:
        split_components = word_position.split('_')
        object_coords.append(
            (split_components[0], int(split_components[1]), int(split_components[2]))
        )

    # Nothing selected: return empty results instead of failing on the
    # "seed with the first token" step below.
    if not object_coords:
        return [], []

    object_coords.sort(key=lambda coord: (coord[1], coord[2]))

    combined_objects = [object_coords[0]]
    for this_word, this_sent, this_wn in object_coords[1:]:
        last_word, last_sent, last_wn = combined_objects[-1]
        if this_sent == last_sent and last_wn + 1 == this_wn:
            # Extend the current descriptor and advance its position to the
            # word just consumed so the next adjacency check is against it.
            combined_objects[-1] = (' '.join([last_word, this_word]), this_sent, this_wn)
        else:
            combined_objects.append((this_word, this_sent, this_wn))

    return [w[1:] for w in combined_objects], [w[0] for w in combined_objects]

In [44]:
# One row per assignment.
st4_df = pd.DataFrame(assignment_results)
# Compare selections as sets so token order does not matter.
st4_df['obj_set'] = st4_df['object_words_raw'].apply(lambda x: set(x))
grouped_by_image = st4_df.groupby('image_id')
# Per image and per column, compute the most common value.
# NOTE(review): this relies on scipy.stats.mode accepting object arrays
# (sets, strings); newer SciPy releases restrict mode to numeric input --
# confirm the SciPy version this notebook is run with.
agged_on_mode = grouped_by_image.agg(lambda x: st.mode(x))
# x[1][0] is the first entry of the second field of the mode result, i.e. how
# many annotators produced the modal selection.
agged_on_mode['mode_count'] = agged_on_mode['obj_set'].apply(lambda x: x[1][0])
# Consensus = the modal selection was given by 2 or 3 annotators.
# .copy() makes cons_df an independent frame: it is assigned into below and in
# later cells, which on a boolean-mask slice triggers SettingWithCopyWarning
# and is not guaranteed to stick.
cons_df = agged_on_mode[agged_on_mode['mode_count'].isin([2, 3])].copy()
cons_df['image_id'] = cons_df.index

In [45]:
# Images where the modal selection was given by a single annotator only,
# i.e. no two annotators produced the same set of tokens.
noncon_df = agged_on_mode.loc[agged_on_mode['mode_count'].isin([1])]
noncon_vids = {vid for vid in noncon_df.index.tolist()}

In [46]:
# For consensus images, take the modal token set (mode_result[0][0]) and order
# its tokens by (sentence index, word index). The indices are compared as
# integers: comparing the raw strings would place word 10 before word 2.
# assign() returns a new frame, so this never writes into a slice of
# agged_on_mode and the cell can be re-run safely.
cons_df = cons_df.assign(
    con_objects=cons_df['obj_set'].apply(
        lambda mode_result: sorted(
            mode_result[0][0],
            key=lambda token: tuple(int(part) for part in token.split('_')[1:3]),
        )
    )
)
# Re-select the raw per-assignment rows of the non-consensus images so the
# individual annotators' sets can be compared pairwise.
noncon_df = st4_df[st4_df['image_id'].isin(noncon_vids)]
temp_grouped_by_image = noncon_df.groupby('image_id')

In [47]:
# Pivot the non-consensus annotations to one row per image with one column per
# annotator (columns are the positional index of each annotator's set).
noncon_grouped = (
    temp_grouped_by_image['obj_set']
    .apply(lambda group: pd.Series(group.values))
    .unstack()
)
# Reduce each row to the tokens shared between annotators.
noncon_subsets_choices = noncon_grouped.apply(overlap, axis=1)

# image_id -> token list, for both kinds of image. Entries from the
# non-consensus lookup win if an id appears in both.
noncon_lookup = noncon_subsets_choices.to_dict()
con_lookup = cons_df['con_objects'].to_dict()
combined_lookup = dict(con_lookup)
combined_lookup.update(noncon_lookup)

In [50]:
# Turn every image's agreed token list into merged descriptors plus the
# position tuples that distill_objects reports for them.
object_data = {}
for image_key, agreed_tokens in combined_lookup.items():
    span_list, descriptor_list = distill_objects(agreed_tokens)
    object_data[image_key] = {'spans': span_list, 'descriptors': descriptor_list}

In [51]:
# Persist the per-image spans/descriptors; the relative path resolves against
# the notebook's current working directory.
batch_output_path = 'batch_res.pkl'
pickle_this(object_data, batch_output_path)