In [2]:
import os
import json
import pandas as pd
import numpy as np
from collections import namedtuple
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)

In [3]:
Vote = namedtuple('Vote', 'response sample time user')
LabeledSample = namedtuple('LabeledSample', 'sample truth')

In [4]:
def readVotes():
    filename = 'braindrles-votes-export.json'
    path = os.path.join('data', filename)
    with open(path) as reader:
        votes = json.load(reader)
    output = []
    for _id in votes:
        vote = votes[_id]
        try:
            output.append(Vote(**vote))
        except Exception as e:
            print("Unable to parse a vote json. Error: ", str(e))
    return output

In [5]:
def readGoldLabels():
    filename = 'braindrles-sampleTruths-export.json'
    path = os.path.join('data', filename)
    with open(path) as reader:
        samples = json.load(reader)
    output = []
    for sample_id in samples:
        sample = LabeledSample(sample=sample_id, truth=samples[sample_id]['truth'])
        output.append(sample)
    return output


In [6]:
labeled_data_raw = readGoldLabels()
votes_raw = readVotes()

Unable to parse a vote json. Error:  __new__() missing 1 required positional argument: 'user'


In [7]:
votes = pd.DataFrame(votes_raw)
labeled_data = pd.DataFrame(labeled_data_raw)

In [8]:
average_rating_by_user = votes.groupby(['sample', 'user']).apply(lambda x: x.mean()[['response']]).reset_index()
average_rating_by_user.head()

Unnamed: 0,sample,user,response
0,gifbrles_031768,bookworm,1.0
1,gifbrles_031769,EvieD,0.333333
2,gifbrles_031769,Linztogo,0.0
3,gifbrles_031769,Regina Phalange,1.0
4,gifbrles_031769,Tatiana,1.0


In [9]:
average_rating_of_image = average_rating_by_user.groupby(['sample']).apply(lambda x: x.mean()[['response']]).reset_index()
average_rating_of_image.head()

Unnamed: 0,sample,response
0,gifbrles_031768,1.0
1,gifbrles_031769,0.448718
2,gifbrles_031770,0.933333
3,gifbrles_031771,1.0
4,gifbrles_031772,0.872727


In [10]:
count_of_users_rated_each_image = average_rating_by_user.groupby(['sample']).apply(lambda x: x.count()[['response']]).reset_index()
count_of_users_rated_each_image.rename(columns={'response': 'count_of_users'}, inplace=True)
count_of_users_rated_each_image.head()

Unnamed: 0,sample,count_of_users
0,gifbrles_031768,1
1,gifbrles_031769,13
2,gifbrles_031770,5
3,gifbrles_031771,5
4,gifbrles_031772,11


In [11]:
count_of_users_rated_each_image.describe()

Unnamed: 0,count_of_users
count,1301.0
mean,4.349731
std,2.778748
min,1.0
25%,2.0
50%,3.0
75%,5.0
max,16.0


In [14]:
positive_subject_names = labeled_data[labeled_data.truth == 1]['sample'].values
negative_subject_names = labeled_data[labeled_data.truth == 0]['sample'].values

average_rating_by_user_with_pos_true_label = average_rating_by_user[average_rating_by_user['sample'].isin(positive_subject_names)]
average_rating_by_user_with_neg_true_label = average_rating_by_user[average_rating_by_user['sample'].isin(negative_subject_names)]

average_rating_by_user_with_pos_true_label['truth'] = 1
average_rating_by_user_with_neg_true_label['truth'] = 0

data_with_true_labels = average_rating_by_user_with_pos_true_label.append(average_rating_by_user_with_neg_true_label, ignore_index = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Pre-processed data form
`data_with_true_labels` is a dataframe which contains the average rating of a user for a particular image and along with the true label in each row. Use this dataframe for all tasks.

In [15]:
data_with_true_labels.head()

Unnamed: 0,sample,user,response,truth
0,gifbrles_031772,Franky,1.0,1
1,gifbrles_031772,Garvita,1.0,1
2,gifbrles_031772,KirstieJane,1.0,1
3,gifbrles_031772,Regina Phalange,1.0,1
4,gifbrles_031772,Tlow,1.0,1


## How accurate a particular user's reponses are

In [27]:
is_pred_correct_df = data_with_true_labels.copy()
is_pred_correct_df['is_correct'] = is_pred_correct_df.apply(lambda row: 1 if row['response'] == row['truth'] else 0, axis=1)
accuracy_df = is_pred_correct_df.groupby(['user']).agg({'is_correct': ['sum', 'count']})

levels = accuracy_df.columns.levels
labels = accuracy_df.columns.labels
accuracy_df.columns = levels[1][labels[1]]
accuracy_df = accuracy_df.reset_index()
accuracy_df = accuracy_df.rename(columns={'count':'total_swipes', 'sum': 'correct_swipes'})
accuracy_df.head()

accuracy_df['accuracy_in_percent'] = (100.0*accuracy_df['correct_swipes'])/accuracy_df['total_swipes']
accuracy_df.sort_values(['accuracy_in_percent', 'total_swipes'], ascending=False)[['user', 'correct_swipes', 'total_swipes', 'accuracy_in_percent']]


  


Unnamed: 0,user,correct_swipes,total_swipes,accuracy_in_percent
1,BrainDK,19,19,100.0
10,Lindamilla,9,9,100.0
9,Kkotake,1,1,100.0
33,mikeb,22,23,95.652174
18,Tlow,73,77,94.805195
16,Tatiana,31,33,93.939394
6,Galothus,60,64,93.75
21,akeshavan,73,79,92.405063
17,Tester12345,12,13,92.307692
45,zavalian,43,48,89.583333
