In [1]:
import json
import pandas as pd

To get the results file from the Google Cloud Storage bucket (assuming you have access):


gsutil cp gs://mids-w266-mw/mlflow/14/414e40b4619245a7a200920a7e9a10cf/artifacts/test2015_results_san_expt3_2018-11-21-15:42:18.json .


In [2]:
def _partial_acc(obs):
    """
        inner function used to calculate a weighted accuracy based on how many human
        raters' annotations the predicted value matches.  Applied to a single Series.
        
        Args:
            obs = pandas.Series object containing 'predicted_answer' and 'annotations' columns
    """
    prediction = obs['predicted_answer']
    annotations = obs['annotations']
    matches = 0
    for a in annotations:
        if prediction.strip().lower() == a.strip().lower():
            matches += 1
    return min(1, matches/3)

### 1. Yang's Original

In [5]:
# Load the JSON results file for this section (file name says expt0) into a DataFrame
# and add the two per-row accuracy columns used throughout the analysis.
# NOTE(review): absolute path into one user's home directory - presumably has to be
# edited before the notebook can be re-run anywhere else.
json_results_path = '/home/rachel_sk_ho/w266-final-project/results/test2015_results_san_expt0_2018-11-22-23:17:22.json'

with open(json_results_path) as f:
    resultsj = json.load(f)
    
df = pd.DataFrame(resultsj)
# 1 if the prediction equals answer_str after trimming whitespace and lower-casing, else 0
df['correct'] = (df['answer_str'].str.strip().str.lower() == df['predicted_answer'].str.strip().str.lower()).astype(int)
# partial credit per row: share of matching annotations (see _partial_acc)
df['partial'] = df.apply(_partial_acc, axis=1)

In [4]:
# summary statistics; the means of 'correct' and 'partial' are the overall accuracies
df.describe()

Unnamed: 0,answer_id,image_id,question_id,correct,partial
count,60712.0,60712.0,60712.0,60712.0,60712.0
mean,29106000.0,291059.949664,2910600.0,0.417133,0.485824
std,16817430.0,168174.313184,1681743.0,0.493089,0.482862
min,4200.0,42.0,420.0,0.0,0.0
25%,14657010.0,146570.0,1465701.0,0.0,0.0
50%,29186620.0,291866.0,2918662.0,0.0,0.333333
75%,43582090.0,435820.75,4358209.0,1.0,1.0
max,58191320.0,581913.0,5819132.0,1.0,1.0


In [45]:
# column names, dtypes and non-null counts (quick check for missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60712 entries, 0 to 60711
Data columns (total 11 columns):
annotations         60712 non-null object
answer_id           60712 non-null int64
answer_str          60712 non-null object
answer_type         60712 non-null object
image_id            60712 non-null int64
predicted_answer    60712 non-null object
question_id         60712 non-null int64
question_str        60712 non-null object
question_type       60712 non-null object
correct             60712 non-null int64
partial             60712 non-null float64
dtypes: float64(1), int64(4), object(6)
memory usage: 5.1+ MB


In [76]:
# first five rows, to eyeball the record layout
df.head()

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
0,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994000,yes,yes/no,499940,yes,4999400,is there two boys in the photo,is there,1,1.0
1,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994010,yes,yes/no,499940,yes,4999401,is one of the boys wearing a cap,is,1,1.0
2,"[kites, kites, kites, kites, kites, kites, kit...",49994020,kites,other,499940,kite,4999402,what is flying in the sky,what is,0,0.0
3,"[kitchen, kitchen, kitchen, kitchen, kitchen, ...",39537920,kitchen,other,395379,kitchen,3953792,what is this room,what is this,1,1.0
4,"[white, white, white, white, white, white, whi...",39537900,white,other,395379,white,3953790,what color is the wall,what color is the,1,1.0


In [6]:
# number of test questions per answer type, largest group first
(
    df.groupby(['answer_type'])['correct']
    .count()
    .sort_values(ascending=False)
)

answer_type
other     30296
yes/no    22767
number     7649
Name: correct, dtype: int64

In [159]:
# number of test questions per question type, largest group first
(
    df.groupby(['question_type'])['correct']
    .count()
    .sort_values(ascending=False)
)

question_type
how many                    5513
is the                      4994
what                        4593
what color is the           4061
what is the                 3246
none of the above           2497
is this                     2245
is this a                   2088
what is                     1779
what kind of                1637
are the                     1614
is there a                  1243
what type of                1184
where is the                1108
is it                       1030
what are the                 950
does the                     934
is                           910
what color are the           898
is there                     889
are these                    787
what is the man              734
are there                    713
how                          706
which                        703
is the man                   675
are                          646
does this                    608
what is on the               567
what does the                

In [28]:
# accuracy overall
# 'correct' is a 0/1 exact-match flag and 'partial' a 0..1 partial-credit score,
# so their column means are the overall exact and partial accuracies
acc = df['correct'].mean()
partial_acc = df['partial'].mean()
print('Accuracy = {:.3f}. Partial Accuracy = {:.3f}.'.format(acc, partial_acc))

Accuracy = 0.417. Partial Accuracy = 0.486.


In [20]:
# pd.set_option('display.max_rows', 75)

In [29]:
# accuracy by question type: mean exact-match and partial-credit score, best first.
# The two columns are selected with a list ([[...]]); the bare
# ['correct','partial'] tuple form is deprecated and rejected by newer pandas.
acc_by_qtype = (
    df.groupby(['question_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'], ascending=False)
)
acc_by_qtype

Unnamed: 0_level_0,correct,partial
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1
what room is,0.861345,0.871148
is there a,0.851167,0.905337
what sport is,0.848943,0.863041
could,0.830409,0.908382
is there,0.749156,0.819648
are there,0.73913,0.798504
do you,0.692708,0.796875
does this,0.666118,0.756031
do,0.662222,0.762222
is it,0.660194,0.746602


In [30]:
# accuracy by answer type: mean exact-match and partial-credit score, best first.
# The two columns are selected with a list ([[...]]); the bare
# ['correct','partial'] tuple form is deprecated and rejected by newer pandas.
acc_by_anstype = (
    df.groupby(['answer_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'], ascending=False)
)
acc_by_anstype

Unnamed: 0_level_0,correct,partial
answer_type,Unnamed: 1_level_1,Unnamed: 2_level_1
yes/no,0.659902,0.766094
other,0.284559,0.322782
number,0.219637,0.297381


In [83]:
# per-answer-type accuracies as a nested dict: {answer_type: {'correct': ..., 'partial': ...}}
# NOTE(review): the saved output of this cell does not match the acc_by_anstype table
# displayed by the previous cell (it matches the section 3 numbers instead), so it looks
# like it was produced while df held a different results file - re-run after a kernel restart.
acc_by_anstype.to_dict('index')

{'yes/no': {'correct': 0.6497562261167479, 'partial': 0.7585394064508629},
 'other': {'correct': 0.3058489569580143, 'partial': 0.34698750110025667},
 'number': {'correct': 0.23244868610275854, 'partial': 0.30936505861332597}}

### 1a. Yes/No Answer Type

In [38]:
# accuracy by question type for the yes/no answer type, worst first
# (columns selected with a list; the bare 'correct', 'partial' tuple form is
# deprecated and rejected by newer pandas)
(
    df[df.answer_type == 'yes/no']
    .groupby(['question_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'])
)

Unnamed: 0_level_0,correct,partial
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1
why,0.0,0.0
what,0.0,0.0
what is the,0.0,0.0
are there any,0.564384,0.673973
none of the above,0.594958,0.747899
was,0.612335,0.71953
is this person,0.613861,0.740924
are,0.616852,0.7292
are these,0.623626,0.754579
is the,0.62617,0.738966


In [89]:
# examples of 0% accuracy for yes/no answer type - 7 data points
# (the three question types that scored 0.0 in the table above)
df[(df.answer_type == 'yes/no')
   & df.question_type.isin(['why', 'what', 'what is the'])]

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
8919,"[yes, yes, yes, company logo, yes, yes, yes, y...",22455400,yes,yes/no,224554,<unk>,2245540,what there a crown on the train,what,0,0.0
22593,"[no, no, no, no, no, no, no, no, no, no]",8474900,no,yes/no,84749,cat,847490,what this picture taken indoors,what,0,0.0
23396,"[make feeding easier, yes, yes, yes, yes, view...",18663720,yes,yes/no,186637,<unk>,1866372,why is there a wooden platform behind the fence,why,0,0.0
27501,"[no, no, no, no, no, yes, no, yes, yes, yes]",8276500,no,yes/no,82765,bed,827650,what someone using the computer in bed,what,0,0.0
51656,"[no, emergency, no, no, ve, no, no, can't see,...",3987110,no,yes/no,39871,<unk>,398711,what word is show on the bus,what,0,0.0
51964,"[yes, 2 men skateboarding, old picture, color,...",27829010,yes,yes/no,278290,<unk>,2782901,what is the picture white and black,what is the,0,0.0
58373,"[yes, his preference, yes, yes, yes, because h...",9924220,yes,yes/no,99242,<unk>,992422,why does the man have a beard,why,0,0.0


In [112]:
# examples of <60% accuracy for yes/no answer type with correct answers - 914 data points
# (last 10 correctly answered rows of the two question types below 60%)
df[(df.answer_type == 'yes/no')
   & df.question_type.isin(['are there any', 'none of the above'])
   & (df.correct == 1)].tail(10)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
59787,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",51220600,yes,yes/no,512206,yes,5122060,are there any butterflies in the photo,are there any,1,1.0
59826,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",3139000,yes,yes/no,31390,yes,313900,will they catch the frisbee,none of the above,1,1.0
59828,"[yes, no, yes, yes, yes, yes, yes, no, no, yes]",3139020,yes,yes/no,31390,yes,313902,does everyone have on short,none of the above,1,1.0
59831,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",3492220,yes,yes/no,34922,yes,349222,are there any reflections in this photo,are there any,1,1.0
59910,"[yes, yes, no, yes, yes, no, yes, yes, yes, yes]",37931400,yes,yes/no,379314,yes,3793140,does it look cold,none of the above,1,1.0
60036,"[no, no, no, no, no, no, no, no, no, no]",57382320,no,yes/no,573823,no,5738232,are there any people,are there any,1,1.0
60249,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",37031520,yes,yes/no,370315,yes,3703152,does he wear glasses,none of the above,1,1.0
60343,"[yes, maybe, yes, yes, yes, yes, yes, yes, yes...",52385410,yes,yes/no,523854,yes,5238541,can someone eat outside,none of the above,1,1.0
60419,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",28985510,yes,yes/no,289855,yes,2898551,are there any palm trees in this picture,are there any,1,1.0
60654,"[yes, no, yes, yes, yes, yes, yes, yes, yes, yes]",32166500,yes,yes/no,321665,yes,3216650,will this clock keep time,none of the above,1,1.0


In [109]:
# examples of <60% accuracy for yes/no answer type with incorrect answers - 1555 data points
# (last 10 wrongly answered rows of the two question types below 60%)
df[(df.answer_type == 'yes/no')
   & df.question_type.isin(['are there any', 'none of the above'])
   & (df.correct == 0)].tail(10)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
60267,"[no, yes, no, no, no, no, no, yes, no, no]",53550600,no,yes/no,535506,yes,5355060,does she look happy,none of the above,0,0.666667
60276,"[no, no, no, no, no, no, no, no, no, 0]",21420400,no,yes/no,214204,yes,2142040,are there any towels in this bathroom,are there any,0,0.0
60308,"[no, no, no, no, no, no, no, no, no, no]",15277620,no,yes/no,152776,yes,1527762,did these come from a money garden,none of the above,0,0.0
60318,"[no, no, no, no, no, no, no, no, no, no]",24424600,no,yes/no,244246,yes,2442460,are there any clouds in the sky,are there any,0,0.0
60333,"[no, no, no, no, no, no, no, no, yes, no]",29378200,no,yes/no,293782,yes,2937820,will this street sign wave in the wind,none of the above,0,0.333333
60425,"[no, no, no, no, no, no, no, no, no, no]",53321720,no,yes/no,533217,yes,5332172,does it look like a cloudy day,none of the above,0,0.0
60461,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",40771720,yes,yes/no,407717,no,4077172,would a vegetarian eat this food,none of the above,0,0.0
60485,"[no, no, no, no, no, no, no, no, no, no]",34400520,no,yes/no,344005,yes,3440052,will it rain soon,none of the above,0,0.0
60505,"[no, no, no, no, no, no, no, no, no, no]",25803610,no,yes/no,258036,yes,2580361,does she have any clothes on,none of the above,0,0.0
60635,"[no, yes, yes, no, no, yes, yes, yes, yes, no]",6783220,yes,yes/no,67832,no,678322,would you eat this,none of the above,0,1.0


In [104]:
# how does model compare to humans?
# what percentage of incorrect answers predicted by model are also predicted by humans?
# 'partial' is min(1, matches/3), so it only takes the values 0, 1/3, 2/3 and 1:
#   partial > 0     -> at least one annotation matches the prediction
#   partial > 0.35  -> at least two match (0.35 sits between 1/3 and 2/3)
#   partial == 1    -> three or more match
# all counts are restricted to yes/no questions the model got wrong (correct == 0)
num_incorrect = df[(df.answer_type=='yes/no') & (df.correct==0)]['annotations'].count()
num_atleast1 = df[(df.answer_type=='yes/no') & (df.correct==0) & (df.partial>0)]['annotations'].count() 
num_atleast2 = df[(df.answer_type=='yes/no') & (df.correct==0) & (df.partial>0.35)]['annotations'].count() 
num_atleast3 = df[(df.answer_type=='yes/no') & (df.correct==0) & (df.partial==1)]['annotations'].count() 

print('Percentage of incorrect answers predicted by at least one human: {:.1%}'.format(num_atleast1/num_incorrect))
print('Percentage of incorrect answers predicted by at least two human: {:.1%}'.format(num_atleast2/num_incorrect))
print('Percentage of incorrect answers predicted by at least three human: {:.1%}'.format(num_atleast3/num_incorrect))

Percentage of incorrect answers predicted by at least one human: 47.2%
Percentage of incorrect answers predicted by at least two human: 28.6%
Percentage of incorrect answers predicted by at least three human: 17.9%


### 1b. Other Answer Type

In [113]:
# accuracy by question type for the 'other' answer type, worst first
# (columns selected with a list; the bare 'correct', 'partial' tuple form is
# deprecated and rejected by newer pandas)
(
    df[df.answer_type == 'other']
    .groupby(['question_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'])
)

Unnamed: 0_level_0,correct,partial
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1
what number is,0.0,0.0
are there,0.0,0.0
can you,0.0,0.296296
how many people are in,0.0,0.0
how many people are,0.0,0.0
has,0.0,0.0
why,0.013363,0.017075
why is the,0.017045,0.030303
what is the name,0.02,0.021667
how many,0.027027,0.081081


In [166]:
# last 10 incorrect predictions for the 'other' answer type
df[df.answer_type.eq('other') & df.correct.eq(0)].tail(10)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
60686,"[helmet, helmet, helmet, helmet, helmet, helme...",40976310,helmet,other,409763,hat,4097631,what is on the man's head,what is on the,0,0.333333
60694,"[night, evening, twilight, night time, dusk, n...",26004810,night,other,260048,evening,2600481,what time of day is it,what time,0,0.333333
60695,"[stop light, stoplight is red, red light, caus...",26004820,red light,other,260048,<unk>,2600482,why isn't the car on the right the one with it...,why,0,0.0
60697,"[flowers, flowers, flowers, sunglass, flowers ...",8603610,flowers,other,86036,hat,860361,what is on the girl's head,what is on the,0,0.0
60700,"[log, tree, tree, tree, log, tree, log, tree, ...",53298910,tree,other,532989,fence,5329891,what is laying on the ground behind the giraffe,what is,0,0.0
60701,"[in front of fence, behind giraffe, background...",53298920,behind giraffe,other,532989,<unk>,5329892,where is the fallen tree trunk,where is the,0,0.0
60703,"[yellow, yellow, yellow white black, yellow, y...",20132610,yellow,other,201326,white,2013261,what color is he wearing,what color is,0,0.0
60704,"[red black, red white black, red, orange, red ...",20132620,red and black,other,201326,red,2013262,what color is the racquet,what color is the,0,0.333333
60708,"[table, on table, on right, on right, by napki...",1603000,table,other,16030,<unk>,160300,where is the fork,where is the,0,0.0
60710,"[formal, restaurant, restaurant, elegant, dinn...",1603020,formal,other,16030,<unk>,160302,what type of setting is this,what type of,0,0.0


In [7]:
# create column with number of words in answers
# (whitespace-split word count, computed with the vectorised .str accessor
# instead of a row-by-row apply; an empty answer counts as 0 words)
df['answer_length'] = df['answer_str'].str.split().str.len()
df.tail(10)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial,answer_length
60702,"[yes, yes, yes, yes, yes, yes, yes, no, yes, yes]",20132600,yes,yes/no,201326,yes,2013260,is the tennis player wearing a nike t shirt,is the,1,1.0,1
60703,"[yellow, yellow, yellow white black, yellow, y...",20132610,yellow,other,201326,white,2013261,what color is he wearing,what color is,0,0.0,1
60704,"[red black, red white black, red, orange, red ...",20132620,red and black,other,201326,red,2013262,what color is the racquet,what color is the,0,0.333333,3
60705,"[night, night, night, night, night, night, nig...",47747000,night,other,477470,night,4774700,what time of day was this photo taken,what time,1,1.0,1
60706,"[yes, yes, yes, no, no, yes, yes, yes, yes, yes]",47747010,yes,yes/no,477470,no,4774701,is this the right atmosphere for dracula,is this,0,0.666667,1
60707,"[stop, stop, stop, stop, stop, stop, stop, sto...",47747020,stop,other,477470,stop,4774702,what does the traffic light say to do,what does the,1,1.0,1
60708,"[table, on table, on right, on right, by napki...",1603000,table,other,16030,<unk>,160300,where is the fork,where is the,0,0.0,1
60709,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",1603010,yes,yes/no,16030,yes,160301,are there glasses,are there,1,1.0,1
60710,"[formal, restaurant, restaurant, elegant, dinn...",1603020,formal,other,16030,<unk>,160302,what type of setting is this,what type of,0,0.0,1
60711,"[no, yes, yes, yes, yes, yes, yes, yes, no, no]",44087800,yes,yes/no,440878,yes,4408780,is there usually a third piece to the ensemble,is there,1,1.0,1


In [182]:
# number of incorrect 'other'-type predictions, grouped by answer word count
(
    df[df.answer_type.eq('other') & df.correct.eq(0)]
    .groupby(['answer_length'])['annotations']
    .count()
)

answer_length
0         5
1     17278
2      3059
3      1053
4       172
5        72
6        19
7         9
8         4
10        2
11        1
12        1
Name: annotations, dtype: int64

In [183]:
# number of correct 'other'-type predictions, grouped by answer word count
(
    df[df.answer_type.eq('other') & df.correct.eq(1)]
    .groupby(['answer_length'])['annotations']
    .count()
)

answer_length
1    8154
2     336
3     131
Name: annotations, dtype: int64

### 1c. Number Answer Type

In [55]:
# accuracy by question type for the 'number' answer type, worst first
# (columns selected with a list; the bare 'correct', 'partial' tuple form is
# deprecated and rejected by newer pandas)
(
    df[df.answer_type == 'number']
    .groupby(['question_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'])
)

Unnamed: 0_level_0,correct,partial
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1
are there,0.0,0.0
what type of,0.0,0.0
what time,0.0,0.00191
what number is,0.0,0.0
what kind of,0.0,0.0
what is this,0.0,0.0
what is the name,0.0,0.0
what is the,0.0,0.003175
what is on the,0.0,0.0
what is,0.0,0.0


In [34]:
# number of 'number'-type questions per question type, largest group first
(
    df[df.answer_type.eq('number')]
    .groupby(['question_type'])['annotations']
    .count()
    .sort_values(ascending=False)
)

question_type
how many                  5476
how many people are        509
what time                  349
how                        267
what                       247
how many people are in     222
what is the                210
what number is             189
none of the above           75
which                       22
what are the                20
what does the               20
what is                     13
does the                     4
is this                      3
is the                       2
what type of                 2
what is this                 2
what is the name             2
is                           2
can you                      1
what kind of                 1
is he                        1
is the woman                 1
is the man                   1
is there                     1
is this a                    1
is this person               1
where is the                 1
was                          1
what is on the               1
what brand               

In [20]:
# examples of incorrect predictions for number answer type (last 10 rows)
# alternative view kept for reference: correct predictions for 'how many people are' questions
# df[(df.answer_type=='number') & (df.correct==1) & (df.question_type=='how many people are')].tail(10)
df[(df.answer_type=='number') & (df.correct==0)].tail(10)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial,answer_length
60610,"[20, 20, 20, 20, 20, 20, 20, 20, 60, 20]",22553210,20,number,225532,1,2255321,how many mph,how many,0,0.0,1
60627,"[51, 51, 51, 51, 51, 51, 251, 51, 51, 51]",49631300,51,number,496313,<unk>,4963130,what is the number of the bus's route,what is the,0,0.0,1
60629,"[51, 51, 51, 251, 32, 51, 8, 51, 51, 51]",49631320,51,number,496313,<unk>,4963132,what number is on the bus,what number is,0,0.0,1
60647,"[5, 1, 6, 4, 4, 3, 4, 4, 10, 4]",23307910,4,number,233079,1,2330791,how many benches are in the lobby,how many,0,0.333333,1
60655,"[1:20, 1 twenty, 1:19, 1:20, 1:20, 1:20, 1:25,...",32166510,1:20,number,321665,<unk>,3216651,what time does the clock say,what time,0,0.0,1
60662,"[20, 4, 20, 20, 9, 3, lot, 1, 23, 19]",16313220,20,number,163132,3,1631322,how many lights are below the plane,how many,0,0.333333,1
60672,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",3766000,2,number,37660,3,376600,how many items are in the hand,how many,0,0.0,1
60681,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]",11380100,0,number,113801,2,1138010,how many boats are in the photo,how many,0,0.0,1
60684,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",40976320,2,number,409763,<unk>,4097632,what is the number on the back of the batter o...,what is the,0,0.0,1
60690,"[1:45, 9:11, 9:10, 9:10, 9:12, 9:15, 2:40, 9:1...",15309400,9:10,number,153094,<unk>,1530940,what time does it say on the clock,what time,0,0.0,1


In [21]:
# create column with unk token flag
# (True where the model predicted the '<unk>' token; plain vectorised
# comparison instead of a row-by-row apply)
df['unk_flag'] = df['predicted_answer'] == '<unk>'
df.head(5)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial,answer_length,unk_flag
0,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994000,yes,yes/no,499940,yes,4999400,is there two boys in the photo,is there,1,1.0,1,False
1,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994010,yes,yes/no,499940,yes,4999401,is one of the boys wearing a cap,is,1,1.0,1,False
2,"[kites, kites, kites, kites, kites, kites, kit...",49994020,kites,other,499940,kite,4999402,what is flying in the sky,what is,0,0.0,1,False
3,"[kitchen, kitchen, kitchen, kitchen, kitchen, ...",39537920,kitchen,other,395379,kitchen,3953792,what is this room,what is this,1,1.0,1,False
4,"[white, white, white, white, white, white, whi...",39537900,white,other,395379,white,3953790,what color is the wall,what color is the,1,1.0,1,False


In [35]:
# incorrect 'number'-type predictions, split by whether the prediction was '<unk>'
(
    df[df.answer_type.eq('number') & df.correct.eq(0)]
    .groupby(['unk_flag'])['annotations']
    .count()
)

unk_flag
False    4684
True     1285
Name: annotations, dtype: int64

In [40]:
# examples of incorrect predictions that ARE unk tokens (last 50 matching rows)
df[(df.answer_type=='number') & (df.correct==0) & (df.predicted_answer == '<unk>')].tail(50)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial,answer_length,unk_flag
58872,"[13, 13, 13, 13, 13, 13, 13, 13, 13, 13]",23298500,13,number,232985,<unk>,2329850,what number is in the picture,what number is,0,0.0,1,True
58882,"[101, 1:01, 1:01, 1:01, 1:01, 1:01, 1:01, 1:01...",56738300,1:01,number,567383,<unk>,5673830,what time is it,what time,0,0.0,1,True
58906,"[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]",36767310,8,number,367673,<unk>,3676731,what is the girl's number,what is the,0,0.0,1,True
58916,"[weeknights at 11:30 pm, weeknights, 11:30, 11...",45538420,11:30 pm,number,455384,<unk>,4553842,what time is entourage advertised to be on,what time,0,0.0,2,True
58977,"[1, 3, 3, 3, oranges, 4, 4, good, 2, 2]",18479120,3,number,184791,<unk>,1847912,how reflections are in the painting,how,0,0.0,1,True
59003,"[4, 4, 4:01, 4:01, 4:00, 4:01, 4:01, 4:00, 4:0...",6036310,4:00,number,60363,<unk>,603631,what time is it,what time,0,0.0,1,True
59054,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]",36434120,6,number,364341,<unk>,3643412,what number can clearly be seen,what,0,0.0,1,True
59108,"[15 hours, 8 hours and 15 min, 9 hours and 15 ...",28053020,8 hours and 15 min,number,280530,<unk>,2805302,how long until midnight,how,0,0.0,5,True
59140,"[201, 251, 201, 201, 201, 201, 201, 204, 201, ...",28843500,201,number,288435,<unk>,2884350,what is the number on the front of the bus,what is the,0,0.0,1,True
59146,"[07:47, 8:46, 7:45, 7:46, 8:47, 7:46, 7:46, 7:...",35196700,7:46,number,351967,<unk>,3519670,what time does the clock show,what time,0,0.0,1,True


In [36]:
# first 10 incorrect 'number'-type predictions where the model answered something other than '<unk>'
df[df.answer_type.eq('number')
   & df.correct.eq(0)
   & df.predicted_answer.ne('<unk>')].head(10)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial,answer_length,unk_flag
24,"[yes, yes, yes, 3, 3, 2, 3, 3, 2, 3]",892320,3,number,8923,2,89232,how many different colored flowers are in fron...,how many,0,0.666667,1,False
30,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",53446720,1,number,534467,3,5344672,how many towels are in the photo,how many,0,0.0,1,False
92,"[bananas, 7, boggles, 7, 7, 7, 7, 7, 6, 7]",49597510,7,number,495975,3,4959751,how many bunches are on this scene,how many,0,0.0,1,False
93,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",18817300,0,number,188173,3,1881730,how many kangaroos are there,how many,0,0.0,1,False
95,"[5, 5, 5, 5, 5, 5, 5, 5, 5, 5]",18817320,5,number,188173,3,1881732,how many cars are in the photo,how many,0,0.0,1,False
112,"[2, 2, 2, 2, 2, 0, 2, 0, 2, 2]",6410300,2,number,64103,1,641030,how many ducks are there in the picture,how many,0,0.0,1,False
124,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",17129810,3,number,171298,2,1712981,how many buses are there,how many,0,0.0,1,False
151,"[1, 1, 1, 1, 1, pinning, 1, 1, 1, 1]",52083210,1,number,520832,3,5208321,how many people are in the photo,how many people are in,0,0.0,1,False
168,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]",21300800,6,number,213008,2,2130080,how many pillows are on the bed,how many,0,0.0,1,False
171,"[16, 16, 16, 16, 16, 16, 16, 16, 16, 16]",11733700,16,number,117337,3,1173370,how many photos are shown,how many,0,0.0,1,False


#### Summary
- Best accuracy (66%): Yes/No answer type
- Second-best accuracy (28%): Other answer type
- Worst accuracy (22%): Number answer type

**Yes/No Answer Type**  
The model does consistently well on all question types, with accuracy above 50% for every type except three: "why", "what", and "what is the".  As we can see, the phrasing of these question types does not point to a yes or no answer, so it's no surprise that the model does poorly on these questions.  For the other question types, the model does best when the questions have clear and direct answers, and it performs poorly when the questions are abstract, subjective, or require common-sense knowledge.  It's also interesting to note that almost half of the incorrect predictions (47%) were also given by at least one human and almost 29% were given by at least two humans.

**Other Answer Type**  
We explored whether multi-word answers led to low accuracy for this answer type, since multi-word phrases are more likely to be excluded from the training vocabulary.  However, we found that most of the incorrect predictions (80%) have one-word answers and only 20% have multi-word answers.  In general, the model does best at predicting rooms, animals, sports, and colors for this type of question.


**Number Answer Type**  
The model does the worst on the number answer type, with only 22% accuracy.  Most of the questions for this answer type involve counting (i.e. "how many" questions).  Around 20% of the incorrect predictions have the UNK token as the predicted answer.  We found that a lot of the UNK token predictions involve answers that relate to time or number sequences (such as a bus number or the number on a jersey) that need to be identified on objects.



For future iterations, we can try to measure accuracy using the distance between word vectors, so that words / phrases with similar semantic meanings are given credit (e.g. nighttime vs. night).  We can also find better ways to tokenize times and numbers and explore different UNK replacement techniques.


### 2. Yang's Original + Frozen GloVe

In [67]:
# Load the JSON results file for this section (file name says expt2) into df2
# and add the same per-row accuracy columns as in section 1.
# NOTE(review): absolute path into one user's home directory - presumably has to be
# edited before the notebook can be re-run anywhere else.
json_results_path = '/home/rachel_sk_ho/w266-final-project/results/test2015_results_san_expt2_2018-11-23-06:31:32.json'

with open(json_results_path) as f:
    resultsj = json.load(f)
    
df2 = pd.DataFrame(resultsj)
# 1 if the prediction equals answer_str after trimming whitespace and lower-casing, else 0
df2['correct'] = (df2['answer_str'].str.strip().str.lower() == df2['predicted_answer'].str.strip().str.lower()).astype(int)
# partial credit per row: share of matching annotations (see _partial_acc)
df2['partial'] = df2.apply(_partial_acc, axis=1)

In [68]:
# first five rows of the df2 results, to eyeball the record layout
df2.head()

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
0,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994000,yes,yes/no,499940,yes,4999400,is there two boys in the photo,is there,1,1.0
1,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994010,yes,yes/no,499940,yes,4999401,is one of the boys wearing a cap,is,1,1.0
2,"[kites, kites, kites, kites, kites, kites, kit...",49994020,kites,other,499940,kite,4999402,what is flying in the sky,what is,0,0.0
3,"[kitchen, kitchen, kitchen, kitchen, kitchen, ...",39537920,kitchen,other,395379,kitchen,3953792,what is this room,what is this,1,1.0
4,"[white, white, white, white, white, white, whi...",39537900,white,other,395379,white,3953790,what color is the wall,what color is the,1,1.0


In [72]:
# accuracy
# overall exact-match and partial-credit accuracy for df2 (column means of the per-row scores)
print('Accuracy = {:.3f}. Partial Accuracy = {:.3f}.'.format(df2['correct'].mean(), df2['partial'].mean()))

Accuracy = 0.427. Partial Accuracy = 0.498.


In [80]:
# accuracy by question type for df2, best first
# (columns selected with a list; the bare 'correct','partial' tuple form is
# deprecated and rejected by newer pandas)
(
    df2.groupby(['question_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'], ascending=False)
)

Unnamed: 0_level_0,correct,partial
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1
what room is,0.852941,0.858543
is there a,0.838294,0.893805
what sport is,0.836858,0.853978
could,0.836257,0.912281
are there,0.744741,0.802244
is there,0.739033,0.818148
do you,0.692708,0.821181
has,0.67433,0.758621
is,0.664835,0.751282
does this,0.652961,0.752193


In [81]:
# accuracy by answer type for df2, best first
# (columns selected with a list; the bare 'correct','partial' tuple form is
# deprecated and rejected by newer pandas)
(
    df2.groupby(['answer_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'], ascending=False)
)

Unnamed: 0_level_0,correct,partial
answer_type,Unnamed: 1_level_1,Unnamed: 2_level_1
yes/no,0.651118,0.759784
other,0.30641,0.348132
number,0.234933,0.312285


### 3. Yang's Original + Trainable GloVe

In [77]:
# Load the JSON results file for this section (file name says expt3) into df3
# and add the same per-row accuracy columns as in section 1.
# NOTE(review): absolute path into one user's home directory - presumably has to be
# edited before the notebook can be re-run anywhere else.
json_results_path = '/home/rachel_sk_ho/w266-final-project/results/test2015_results_san_expt3_2018-11-21-17:27:34.json'

with open(json_results_path) as f:
    resultsj = json.load(f)
    
df3 = pd.DataFrame(resultsj)
# 1 if the prediction equals answer_str after trimming whitespace and lower-casing, else 0
df3['correct'] = (df3['answer_str'].str.strip().str.lower() == df3['predicted_answer'].str.strip().str.lower()).astype(int)
# partial credit per row: share of matching annotations (see _partial_acc)
df3['partial'] = df3.apply(_partial_acc, axis=1)

In [78]:
# first five rows of the df3 results, to eyeball the record layout
df3.head()

Unnamed: 0,annotations,answer_id,answer_str,answer_type,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
0,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994000,yes,yes/no,499940,yes,4999400,is there two boys in the photo,is there,1,1.0
1,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",49994010,yes,yes/no,499940,yes,4999401,is one of the boys wearing a cap,is,1,1.0
2,"[kites, kites, kites, kites, kites, kites, kit...",49994020,kites,other,499940,kite,4999402,what is flying in the sky,what is,0,0.0
3,"[kitchen, kitchen, kitchen, kitchen, kitchen, ...",39537920,kitchen,other,395379,kitchen,3953792,what is this room,what is this,1,1.0
4,"[white, white, white, white, white, white, whi...",39537900,white,other,395379,white,3953790,what color is the wall,what color is the,1,1.0


In [79]:
# accuracy
# overall exact-match and partial-credit accuracy for df3 (column means of the per-row scores)
print('Accuracy = {:.3f}. Partial Accuracy = {:.3f}.'.format(df3['correct'].mean(), df3['partial'].mean()))

Accuracy = 0.426. Partial Accuracy = 0.497.


In [82]:
# accuracy by question type for df3, best first
# (columns selected with a list; the bare 'correct','partial' tuple form is
# deprecated and rejected by newer pandas)
(
    df3.groupby(['question_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'], ascending=False)
)

Unnamed: 0_level_0,correct,partial
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1
what room is,0.852941,0.858543
is there a,0.84473,0.900241
could,0.842105,0.918129
what sport is,0.833837,0.845921
are there,0.757363,0.812529
is there,0.737908,0.815898
do you,0.6875,0.805556
has,0.67433,0.754789
is,0.667033,0.751648
is this an,0.657895,0.763158


In [83]:
# accuracy by answer type for df3, best first
# (columns selected with a list; the bare 'correct','partial' tuple form is
# deprecated and rejected by newer pandas)
(
    df3.groupby(['answer_type'])[['correct', 'partial']]
    .mean()
    .sort_values(['correct'], ascending=False)
)

Unnamed: 0_level_0,correct,partial
answer_type,Unnamed: 1_level_1,Unnamed: 2_level_1
yes/no,0.649756,0.758539
other,0.305849,0.346988
number,0.232449,0.309365
