In [12]:
import json
import pandas as pd

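To download a results file from the Google Cloud Storage bucket (assuming you have access), run the command below. It fetches the v1 sample results set; substitute the artifact path of the run you want to analyze: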


gsutil cp gs://mids-w266-mw/mlflow/14/414e40b4619245a7a200920a7e9a10cf/artifacts/test2015_results_san_expt3_2018-11-21-15:42:18.json .


In [13]:
# Path of the JSON results file that the rest of this notebook analyzes.
# NOTE(review): these are absolute, user-specific paths -- adjust for your machine.

# v1 sample results set (alternative input; uncomment to use it instead).
# Per the note in the complementary-pairs cell, v1 files carry no complement_id's.
#json_results_path = '/home/mwinton/w266-final-project/results/json/test2015_results_san_expt3_2018-11-21-15:42:18.json'

# v2 sample results set (the one currently selected)
json_results_path = '/home/mwinton/w266-final-project/results/json/v2_test2015_results_san_expt9_2018-11-26-22:48:19.json'

In [14]:
# Read the raw results. JSON is UTF-8 by specification, so decode explicitly
# instead of relying on the platform's default text encoding.
with open(json_results_path, encoding='utf-8') as f:
    resultsj = json.load(f)

In [15]:
# Turn the decoded JSON results into a DataFrame for analysis.
df = pd.DataFrame(data=resultsj)

In [16]:
# Exact-match flag: 1 when the prediction equals the ground-truth answer string
# after trimming whitespace and lowercasing both sides, else 0.
_truth_norm = df['answer_str'].str.strip().str.lower()
_pred_norm = df['predicted_answer'].str.strip().str.lower()
df['correct'] = (_truth_norm == _pred_norm).astype(int)


In [17]:
def _partial_acc(obs):
    """
        inner function used to calculate a weighted accuracy based on how many human
        raters' annotations the predicted value matches.  Applied to a single Series.
        
        Args:
            obs = pandas.Series object containing 'predicted_answer' and 'annotations' columns
    """
    prediction = obs['predicted_answer']
    annotations = obs['annotations']
    matches = 0
    for a in annotations:
        if prediction.strip().lower() == a.strip().lower():
            matches += 1
    return min(1, matches/3)

In [18]:
# Score every row (one question each) with the partial-credit metric.
df['partial'] = df.apply(_partial_acc, axis='columns')

In [19]:
# Preview the first rows only; rendering the whole results frame is slow and noisy.
df.head(10)

Unnamed: 0,annotations,answer_id,answer_str,answer_type,complement_id,image_id,predicted_answer,question_id,question_str,question_type,correct,partial
0,"[yes, yes, yes, yes, no, yes, maybe, no, yes, ...",113200000,yes,yes/no,271844002.0,11320,yes,11320000,was this taken in the usa,was,1,1.000000
1,"[red, red, red, red, red, red, red, red, red, ...",4224320010,red,other,197266002.0,422432,red,422432001,what color is the truck,what color is the,1,1.000000
2,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",2073660020,2,number,558661010.0,207366,2,207366002,how many soccer players are in this scene,how many,1,1.000000
3,"[yes, no, n, yes, yes, yes, yes, yes, yes, yes]",5135070020,yes,yes/no,427649003.0,513507,no,513507002,are there clouds in the distance,are there,0,0.333333
4,"[cat, cat, cat, cat, cat, cat, cat, cat, cat, ...",4128070020,cat,other,,412807,cat,412807002,what animal is in the picture,what animal is,1,1.000000
5,"[art, lounge, gallery, gallery, art room, exhi...",4423050000,art gallery,other,288884001.0,442305,<unk>,442305000,what type of room is the lady in,what type of,0,0.000000
6,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]",5615390010,6,number,576946000.0,561539,3,561539001,how many people are in the photo,how many people are in,0,0.000000
7,"[pizza cutter, scooter, pizza cutter, toy, sco...",4024330030,pizza cutter,other,,402433,pizza,402433003,what is the red item,what is the,0,0.000000
8,"[no, no, no, no, no, no, no, no, no, no]",4389190080,no,yes/no,451952005.0,438919,no,438919008,is the pavement wet,is the,1,1.000000
9,"[evening, night, night, daytime, night, night,...",740920020,night,other,68025002.0,74092,tennis,74092002,what time of day is it,what time,0,0.000000


In [20]:
# Overall exact-match accuracy and partial-credit accuracy across all questions.
acc = df['correct'].mean()
partial_acc = df['partial'].mean()
_summary_template = 'Accuracy = {:.3f}. Partial Accuracy = {:.3f}.'
print(_summary_template.format(acc, partial_acc))

Accuracy = 0.098. Partial Accuracy = 0.128.


In [22]:
# Complementary-pairs accuracy: a pair counts as correct only when both the
# question and its complement were answered correctly.
# Older results files and any v1 results files won't have complement_id's.
_has_complements = 'complement_id' in df.columns and df['complement_id'].notnull().any()
if not _has_complements:
    print('No complementary pairs data.')
else:
    # self-join: each row (_x) is paired with the row of its complement question (_y)
    joined = df.merge(df, left_on='complement_id', right_on='question_id')
    _both_right = (joined['correct_x'] == 1) & (joined['correct_y'] == 1)
    joined['both_complements_correct'] = _both_right.astype(int)
    complementary_acc = joined['both_complements_correct'].mean()
    print('Complementary Pairs accuracy = {:.3f}'.format(complementary_acc))

Complementary Pairs accuracy = 0.021


In [23]:
# Let the per-category tables below render up to 75 rows before truncating.
pd.options.display.max_rows = 75

In [24]:
# Mean exact-match and partial accuracy per question type, best first.
# The columns are selected with a list: tuple-style selection
# (['correct','partial'] without the inner brackets) is rejected by newer pandas.
acc_by_qtype = df.groupby(['question_type'])[['correct', 'partial']] \
    .mean() \
    .sort_values(['correct'], ascending=False)
acc_by_qtype

Unnamed: 0_level_0,correct,partial
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1
do you,0.253369,0.312668
is the man,0.24277,0.295282
are there any,0.242424,0.286869
are there,0.242403,0.289517
can you,0.238202,0.307116
could,0.237785,0.332248
is there a,0.233621,0.275718
are they,0.231563,0.287119
is there,0.229431,0.291862
is he,0.227915,0.293286


In [25]:
# Mean exact-match and partial accuracy per answer type, best first.
# The columns are selected with a list: tuple-style selection
# (['correct','partial'] without the inner brackets) is rejected by newer pandas.
acc_by_anstype = df.groupby(['answer_type'])[['correct', 'partial']] \
    .mean() \
    .sort_values(['correct'], ascending=False)
acc_by_anstype

Unnamed: 0_level_0,correct,partial
answer_type,Unnamed: 1_level_1,Unnamed: 2_level_1
yes/no,0.235123,0.302531
number,0.0355,0.051222
other,0.010431,0.016047


In [26]:
# Same table as a nested dict keyed by answer type: {answer_type: {metric: value}}.
acc_by_anstype.to_dict(orient='index')

{'yes/no': {'correct': 0.23512297018377143, 'partial': 0.3025306907384864},
 'number': {'correct': 0.03549961106003819, 'partial': 0.0512222143648494},
 'other': {'correct': 0.010431062226681559, 'partial': 0.016047302627012424}}