**This notebook was adapted for the UBC MDS-MDA Capstone project from the `eval.py` code in this [repo](https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).**

In [69]:
import json
import pandas as pd

In [15]:

import matplotlib.pyplot as plt
import skimage.io as io
import pylab
# Default size applied to every figure created later in this notebook.
pylab.rcParams.update({'figure.figsize': (10.0, 8.0)})

import json
from json import encoder
# Ask the json encoder to render floats with three decimal places.
# NOTE(review): recent Python 3 versions of the json module may no longer
# consult encoder.FLOAT_REPR, in which case this assignment has no effect — confirm.
encoder.FLOAT_REPR = lambda number: '{:.3f}'.format(number)


## 1. Import metric scripts

In [13]:
# import metric scripts
import sys
sys.path.append('../scr/evaluation/')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice

In [15]:
# Download the Stanford models by running the helper script kept next to the
# evaluation code (shell escape; the path is relative to this notebook's directory).
# The recorded output below ("Found Stanford CoreNLP.") suggests the script detected
# an existing CoreNLP install on the last run.
# NOTE(review): presumably needed by one of the scorers imported above (SPICE?) — confirm.
!../scr/evaluation/get_stanford_models.sh

Found Stanford CoreNLP.


## 2. Load reference captions and generated captions

In [22]:
# Reference captions: parsed from valid.json under ref_path into `data`.
# NOTE(review): references are read from valid.json while the generated captions
# come from test_results.json — confirm both files cover the same image ids.
ref_path = '../data/processed/json/'
reference_file = ref_path + 'valid.json'
with open(reference_file, 'r') as reference_handle:
    data = json.load(reference_handle)

# Generated captions: parsed from test_results.json under results_path into `results`.
results_path = '../models'
results_file = results_path + '/' + 'test_results.json'
with open(results_file, 'r') as results_handle:
    results = json.load(results_handle)

## 3. Format the inputs and tokenize

In [26]:
# Format the inputs for the tokenizer/scorers.
# Every key of `data` is treated as an image id.
img_id_dict = {'image_id': list(data.keys())}
imgIds = img_id_dict['image_id']

# Source key in each reference sentence -> key used in the reformatted record.
key_renames = {'raw': 'caption', 'imgid': 'image_id', 'sentid': 'id'}
required_key = set(key_renames)

# gts: image id -> list of reference records, one per entry in
# data[image id]['sentences'], keeping only the renamed required keys.
gts = {
    image_key: [
        {new_key: sentence[old_key] for old_key, new_key in key_renames.items()}
        for sentence in data[image_key]['sentences']
    ]
    for image_key in imgIds
}

# res: image id -> one-element list holding the generated caption from `results`.
res = {image_key: [{'caption': results[image_key]}] for image_key in imgIds}

In [27]:
# Tokenize the reference (gts) and generated (res) captions with PTBTokenizer.
# NOTE: gts and res are rebound to the tokenizer's output, so re-running this
# cell on its own would feed already-tokenized data back into the tokenizer.
print('tokenization...')
tokenizer = PTBTokenizer()
tokenize = tokenizer.tokenize
gts = tokenize(gts)
res = tokenize(res)

tokenization...


## 4. Evaluate the model results

In [51]:
# Build the list of (scorer instance, metric name or list of metric names) pairs.
# The scorers are instantiated in the order Bleu, Meteor, Rouge, Cider, Spice.
print('setting up scorers...')
scorers = list(zip(
    (Bleu(4), Meteor(), Rouge(), Cider(), Spice()),
    (["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"], "METEOR", "ROUGE_L", "CIDEr", "SPICE"),
))

setting up scorers...


In [52]:
# Compute every metric and collect the results by metric name.
#   score_dict  : metric name -> first value returned by compute_score
#                 (shown below as the overall score table)
#   scores_dict : metric name -> second value returned by compute_score
#                 (presumably the per-image scores — confirm against pycocoevalcap)
score_dict = {}
scores_dict = {}
for scorer, method in scorers:
    print('computing %s score...'%(scorer.method()))
    score, scores = scorer.compute_score(gts, res)
    # isinstance (rather than an exact type comparison) so that a tuple of names
    # is also unpacked instead of being used as a single dictionary key.
    if isinstance(method, (list, tuple)):
        # One scorer registered under several names (e.g. Bleu_1..Bleu_4):
        # score and scores are sequences aligned with those names.
        for name, overall, per_item in zip(method, score, scores):
            score_dict[name] = overall
            scores_dict[name] = per_item
    else:
        score_dict[method] = score
        scores_dict[method] = scores

computing Bleu score...
{'testlen': 21524, 'reflen': 21224, 'guess': [21524, 19440, 17356, 15272], 'correct': [13527, 7790, 5135, 3541]}
ratio: 1.014134941575527
computing METEOR score...
computing Rouge score...
computing CIDEr score...
computing SPICE score...


In [75]:
# Overall score per metric: build a one-row frame labelled 'baseline_model',
# then transpose so each metric becomes a row.
score_row = pd.DataFrame(score_dict, index=['baseline_model'])
score_df = score_row.transpose()
score_df

Unnamed: 0,baseline_model
Bleu_1,0.628461
Bleu_2,0.501834
Bleu_3,0.420795
Bleu_4,0.362544
METEOR,0.289759
ROUGE_L,0.535357
CIDEr,2.050211
SPICE,0.379176


In [76]:
# process the individual scores
# BLEU SCORE
