In [1]:
from beam import BeamSearch
from functools import partial
from itertools import izip, repeat
import sys
import time
sys.path.append('../coco-caption')
sys.path.append('../show-attend-and-tell-tensorflow')
from core.utils import load_pickle
from pycocoevalcap.bleu.bleu import Bleu

## Load data

In [2]:
# Dataset split whose reference sentences are loaded below.
split = 'train'

# The vocabulary mapping is read from the train directory regardless of `split`.
word_to_idx = load_pickle('../show-attend-and-tell-tensorflow/data/train/word_to_idx.pkl')

# Candidate action words: every vocabulary key except the three special
# tokens, with '.' appended at the end.
words = word_to_idx.keys()
for special_token in ('<START>', '<END>', '<NULL>'):
    words.remove(special_token)
words.append('.')

# Reference data for the chosen split; later cells iterate over its values,
# each of which is a collection of sentences.
references = load_pickle('../show-attend-and-tell-tensorflow/data/{0}/{0}.references.pkl'.format(split))

Loaded ../show-attend-and-tell-tensorflow/data/train/word_to_idx.pkl..
Loaded ../show-attend-and-tell-tensorflow/data/train/train.references.pkl..


## Score function

In [3]:
# Single scorer instance shared by every call to score(); constructed with n=4.
scorer = Bleu(n=4)

def score(hypotheses, reference_sequences):
    """
    Score a batch of hypotheses against one shared set of references.

    Each key of ``hypotheses`` is paired with the very same
    ``reference_sequences`` object, and the two dicts are passed to
    ``scorer.compute_score``.

    :param hypotheses: Dict of hypotheses to compute score.
    :param reference_sequences: List of ground truth sequences; any of them
        counts as correct for every hypothesis.
    :return scores: The second value returned by ``scorer.compute_score``
        (the first one is discarded). Intended to hold the per-hypothesis
        scores -- TODO confirm the exact container type against
        ``Bleu.compute_score``.
    """
    # Every hypothesis key maps to the same reference list.
    shared_references = dict.fromkeys(hypotheses.iterkeys(), reference_sequences)
    _unused, per_hypothesis_scores = scorer.compute_score(shared_references, hypotheses)
    return per_hypothesis_scores

## Beam Search
#### with reference actions
Reference words score as well as, or better than, non-reference words in terms of BLEU. There are only a handful (~50) of reference words to consider, versus potentially 10k+ non-reference words. So by restricting the set of actions available to the expert to the reference words, beam search is dramatically faster and produces the same results.

~0.5s per search with reference word actions

In [4]:
start_time = time.time()
n_beams = 5
n_searches = 2
for counter, (_, reference) in enumerate(references.iteritems()):
    if counter >= n_searches:
        break

    print 'Ground truth:\n \t', '\n \t'.join(reference)
    score_wrapper = partial(score, reference_sequences=reference)
    reference_words = set()
    for sentence in reference:
        reference_words.update(sentence.split(' '))
    beam_search = BeamSearch(n_beams, score_wrapper, reference_words)
    initial_beams = [[sentence.split(' ')[0]] for sentence in reference]
    scores, sequence = beam_search.search(max_length=15, beams=initial_beams)

end_time = time.time()
print ' \n Elapsed time per search: {}s \n \n'.format(float(end_time - start_time) / n_searches)

Ground truth:
 	closeup of bins of food that include broccoli and bread .
 	a meal is presented in brightly colored plastic trays .
 	there are containers filled with different kinds of foods .
 	a bunch of trays that have different food .
 	colorful dishes holding meat vegetables fruit and bread .
{'reflen': 1620, 'guess': [355, 175, 0, 0], 'testlen': 355, 'correct': [350, 7, 0, 0]}
ratio: 0.219135802469
Beam length 2 
 	3.01973833959e-05: closeup of
 	3.01973833959e-05: a bunch
 	3.01973833959e-05: a meal
 	3.01973833959e-05: there are
 	3.01973833959e-05: a bunch
{'reflen': 1575, 'guess': [525, 350, 175, 0], 'testlen': 525, 'correct': [516, 183, 5, 0]}
ratio: 0.333333333333
Beam length 3 
 	0.00427967742473: closeup of bins
 	0.00427967742473: a bunch of
 	0.00427967742473: a meal is
 	0.00427967742473: there are containers
 	0.00427967742473: a bunch of
{'reflen': 1575, 'guess': [700, 525, 350, 175], 'testlen': 700, 'correct': [688, 361, 180, 5]}
ratio: 0.444444444444
Beam length 4

#### With all actions
~360s per search with all vocabulary words as actions

In [5]:
start_time = time.time()
n_beams = 5
n_searches = 2
for counter, (_, reference) in enumerate(references.iteritems()):
    if counter >= n_searches:
        break

    print 'Ground truth:\n \t', '\n \t'.join(reference)
    score_wrapper = partial(score, reference_sequences=reference)
    beam_search = BeamSearch(n_beams, score_wrapper, words)
    initial_beams = [[sentence.split(' ')[0]] for sentence in reference]
    scores, sequence = beam_search.search(max_length=15, beams=initial_beams)

end_time = time.time()
print ' \n Elapsed time per search: {}s \n \n'.format(float(end_time - start_time) / n_searches)

Ground truth:
 	closeup of bins of food that include broccoli and bread .
 	a meal is presented in brightly colored plastic trays .
 	there are containers filled with different kinds of foods .
 	a bunch of trays that have different food .
 	colorful dishes holding meat vegetables fruit and bread .
{'reflen': 1039905, 'guess': [231085, 115540, 0, 0], 'testlen': 231085, 'correct': [115715, 7, 0, 0]}
ratio: 0.222217414091
Beam length 2 
 	3.01973833959e-05: closeup of
 	3.01973833959e-05: a bunch
 	3.01973833959e-05: a meal
 	3.01973833959e-05: there are
 	3.01973833959e-05: a bunch
{'reflen': 1039860, 'guess': [346620, 231080, 115540, 0], 'testlen': 346620, 'correct': [231246, 115548, 5, 0]}
ratio: 0.333333333333
Beam length 3 
 	0.00427967742473: closeup of bins
 	0.00427967742473: a bunch of
 	0.00427967742473: a meal is
 	0.00427967742473: there are containers
 	0.00427967742473: a bunch of
{'reflen': 1039860, 'guess': [462160, 346620, 231080, 115540], 'testlen': 462160, 'correct': [