In [38]:
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dropout, Dense
import numpy as np
import random
import json
import nltk
import itertools

In [32]:
# Keep results reproducible (remove randomness between runs).
import tensorflow as tf
import os

# NOTE: Python reads PYTHONHASHSEED when the interpreter starts, so assigning
# it here cannot change hash randomisation for the already-running kernel;
# export it before launching the kernel if that matters for your run.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
random.seed(12345)

# tf.set_random_seed is the TF 1.x spelling and is absent from the top-level
# namespace in TF 2.x, where tf.random.set_seed replaces it. Use whichever exists.
if hasattr(tf, 'set_random_seed'):
    tf.set_random_seed(1234)
else:
    tf.random.set_seed(1234)

# Optional (TF 1.x only): set parallelism to 1 to prevent randomness due to
# multicore execution. Left disabled because it slows everything down.
# from keras import backend as K
# session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
# K.set_session(sess)

In [33]:
# Load mscoco dataset: http://images.cocodataset.org/annotations/annotations_trainval2014.zip
# Context managers close the file handles as soon as parsing is done, and the
# explicit encoding avoids depending on the platform's default locale.
with open('data/annotations/captions_train2014.json', encoding='utf-8') as train_file:
    mscoco_train = json.load(train_file)
with open('data/annotations/captions_val2014.json', encoding='utf-8') as val_file:
    mscoco_val = json.load(val_file)

In [34]:
# Load captions and print a couple of random samples from each split as a sanity check
captions_train = [x['caption'] for x in mscoco_train['annotations']]
captions_val = [x['caption'] for x in mscoco_val['annotations']]
print("Total training captions: {}".format(len(captions_train)))
print("Total validation captions: {}".format(len(captions_val)))
# Plain loops instead of list comprehensions: the comprehensions were run only
# for their print side effect, and assigning the result to `_` shadows
# IPython's last-output variable.
print("Sample training captions:")
for caption in random.sample(captions_train, 2):
    print('\t', caption)
print("Sample validation captions:")
for caption in random.sample(captions_val, 2):
    print('\t', caption)

Total training captions: 414113
Total validation captions: 202654
Sample training captions:
	 A group of men on horses and pack mules on top of a high ridge.
	 Some baseball players are playing a game. 
Sample validation captions:
	 Their is a toilet next to an opaque window. 
	 A man in an orange robe holding a red umbrella.


In [35]:
# Divide samples between training and validation: the last VALIDATION_SIZE
# validation captions stay in the validation set, the rest join the training set.
VALIDATION_SIZE = 5000
# Split on an explicit index rather than negative slices: with
# VALIDATION_SIZE == 0, captions_val[:-0] is empty and captions_val[-0:] is the
# whole list, which would invert the split.
split_at = max(len(captions_val) - VALIDATION_SIZE, 0)
captions_train = captions_train + captions_val[:split_at]
captions_val = captions_val[split_at:]

In [36]:
# Tokenize captions
class CaptionIndexer:
    """Word-level vocabulary that converts captions into lists of integer indices.

    After ``fit_on_texts`` the index layout is: 0 = padding token, 1 = start
    token, 2 = end token, 3 = unknown token, followed by the most frequent
    corpus tokens in descending frequency order.
    """

    def __init__(self, unknown_token='UNKNOWN_TOKEN', start_token='START_TOKEN',
                 end_token='END_TOKEN', padding_token='PADDING_TOKEN'):
        """Store the strings used for the four special tokens."""
        self.unknown_token = unknown_token
        self.start_token = start_token
        self.end_token = end_token
        self.padding_token = padding_token

    def generate_freqDist(self, texts):
        """Tokenize ``texts`` (lower-cased) and count token frequencies.

        Stores the per-text token lists in ``self.tokens_list`` and the
        frequency distribution in ``self.freqDist``. Progress is printed
        every 10000 texts.
        """
        print('Tokenizing texts for training')
        self.tokens_list = []
        for i, x in enumerate(texts, 1):
            self.tokens_list.append(nltk.word_tokenize(x.lower()))
            if i % 10000 == 0:
                print("{} texts done".format(i))
        print('Generating freq dist')
        # from_iterable streams the token lists instead of unpacking one
        # positional argument per caption.
        self.freqDist = nltk.FreqDist(itertools.chain.from_iterable(self.tokens_list))

    def fit_on_texts(self, texts, vocab_size):
        """Build ``index2token`` / ``token2index`` for a vocabulary of ``vocab_size``.

        If ``generate_freqDist`` has already been called its result is reused
        (``texts`` is then not re-tokenized); otherwise the frequency
        distribution is computed from ``texts`` first.
        """
        if getattr(self, 'freqDist', None) is None:
            self.generate_freqDist(texts)
        self.vocab_size = vocab_size
        # Four slots are reserved for the special tokens; their order fixes
        # the padding index at 0.
        special_tokens = [self.padding_token, self.start_token, self.end_token, self.unknown_token]
        vocab = self.freqDist.most_common(max(self.vocab_size - len(special_tokens), 0))
        self.index2token = special_tokens + [x[0] for x in vocab]
        self.token2index = {w: i for i, w in enumerate(self.index2token)}
        print('Done training')

    def texts_to_indices(self, texts, retokenize=True):
        """Convert captions to index lists wrapped in start/end indices.

        With ``retokenize=True`` the given ``texts`` are lower-cased and
        tokenized. With ``retokenize=False`` ``texts`` is ignored and the
        token lists cached by ``generate_freqDist`` are used instead.
        Out-of-vocabulary tokens map to the unknown-token index.
        """
        print('Transforming texts')
        if retokenize:
            tokens_list = [nltk.word_tokenize(x.lower()) for x in texts]
        else:
            tokens_list = self.tokens_list
        start_index = self.token2index[self.start_token]
        end_index = self.token2index[self.end_token]
        unknown_index = self.token2index[self.unknown_token]
        indices_list = [
            [start_index]
            + [self.token2index.get(token, unknown_index) for token in tokens]
            + [end_index]
            for tokens in tokens_list
        ]
        print('Done transforming')
        return indices_list
    
def pad_indices(indices_list, maxlen):
    """Pad or truncate every index list in ``indices_list`` to ``maxlen`` entries.

    Padding is added at the front ('pre') and overlong sequences are cut at
    the end ('post'). Returns whatever ``pad_sequences`` returns.
    """
    # Imported locally because the notebook's import cell does not bring
    # pad_sequences into scope, so the bare name raised NameError.
    from keras.preprocessing.sequence import pad_sequences
    return pad_sequences(indices_list, maxlen=maxlen, padding='pre', truncating='post',
                         value=0)  # 0 is padding index

In [39]:
# VOCAB_SIZE has to exist before the indexer is fitted; it was previously
# defined only in the model-configuration cell further down, so this cell
# failed on a fresh kernel. Keep the two values in sync.
VOCAB_SIZE = 4096
indexer = CaptionIndexer()
indexer.generate_freqDist(captions_train)
indexer.fit_on_texts(captions_train, vocab_size=VOCAB_SIZE)

Tokenizing texts for training
10000 texts done
20000 texts done
30000 texts done
40000 texts done
50000 texts done
60000 texts done
70000 texts done
80000 texts done
90000 texts done
100000 texts done
110000 texts done
120000 texts done
130000 texts done
140000 texts done
150000 texts done
160000 texts done
170000 texts done
180000 texts done
190000 texts done
200000 texts done
210000 texts done
220000 texts done
230000 texts done
240000 texts done
250000 texts done
260000 texts done
270000 texts done
280000 texts done
290000 texts done
300000 texts done
310000 texts done
320000 texts done
330000 texts done
340000 texts done
350000 texts done
360000 texts done
370000 texts done
380000 texts done
390000 texts done
400000 texts done
410000 texts done
420000 texts done
430000 texts done
440000 texts done
450000 texts done
460000 texts done
470000 texts done
480000 texts done
490000 texts done
500000 texts done
510000 texts done
520000 texts done
530000 texts done
540000 texts done
550000 

In [40]:
VOCAB_SIZE = 4096  # vocabulary size: passed to the indexer, the Embedding input dim and the softmax width
EMBEDDING_DIM = 128  # size of each token embedding vector
LSTM_NODES = 128  # number of units in the LSTM layer

In [41]:
# Build inference model: a stateful network fed one token (batch of 1,
# sequence length 1) at a time, ending in a softmax over the vocabulary.
inference_model = Sequential([
    Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=1, batch_input_shape=(1, 1)),
    LSTM(LSTM_NODES, return_sequences=False, stateful=True),
    Dropout(0.2),
    Dense(VOCAB_SIZE, activation='softmax'),
])
inference_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, 1, 128)               524288    
_________________________________________________________________
lstm_3 (LSTM)                (1, 128)                  131584    
_________________________________________________________________
dropout_3 (Dropout)          (1, 128)                  0         
_________________________________________________________________
dense_3 (Dense)              (1, 4096)                 528384    
Total params: 1,184,256
Trainable params: 1,184,256
Non-trainable params: 0
_________________________________________________________________


In [137]:
# Load the best model so far (a full saved model) and write only its weights
# to a separate file.
fname = 'model/v2/language_generation-11-1.27.h5'
model = load_model(fname)
weights_fname = 'model/v2/language_generation_weights.h5'
model.save_weights(weights_fname)

# Load the exported weights into the inference model.
# NOTE(review): assumes the trained model's weighted layers match
# inference_model's layers — confirm against the training notebook.
inference_model.load_weights(weights_fname)

In [140]:
# Sample sentences
def sample_sentence(model, max_length=None):
    """Sample one sentence, token by token, from a stateful single-step model.

    Starting from the start token, the model's predicted distribution is
    sampled repeatedly and each sampled token is fed back as the next input.
    Sampling stops when the end token is drawn (it is included in the result).
    If the unknown, padding or start token is drawn, the partial sentence is
    discarded and sampling restarts from the start token.

    Args:
        model: object exposing ``reset_states()`` and ``predict(array)``;
            ``predict`` is called with a (1, 1) array holding a token index.
        max_length: optional cap on the number of tokens returned. ``None``
            (the default) keeps the original unbounded behaviour. Restarts
            reset the count, so they are not bounded by this value.

    Returns:
        List of token strings read from the global ``indexer``.
    """
    start_index = indexer.token2index[indexer.start_token]
    restart_tokens = (indexer.unknown_token, indexer.padding_token, indexer.start_token)

    model.reset_states()
    input_index = np.full((1, 1), start_index)
    sentence = []
    while max_length is None or len(sentence) < max_length:
        # Renormalise in float64: np.random.multinomial rejects probability
        # vectors whose sum exceeds 1, which float32 rounding can cause.
        next_probs = model.predict(input_index).astype('float64')
        next_probs = next_probs / next_probs.sum()
        next_index = np.random.multinomial(1, next_probs.squeeze()).argmax()
        next_word = indexer.index2token[next_index]
        sentence.append(next_word)
        if next_word == indexer.end_token:
            break
        if next_word in restart_tokens:
            # Drawing a special token mid-sentence: reset and start again.
            model.reset_states()
            sentence = []
            input_index = np.full((1, 1), start_index)
        else:
            input_index = np.full((1, 1), next_index)
    return sentence
sentences = [sample_sentence(inference_model) for _ in range(5)]
# enumerate keeps the numbering tied to the actual number of sentences
# instead of a second hard-coded range, and a plain loop avoids building a
# throwaway list just to call print.
for i, sentence in enumerate(sentences, 1):
    print('[{}] '.format(i)+' '.join(sentence))

[1] a man in a gray suit on the beach , END_TOKEN
[2] guy . a single teddy bear END_TOKEN
[3] two two stopped a train motorcycle pieces to on an airplane next to a truck in front of wall with a dog standing . END_TOKEN
[4] white bags END_TOKEN
[5] a girl on a long board on the beach . END_TOKEN
