# Training

In [2]:
import numpy as np
import os
import math
from keras.layers import Embedding, Input, Dense, LSTM, RepeatVector,concatenate, Reshape,multiply
from keras.models import Sequential, Model
import keras
from keras.callbacks import ModelCheckpoint, Callback, LearningRateScheduler
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd


def get_emb(tokenizer):
    embeddings_index = {}
    f = open(os.path.join('glove.6B.300d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((5000, 300))
    for word, i in word_index.items():
        if i >= 5000:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def emb_model(n, embedding_matrix,y):
    input_data = Input(shape=[n])
    embedding_layer = Embedding(5000, 300, weights=[embedding_matrix],
                                input_length=25, trainable=False)(input_data)
    model = Model(input_data, embedding_layer)
    return model.predict(y)


Y_emb = np.load('coco_pre_prop_Y_new.npy')
X = np.load('coco_pre_prop_X_new.npy')
X = np.reshape(X, [X.shape[0], X.shape[-1]])
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(Y_emb)
sequences = tokenizer.texts_to_sequences(Y_emb)
data = pad_sequences(sequences, maxlen=25,truncating='post', padding='post')
embedding_matrix = get_emb(tokenizer)
data_input = data[:,:-1]
Y_input = emb_model(24,embedding_matrix, data_input)
Y_target = emb_model(25,embedding_matrix, data)
mask = Y_target.sum(axis = -1, keepdims = True) != 0
mask = np.repeat(mask, 300,axis = 2)
initializer = keras.initializers.RandomUniform(minval=-0.08, maxval=0.08)
image_feature = Input(shape = [2048], name = 'img_input')
dense1 = Dense(300,kernel_initializer=initializer,name = 'dense1')(image_feature)
image_emb = Reshape((1,300), name = 'image_emb')(dense1)
cap_input = Input(shape = [24,300], name = 'cap_input')
merged_input = concatenate([image_emb, cap_input],axis = 1)
lstm_cell = LSTM(300,return_sequences=True,kernel_initializer=initializer,dropout=0.3,name = 'lstm_cell')(merged_input)
mask_input = Input(shape = [25,300], name = 'mask_input')
final_output = multiply([lstm_cell,mask_input])
show_and_tell = Model(inputs = [image_feature,cap_input, mask_input], outputs = [final_output])
def step_decay(epoch):
    initial_lrate = 2
    drop = 0.5
    epochs_drop = 8
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate
class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))
op = optimizers.SGD(lr=0.0, decay=0.0, nesterov=False)
lrate = LearningRateScheduler(step_decay)
history = LossHistory()
show_and_tell.compile(optimizer = op, loss = 'mse')
show_and_tell.fit({'img_input':X, 'cap_input':Y_input,'mask_input':mask}, Y_target,epochs = 1,validation_split=0.3, callbacks=[lrate,history])



Train on 57828 samples, validate on 24784 samples
Epoch 1/1


<keras.callbacks.History at 0x7f0a09445908>

In [4]:
show_and_tell.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
img_input (InputLayer)           (None, 2048)          0                                            
____________________________________________________________________________________________________
dense1 (Dense)                   (None, 300)           614700                                       
____________________________________________________________________________________________________
image_emb (Reshape)              (None, 1, 300)        0                                            
____________________________________________________________________________________________________
cap_input (InputLayer)           (None, 24, 300)       0                                            
___________________________________________________________________________________________

In [6]:
show_and_tell.save_weights('new_model.h5')

# Testing

In [44]:
test_input = Input((1,300))
lstm_cell = LSTM(300,kernel_initializer=initializer,dropout=0.3,name = 'lstm_cell', return_sequences = True)(test_input)
tell = Model(inputs = test_input, outputs = lstm_cell)

In [45]:
tell.load_weights('new_model.h5',by_name=True)

In [46]:
tell.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 1, 300)            0         
_________________________________________________________________
lstm_cell (LSTM)             (None, 1, 300)            721200    
Total params: 721,200.0
Trainable params: 721,200.0
Non-trainable params: 0.0
_________________________________________________________________


In [47]:
w,b = show_and_tell.layers[1].get_weights()

In [48]:
img = X[0].dot(w) + b

In [62]:
def pre(img):
    final = []
    global w,b,embedding_matrix
    img_out = img.dot(w) + b
    img_out = np.reshape(img_out, newshape=[1,1,img_out.shape[0]])
    i = 0 
    while i < 25:
        out = tell.predict(img_out)
        final.append(np.argmax(np.reshape(out,newshape=[300,]).dot(embedding_matrix.T)))
        img_out = out
        i +=1
    return final

In [63]:
pre(X[0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]