Source: https://github.com/hlamba28/Automatic-Image-Captioning

In [2]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image
import glob
import pickle
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector, \
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization, Flatten
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
# Folder that holds the pickled dataset, the split lists and the saved weights.
dataDir = "./data"

In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _read_split(path, index_by_name):
    """Read one image name per line from `path` and map each name to its dataset row.

    Returns a `(names, indices)` pair of equally long lists.
    Raises KeyError if a listed name is not a key of `index_by_name`, instead
    of letting a `None` index slip into the array lookups further down.
    """
    with open(path, 'rb') as handle:
        split_names = [line.rstrip().decode('utf-8') for line in handle.readlines()]
    unknown = [name for name in split_names if name not in index_by_name]
    if unknown:
        raise KeyError('%d name(s) in %s are not in the dataset, e.g. %r'
                       % (len(unknown), path, unknown[0]))
    return split_names, [index_by_name[name] for name in split_names]


data_dict = _load_pickle(os.path.join(dataDir, 'bird_data_dict.pkl'))
print ('Done loading data_dict')


X_sentence = np.array(data_dict['X_sentence'], dtype=np.int32) # (11788, 10, 76) numpy array of word IDS
y_sentence = np.array(data_dict['y_sentence'], dtype=np.int32) # (11788, 10, 76) numpy array of next word IDS, or X[:,:,1:]
lengths = np.array(data_dict['lengths'], dtype=np.int32) # (11788, 10) numpy array of lengths
vocab_size = data_dict['num_words'] # no. words in vocab
id_to_word = data_dict['id_to_word'] # dict: integer id to english word

word_to_id = data_dict['word_to_id'] # dict: english word to id
X_img = data_dict['X_img'] # (11788, 8192) numpy array of feature vectors
y_img = data_dict['y_img'] # (11788, ) numpy array of classes
max_length = data_dict['max_length'] # max length of sentence, or X_sentence.shape[-1]
print ('Done extracting objects from data_dict')

id_to_class = _load_pickle(os.path.join(dataDir, 'id_to_class.pkl'))
names = _load_pickle(os.path.join(dataDir, 'names.pkl'))
# dict: name of file to index in dataset (a repeated name keeps its last index)
name_to_index = {n: i for i, n in enumerate(names)}
print ('Done loading id_to_class and name_to_index')

# # ######## TRAIN/VAL SPLIT ##########
train, train_indices = _read_split(os.path.join(dataDir, 'train_no_cub.txt'), name_to_index)
val, val_indices = _read_split(os.path.join(dataDir, 'val_no_cub.txt'), name_to_index)

# The two splits must not share any image.
train_val_overlap = bool(set(val_indices) & set(train_indices))
assert not train_val_overlap

X_img_train, y_img_train, X_sentence_train, y_sentence_train, lengths_train = X_img[train_indices, :], y_img[train_indices], X_sentence[train_indices], y_sentence[train_indices], lengths[train_indices]
X_img_val, y_img_val, X_sentence_val, y_sentence_val, lengths_val = X_img[val_indices, :], y_img[val_indices], X_sentence[val_indices], y_sentence[val_indices], lengths[val_indices]
print ("Done splitting into train/validation")

# ######## Expand Descriptions ########
# # 10 descriptions per image, flatten descriptions
#X_sentence_train = X_sentence_train.reshape(X_sentence_train.shape[0]*X_sentence_train.shape[1], max_length)
#X_sentence_val = X_sentence_val.reshape(X_sentence_val.shape[0]*X_sentence_val.shape[1], max_length)

#y_sentence_train = y_sentence_train.reshape(y_sentence_train.shape[0]*y_sentence_train.shape[1], max_length)
#y_sentence_val = y_sentence_val.reshape(y_sentence_val.shape[0]*y_sentence_val.shape[1], max_length)

# lengths_train = lengths_train.flatten()
# lengths_val = lengths_val.flatten()

# # Repeat image 10 times to match description flattening
#X_img_exp_train = np.tile(X_img_train, 10).reshape(X_img_train.shape[0]*10, X_img_train.shape[1])
#X_img_exp_val = np.tile(X_img_val, 10).reshape(X_img_val.shape[0]*10, X_img_val.shape[1])
#print ("Done Expanding descriptions\n")

Done loading data_dict
Done extracting objects from data_dict
Done loading id_to_class and name_to_index
Done splitting into train/validation


In [23]:
def buildingModel(vocabSize, maxLength):
    """Build the next-word prediction network.

    The model takes a `(maxLength, vocabSize)` sequence and passes it through
    Dropout(0.5), a 256-unit LSTM, a 256-unit ReLU Dense layer and a final
    softmax Dense layer with `vocabSize` outputs.

    Returns the uncompiled Keras `Model`.
    """
    sequenceShape = (maxLength, vocabSize)
    modelInput = Input(shape=sequenceShape)

    # Layers are created in the order they are applied.
    stack = (
        Dropout(0.5),
        LSTM(256, input_shape=sequenceShape),
        Dense(256, activation='relu'),
        Dense(vocabSize, activation='softmax'),
    )

    tensor = modelInput
    for layer in stack:
        tensor = layer(tensor)

    return Model(inputs=modelInput, outputs=tensor)

In [5]:
# Sanity check: dimensions of the training caption-id array.
print(np.shape(X_sentence_train))

(4000, 10, 76)


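`inSeq = pad_sequences([inSeq], maxlen=maxLength)[0]` pads the word ids with zeros up to `maxLength`; Keras pads at the front by default (`padding='pre'`), e.g. `[0, 0, ..., 0, 43, 12, 35, 67]`.
`inSeq = np.array([id_to_hotend[x] for x in ...])` then replaces every id in that padded sequence with its one-hot vector (e.g. id 3 becomes `[0,0,0,1,0,0,0]`), giving a `(maxLength, vocab_size)` matrix.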

In [15]:
# One one-hot row per vocabulary id, then a lookup table from id to its row.
hotends = to_categorical(list(word_to_id.values()), vocab_size)
id_to_hotend = {wordId: vector for wordId, vector in zip(word_to_id.values(), hotends)}

def hotend_to_id(hotend):
    """Return the position of the largest entry of `hotend`.

    For a one-hot vector this is the id it encodes. Multi-dimensional input is
    searched as if flattened.
    """
    return np.argmax(hotend)

# Data generator, intended to be used in a call to model.fit_generator()
def _oneHotPrefix(prefixIds, maxLength, vocabSize):
    """One-hot encode `prefixIds` into a `(maxLength, vocabSize)` float32 matrix.

    The ids are left-padded with id 0 up to `maxLength` (only the last
    `maxLength` ids are kept if there are more), and every row, padding
    included, is the one-hot vector of its id.
    """
    prefixIds = np.asarray(prefixIds, dtype=np.int64)[-maxLength:]
    paddedIds = np.zeros(maxLength, dtype=np.int64)
    if len(prefixIds):
        paddedIds[-len(prefixIds):] = prefixIds
    encoded = np.zeros((maxLength, vocabSize), dtype=np.float32)
    encoded[np.arange(maxLength), paddedIds] = 1.0
    return encoded


def dataGenerator(descriptions, wordtoid, maxLength, vocabSize, batchSize=32):
    """Endlessly yield `(inputs, targets)` batches for `model.fit_generator()`.

    Parameters
    ----------
    descriptions : sequence indexed as [image][caption][position], holding word ids.
    wordtoid     : unused; kept so existing calls keep working.
    maxLength    : number of time steps every input sequence is padded to.
    vocabSize    : width of the one-hot vectors.
    batchSize    : number of images whose captions make up one batch.

    Yields
    ------
    A tuple `(x, y)`: `x` has shape `(samples, maxLength, vocabSize)` and `y`
    has shape `(samples, vocabSize)`. Each caption `d` contributes one sample
    per position `i >= 1`: the one-hot encoded prefix `d[:i]` as input and the
    one-hot of `d[i]` as target.

    Raises
    ------
    ValueError if `descriptions` is empty or `batchSize < 1`, since either
    would make the generator loop forever without yielding.
    """
    if batchSize < 1:
        raise ValueError('batchSize must be >= 1, got %r' % (batchSize,))
    if len(descriptions) == 0:
        raise ValueError('descriptions is empty; nothing to generate')

    # TODO: include xMarkov when the Markov chain is implemented.
    # NOTE(review): the captions appear to be padded to a fixed length, so
    # padding positions also produce samples -- consider trimming each caption
    # with the dataset's `lengths` array; confirm against the data.
    xSentence, ySentence = [], []
    imagesInBatch = 0
    while True:
        # A separate loop name keeps `descriptions` intact for the next pass,
        # and iterating it directly visits every image, including the last.
        for imageDescriptions in descriptions:
            imagesInBatch += 1
            for d in imageDescriptions:
                for i in range(1, len(d)):
                    # input: the first i words; output: the word that follows
                    xSentence.append(_oneHotPrefix(d[:i], maxLength, vocabSize))
                    target = np.zeros(vocabSize, dtype=np.float32)
                    target[d[i]] = 1.0
                    ySentence.append(target)
            # yield the batch data
            if imagesInBatch == batchSize:
                yield array(xSentence), array(ySentence)
                xSentence, ySentence = [], []
                imagesInBatch = 0

In [24]:
# Instantiate the network for this vocabulary / sequence length and list its layers.
lstm = buildingModel(vocabSize=vocab_size, maxLength=max_length)
lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 76, 6895)          0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 76, 6895)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               7323648   
_________________________________________________________________
dense_5 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_6 (Dense)              (None, 6895)              1772015   
Total params: 9,161,455
Trainable params: 9,161,455
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Cross-entropy against the one-hot next-word targets, optimised with Adam.
lstm.compile(loss='categorical_crossentropy', optimizer='adam')

# Checkpoints are written inside <dataDir>/models; create the folder up front
# so the first save cannot fail on a missing directory.
checkpointDir = os.path.join(dataDir, 'models')
os.makedirs(checkpointDir, exist_ok=True)

# Keep a weights file only when the validation loss improves.
callback = ModelCheckpoint(os.path.join(checkpointDir, 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'), monitor='val_loss', save_best_only=True)

In [26]:
# Training schedule: number of epochs, and images per generator batch.
nEpochs, batchSize = 10, 1

In [27]:
# One endless batch generator per split; everything but the captions is shared.
trainGenerator, valGenerator = (
    dataGenerator(splitSentences, word_to_id, max_length, vocab_size, batchSize)
    for splitSentences in (X_sentence_train, X_sentence_val)
)

In [None]:
# Generators cannot be indexed, so pull one batch with next(). A throw-away
# generator over a two-image, one-caption slice keeps the preview small and
# leaves trainGenerator untouched for training.
previewX, previewY = next(dataGenerator(X_sentence_train[:2, :1], word_to_id, max_length, vocab_size, 1))
previewX.shape, previewY.shape

In [None]:
# Steps per epoch = number of images in the split // images per batch.
trainSteps = len(X_sentence_train) // batchSize
valSteps = len(X_sentence_val) // batchSize

lstm.fit_generator(
    trainGenerator,
    steps_per_epoch=trainSteps,
    epochs=nEpochs,
    callbacks=[callback],
    validation_data=valGenerator,
    validation_steps=valSteps,
)

Epoch 1/10
  12/4000 [..............................] - ETA: 150349s - loss: 7.0995