# 1. MAKE IMAGE DATA DICT

* key: image name
* Create a folder images with all the required data
* Create a image_dict_V2.pickle file for further useage
* value: vector of shape [1,4096], output of VGGNet

In [1]:
import os
import cv2
import pickle
import numpy as np


# imagedictv2: Images of size 100x100

if(os.path.isfile('image_dict.pickle')):
    with open('image_dict.pickle', 'rb') as handle:
        image_dict = pickle.load(handle)
        
else:
  folder = 'images/'

  image_dict_V2 = {}        

  skipped = []
  i=0

  for filename in os.listdir(folder):
      if(i%500 == 0):
          print("{} images processed".format(i))
      i+=1
      if filename in image_dict_V2:
          # print ("already in dict - moving on")
          continue
      try:
          # load an image from file
          image = cv2.imread(os.path.join(folder, filename))
      except:
          print("Error reading file: {}!!!".format(filename))
          skipped.append(filename)
          continue
      if image is not None:
          resized_image = cv2.resize(image, (100, 100)) 
          image_dict_V2[filename] = resized_image
      else:
          skipped.append(filename)

  print("{} files skipped:".format(len(skipped)))
  for f in skipped:
      print("    {}".format(f))
  print("dict created")

# # Save dict pickle

  print("Saving image_dict_V2.pickle")
  with open('image_dict_V2.pickle', 'wb') as handle:
      pickle.dump(image_dict_V2, handle, protocol=pickle.HIGHEST_PROTOCOL)
      print('image_dict_V2.pickle saved')



0 images processed
50 images processed
100 images processed
150 images processed
200 images processed
250 images processed
300 images processed
350 images processed
400 images processed
450 images processed
500 images processed
550 images processed
600 images processed
650 images processed
700 images processed
750 images processed
800 images processed
850 images processed
900 images processed
950 images processed
1000 images processed
1050 images processed
1100 images processed
1150 images processed
1200 images processed
1250 images processed
1300 images processed
1350 images processed
1400 images processed
1450 images processed
1500 images processed
1550 images processed
1600 images processed
1650 images processed
1700 images processed
1750 images processed
1800 images processed
1850 images processed
1900 images processed
1950 images processed
2000 images processed
2050 images processed
2100 images processed
2150 images processed
2200 images processed
2250 images processed
2300 images

# 2. CREATE QUESTION DATA 

* load Quest_Answers.json
* store image ID in a list
* store question in a list
* Store answer in a list

In [2]:
# create 2 lists
# one stores image ids
# one stores question

imageNamesX  = []
questionsNLX = []
answers = []

import json
QAs = json.load(open("Quest_Answers.json", 'r'))['quest_answers']

for QA in QAs:
    img_name = QA["Image"]+".png"
    ques = QA["Question"]
    ans = QA["Answer"]
    
    if img_name not in image_dict_V2:
        print("Skipping {} - not found in dict".format(img))
        continue
    
    imageNamesX.append(img_name)
    questionsNLX.append(ques)
    answers.append(ans)

# 3.0 Create word embeddings

* Using the keras.preprocessing module Tokenize the questions
* Create a word index from the tokenizer object
* Save the word index as a word_index.pickle file

In [3]:
### NOW do word embeddings for questions
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

num_words = 81
tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True,split=' ')
tokenizer.fit_on_texts(questionsNLX)
questionsX = tokenizer.texts_to_sequences(questionsNLX)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

max_length_of_text = 200
questionsX = pad_sequences(questionsX, maxlen=max_length_of_text)

print("Saving word_index.pickle")
import pickle
with open('word_index.pickle', 'wb') as handle:
    pickle.dump(word_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
print("Saved.")

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Found 80 unique tokens.
Saving word_index.pickle
Saved.


# 4.0 Create vector embeddings using Glove

* download the glove embedding file from https://www.kaggle.com/incorpes/glove6b200d
* Use this file to create the embedding matrix
* Save the embedding matrix as embedding_matrix.pickle

In [4]:
# vector embeddings

embeddings_index = {}
    
EMBEDDING_DIM = 200

embedding_matrix = None
if(os.path.isfile('embedding_matrix.pickle')):
    print(">> Embedding Matrix Pickle found...")
    with open('embedding_matrix.pickle', 'rb') as handle:
        embedding_matrix = pickle.load(handle)
    print(">>> loaded!")
else:
    f = open('glove.6B.200d.txt', encoding="utf8")

    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
            
    with open('embedding_matrix.pickle', 'wb') as handle:
        pickle.dump(embedding_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('embedding_matrix.pickle saved')

>> Embedding Matrix Pickle found...
>>> loaded!


# 5.0 One Hot Encoding of target values

* Create a one hot encoding of the answers list created above
* save as label_encoder.pickle

In [5]:
# One hot encode answers

## ONE HOT
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

data = answers
values = array(data)
# print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
# print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

Y = onehot_encoded

print("Saving label_encoder.pickle")
import pickle
with open('label_encoder.pickle', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('label_encoder.pickle saved')

# print(onehot_encoded)
# invert first example
# inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
# print(inverted)

def decode_predictions(label_encoder, predictions):
    texts = []
    for p in predictions:
        text = label_encoder.inverse_transform(argmax(p))
        texts.append(text)
    return texts

label_encoder.classes_

Saving label_encoder.pickle
label_encoder.pickle saved


array(['0', '1', '2', '3', '4', '5', '6', '7', '8', 'False', 'True',
       'blue', 'brown', 'cube', 'cyan', 'cylinder', 'gray', 'green',
       'large', 'metal', 'purple', 'red', 'rubber', 'small', 'sphere',
       'yellow'], dtype='<U11')

# 6.0 Split the created variables 

* Create imageNamesX_train
* Create imageNamesX_test
* Create questionsX_train
* Create questionsX_test
* Create Y_train
* Create Y_test

In [6]:
# Split quesionsX
from sklearn.model_selection import train_test_split
imageNamesX_train, imageNamesX_test, questionsX_train, questionsX_test, Y_train, Y_test = train_test_split(imageNamesX, questionsX, Y, test_size=0.1, random_state=42)

# 7.0 Models and experiments

## 7.1 COMBINING MODELS 

In [14]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM, Flatten, Embedding, Multiply
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D

def img_model(dropout_rate):
    print("Creating image model...")
    model = Sequential()
    model.add(Dense(1024, input_dim=4096, activation='tanh'))
    return model

def Word2VecModel(embedding_matrix, num_words, embedding_dim, seq_length, dropout_rate):
    print("Creating text model...")
    model = Sequential()
    model.add(Embedding(num_words, embedding_dim, 
        weights=[embedding_matrix], input_length=seq_length, trainable=False))
    model.add(LSTM(units=512, return_sequences=True, input_shape=(seq_length, embedding_dim)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=512, return_sequences=False))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1024, activation='tanh'))
    return model

def vqa_model1(embedding_matrix, num_words, embedding_dim, seq_length, dropout_rate, num_classes):
    vgg_model = img_model(dropout_rate)
    lstm_model = Word2VecModel(embedding_matrix, num_words, embedding_dim, seq_length, dropout_rate)
    print("Merging final model...")
    fc_model = Sequential()
    # fc_model.add(Merge([vgg_model, lstm_model], mode='mul'))
    fc_model.add(Multiply([vgg_model, lstm_model]))
    fc_model.add(Dropout(dropout_rate))
    fc_model.add(Dense(1000, activation='tanh'))
    fc_model.add(Dropout(dropout_rate))
    fc_model.add(Dense(num_classes, activation='softmax'))
    fc_model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
        metrics=['accuracy'])
    return fc_model

# 7.2 Create the VQA Model

In [39]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation, Dropout, LSTM, Flatten, Embedding, Multiply, Concatenate, Conv2D, BatchNormalization
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
import keras


def vqa_model(embedding_matrix, num_words, embedding_dim, seq_length, dropout_rate, num_classes):
    
    print("Creating image model...")
    img_model = Sequential()
    img_model.add(Conv2D(24, kernel_size=(3, 3), strides=2, activation='relu'))
    img_model.add(BatchNormalization())
    img_model.add(Conv2D(48, kernel_size=(3, 3), strides=2, activation='relu'))
    img_model.add(BatchNormalization())
    img_model.add(Conv2D(48, kernel_size=(3, 3), strides=2, activation='relu'))
    img_model.add(BatchNormalization())
    img_model.add(Conv2D(64, kernel_size=(3, 3), strides=2, activation='relu'))
    img_model.add(BatchNormalization())
    img_model.add(keras.layers.Flatten())
    
    image_input = Input(shape=(100, 100, 3))
    encoded_image = img_model(image_input)
    
    print(img_model.summary())

    print("Creating text model...")
    txt_model = Sequential()
    txt_model.add(Embedding(num_words, embedding_dim, 
        weights=[embedding_matrix], input_length=seq_length, trainable=False))
    txt_model.add(LSTM(units=128, return_sequences=False, input_shape=(seq_length, embedding_dim)))
    txt_model.add(Dropout(dropout_rate))
#     txt_model.add(AttentionDecoder(512, embedding_dim))
#     txt_model.add(LSTM(units=512, return_sequences=False))
#     txt_model.add(Dropout(dropout_rate))
#     txt_model.add(Dense(1024, activation='tanh'))
    
    question_input = Input(shape=(EMBEDDING_DIM, ), dtype='int32')
    embedded_question = txt_model(question_input)
    
    print(txt_model.summary())
    
    print("Merging final model...")
    merged = keras.layers.concatenate([encoded_image, embedded_question])
    d1  = Dense(512, activation='relu')(merged)
    dp1 = Dropout(dropout_rate)(d1)
#     d2  = Dense(1000, activation='tanh')(dp1)
#     dp2 = Dropout(dropout_rate)(d2)
    output  = Dense(num_classes, activation='softmax')(dp1)
    
    vqa_model = Model(inputs=[image_input, question_input], outputs=output)
    
    
#     fc_model = Sequential()
#     # fc_model.add(Merge([vgg_model, lstm_model], mode='mul'))
#     fc_model.add(Concatenate([img_model, txt_model]))
#     fc_model.add(Dropout(dropout_rate))
#     fc_model.add(Dense(1000, activation='tanh'))
#     fc_model.add(Dropout(dropout_rate))
#     fc_model.add(Dense(num_classes, activation='softmax'))
    
    vqa_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return vqa_model

# 8.0 Create image data generator

In [None]:
### Data generator ###

def generator(image_dict, img_names, questions, labels, batch_size):
    
    q_ptr = 0
    while True:
        image_inp = []
        q_inp = []
        batch_labels = []
        for i in range(batch_size):
            if q_ptr == len(questions):
                q_ptr = 0
            index = q_ptr
#             import random
#             index= random.randint(0, len(questions)-1)
            # print(imageNamesX[q_ptr].shape)
            # print(questionsX[q_ptr])
            image_inp.append(image_dict[img_names[index]])
            q_inp.append(questions[index])
            batch_labels.append(labels[index])
            q_ptr+=1
            
        yield [np.array(image_inp), np.array(q_inp)], np.array(batch_labels)

In [35]:
def generator(image_dict, img_names, questions, labels, batch_size):
    
    q_ptr = 0
    while True:
        image_inp = []
        q_inp = []
        batch_labels = []
        for i in range(batch_size):
            if q_ptr == len(questions):
                q_ptr = 0
            index = q_ptr
            # print(imgsX2[q_ptr].shape)
            # print(quesX2_sq[q_ptr])
            image_inp.append(image_dict[img_names[index]])
            q_inp.append(questions[index])
            batch_labels.append(Y[index])
            q_ptr+=1
        yield [np.array(image_inp), np.array(q_inp)], np.array(batch_labels)

# 10. Fit thedata to the model 

* use model.fit_generator

In [None]:
dropout_rate=0.5
num_classes=26
model_weights_filename = "weights.bkp"
ckpt_model_weights_filename = "checkP.cp"

model = vqa_model(embedding_matrix, num_words, EMBEDDING_DIM, max_length_of_text, dropout_rate, num_classes)
if os.path.exists(model_weights_filename):
    print ("Loading Weights...")
    model.load_weights(model_weights_filename)
    
# print(model.summary())

from keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath=ckpt_model_weights_filename,verbose=1)

batch_size = 64
model.fit_generator(
    generator(image_dict_V2, imageNamesX_train, questionsX_train, Y_train, batch_size),
    validation_data=generator(image_dict_V2, imageNamesX_test, questionsX_test, Y_test, batch_size),
    validation_steps=len(Y_test)/(batch_size-1), steps_per_epoch=len(Y_train)/(batch_size-1), nb_epoch=2, callbacks=[checkpointer])

Creating image model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 49, 49, 24)        672       
_________________________________________________________________
batch_normalization_1 (Batch (None, 49, 49, 24)        96        
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 24, 24, 48)        10416     
_________________________________________________________________
batch_normalization_2 (Batch (None, 24, 24, 48)        192       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 11, 11, 48)        20784     
_________________________________________________________________
batch_normalization_3 (Batch (None, 11, 11, 48)        192       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 5, 5, 64)       



Epoch 1/2
  88/1928 [>.............................] - ETA: 7:38:20 - loss: 3.0405 - acc: 0.1772

### EXPERIMENTS ###

In [None]:
dropout_rate=0.5
num_classes=26
model_weights_filename = "weightsV2.bkp"
ckpt_model_weights_filename = "checkpointsV2/checkP.cp"

model = None
if os.path.exists(ckpt_model_weights_filename):
    print("Loading model: {}".format(ckpt_model_weights_filename))
    from keras.models import load_model
    model = load_model(ckpt_model_weights_filename)
else:
    model = vqa_model0(embedding_matrix, num_words, EMBEDDING_DIM, max_length_of_text, dropout_rate, num_classes)
    if os.path.exists(model_weights_filename):
        print ("Loading Weights...")
        model.load_weights(model_weights_filename)
    
# print(model.summary())

from keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath=ckpt_model_weights_filename, verbose=1)

batch_size = 10
model.fit_generator(
    generator(image_dict_V2, imageNamesX_train, questionsX_train, Y_train, batch_size),
    validation_data=generator(image_dict_V2, imageNamesX_test, questionsX_test, Y_test, batch_size),
    validation_steps=len(Y_test)/(batch_size-1), steps_per_epoch=len(Y_train)/(batch_size-1), nb_epoch=1, callbacks=[checkpointer])

In [None]:
dropout_rate=0.5
num_classes=26
model_weights_filename = "weights.bkp"
#ckpt_model_weights_filename = "checkpoints/checkP.cp"

model = vqa_model(embedding_matrix, num_words, EMBEDDING_DIM, max_length_of_text, dropout_rate, num_classes)
if os.path.exists(model_weights_filename):
    print ("Loading Weights...")
    model.load_weights(model_weights_filename)
    
# print(model.summary())

from keras.callbacks import ModelCheckpoint
checkpointer = ModelCheckpoint(filepath=ckpt_model_weights_filename,verbose=1)

batch_size = 128
model.fit_generator(
    generator(image_dict, imageNamesX_train, questionsX_train, Y_train, batch_size),
    validation_data=generator(image_dict, imageNamesX_test, questionsX_test, Y_test, batch_size),
    validation_steps=len(Y_test)/(batch_size-1), steps_per_epoch=len(Y_train)/(batch_size-1), nb_epoch=3, callbacks=[checkpointer])