# Visual Question Answering
We split the dataset: 80% train set and 20% validation set. The batch size is set to 128 to speed up a little bit the training part, same thing for the epochs that are only 2.


In [1]:
COMMIT = False

if COMMIT:
    from PIL import Image
    import numpy as np
    import json
    import cv2
    import tensorflow as tf
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    imgs_path = "/kaggle/input/ann-and-dl-vqa/dataset_vqa/train"
    train_json_path = "/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json"
    test_json_path = "/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json"
    
    SEED = 1234
    DATASET_SPLIT = 0.8
    img_h = 128
    img_w = 128
    BATCH_SIZE = 128
    
    classes = {'0': 0,
               '1': 1,
               '10': 2,
               '2': 3,
               '3': 4,
               '4': 5,
               '5': 6,
               '6': 7,
               '7': 8,
               '8': 9,
               '9': 10,
               'no': 11,
               'yes': 12}
    
    N_CLASSES = len(classes)

In [2]:
if COMMIT:
    # ----------------------TRAIN THE TOKENIZER VOCABULARY----------------------
    if 'tokenizer' not in globals():        # only if it does not exists yet
        # Use the Tokenizer to transform the text (questions) into sequence
        tokenizer = tf.keras.preprocessing.text.Tokenizer()

        with open(train_json_path, 'r') as f:
            data = json.load(f)
            data = data['questions']

            for question in data:
                quest = question['question'].split(" ")
                for i in range(len(quest)):
                    quest[i] = quest[i].replace("?", "")
                #print(quest)

                # Updates internal vocabulary based on the questions of the dataset
                tokenizer.fit_on_texts(quest)            
        f.close()
    words_number = len(tokenizer.word_index) + 1

# Custom Generator

In [3]:
class DataGenerator(tf.keras.utils.Sequence):
    
    def __init__(self, list_IDs, image_path, train_input_questions, max_length, to_fit=True,
                 batch_size=16, dim=(100, 150), n_channels=3, n_classes=13, shuffle=True):
        self.list_IDs = list_IDs
        self.train_input_questions = train_input_questions
        self.image_path = image_path
        self.to_fit = to_fit
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.img_h = dim[0]
        self.img_w = dim[1]
        self.max_length = max_length
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X = self._generate_X(list_IDs_temp)

        if self.to_fit:
            y = self._generate_y(list_IDs_temp)
            return X, y
        else:
            return X

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def _generate_X(self, list_IDs_temp):
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        X2 = np.empty((self.batch_size, self.max_length))

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = self._load_image(self.image_path[ID], self.img_w, self.img_h)
            X2[i,] = (self.train_input_questions[ID]).tolist()
        ole = [X2, X]
        
        return ole

    def _generate_y(self, list_IDs_temp):
        y = np.empty((self.batch_size, 1), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            y[i] = self.list_IDs[ID]

        return y

    def _load_image(self, image_path, img_w, img_h):
        if self.to_fit:
            image = cv2.imread("/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/" + image_path)
        else:
            image = cv2.imread("/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/" + image_path)   
        image = cv2.resize(image, (img_w, img_h))
        image = image/ 255.
        return image

NameError: name 'tf' is not defined

In [4]:
def readTrainJson(data, first, last):
        images = []
        questions = []
        answers = []

        for question in data[first:last]:
            name = question['image_filename']
            quest = question['question'].split(" ")
            for i in range(len(quest)):
                quest[i] = quest[i].replace("?", "")
            ans = question['answer']

            images.append(name)
            questions.append(quest)
            answers.append(classes[ans])
        return images, questions, answers

def readTestJson(data, first, last):
    quest_id = []
    images = []
    questions = []

    for question in data[first:last]:
        qid = question['question_id']
        name = question['image_filename']
        quest = question['question'].split(" ")
        for i in range(len(quest)):
            quest[i] = quest[i].replace("?", "")
        
        quest_id.append(qid)
        images.append(name)
        questions.append(quest)
    return images, questions, quest_id

## Description Generator
We used a custom generator who take the couple (image, question) as input and its answer as output.

In [5]:
if COMMIT:
    #read train JSON file
    with open(train_json_path, 'r') as f:
        train_data = json.load(f)
        train_data = train_data['questions']
    f.close()
    
    #read test JSON file
    with open(test_json_path, 'r') as f:
        test_data = json.load(f)
        test_data = test_data['questions']
    f.close()
    
    
    TOT_QUESTIONS = len(train_data)
    TRAIN_QUESTIONS = int(TOT_QUESTIONS*DATASET_SPLIT)
    VALID_QUESTIONS = TOT_QUESTIONS-TRAIN_QUESTIONS

    #extract images, questions and answer (or quest_id) from the train and test files
    train_images, train_questions, train_answers = readTrainJson(train_data, 0, TRAIN_QUESTIONS)
    valid_images, valid_questions, valid_answers = readTrainJson(train_data, TRAIN_QUESTIONS, TOT_QUESTIONS)
    test_images, test_questions, questions_id = readTestJson(test_data, 0, len(test_data))
    
    sequences = tokenizer.texts_to_sequences(train_questions)
    max_length = max(len(sequence) for sequence in sequences)
    train_input_questions = pad_sequences(sequences, maxlen=max_length)

    sequences = tokenizer.texts_to_sequences(valid_questions)
    valid_input_questions = pad_sequences(sequences, maxlen=max_length)

    tokenizer.fit_on_texts(test_questions)
    sequences = tokenizer.texts_to_sequences(test_questions)
    test_input_questions = pad_sequences(sequences, maxlen=max_length)

    words_number = len(tokenizer.word_index) + 1

    training_generator = DataGenerator(train_answers, train_images, train_input_questions, max_length, batch_size=BATCH_SIZE, dim=(img_h, img_w), n_classes=N_CLASSES)
    validation_generator = DataGenerator(valid_answers, valid_images, valid_input_questions, max_length, batch_size=BATCH_SIZE, dim=(img_h, img_w), n_classes=N_CLASSES)
    test_generator = DataGenerator(questions_id, test_images, test_input_questions,  max_length, to_fit=False, batch_size=1, dim=(img_h, img_w), n_classes=N_CLASSES, shuffle=False)

## CNN & RNN
We used the standard network provided by Keras and than we introduce some changing in order to reach a better result. For example: VGG16 model. 

In [6]:
if COMMIT:
    # Import Keras 
    # import tensorflow as tensorflow
    
    INPUT_SIZE_MERGE = 64

    # Define CNN for Image Input
    base_model = tf.keras.applications.VGG16(input_shape=(img_h, img_w, 3), include_top=False, weights='imagenet')
    for i in range(len(base_model.layers)):
        base_model.layers[i].trainable = False
        
    vision_model = tf.keras.models.Sequential()
    #vision_model.add(tf.keras.layers.Dropout(0.2))
    #global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
    vision_model.add(base_model)
    #vision_model.add(global_average_layer)
    vision_model.add(tf.keras.layers.Dropout(0.1))
    vision_model.add(tf.keras.layers.Flatten())
    vision_model.add(tf.keras.layers.Dense(INPUT_SIZE_MERGE))

    image_input = tf.keras.layers.Input(shape=(img_h, img_w, 3))
    encoded_image = vision_model(image_input)

    # Define RNN for language input
    question_input = tf.keras.layers.Input(shape=[max_length])
    embedded_question = tf.keras.layers.Embedding(input_dim=words_number, output_dim=512, input_length=100)(question_input)
    encoded_question = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(INPUT_SIZE_MERGE, dropout=0.1, recurrent_dropout=0.1, unroll=True))(embedded_question)

    # Combine CNN and RNN to create the final model
    merged = tf.keras.layers.concatenate([encoded_question, encoded_image])
    output = tf.keras.layers.Dense(32)(merged)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(len(classes), activation='softmax')(output)
    vqa_model = tf.keras.models.Model(inputs=[question_input, image_input], outputs=output)
    
    vision_model.summary()
    vqa_model.summary()

## Parameters
We play a lot with the learning rate, optimizer and loss to improve our result but they does not seem to change a lot.


In [7]:
if COMMIT:
    # Optimization params
    # -------------------

    # Loss
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    # learning rate
    lr = 5e-4
    #optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr, rho=0.9)
    # -------------------

    # Validation metrics
    # ------------------

    metrics = ['accuracy']
    # ------------------

    # Compile Model
    #vqa_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    vqa_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [8]:
if COMMIT:
    vqa_model.fit_generator(generator=training_generator,
                            validation_data=validation_generator,
                            epochs=2)
    pred = vqa_model.predict_generator(test_generator)

In [9]:
import os
from datetime import datetime

def create_csv(results, results_dir='./'):

    csv_fname = 'results_'
    csv_fname += datetime.now().strftime('%b%d_%H-%M-%S') + '.csv'

    with open(os.path.join(results_dir, csv_fname), 'w') as f:

        f.write('Id,Category\n')

        for key, value in results.items():
            f.write(str(key) + ',' + str(value) + '\n')

results = {}

for i in range(len(pred)):
    results[test_generator.list_IDs[i]] = np.argmax(pred[i])

create_csv(results)

NameError: name 'pred' is not defined