# Third Kaggle competition - Visual Question Answering

To complete this challenge I chose to use two different networks that perform two complementary tasks at the same time:
- Categorize the question in 7 different classes (numbers, colors, yes or no, weather, actions, left or right, others)
- Perform the VQA task

The final answer is selected, as usual, with *arg_max*, but only among the answer classes that belong to the category predicted by the first network.

This method corrects about 1 to 2% of the answers leading to a proportional gain in the final score.

This method is effective in this challenge because the number of final classes is small (and also because, for example, the weather category includes only one answer).

The VQA network architecture is very similar to the one implemented in [this](http://arxiv.org/pdf/1505.00468.pdf) paper but with some modifications:

- Increased number of neurons in the fully connected layers
- Xception instead of VGG-16
- Different embedding size and other hyperparameters

The training set is split evenly across the answer classes, so that it stays balanced w.r.t. the validation set.

During this challenge I used Weights & Biases, instead of TensorBoard, to collect statistics about how the model is performing and to perform hyperparameter tuning in a simpler way. Here you can find the entire project with all the runs and sweeps: https://wandb.ai/lrsb/kaggle3


# Download dataset

In [None]:
import json

!pip install --upgrade --force-reinstall --no-deps kaggle
!pip install --upgrade wandb

#@markdown Insert here your credentials
kaggle_username = ''#@param {type:'string'}
kaggle_api_key = ''#@param {type:'string'}
wandb_key = ''#@param {type:'string'}

!wandb login {wandb_key}

api_token = {'username': kaggle_username, 'key': kaggle_api_key}

!mkdir ~/.kaggle
with open('/root/.kaggle/kaggle.json', 'w') as kaggle_json:
  json.dump(api_token, kaggle_json)

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c anndl-2020-vqa
!unzip -q anndl-2020-vqa.zip

# Setup

### Making results more reproducible and setting params

In [None]:
import tensorflow as tf

SEED = 1234#@param {type:'number'}
tf.random.set_seed(SEED)

# Every answer the VQA network can output. The position of an answer in this
# list is its class index.
labels = [
  '0', '1', '2', '3', '4', '5',
  'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket', 'blue',
  'bone', 'book', 'boy', 'brown', 'cat', 'chair', 'couch', 'dog', 'floor',
  'food', 'football', 'girl', 'grass', 'gray', 'green', 'left', 'log', 'man',
  'monkey bars', 'no', 'nothing', 'orange', 'pie', 'plant', 'playing', 'red',
  'right', 'rug', 'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel',
  'standing', 'stool', 'sunny', 'table', 'tree', 'watermelon', 'white',
  'wine', 'woman', 'yellow', 'yes',
]

# Reverse lookup: answer text -> class index.
labels_dict = {label: index for index, label in enumerate(labels)}


def _label_ids(*names):
  """Return the class indices of the given answers, in the order given."""
  return [labels_dict[name] for name in names]


def _unassigned_ids(groups):
  """Return, in ascending order, the class indices not listed in any group."""
  taken = {class_id for ids in groups.values() for class_id in ids}
  return [class_id for class_id in range(len(labels)) if class_id not in taken]


# Question categories used by the category network. The order of the keys
# matters: a category's position among the keys is its class index.
categories = {
  'numbers': _label_ids('0', '1', '2', '3', '4', '5'),
  'colors': _label_ids('black', 'blue', 'brown', 'gray', 'green', 'orange',
                       'red', 'white', 'yellow'),
  'yesno': _label_ids('no', 'yes'),
  'weather': _label_ids('sunny'),
  'actions': _label_ids('sitting', 'sleeping', 'standing'),
  'leftright': _label_ids('left', 'right'),
}
# Everything that is not in one of the categories above.
categories['others'] = _unassigned_ids(categories)

# Finer-grained alternative: 'playing' counts as an action and the remaining
# answers are divided into persons and objects.
categories_long = {
  'numbers': _label_ids('0', '1', '2', '3', '4', '5'),
  'colors': _label_ids('black', 'blue', 'brown', 'gray', 'green', 'orange',
                       'red', 'white', 'yellow'),
  'yesno': _label_ids('no', 'yes'),
  'weather': _label_ids('sunny'),
  'actions': _label_ids('playing', 'sitting', 'sleeping', 'standing'),
  'leftright': _label_ids('left', 'right'),
  'persons': _label_ids('boy', 'girl', 'man', 'woman'),
}
categories_long['objects'] = _unassigned_ids(categories_long)

img_w = 299#@param {type:'number'}
img_h = 299#@param {type:'number'}

### Code for creating datasets

In [None]:
import os, numpy as np, json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import image as k_image

class VQADataset(tf.keras.utils.Sequence):
  """Sample-by-sample view over one subset of the VQA data.

  The constructor reads the training annotations, divides them into a training
  and a validation list, fits the tokenizer on all training questions and
  reads the test questions. `subset` then selects which of the three lists
  `__len__` and `__getitem__` operate on.

  Args:
    validation_split: fraction of the entries of each answer class that goes
      to the validation list (the first ones met while iterating the file).
    subset: 'training', 'testing', or anything else for validation.
    only_text: when True, items are (question, one-hot category) pairs and no
      image is loaded.
    preprocessing_function: optional callable applied to every image array.
  """

  def __init__(self, validation_split=0, subset='training', only_text=False, preprocessing_function=None):
    self.subset = subset
    self.only_text = only_text
    self.preprocessing_function = preprocessing_function
    self.train_set = []
    self.valid_set = []
    self.test_set = []
    self.tokenizer = tf.keras.preprocessing.text.Tokenizer()

    with open('/content/VQA_Dataset/train_questions_annotations.json', 'r') as train_file:
      annotations = json.load(train_file)

    entries = list(annotations.values())
    questions = [entry['question'] for entry in entries]

    # NOTE(review): this is len() of the question text itself (characters, if
    # questions are strings), not a token count. It is the padding length of
    # every tokenized question, so models are built with this input length.
    self.max_question_len = max(len(text) for text in questions)

    # Number of training entries per answer class.
    per_label_total = [0] * len(labels_dict)
    for entry in entries:
      per_label_total[labels_dict[entry['answer']]] += 1

    # Fill the validation list class by class until it holds the requested
    # share of that class; everything after that goes to training.
    per_label_taken = [0] * len(labels_dict)
    for entry in entries:
      label = labels_dict[entry['answer']]
      if per_label_taken[label] < per_label_total[label] * validation_split:
        self.valid_set.append(entry)
        per_label_taken[label] += 1
      else:
        self.train_set.append(entry)

    self.tokenizer.fit_on_texts(questions)

    with open('/content/VQA_Dataset/test_questions.json', 'r') as test_file:
      # Keep the question id next to each test entry; question_id() reads it.
      self.test_set = [[question_id, entry] for question_id, entry in json.load(test_file).items()]

  def __len__(self):
    """Return the number of samples in the selected subset."""
    if self.subset == 'testing':
      return len(self.test_set)
    return len(self.train_set if self.subset == 'training' else self.valid_set)

  def __getitem__(self, index):
    """Return one (inputs, target) sample of the selected subset.

    With only_text the inputs are the padded question and the target is the
    one-hot question category; otherwise the inputs are (question, image) and
    the target is the one-hot answer. Test samples get an all-zero target.
    """
    is_test = self.subset == 'testing'
    if is_test:
      elem = self.test_set[index][1]
    else:
      source = self.train_set if self.subset == 'training' else self.valid_set
      elem = source[index]

    token_ids = self.tokenizer.texts_to_sequences([elem['question']])
    question = pad_sequences(token_ids, maxlen=self.max_question_len, padding='post')[0]

    if self.only_text:
      if not is_test:
        answer_id = labels_dict[elem['answer']]
        for category_id, members in enumerate(categories.values()):
          if answer_id in members:
            return question, tf.keras.utils.to_categorical(category_id, num_classes=len(categories))
      return question, [0] * len(categories)

    image_path = os.path.join('/content/VQA_Dataset/Images', elem['image_id'] + '.png')
    image = k_image.img_to_array(k_image.load_img(image_path, target_size=(img_w, img_h)))
    if self.preprocessing_function is not None:
      image = self.preprocessing_function(image)

    if is_test:
      return (question, image), [0] * len(labels_dict)
    target = tf.keras.utils.to_categorical(labels_dict[elem['answer']], num_classes=len(labels_dict))
    return (question, image), target

  def vocabulary_size(self):
    """Return the tokenizer vocabulary size plus one."""
    return 1 + len(self.tokenizer.word_index)

  def question_id(self, index):
    """Return the question id of the test entry at `index`."""
    question_key, _ = self.test_set[index]
    return question_key


def GetVQADatasets(validation_split=0, batch_size=32, preprocessing_function=None):
  """Build the question+image sequences and their repeating tf.data pipelines.

  Args:
    validation_split: per-class fraction routed to the validation subset.
    batch_size: batch size of the training and validation pipelines (the test
      pipeline always uses batches of one sample).
    preprocessing_function: optional callable applied to every loaded image.

  Returns:
    (train_set, train_dataset, valid_set, valid_dataset, test_set, test_dataset)
  """
  train_set = VQADataset(validation_split=validation_split, subset='training', preprocessing_function=preprocessing_function)
  valid_set = VQADataset(validation_split=validation_split, subset='validation', preprocessing_function=preprocessing_function)
  test_set = VQADataset(subset='testing', preprocessing_function=preprocessing_function)

  sample_types = ((tf.int32, tf.float32), tf.float32)

  def to_dataset(sequence, question_len, samples_per_batch):
    # Wrap a VQADataset in a batched pipeline that repeats indefinitely;
    # callers bound each pass with an explicit number of steps.
    sample_shapes = (([question_len], [img_w, img_h, 3]), [len(labels_dict)])
    pipeline = tf.data.Dataset.from_generator(lambda: sequence,
                                              output_types=sample_types,
                                              output_shapes=sample_shapes)
    return pipeline.batch(samples_per_batch).repeat()

  train_dataset = to_dataset(train_set, train_set.max_question_len, batch_size)
  valid_dataset = to_dataset(valid_set, valid_set.max_question_len, batch_size)
  # The test pipeline takes its question length from the validation sequence.
  test_dataset = to_dataset(test_set, valid_set.max_question_len, 1)

  return train_set, train_dataset, valid_set, valid_dataset, test_set, test_dataset

def GetQuestionsDatasets(validation_split=0, batch_size=32):
  """Build the text-only sequences and their repeating tf.data pipelines.

  Each sample is a padded question paired with its one-hot question category.

  Args:
    validation_split: per-class fraction routed to the validation subset.
    batch_size: batch size of the training and validation pipelines (the test
      pipeline always uses batches of one sample).

  Returns:
    (train_set, train_dataset, valid_set, valid_dataset, test_set, test_dataset)
  """
  train_set, valid_set, test_set = [
    VQADataset(validation_split=validation_split, subset=subset_name, only_text=True)
    for subset_name in ('training', 'validation', 'testing')
  ]

  def to_dataset(sequence, question_len, samples_per_batch):
    # Wrap a VQADataset in a batched pipeline that repeats indefinitely;
    # callers bound each pass with an explicit number of steps.
    pipeline = tf.data.Dataset.from_generator(lambda: sequence,
                                              output_types=(tf.int32, tf.int32),
                                              output_shapes=([question_len], [len(categories)]))
    return pipeline.batch(samples_per_batch).repeat()

  train_dataset = to_dataset(train_set, train_set.max_question_len, batch_size)
  valid_dataset = to_dataset(valid_set, valid_set.max_question_len, batch_size)
  # The test pipeline takes its question length from the validation sequence.
  test_dataset = to_dataset(test_set, valid_set.max_question_len, 1)

  return train_set, train_dataset, valid_set, valid_dataset, test_set, test_dataset

### Code for saving testset results

In [None]:
import ntpath, numpy as np

def PredictTestset(vqa_model, cat_model=None, filename='/content/results.csv', filename_corrected='/content/results-cat.csv', preprocessing_function=None, upload=False):
  vqa_datasets = GetVQADatasets(preprocessing_function=preprocess_input)
  vqa_predictions = vqa_model.predict(vqa_datasets[5], steps=len(vqa_datasets[4]), verbose=1)
  if cat_model is not None:
    cat_datasets = GetQuestionsDatasets()
    cat_predictions = cat_model.predict(cat_datasets[5], steps=len(cat_datasets[4]), verbose=1)

  results = {}
  results_corrected = {}
  corrected = 0
  for i in range(0, len(vqa_predictions)):
    key = vqa_datasets[4].question_id(i)
    prediction = vqa_predictions[i]
    results[key] = np.argmax(prediction)

    if cat_model is not None:
      question_category = list(categories.keys())[np.argmax(cat_predictions[i])]
      possible_answer_indexes = categories[question_category]
      final_answer = possible_answer_indexes[np.argmax([prediction[index] for index in possible_answer_indexes])]

      if results[key] != final_answer:
        corrected += 1
      results_corrected[key] = final_answer

  if cat_model is not None:
    print('Corrected ' + str(corrected) + ' of ' + str(len(results)) + ' (' + str(corrected / len(results) * 100) + '%)')
    with open(filename_corrected, 'w') as f:
      f.write('Id,Category\n')
      for key, value in results_corrected.items():
        f.write(key + ',' + str(value) + '\n')

  with open(filename, 'w') as f:
    f.write('Id,Category\n')
    for key, value in results.items():
      f.write(key + ',' + str(value) + '\n')

  if upload:
    !kaggle competitions submit -c anndl-2020-vqa -f $filename_corrected -m 'Autoupload'

# Implementation


### Answer category predictor

In [None]:
import wandb
from wandb.keras import WandbCallback

def GetCatModel(validation_split, batch_size, epochs, embedding_size, dropout, units_1, units_2):
  """Build and train the question-category classifier.

  The network is Embedding -> LSTM -> Dense/Dropout stack -> softmax over the
  question categories, trained on the text-only datasets and logged to
  Weights & Biases through WandbCallback.

  Args:
    validation_split: per-class fraction of the data used for validation.
    batch_size: training and validation batch size.
    epochs: number of training epochs.
    embedding_size: dimension of the word embedding.
    dropout: dropout rate after the LSTM and after the first dense layer.
    units_1: units of the LSTM and of the first dense layer.
    units_2: units of the second dense layer.

  Returns:
    The trained Keras model.
  """
  train_set, train_dataset, valid_set, valid_dataset, _, _ = GetQuestionsDatasets(
      validation_split=validation_split, batch_size=batch_size)

  model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(train_set.vocabulary_size(),
                              embedding_size,
                              input_length=train_set.max_question_len,
                              mask_zero=True),
    tf.keras.layers.LSTM(units_1),
    tf.keras.layers.Dropout(dropout),
    tf.keras.layers.Dense(units_1, activation='relu'),
    tf.keras.layers.Dropout(dropout),
    tf.keras.layers.Dense(units_2, activation='relu'),
    tf.keras.layers.Dense(len(categories), activation='softmax'),
  ])
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  # The pipelines repeat forever, so the length of an epoch is set here:
  # the subset size floor-divided by the batch size.
  train_steps = len(train_set) // batch_size
  valid_steps = len(valid_set) // batch_size
  model.fit(x=train_dataset,
            epochs=epochs,
            steps_per_epoch=train_steps,
            validation_data=valid_dataset,
            validation_steps=valid_steps,
            callbacks=[WandbCallback()])
  return model

#@markdown Set hyperparameters used during training

validation_split = 0.1#@param {type:'number'}
batch_size = 64#@param {type:'number'}
epochs = 4#@param {type:'number'}
embedding_size = 300#@param {type:'number'}
dropout = 0.5#@param {type:'number'}
units_1 = 1024#@param {type:'number'}
units_2 = 1024#@param {type:'number'}

#@markdown Or use hyperparameter optimization

use_hyperparameter_optimization = False#@param {type:'boolean'}

!nvidia-smi

if use_hyperparameter_optimization:
  def RunFitWithHypOpt():
    wandb.init()
    model = GetCatModel(wandb.config.validation_split,
                        wandb.config.batch_size,
                        wandb.config.epochs,
                        wandb.config.embedding_size,
                        wandb.config.dropout,
                        wandb.config.units_1,
                        wandb.config.units_2)

  wandb.agent(wandb.sweep({
    'method': 'bayes',
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize'
    },
    'early_terminate': {
        'type': 'hyperband',
        'min_iter': 3
    },
    'parameters': {
        'validation_split': {
            'values': [0.1, 0.2, 0.4]
        },
        'batch_size': {
            'values': [4, 32, 64, 256]
        },
        'epochs': {
            'value': 4
        },
        'embedding_size': {
            'values': [64, 128, 300]
        },
        'dropout': {
            'values': [0, 0.5]
        },
        'units_1': {
            'values': [512, 1024, 2048]
        },
        'units_2': {
            'values': [256, 1024]
        }
      }
  }, project='kaggle3'), function=RunFitWithHypOpt)

else:
  wandb.init(project='kaggle3', config={
      'validation_split': validation_split,
      'batch_size': batch_size,
      'epochs': epochs,
      'embedding_size': embedding_size,
      'dropout': dropout,
      'units_1': units_1,
      'units_2': units_2
  })

  cat_model = GetCatModel(wandb.config.validation_split,
                        wandb.config.batch_size,
                        wandb.config.epochs,
                        wandb.config.embedding_size,
                        wandb.config.dropout,
                        wandb.config.units_1,
                        wandb.config.units_2)

Best performing model [here](https://wandb.ai/lrsb/kaggle3/runs/muum7bu1)

### VQA

In [None]:
import wandb
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from wandb.keras import WandbCallback
from tensorflow.keras.applications.xception import preprocess_input

def GetVQAModel(validation_split, batch_size, epochs, use_early_stopping, embedding_size, dropout, lstm_units):
  """Build and train the VQA network (frozen Xception image branch x LSTM question branch).

  The image branch takes the output of Xception's second-to-last layer,
  L2-normalizes it and projects it through two dense layers. The question
  branch embeds the tokens, runs two stacked LSTMs and projects the
  concatenated final states to the same width. Both branches are multiplied
  element-wise and classified over all answers.

  Args:
    validation_split: per-class fraction of the data used for validation.
    batch_size: training and validation batch size.
    epochs: maximum number of training epochs.
    use_early_stopping: stop when val_accuracy has not improved for 5 epochs.
    embedding_size: dimension of the word embedding.
    dropout: dropout rate used in the classifier head.
    lstm_units: units of each of the two LSTM layers.

  Returns:
    The trained Keras model holding the weights of the last epoch; the
    weights with the best val_accuracy are saved to '/tmp/checkpoint'.
  """
  datasets = GetVQADatasets(validation_split=validation_split, batch_size=batch_size, preprocessing_function=preprocess_input)
  train_set, train_dataset, valid_set, valid_dataset = datasets[:4]

  # Image branch: pre-trained Xception used as a fixed feature extractor.
  xception = tf.keras.applications.Xception(weights='imagenet', include_top=True, input_shape=(img_w, img_h, 3))
  xception.trainable = False

  cnnnorm = tf.keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, axis=1))(xception.layers[-2].output)
  cnndense1 = tf.keras.layers.Dense(units=2048, activation='relu')(cnnnorm)
  cnn = tf.keras.layers.Dense(units=2048, activation='tanh')(cnndense1)

  # Question branch: embedding followed by two stacked LSTMs.
  embedding_input = tf.keras.Input((train_set.max_question_len,))
  embedding = tf.keras.layers.Embedding(train_set.vocabulary_size(),
                                        embedding_size,
                                        input_length=train_set.max_question_len,
                                        mask_zero=True)(embedding_input)
  lstm1_seq, lstm1_h, lstm1_c = tf.keras.layers.LSTM(units=lstm_units,
                                                     return_sequences=True,
                                                     return_state=True,
                                                     input_shape=(train_set.max_question_len, embedding_size))(embedding)
  # The second LSTM reads the first one's output sequence and starts from its
  # final state (what passing the whole [sequence, h, c] list did implicitly).
  _, lstm2_h, lstm2_c = tf.keras.layers.LSTM(units=lstm_units,
                                             return_sequences=False,
                                             return_state=True)(lstm1_seq, initial_state=[lstm1_h, lstm1_c])
  # Concatenate hidden AND cell state of both layers. With
  # return_sequences=False the LSTM output equals its final hidden state, so
  # concatenating the output with the hidden state (as before) duplicated h
  # and silently dropped the second layer's cell state.
  concat = tf.keras.layers.Concatenate()([lstm1_h, lstm1_c, lstm2_h, lstm2_c])
  lstmdense1 = tf.keras.layers.Dense(units=2048, activation='relu')(concat)
  lstmdense2 = tf.keras.layers.Dense(2048, activation='tanh')(lstmdense1)

  # Fusion and classifier head.
  mul = tf.keras.layers.Multiply()([cnn, lstmdense2])
  drop1 = tf.keras.layers.Dropout(dropout)(mul)
  dense1 = tf.keras.layers.Dense(2048, activation='tanh')(drop1)
  drop2 = tf.keras.layers.Dropout(dropout)(dense1)
  dense2 = tf.keras.layers.Dense(2048, activation='tanh')(drop2)
  dense3 = tf.keras.layers.Dense(units=len(labels_dict), activation='softmax')(dense2)

  model = tf.keras.models.Model(inputs=[embedding_input, xception.input], outputs=dense3)
  model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

  callbacks = [WandbCallback(),
               ModelCheckpoint(filepath='/tmp/checkpoint',
                               save_weights_only=True,
                               monitor='val_accuracy',
                               mode='max',
                               save_best_only=True,
                               verbose=1)]
  if use_early_stopping:
    callbacks.append(EarlyStopping(monitor='val_accuracy', mode='max', patience=5, restore_best_weights=False))

  # The pipelines repeat forever, so the length of an epoch is set here:
  # the subset size floor-divided by the batch size.
  model.fit(x=train_dataset,
            epochs=epochs,
            steps_per_epoch=len(train_set) // batch_size,
            validation_data=valid_dataset,
            validation_steps=len(valid_set) // batch_size,
            workers=8,
            max_queue_size=200,
            use_multiprocessing=True,
            callbacks=callbacks)
  return model

#@markdown Set hyperparameters used during training

validation_split = 0.03#@param {type:'number'}
batch_size = 256#@param {type:'number'}
epochs = 15#@param {type:'number'}
use_early_stopping = False#@param {type:'boolean'}
embedding_size = 512#@param {type:'number'}
dropout = 0.5#@param {type:'number'}
lstm_units = 256#@param {type:'number'}

#@markdown Or use hyperparameter optimization

use_hyperparameter_optimization = False#@param {type:'boolean'}

!nvidia-smi

if use_hyperparameter_optimization:
  def RunFitWithHypOpt():
    wandb.init()
    model = GetVQAModel(wandb.config.validation_split,
                        wandb.config.batch_size,
                        wandb.config.epochs,
                        wandb.config.use_early_stopping,
                        wandb.config.embedding_size,
                        wandb.config.dropout,
                        wandb.config.lstm_units,
                        wandb.config.trainable)
    PredictTestset(model,
                   cat_model=cat_model, 
                   filename=os.path.join(wandb.run.dir, 'results.csv'), 
                   filename_corrected=os.path.join(wandb.run.dir, 'results-cat.csv'), 
                   preprocessing_function=preprocess_input,
                   upload=False)

  cat_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/Kaggle3/cat-model-best.h5')
  wandb.agent(wandb.sweep({
    'method': 'bayes',
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'validation_split': {
            'value': 0.03
        },
        'batch_size': {
            'value': 256
        },
        'epochs': {
            'value': 13
        },
        'use_early_stopping': {
            'value': True
        },
        'embedding_size': {
            'values': [256, 300, 512]
        },
        'dropout': {
            'values': [0, 0.5]
        },
        'lstm_units': {
            'values': [256, 512, 1024]
        }
      }
  }, project='kaggle3'), function=RunFitWithHypOpt)

else:
  wandb.init(project='kaggle3', config={
      'validation_split': validation_split,
      'batch_size': batch_size,
      'epochs': epochs,
      'use_early_stopping': use_early_stopping,
      'embedding_size': embedding_size,
      'dropout': dropout,
      'lstm_units': lstm_units
  })

  model = GetVQAModel(wandb.config.validation_split,
                      wandb.config.batch_size,
                      wandb.config.epochs,
                      wandb.config.use_early_stopping,
                      wandb.config.embedding_size,
                      wandb.config.dropout,
                      wandb.config.lstm_units)
  
  cat_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/Kaggle3/cat-model-best.h5')
  PredictTestset(model, cat_model=cat_model, 
                 filename=os.path.join(wandb.run.dir, 'results.csv'), 
                 filename_corrected=os.path.join(wandb.run.dir, 'results-cat.csv'),
                 preprocessing_function=preprocess_input,
                 upload=False)
  model.load_weights('/tmp/checkpoint')
  PredictTestset(model, cat_model=cat_model, 
                 filename=os.path.join(wandb.run.dir, 'results-restored.csv'), 
                 filename_corrected=os.path.join(wandb.run.dir, 'results-cat-restored.csv'),
                 preprocessing_function=preprocess_input,
                 upload=False)
  wandb.init()

Best performing model [here](https://wandb.ai/lrsb/kaggle3/runs/1710a4ge)

# Utilities

### Predict dataset using a saved model

In [None]:
from tensorflow.keras.applications.xception import preprocess_input

# Folder on the mounted Google Drive that holds the exported models.
saved_models_dir = '/content/drive/MyDrive/Colab Notebooks/Kaggle3'

model = tf.keras.models.load_model(saved_models_dir + '/model-best.h5')
model_last = tf.keras.models.load_model(saved_models_dir + '/model-best-last.h5')
cat_model = tf.keras.models.load_model(saved_models_dir + '/cat-model-best.h5')

# Predict the test set once per VQA model, each writing its own pair of
# plain / category-corrected submission files.
for vqa_model, plain_csv, corrected_csv in (
    (model, '/content/results.csv', '/content/results-cat.csv'),
    (model_last, '/content/results-last.csv', '/content/results-cat-last.csv'),
):
  PredictTestset(vqa_model,
                 cat_model=cat_model,
                 filename=plain_csv,
                 filename_corrected=corrected_csv,
                 preprocessing_function=preprocess_input,
                 upload=False)

In [None]:
# Print the Keras layer-by-layer summary of `model_last`.
model_last.summary()

In [None]:
# Show the configuration dict of the layer named 'dense_2' in `model_last`.
print(model_last.get_layer('dense_2').get_config())