In [1]:
from PrepareOriginalData import PrepareData
import numpy as np

# Notebook-wide settings reused by the cells below
taskType = 'all'   # forwarded to PrepareData: 'OpenEnded', 'MultipleChoice' or 'all'
data_amount = 1    # forwarded as cut_data: 1 = all values, above 1 = number of samples (debugging)
epochs = 50        # passed as the `epochs` argument when the networks are built

# Loader for the training split, built with the settings above.
p = PrepareData(
    path_images='data_vqa_feat',   # path to image features
    subset='train2014',            # desired subset: either train2014 or val2014
    taskType=taskType,
    cut_data=data_amount,
    output_path='data',            # path where temporary data is written
    pad_length=32,                 # number of words in a question (zero padded)
    question_threshold=0,          # thresholds: keep only most common words
    answer_threshold=0,
    answers_sparse=True,
    questions_sparse=True,
)

# load_data() returns four values; only the encoded questions and the
# annotations are used by the answer-type classifier cells.
_, questions, _, annotations = p.load_data()
print("Question features", questions.shape)
print("Dictionary size", p.dic_size)

# Persist the dictionary (presumably the file reloaded by the validation cell -- confirm path)
p.dumpDictionary('dictionary_all_types')

Using TensorFlow backend.


loading VQA annotations and questions into memory...
0:00:04.023371
creating index...
index created!
loading VQA annotations and questions into memory...
0:00:09.406677
creating index...
index created!
Question features (496698, 32)
Dictionary size 14178


In [2]:
# Integer class per annotation, derived from its 'answer_type':
#   'number' -> 2, 'yes/no' -> 1, anything else -> 0
answer_type_codes = {'number': 2, 'yes/no': 1}
y = np.array([answer_type_codes.get(ann['answer_type'], 0) for ann in annotations])
print(y.shape)

(496698,)


In [3]:
# Alternative baseline, kept for reference only (not executed):
#from sklearn.ensemble import RandomForestClassifier
#classifier = RandomForestClassifier()
#classifier.fit(questions, y)

# NOTE: a later cell imports a different class under the same name
# `NeuralNetwork` (from NeuralNetworkYesNo); this cell re-imports its own
# version, so it stays correct when re-run on its own.
from NeuralNetworkQuestionType import NeuralNetwork

# Answer-type classifier. Positional arguments are: the padded question
# length (questions.shape[1]), the dictionary size, and 3 -- the number of
# label values (0, 1, 2) produced by the label cell.
# `epochs` comes from the shared constant defined in the first cell instead of
# a second hard-coded 50, so changing the constant affects both networks.
neuralnetQuestionType = NeuralNetwork(
    questions.shape[1],
    p.dic_size,
    3,
    epochs=epochs,
    batchSize=64,
    loss='sparse_categorical_crossentropy',
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_input (InputLayer)      (None, 32)                0         
_________________________________________________________________
word_embedding (Embedding)   (None, 32, 64)            907392    
_________________________________________________________________
flatten_embedding (Flatten)  (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 128)               32896     
__________

In [None]:
# Train the answer-type classifier on the encoded questions and the integer labels `y`.
# NOTE: later cells rebind `questions` (yes/no training subset, then validation data),
# so run this cell right after the label cell, while `questions` and `y` still
# both refer to the full training split.
neuralnetQuestionType.fit(questions, y)

In [None]:
# Restrict training to a single question type
question_type = 'yes/no'

# Reuse the question dictionary of the loader built in the first cell.
shared_question_dict = p._question_dict

# Reload the training split, this time passing answer_type=question_type and
# the precomputed dictionary.
# NOTE: `p` is rebound here; from this point on it refers to this loader.
p = PrepareData(
    path_images='data_vqa_feat',   # path to image features
    subset='train2014',            # desired subset: either train2014 or val2014
    taskType=taskType,             # 'OpenEnded', 'MultipleChoice', 'all'
    cut_data=data_amount,          # 1 = all values, above 1 = number of samples (debugging)
    output_path='data',            # path where temporary data is written
    pad_length=32,                 # number of words in a question (zero padded)
    question_threshold=0,          # thresholds: keep only most common words
    answer_threshold=10,
    answers_sparse=True,
    questions_sparse=True,
    answer_type=question_type,
    precomputed_dic=shared_question_dict,
)

image_features, questions, answers, annotations = p.load_data()
print("Image features", image_features.shape)
print("Question features", questions.shape)
print("Answers", answers.shape)
print("Dictionary size", p.dic_size)
# Answers are integer-encoded, so the largest value + 1 is reported as the class count
print("Number of possible classes", np.max(answers) + 1)

# Persist the dictionary
p.dumpDictionary('dictionary_yes_no')

loading VQA annotations and questions into memory...
0:00:03.904357
creating index...
index created!
loading VQA annotations and questions into memory...
0:00:07.144110
creating index...
index created!
Image features (190604, 1024)
Question features (190604, 32)
Answers (190604, 1)
Dictionary size 14178
Number of possible classes 3


In [None]:
# Extract object features
from ExtractObjects import ExtractObjects

# Consider three thresholds. The same extraction is run once per threshold and
# the resulting matrices are concatenated column-wise (axis=1), in threshold order.
# NOTE(review): every threshold uses the same output_fileName -- confirm that
# ExtractObjects does not reuse/overwrite that file in a threshold-dependent way.
object_thresholds = (25, 50, 75)
object_matrix = np.concatenate(
    [
        ExtractObjects(
            cut_data=data_amount,
            output_fileName='objects_train.txt',
            subset='train2014',
            threshold=threshold,
        ).onehotvector(annotations)
        for threshold in object_thresholds
    ],
    axis=1,
)
print(object_matrix.shape)

# Cache the combined matrix on disk
np.save('data/object_matrix_train_yesno.npy', object_matrix)

In [5]:
from NeuralNetworkYesNo import NeuralNetwork

# Use this when using sparse representation (integer answers with
# sparse_categorical_crossentropy).
n_answer_classes = np.max(answers) + 1   # largest encoded answer + 1

neuralnet = NeuralNetwork(
    image_features.shape[0],   # number of rows in image_features
    1024,                      # equals image_features.shape[1] in the recorded run
    questions.shape[1],        # padded question length
    p.dic_size,                # dictionary size
    n_answer_classes,
    lr=0.001,
    objects=240,               # NOTE(review): presumably the width of object_matrix -- confirm
    epochs=epochs,
    batchSize=512,
    loss='sparse_categorical_crossentropy',
    activation='softmax',
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input (InputLayer)         (None, 32)           0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 32, 32)       453696      word_input[0][0]                 
__________________________________________________________________________________________________
image_input (InputLayer)        (None, 1024)         0                                            
__________________________________________________________________________________________________
flatten_embedding (Flatten)     (None, 1024)         0           word_embedding[0][0]             
__________________________________________________________________________________________________
concatenat

In [6]:
# Train network
# Inputs are the yes/no training arrays loaded above plus the object matrix built
# by the object-extraction cell; that cell must have been run in this kernel
# session so that `object_matrix` exists and matches the training rows.
neuralnet.fit(image_features, object_matrix, questions, answers)

(190604, 1024) (190604, 32) (190604, 1)
Train on 133422 samples, validate on 57182 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [7]:
question_type = 'yes/no'

# Load validation set and evaluate prediction on it.
# NOTE: `image_features`, `questions` and `annotations` are rebound to the
# validation data from here on.
pt = PrepareData(
    path_images='data_vqa_feat',   # path to image features
    subset='val2014',              # desired subset: either train2014 or val2014
    taskType=taskType,             # 'OpenEnded', 'MultipleChoice', 'all'
    cut_data=data_amount,          # 1 = all values, above 1 = number of samples (debugging)
    output_path='data',            # path where temporary data is written
    pad_length=32,
)
pt.loadDictionary('data/dictionary_all_types.pkl')  # use same dictionary as in training

image_features, questions, _, annotations = pt.load_data()
for label, value in (
    ("Image features", image_features.shape),
    ("Question features", questions.shape),
    ("Dictionary size", pt.dic_size),
):
    print(label, value)

loading VQA annotations and questions into memory...
0:00:02.176399
creating index...
index created!
loading VQA annotations and questions into memory...
0:00:03.243434
creating index...
index created!
Image features (243024, 1024)
Question features (243024, 32)
Dictionary size 14178


In [8]:
# Check prediction accuracy of answer-type classifier.
# Ground-truth classes use the same encoding as in training:
#   'number' -> 2, 'yes/no' -> 1, anything else -> 0
answer_type_codes = {'number': 2, 'yes/no': 1}
y = np.array([answer_type_codes.get(ann['answer_type'], 0) for ann in annotations])

# Predict with the answer-type network, passing a stored weights file
#pred = classifier.predict(questions)
weights_file = 'weights/weights-18-0.0089.hdf5'
pred = neuralnetQuestionType.predict(questions, weights_file)
pred.shape

(243024,)

In [9]:
from sklearn.metrics import accuracy_score
# TODO: can probably still improve this accuracy
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy itself is symmetric, so the printed value is unchanged, but
# the documented order keeps this line safe to copy for asymmetric metrics.
print('Answer type classification accuracy:', accuracy_score(y, pred))

Answer type classification accuracy: 0.990206728554


In [14]:
# Filter questions accordingly to their predicted type
question_type_idx = 2 if question_type == 'number' else 1 if question_type == 'yes/no' else 0

# This cell overwrites `image_features` and `questions`, and a later cell
# rebinds `pred`, so it is only valid once per load of the validation data.
# Fail with an explicit message instead of filtering with a stale mask.
if np.ndim(pred) != 1 or len(pred) != len(image_features):
    raise RuntimeError(
        "`pred` does not line up with `image_features`: re-run the validation "
        "loading and answer-type prediction cells before filtering again."
    )

# Boolean mask of samples whose predicted answer type matches `question_type`;
# computed once and applied to every per-sample array.
type_mask = pred == question_type_idx
image_features = image_features[type_mask, :]
questions = questions[type_mask, :]
original_questions = np.array(pt._original_questions)[type_mask]
print(image_features.shape)
print(questions.shape)
print(original_questions.shape)

(92090, 1024)
(92090, 32)
(92090,)


In [None]:
# Consider three thresholds. The same extraction is run once per threshold and
# the resulting matrices are concatenated column-wise (axis=1), in threshold order.
# Relies on `ExtractObjects` having been imported by the training
# object-extraction cell.
# NOTE(review): every threshold uses the same output_fileName -- confirm that
# ExtractObjects does not reuse/overwrite that file in a threshold-dependent way.
# NOTE(review): the prediction cell below calls
# predict_current_state(image_features, questions) without this matrix --
# confirm whether it should be passed.
object_thresholds = (25, 50, 75)
object_matrix = np.concatenate(
    [
        ExtractObjects(
            cut_data=data_amount,
            output_fileName='objects_val.txt',
            subset='val2014',
            threshold=threshold,
        ).onehotvector(original_questions)
        for threshold in object_thresholds
    ],
    axis=1,
)
print(object_matrix.shape)

In [17]:
# Test prediction on validation set
# Alternative: predict from a stored weights file instead of the in-memory state.
# pred = neuralnet.predict(image_features, questions, 'weights/weights-44-0.4226.hdf5')
# NOTE: `pred` is rebound here; it previously held the answer-type predictions
# used as the filter mask, so the filtering cell must not be re-run after this one.
# NOTE(review): fit() received an object matrix but this call does not -- confirm
# that predict_current_state does not need the validation object_matrix.
pred = neuralnet.predict_current_state(image_features, questions)
print(pred.shape)

(92090, 3)


In [18]:
from EvaluateModel import ProduceResult

# Build the evaluator from the answer mappings of `p` (the yes/no training
# loader at this point in the notebook).
model_evaluator = ProduceResult(
    p._int_to_answer,
    p._answer_to_int,
    dataSubType='val2014',
)

# Turn the network output into results for the filtered validation questions.
# NOTE: `answers` is rebound here; it no longer holds the training answers.
answers = model_evaluator.produce_results(pred, original_questions)

# Run the evaluation for the task type chosen in the first cell
model_evaluator.evaluate(taskType=taskType)

loading VQA annotations and questions into memory...
0:00:02.359158
creating index...
index created!
Loading and preparing results...     
DONE (t=0.10s)
creating index...
index created!
computing accuracy
Finshed Percent: [####################] 99% Done computing accuracy


Overall Accuracy is: 74.84

Per Question Type Accuracy is the following:
is the : 72.66
is this an : 75.10
are there : 80.00
is it : 77.09
is this : 74.65
is there a : 85.36
is the person : 75.09
is this a : 77.13
do : 69.75
are the : 72.02
are : 72.90
does this : 75.08
has : 76.46
is the man : 75.31
are they : 74.64
is : 76.32
is this person : 71.16
are these : 73.15
is there : 80.02
do you : 75.62
none of the above : 68.61
does the : 73.37
are there any : 72.74
is he : 75.50
is the woman : 73.87
was : 73.36
could : 85.54
can you : 71.99
is that a : 75.54
how : 3.60
what : 0.00
how many : 0.00


Per Answer Type Accuracy is the following:
yes/no : 76.03
other : 8.04
number : 7.04


loading VQA annotations and quest