In [1]:
from PrepareDataCNN import PrepareOriginalData
import numpy as np
# Build the VQA data loader with the settings used for this run.
p = PrepareOriginalData(
    # -- data locations --
    path_images='data_vqa_feat',   # path to the image features
    output_path='data',            # where temporary data is written
    # -- what to load --
    subset='train2014',            # either 'train2014' or 'val2014'
    taskType='OpenEnded',          # 'OpenEnded', 'MultipleChoice' or 'all'
    cut_data=1,                    # share of the data to use: 1 = everything, above 1 = 10 samples (debugging)
    image_extractor='RawImages',
    # -- text encoding --
    pad_length=32,                 # number of words per question (zero padded)
    question_threshold=0,          # thresholds: keep only the most common words
    answer_threshold=0,
    questions_sparse=True,
    answers_sparse=True,
)

# Load everything in one go; the loader returns four aligned collections.
image_features, questions, answers, annotations = p.load_data()

# Sanity check: report the shapes and vocabulary / label-space sizes.
print("Image features", image_features.shape)
print("Question features", questions.shape)
print("Answers", answers.shape)
print("Dictionary size", p.dic_size)
# The class count is taken as the largest value found in `answers`, plus one.
print("Number of possible classes", np.max(answers) + 1)

Using TensorFlow backend.


loading VQA annotations and questions into memory...
0:00:04.900708
creating index...
index created!
Image features (248349, 1)
Question features (248349, 32)
Answers (248349, 1)
Dictionary size 20359
Number of possible classes 17090


In [2]:
from NeuralNetworkCNN import NeuralNetwork

In [3]:
# Configuration for the sparse representation: the answers stay as class
# indices, hence the 'sparse_categorical_crossentropy' loss.
neuralnet = NeuralNetwork(
    image_features.shape[0],   # number of rows in image_features
    1024,                      # NOTE(review): meaning is defined by NeuralNetwork's signature -- confirm
    questions.shape[1],        # padded question length
    p.dic_size,                # dictionary size reported by the loader
    np.max(answers) + 1,       # largest answer value plus one
    epochs=5,
    bulkMultiplier=256,
    loss='sparse_categorical_crossentropy',
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input (InputLayer)         (None, 32)           0                                            
__________________________________________________________________________________________________
image_input (InputLayer)        (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 32, 32)       651488      word_input[0][0]                 
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 2304)         5275968     image_input[0][0]                
__________________________________________________________________________________________________
flatten_em

In [None]:
# Show the architecture of the model built above (the recorded output of this
# cell is a layer-by-layer table).
neuralnet.get_model_summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_input (InputLayer)         (None, 32)           0                                            
__________________________________________________________________________________________________
image_input (InputLayer)        (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 32, 32)       651488      word_input[0][0]                 
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 2304)         5275968     image_input[0][0]                
__________________________________________________________________________________________________
flatten_em

In [None]:
# Train network
# Fits the model on the loaded image features, questions and answers.
neuralnet.fit(image_features, questions, answers)

(248349, 1)
input is image stream
Train on 1433 samples, validate on 615 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [4]:
# Place the image features and the question features side by side, so that
# each row holds one sample (both inputs are 2-D, joined along the column axis).
X = np.concatenate((image_features, questions), axis=1)
print(X.shape)

(121512, 1056)


In [6]:
# Test prediction on training set
# NOTE(review): this call passes the stacked matrix X plus a weights file, while
# the other prediction cell in this notebook passes image_features and questions
# as separate arguments -- confirm which call shape the current
# NeuralNetwork.predict expects.
pred = neuralnet.predict(X, 'weights/weights-01-6.4158.hdf5')
print(pred.shape)

KeyboardInterrupt: 

In [5]:
# Test prediction
# Runs the network on the loaded image features and questions, using the given
# checkpoint file (presumably the weights are restored from it -- TODO confirm).
# NOTE(review): the recorded run of this cell failed with a shape mismatch
# ([20359,32] vs [7387,32]). 20359 equals the dictionary size printed by the
# loading cell, so this checkpoint was presumably saved with a different
# vocabulary -- confirm, and use weights trained with the current preprocessing.
pred = neuralnet.predict(image_features,questions, 'weights/weights-01-3.2744.hdf5')

ValueError: Dimension 0 in both shapes must be equal, but are 20359 and 7387 for 'Assign_1' (op: 'Assign') with input shapes: [20359,32], [7387,32].

In [None]:
# Visualize prediction
from IPython.display import Image,display

def visualize_image_question_answer(index, top_k=3):
    """Show one question together with the model's highest-scoring answers.

    Prints the question text stored at ``index`` and then the ``top_k``
    answers with the largest scores in row ``index`` of the global ``pred``
    array, best first, each followed by its score. When ``index`` is a
    multiple of 3 the matching image is displayed before the question.

    Relies on the notebook globals ``p`` (the data preparation object) and
    ``pred`` (the prediction array; presumably one row per question and one
    column per answer class -- TODO confirm).

    Args:
        index: Position of the question, and row of ``pred``, to visualize.
        top_k: Maximum number of answers to list (default 3). If ``pred`` has
            fewer columns than ``top_k``, every available answer is listed
            instead of raising ``IndexError``. Values below 1 list nothing.
    """
    # Only the first of the four returned values is used here; its entries are
    # appended to the image directory to form the file path.
    img_indices, _, _, _ = p.load_question_features()
    if index % 3 == 0:
        # NOTE(review): '2014' is appended to p.subset here. This notebook
        # constructs the loader with subset='train2014', so if p.subset holds
        # that value verbatim the year is doubled -- confirm against the
        # directory layout on disk.
        subset_path = 'vqa_' + p.subset + '2014_img/' + p.subset + '2014/'
        image_path = 'images/' + subset_path + img_indices[index]
        # Please keep these lines, as I use a different path
        #subset_path = 'VQA/Images/mscoco/' + p.subset + '2014/'
        #image_path = subset_path + img_indices[index]
        display(Image(image_path,width=300))

    print(p._original_questions[index] + '?')

    # Rank the classes by descending score and keep at most top_k of them.
    # Slicing after the reversal never yields more entries than there are
    # columns, so nothing below can index out of range.
    ranked = pred[index, :].argsort()[::-1][:max(top_k, 0)]
    top_answers = ''.join(
        p._int_to_answer[class_id] + ': ' + str(pred[index, class_id]) + '; '
        for class_id in ranked
    )
    print(top_answers + '\n')

In [None]:
# Walk through the first twelve questions and show each with its predictions.
for i in range(12):
    visualize_image_question_answer(i)