In [2]:
import h5py
import json
import numpy as np

In [3]:
data = 'data/'
img_feat = data + 'VQA_image_features.h5'
img_ids = data + 'image_ids_vqa.json'
map_features_to_id = data + 'VQA_img_features2id.json'
img_info = data + 'imgid2imginfo.json'

questions_train = data + 'vqa_questions_train.json'
questions_validation = data + 'vqa_questions_valid.json'
questions_test = data + 'vqa_questions_test.json'

annotations_train = data + 'vqa_annotations_train.json'
annotations_validation = data + 'vqa_annotations_valid.json'
annotations_test = data + 'vqa_annotations_test.json'

In [4]:
def read_images():
    # load computed VQA image features from hdf5 file
    image_features = np.asarray(h5py.File(img_feat, 'r')['img_features'])

    # load IDs file
    with open(img_ids, 'r') as file:
        image_ids = json.load(file)['image_ids']

    # load feature mapping file
    with open(map_features_to_id, 'r') as file:
        feature_mapping = json.load(file)['VQA_imgid2id']

    # load info file
    with open(img_info, 'r') as file:
        image_info = json.load(file)

    return image_ids, image_features, feature_mapping, image_info

In [5]:
def read_text():
    with open(questions_train, 'r') as file:
        q_train = [[x['question'], x['image_id']] for x in json.load(file)['questions']]
    with open(questions_validation, 'r') as file:
        q_validation = [[x['question'], x['image_id']] for x in json.load(file)['questions']]
    with open(questions_test, 'r') as file:
        q_test = [[x['question'], x['image_id']] for x in json.load(file)['questions']]
        
    with open(annotations_train, 'r') as file:
        a_train = [x['multiple_choice_answer'] for x in json.load(file)['annotations']]
    with open(annotations_validation, 'r') as file:
        a_validation = [x['multiple_choice_answer'] for x in json.load(file)['annotations']]
    with open(annotations_test, 'r') as file:
        a_test = [x['multiple_choice_answer']  for x in json.load(file)['annotations']]
    
    return q_train, q_validation, q_test, a_train, a_validation, a_test

In [6]:
## Display image from URL

import os
from PIL import Image
from urllib import request

def show_image(imgid2info, id):
    img_name = 'temp-image.jpg'
    request.urlretrieve(imgid2info[str(id)]['flickr_url'], img_name)

    img = Image.open(img_name)
    img.show()

    os.remove(img_name)
    img.close()

In [7]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

In [8]:
#NUM_EPOCHS = 5
#LEARNING_RATE = 0.001
#RNDM_SEED = 42
#torch.manual_seed(RNDM_SEED)

In [9]:
def image_id_to_features(image_id):
    feat = feature_mapping[str(image_id)]
    return image_features[feat]

In [10]:
q_train, q_validation, q_test, a_train, a_validation, a_test = read_text()

In [11]:
image_ids, image_features, feature_mapping, image_info = read_images()

In [12]:
#print(image_ids)

In [13]:
#visual_feat_mapping['376397']

In [14]:
#img_id = 376397
#h5_id = visual_feat_mapping[str(img_id)]
#img_feat = img_features[h5_id]
#print(img_feat.shape)


In [15]:
## TO DO 
# Increase size of datasets

train_len = int(0.1 * len(q_train)) 
validation_len = int(0.1 * len(q_validation))
test_len =  int(0.1 * len(q_test))

In [16]:
def organize_data():
    # determine train data
    train_text = []
    train_images = []
    for i in range(train_len): 
        train_text.append((q_train[i][0].split(), a_train[i]))
        train_images.append(image_id_to_features(q_train[i][1]))

    # determine validation data
    validation_text = []
    validation_images = []
    for i in range(validation_len): 
        #if str(i) in visual_feat_mapping.keys():
        validation_text.append((q_validation[i][0].split(), a_validation[i]))
        validation_images.append(image_id_to_features(q_validation[i][1]))

    # determine test data
    test_text = []
    test_images = []
    for i in range(test_len): 
        #if str(i) in visual_feat_mapping.keys():
        test_text.append((q_test[i][0].split(), a_test[i]))
        test_images.append(image_id_to_features(q_test[i][1]))
        
    return train_text, train_images, validation_text, validation_images, test_text, test_images


In [17]:
def shuffle_data(text_features, visual_features):
    combined = [(text, visual) for text, visual in zip(text_features, visual_features)]
    random.shuffle(combined)
    return [text for (text, _) in combined], [visual for (_, visual) in combined]

In [18]:
def vocabulary():
    question_vocab = {}
    annotation_vocab = {}
    annotation_vocab_lookup = []
    for question, answer in train_text + validation_text + test_text:
        for word in question:
            if word not in question_vocab:
                question_vocab[word] = len(question_vocab)
        if answer not in annotation_vocab:
            annotation_vocab[answer] = len(annotation_vocab)
            annotation_vocab_lookup.append(answer)
    return question_vocab, annotation_vocab, annotation_vocab_lookup

In [19]:
train_text, train_images, validation_text, validation_images, test_text, test_images = organize_data()
question_vocab, annotation_vocab, annotation_vocab_lookup = vocabulary()

In [20]:
question_vocab_size = len(question_vocab) 
annotation_vocab_size = len(annotation_vocab)
image_feature_length = len(train_images[0])
input_vector_size = question_vocab_size + image_feature_length
print(question_vocab_size)
print(annotation_vocab_size)
print(input_vector_size)
#question_vocab_size+image_feature_length

3653
1350
5701


In [21]:
class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec), dim=1)

In [22]:
def bow_image_vector(question, question_vocab, visual_features): 
    vec = torch.zeros(len(question_vocab)+len(visual_features))
    #print(vec.dtype)
    for word in question:
        vec[question_vocab[word]] += 1
    for i in range(len(visual_features)):
        vec[i+len(question_vocab)] += visual_features[i]
    return vec.view(1, -1)


In [23]:
def make_target(label, annotation_vocab):
    return torch.LongTensor([annotation_vocab[label]])


In [24]:
def train(num_epochs, batch_size):
    
    for ep in range(num_epochs):
        ep_loss = 0
        
        for start in range(0, len(train_text), batch_size):
            text_batch = train_text[start:start+batch_size]
            image_batch = train_images[start:start+batch_size]
            in_mat = torch.zeros(batch_size, input_vector_size)
            out_vec = torch.zeros(batch_size, dtype=torch.long)
        #counter=1 
            if start % 1024 == 0:
                print(start, "/", len(train_text))
                
            for i, ((instance, label), image_features) in enumerate(zip(*shuffle_data(text_batch, image_batch))):

                visual_features = torch.from_numpy(image_features)
                bow_vec = bow_image_vector(instance, question_vocab, visual_features)
                target = make_target(label, annotation_vocab)
                
                in_mat[i] = bow_vec
                out_vec[i] = target
                
            #label_vec[i] = label
            #label_vec = torch.tensor([label]) # sort of works
            #print(label_vec.shape)
            
            log_probs = bow_model(in_mat) 

            batch_loss = loss_function(log_probs, out_vec)
            #print(loss)
            ep_loss += batch_loss
            
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        print(ep, ep_loss)
        
    return bow_model

In [25]:
bow_model = BoWClassifier(annotation_vocab_size, input_vector_size)
num_epochs = 10
learning_rate = 1e-3
batch_size = 32
loss_function = nn.NLLLoss()
optimizer = optim.Adam(bow_model.parameters(), lr=learning_rate)

In [26]:
#trained_model, all_losses = train_model()
trained_model = train(num_epochs, batch_size)    
#print("Trained BoW model:\n", trained_model)
#print("Average loss of each epoch:\n", all_losses)

0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
0 tensor(845.5402, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
1 tensor(427.7195, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
2 tensor(246.8250, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
3 tensor(182.2072, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
4 tensor(147.5235, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
5 tensor(126.1784, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
6 tensor(110.9383, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
7 tensor(99.9860, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
8 tensor(91.0591, grad_fn=<ThAddBackward>)
0 / 4806
1024 / 4806
2048 / 4806
3072 / 4806
4096 / 4806
9 tensor(84.1635, grad_fn=<ThAddBack

In [27]:
def calculate_accuracy(model, text, image_features):
    counter = 0
    for (question, actual_answer), visual_features in zip(text, image_features):
        visual_features = torch.from_numpy(visual_features)
        bow_vec = bow_image_vector(question, question_vocab, visual_features)
        log_probs = model(bow_vec)
        value, index = torch.max(log_probs, 1)
        index = index.data[0]

        predicted_answer = annotation_vocab_lookup[index]
        #print("Question", question)
        #print("Actual Answer", actual_answer)
        #print("Predicted Answer", predicted_answer)
         
        if predicted_answer == actual_answer:
            counter += 1
            
    accuracy = (float(counter) / len(text)) * 100
    return accuracy


In [28]:
accuracy = calculate_accuracy(bow_model, train_text, train_images)
print(accuracy)

87.3491468997087


In [29]:
#sum_len_q = 0
#for i in range(len(train_text)):
#    sum_len_q += len(train_text[i][0])
#
#sum_len_q /= len(train_text)
#print(sum_len_q)
#
#count = 0
#for i in range(len(train_text)):
#    if len(train_text[i][0]) > 10:
#        count += 1
#print(count)  

In [30]:
### TRAINING WITHOUT BATCHES. NOT USED

def train_model():
    current_loss = 0
    losses = []
    
    for iter in range(1, num_epochs+1):
        print("Epoch", iter, "/", num_epochs)
        counter = 1
        for (instance, label), image_features in zip(*shuffle_data(train_text, train_images)):
            if counter % 1000 == 0:
                print(counter, "/", len(train_text))
            counter += 1
            bow_model.zero_grad()
            #for i in range(len(image_features)):
            #    print(image_features[i])
            #print(instance)
            #print(label)
            visual_features = torch.from_numpy(image_features)
            bow_vec = bow_image_vector(instance, question_vocab, visual_features)
            print(bow_vec.shape)
            target = make_target(label, annotation_vocab)
            print(target)
            
            log_probs = bow_model(bow_vec)        
            loss = loss_function(log_probs, target)
            current_loss += loss
            
            loss.backward()
            optimizer.step()
            
        losses.append(current_loss / len(train_text))
                
        print("The average loss of epoch ", iter, " is: ", str(current_loss / len(train_text)))
        current_loss = 0
        
    return bow_model, losses

Question ['What', 'English', 'meal', 'is', 'this', 'likely', 'for?']
Actual Answer tea
Predicted Answer no
Question ['What', 'color', 'is', 'his', 'uniform?']
Actual Answer blue
Predicted Answer blue
Question ['Which', 'girl', 'is', 'wearing', 'glasses?']
Actual Answer right
Predicted Answer no
Question ['What', 'is', 'the', 'person', 'doing?']
Actual Answer sunbathing
Predicted Answer yes
Question ['How', 'does', 'the', 'weather', 'appear', 'in', 'this', 'photo?']
Actual Answer sunny
Predicted Answer no
Question ['What', 'kind', 'of', 'facility', 'are', 'the', 'people', 'standing', 'in?']
Actual Answer greenhouse
Predicted Answer no
Question ['What', 'shape', 'is', 'this?']
Actual Answer octagon
Predicted Answer red
Question ['What', 'color', 'is', 'the', 'Frisbee', 'in', 'the', "man's", 'hand?']
Actual Answer red
Predicted Answer white
Question ['What', 'is', 'this', 'person', 'riding?']
Actual Answer motorcycle
Predicted Answer no
Question ['What', 'color', 'are', 'the', 'frames', 'of', 'the', 'glasses?']
Actual Answer brown
Predicted Answer brown
Question ['What', 'is', 'the', 'dog', 'looking', 'out', 'of?']
Actual Answer window
Predicted Answer no
Question ['How', 'many', 'people', 'in', 'the', 'shot?']
Actual Answer 12
Predicted Answer 2
Question ['What', 'is', 'this', 'animal?']
Actual Answer giraffe
Predicted Answer giraffe
Question ['What', 'is', 'lined', 'up', 'on', 'the', 'counter', 'behind', 'the', 'man?']
Actual Answer wine bottles
Predicted Answer no
Question ['What', 'type', 'of', 'food', 'is', 'the', 'man', 'eating?']
Actual Answer pizza
Predicted Answer pizza
Question ['Is', 'there', 'more', 'meat', 'or', 'vegetables', 'on', 'the', 'plate?']
Actual Answer vegetables
Predicted Answer no
Question ['Where', 'is', 'the', 'man?']
Actual Answer beach
Predicted Answer no
Question ['Is', 'this', 'a', 'board', 'game?']
Actual Answer yes
Predicted Answer no
Question ['Is', 'the', 'photo', 'in', 'black', 'in', 'white?']
Actual Answer yes
Predicted Answer no
Question ['Is', 'this', 'area', 'rural?']
Actual Answer no
Predicted Answer no
Question ['Are', 'there', 'number', 'on', 'the', 'large', 'cubes?']
Actual Answer yes
Predicted Answer no
Question ['Is', 'the', 'bus', 'parked?']
Actual Answer no
Predicted Answer no
Question ['Of', 'what', 'airline', 'is', 'the', 'closest', 'plane', 'in', 'the', 'background?']
Actual Answer world
Predicted Answer no
Question ['What', 'season', 'was', 'this', 'photo', 'likely', 'taken', 'in?']
Actual Answer winter
Predicted Answer no
Question ['Can', 'you', 'see', 'the', 'desktop', 'of', 'the', 'computer?']
Actual Answer yes
Predicted Answer yes
Question ['Is', 'there', 'a', 'stop', 'sign?']
Actual Answer yes
Predicted Answer no
Question ['Is', 'the', 'plane', 'taking', 'off?']
Actual Answer yes
Predicted Answer yes
Question ['What', 'kind', 'of', 'creature', 'is', 'on', 'the', 'right?']
Actual Answer cat
Predicted Answer no
Question ['Are', 'the', 'giraffes', 'in', 'the', 'wild?']
Actual Answer no
Predicted Answer no
Question ['What', 'is', 'the', 'boy', 'reaching', 'for?']
Actual Answer banana
Predicted Answer yes
Question ['What', 'is', 'the', 'material', 'on', 'the', 'ground?']
Actual Answer brick
Predicted Answer yes
Question ['Is', 'this', 'a', 'bat', 'or', 'golf', 'club?']
Actual Answer neither
Predicted Answer no
Question ['Is', 'the', 'room', 'busy?']
Actual Answer no
Predicted Answer no
Question ['Is', 'there', 'carrots', 'on', 'the', 'plate?']
Actual Answer yes
Predicted Answer yes
Question ['Is', 'there', 'anything', 'in', 'this', 'picture', 'than', 'can', 'transfer', 'data', 'to', 'another', 'computer?']
Actual Answer yes
Predicted Answer no
Question ['Is', 'there', 'a', 'sandy', 'beach', 'in', 'the', 'horizon?']
Actual Answer no
Predicted Answer no
Question ['Are', 'these', 'types', 'of', 'planes', 'currently', 'used?']
Actual Answer no
Predicted Answer yes
Question ['What', 'is', 'on', 'the', 'plate?']
Actual Answer donuts
Predicted Answer white
Question ['How', 'many', 'tablecloths', 'are', 'there?']
Actual Answer 2
Predicted Answer 2
Question ['Are', 'the', 'slices', 'of', 'pizza', 'small?']
Actual Answer yes
Predicted Answer yes