In [51]:
import h5py
import json
import numpy as np

In [52]:
data = 'data/'
img_feat = data + 'VQA_image_features.h5'
img_ids = data + 'image_ids_vqa.json'
map_features_to_id = data + 'VQA_img_features2id.json'
img_info = data + 'imgid2imginfo.json'

questions_train = data + 'vqa_questions_train.json'
questions_validation = data + 'vqa_questions_valid.json'
questions_test = data + 'vqa_questions_test.json'

annotations_train = data + 'vqa_annotations_train.json'
annotations_validation = data + 'vqa_annotations_valid.json'
annotations_test = data + 'vqa_annotations_test.json'

In [53]:
def read_images():
    # load computed VQA image features from hdf5 file
    image_features = np.asarray(h5py.File(img_feat, 'r')['img_features'])

    # load IDs file
    with open(img_ids, 'r') as file:
        image_ids = json.load(file)['image_ids']

    # load feature mapping file
    with open(map_features_to_id, 'r') as file:
        feature_mapping = json.load(file)['VQA_imgid2id']

    # load info file
    with open(img_info, 'r') as file:
        image_info = json.load(file)

    return image_ids, image_features, feature_mapping, image_info

In [54]:
def read_text():
    with open(questions_train, 'r') as file:
        q_train = [[x['question'], x['image_id']] for x in json.load(file)['questions']]
    with open(questions_validation, 'r') as file:
        q_validation = [[x['question'], x['image_id']] for x in json.load(file)['questions']]
    with open(questions_test, 'r') as file:
        q_test = [[x['question'], x['image_id']] for x in json.load(file)['questions']]
        
    with open(annotations_train, 'r') as file:
        a_train = [x['multiple_choice_answer'] for x in json.load(file)['annotations']]
    with open(annotations_validation, 'r') as file:
        a_validation = [x['multiple_choice_answer'] for x in json.load(file)['annotations']]
    with open(annotations_test, 'r') as file:
        a_test = [x['multiple_choice_answer']  for x in json.load(file)['annotations']]
    
    return q_train, q_validation, q_test, a_train, a_validation, a_test

In [55]:
## Display image from URL

import os
from PIL import Image
from urllib import request

def show_image(imgid2info, id):
    img_name = 'temp-image.jpg'
    request.urlretrieve(imgid2info[str(id)]['flickr_url'], img_name)

    img = Image.open(img_name)
    img.show()

    os.remove(img_name)
    img.close()
    
#show_image(image_info, 111756)

In [56]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    
print(device)

cuda


In [57]:
def image_id_to_features(image_id):
    feat = feature_mapping[str(image_id)]
    return image_features[feat]

In [58]:
q_train, q_validation, q_test, a_train, a_validation, a_test = read_text()

In [59]:
image_ids, image_features, feature_mapping, image_info = read_images()

In [60]:
## TO DO 
# Increase size of datasets

train_len = int(0.1* len(q_train)) 
validation_len = int(0.1* len(q_validation))
test_len =  int(0.1* len(q_test))

In [61]:
import string
import nltk
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

def organize_data():
    # determine train data
    train_text = []
    train_images = []
    for i in range(train_len): 
        split_words = q_train[i][0].split()
        lower_words = [w.lower() for w in split_words]
        no_punct_words = [w.strip(string.punctuation) for w in lower_words]
       
        train_text.append((no_punct_words, a_train[i]))
        train_images.append(image_id_to_features(q_train[i][1]))

    # determine validation data
    validation_text = []
    validation_images = []
    for i in range(validation_len): 
        split_words = q_validation[i][0].split()
        lower_words = [w.lower() for w in split_words]
        no_punct_words = [w.strip(string.punctuation) for w in lower_words]
        
        validation_text.append((no_punct_words, a_validation[i]))
        validation_images.append(image_id_to_features(q_validation[i][1]))

    # determine test data
    test_text = []
    test_images = []
    for i in range(test_len): 
        split_words = q_test[i][0].split()
        lower_words = [w.lower() for w in split_words]
        no_punct_words = [w.strip(string.punctuation) for w in lower_words]
        
        test_text.append((no_punct_words, a_test[i]))
        test_images.append(image_id_to_features(q_test[i][1]))
        
    return train_text, train_images, validation_text, validation_images, test_text, test_images

In [62]:
def shuffle_data(text_features, visual_features):
    combined = [(text, visual) for text, visual in zip(text_features, visual_features)]
    random.shuffle(combined)
    return [text for (text, _) in combined], [visual for (_, visual) in combined]

In [63]:
from collections import defaultdict
import operator

def select_frequent_answers(train_text, train_images, maxAnswers):
    
    answer_fq= defaultdict(int)
    for question, answer in train_text:
        answer_fq[answer] += 1
    sorted_fq = sorted(answer_fq.items(), key=operator.itemgetter(1), reverse=True)[0:maxAnswers]
    top_answers, top_fq = zip(*sorted_fq)
    new_train_text=[]
    new_train_images=[]
    
    for (ques, ans), img in zip(train_text, train_images):
        if ans in top_answers:
            new_train_text.append((ques, ans))
            new_train_images.append(img)

    return new_train_text, new_train_images

In [64]:
def vocabulary():
    question_vocab = {}
    annotation_vocab = {}
    annotation_vocab_lookup = []
    question_vocab['<unk>'] = 0
    for question, answer in train_text:
        for word in question:
            if word not in question_vocab:
                question_vocab[word] = len(question_vocab)
        #if answer in top_answers:
        if answer not in annotation_vocab:
            annotation_vocab[answer] = len(annotation_vocab)
            annotation_vocab_lookup.append(answer)
    return question_vocab, annotation_vocab, annotation_vocab_lookup

In [65]:
og_train_text, og_train_images, validation_text, validation_images, test_text, test_images = organize_data()
train_text, train_images = select_frequent_answers(og_train_text, og_train_images, 1000)
question_vocab, annotation_vocab, annotation_vocab_lookup = vocabulary()

In [66]:
n_questions = len(train_text)
question_vocab_size = len(question_vocab) 
annotation_vocab_size = len(annotation_vocab)
image_feature_length = len(train_images[0])
print(n_questions)
print(question_vocab_size)
print(annotation_vocab_size)

4676
2407
1000


In [67]:
#print(annotation_vocab)

In [68]:
class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.log_softmax(self.linear(bow_vec), dim=1)

In [69]:
def bow_image_vector(question, question_vocab, visual_features):     
    vec = torch.zeros(len(question_vocab)).cuda()
    for word in question:
        if word not in question_vocab.keys():
            word = '<unk>'
        vec[question_vocab[word]] += 1
    vec = torch.cat((vec, visual_features), dim=0)
    return vec.view(1, -1)

from torch.autograd import Variable
import seq2vec

num_emb = 300
hidden_size = 1024
#lstm = seq2vec.LSTM(list(question_vocab.keys()), num_emb, hidden_size, 1).to(device=device)
#input_vector_size = hidden_size+image_feature_length
input_vector_size = question_vocab_size + image_feature_length

def question2invec(question):
    zeros18 = np.zeros((1,10))
    for idx, word in enumerate(question[:10]):
        zeros18[0][idx] = question_vocab[word]+1
        
    return zeros18

def convert_to_lstm_feature(text):
    q = []
    for question, answer in text:
        qidx = question2invec(question)
        q.append(qidx)
    q = np.asarray(q)
    qt = torch.from_numpy(q[:,0]).long().cuda()
    output = lstm(qt)

    temp = []
    for i in range(len(text)):
        tens = output[i].detach()
        label = text[i][1]
        temp.append((tens,label))
        
    return temp
#temp = convert_to_lstm_feature(train_text)
#temp = train_text
#print(temp[0])

In [70]:
def make_target(label, annotation_vocab):
    return torch.LongTensor([annotation_vocab[label]], device=device).cuda()


In [71]:
def train(num_epochs, batch_size):
    s = True
    for ep in range(num_epochs):
        ep_loss = 0
        
        for start in range(0, len(train_text), batch_size):
            text_batch = train_text[start:start+batch_size]
            image_batch = train_images[start:start+batch_size]
            in_mat = torch.zeros(batch_size, input_vector_size, device=device)
            out_vec = torch.zeros(batch_size, dtype=torch.long, device=device)
            #counter=1 
            #if start % 4096 == 0:
            #    print(start, "/", len(train_text))
                
            for i, ((instance, label), image_features) in enumerate(zip(*shuffle_data(text_batch, image_batch))):
                visual_features = torch.from_numpy(image_features).to(device=device)
                #bow_vec = torch.cat((instance, visual_features)).view(1,-1)
                bow_vec = bow_image_vector(instance, question_vocab, visual_features)
                target = make_target(label, annotation_vocab)
                
                in_mat[i] = bow_vec
                out_vec[i] = target
            
            log_probs = bow_model(in_mat) 

            batch_loss = loss_function(log_probs, out_vec)
            ep_loss += batch_loss
            
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        print(ep, ep_loss)
        
    return bow_model

In [72]:
'''
def train_model():
    current_loss = 0
    losses = []
    
    for iter in range(1, num_epochs+1):
        print("Epoch", iter, "/", num_epochs)
        counter = 1
        for (instance, label), image_features in zip(*shuffle_data(temp, train_images)):
            if counter % 1000 == 0:
                print(counter, "/", len(train_text))
            counter += 1
            bow_model.zero_grad()

            visual_features = torch.from_numpy(image_features).cuda()
            #bow_vec = torch.cat((instance, visual_features)).view(1,-1)
            bow_vec = bow_image_vector(instance, question_vocab, visual_features)
            target = make_target(label, annotation_vocab)
            
            log_probs = bow_model(bow_vec)        
            loss = loss_function(log_probs, target)
            current_loss += loss
            
            loss.backward()
            optimizer.step()
            
        losses.append(current_loss / len(train_text))
                
        print("The average loss of epoch ", iter, " is: ", str(current_loss / len(train_text)))
        current_loss = 0
        
    return bow_model, losses
    
'''

'\ndef train_model():\n    current_loss = 0\n    losses = []\n    \n    for iter in range(1, num_epochs+1):\n        print("Epoch", iter, "/", num_epochs)\n        counter = 1\n        for (instance, label), image_features in zip(*shuffle_data(temp, train_images)):\n            if counter % 1000 == 0:\n                print(counter, "/", len(train_text))\n            counter += 1\n            bow_model.zero_grad()\n\n            visual_features = torch.from_numpy(image_features).cuda()\n            #bow_vec = torch.cat((instance, visual_features)).view(1,-1)\n            bow_vec = bow_image_vector(instance, question_vocab, visual_features)\n            target = make_target(label, annotation_vocab)\n            \n            log_probs = bow_model(bow_vec)        \n            loss = loss_function(log_probs, target)\n            current_loss += loss\n            \n            loss.backward()\n            optimizer.step()\n            \n        losses.append(current_loss / len(train_text)

In [74]:
bow_model = BoWClassifier(annotation_vocab_size, input_vector_size).to(device=device)
num_epochs = 10
learning_rate = 1e-3
batch_size = 32
loss_function = nn.NLLLoss()
optimizer = optim.Adam(bow_model.parameters(), lr=learning_rate)

In [75]:
import time
with torch.cuda.device(0):
    #start = time.time()
    trained_model = train(num_epochs, batch_size)   
    #print("Time taken", time.time()-start)

0 tensor(787.7375, device='cuda:0', grad_fn=<ThAddBackward>)
1 tensor(426.7748, device='cuda:0', grad_fn=<ThAddBackward>)
2 tensor(259.7792, device='cuda:0', grad_fn=<ThAddBackward>)
3 tensor(194.4196, device='cuda:0', grad_fn=<ThAddBackward>)
4 tensor(159.2665, device='cuda:0', grad_fn=<ThAddBackward>)
5 tensor(136.9492, device='cuda:0', grad_fn=<ThAddBackward>)
6 tensor(120.8793, device='cuda:0', grad_fn=<ThAddBackward>)
7 tensor(109.1952, device='cuda:0', grad_fn=<ThAddBackward>)
8 tensor(99.5963, device='cuda:0', grad_fn=<ThAddBackward>)
9 tensor(92.0075, device='cuda:0', grad_fn=<ThAddBackward>)


In [76]:
def calculate_accuracy(model, text, image_features):
    with torch.no_grad():
        counter = 0
        i = 0
        for (question, actual_answer), visual_features in zip(text, image_features):
            visual_features = torch.from_numpy(visual_features).to(device=device)
            #bow_vec = torch.cat((question, visual_features)).view(1,-1)
            bow_vec = bow_image_vector(question, question_vocab, visual_features)
            log_probs = model(bow_vec)
            value, index = torch.max(log_probs, 1)
            index = index.data[0]

            predicted_answer = annotation_vocab_lookup[index]
            #print("Question", validation_text[i])
            #print("Actual Answer", actual_answer)
            #print("Predicted Answer", predicted_answer)

            if predicted_answer == actual_answer:
                counter += 1
            i += 1
        accuracy = (float(counter) / len(text)) * 100
        return accuracy


In [77]:
#temp2 = convert_to_lstm_feature(test_text)
#accuracy = calculate_accuracy(trained_model[0], test_text, test_images)
#print(accuracy)

In [78]:
train_accuracy = calculate_accuracy(trained_model, train_text, train_images)
print(train_accuracy)
validation_accuracy = calculate_accuracy(trained_model, validation_text, validation_images)
print(validation_accuracy)

87.38237810094097
24.414715719063544
