In [1]:
import h5py
import json
import numpy as np

In [2]:
# Directory prefix for every dataset file; the trailing slash matters because
# the paths below are built by plain string concatenation.
data = 'data/'
# Image-side inputs: HDF5 feature file, image-id list, image-id -> feature-index
# mapping, and per-image metadata (see read_images for how each is consumed).
img_feat = data + 'VQA_image_features.h5'
img_ids = data + 'image_ids_vqa.json'
map_features_to_id = data + 'VQA_img_features2id.json'
img_info = data + 'imgid2imginfo.json'

# Question files, one per split (consumed by read_text).
questions_train = data + 'vqa_questions_train.json'
questions_validation = data + 'vqa_questions_valid.json'
questions_test = data + 'vqa_questions_test.json'

# Annotation (answer) files, one per split (consumed by read_text).
annotations_train = data + 'vqa_annotations_train.json'
annotations_validation = data + 'vqa_annotations_valid.json'
annotations_test = data + 'vqa_annotations_test.json'

In [3]:
def read_images():
    """Load the precomputed image features and their JSON side files.

    Reads the module-level paths ``img_feat``, ``img_ids``,
    ``map_features_to_id`` and ``img_info``.

    Returns:
        tuple: ``(image_ids, image_features, feature_mapping, image_info)``.
        ``image_features`` is a NumPy array holding the whole
        ``'img_features'`` dataset, ``image_ids`` is the ``'image_ids'`` entry
        of the ids file, ``feature_mapping`` is the ``'VQA_imgid2id'`` entry
        of the mapping file, and ``image_info`` is the parsed info file.
    """
    # Materialise the dataset inside a context manager so the HDF5 handle is
    # closed as soon as the in-memory copy exists (it was previously leaked).
    with h5py.File(img_feat, 'r') as features_file:
        image_features = np.asarray(features_file['img_features'])

    # load IDs file
    with open(img_ids, 'r') as file:
        image_ids = json.load(file)['image_ids']

    # load feature mapping file
    with open(map_features_to_id, 'r') as file:
        feature_mapping = json.load(file)['VQA_imgid2id']

    # load info file
    with open(img_info, 'r') as file:
        image_info = json.load(file)

    return image_ids, image_features, feature_mapping, image_info

In [4]:
def read_text():
    """Load questions and answers for the train, validation and test splits.

    Reads the six module-level question/annotation paths, in the order
    questions (train, validation, test) then annotations (train, validation,
    test).

    Returns:
        tuple: ``(q_train, q_validation, q_test, a_train, a_validation,
        a_test)``.  Each ``q_*`` is a list of ``[question, image_id]`` pairs
        and each ``a_*`` is a list of ``'multiple_choice_answer'`` values.
    """
    def load_questions(path):
        # One [question text, image id] pair per entry under 'questions'.
        with open(path, 'r') as handle:
            entries = json.load(handle)['questions']
        return [[entry['question'], entry['image_id']] for entry in entries]

    def load_answers(path):
        # One answer per entry under 'annotations'.
        with open(path, 'r') as handle:
            entries = json.load(handle)['annotations']
        return [entry['multiple_choice_answer'] for entry in entries]

    q_train = load_questions(questions_train)
    q_validation = load_questions(questions_validation)
    q_test = load_questions(questions_test)

    a_train = load_answers(annotations_train)
    a_validation = load_answers(annotations_validation)
    a_test = load_answers(annotations_test)

    return q_train, q_validation, q_test, a_train, a_validation, a_test

In [5]:
## Display image from URL

import os
from PIL import Image
from urllib import request

def show_image(imgid2info, id):
    """Download an image from its Flickr URL and open it in a viewer.

    Args:
        imgid2info: Mapping from image id (as a string) to a dict that has a
            ``'flickr_url'`` entry.
        id: Image id; converted with ``str`` before the lookup.  The name
            shadows the builtin but is kept so keyword callers keep working.

    Raises:
        KeyError: If the id or its ``'flickr_url'`` entry is missing.
    """
    import tempfile  # local import: only this helper needs it

    # A unique temporary file instead of a fixed 'temp-image.jpg' in the
    # working directory, so an unrelated file of that name is never
    # overwritten or deleted.
    fd, img_name = tempfile.mkstemp(suffix='.jpg')
    os.close(fd)
    try:
        request.urlretrieve(imgid2info[str(id)]['flickr_url'], img_name)

        img = Image.open(img_name)
        try:
            img.show()
        finally:
            # Release the handle *before* the file is deleted; deleting a file
            # that is still open fails on some platforms.
            img.close()
    finally:
        # Runs even when the lookup, download or decode fails, so the
        # temporary file never lingers.
        os.remove(img_name)
    
#show_image(image_info, 111756)

In [6]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random

# Master switch: set to False to stay on the CPU even when CUDA is available.
USE_GPU = True

# Resolve the compute device once; later cells move models and tensors to it.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [7]:
def image_id_to_features(image_id):
    """Return the entry of ``image_features`` that belongs to ``image_id``.

    The id is converted to a string, looked up in the global
    ``feature_mapping``, and the result is used to index the global
    ``image_features``.
    """
    return image_features[feature_mapping[str(image_id)]]

In [8]:
# Load the [question, image_id] pairs and the answers for all three splits.
q_train, q_validation, q_test, a_train, a_validation, a_test = read_text()

In [9]:
# Load the image features plus the id list, id -> feature-index mapping and
# metadata; image_id_to_features reads feature_mapping and image_features.
image_ids, image_features, feature_mapping, image_info = read_images()

In [10]:
# Number of questions in each split; organize_data uses these as loop bounds.
train_len = len(q_train)
validation_len = len(q_validation)
test_len = len(q_test)

In [11]:
import string

def organize_data():
    """Tokenise every question and fetch its image features, per split.

    Works on the globals ``q_*``, ``a_*`` and ``*_len`` for the train,
    validation and test splits (processed in that order).

    Returns:
        tuple: ``(train_text, train_images, validation_text,
        validation_images, test_text, test_images)``.  Each ``*_text`` is a
        list of ``(tokens, answer)`` pairs and each ``*_images`` is the
        parallel list of ``image_id_to_features`` results.
    """
    def tokenize(question):
        # Split on whitespace, lower-case, then trim punctuation from both
        # ends of each token.  A token made only of punctuation becomes ''.
        return [token.lower().strip(string.punctuation) for token in question.split()]

    def build_split(questions, answers, count):
        # Walk the first `count` questions, pairing tokens with the answer at
        # the same position and collecting the matching image features.
        text, images = [], []
        for idx in range(count):
            entry = questions[idx]
            text.append((tokenize(entry[0]), answers[idx]))
            images.append(image_id_to_features(entry[1]))
        return text, images

    train_text, train_images = build_split(q_train, a_train, train_len)
    validation_text, validation_images = build_split(q_validation, a_validation, validation_len)
    test_text, test_images = build_split(q_test, a_test, test_len)

    return train_text, train_images, validation_text, validation_images, test_text, test_images

In [12]:
def shuffle_data(text_features, visual_features):
    """Shuffle two parallel sequences with one shared random permutation.

    Args:
        text_features: First sequence.
        visual_features: Second sequence, paired element-wise with the first
            (pairing stops at the shorter of the two).

    Returns:
        tuple: Two new lists, reordered identically so that elements that were
        paired before the shuffle are still paired afterwards.
    """
    paired = list(zip(text_features, visual_features))
    random.shuffle(paired)
    shuffled_text = [pair[0] for pair in paired]
    shuffled_visual = [pair[1] for pair in paired]
    return shuffled_text, shuffled_visual

In [13]:
def vocabulary():
    """Build the word and answer vocabularies from the global ``train_text``.

    Indices are handed out in first-seen order, starting at 0.

    Returns:
        tuple: ``(question_vocab, annotation_vocab, annotation_vocab_lookup)``
        where the first two map word -> index and answer -> index, and the
        third is the list that maps an answer index back to the answer.
    """
    word_to_index = {}
    answer_to_index = {}
    index_to_answer = []
    for tokens, label in train_text:
        for token in tokens:
            # setdefault stores the next free index only for unseen tokens.
            word_to_index.setdefault(token, len(word_to_index))
        if label in answer_to_index:
            continue
        answer_to_index[label] = len(answer_to_index)
        index_to_answer.append(label)
    return word_to_index, answer_to_index, index_to_answer

In [14]:
from collections import defaultdict
import operator

def select_frequent_answers(train_text, train_images, maxAnswers):
    """Keep only the examples whose answer is among the most frequent ones.

    Args:
        train_text: Sequence of ``(question, answer)`` pairs.
        train_images: Sequence parallel to ``train_text``.
        maxAnswers: How many distinct answers to keep, counted from the most
            frequent.  Ties are broken in favour of the answer seen first.

    Returns:
        tuple: ``(new_train_text, new_train_images)``, two lists holding the
        surviving examples in their original order.  Both are empty when
        ``train_text`` is empty or no answer is selected (this used to raise
        ``ValueError``).
    """
    answer_fq = defaultdict(int)
    for _, answer in train_text:
        answer_fq[answer] += 1

    # sorted() is stable, so equally frequent answers keep first-seen order.
    ranked = sorted(answer_fq.items(), key=operator.itemgetter(1), reverse=True)
    # A set gives constant-time membership tests in the filter loop below,
    # instead of scanning a tuple of up to maxAnswers entries per example.
    top_answers = {answer for answer, _ in ranked[0:maxAnswers]}

    new_train_text = []
    new_train_images = []
    for (ques, ans), img in zip(train_text, train_images):
        if ans in top_answers:
            new_train_text.append((ques, ans))
            new_train_images.append(img)

    return new_train_text, new_train_images

In [15]:
# Tokenise all splits, then restrict the training split to examples whose
# answer is one of the 1000 most frequent training answers.
og_train_text, og_train_images, validation_text, validation_images, test_text, test_images = organize_data()
train_text, train_images = select_frequent_answers(og_train_text, og_train_images, 1000)
# vocabulary() reads the global train_text, so it sees the filtered split.
question_vocab, annotation_vocab, annotation_vocab_lookup = vocabulary()

n_questions = len(train_text)
annotation_vocab_size = len(annotation_vocab)
image_feature_length = len(train_images[0])
# 2400 is the width of the question feature vectors that get concatenated with
# the image features (the encoded training questions print as [N, 2400] in a
# later cell).
input_vector_size = image_feature_length+2400
print(n_questions)
print(annotation_vocab_size)

43502
1000


In [16]:
class BoWClassifier(nn.Module):
    """Log-linear classifier: one affine layer followed by log-softmax.

    The output holds log-probabilities over the labels, which is the input
    format expected by ``nn.NLLLoss``.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the affine layer.

        Args:
            num_labels: Size of the output (number of classes).
            vocab_size: Size of each input vector.
        """
        super().__init__()

        # The only trainable parameters: weight (num_labels x vocab_size) and
        # bias (num_labels).  log-softmax itself has no parameters.
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Map a batch of input vectors to per-row log-probabilities.

        Args:
            bow_vec: Tensor with one input vector per row.

        Returns:
            Tensor of log-probabilities, normalised along dimension 1.
        """
        logits = self.linear(bow_vec)
        return F.log_softmax(logits, dim=1)

In [17]:
import torch
from torch.autograd import Variable
import sys
sys.path.append('/home/mau_engr/skip-thoughts.torch/pytorch')
from skipthoughts import UniSkip

# Directory passed to UniSkip as its first argument.
# NOTE(review): absolute, user-specific path (as is the sys.path entry above);
# the notebook will not run on another machine until both are adjusted.
dir_st = '/home/mau_engr/skip-thoughts.torch/theano/skip-thoughts'

def get_st_inputs(text, max_len=8):
    """Convert tokenised questions into fixed-length rows of word indices.

    Words are looked up in the global ``question_vocab``; words that are not
    in it are skipped.  Each row keeps at most ``max_len`` known words and is
    right-padded with 0 up to exactly ``max_len`` entries.

    Args:
        text: Iterable of ``(question, answer)`` pairs where ``question`` is a
            sequence of word tokens.  The answer is ignored.
        max_len: Row length.  Defaults to 8, the value that used to be
            hard-coded.

    Returns:
        tuple: ``(q_idxs, q_lens)``: the list of index rows and, for each
        row, the number of real (non-padding) entries in it.
    """
    q_idxs = []
    q_lens = []
    for question, _ in text:
        sent = []
        for word in question:
            if len(sent) == max_len:
                # Row is full; the remaining words cannot contribute.
                break
            if word in question_vocab:
                sent.append(question_vocab[word])
        length = len(sent)
        # NOTE(review): the pad value 0 is also the index vocabulary() assigns
        # to the first word it sees, so padding and that word share an id.
        # Confirm how the encoder expects vocabulary indices to be numbered.
        sent.extend([0] * (max_len - length))
        q_idxs.append(sent)
        # NOTE(review): a question with no known words yields length 0;
        # confirm the encoder accepts zero-length sequences.
        q_lens.append(length)
    return q_idxs, q_lens

# Build the sentence encoder over the training vocabulary and place it on the
# selected device.
uniskip = UniSkip(dir_st, list(question_vocab.keys())).to(device=device)

# The encoder output is only used as fixed input features, so the forward
# passes run under no_grad: no autograd graph is built and then thrown away.
# Index tensors are created on `device` rather than through
# torch.cuda.LongTensor, so the CPU fallback chosen earlier also works here.
qvalid_idxs, qvalid_lens = get_st_inputs(validation_text)
with torch.no_grad():
    qvalid_features = uniskip(
        torch.tensor(qvalid_idxs, dtype=torch.long, device=device),
        lengths=qvalid_lens,
    )

qtest_idxs, qtest_lens = get_st_inputs(test_text)
with torch.no_grad():
    qtest_features = uniskip(
        torch.tensor(qtest_idxs, dtype=torch.long, device=device),
        lengths=qtest_lens,
    )



  "num_layers={}".format(dropout, num_layers))


In [18]:
# Encode the training questions chunk by chunk and concatenate the results.
# The chunk boundaries are derived from len(train_text) instead of the
# hard-coded 43502, so the cell stays correct if the training set changes.
# (Chunking presumably keeps one forward pass within device memory; confirm.)
chunk_size = 10000
qtrain_chunks = []
with torch.no_grad():  # features are fixed inputs; no autograd graph needed
    for chunk_start in range(0, len(train_text), chunk_size):
        qtrain_idxs, qtrain_lens = get_st_inputs(train_text[chunk_start:chunk_start + chunk_size])
        qtrain_chunks.append(
            uniskip(
                # Created on `device` so the CPU fallback works as well.
                torch.tensor(qtrain_idxs, dtype=torch.long, device=device),
                lengths=qtrain_lens,
            )
        )

qtrain_features = torch.cat(qtrain_chunks)
print(qtrain_features.shape)

torch.Size([43502, 2400])


In [19]:
# Width of one model input: image features followed by the 2400-wide question
# features.  Same expression as in the dataset-preparation cell; repeated here.
input_vector_size = image_feature_length+2400

def featureize(text, features):
    """Replace each example's question with its precomputed feature entry.

    Args:
        text: Sequence of ``(question, answer)`` pairs.
        features: Indexable collection with one entry per example, in the same
            order as ``text``.

    Returns:
        list: ``(features[i], answer)`` for every position ``i`` of ``text``.

    Raises:
        IndexError: If ``features`` has fewer entries than ``text``.
    """
    return [(features[idx], answer) for idx, (_, answer) in enumerate(text)]
        
# Swap the token lists for the encoded question features, in place of the
# original *_text variables.
# NOTE: this cell is not idempotent. After it runs, *_text no longer holds
# tokens, so re-running it or any earlier cell that tokenises or encodes
# *_text needs a fresh run from the dataset-preparation cell.
train_text = featureize(train_text, qtrain_features)
validation_text = featureize(validation_text, qvalid_features)
test_text = featureize(test_text, qtest_features)

In [20]:
def make_target(label, annotation_vocab):
    """Return the class index of ``label`` as a one-element long tensor.

    Args:
        label: Answer to encode.
        annotation_vocab: Mapping from answer to class index.

    Returns:
        A ``LongTensor`` with a single element, placed on the global
        ``device``.

    Raises:
        KeyError: If ``label`` is not a key of ``annotation_vocab``.
    """
    label_index = annotation_vocab[label]
    target = torch.LongTensor([label_index])
    return target.to(device=device)

In [21]:
def train(model, num_epochs, batch_size):
    """Train ``model`` on the global training data with mini-batches.

    Reads the globals ``train_text``, ``train_images``, ``input_vector_size``,
    ``device``, ``annotation_vocab``, ``loss_function`` and ``optimizer``.

    Args:
        model: Module that maps a ``(rows, input_vector_size)`` tensor to
            per-row log-probabilities.
        num_epochs: Number of passes over the training data.
        batch_size: Maximum number of examples per optimisation step.

    Returns:
        The same ``model`` object, updated in place.
    """
    for _ in range(num_epochs):
        for start in range(0, len(train_text), batch_size):
            text_batch = train_text[start:start+batch_size]
            image_batch = train_images[start:start+batch_size]

            # Size the batch tensors by the examples actually present.  The
            # last batch of an epoch can be shorter than batch_size; sizing by
            # batch_size left all-zero rows labelled class 0 in it, and the
            # model was trained on those fake examples.
            n_rows = len(text_batch)
            in_mat = torch.zeros(n_rows, input_vector_size, device=device)
            out_vec = torch.zeros(n_rows, dtype=torch.long, device=device)

            # NOTE(review): shuffling happens inside each batch only, so batch
            # membership and batch order are the same in every epoch.
            for i, ((instance, label), image_vec) in enumerate(zip(*shuffle_data(text_batch, image_batch))):
                text_features = instance.to(device=device)
                visual_features = torch.from_numpy(image_vec).to(device=device)
                # One input row = question features followed by image features.
                in_mat[i] = torch.cat((text_features, visual_features))
                out_vec[i] = make_target(label, annotation_vocab)

            log_probs = model(in_mat)
            batch_loss = loss_function(log_probs, out_vec)

            # The loss is deliberately not summed as a tensor across batches:
            # that kept graph-attached tensors alive for a value nobody read.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

    return model

In [22]:
def calculate_accuracy(model, text, image_features):
    """Return the percentage of examples whose top prediction is correct.

    Reads the globals ``device`` and ``annotation_vocab_lookup``.

    Args:
        model: Module that maps a ``(1, n)`` input tensor to log-probabilities
            over the answer classes.
        text: Sequence of ``(question_features, answer)`` pairs.
        image_features: Sequence of NumPy arrays parallel to ``text``.

    Returns:
        float: Accuracy in percent, computed over ``len(text)`` examples;
        ``0.0`` when ``text`` is empty (this used to raise
        ``ZeroDivisionError``).
    """
    if len(text) == 0:
        return 0.0

    correct = 0
    with torch.no_grad():
        for (question, actual_answer), visual_features in zip(text, image_features):
            text_features = question.to(device=device)
            visual_features = torch.from_numpy(visual_features).to(device=device)
            # Same layout as in training: question features, then image features.
            infersent_vec = torch.cat((text_features, visual_features)).view(1, -1)
            log_probs = model(infersent_vec)
            _, index = torch.max(log_probs, 1)

            # .item() yields a plain Python int for the list lookup.
            predicted_answer = annotation_vocab_lookup[index.item()]
            if predicted_answer == actual_answer:
                correct += 1

    return (float(correct) / len(text)) * 100

In [26]:
# Hyper-parameter grid: every combination of epoch count, learning rate and
# batch size is trained from scratch and scored on the validation split.
epochs = [20, 30]
learning_rates = [1e-4, 1e-5]
batch_sizes = [32,48]

# Best validation accuracy so far, the (epochs, lr, batch size) that produced
# it, and the corresponding model.  All start at 0 as "nothing selected yet".
best, best_values, best_model = 0, 0, 0

with torch.cuda.device(0):
    for epoch in epochs:
        for lr in learning_rates:
            for bs in batch_sizes:
                print('LR= ', lr, ', BS= ', bs, ', epochs= ', epoch, sep='')

                # train() picks up loss_function and optimizer as globals, so
                # both are rebound for every fresh model.
                model = BoWClassifier(annotation_vocab_size, input_vector_size).to(device=device)
                loss_function = nn.NLLLoss()
                optimizer = optim.Adam(model.parameters(), lr=lr)
                train_model = train(model, epoch, bs)

                train_accuracy = calculate_accuracy(train_model, train_text, train_images)
                print(train_accuracy)
                validation_accuracy = calculate_accuracy(train_model, validation_text, validation_images)
                print(validation_accuracy)

                # Keep a configuration only when it strictly improves on the
                # best validation accuracy seen so far.
                if validation_accuracy <= best:
                    continue
                best = validation_accuracy
                best_values = (epoch,lr,bs)
                best_model = train_model

LR= 0.0001, BS= 32, epochs= 20
64.22923083996139
31.70324161746686
LR= 0.0001, BS= 48, epochs= 20
62.661486828191805
31.92603319594519
LR= 1e-05, BS= 32, epochs= 20
36.12707461725898
28.29453046674836
LR= 1e-05, BS= 48, epochs= 20
34.249000045974896
27.860086888715607
LR= 0.0001, BS= 32, epochs= 30
66.98312721254194
31.70324161746686
LR= 0.0001, BS= 48, epochs= 30
65.63836145464576
31.658683301771195
LR= 1e-05, BS= 32, epochs= 30
40.078617075077005
29.375069622368276
LR= 1e-05, BS= 48, epochs= 30


KeyboardInterrupt: 

In [27]:
# Score the model with the highest validation accuracy on the test split.
# best_model is still the integer 0 if no configuration beat the initial best.
print(calculate_accuracy(best_model, test_text, test_images))

33.592167454422686
