In [None]:
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
#git clone https://github.com/Cyanogenoid/pytorch-vqa.git

import os, sys
# Put the cloned pytorch-vqa checkout on the import path so that later cells
# can import modules that live inside it.
vqa_repo_dir = os.path.realpath('./pytorch-vqa')
sys.path.append(vqa_repo_dir)

In [None]:
# https://github.com/Cyanogenoid/pytorch-vqa/releases

#wget https://github.com/Cyanogenoid/pytorch-vqa/releases/download/v1.0/2017-08-04_00.55.19.pth  # 81Mb model

In [None]:
import model

# Checkpoint file from the pytorch-vqa releases page; it must sit next to this notebook.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location remaps the stored tensors onto `device`, which lets a checkpoint
# saved from GPU tensors load on a CPU-only machine as well.
log = torch.load('./2017-08-04_00.55.19.pth', map_location=device)

# One slot per question-vocabulary entry plus one extra index
# (presumably index 0, which unknown words are mapped to — confirm against model.Net).
tokens = len(log['vocab']['question']) + 1

# The model is wrapped in DataParallel before the weights are loaded —
# presumably because the checkpoint keys carry DataParallel's `module.` prefix.
net = torch.nn.DataParallel(model.Net(tokens))
net.load_state_dict(log['weights'])
net.to(device)

# This notebook only runs inference: switch off training-time behaviour
# (dropout, batch-norm statistic updates, if the model has any) so that
# repeated runs give the same answers.  As the last expression of the cell
# this still renders the module summary, like `net.to(device)` did.
net.eval()

In [None]:
# Show which top-level entries the loaded checkpoint dict holds
# ('vocab' and 'weights' are the ones this notebook reads).
log.keys()

### Now let's try and answer a question on a single image...

In [None]:
# Put a local pytorch-resnet checkout on the import path — presumably the
# source of the `resnet` module imported in the next cell.
resnet_repo_dir = os.path.realpath('./pytorch-resnet')
sys.path.append(resnet_repo_dir)

In [None]:
import resnet

import torchvision.transforms as transforms
from PIL import Image

def get_transform(target_size, central_fraction=1.0):
    """Build the image preprocessing pipeline applied before the CNN.

    Args:
        target_size: side length, in pixels, of the square centre crop that
            the pipeline produces.
        central_fraction: the image is first resized so that its shorter side
            is ``int(target_size / central_fraction)`` pixels; the centre crop
            of ``target_size`` then keeps roughly this fraction of that side.
            The default of 1.0 resizes straight to ``target_size``.

    Returns:
        A ``transforms.Compose`` that maps a PIL image to a normalised tensor.
    """
    return transforms.Compose([
        # `Resize` is the replacement for `Scale`, which was deprecated and
        # later removed from torchvision.
        transforms.Resize(int(target_size / central_fraction)),
        transforms.CenterCrop(target_size),
        transforms.ToTensor(),
        # Per-channel mean/std — presumably the ImageNet statistics the
        # pretrained network expects.
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

class ResNetLayer4(torch.nn.Module):
    """Pretrained ResNet-152 that returns the output of its ``layer4`` block.

    A forward hook on ``layer4`` stores that block's output in ``self.buffer``
    every time the network runs; ``forward`` returns the stored tensor instead
    of the network's final output.

    Args:
        image_size: the shorter image side is scaled relative to this size and
            a square centre crop of this size is taken (default 448, taken
            from visual_qa_analysis/config.py).
        central_fraction: fraction of the centre kept when scaling and centre
            cropping (default 0.875, same origin).

    With the defaults the original config notes 2048 feature maps of spatial
    size ``image_size // 32``.
    """

    def __init__(self, image_size=448, central_fraction=0.875):
        super(ResNetLayer4, self).__init__()
        self.model = resnet.resnet152(pretrained=True)
        self.transform = get_transform(image_size, central_fraction)

        # Filled in by the hook below on every forward pass.
        self.buffer = None

        def save_output(module, input, output):
            self.buffer = output
        self.model.layer4.register_forward_hook(save_output)

        # The network is only used as a fixed feature extractor.  Without
        # eval() the batch-norm layers would normalise with the statistics of
        # the single input image and update their running averages on every
        # call, so the same image would not give the same features twice.
        self.eval()

    def forward(self, x):
        """Run the whole network on batch ``x`` and return the captured layer4 output."""
        self.model(x)
        return self.buffer

    def image_to_features(self, img_file):
        """Load one image file and return its layer4 features as a batch of one.

        Args:
            img_file: path (or file object) accepted by ``PIL.Image.open``.

        Returns:
            The layer4 output for the image, computed on ``device`` without
            tracking gradients.
        """
        # Close the file as soon as the pixels have been converted.
        with Image.open(img_file) as img_raw:
            img = img_raw.convert('RGB')
        img_batch = self.transform(img).unsqueeze(0).to(device)
        # Pure inference: no autograd graph is needed, which saves memory.
        with torch.no_grad():
            return self.forward(img_batch)
    
# Build the feature extractor (the first run downloads a 241Mb model),
# then move it onto the selected device.
resnet_layer4 = ResNetLayer4()
resnet_layer4 = resnet_layer4.to(device)


In [None]:
# Extract layer4 features for the sample cat photo.
tabby_cat_jpg = './img/tabby-cat-colour-and-pattern-genetics-5516c44dbd383.jpg'
v = resnet_layer4.image_to_features(tabby_cat_jpg)

### Have a look at how the vocab is built

In [None]:
# Peek at the two vocabulary mappings stored in the checkpoint.
# A notebook cell only renders its last expression by default; the trailing
# comments record what each of the three lines evaluates to.
log['vocab'].keys()  # dict_keys(['question', 'answer'])
list(log['vocab']['question'].items())[:5]  # [('the', 1), ('is', 2), ('what', 3), ('are', 4), ('this', 5)]
list(log['vocab']['answer'].items())[:5]  # [('yes', 0), ('no', 1), ('2', 2), ('1', 3), ('white', 4)]

#### To answer a question

*  Convert the image to features 'v'
*  Convert the question to a torch vector of longs
*  Pass both into the VQA model
*  Interpret the softmax-y answer vectors

In [None]:
# Question-word -> integer index mapping taken from the checkpoint's vocabulary.
qtoken_to_index = log['vocab']['question']
    
def encode_question(self, question_str=None):
    """Turn a question into a fixed-length vector of word indices and a question length.

    This is a module-level function, so the natural call is
    ``encode_question(question)``.  The older two-argument form
    ``encode_question(anything, question)`` is still accepted; its first
    argument is ignored.

    Args:
        self: the question string when called with a single argument,
            otherwise an ignored placeholder.
        question_str: the question string when called with two arguments.

    Returns:
        ``(vec, length)``: ``vec`` is a LongTensor of 30 entries holding the
        vocabulary index of each word, zero-padded on the right, and ``length``
        is the number of words encoded (at most 30).  Words missing from
        ``qtoken_to_index`` are encoded as 0.

    Raises:
        TypeError: if no question string was supplied.

    NOTE(review): words are split on whitespace only, so punctuation such as a
    trailing '?' stays attached to its word and will most likely fall back to
    index 0 — strip it first if the vocabulary was built without punctuation.
    """
    if question_str is None:
        # Single-argument call: the question arrived in the first slot.
        question_str = self
    if not isinstance(question_str, str):
        raise TypeError('encode_question expects the question as a string')

    max_question_length = 30 # say...
    # Lower-case, split on any whitespace, and drop words that do not fit.
    tokens = question_str.lower().split()[:max_question_length]

    vec = torch.zeros(max_question_length, dtype=torch.long)
    for i, token in enumerate(tokens):
        vec[i] = qtoken_to_index.get(token, 0)
    # Length in words, not in characters of the raw string.
    return vec, len(tokens)

In [None]:
# Invert the answer vocabulary (word -> index) into a list that can be indexed
# by the model's output position; slots no word claims keep 'UNDEF'.
answer_to_index = log['vocab']['answer']
answer_words = ['UNDEF'] * len(answer_to_index)
for word, position in answer_to_index.items():
    answer_words[position] = word
answer_words[:10]  # ['yes', 'no', '2', '1', 'white', '3', 'red', 'blue', '4', 'green']

In [None]:
# Important thing to know: is 'tabby' one of the words in the answer vocabulary?
# (Presumably the model can only reply with words from this list — confirm.)
'tabby' in answer_words

In [None]:
# Compute the layer4 features of the sample cat photo for the question-answering step.
tabby_cat_jpg = './img/tabby-cat-colour-and-pattern-genetics-5516c44dbd383.jpg'
v = resnet_layer4.image_to_features(tabby_cat_jpg)

In [None]:
## Load vocabulary - no need
#with open(config.vocabulary_path, 'r') as fd:
#  vocab_json = json.load(fd)
#reverse_vocab_question = dict([(v, k) for k, v in vocab_json['question'].items()])
#reverse_vocab_answer   = dict([(v, k) for k, v in vocab_json['answer'].items()])

In [None]:
# From visual_qa_analysis/notebooks/inference.ipynb, ported to the current
# PyTorch API: `Variable(..., volatile=True)` became `torch.no_grad()`, and the
# `async=True` argument (a syntax error on Python 3.7+) became `non_blocking=True`.
#
# NOTE(review): this cell still relies on two names that no cell above defines:
#   `tq`    - an iterable yielding (v, q, a, idx, q_len) batches
#   `utils` - a module exposing `batch_accuracy(predicted, true)`
# Define / import them before running it.

# dim=1 spells out the dimension the implicit form picked for 2-D input;
# `out` is assumed to be (batch, answers) — consistent with the dim=1
# reductions below.
log_softmax = torch.nn.LogSoftmax(dim=1).to(device)
num_batches = 0

# Per-batch results collected over the whole loop.
answ, accs, idxs = [], [], []

# NOTE: the loop variable `v` overwrites the image features `v` computed in an
# earlier cell; recompute them afterwards if they are still needed.
with torch.no_grad():  # inference only, no autograd bookkeeping
    for v, q, a, idx, q_len in tq:
        v = v.to(device, non_blocking=True)
        q = q.to(device, non_blocking=True)
        a = a.to(device, non_blocking=True)
        q_len = q_len.to(device, non_blocking=True)

        out = net(v, q, q_len)
        nll = -log_softmax(out)
        loss = (nll * a / 10).sum(dim=1).mean()
        acc = utils.batch_accuracy(out, a).cpu()

        # Index of the highest-scoring answer for every item in the batch.
        answer = out.cpu().max(dim=1)[1]
        answ.append(answer.view(-1))
        accs.append(acc.view(-1))
        idxs.append(idx.view(-1).clone())
        # Running mean accuracy over everything seen so far.
        print(torch.cat(accs, dim=0).float().mean().item())