In [None]:
import os, sys

from matplotlib import pyplot as plt

In [None]:
if not os.path.isfile('./pytorch-vqa/README.md'):
    !git clone https://github.com/Cyanogenoid/pytorch-vqa.git
sys.path.append(os.path.realpath('./pytorch-vqa'))

In [None]:
# Release page that hosts the checkpoint downloaded below:
# https://github.com/Cyanogenoid/pytorch-vqa/releases

# Download the checkpoint only when it is not already present in the working directory.
if not os.path.isfile('./2017-08-04_00.55.19.pth'):   # 81Mb model
    !wget https://github.com/Cyanogenoid/pytorch-vqa/releases/download/v1.0/2017-08-04_00.55.19.pth

In [None]:
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
import model # from pytorch-vqa

# Load the released checkpoint.
# `map_location=device` remaps any CUDA tensors stored in the file onto the device
# selected earlier, so the checkpoint can also be opened on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
# (Location used previously, kept for reference: 'logs/2017-08-04_00:55:19.pth')
saved_state = torch.load('./2017-08-04_00.55.19.pth', map_location=device)

# Size argument handed to model.Net: one more than the number of question words
# (presumably so that index 0 stays free for padding / unknown tokens -- confirm in model.py).
tokens = len(saved_state['vocab']['question']) + 1

saved_state.keys()  # See what's in the saved state

In [None]:
# Load the predefined model
# The weights are loaded through a DataParallel wrapper, exactly as before.
vqa_net = torch.nn.DataParallel(model.Net(tokens))
vqa_net.load_state_dict(saved_state['weights'])
vqa_net.to(device)
# This notebook only runs inference: switch to evaluation mode so that layers such as
# dropout behave deterministically. eval() returns the module, so the cell still
# echoes the network structure.
vqa_net.eval()

### Now let's try and answer a question on a single image...

In [None]:
if not os.path.isfile('./pytorch-resnet/README.md'):
    !git clone https://github.com/Cyanogenoid/pytorch-resnet.git
sys.path.append(os.path.realpath('./pytorch-resnet'))

In [None]:
import resnet  # from pytorch-resnet

import torchvision.transforms as transforms
from PIL import Image

def get_transform(target_size, central_fraction=1.0):
    """Build the image preprocessing pipeline applied before the ResNet.

    Args:
        target_size: side length, in pixels, of the square centre crop.
        central_fraction: fraction of the resized image kept by the crop. The image
            is first resized to ``int(target_size / central_fraction)`` and then
            centre-cropped to ``target_size``; 1.0 means resize and crop to the same size.

    Returns:
        A ``transforms.Compose`` that maps a PIL image to a normalised tensor.
    """
    return transforms.Compose([
        # `Resize` replaces `Scale`, which is deprecated and no longer available in
        # current torchvision releases. Given a single int it resizes the shorter edge.
        transforms.Resize(int(target_size / central_fraction)),
        transforms.CenterCrop(target_size),
        transforms.ToTensor(),
        # Channel-wise mean / std (the values commonly quoted for ImageNet).
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

class ResNetLayer4(torch.nn.Module):
    """ResNet-152 wrapper that returns the output of its ``layer4`` block.

    A forward hook on ``layer4`` stores that block's output in ``self.buffer`` each
    time the network runs; ``forward`` returns the stored tensor instead of the
    network's final output.
    """

    def __init__(self):
        super(ResNetLayer4, self).__init__()
        self.model = resnet.resnet152(pretrained=True)

        # from  visual_qa_analysis/config.py
        image_size = 448  # scale shorter end of image to this size and centre crop
        central_fraction = 0.875 # only take this much of the centre when scaling and centre cropping
        # (config.py also lists output_features = 2048 feature maps and
        #  output_size = image_size // 32; neither is needed here.)

        self.transform = get_transform(image_size, central_fraction)

        def save_output(module, input, output):
            # Forward hook: remember the most recent layer4 output.
            self.buffer = output
        self.model.layer4.register_forward_hook(save_output)

        # The wrapper is only used as a fixed feature extractor: evaluation mode stops
        # batch-norm layers from normalising with the statistics of the single-image
        # batch that image_to_features() feeds in.
        self.eval()

    def forward(self, x):
        """Run the full network on batch ``x`` and return the captured layer4 output."""
        self.model(x)
        return self.buffer

    def image_to_features(self, img_file):
        """Load one image file and return its layer4 features as a batch of one.

        Args:
            img_file: path (or file object) accepted by ``PIL.Image.open``.

        Returns:
            The tensor captured from ``layer4`` for the preprocessed image, computed
            without gradient tracking.
        """
        img = Image.open(img_file).convert('RGB')
        img_batch = self.transform(img).unsqueeze(0).to(device)
        # No gradients are needed for feature extraction; skipping them avoids keeping
        # the whole network's activation graph alive behind the returned tensor.
        with torch.no_grad():
            return self.forward(img_batch)

resnet_layer4 = ResNetLayer4().to(device)  # Downloads 241Mb model when first run

In [None]:
# Sample images : 
image_urls, image_path, image_files = [
    'https://www.pets4homes.co.uk/images/articles/2709/large/tabby-cat-colour-and-pattern-genetics-5516c44dbd383.jpg',
    'https://imgc.allpostersimages.com/img/print/posters/cat-black-jumping-off-wall_a-G-12469828-14258383.jpg',
    'https://i.ytimg.com/vi/AIwlyly7Eso/hqdefault.jpg',
    'https://upload.wikimedia.org/wikipedia/commons/9/9b/Black_pussy_-_panoramio.jpg',
    'https://www.thehappycatsite.com/wp-content/uploads/2017/06/siamese5.jpg',
], './img/', []
os.makedirs('./img', exist_ok=True)
for url in image_urls:
    image_file=os.path.join(image_path, os.path.basename(url))
    image_files.append(image_file)
    if not os.path.isfile(image_file):
        !wget {url} --directory-prefix ./img/
image_files

In [None]:
# Smoke test: extract features for the first sample image and inspect the tensor's shape.
v = resnet_layer4.image_to_features(image_files[0])
v.shape

### Have a look at how the vocab is built

In [None]:
# The checkpoint carries the vocabularies the model was trained with.
vocab = saved_state['vocab']
# Only the last expression of a cell is echoed, so the three inspections are grouped
# into one tuple; otherwise the first two would be evaluated and silently discarded.
(
    vocab.keys(),                          # dict_keys(['question', 'answer'])
    list(vocab['question'].items())[:5],   # [('the', 1), ('is', 2), ('what', 3), ('are', 4), ('this', 5)]
    list(vocab['answer'].items())[:5],     # [('yes', 0), ('no', 1), ('2', 2), ('1', 3), ('white', 4)]
)

#### To answer a question

*  Convert the image to features 'v'
*  Convert the question to a torch vector of longs
*  Pass both into the VQA model
*  Interpret the softmax-y answer vectors

In [None]:
# Lookup table from question word to integer index, taken from the checkpoint's vocab.
qtoken_to_index = vocab['question']
    
def encode_question(question_str, max_question_length=30):
    """Turn a question into a vector of indices and a question length.

    The question is lower-cased and split on single spaces, so punctuation must be
    written as a separate, space-delimited token to be looked up on its own.

    Args:
        question_str: the question text.
        max_question_length: size of the returned index vector. Questions with more
            tokens are truncated to this length (previously they raised IndexError).

    Returns:
        A pair ``(vec, length)``, both moved to ``device``:
        ``vec`` is a LongTensor of shape ``(max_question_length,)`` holding the token
        indices, zero-padded on the right (tokens missing from the vocabulary also map
        to 0); ``length`` is a 0-d tensor with the number of tokens actually encoded.
    """
    tokens = question_str.lower().split(' ')[:max_question_length]
    vec = torch.zeros(max_question_length).long()
    indices = [qtoken_to_index.get(token, 0) for token in tokens]
    vec[:len(indices)] = torch.tensor(indices, dtype=torch.long)
    return vec.to(device), torch.tensor(len(tokens)).to(device)

In [None]:
# Invert the answer vocabulary (word -> index) into a list addressed by index, so a
# predicted class index can be turned back into its word. Slots no word maps to keep 'UNDEF'.
answer_vocab = vocab['answer']
answer_words = ['UNDEF' for _word in answer_vocab]
for w, idx in answer_vocab.items():
    answer_words[idx] = w
answer_words[:10]  # ['yes', 'no', '2', '1', 'white', '3', 'red', 'blue', '4', 'green']

In [None]:
# Important things to know...
# Which spellings / words the model can actually read (questions) or produce (answers).
(
    'colour' in qtoken_to_index,
    'color' in qtoken_to_index,
    'tabby' in answer_words,
)

In [None]:
# Pick one of the downloaded sample images and display it.
image_idx = 4
image_filename = image_files[image_idx]

img = Image.open(image_filename)
img = img.convert('RGB')
plt.imshow(img)

In [None]:
# Extract the ResNet layer4 features for the selected image; the question cells below reuse them.
v0 = resnet_layer4.image_to_features(image_filename)

In [None]:
# Encode one question as (index vector, length). The '?' is written as its own
# space-delimited token because encode_question splits on single spaces.
q, q_len = encode_question("is there a cat in the picture ?")

In [None]:
# Run the VQA model on one image / question pair. unsqueeze(0) adds the leading batch
# dimension to the question tensors; no_grad() skips autograd bookkeeping, which pure
# inference does not need.
with torch.no_grad():
    ans = vqa_net(v0, q.unsqueeze(0), q_len.unsqueeze(0))

In [None]:
# Take the highest-scoring answer class and map it back to its word.
# argmax(...).item() yields a plain Python int for the single item in the batch, and
# avoids assigning to `_`, which would disable IPython's "last result" variable.
answer_idx = ans.argmax(dim=1).item()
answer_words[ answer_idx ]

### It appears that something works...

In [None]:
def vqa(img, question_arr):
    """Answer several questions about one image.

    Args:
        img: image features as returned by ``resnet_layer4.image_to_features`` (despite
            the name, this is the feature tensor, not a PIL image). Earlier this
            argument was ignored and the global ``v0`` was used instead.
        question_arr: iterable of question strings.

    Returns:
        A list with the highest-scoring answer word for each question, in order.
    """
    res = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for question_str in question_arr:
            q, q_len = encode_question(question_str)
            # unsqueeze(0) adds the batch dimension expected by the network.
            ans = vqa_net(img, q.unsqueeze(0), q_len.unsqueeze(0))
            answer_idx = ans.argmax(dim=1).item()
            res.append(answer_words[answer_idx])
    return res

In [None]:
# Ask several questions about the same image features.
vqa(v0, [
    "is there a cat in the picture ?", 
    "what color are the cat's paws ?",
    "what color are the cat's eyes ?",
])

In [None]:
# Try a terse, keyword-only question to see how the model reacts.
vqa(v0, ['color cat'])

In [None]:
# From visual_qa_analysis/notebooks/inference.ipynb
#
# Reference evaluation loop, ported to current PyTorch syntax:
#   * `.cuda(async=True)` (a SyntaxError since Python 3.7) -> `.to(device, non_blocking=True)`
#   * `Variable(..., volatile=True)` -> `torch.no_grad()`
#   * `nn.LogSoftmax()` -> `torch.nn.LogSoftmax(dim=1)` (explicit class dimension)
#
# NOTE: this cell is not runnable in this notebook as it stands. `tq` (the batch
# iterator), `net`, `utils`, `np` and the `answ` / `accs` / `idxs` lists are not defined
# anywhere above -- presumably they come from the source notebook; define them first.

log_softmax = torch.nn.LogSoftmax(dim=1).to(device)
num_batches = 0
with torch.no_grad():
    for v, q, a, idx, q_len in tq:
        v = v.to(device, non_blocking=True)
        q = q.to(device, non_blocking=True)
        a = a.to(device, non_blocking=True)
        q_len = q_len.to(device, non_blocking=True)

        out = net(v, q, q_len)
        nll = -log_softmax(out)
        # Negative log-likelihood weighted by `a / 10`, summed per example, averaged over the batch.
        loss = (nll * a / 10).sum(dim=1).mean()
        acc = utils.batch_accuracy(out, a).cpu()

        # Highest-scoring answer index for every item in the batch.
        _, answer = out.cpu().max(dim=1)
        answ.append(answer.view(-1))
        accs.append(acc.view(-1))
        idxs.append(idx.view(-1).clone())
        # Running mean accuracy over all batches seen so far.
        print(np.mean(list(torch.cat(accs, dim=0))))