## Text Visual Question Answering using Pythia
[Pythia](https://github.com/facebookresearch/pythia) is a modular framework for vision and language multimodal research. Let's use it to have some fun solving [TextVQA](https://textvqa.org/). TextVQA requires models to read and reason about text in images to answer questions about them. Specifically, models need to incorporate a new modality of text present in the images and reason over it to answer TextVQA questions.

In [86]:
# first some imports
import yaml
import cv2
import torch
import requests
import numpy as np
import gc
import pprint
import pandas as pd
import torch.nn.functional as F
import os

import torchvision.models as models
import torchvision.transforms as transforms

from pythia.utils.configuration import ConfigNode
from pythia.tasks.processors import VocabProcessor, VQAAnswerProcessor
from pythia.models.pythia import Pythia
from pythia.common.registry import registry
from pythia.common.sample import Sample, SampleList

In [124]:
class TextVQA:
    """Demo wrapper around a pretrained Pythia model for TextVQA.

    Constructing an instance loads the LoRRA TextVQA YAML config, builds the
    text and answer processors, registers them in the Pythia ``registry`` and
    loads the pretrained Pythia checkpoint onto the GPU.
    """

    # Size handed to ``transforms.Resize`` in ``_build_resnet_model``.
    TARGET_IMAGE_SIZE = [448, 448]
    # Per-channel mean / std handed to ``transforms.Normalize``.
    CHANNEL_MEAN = [0.485, 0.456, 0.406]
    CHANNEL_STD = [0.229, 0.224, 0.225]

    def __init__(self):
        # Processors must be initialised first: they also set ``self.config``,
        # which ``_build_pythia_model`` reads.
        self._init_processors()
        self.pythia_model = self._build_pythia_model()
        #self.detection_model = self._build_detection_model()
        #self.resnet_model = self._build_resnet_model()

    def _init_processors(self):
        """
        Load the LoRRA TextVQA config and build the text/answer processors.

        Pythia uses processors to keep data processing pipelines as similar as
        possible for different datasets and allow code reusability.

        Side effects: stores ``self.config``, ``self.text_processor`` and
        ``self.answer_processor``; registers the config under ``"config"`` and
        the processors (plus the answer vocabulary size) under the
        ``"vqa2_*"`` registry keys; pretty-prints the top-level config keys.
        """
        # The config is a local, trusted file. If it could ever come from an
        # untrusted source, switch to ``yaml.safe_load``.
        with open("configs/vqa/textvqa/lorra.yml") as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        # NOTE(review): the config has an "includes" key. Presumably the
        # included files must be merged before ``processors.text_processor``
        # can be resolved below -- confirm. A shallow ``dict.update`` (as in
        # the disabled snippet) would overwrite nested sections wholesale.
        #update config with includes for model specific config
#         for inc in config.get("includes", []):
#             config.update(yaml.load(open("pythia/"+ inc), Loader=yaml.FullLoader))

        config = ConfigNode(config)

        registry.register("config", config)
        self.config = config
        pprint.pprint(config.keys())

        textvqa_config = config.task_attributes.vqa.dataset_attributes.textvqa

        # Point both processors at the locally downloaded vocabulary files.
        text_processor_config = textvqa_config.processors.text_processor
        text_processor_config.params.vocab.vocab_file = \
            "data/vocabs/vocabulary_100k.txt"
        answer_processor_config = textvqa_config.processors.answer_processor
        answer_processor_config.params.vocab_file = \
            "data/vocabs/answers_textvqa_8k.txt"

        self.text_processor = VocabProcessor(text_processor_config.params)
        self.answer_processor = VQAAnswerProcessor(answer_processor_config.params)

        registry.register("vqa2_text_processor", self.text_processor)
        registry.register("vqa2_answer_processor", self.answer_processor)
        registry.register("vqa2_num_final_outputs",
                          self.answer_processor.get_vocab_size())

    def _build_pythia_model(self):
        """Build the Pythia model and load the pretrained checkpoint.

        Returns:
            The ``Pythia`` model with weights loaded, moved to CUDA and put in
            eval mode.
        """
        state_dict = torch.load("data/models/pythia_train_val.pth")
        model_config = self.config.model_attributes.pythia
        model_config.model_data_dir = "./data"

        # Fixed: the original passed the undefined name ``mode_config``.
        model = Pythia(model_config)
        model.build()
        model.init_losses_and_metrics()

        # Checkpoint keys may carry a "module" prefix (handled by
        # ``_multi_gpu_state_to_single``); strip it when the model itself is
        # not wrapped. An empty checkpoint yields "" and skips the conversion.
        first_key = next(iter(state_dict), "")
        if first_key.startswith('module') and not hasattr(model, 'module'):
            state_dict = self._multi_gpu_state_to_single(state_dict)

        model.load_state_dict(state_dict)
        model.to("cuda")
        model.eval()
        return model

    def _build_resnet_model(self):
        """Build the ResNet-152 feature extractor and its input transforms.

        Sets ``self.data_transforms`` (resize, to-tensor, normalize) and
        ``self.resnet152_model`` (on CUDA, eval mode).

        Returns:
            The truncated ResNet-152 ``torch.nn.Sequential`` module, so callers
            that assign the result do not end up with ``None``.
        """
        self.data_transforms = transforms.Compose([
            transforms.Resize(self.TARGET_IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(self.CHANNEL_MEAN, self.CHANNEL_STD),
        ])
        resnet152 = models.resnet152(pretrained=True)
        resnet152.eval()
        # Fixed: ``[:2]`` kept only the first two children of the network.
        # Dropping the LAST two children (pooling + classifier head in
        # torchvision's ResNet) leaves the convolutional trunk, which is what a
        # feature extractor needs.
        modules = list(resnet152.children())[:-2]
        self.resnet152_model = torch.nn.Sequential(*modules)
        self.resnet152_model.to("cuda")
        return self.resnet152_model

    def _multi_gpu_state_to_single(self, state_dict):
        """Strip the leading ``"module."`` prefix from every key.

        Args:
            state_dict: mapping whose keys all start with ``"module."``.

        Returns:
            A new dict with the same values and the prefix removed from keys.

        Raises:
            TypeError: if any key does not start with ``"module."``.
        """
        prefix = 'module.'
        new_sd = {}
        for k, v in state_dict.items():
            if not k.startswith(prefix):
                raise TypeError("Not a multiple GPU state of dict")
            new_sd[k[len(prefix):]] = v
        return new_sd

    def predict(self, url, question):
        """Placeholder for inference; not implemented yet, returns ``None``."""
        return
#         with torch.no_grad():
#             detectron_features = 

            

In [125]:
# Instantiate the demo wrapper. The constructor runs _init_processors() and
# _build_pythia_model(), so the YAML config, vocabulary files and model
# checkpoint referenced there must already exist on disk.
text_vqa = TextVQA()

odict_keys(['includes', 'task_attributes', 'model_attributes', 'optimizer_attributes', 'training_parameters'])


AttributeError: text_processor