In [None]:
# Install dependencies
!pip install ninja yacs cython matplotlib demjson
!pip install git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI

In [None]:
# Sanity check: list the current directory (the next cell expects a
# `vqa-maskrcnn-benchmark` folder here).
!ls

In [None]:
# Change the kernel's working directory into the vqa-maskrcnn-benchmark
# checkout so that its setup.py can be run from there.
%cd vqa-maskrcnn-benchmark

In [None]:
# List the contents of vqa-maskrcnn-benchmark (the next cell runs its setup.py).
!ls

In [None]:
# Build vqa-maskrcnn-benchmark, then install it in development mode.
# NOTE(review): `sudo python` may resolve to a different interpreter than the
# one backing this kernel - confirm the package ends up importable here.
!sudo python setup.py build
!sudo python setup.py develop

In [1]:
import sys

# Make the Pythia checkout and its bundled vqa-maskrcnn-benchmark importable.
sys.path.extend([
    '/home/rmandli_g_clemson_edu/pythia',
    '/home/rmandli_g_clemson_edu/pythia/vqa-maskrcnn-benchmark',
])

In [None]:
# Install pandas (imported as `pd` in the imports cell); -y skips the prompt.
# NOTE(review): `sudo conda` may act on a different environment than the one
# backing this kernel - confirm.
!sudo conda install pandas -y

In [3]:
import yaml
import cv2
import torch
import requests
import numpy as np
import gc
import torch.nn.functional as F
import pandas as pd

import torchvision.models as models
import torchvision.transforms as transforms


from PIL import Image
from IPython.display import display, HTML, clear_output
from ipywidgets import widgets, Layout
from io import BytesIO

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.layers import nms
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.utils.model_serialization import load_state_dict

from pythia.utils.configuration import ConfigNode
from pythia.tasks.processors import VocabProcessor, VQAAnswerProcessor
from pythia.tasks.processors import FastTextProcessor, SoftCopyAnswerProcessor, SimpleWordProcessor
from pythia.models import LoRRA
from pythia.common.registry import registry
from pythia.common.sample import Sample, SampleList
from pythia.tasks.vqa.vqa2 import VQA2Dataset
from pythia.tasks.concat_dataset import PythiaConcatDataset
from pprint import pprint

In [2]:
# Work from the Pythia checkout root: the relative `configs/...` and
# `data/...` paths used in the rest of the notebook are resolved from here.
%cd /home/rmandli_g_clemson_edu/pythia

# Confirm the LoRRA config that TextVQADemo opens is present.
!ls configs/vqa/textvqa/lorra.yml

/home/rmandli_g_clemson_edu/pythia
configs/vqa/textvqa/lorra.yml


In [20]:
# Root of the local Pythia checkout; the config locations are derived from it.
pythia_path = '/home/rmandli_g_clemson_edu/pythia'
lorra_model_config_path = pythia_path + '/configs/vqa/textvqa/lorra.yml'
detectron_model_config_path = pythia_path + '/configs/detectron_model/detectron_model.yaml'

In [14]:
# Download the 100k-word vocabulary file (-O sets the output path).
# NOTE(review): this saves to data/vocabulary_100k.txt, while
# TextVQADemo._init_processors reads data/vocabs/vocabulary_100k.txt -
# confirm which location is intended.
!wget -O data/vocabulary_100k.txt https://dl.fbaipublicfiles.com/pythia/data/vocabulary_100k.txt

--2019-06-02 20:04:15--  https://dl.fbaipublicfiles.com/pythia/data/vocabulary_100k.txt
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:6a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 626738 (612K) [text/plain]
Saving to: ‘data/vocabulary_100k.txt’


2019-06-02 20:04:16 (17.4 MB/s) - ‘data/vocabulary_100k.txt’ saved [626738/626738]



In [19]:
# Download the detection-model config to the path that
# TextVQADemo._build_detection_model merges into `cfg`.
!wget -O configs/detectron_model/detectron_model.yaml https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.yaml

--2019-06-02 20:11:08--  https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.yaml
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:16a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 918 [text/plain]
Saving to: ‘configs/detectron_model/detectron_model.yaml’


2019-06-02 20:11:08 (18.1 MB/s) - ‘configs/detectron_model/detectron_model.yaml’ saved [918/918]



In [21]:
# Download the detection-model checkpoint (about 652M per the recorded output)
# to the path that TextVQADemo._build_detection_model loads.
!wget -O data/detectron/model/detectron_model.pth  https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.pth 

--2019-06-02 20:14:05--  https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.pth
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:16a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 684079216 (652M) [application/octet-stream]
Saving to: ‘data/detectron/model/detectron_model.pth’


2019-06-02 20:14:14 (83.4 MB/s) - ‘data/detectron/model/detectron_model.pth’ saved [684079216/684079216]



In [18]:
class TextVQADemo:
    """Answer a free-form question about an image with a pretrained LoRRA model.

    Construction builds the Pythia processors and then loads three networks,
    moving each to "cuda": the LoRRA model, the detection model used for
    region features, and a truncated ResNet-152 used for grid features.
    ``predict`` ties them together for one image/question pair.

    Every file path used here is relative to the current working directory
    (the notebook changes into the Pythia checkout before instantiating).
    """

    # Size the image is resized to before the ResNet-152 pass.
    TARGET_IMAGE_SIZE = [448, 448]
    # Per-channel mean / std handed to transforms.Normalize for the ResNet input.
    CHANNEL_MEAN = [0.485, 0.456, 0.406]
    CHANNEL_STD = [0.229, 0.224, 0.225]

    # Per-channel mean subtracted from the channel-reversed detector input.
    DETECTRON_PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
    # Target length of the shorter image side / cap on the longer side (pixels).
    DETECTRON_MIN_SIZE = 800
    DETECTRON_MAX_SIZE = 1333
    # Number of region features kept per image and reported as max_features.
    MAX_DETECTRON_FEATURES = 100
    # Seconds to wait for the image server before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Build the processors, then load all three models."""
        self._init_processors()
        self.lorra_model = self._build_lorra_model()
        self.detection_model = self._build_detection_model()
        self.resnet_model = self._build_resnet_model()

    def _init_processors(self):
        """Load the LoRRA config and build the processors used at inference.

        Pythia uses processors to keep data processing pipelines as similar
        as possible across datasets and to allow code reuse. Each processor,
        the vocabulary sizes and the final config are also published in the
        global ``registry`` under ``textvqa_*`` / ``config`` keys.
        """
        with open("configs/vqa/textvqa/lorra.yml") as f:
            config = yaml.load(f, Loader=yaml.FullLoader)

        # Merge every file listed under "includes" (model-specific config)
        # into the top-level mapping; each handle is closed right after use.
        for inc in config.get("includes", []):
            with open("pythia/" + inc) as inc_file:
                config.update(yaml.load(inc_file, Loader=yaml.FullLoader))

        config = ConfigNode(config)
        config.datasets = 'textvqa'
        config.training_parameters.evalai_inference = True
        textvqa_config = config.task_attributes.vqa.dataset_attributes.textvqa

        answer_processor_config = textvqa_config.processors.answer_processor
        answer_processor_config.params.vocab_file = \
            "data/vocabs/answers_textvqa_more_than_1.txt"
        self.answer_processor = SoftCopyAnswerProcessor(answer_processor_config.params)
        print("self.answer_processor.get_vocab_size()", self.answer_processor.get_vocab_size())

        registry.register("textvqa_num_final_outputs",
                          self.answer_processor.get_vocab_size())
        registry.register("textvqa_answer_processor", self.answer_processor)

        text_processor_config = textvqa_config.processors.text_processor
        text_processor_config.params.vocab.vocab_file = "data/vocabs/vocabulary_100k.txt"
        self.text_processor = VocabProcessor(text_processor_config.params)
        registry.register("textvqa_text_processor", self.text_processor)
        registry.register("textvqa_text_vocab_size",
                          self.text_processor.get_vocab_size())

        ocr_token_processor_config = textvqa_config.processors.ocr_token_processor
        self.ocr_token_processor = SimpleWordProcessor(ocr_token_processor_config)
        registry.register("textvqa_ocr_token_processor", self.ocr_token_processor)

        context_processor_config = textvqa_config.processors.context_processor
        self.context_processor = FastTextProcessor(context_processor_config.params)
        registry.register("textvqa_context_processor", self.context_processor)

        self.config = config

        registry.register("config", config)

    def _prepare_data_set(self, dataset_type, config):
        """Build one ``VQA2Dataset`` per imdb file and concatenate them.

        Args:
            dataset_type: key to look up in ``config.imdb_files``.
            config: dataset config exposing an ``imdb_files`` mapping.

        Returns:
            A ``PythiaConcatDataset`` wrapping the per-file datasets.

        Raises:
            ValueError: if ``dataset_type`` is not a key of ``config.imdb_files``.
        """
        if dataset_type not in config.imdb_files:
            raise ValueError(
                "Dataset type {} is not present in "
                "imdb_files of dataset config".format(dataset_type)
            )

        # Read the file list from the same config that was just validated.
        imdb_files = config.imdb_files[dataset_type]

        datasets = [
            VQA2Dataset(dataset_type, imdb_idx, config)
            for imdb_idx in range(len(imdb_files))
        ]

        return PythiaConcatDataset(datasets)

    def _add_imdb_details(self):
        """Resolve and open the imdb file for the 'textvqa' dataset type.

        NOTE(review): ``imdb_file_index``, ``self._get_absolute_path`` and
        ``ImageDatabase`` are not defined anywhere in the visible notebook, so
        this method looks unfinished and is expected to fail if called.
        Nothing in this class calls it.
        """
        imdb_files = self.config.imdb_files
        dataset_type = 'textvqa'
        if dataset_type not in imdb_files:
            raise ValueError(
                "Dataset type {} is not present in "
                "imdb_files of dataset config".format(dataset_type)
            )

        self.imdb_file = imdb_files[dataset_type][imdb_file_index]
        self.imdb_file = self._get_absolute_path(self.imdb_file)
        self.imdb = ImageDatabase(self.imdb_file)

    def _multi_gpu_state_to_single(self, state_dict):
        """Strip the leading 'module.' from every key of ``state_dict``.

        Returns:
            A new dict with the same values under the shortened keys.

        Raises:
            TypeError: if any key does not start with 'module.'.
        """
        prefix = 'module.'
        new_sd = {}
        for k, v in state_dict.items():
            if not k.startswith(prefix):
                raise TypeError("Not a multiple GPU state of dict")
            new_sd[k[len(prefix):]] = v
        return new_sd

    def _build_lorra_model(self):
        """Instantiate LoRRA, load its checkpoint and return it in eval mode on "cuda".

        The model is also stored on ``self.model``.
        """
        # Deserialize onto the CPU first (as the detection checkpoint is);
        # the whole model is moved to the GPU once the weights are in place.
        state_dict = torch.load('data/models/lorra_best.pth',
                                map_location=torch.device("cpu"))
        model_config = self.config.model_attributes.lorra
        pprint(self.config.datasets)
        model = LoRRA(model_config)
        model.build()
        model.init_losses_and_metrics()
        self.model = model
        # Keys prefixed with 'module' are renamed so they match a model that
        # has no ``module`` attribute of its own.
        if list(state_dict.keys())[0].startswith('module') and \
           not hasattr(model, 'module'):
            state_dict = self._multi_gpu_state_to_single(state_dict)
        model.load_state_dict(state_dict)
        model.to("cuda")
        model.eval()

        return model

    def _build_resnet_model(self):
        """Create the image transform and the truncated ResNet-152 extractor.

        Sets ``self.data_transforms`` and ``self.resnet152_model`` and returns
        the latter, so the attribute assigned in ``__init__`` holds the model.
        """
        self.data_transforms = transforms.Compose([
            transforms.Resize(self.TARGET_IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(self.CHANNEL_MEAN, self.CHANNEL_STD),
        ])
        resnet152 = models.resnet152(pretrained=True)
        resnet152.eval()
        # Drop the last two child modules so the output stays a spatial map.
        modules = list(resnet152.children())[:-2]
        self.resnet152_model = torch.nn.Sequential(*modules)
        self.resnet152_model.to("cuda")
        return self.resnet152_model

    def _build_detection_model(self):
        """Build the detection model from its yaml config and load its weights.

        Returns:
            The detection model in eval mode on "cuda".
        """
        cfg.merge_from_file('configs/detectron_model/detectron_model.yaml')
        cfg.freeze()
        model = build_detection_model(cfg)
        checkpoint = torch.load('data/detectron/model/detectron_model.pth',
                                map_location=torch.device("cpu"))
        load_state_dict(model, checkpoint.pop("model"))

        model.to("cuda")
        model.eval()
        return model

    def add_ocr_details(self, sample_info, sample):
        """Attach OCR token embeddings (and OCR boxes, if present) to ``sample``.

        NOTE(review): ``self.use_ocr``, ``self.use_ocr_info`` and
        ``self.bbox_processor`` are never assigned in this class, so this
        method cannot run on an instance as constructed here. Nothing in this
        class calls it.
        """
        if self.use_ocr:
            # Preprocess OCR tokens
            ocr_tokens = [
                self.ocr_token_processor({"text": token})["text"]
                for token in sample_info["ocr_tokens"]
            ]
            # Get embeddings for tokens
            context = self.context_processor({"tokens": ocr_tokens})
            sample.context = context["text"]
            sample.context_tokens = context["tokens"]
            sample.context_feature_0 = context["text"]
            sample.context_info_0 = Sample()
            sample.context_info_0.max_features = context["length"]

            # Identity rows for the real tokens, zero rows for the padding.
            order_vectors = torch.eye(len(sample.context_tokens))
            order_vectors[context["length"]:] = 0
            sample.order_vectors = order_vectors

        if self.use_ocr_info and "ocr_info" in sample_info:
            sample.ocr_bbox = self.bbox_processor({"info": sample_info["ocr_info"]})[
                "bbox"
            ]

        return sample

    def predict(self, url, question):
        """Run LoRRA on one image/question pair.

        Only the processors and the three models are used; no dataset object
        is constructed for inference.

        Args:
            url: image location; http(s) URLs are downloaded, anything else is
                treated as a local path (see ``get_actual_image``).
            question: the question text.

        Returns:
            (probs, answers): two parallel lists with the softmax probability
            and the decoded answer for each of the 5 highest scores.
        """
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)
            resnet_features = self.get_resnet_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": question})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample({
                "max_features": torch.tensor(
                    self.MAX_DETECTRON_FEATURES, dtype=torch.long
                )
            })

            # NOTE(review): the context is built from the question tokens, not
            # from OCR tokens read off the image - confirm this is intended.
            context = self.context_processor({"tokens": processed_text["tokens"]})
            sample.context = context["text"]
            sample.context_tokens = context["tokens"]
            sample.context_feature_0 = context["text"]
            sample.context_info_0 = Sample()
            sample.context_info_0.max_features = context["length"]

            # Identity rows for the real tokens, zero rows for the padding.
            order_vectors = torch.eye(len(sample.context_tokens))
            order_vectors[context["length"]:] = 0
            sample.order_vectors = order_vectors

            sample.image_feature_1 = resnet_features

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.lorra_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            # Single-sample batch: row 0 holds the results.
            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for score, index in zip(top_scores, top_indices):
                probs.append(score.item())
                answers.append(self.answer_processor.idx2word(index.item()))

        gc.collect()
        torch.cuda.empty_cache()
        return probs, answers

    def get_actual_image(self, image_path):
        """Return something ``Image.open`` accepts for ``image_path``.

        Paths starting with 'http' are downloaded in full and returned as an
        in-memory, seekable ``BytesIO``; anything else is returned unchanged
        for the caller to open as a local file.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        if image_path.startswith('http'):
            response = requests.get(image_path, timeout=self.REQUEST_TIMEOUT)
            # Fail loudly on 4xx/5xx instead of handing an error page to PIL.
            response.raise_for_status()
            return BytesIO(response.content)
        return image_path

    def _image_transform(self, image_path):
        """Load an image and prepare it as input for the detection model.

        Returns:
            (img, im_scale): a float32 tensor with the channel axis first, and
            the resize factor that was applied to the image.
        """
        path = self.get_actual_image(image_path)

        # Force 3 channels so grayscale / RGBA inputs survive the channel flip
        # and the 3-element mean subtraction below.
        img = Image.open(path).convert("RGB")
        im = np.array(img).astype(np.float32)
        # Reverse the channel order (RGB -> BGR).
        im = im[:, :, ::-1]
        im -= np.array(self.DETECTRON_PIXEL_MEAN)
        im_shape = im.shape
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])
        im_scale = float(self.DETECTRON_MIN_SIZE) / float(im_size_min)
        # Prevent the biggest axis from being more than max_size
        if np.round(im_scale * im_size_max) > self.DETECTRON_MAX_SIZE:
            im_scale = float(self.DETECTRON_MAX_SIZE) / float(im_size_max)
        im = cv2.resize(
            im,
            None,
            None,
            fx=im_scale,
            fy=im_scale,
            interpolation=cv2.INTER_LINEAR
        )
        img = torch.from_numpy(im).permute(2, 0, 1)
        return img, im_scale

    def _process_feature_extraction(self, output,
                                    im_scales,
                                    feat_name='fc6',
                                    conf_thresh=0.2):
        """Select the highest-scoring region features for every image.

        For each image the boxes are divided by the image's resize factor,
        NMS (threshold 0.5) is run once per class index from 1 upwards, and
        every box remembers the best class score under which it survived.
        The ``MAX_DETECTRON_FEATURES`` boxes with the highest such score
        are kept.

        Args:
            output: detection model output; ``output[0]`` is read for the
                "proposals", "scores" and ``feat_name`` entries.
            im_scales: one resize factor per image.
            feat_name: key of the feature tensor to slice.
            conf_thresh: accepted for interface compatibility; not used.

        Returns:
            A list with one feature tensor per image.
        """
        batch_size = len(output[0]["proposals"])
        n_boxes_per_image = [len(boxes) for boxes in output[0]["proposals"]]
        score_list = output[0]["scores"].split(n_boxes_per_image)
        score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]
        feats = output[0][feat_name].split(n_boxes_per_image)
        cur_device = score_list[0].device

        feat_list = []

        for i in range(batch_size):
            dets = output[0]["proposals"][i].bbox / im_scales[i]
            scores = score_list[i]

            max_conf = torch.zeros((scores.shape[0])).to(cur_device)

            # Class index 0 is skipped (presumably background - confirm).
            for cls_ind in range(1, scores.shape[1]):
                cls_scores = scores[:, cls_ind]
                keep = nms(dets, cls_scores, 0.5)
                max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep],
                                             cls_scores[keep],
                                             max_conf[keep])

            keep_boxes = torch.argsort(
                max_conf, descending=True
            )[:self.MAX_DETECTRON_FEATURES]
            feat_list.append(feats[i][keep_boxes])
        return feat_list

    def masked_unk_softmax(self, x, dim, mask_idx):
        """Softmax over ``dim`` with column ``mask_idx`` zeroed, then renormalized.

        The renormalizing sum is always taken over dimension 1.
        """
        x1 = F.softmax(x, dim=dim)
        x1[:, mask_idx] = 0
        x1_sum = torch.sum(x1, dim=1, keepdim=True)
        y = x1 / x1_sum
        return y

    def get_resnet_features(self, image_path):
        """Return the ResNet-152 grid features of an image as a (196, 2048) tensor."""
        path = self.get_actual_image(image_path)
        # convert("RGB") guarantees three channels, so no channel expansion
        # is needed after the transform.
        img = Image.open(path).convert("RGB")
        img_transform = self.data_transforms(img)
        img_transform = img_transform.unsqueeze(0).to("cuda")

        # Inference only: skip autograd bookkeeping even when this method is
        # called outside ``predict``.
        with torch.no_grad():
            features = self.resnet152_model(img_transform).permute(0, 2, 3, 1)
        features = features.view(196, 2048)
        return features

    def get_detectron_features(self, image_path):
        """Return the selected region features of the detection model for one image."""
        im, im_scale = self._image_transform(image_path)
        img_tensor, im_scales = [im], [im_scale]
        current_img_list = to_image_list(img_tensor, size_divisible=32)
        current_img_list = current_img_list.to('cuda')
        with torch.no_grad():
            output = self.detection_model(current_img_list)
        feat_list = self._process_feature_extraction(output, im_scales,
                                                     'fc6', 0.2)
        return feat_list[0]

In [19]:
# Instantiate the demo: the constructor builds the processors and loads the
# LoRRA, detection and ResNet-152 models onto "cuda".
demo = TextVQADemo()

self.answer_processor.get_vocab_size() 4047
'textvqa'
text_embeddings:  2048
text_embeddings:  2048
text_embeddings:  2048
Multi Modal combine layer: 5000
Classifier dimensions: 10000  out_dim 4047


In [20]:
def init_widgets(url, question):
    """Render the image-URL box, the question box and the submit button.

    Args:
        url: initial value of the image URL text box.
        question: initial value of the question text box.

    Returns:
        (image_text, question_text): the two text widgets that were displayed.
    """
    # The Layout property for a minimum width is spelled ``min_width``.
    image_text = widgets.Text(
        description="Image URL", layout=Layout(min_width="70%")
    )
    question_text = widgets.Text(
        description="Question", layout=Layout(min_width="70%")
    )

    image_text.value = url
    question_text.value = question
    submit_button = widgets.Button(description="Ask TextVQA!")

    display(image_text)
    display(question_text)
    display(submit_button)

    # ``on_button_click`` is looked up when the button is pressed, so it only
    # has to exist by then.
    submit_button.on_click(lambda b: on_button_click(
        b, image_text, question_text
    ))

    return image_text, question_text

In [21]:
def on_button_click(b, image_text, question_text):
    """Answer the current question and redraw the widgets, image and results.

    Args:
        b: the clicked button (unused).
        image_text: text widget holding the image URL or path.
        question_text: text widget holding the question.
    """
    clear_output()

    url, question = image_text.value, question_text.value
    image = Image.open(demo.get_actual_image(url))

    probabilities, predictions = demo.predict(url, question)
    # Confidence is shown as a percentage.
    results = pd.DataFrame({
        "Prediction": predictions,
        "Confidence": [p * 100 for p in probabilities],
    })

    # Redraw the input widgets first, then the image, then the answer table.
    init_widgets(url, question)
    display(image)
    display(HTML(results.to_html()))

In [22]:
# Draw the widgets pre-filled with a sample image and question; pressing the
# button runs the model on whatever the two boxes contain at that moment.
image_text, question_text = init_widgets(
    url="http://images.cocodataset.org/train2017/000000505539.jpg",
    question="where is this place?",
)


Text(value='http://images.cocodataset.org/train2017/000000505539.jpg', description='Image URL')

Text(value='where is this place?', description='Question')

Button(description='Ask TextVQA!', style=ButtonStyle())