In [2]:
import torch, torchvision
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy

In [3]:
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg

In [4]:
# if torch.cuda.is_available():
#     device = torch.device("cuda")

In [5]:
# x = torch.rand(10000,256,device=device)
# y = x.to(device)
# print(x[0:5,0:5])

In [6]:
cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"


def load_config_and_model_weights(cfg_path):
    """Build a detectron2 config for the model-zoo entry named by ``cfg_path``.

    Starts from the default config, merges the model-zoo YAML, points
    ``MODEL.WEIGHTS`` at the matching model-zoo checkpoint URL, sets the
    ROI-heads test score threshold to 0.5 and forces CPU execution.

    Args:
        cfg_path: Model-zoo relative config path.

    Returns:
        The populated config object.
    """
    config = get_cfg()
    config.merge_from_file(model_zoo.get_config_file(cfg_path))

    # Pretrained checkpoint that belongs to the same model-zoo entry.
    config.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

    # Score threshold applied by the ROI heads at test time.
    config.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

    # Run on CPU; remove this line to use the config's default device.
    config.MODEL.DEVICE = 'cpu'

    return config


cfg = load_config_and_model_weights(cfg_path)

In [7]:
def get_model(cfg):
    """Build the detection model described by ``cfg``, load its weights, and
    return it in evaluation mode.

    Args:
        cfg: Config whose ``MODEL.WEIGHTS`` names the checkpoint to load.

    Returns:
        The model after ``eval()`` has been called on it.
    """
    detector = build_model(cfg)
    # Populate the freshly built model from the configured checkpoint.
    DetectionCheckpointer(detector).load(cfg.MODEL.WEIGHTS)
    detector.eval()
    return detector


model = get_model(cfg)

The checkpoint state_dict contains keys that are not used by the model:
  proposal_generator.anchor_generator.cell_anchors.{0, 1, 2, 3, 4}


In [8]:
import glob
import cv2
import os


# image1 = []
# image2 = []
# image3 = []
# image4 = []

# img_dir1 = '/home/btech/nityanand.mathur/hateful_memes/dataset/train/hateful'
# img_dir2= '/home/btech/nityanand.mathur/hateful_memes/dataset/train/non_hateful'
# img_dir3= '/home/btech/nityanand.mathur/hateful_memes/dataset/val/hateful'
# img_dir4= '/home/btech/nityanand.mathur/hateful_memes/dataset/val/non_hateful'
# image = '/home/btech/nityanand.mathur/hateful_memes/dataset/train/hateful/01235.png'
# img = cv2.imread(image)

# data_path1 = os.path.join(img_dir1,'*g')
# files = glob.glob(data_path1)
# for f1 in files:
#     img1 = cv2.imread(f1)
#     image1.append(img1)



# data_path2 = os.path.join(img_dir2,'*g')
# files = glob.glob(data_path2)
# for f1 in files:
#     img2 = cv2.imread(f1)
#     image2.append(img2)


# data_path3 = os.path.join(img_dir3,'*g')
# files = glob.glob(data_path3)
# for f1 in files:
#     img3= cv2.imread(f1)
#     image3.append(img3)


# data_path4 = os.path.join(img_dir4,'*g')
# files = glob.glob(data_path4)
# for f1 in files:
#     img4 = cv2.imread(f1)
#     image4.append(img4)

# image_bgr1 = []
# for i in image1:
#     img_bgr = cv2.cvtColor(i,cv2.COLOR_RGB2BGR)
#     image_bgr1.append(img_bgr)

# image_bgr2 = []
# for i in image2:
#     img_bgr = cv2.cvtColor(i,cv2.COLOR_RGB2BGR)
#     image_bgr2.append(img_bgr)

# image_bgr3 = []
# for i in image3:
#     img_bgr = cv2.cvtColor(i,cv2.COLOR_RGB2BGR)
#     image_bgr3.append(img_bgr)

# image_bgr4 = []
# for i in image4:
#     img_bgr = cv2.cvtColor(i,cv2.COLOR_RGB2BGR)
#     image_bgr4.append(img_bgr)

# del(image1)
# del(image2)
# del(image3)
# del(image4)

In [9]:
# print(img.shape)

In [10]:
import numpy as np


In [11]:
def prepare_image_inputs(cfg, img_list):
    """Resize, tensorize and normalize a list of H x W x C image arrays.

    Note: the padding divisibility is read from the module-level ``model``
    (``model.backbone.size_divisibility``), so ``model`` must already exist.

    Args:
        cfg: Config providing ``INPUT.MIN_SIZE_TEST``, ``INPUT.MAX_SIZE_TEST``,
            ``MODEL.PIXEL_MEAN`` and ``MODEL.PIXEL_STD``.
        img_list: List of image arrays in H x W x C layout.

    Returns:
        ``(images, batched_inputs)``: the ``ImageList`` built from the
        normalized tensors, and a list of dicts holding each resized image as
        an un-normalized C x H x W float tensor under ``"image"`` together
        with the resized ``"height"`` and ``"width"``.
    """
    # Shortest-edge resize configured with the test-time sizes.
    min_size = cfg.INPUT.MIN_SIZE_TEST
    resizer = T.ResizeShortestEdge([min_size, min_size], cfg.INPUT.MAX_SIZE_TEST)
    resized = []
    for raw in img_list:
        resized.append(resizer.get_transform(raw).apply_image(raw))

    # H x W x C arrays -> C x H x W float32 tensors, with the resized dimensions.
    batched_inputs = []
    for arr in resized:
        chw = torch.Tensor(arr.astype("float32").transpose(2, 0, 1))
        batched_inputs.append({"image": chw, "height": arr.shape[0], "width": arr.shape[1]})

    # Per-channel normalization with the configured mean / std.
    n_channels = len(cfg.MODEL.PIXEL_MEAN)
    mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(n_channels, 1, 1)
    std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(n_channels, 1, 1)
    normalized = [(entry["image"] - mean) / std for entry in batched_inputs]

    # Batch into an ImageList using the backbone's size divisibility.
    images = ImageList.from_tensors(normalized, model.backbone.size_divisibility)

    return images, batched_inputs

def get_features(model, images):
    """Run ``model.backbone`` on ``images.tensor`` and return its output."""
    return model.backbone(images.tensor)

    
def get_proposals(model, images, features):
    """Call ``model.proposal_generator`` and return the first element of the
    pair it produces; the second element is discarded."""
    generator_output = model.proposal_generator(images, features)
    found, _unused = generator_output
    return found


def get_box_features(model, features, proposals):
    """Pool per-proposal features and pass them through the box head up to ``fc2``.

    The pooled features go through ``flatten -> fc1 -> fc_relu1 -> fc2`` of
    ``model.roi_heads.box_head`` and are then regrouped per image.

    Args:
        model: Model exposing ``roi_heads.box_pooler`` and ``roi_heads.box_head``.
        features: Mapping of feature maps; levels ``p2``..``p5`` are used.
        proposals: One entry per image, each exposing ``proposal_boxes``.

    Returns:
        ``(box_features, features_list)`` where ``box_features`` has shape
        ``(len(proposals), boxes_per_image, feature_dim)`` and ``features_list``
        is the list of the four feature maps that were pooled.

    Raises:
        RuntimeError: If the total number of pooled boxes cannot be split evenly
            across ``len(proposals)`` images.
    """
    features_list = [features[f] for f in ['p2', 'p3', 'p4', 'p5']]
    box_head = model.roi_heads.box_head

    box_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    box_features = box_head.flatten(box_features)
    box_features = box_head.fc1(box_features)
    box_features = box_head.fc_relu1(box_features)
    box_features = box_head.fc2(box_features)

    # Derive the shape from the data instead of hard-coding (1, 1000, 1024):
    # the number of proposals and the FC width both depend on config and image.
    box_features = box_features.reshape(len(proposals), -1, box_features.shape[-1])
    return box_features, features_list


def get_prediction_logits(model, features_list, proposals):
    """Pool the proposal boxes, run the full box head, and return the two
    outputs of ``model.roi_heads.box_predictor``.

    Args:
        model: Model exposing ``roi_heads`` with ``box_pooler``, ``box_head``
            and ``box_predictor``.
        features_list: Feature maps handed to the box pooler.
        proposals: One entry per image, each exposing ``proposal_boxes``.

    Returns:
        ``(pred_class_logits, pred_proposal_deltas)``.
    """
    heads = model.roi_heads
    boxes_per_image = [entry.proposal_boxes for entry in proposals]
    pooled = heads.box_pooler(features_list, boxes_per_image)
    class_logits, proposal_deltas = heads.box_predictor(heads.box_head(pooled))
    return class_logits, proposal_deltas


def get_box_scores(cfg, pred_class_logits, pred_proposal_deltas, proposals=None):
    """Turn box-predictor outputs into per-image boxes and class probabilities.

    Args:
        cfg: Config providing ``MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS`` and
            ``MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA``.
        pred_class_logits: Class logits from the box predictor.
        pred_proposal_deltas: Box regression deltas from the box predictor.
        proposals: Proposals the predictions were computed for. When omitted,
            the module-level ``proposals`` variable is used, which keeps
            existing three-argument calls working; prefer passing it explicitly.

    Returns:
        ``(boxes, scores, image_shapes)`` as produced by ``FastRCNNOutputs``
        (``predict_boxes()``, ``predict_probs()`` and ``image_shapes``).

    Raises:
        NameError: If ``proposals`` is not given and no module-level
            ``proposals`` exists.
    """
    if proposals is None:
        # Backward-compatible fallback: earlier versions read this global implicitly.
        try:
            proposals = globals()["proposals"]
        except KeyError:
            raise NameError("name 'proposals' is not defined") from None

    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

    outputs = FastRCNNOutputs(
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    )

    boxes = outputs.predict_boxes()
    scores = outputs.predict_probs()
    image_shapes = outputs.image_shapes

    return boxes, scores, image_shapes


def get_output_boxes(boxes, batched_inputs, image_size):
    """Wrap predicted boxes in a ``Boxes`` object, rescale them, and clip them.

    Args:
        boxes: Tensor of box coordinates; reshaped to ``(-1, 4)`` and cloned.
        batched_inputs: Dict with the ``"width"`` and ``"height"`` to scale to.
        image_size: ``(height, width)`` the boxes are currently expressed in;
            also used as the clipping bounds.

    Returns:
        The scaled and clipped ``Boxes``.
    """
    current_h, current_w = image_size[0], image_size[1]
    ratio_w = batched_inputs["width"] / current_w
    ratio_h = batched_inputs["height"] / current_h

    # Clone so the in-place scale/clip below never touch the caller's tensor.
    result = Boxes(boxes.reshape(-1, 4).clone())
    result.scale(ratio_w, ratio_h)
    result.clip(image_size)
    return result


def select_boxes(cfg, output_boxes, scores):
    """Run per-class NMS and keep boxes whose best surviving score passes the threshold.

    detectron2 lays out class probabilities as ``(num_boxes, num_classes + 1)``
    with the background class in the LAST column, while the per-class boxes
    cover only the ``num_classes`` foreground classes. Column ``c`` of the
    scores therefore belongs to box slice ``c``; the background column is
    skipped entirely.

    Args:
        cfg: Config providing ``MODEL.ROI_HEADS.SCORE_THRESH_TEST`` and
            ``MODEL.ROI_HEADS.NMS_THRESH_TEST``.
        output_boxes: ``Boxes`` whose tensor holds ``num_boxes * num_classes``
            rows of 4 coordinates.
        scores: Tensor of shape ``(num_boxes, num_classes + 1)``.

    Returns:
        ``(keep_boxes, max_conf)``: indices of boxes whose confidence is at
        least the score threshold, and the per-box maximum confidence over the
        classes in which the box survived NMS (0 where it never survived).
    """
    test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST

    cls_prob = scores.detach().cpu()
    num_boxes = cls_prob.shape[0]
    num_classes = cls_prob.shape[1] - 1  # last column is background
    # Shape is derived from the scores instead of assuming 1000 boxes x 80 classes.
    cls_boxes = output_boxes.tensor.detach().cpu().reshape(num_boxes, num_classes, 4)

    max_conf = torch.zeros(num_boxes)
    for cls_ind in range(num_classes):
        cls_scores = cls_prob[:, cls_ind]
        det_boxes = cls_boxes[:, cls_ind, :]
        keep = nms(det_boxes, cls_scores, test_nms_thresh)
        max_conf[keep] = torch.max(cls_scores[keep], max_conf[keep])

    keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
    return keep_boxes, max_conf


def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
    """Force the number of kept boxes into ``[min_boxes, max_boxes]``.

    When the count is already in range, ``keep_boxes`` is returned unchanged.
    Otherwise the selection is replaced by the indices of the highest
    ``max_conf`` entries (``min_boxes`` of them when too few were kept,
    ``max_boxes`` when too many), ordered from most to least confident.

    Args:
        keep_boxes: Indices of the currently selected boxes.
        max_conf: Per-box confidence (CPU tensor or array).
        min_boxes: Minimum number of boxes to return.
        max_boxes: Maximum number of boxes to return.

    Returns:
        ``keep_boxes`` itself, or a NumPy array of replacement indices.
    """
    num_kept = len(keep_boxes)
    if num_kept < min_boxes:
        limit = min_boxes
    elif num_kept > max_boxes:
        limit = max_boxes
    else:
        return keep_boxes
    # Sort in NumPy space explicitly rather than relying on np.argsort
    # accepting a torch tensor and handing back something with .numpy().
    ranked = np.argsort(np.asarray(max_conf))[::-1]
    return ranked[:limit]


def get_visual_embeds(box_features, keep_boxes):
    """Select the rows of ``box_features`` named by ``keep_boxes``.

    Accepts both index forms produced upstream: a torch tensor (selection
    passed through unchanged) or a NumPy array, possibly a reversed view with
    negative strides. Previously the tensor case failed because tensors have
    no ``.copy()`` method.

    Args:
        box_features: Tensor whose first dimension indexes boxes.
        keep_boxes: Integer indices as a torch tensor, NumPy array or sequence.

    Returns:
        Tensor containing the selected rows, in the order given.
    """
    if torch.is_tensor(keep_boxes):
        indices = keep_boxes
    else:
        # ascontiguousarray copies away negative strides, which torch rejects.
        indices = torch.from_numpy(np.ascontiguousarray(keep_boxes))
    return box_features[indices.to(device=box_features.device, dtype=torch.long)]


In [12]:
# Three independent, initially empty scratch lists.
a, b, c = [], [], []

In [None]:
# Extract one VisualBERT [CLS] embedding per meme and append it, together with
# its label, to a CSV file (one row per image, no header, no index column).
import json

import pandas as pd
from transformers import BertTokenizer, VisualBertModel

DATASET_DIR = 'hateful_memes/'
ANNOTATIONS_PATH = DATASET_DIR + 'test_seen.jsonl'
OUTPUT_CSV = 'vbOutput_copy_test.csv'
# Records [START_INDEX, END_INDEX) are processed; END_INDEX is exclusive.
START_INDEX = 243
END_INDEX = 999
MIN_BOXES = 10
MAX_BOXES = 100

# The annotations are a .jsonl file (one JSON object per line), which
# json.load() rejects. Accept a single JSON document too, in case the file was
# converted to a JSON array.
with open(ANNOTATIONS_PATH, 'r') as f:
    annotations_text = f.read()
try:
    data = json.loads(annotations_text)
except json.JSONDecodeError:
    data = [json.loads(line) for line in annotations_text.splitlines() if line.strip()]
if isinstance(data, dict):
    # A one-line .jsonl file parses as a single object; normalise to a list.
    data = [data]

# Load the text tokenizer and VisualBERT once, not once per image.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model1 = VisualBertModel.from_pretrained("uclanlp/visualbert-nlvr2-coco-pre")

for i in range(START_INDEX, min(END_INDEX, len(data))):
    record = data[i]
    img_path = DATASET_DIR + record['img']
    data_label = record['label']
    print(i)
    print(img_path)

    # cv2.imread already returns BGR, so the image is passed on as-is;
    # converting with COLOR_RGB2BGR would swap the channels to RGB.
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")

    # Inference only: no autograd graph is needed for either model.
    with torch.no_grad():
        # Region features from the detector.
        images, batched_inputs = prepare_image_inputs(cfg, [img])
        features = get_features(model, images)
        # NOTE: `proposals` must stay a module-level name; get_box_scores reads it.
        proposals = get_proposals(model, images, features)
        box_features, features_list = get_box_features(model, features, proposals)
        pred_class_logits, pred_proposal_deltas = get_prediction_logits(model, features_list, proposals)
        boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas)
        output_boxes = [
            get_output_boxes(boxes[k], batched_inputs[k], proposals[k].image_size)
            for k in range(len(proposals))
        ]
        selections = [select_boxes(cfg, output_boxes[k], scores[k]) for k in range(len(scores))]
        keep_boxes = [
            filter_boxes(kept, conf, MIN_BOXES, MAX_BOXES)
            for kept, conf in selections
        ]

        # Tokenize the meme text.
        inputs = tokenizer(record['text'], return_tensors="pt")

        # Visual embeddings for the selected boxes, batched per image.
        visual_embeds = torch.stack([
            get_visual_embeds(box_feature, keep_box)
            for box_feature, keep_box in zip(box_features, keep_boxes)
        ])
        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)

        # Combine the text and visual inputs.
        inputs.update(
            {
                "visual_embeds": visual_embeds,
                "visual_token_type_ids": visual_token_type_ids,
                "visual_attention_mask": visual_attention_mask,
            }
        )
        outputs = model1(**inputs)

    last_hidden_states = outputs.last_hidden_state
    # Hidden state of the first token of the first (only) sequence.
    x = last_hidden_states[0][0].detach().numpy()
    print(data_label)

    # Append immediately so an interrupted run keeps every finished row.
    df = pd.DataFrame([x])
    df['labels'] = [data_label]
    df.to_csv(OUTPUT_CSV, mode='a', index=False, header=False)

In [None]:
import pandas as pd
# Load previously extracted features into a DataFrame.
# NOTE(review): this reads 'vbOutput_copy_test_seen.csv', whereas the extraction
# loop in this notebook appends to 'vbOutput_copy_test.csv' — confirm which file
# is intended here.
df4= pd.read_csv('vbOutput_copy_test_seen.csv')

In [14]:
# Display the loaded feature table as the cell's output.
df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,labels
0,0.170978,0.089326,0.429117,0.070517,0.067093,0.235964,0.108436,0.030929,-0.110482,-0.014652,...,0.193023,0.013946,-0.091701,0.274277,-0.052339,0.121195,0.168660,0.081520,0.149842,0
1,0.089268,0.048875,0.408846,0.066852,-0.071109,0.176630,0.021386,-0.000114,-0.169066,-0.141772,...,0.294075,0.137564,-0.144482,0.463317,-0.068560,0.318430,0.235068,0.101768,0.168836,0
2,0.256827,-0.026399,0.623371,0.175083,-0.191368,0.322828,-0.106844,0.412517,-0.237286,-0.250846,...,0.476146,0.097350,-0.194196,0.430817,0.024029,0.256070,0.263368,0.081743,0.260702,0
3,0.270728,0.333192,0.521146,0.495716,0.039000,0.405105,-0.137310,0.528017,-0.419496,-0.247839,...,0.299528,-0.027163,0.093689,0.423370,0.032230,0.236399,0.021081,-0.070043,0.338585,0
4,0.205579,-0.094509,0.260854,0.360824,-0.024221,0.257254,-0.019049,0.381762,-0.370187,-0.259235,...,0.691821,0.056803,-0.206222,0.129076,0.144424,0.110602,0.049915,0.016529,0.653590,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5018,-0.010783,0.171946,0.504565,0.186444,0.030936,0.359977,0.147488,0.118704,-0.173143,0.001097,...,0.126878,0.046461,-0.112887,0.287010,-0.026144,0.228265,0.200479,0.122199,0.152073,0
5019,0.186702,-0.085937,0.399042,0.246066,-0.373190,0.360632,-0.033751,0.520471,-0.224998,-0.324981,...,0.563349,0.393105,0.013627,0.250617,0.192441,0.424167,0.220093,0.032374,0.450843,0
5020,0.243489,0.296555,0.390999,0.253801,-0.000433,0.414278,-0.284034,0.365687,-0.375300,0.082686,...,0.029395,-0.171186,-0.153967,0.326386,-0.129204,0.384538,0.006830,0.058571,0.216073,1
5021,0.267090,0.099608,0.442221,0.057278,-0.001420,0.226526,0.080998,0.071496,-0.149249,0.002621,...,0.238559,-0.057606,-0.065466,0.358417,-0.041131,0.211333,0.207369,0.070160,0.124140,1
