## Object Detection with the OpenImages v4 classes

The paper for this project is [Discriminative Region-Based Multi-Label Zero-Shot Learning (ICCV 2021)](https://openaccess.thecvf.com/content/ICCV2021/papers/Narayan_Discriminative_Region-Based_Multi-Label_Zero-Shot_Learning_ICCV_2021_paper.pdf).<br>
The code is a slightly modified copy of the code in the corresponding [GitHub repository](https://github.com/akshitac8/BiAM).


In [None]:
from IPython.display import Image
# Render the image file ../images/BiAM1.png inline (Image is IPython.display.Image here)
Image('../images/BiAM1.png')

### Understanding the Backbone
The global (scene-context) features $\mathbf{x}_g$ as well as the regional features $\mathbf{x}_r$ are extracted from different layers of an old-school VGG19 net.

In [None]:

# Render the image file ../images/vgg1.png inline (Image is IPython.display.Image here)
Image('../images/vgg1.png')

In [None]:
import torch.nn.functional as F
import torchvision
from torch import nn
import torch


class Net(nn.Module):
    """
    Backbone that computes the regional feature map.

    Wraps the convolutional feature extractor of torchvision's VGG19
    (constructed with ``True`` as its first argument) with the trailing
    MaxPool2d layer left out.
    """

    def __init__(self):
        super().__init__()
        pretrained = torchvision.models.vgg19(True)

        # keep every feature layer except the final MaxPool2d
        conv_layers = list(pretrained.features)[:-1]
        self.regional_features = nn.Sequential(*conv_layers)

    def forward(self, x):
        # interpret the input as a batch of 3 x 224 x 224 images (PyTorch channel-first order)
        images = x.view(-1, 3, 224, 224)
        return self.regional_features(images)

       
    
class vgg_net(nn.Module):
    """
    Turns the backbone's 'regional' features into the contextual feature vector.

    Applies the last MaxPool2d of VGG19 to the regional feature map and feeds
    the flattened result through the VGG19 classifier with its final two
    modules (last Dropout and the 4096 -> 1000 linear layer) removed.
    """

    def __init__(self):
        super().__init__()
        pretrained = torchvision.models.vgg19(True)

        # the trailing MaxPool2d is the only feature layer used here
        self.features = nn.Sequential(*list(pretrained.features)[-1:])

        # classifier truncated before the last Dropout and the 4096 -> 1000 linear layer
        self.fc = nn.Sequential(*list(pretrained.classifier)[:-2])

    def forward(self, x):
        feature_map = x.view([-1, 512, 14, 14])
        pooled = self.features(feature_map)
        # flatten everything but the batch dimension
        flattened = pooled.reshape(pooled.size(0), -1)
        return self.fc(flattened)
    
class Vgg(nn.Module):
    """
    Final classification head: maps contextual feature vectors to the
    1000 ImageNet categories using the last two modules of the VGG19 classifier.
    """

    def __init__(self):
        super().__init__()
        pretrained = torchvision.models.vgg19(True)
        head_layers = list(pretrained.classifier)[-2:]
        self.final_classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        # one 4096-dimensional contextual feature vector per batch entry
        context_vectors = x.view(-1, 4096)
        return self.final_classifier(context_vectors)



In [None]:
# Build the complete torchvision VGG19 and leave it as the cell's last expression
# so the notebook prints its module structure
vgg19 = torchvision.models.vgg19(True)
vgg19

In [None]:
# Render the image file ../images/vgg2.png inline (Image is IPython.display.Image here)
Image('../images/vgg2.png')

### Joint Visual-Semantic Space

The targets of the BiAM network are the GloVe embeddings of the OpenImages labels. The network is trained to output embedding vectors that are as close as possible to the embeddings of the labels.<br>
This is also how the zero-shot learning takes place.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
import random
import torchvision
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


class Net(nn.Module):
    """
    Regional-feature backbone: the VGG19 feature extractor without its
    last MaxPool2d layer.
    """

    def __init__(self):
        super().__init__()
        vgg = torchvision.models.vgg19(True)

        # drop only the final MaxPool2d; all preceding feature layers are kept
        feature_layers = list(vgg.features.children())
        self.regional_features = nn.Sequential(*feature_layers[:-1])

    def forward(self, x):
        # images arrive as 224 x 224 x 3 data and are viewed in PyTorch (N, C, H, W) order
        batch = x.view(-1, 3, 224, 224)
        return self.regional_features(batch)
    
class Vgg(nn.Module):
    """
    ImageNet head: classifies contextual feature vectors into the 1000
    ImageNet categories with the last two modules of the VGG19 classifier.
    """

    def __init__(self):
        super().__init__()
        vgg = torchvision.models.vgg19(True)
        classifier_layers = list(vgg.classifier.children())
        self.final_classifier = nn.Sequential(*classifier_layers[-2:])

    def forward(self, x):
        # (batch-size, 4096) contextual feature vectors
        features = x.view(-1, 4096)
        return self.final_classifier(features)

class vgg_net(nn.Module):
    """
    Computes the contextual feature vector from the backbone's 'regional'
    features: last VGG19 MaxPool2d, flatten, then the VGG19 classifier
    without its last Dropout and its 4096 -> 1000 linear layer.

    NOTE: a class with the same name is defined again further down in this
    cell and replaces this one once the cell has run.
    """

    def __init__(self):
        super().__init__()
        vgg = torchvision.models.vgg19(True)
        # MaxPool2d only
        pooling = list(vgg.features.children())[-1:]
        self.features = nn.Sequential(*pooling)
        # classifier minus its last two modules
        truncated_classifier = list(vgg.classifier.children())[:-2]
        self.fc = nn.Sequential(*truncated_classifier)

    def forward(self, x):
        pooled = self.features(x.view([-1, 512, 14, 14]))
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) with the same fixed value for reproducibility.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(3483)

def tensordot(x,y):
    """Matrix-multiply ``x`` by ``y`` (broadcasting over leading batch dimensions).

    Used as the tensor equivalent of the einsum contraction "abc,cd->abd".
    """
    return x @ y


def matmul(x,y):
    """Return the matrix product of ``x`` and ``y``.

    Used as the tensor equivalent of the einsum contraction "ab,bc->ac".
    """
    return x @ y

class CONV3_3(nn.Module):
    """
    Convolution block: Conv2d -> in-place ReLU -> BatchNorm2d.

    The padding is derived from the kernel size so that, for odd kernels,
    the spatial size of the input is preserved.
    """

    def __init__(self, num_in=512, num_out=512, kernel=3):
        super().__init__()
        pad = int((kernel - 1) / 2)
        self.body = nn.Conv2d(num_in, num_out, kernel, padding=pad, dilation=1)
        self.bn = nn.BatchNorm2d(num_out, affine=True, eps=0.001, momentum=0.99)
        self.relu = nn.ReLU(True)

    def forward(self, x):
        # note the order: activation first, batch normalisation afterwards
        activated = self.relu(self.body(x))
        return self.bn(activated)

class CONV1_1(nn.Module):
    """
    Plain convolution layer (1x1 by default) without activation or normalisation.
    """

    def __init__(self, num_in=512, num_out=512, kernel=1):
        super().__init__()
        pad = int((kernel - 1) / 2)
        self.body = nn.Conv2d(num_in, num_out, kernel, padding=pad, dilation=1)

    def forward(self, x):
        return self.body(x)

class vgg_net(nn.Module):
    """
    Contextual-feature extractor built from VGG19: the last layer of
    ``features`` followed by the classifier without its final two modules.

    This definition replaces the ``vgg_net`` class defined earlier in the cell.
    """

    def __init__(self):
        super().__init__()
        pretrained = torchvision.models.vgg19(True)
        self.features = nn.Sequential(*list(pretrained.features)[-1:])
        self.fc = nn.Sequential(*list(pretrained.classifier)[:-2])

    def forward(self, x):
        feature_map = x.view([-1, 512, 14, 14])
        pooled = self.features(feature_map)
        # reshape (rather than view) also copes with non-contiguous tensors
        flattened = pooled.reshape(pooled.size(0), -1)
        return self.fc(flattened)

class RCB(nn.Module):
    """
    Region contextualized block.

    Multi-head attention over the spatial positions of a square feature map
    (all projections are 1x1 convolutions), followed by the feed-forward
    sub-network ``C_R``. Both stages add their input back as a residual.
    """

    def __init__(self, heads=8, d_model=512, d_ff=1024, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads  # channels handled by each head
        self.h = heads

        def projection():
            # 1x1 convolution that keeps the channel count
            return nn.Conv2d(in_channels=d_model, out_channels=d_model, kernel_size=1, bias=True)

        # creation order is kept (q, k, v, o) so seeded initialisation is unchanged
        self.w_q = projection()
        self.w_k = projection()
        self.w_v = projection()
        self.w_o = projection()
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.sub_network = C_R(d_model, d_ff)

    def F_R(self, q, k, v, d_k, dropout=None):
        """Return the attention weights softmax(q k^T / sqrt(d_k)).

        Entries whose scaled score is exactly 0 are set to -1e9 before the
        softmax. ``v`` is accepted but not used. ``dropout``, if given, is
        applied to the resulting weights.
        """
        similarity = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        similarity = similarity.masked_fill(similarity == 0, -1e9)
        weights = F.softmax(similarity, dim=-1)
        return weights if dropout is None else dropout(weights)

    def forward(self, q_feat, k_feat, v_feat):
        """Attend over the positions of ``q_feat`` and return a map of the same shape.

        ``k_feat`` falls back to ``q_feat`` when it is None. The inputs are
        read as (batch, d_model, side, side) maps, as implied by the views below.
        """
        if k_feat is None:
            k_feat = q_feat
        batch = q_feat.size(0)
        side = q_feat.size(-1)
        positions = side * side

        def split_heads(projected):
            # (batch, d_model, side, side) -> (batch, heads, positions, d_k)
            return projected.view(batch, self.h, self.d_k, positions).transpose(3, 2)

        keys = split_heads(self.w_k(k_feat))
        queries = split_heads(self.w_q(q_feat))
        values = split_heads(self.w_v(v_feat))

        weights = self.F_R(queries, keys, values, self.d_k, self.dropout_1)
        attended = torch.matmul(weights, values)

        # merge the heads again and restore the (batch, d_model, side, side) layout
        merged = attended.transpose(1, 2).contiguous().view(batch, -1, self.d_model)
        merged = merged.permute(0, 2, 1)
        merged = merged.view(-1, self.d_model, side, side)

        attention_out = self.dropout_2(self.w_o(merged))
        attention_out += q_feat  # first residual connection

        e_r = self.sub_network(attention_out)
        e_r += attention_out  # second residual connection
        return e_r

class C_R(nn.Module):
    """
    Position-wise feed-forward network built from two 1x1 convolutions:
    d_model -> d_ff, ReLU, d_ff -> d_model.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=True)
        self.conv2 = nn.Conv2d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=True)

    def forward(self, x):
        # the ReLU runs in place on the output of conv1, not on the input x
        hidden = F.relu(self.conv1(x), True)
        return self.conv2(hidden)

class SCB(nn.Module):
    """
    Scene contextualized block.

    Gates the regional feature map channel-wise with a sigmoid gate computed
    from the spatially averaged regional features and the projected 4096-d
    global feature vector, then adds a CONV3_3 refinement of the gated map.
    """

    def __init__(self, opt, D):
        # D is accepted for interface compatibility but is not used here
        super().__init__()
        channels = opt.channel_dim
        self.channel_dim = channels
        self.sigmoid = nn.Sigmoid()
        self.gcdropout = nn.Dropout(0.2)
        self.lrelu = nn.LeakyReLU(0.2, False)
        # projects the 4096-d global vector (viewed as a 1x1 map) to channel_dim
        self.w_g = nn.Conv2d(in_channels=4096, out_channels=channels, kernel_size=1, bias=True)
        self.gcff = CONV3_3(num_in=channels, num_out=channels)
        self.channel_conv = CONV1_1(num_in=channels, num_out=channels)

    def F_G(self, q , k):
        """Return the sigmoid gate of the element-wise product q * k, shaped (-1, channel_dim, 1)."""
        gate = self.sigmoid(q * k)
        return gate.view(-1, self.channel_dim, 1)

    def forward(self, h_r, vecs, x_g):
        """Return the scene-contextualized map e_g; ``vecs`` is accepted but not used."""
        # query and value are produced by the same expression on h_r
        q_g = self.lrelu(self.channel_conv(h_r))
        v_g = self.lrelu(self.channel_conv(h_r))
        # key: dropout on the global vector, then the 1x1 projection
        k_g = self.w_g(self.gcdropout(x_g).view(-1, 4096, 1, 1))

        # average the query over its 196 spatial positions -> (-1, channel_dim)
        pooled_query = q_g.view(-1, self.channel_dim, 196).mean(-1)
        pooled_query = pooled_query.repeat(1, 1, 1).view(-1, self.channel_dim)

        r_g = self.F_G(pooled_query, k_g.view(-1, self.channel_dim))

        # broadcast the per-channel gate over the 14 x 14 value map
        gated = r_g.unsqueeze(3).unsqueeze(4) * v_g.unsqueeze(2)
        c_g = gated.view(-1, self.channel_dim, 14, 14)
        return c_g + self.gcff(c_g)

class BiAM(nn.Module):
    """
    Bi-level attention module.

    Combines the region contextualized block (RCB) and the scene
    contextualized block (SCB) on top of the regional features and scores
    the fused features against projected label embeddings.
    """

    def __init__(self, opt, dim_w2v=300, dim_feature=[196,512]):
        super().__init__()
        D = dim_feature[1]  # feature dimension of the attention windows
        channels = opt.channel_dim
        self.channel_dim = channels
        self.conv_3X3 = CONV3_3(num_out=channels)
        self.region_context_block = RCB(heads=opt.heads, d_model=channels, d_ff=channels * 2, dropout=0.1)
        self.scene_context_block = SCB(opt, D)
        # maps label embeddings (dim_w2v) into the D-dimensional feature space
        self.W = nn.Linear(dim_w2v, D, bias=True)
        self.conv_1X1 = CONV1_1(num_in=channels * 2, num_out=D)
        self.lrelu = nn.LeakyReLU(0.2, True)

    def predict(self, e_f, vecs, W):
        """Score every label: project ``vecs`` with ``W``, multiply with the
        per-position features ``e_f`` and average the 6 largest scores along
        dimension 1."""
        classifiers = W(vecs)
        position_scores = tensordot(e_f, classifiers.t())
        top_scores, _ = torch.topk(position_scores, k=6, dim=1)
        return top_scores.mean(dim=1)

    def forward(self, features, vecs, x_g):
        """Return one logit per row of ``vecs`` for every image.

        ``features`` is viewed as (-1, 512, 14, 14) regional features and
        ``x_g`` is the global feature passed on to the scene block.
        """
        x_r = features.view([-1, 512, 14, 14])
        h_r = self.conv_3X3(x_r)
        # region block first, scene block second
        e_r = self.region_context_block(h_r, h_r, h_r)
        e_g = self.scene_context_block(h_r, vecs, x_g)
        fused = torch.cat([e_r, e_g], dim=1)
        fused = self.lrelu(self.conv_1X1(fused))
        # channels last, then flatten the 14 x 14 grid into 196 positions
        e_f = fused.permute(0, 2, 3, 1).view(-1, 196, 512)
        return self.predict(e_f, vecs, self.W)

def ranking_lossT(logitsT, labelsT):
    """Pairwise ranking (hinge) loss between positive and negative labels.

    Args:
        logitsT: (batch, classes) tensor of predicted scores.
        labelsT: (batch, classes) tensor of labels; values > 0 are treated as
            positive, values < 0 as negative and 0 as unlabelled.

    Returns:
        Scalar tensor: for each sample the summed margin violations
        ``max(0, 1 + negative_score - positive_score)`` over all
        (negative, positive) pairs divided by the number of violating pairs,
        averaged over the batch.

    The class-index tensor is created on the device of ``labelsT`` instead of
    being forced onto CUDA, so the loss works for CPU and GPU tensors alike.
    """
    eps = 1e-8
    # only classes that carry a label for at least one sample in the batch
    labelled = torch.sum(torch.abs(labelsT), dim=0) > 0
    subset_idxT = labelled.nonzero().view(-1).long()
    sub_labelsT = labelsT[:, subset_idxT]
    sub_logitsT = logitsT[:, subset_idxT]

    positive_tagsT = torch.clamp(sub_labelsT, 0., 1.)
    negative_tagsT = torch.clamp(-sub_labelsT, 0., 1.)
    # maskT[b, i, j] == 1 iff class i is negative and class j is positive for sample b
    maskT = positive_tagsT.unsqueeze(1) * negative_tagsT.unsqueeze(-1)

    pos_score_matT = sub_logitsT * positive_tagsT
    neg_score_matT = sub_logitsT * negative_tagsT
    IW_pos3T = pos_score_matT.unsqueeze(1)
    IW_neg3T = neg_score_matT.unsqueeze(-1)

    # margin of 1 between every negative and every positive score
    OT = 1 + IW_neg3T - IW_pos3T
    diffT = torch.clamp(maskT * OT, 0)

    violationT = torch.sign(diffT).sum(1).sum(1)  # number of violating pairs per sample
    diffT = diffT.sum(1).sum(1)
    return torch.mean(diffT / (violationT + eps))


In [None]:
import argparse

# Command-line style options of the BiAM code; in the notebook only the defaults are used.
parser = argparse.ArgumentParser()

# --- optimisation / training schedule ---
parser.add_argument('--lr', type=float, default=0.001 ,help='initial learning rate')
parser.add_argument('--lr_min', type=float, default=0.0002 ,help='minimum lr for scheduler drop')
parser.add_argument('--train_full_lr', type=float, default=0.0002 ,help='lr for finetuning')
parser.add_argument('--workers', type=int,help='number of data loading workers', default=0)
parser.add_argument('--manualSeed', type=int, help='manual seed')
# NOTE: store_true combined with default=True means this flag can never be switched off
parser.add_argument('--cuda', action='store_true',default=True, help='enables cuda')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
parser.add_argument('--nepoch', type=int, default=2000, help='number of epochs to train for')
# help text fixed: it was a copy of the --cuda help ('enables cuda')
parser.add_argument('--train', action='store_true',default=False, help='enables training')
parser.add_argument('--train_full_data', action='store_true',default=False, help='Only train a pretrained model')

# --- evaluation cadence and batch sizes ---
parser.add_argument('--eval_interval', type=int, default=2)
parser.add_argument('--test_interval', type=int, default=10)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--val_batch_size', type=int, default=500)
parser.add_argument('--test_batch_size', type=int, default=500)


# --- bookkeeping ---
parser.add_argument('--save_path', type=str, default='test dataset type 2 split', help='details regarding the code')
parser.add_argument('--SESSION', type=str, default='SA_LRANK', help='MODEL NAME')
parser.add_argument('--job_id', type=str, default='14567', help='file job id')

# --- model hyper-parameters ---
parser.add_argument('--heads', type=int, default=4, help='Heads for region Atn')

parser.add_argument('--cosinelr_scheduler', action='store_true',default=False, help='Run with lr scheduler')
parser.add_argument('--summary', type=str, default='Summary', help='Summary of Expt')
parser.add_argument('--src', type=str,default="../../../../data")

parser.add_argument('--nseen_class', type=int, default=925,help='number of seen classes')
parser.add_argument('--nclass_all', type=int, default=1006,help='number of all classes')

parser.add_argument('--channel_dim', type=int, default=256,help='conv channel dim')

# parse_known_args (instead of parse_args) tolerates arguments that are not
# defined above, e.g. those the notebook kernel is started with
opt, unknown = parser.parse_known_args()


In [None]:
# Directory containing the BiAM checkpoint and the auxiliary data files loaded below
# (machine-specific absolute path — adjust to the local setup)
biam_model_path = '/home/martin/python/fhnw_lecture/data'
# File name, relative to biam_model_path, of the pickled GloVe label embeddings
glove_vectors = 'OpenImage_w2v_context_window_10_glove-wiki-gigaword-300.pkl'

In [None]:
import os  # needed here: os.path.join is used below, before any later cell imports os
import pickle

from sklearn.preprocessing import normalize

# Build the BiAM network and load the trained weights onto the CPU
biam = BiAM(opt, dim_feature=[196,512])
biam.load_state_dict(torch.load(os.path.join(biam_model_path, 'model_BiAM.pth'),map_location=torch.device('cpu')))

# SECURITY NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# The context manager makes sure the file handle is closed again.
with open(os.path.join(biam_model_path, glove_vectors), 'rb') as embedding_file:
    src_att = pickle.load(embedding_file)
# the first entry of the pickle holds the label embeddings; normalise them and wrap them as a tensor
vecs_7186 = torch.from_numpy(normalize(src_att[0]))

In [None]:
import pandas as pd
import os
# Load the OpenImages class descriptions; the CSV has no header row,
# so the two columns are named explicitly afterwards
all_classes = pd.read_csv(os.path.join(biam_model_path, 'OpenImages/class-descriptions.csv'), header=None)
all_classes.columns = ['id', 'object']

In [None]:
# Load the ids listed in classes-trainable.txt (no header row) into a single 'id' column
trainable = pd.read_csv(os.path.join(biam_model_path, 'OpenImages/classes-trainable.txt'), header=None)
trainable.columns = ['id']

In [None]:
# Keep only the class descriptions whose id also appears in the trainable list
# NOTE(review): later cells index this with the BiAM output order, which assumes the
# row order after the merge matches the model's class order — TODO confirm
class_labels = all_classes.merge(trainable, on='id', how='inner')

In [None]:
# Reduce the table to a flat 1-D array holding only the 'object' column values
class_labels = class_labels[['object']].values.reshape(-1)

In [None]:
import json
import ast

# The label file contains a Python dict literal; ast.literal_eval parses it
# without executing code. The path is built from biam_model_path like the
# other data files instead of repeating the absolute directory.
with open(os.path.join(biam_model_path, 'imagenet1000_clsidx_to_labels.txt'), 'r') as label_file:
    imagenet_cls = ast.literal_eval(label_file.read())

# keep only the dict values as an array, in the dict's iteration order
imagenet_cls = np.asarray(list(imagenet_cls.values()))

In [None]:
# Instantiate the three VGG19-derived helper networks defined above
contextual = vgg_net()
backbone = Net()
image_net_clf = Vgg()
# Put every model into evaluation mode for inference
biam.eval()
contextual.eval()
backbone.eval()
image_net_clf.eval()


In [None]:
import torchvision.transforms as transforms
from io import BytesIO
import requests
# NOTE: this rebinds the name `Image` (IPython.display.Image in the first cell)
# to the PIL.Image module for the rest of the notebook
from PIL import Image
# Preprocessing pipeline: resize to 224 x 224, convert to a tensor and
# normalise each channel with the given mean / std values
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # bilinear interpolation
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

In [None]:
from bs4 import BeautifulSoup

import re
#'https://unsplash.com/s/photos/random-objects'
# Download the search page and collect the `src` URL of every <img> tag
response = requests.get('https://unsplash.com/s/photos/random-objects')
# name the parser explicitly so the result does not depend on which parsers are installed
soup = BeautifulSoup(response.text, 'html.parser')
# <img> tags without a (non-empty) src attribute are skipped instead of adding None
images = [tag['src'] for tag in soup.find_all('img') if tag.get('src')]

print(images)


In [None]:
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
%matplotlib inline
#for im in range(0, 10):
# Classify every distinct scraped image with BiAM (OpenImages labels) and with
# the plain VGG19 head (ImageNet labels), then show the image with the top predictions.
for im in set(images):
    # Download and decode; skip URLs that cannot be fetched or are not images.
    # (Previously a failure fell through and re-used the image of the prior iteration.)
    try:
        response = requests.get(im)
        img = Image.open(BytesIO(response.content))
    except (requests.RequestException, OSError):
        continue

    # the preprocessing pipeline expects 3-channel input, so only RGB images are processed
    if img.mode != 'RGB':
        continue

    with torch.no_grad():
        regional_features = backbone(transform(img))
        vgg_4096 = contextual(regional_features)
        biam_logits = biam(regional_features, vecs_7186, vgg_4096)
        img_net_logits = image_net_clf(vgg_4096)
        biam_probs = F.softmax(biam_logits, dim=1)
        img_net_probs = F.softmax(img_net_logits, dim=1)

    probabilities = biam_probs.cpu().numpy().reshape(-1)
    img_net_probabilities = img_net_probs.cpu().numpy().reshape(-1)
    # class indices sorted by descending probability
    order = np.flip(np.argsort(probabilities))
    order_imgnet = np.flip(np.argsort(img_net_probabilities))

    plt.figure()
    plt.imshow(img)
    # top 10 BiAM labels and top 5 ImageNet labels with their probabilities
    print(list(zip(class_labels[order], probabilities[order]))[0:10])
    print(list(zip(imagenet_cls[order_imgnet], img_net_probabilities[order_imgnet]))[0:5])
    plt.show()
                                           


In [None]:
# Re-run the pipeline for the last image URL of the loop above
response = requests.get(im)
img = Image.open(BytesIO(response.content))
# only RGB images are preprocessed; otherwise there is no tensor to classify
i = transform(img) if img.mode == 'RGB' else None
# check the preprocessed tensor `i` (the PIL image `img` is never a tensor,
# so the branch could not run before)
if i is not None:
    with torch.no_grad():

        regional_features = backbone(i)
        vgg_4096 = contextual(regional_features)
        biam_logits = biam(regional_features, vecs_7186, vgg_4096)
        img_net_logits = image_net_clf(vgg_4096)
        biam_probs = F.softmax(biam_logits, dim=1)
        img_net_probs = F.softmax(img_net_logits, dim=1)

    probabilities = biam_probs.cpu().numpy().reshape(-1)
    order = np.flip(np.argsort(probabilities))
    # `Image` is the PIL module at this point and not callable; the PIL image can be displayed directly
    display(img)