In [26]:
import torch
import json
import numpy as np

from PIL import Image
from utility import ViltImageSetProcessor
from transformers import BertForSequenceClassification
from torch import nn
from copy import deepcopy
from isvqa_data_setup import ISVQA
from torch.utils.data import DataLoader
from engine import max_to_one_hot
from collections import Counter
from torch.optim.lr_scheduler import StepLR
from modified_transformers import ViltForQuestionAnswering as Baseline

In [19]:
import torch
from prettytable import PrettyTable

def count_model_parameters(model):
    """Print a per-parameter table of trainable sizes and return their sum.

    Iterates ``model.named_parameters()``, ignores every parameter whose
    ``requires_grad`` is false, and reports ``numel()`` for the rest.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        The total number of elements across all trainable parameters.
    """
    summary = PrettyTable(["Modules", "Parameters"])

    # Gather (name, element count) for trainable parameters only.
    trainable_sizes = [
        (label, tensor.numel())
        for label, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

    for label, size in trainable_sizes:
        summary.add_row([label, size])
    grand_total = sum(size for _, size in trainable_sizes)

    print(summary)
    print(f"Total Trainable Params: {grand_total}")
    return grand_total

In [20]:
import json
import random

from copy import deepcopy

# Read sample_data.json from the nuScenes v1.0-trainval directory and keep the
# parsed JSON in memory.
# NOTE(review): binding the result to `set` shadows the built-in `set` type for
# every cell executed after this one in the same kernel; consider renaming once
# it is confirmed that no other cell reads this variable under that name.
with open("/home/nikostheodoridis/nuscenes/v1.0-trainval/sample_data.json") as f:
    set = json.load(f)

In [38]:
t = torch.tensor([[[[0.5, 0.5, 2, 2, 1, 1, 0, 0]]]]).permute(3, 0, 1, 2)

In [141]:
noise_factor = torch.randn(8, 6)
noise_factor

tensor([[-0.4044,  0.3408,  0.1087, -1.0835,  0.0249,  0.5760],
        [-0.6841,  1.4932,  0.7315,  0.3696, -0.3710, -0.3811],
        [-0.9306, -0.7599,  0.0304,  0.9442,  0.8090,  0.1756],
        [ 0.9852,  0.1327,  0.8538,  1.1149,  1.0211, -0.2032],
        [-0.6104, -0.2446, -0.6902,  0.4723,  0.7550,  1.8322],
        [ 0.9440, -0.3154, -0.9049, -1.0326,  0.5432,  0.9103],
        [ 1.2303,  1.5866,  0.1275, -1.5437, -0.0708, -0.0039],
        [-0.1501, -1.1086, -0.3724,  0.1634, -0.4599,  0.3803]])

In [142]:
torch.randn_like(noise_factor)

tensor([[ 0.2725,  1.4158,  0.2247,  1.0662,  0.7006, -2.4877],
        [-0.4855, -1.8564,  2.9844, -0.5942, -1.0811,  1.2101],
        [-0.0240,  0.9791,  1.2967, -2.5906, -0.4644, -0.7949],
        [-0.0980, -1.4887,  0.7977,  0.0402, -1.1820, -0.6645],
        [-2.1476, -3.0469, -0.4683,  1.4268,  0.9250,  0.7324],
        [ 0.8740, -0.7800,  1.0434, -0.0034,  0.2543,  0.9389],
        [ 0.1522,  0.9004,  1.9317, -0.0290, -1.6792, -3.3612],
        [-0.8901, -1.3162, -0.5851, -0.9978, -0.3627,  0.5429]])

In [59]:
attn_scores = torch.rand(8, 1, 6)

In [60]:
attn_scores

tensor([[[0.2221, 0.2914, 0.7832, 0.6062, 0.4547, 0.7412]],

        [[0.9836, 0.2860, 0.2046, 0.3320, 0.6811, 0.2645]],

        [[0.2581, 0.9593, 0.0395, 0.4757, 0.2544, 0.8954]],

        [[0.0906, 0.6056, 0.9974, 0.6916, 0.0303, 0.3662]],

        [[0.7107, 0.4780, 0.1377, 0.2741, 0.7966, 0.7660]],

        [[0.3671, 0.2790, 0.9802, 0.1566, 0.4528, 0.3579]],

        [[0.6882, 0.7649, 0.7570, 0.6947, 0.8298, 0.1939]],

        [[0.9151, 0.1664, 0.5766, 0.6750, 0.2826, 0.6396]]])

In [61]:
maxs = attn_scores.max(dim=2)[0].unsqueeze(2)
maxs

tensor([[[0.7832]],

        [[0.9836]],

        [[0.9593]],

        [[0.9974]],

        [[0.7966]],

        [[0.9802]],

        [[0.8298]],

        [[0.9151]]])

In [48]:
attn_scores.max(dim=2)[0].shape

torch.Size([8, 1])

In [62]:
(attn_scores / maxs).shape

torch.Size([8, 1, 6])

In [86]:
# Normalise each row of attention scores by its maximum so the largest score
# becomes 1. Only axis 1 (the singleton axis of the (8, 1, 6) scores) is
# dropped: a bare .squeeze() would also remove the batch axis when the batch
# size happens to be 1.
weights = (attn_scores / maxs).squeeze(1)

In [89]:
x = torch.randn(2, 3)
y = torch.randn(2, 3)

In [90]:
x

tensor([[ 1.5050,  0.5403,  1.1741],
        [ 1.2193, -1.1905, -1.8367]])

In [93]:
image = torch.randn(8, 6, 3, 352, 608)
noise = torch.randn(8, 6, 1, 1, 1)
noisy_image = image + noise

In [111]:
image[7, 5]

tensor([[[-2.0366,  0.3002,  0.2240,  ...,  1.6317, -0.3505, -0.9450],
         [-0.8359, -0.5737, -0.2112,  ...,  0.1756, -0.2384, -0.1210],
         [ 1.9564, -2.2391, -0.8146,  ...,  0.1312,  1.3213,  0.6846],
         ...,
         [ 1.0601,  0.1381,  0.3950,  ..., -0.8237,  0.1875, -1.5251],
         [-1.5678, -0.9031, -2.8113,  ...,  0.7849, -0.7146,  0.8592],
         [-1.4974,  0.1687, -0.8978,  ...,  1.0746, -0.1101, -0.3491]],

        [[-0.7154, -0.8828, -0.4064,  ...,  1.4242, -0.7499,  0.7787],
         [ 0.8162, -0.1590, -0.0790,  ..., -1.4163,  0.9879,  0.6088],
         [ 0.0808,  0.4854,  0.2819,  ..., -0.5682, -1.2184,  0.5419],
         ...,
         [ 0.5976, -1.0927, -1.9151,  ...,  1.0141,  0.7429,  1.4709],
         [ 0.5953,  1.1467,  2.6890,  ...,  1.0548, -0.0602,  0.3727],
         [-2.3645,  0.3572,  0.9312,  ...,  1.1767, -0.5813, -0.0706]],

        [[-0.0658,  0.8294, -2.2025,  ..., -0.1788, -2.3541, -0.9722],
         [ 1.6987,  1.0280, -0.7680,  ..., -1

In [110]:
noise[7, 5]

tensor([[[-0.1038]]])

In [112]:
noisy_image[7, 5]

tensor([[[-2.1404,  0.1964,  0.1202,  ...,  1.5279, -0.4543, -1.0488],
         [-0.9397, -0.6775, -0.3150,  ...,  0.0718, -0.3421, -0.2248],
         [ 1.8526, -2.3429, -0.9184,  ...,  0.0274,  1.2176,  0.5808],
         ...,
         [ 0.9563,  0.0343,  0.2912,  ..., -0.9274,  0.0837, -1.6288],
         [-1.6716, -1.0069, -2.9151,  ...,  0.6811, -0.8184,  0.7554],
         [-1.6012,  0.0649, -1.0015,  ...,  0.9708, -0.2139, -0.4528]],

        [[-0.8192, -0.9866, -0.5102,  ...,  1.3205, -0.8537,  0.6749],
         [ 0.7125, -0.2628, -0.1828,  ..., -1.5201,  0.8841,  0.5050],
         [-0.0229,  0.3816,  0.1781,  ..., -0.6719, -1.3221,  0.4381],
         ...,
         [ 0.4938, -1.1965, -2.0189,  ...,  0.9103,  0.6391,  1.3672],
         [ 0.4915,  1.0430,  2.5853,  ...,  0.9510, -0.1640,  0.2690],
         [-2.4682,  0.2534,  0.8274,  ...,  1.0730, -0.6851, -0.1744]],

        [[-0.1696,  0.7257, -2.3063,  ..., -0.2826, -2.4578, -1.0760],
         [ 1.5950,  0.9242, -0.8718,  ..., -1

In [91]:
y

tensor([[ 0.4297,  0.7265,  1.7333],
        [ 1.7115, -0.4212, -0.6956]])

In [92]:
x*y

tensor([[0.6467, 0.3925, 2.0350],
        [2.0867, 0.5014, 1.2775]])

In [88]:
1 - weights

tensor([[0.7164, 0.6280, 0.0000, 0.2260, 0.4195, 0.0537],
        [0.0000, 0.7092, 0.7920, 0.6624, 0.3075, 0.7311],
        [0.7310, 0.0000, 0.9588, 0.5041, 0.7348, 0.0667],
        [0.9092, 0.3929, 0.0000, 0.3066, 0.9697, 0.6328],
        [0.1079, 0.4000, 0.8271, 0.6559, 0.0000, 0.0384],
        [0.6255, 0.7154, 0.0000, 0.8403, 0.5381, 0.6349],
        [0.1707, 0.0783, 0.0878, 0.1628, 0.0000, 0.7663],
        [0.0000, 0.8181, 0.3700, 0.2624, 0.6912, 0.3011]])

In [74]:
weights.shape

torch.Size([8, 6, 1, 1, 1])

In [67]:
image = torch.randn(8, 6, 3, 352, 608)

In [76]:
weighted_image = weights * image

In [77]:
weighted_image.shape

torch.Size([8, 6, 3, 352, 608])

In [84]:
image[2, 5]

tensor([[[-0.2311,  0.2379,  0.9339,  ..., -1.0574,  0.9730,  0.1771],
         [ 0.5407,  0.4366,  0.5054,  ...,  0.6561,  1.0277,  1.1712],
         [ 0.2865, -0.8768,  1.0330,  ...,  0.8223, -0.2839,  0.5287],
         ...,
         [ 0.7281,  1.2110, -0.7459,  ..., -1.3635,  0.0546, -0.2609],
         [-0.8802,  1.7907,  0.7237,  ..., -0.3063,  0.2031, -1.7614],
         [-1.8903,  0.6733,  1.3758,  ..., -0.6519,  0.4261,  1.1572]],

        [[ 0.4723,  0.9483, -0.8223,  ...,  0.8321, -2.1512,  1.9406],
         [ 0.7125, -0.2474, -0.4329,  ...,  0.3571, -0.0951, -1.0156],
         [-0.7591, -1.4258,  0.8662,  ...,  0.5441,  0.4617,  0.1125],
         ...,
         [ 0.2302, -0.0024, -0.5610,  ...,  0.3369,  1.3106, -0.3004],
         [ 1.2285, -1.4839,  1.1372,  ...,  0.7506,  1.4058, -0.7462],
         [ 0.2507, -0.6956, -1.0991,  ...,  0.9928,  0.3242,  1.5292]],

        [[ 0.4740,  0.0262, -0.8766,  ..., -0.7047, -0.4071,  0.3702],
         [ 0.4548, -0.1914, -1.0548,  ...,  0

In [83]:
weights[2, 5]

tensor([[[0.9333]]])

In [85]:
weighted_image[2, 5]

tensor([[[-0.2157,  0.2220,  0.8717,  ..., -0.9869,  0.9081,  0.1653],
         [ 0.5047,  0.4075,  0.4717,  ...,  0.6124,  0.9592,  1.0931],
         [ 0.2674, -0.8183,  0.9641,  ...,  0.7675, -0.2650,  0.4934],
         ...,
         [ 0.6796,  1.1303, -0.6962,  ..., -1.2726,  0.0509, -0.2435],
         [-0.8215,  1.6713,  0.6755,  ..., -0.2859,  0.1896, -1.6439],
         [-1.7642,  0.6284,  1.2840,  ..., -0.6085,  0.3977,  1.0801]],

        [[ 0.4408,  0.8851, -0.7675,  ...,  0.7766, -2.0077,  1.8112],
         [ 0.6650, -0.2309, -0.4041,  ...,  0.3333, -0.0888, -0.9479],
         [-0.7085, -1.3307,  0.8085,  ...,  0.5079,  0.4309,  0.1050],
         ...,
         [ 0.2148, -0.0022, -0.5236,  ...,  0.3145,  1.2232, -0.2804],
         [ 1.1465, -1.3849,  1.0613,  ...,  0.7006,  1.3121, -0.6965],
         [ 0.2340, -0.6493, -1.0258,  ...,  0.9266,  0.3026,  1.4273]],

        [[ 0.4424,  0.0245, -0.8181,  ..., -0.6577, -0.3799,  0.3455],
         [ 0.4244, -0.1786, -0.9845,  ...,  0

In [41]:
x = torch.randn(8, 2, 2, 4)

In [27]:
model = Baseline.from_pretrained("dandelin/vilt-b32-finetuned-vqa")



In [28]:
print(model)

ViltForQuestionAnswering(
  (vilt): ViltModel(
    (embeddings): ViltEmbeddings(
      (text_embeddings): TextEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(40, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (patch_embeddings): ViltPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
      )
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViltEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViltLayer(
          (attention): ViltAttention(
            (attention): ViltSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=76

In [None]:
torch.cuda.is_available()

True

In [29]:
x = torch.randn(8, 6, 3, 352, 608)

In [30]:
type(x)

torch.Tensor

In [31]:
x.shape

torch.Size([8, 6, 3, 352, 608])

In [35]:
x.min()

tensor(-5.3835)

In [None]:
model.classifier

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

In [None]:
model.classifier

Linear(in_features=768, out_features=1, bias=True)

In [None]:
model = MultiviewViltForQuestionAnsweringBaseline(6, 210, 768, True, True, True, 429).to("cuda")



In [None]:
from transformers.models.vilt.modeling_vilt import ViltEmbeddings

In [None]:
import langchain
langchain.__version__

'0.2.11'

In [None]:
model.classifier

Sequential(
  (0): Linear(in_features=768, out_features=1536, bias=True)
  (1): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  (2): GELU(approximate='none')
  (3): Linear(in_features=1536, out_features=429, bias=True)
)

In [23]:
isvqa = ISVQA("/home/nikostheodoridis/isvqa/train_set.json",
              "/home/nikostheodoridis/nuscenes/samples",
              "/home/nikostheodoridis/isvqa/answers.json")

In [34]:
isvqa[0][0]["pixel_values"].min()

tensor(-1., device='cuda:0')

In [None]:
loader = DataLoader(isvqa, 4, shuffle=False)
batch = next(iter(loader))

In [None]:
batch[0].keys()

odict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask'])

In [None]:
out = model(**batch[0])

In [None]:
out.shape

torch.Size([4, 429])

In [None]:
questions = torch.randn(4, 1, 768)
images = torch.randn(4, 6, 768)

attn_scores = torch.randn(4, 1, 6)

In [None]:
# NOTE(review): torch.randn expects a size argument, so this call raises a
# TypeError as written. The intended shape is not evident from this cell —
# TODO supply it or delete the cell.
weight = torch.randn()


In [None]:
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    model = MultiviewViltForQuestionAnsweringBaseline(6, 210, 768, True, True, False).to("cuda")
print(prof.key_averages().table(sort_by="cuda_time_total"))



-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::copy_        46.67%      69.913ms        71.95%     107.799ms     128.638us     110.855ms        68.45%     110.855ms     132.285us           838  
                                   aten::to         0.57%     857.000us        29.40%      44.043ms     205.808us       1.066ms         0.66%      44.273ms     206.883us           214  
                             aten::_to_copy         1.20%       1.795

In [None]:
loader = DataLoader(isvqa, 1, shuffle=False)

In [None]:
batch = next(iter(loader))

In [None]:
out = model(**batch[0])

In [None]:
out.last_hidden_state.shape

AttributeError: 'torch.Size' object has no attribute 'last_hidden_state'

In [None]:
questions = torch.randn(1, 768)
images = torch.randn(6, 768)

img_attn = nn.MultiheadAttention(768, 12)

_, attn_scores = img_attn(questions, images, images)

attn_scores[0, 2]

tensor(0.1913, grad_fn=<SelectBackward0>)

In [None]:
def val_step(model, loader, acc_fn, answ_len):
    """
    Validate the model by going through all the mini-batches in the validation dataloader once.

    Args:
        model: called as ``model(**X, labels=y)``; the result must expose
            ``.loss`` (with ``.item()``) and ``.logits``. It is switched to
            eval mode before the loop.
        loader: iterable yielding ``(X, y)`` pairs, where ``X`` is a mapping
            of keyword inputs. It does not need to implement ``__len__``.
        acc_fn: called as ``acc_fn(pred, y, answ_len)`` and expected to return
            the accuracy of one mini-batch.
        answ_len: passed through unchanged to ``acc_fn``.

    Returns:
        ``(avg_loss, avg_acc)``: the unweighted means of the per-batch loss
        and per-batch accuracy.

    Raises:
        ZeroDivisionError: if ``loader`` yields no mini-batches.
    """
    print("\tValidating...")
    model.eval()
    losses = []  # loss of each mini-batch, averaged at the end
    accuracies = []  # accuracy of each mini-batch, averaged at the end

    with torch.inference_mode():
        for X, y in loader:
            outputs = model(**X, labels=y)
            pred = max_to_one_hot(outputs.logits)

            losses.append(outputs.loss.item())
            accuracies.append(acc_fn(pred, y, answ_len))

    # Divide by the number of batches actually processed rather than
    # len(loader), so loaders without __len__ are supported too.
    # NOTE: every batch carries equal weight, so a smaller final batch
    # influences the averages as much as a full one.
    num_batches = len(losses)
    avg_loss = sum(losses) / num_batches
    avg_acc = sum(accuracies) / num_batches

    return avg_loss, avg_acc

# Copy

In [None]:
model = MultiviewViltForQuestionAnswering(6, 210, 768, True, False, False).to("cuda")



In [None]:
model.model.classifier = nn.Sequential(
        nn.Linear(768, 1536),
        nn.LayerNorm(1536),
        nn.GELU(),
        nn.Linear(1536, 429)
    ).to("cuda")

In [None]:
trained_model = deepcopy(model)

In [None]:
# torch.load unpickles the file; weights_only=True limits what the unpickler
# may construct to tensors and plain containers, which is all a state dict
# needs, so a tampered checkpoint cannot execute arbitrary code.
trained_model.load_state_dict(
    torch.load(
        "/home/nikostheodoridis/Trained Models/2024-07-08 00:07:49/model.pth",
        weights_only=True,
    )
)

<All keys matched successfully>

In [None]:
# for p1, p2 in zip(model.parameters(), trained_model.parameters()):
#     assert torch.equal(p1, p2)

In [None]:
val_set = ISVQA(qa_path="/home/nikostheodoridis/isvqa/val_set.json",
                nuscenes_path="/home/nikostheodoridis/nuscenes/samples",
                answers_path="/home/nikostheodoridis/isvqa/answers.json")

In [None]:
val_loader = DataLoader(val_set, batch_size=6, shuffle=False)

In [None]:
# targets = []
# untrained_predictions = []
# trained_predictions = []

# model.eval()
# trained_model.eval()
# for i in range(2576):
#     inputs, target = val_set[i]

#     targets.append(target)

#     with torch.inference_mode():
        


In [None]:
def accuracy(predictions: torch.Tensor, targets: torch.Tensor, answers_len: int) -> float:
    """Return the fraction of rows whose element-wise match count equals ``answers_len``.

    Args:
        predictions: 2-D tensor compared element-wise against ``targets``.
        targets: tensor broadcastable against ``predictions``.
        answers_len: number of matching elements a row needs in order to
            count as correct.

    Returns:
        Number of qualifying rows divided by ``len(predictions)``.
    """
    # Per row, how many positions agree between prediction and target.
    matches_per_row = (predictions == targets).sum(dim=1)
    # A row counts only when exactly `answers_len` positions agree.
    correct_rows = (matches_per_row == answers_len).sum().item()
    return correct_rows / len(predictions)

In [None]:
untrained_loss, untrained_acc = val_step(model, val_loader, accuracy, 429)

	Validating...


In [None]:
trained_loss, trained_acc = val_step(trained_model, val_loader, accuracy, 429)

	Validating...


In [None]:
print(untrained_loss)
print(untrained_acc)

303.8099614342978
0.0011627906976744186


In [None]:
print(trained_loss)
print(trained_acc)

2.5099486532945967
0.6003875968992246


In [None]:
with open("/home/nikostheodoridis/isvqa/answers_counter.json") as f:
    answers_cnt = json.load(f)

In [None]:
Counter(answers_cnt)

Counter({'yes': 13564,
         'no': 3734,
         'one': 3663,
         'white': 2893,
         'two': 2544,
         'red': 1205,
         'black': 1046,
         'blue': 1025,
         'three': 986,
         'green': 968,
         'yellow': 930,
         'orange': 782,
         'four': 529,
         'night': 464,
         'rainy': 434,
         'gray': 365,
         'black and white': 345,
         'silver': 287,
         'zero': 254,
         'five': 218,
         'orange and white': 178,
         'six': 156,
         'left': 151,
         'none': 147,
         'ahead': 147,
         'right': 141,
         'fedex': 136,
         'brown': 134,
         'cloudy': 129,
         'slow': 119,
         'ups': 114,
         'bus': 112,
         'raining': 111,
         'wet': 111,
         'sunny': 107,
         'ryder': 95,
         'urban': 93,
         'twenty-three': 92,
         'stop': 89,
         'day': 77,
         'hump': 69,
         'brick': 69,
         'red and white': 57,

In [None]:
# Start
# Sanity check for leakage between splits: prints "True" when no train entry
# also appears in the test split, and "False" as soon as one is found.
import json
import random

train_path = "/home/nikostheodoridis/nuscenes-qa/train_set.json"
val_path = "/home/nikostheodoridis/nuscenes-qa/val_set.json"
test_path = "/home/nikostheodoridis/nuscenes-qa/test_set.json"

with open(train_path) as f:
    train_data = json.load(f)
with open(val_path) as f:
    val_data = json.load(f)
with open(test_path) as f:
    test_data = json.load(f)

# Stop scanning at the first train entry that is also present in the test split.
overlap_found = False
for data in train_data:
    if data in test_data:
        overlap_found = True
        break

print("False" if overlap_found else "True")

True
