In [1]:
import torch
import json
import numpy as np

from PIL import Image
from utility import ViltImageSetProcessor
from transformers import ViltImageProcessor, BertTokenizer, ViltModel, ViltConfig, ViltForQuestionAnswering
from models import MultiviewViltModel, MultiviewViltForQuestionAnsweringBaseline
from torch import nn
from copy import deepcopy
from isvqa_data_setup import ISVQA
from torch.utils.data import DataLoader
from engine import max_to_one_hot
from collections import Counter
from torch.optim.lr_scheduler import StepLR

In [2]:
import json
import random

from copy import deepcopy

# Load the nuScenes `sample_data.json` table from disk into memory.
# NOTE(review): binding the result to `set` shadows Python's built-in `set` type for
# the rest of the kernel session; rename once any downstream uses have been checked.
with open("/home/nikostheodoridis/nuscenes/v1.0-trainval/sample_data.json") as f:
    set = json.load(f)

# NOTE(review): placeholder debug output; it only signals that the load finished.
print("djsdjsjsdksj")

In [3]:
import torch
from prettytable import PrettyTable

def count_model_parameters(model):
    """Print a per-parameter size table for ``model`` and return the trainable total.

    Only parameters whose ``requires_grad`` flag is set are listed and counted.

    Args:
        model: Object exposing ``named_parameters()``, yielding ``(name, parameter)``
            pairs.

    Returns:
        The summed ``numel()`` of every trainable parameter.
    """
    summary = PrettyTable(["Modules", "Parameters"])

    # Collect (name, element count) for each trainable parameter, in model order.
    trainable_sizes = [
        (param_name, tensor.numel())
        for param_name, tensor in model.named_parameters()
        if tensor.requires_grad
    ]

    for param_name, size in trainable_sizes:
        summary.add_row([param_name, size])
    n_trainable = sum(size for _, size in trainable_sizes)

    print(summary)
    print(f"Total Trainable Params: {n_trainable}")
    return n_trainable

In [4]:
# Build the baseline multiview ViLT question-answering model and move it to the GPU.
# NOTE(review): the positional arguments are opaque from this notebook. 429 matches the
# classifier's out_features in the `model.classifier` output, and 768 its in_features;
# confirm the meaning of 6, 210 and the three booleans against `models`.
model = MultiviewViltForQuestionAnsweringBaseline(6, 210, 768, True, True, True, 429).to("cuda")



In [5]:
# Print the trainable-parameter table; as the last expression of the cell, the
# returned total is also echoed as the cell's output value.
count_model_parameters(model)

+--------------------------------------------------------------------+------------+
|                              Modules                               | Parameters |
+--------------------------------------------------------------------+------------+
|          encoder.layer.0.attention.attention.query.weight          |   589824   |
|           encoder.layer.0.attention.attention.query.bias           |    768     |
|           encoder.layer.0.attention.attention.key.weight           |   589824   |
|            encoder.layer.0.attention.attention.key.bias            |    768     |
|          encoder.layer.0.attention.attention.value.weight          |   589824   |
|           encoder.layer.0.attention.attention.value.bias           |    768     |
|           encoder.layer.0.attention.output.dense.weight            |   589824   |
|            encoder.layer.0.attention.output.dense.bias             |    768     |
|             encoder.layer.0.intermediate.dense.weight              |  2359

114406317

In [4]:
from transformers.models.vilt.modeling_vilt import ViltEmbeddings

In [3]:
# Environment check: show the installed langchain version.
# NOTE(review): langchain is not referenced by any other visible cell; this looks
# like leftover scratch work.
import langchain
langchain.__version__

'0.2.11'

In [6]:
# Inspect the classification head (the module stored as `model.classifier`).
model.classifier

Sequential(
  (0): Linear(in_features=768, out_features=1536, bias=True)
  (1): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  (2): GELU(approximate='none')
  (3): Linear(in_features=1536, out_features=429, bias=True)
)

In [7]:
# Build the ISVQA training dataset from the train question file, the nuScenes samples
# directory and the answers file.
# NOTE(review): absolute, user-specific paths; the positional order presumably matches
# the qa_path / nuscenes_path / answers_path keywords used for the validation set.
isvqa = ISVQA("/home/nikostheodoridis/isvqa/train_set.json",
              "/home/nikostheodoridis/nuscenes/samples",
              "/home/nikostheodoridis/isvqa/answers.json")

In [8]:
# Wrap the training set in a loader with batch size 4 and no shuffling, then pull the
# first batch for a manual forward-pass check.
loader = DataLoader(isvqa, 4, shuffle=False)
batch = next(iter(loader))

In [9]:
# List the input fields in the first element of the batch (the mapping that gets
# unpacked into the model call).
batch[0].keys()

odict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask'])

In [10]:
# Forward pass on one batch, unpacking the input mapping as keyword arguments.
# NOTE(review): the model was moved to "cuda" but no visible cell moves the batch
# there; presumably the dataset yields GPU tensors. Verify.
out = model(**batch[0])

In [12]:
# Check the output shape; the recorded result is (4, 429), i.e. the batch size of 4
# by the classifier's 429 outputs.
out.shape

torch.Size([4, 429])

In [6]:
# Random stand-in tensors for experimenting with question-over-image attention.
# NOTE(review): presumably (batch=4, 1 question token, dim=768) and
# (batch=4, 6 views, dim=768). Confirm the intended axes.
questions = torch.randn(4, 1, 768)
images = torch.randn(4, 6, 768)

# Stand-in attention scores: one score per image for each question.
attn_scores = torch.randn(4, 1, 6)

In [None]:
# NOTE(review): unfinished cell (never executed). `torch.randn` is called without a
# size, unlike every other call in this notebook; supply the intended shape or
# delete the cell.
weight = torch.randn()


In [2]:
# Profile model construction plus the transfer to the GPU (nothing else is inside the
# profiled region), then print the per-op averages sorted by total CUDA time.
# NOTE(review): this rebinds `model` to a 6-argument variant (no trailing 429), so
# later cells see this model rather than the earlier 7-argument one.
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    model = MultiviewViltForQuestionAnsweringBaseline(6, 210, 768, True, True, False).to("cuda")
print(prof.key_averages().table(sort_by="cuda_time_total"))



-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::copy_        46.67%      69.913ms        71.95%     107.799ms     128.638us     110.855ms        68.45%     110.855ms     132.285us           838  
                                   aten::to         0.57%     857.000us        29.40%      44.043ms     205.808us       1.066ms         0.66%      44.273ms     206.883us           214  
                             aten::_to_copy         1.20%       1.795

In [None]:
# Rebind `loader` to a batch-size-1, unshuffled loader over the same training set.
loader = DataLoader(isvqa, 1, shuffle=False)

In [None]:
# Fetch the first batch from the current `loader`.
batch = next(iter(loader))

In [None]:
# Forward pass on the single-example batch; the result overwrites the earlier `out`.
out = model(**batch[0])

In [None]:
# NOTE(review): the recorded output is an AttributeError saying `out` was a
# `torch.Size` when this ran, so the cell depended on stale kernel state. Elsewhere
# `out` is used via `.shape`; confirm what the model returns and fix or drop this cell.
out.last_hidden_state.shape

AttributeError: 'torch.Size' object has no attribute 'last_hidden_state'

In [None]:
# Toy check of attention from one question vector over six image vectors.
# Both tensors are 2-D here: (1, 768) for the question and (6, 768) for the images.
questions = torch.randn(1, 768)
images = torch.randn(6, 768)

# 768-dim embeddings split across 12 attention heads.
img_attn = nn.MultiheadAttention(768, 12)

# Query = question, key = value = images; keep only the returned attention weights.
_, attn_scores = img_attn(questions, images, images)

# Weight the question assigns to the third image (index 2).
attn_scores[0, 2]

tensor(0.1913, grad_fn=<SelectBackward0>)

In [None]:
def val_step(model, loader, acc_fn, answ_len):
    """
    Run one full pass over a validation loader and report the mean loss and accuracy.

    The model is switched to eval mode and every mini-batch is evaluated under
    ``torch.inference_mode()``. The model's mode is not restored afterwards.

    Args:
        model: Callable invoked as ``model(**X, labels=y)``; its result must expose
            ``.loss`` (supporting ``.item()``) and ``.logits``.
        loader: Iterable yielding ``(X, y)`` pairs, where ``X`` is a mapping of keyword
            inputs. It does not need to implement ``__len__``.
        acc_fn: Callable ``acc_fn(pred, y, answ_len)`` returning the batch accuracy.
        answ_len: Forwarded unchanged to ``acc_fn``.

    Returns:
        Tuple ``(avg_loss, avg_acc)``: the unweighted means of the per-batch loss and
        accuracy (a smaller final batch counts as much as a full one).

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    print("\tValidating...")
    model.eval()
    losses = []  # per-batch loss values, averaged at the end
    accuracies = []  # per-batch accuracies, averaged at the end

    with torch.inference_mode():
        for X, y in loader:
            outputs = model(**X, labels=y)
            pred = max_to_one_hot(outputs.logits)

            losses.append(outputs.loss.item())
            accuracies.append(acc_fn(pred, y, answ_len))

    # Divide by the number of batches actually processed instead of len(loader), so
    # loaders without __len__ work and an empty loader fails with a clear message.
    n_batches = len(losses)
    if n_batches == 0:
        raise ValueError("val_step received a loader that yielded no batches")

    avg_loss = sum(losses) / n_batches
    avg_acc = sum(accuracies) / n_batches

    return avg_loss, avg_acc

# Copy

In [None]:
# Build the (non-baseline) multiview ViLT QA model on the GPU; this rebinds `model`.
# NOTE(review): `MultiviewViltForQuestionAnswering` is not imported in any visible
# cell (the first cell imports only `MultiviewViltModel` and
# `MultiviewViltForQuestionAnsweringBaseline` from `models`), so a fresh kernel would
# raise NameError here unless it is imported elsewhere. Add the import once its
# module is confirmed.
model = MultiviewViltForQuestionAnswering(6, 210, 768, True, False, False).to("cuda")



In [None]:
# Replace the wrapped model's classification head with a 768 -> 1536 -> 429 MLP
# (Linear, LayerNorm, GELU, Linear) and place it on the GPU. This is the same layer
# layout as the `model.classifier` printout earlier in the notebook.
model.model.classifier = nn.Sequential(
        nn.Linear(768, 1536),
        nn.LayerNorm(1536),
        nn.GELU(),
        nn.Linear(1536, 429)
    ).to("cuda")

In [None]:
# Independent copy of the freshly built model; the checkpoint is loaded into this copy
# so `model` keeps its current weights for comparison.
trained_model = deepcopy(model)

In [None]:
# Load the saved checkpoint weights into the copy.
# NOTE(security): `torch.load` unpickles the file, so only load checkpoints from a
# trusted source (consider `weights_only=True` if the installed torch supports it).
trained_model.load_state_dict(torch.load("/home/nikostheodoridis/Trained Models/2024-07-08 00:07:49/model.pth"))

<All keys matched successfully>

In [None]:
# for p1, p2 in zip(model.parameters(), trained_model.parameters()):
#     assert torch.equal(p1, p2)

In [None]:
# Validation split of ISVQA: same nuScenes samples directory and answers file as the
# training set, but with the validation question file.
val_set = ISVQA(qa_path="/home/nikostheodoridis/isvqa/val_set.json",
                nuscenes_path="/home/nikostheodoridis/nuscenes/samples",
                answers_path="/home/nikostheodoridis/isvqa/answers.json")

In [None]:
# Validation loader: batches of 6 in dataset order (no shuffling).
val_loader = DataLoader(val_set, batch_size=6, shuffle=False)

In [None]:
# targets = []
# untrained_predictions = []
# trained_predictions = []

# model.eval()
# trained_model.eval()
# for i in range(2576):
#     inputs, target = val_set[i]

#     targets.append(target)

#     with torch.inference_mode():
        


In [None]:
def accuracy(predictions: torch.Tensor, targets: torch.Tensor, answers_len: int) -> float:
    """Return the fraction of rows where the prediction matches the target exactly.

    A row counts as correct only when the number of element-wise matches along
    dimension 1 equals ``answers_len``.

    Args:
        predictions: Predicted rows, compared element-wise with ``targets``.
        targets: Ground-truth rows.
        answers_len: Number of matching entries a row needs to count as correct.

    Returns:
        Number of fully matching rows divided by ``len(predictions)``.
    """
    matches_per_row = (predictions == targets).sum(dim=1)
    n_exact = (matches_per_row == answers_len).sum().item()
    return n_exact / len(predictions)

In [None]:
# Evaluate `model` (the copy that did not receive the checkpoint) on the validation
# set. 429 is passed as `answ_len`, matching the classifier's output width.
untrained_loss, untrained_acc = val_step(model, val_loader, accuracy, 429)

	Validating...


In [None]:
# Same evaluation for the checkpoint-loaded copy, for a side-by-side comparison.
trained_loss, trained_acc = val_step(trained_model, val_loader, accuracy, 429)

	Validating...


In [None]:
# Mean validation loss and accuracy of the model without the checkpoint.
print(untrained_loss)
print(untrained_acc)

303.8099614342978
0.0011627906976744186


In [None]:
# Mean validation loss and accuracy of the checkpoint-loaded model.
print(trained_loss)
print(trained_acc)

2.5099486532945967
0.6003875968992246


In [None]:
# Load the answer-frequency JSON (the output of the next cell shows answer strings
# mapped to counts).
with open("/home/nikostheodoridis/isvqa/answers_counter.json") as f:
    answers_cnt = json.load(f)

In [None]:
# Wrap the loaded mapping in a Counter to display the answer frequencies.
# NOTE(review): this dumps the whole mapping into the cell output; consider
# `Counter(answers_cnt).most_common(20)` for a shorter view.
Counter(answers_cnt)

Counter({'yes': 13564,
         'no': 3734,
         'one': 3663,
         'white': 2893,
         'two': 2544,
         'red': 1205,
         'black': 1046,
         'blue': 1025,
         'three': 986,
         'green': 968,
         'yellow': 930,
         'orange': 782,
         'four': 529,
         'night': 464,
         'rainy': 434,
         'gray': 365,
         'black and white': 345,
         'silver': 287,
         'zero': 254,
         'five': 218,
         'orange and white': 178,
         'six': 156,
         'left': 151,
         'none': 147,
         'ahead': 147,
         'right': 141,
         'fedex': 136,
         'brown': 134,
         'cloudy': 129,
         'slow': 119,
         'ups': 114,
         'bus': 112,
         'raining': 111,
         'wet': 111,
         'sunny': 107,
         'ryder': 95,
         'urban': 93,
         'twenty-three': 92,
         'stop': 89,
         'day': 77,
         'hump': 69,
         'brick': 69,
         'red and white': 57,

In [None]:
# Start
import json
import random
# Paths of the train / val / test split files under nuscenes-qa/.
train_path = "/home/nikostheodoridis/nuscenes-qa/train_set.json"

val_path = "/home/nikostheodoridis/nuscenes-qa/val_set.json"

test_path = "/home/nikostheodoridis/nuscenes-qa/test_set.json"
with open(train_path) as f:
    train_data = json.load(f)

# NOTE(review): val_data is loaded but not used by the check below.
with open(val_path) as f:
    val_data = json.load(f)

with open(test_path) as f:
    test_data = json.load(f)

# Leakage check: prints "False" as soon as a training entry also appears in the test
# split; the loop's `else` branch prints "True" only if no such entry was found.
# NOTE(review): `data in test_data` is a linear scan if test_data is a list, making
# the check quadratic; the loop variable `data` also stays bound after the cell runs.
for data in train_data:
    if data in test_data:
        print("False")
        break
else:
    print("True")

True
