In [1]:
# pip install torch

In [2]:
# pip install torchvision

In [3]:
# pip install transformers

In [4]:
# pip install nuscenes-devkit &> /dev/null 

In [5]:
import torch
import json

from torch import nn
from torchvision import transforms
from modified_vilt_v1 import MultiviewViltForQuestionAnswering
from PIL import Image
from isvqa_data_setup import ISVQA
from collections import Counter
from torch.utils.data import DataLoader
from nuscenesqa_data_setup import NuScenesQA
from torch.utils.data import random_split


In [6]:
# Seed reused by every seeded random operation in this notebook (e.g. the val/test split).
seed = 42

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
# Folder with the NuScenes-QA annotations and folder with the nuScenes data.
qa_folder = "/home/nikostheodoridis/nuscenes-qa"
nuscenes_folder = "/home/nikostheodoridis/nuscenes"

# Dataset built from the "train" question-answer set.
train_nuscenesqa = NuScenesQA(
    qa_set="train",
    qa_folder_path=qa_folder,
    nuscenes_folder_path=nuscenes_folder,
)

In [8]:
# Dataset built from the "val" question-answer set; it is split into
# validation and test halves in a later cell.
valtest_nuscenesqa = NuScenesQA(
    qa_set="val",
    qa_folder_path=qa_folder,
    nuscenes_folder_path=nuscenes_folder,
)

In [9]:
# Halve the "val" dataset: the first half (rounded down) becomes the validation
# set, the remainder becomes the test set.
val_size = len(valtest_nuscenesqa) // 2
test_size = len(valtest_nuscenesqa) - val_size

# A dedicated, seeded generator makes the random split reproducible.
generator = torch.Generator().manual_seed(seed)

val_nuscenesqa, test_nuscenesqa = random_split(
    valtest_nuscenesqa,
    lengths=[val_size, test_size],
    generator=generator,
)


In [10]:
# Mini-batch loaders (8 samples per batch) for the three splits.
# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_nuscenesqa, shuffle=True, batch_size=8)
val_loader = DataLoader(val_nuscenesqa, shuffle=False, batch_size=8)
test_loader = DataLoader(test_nuscenesqa, shuffle=False, batch_size=8)

In [11]:
batch = next(iter(train_loader))

In [12]:
batch[0]["pixel_values"].shape

torch.Size([8, 6, 3, 352, 608])

In [13]:
batch[1].shape

torch.Size([8, 30])

In [14]:
type(batch[0])

collections.OrderedDict

In [15]:
# Instantiate the multi-view ViLT question-answering model and move it to `device`.
# NOTE(review): the meaning of the positional arguments (6, 210, 768, True, True)
# is defined in modified_vilt_v1 and is not visible here — confirm before editing.
model = MultiviewViltForQuestionAnswering(6, 210, 768, True, True).to(device)

# Smoke test: one forward pass on the sampled training batch, passing the labels too.
# NOTE(review): the recorded run of this cell raised
#   ValueError: Target size (8, 30) must be the same as input size (8, 3129)
# i.e. the labels have 30 columns while the model produced 3129 outputs per sample.
# Presumably the model's output size must be configured to match the label width.
# NOTE(review): the batch is not moved to `device` before the call — confirm whether
# the model or dataset handles device placement when running on CUDA.
output = model(**batch[0], labels=batch[1])

ValueError: Target size (torch.Size([8, 30])) must be the same as input size (torch.Size([8, 3129]))

In [61]:
# Toy check of the row-wise exact-match count: the two tensors differ only in
# the first element of the first row, so exactly one row matches completely.
x = torch.tensor([[1, 2], [3, 4]])
y = torch.tensor([[0, 2], [3, 4]])

# A row matches when all 2 of its elements are equal.
torch.eq(x, y).sum(dim=1).eq(2).sum()

tensor(1)

In [62]:
def accuracy(predictions, targets):
    """Return the fraction of rows in which `predictions` equals `targets` exactly.

    Args:
        predictions: 2-D tensor with one row per sample.
        targets: tensor comparable element-wise with `predictions`
            (same shape, or broadcastable to it).

    Returns:
        A 0-dim float tensor: number of fully matching rows divided by
        `len(predictions)`.
    """
    # A row is correct only when every element agrees. Reducing with `all`
    # works for any row width, instead of assuming exactly 30 columns.
    exact_rows = torch.eq(predictions, targets).all(dim=1)
    return exact_rows.sum() / len(predictions)

In [None]:
def train_one_epoch(model, loader, optimizer, accuracy):
    """Train `model` for one pass over `loader` and report the mean loss and accuracy.

    Args:
        model: module called as `model(**X, labels=y)`; its output must expose
            `.loss` and `.logits`.
            NOTE(review): `.logits` is assumed (Hugging Face style output) —
            confirm against MultiviewViltForQuestionAnswering.
        loader: iterable yielding `(X, y)` pairs, where `X` is a mapping of
            keyword arguments for the model and `y` is the label tensor.
        optimizer: optimizer already bound to the model's parameters.
        accuracy: callable `accuracy(predictions, targets)` returning the
            fraction of correct samples in a batch.

    Returns:
        Tuple `(mean_loss, mean_accuracy)` as Python floats, weighted by batch
        size. Both are NaN when `loader` yields no batches.
    """
    model.train()
    total_loss = 0.0
    total_accuracy = 0.0
    n_samples = 0

    # Iterate the loader itself: a DataLoader is iterable, not callable.
    for X, y in loader:
        outputs = model(**X, labels=y)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            # Turn the logits into a one-hot prediction at the arg-max class so it
            # can be compared element-wise with the labels.
            # NOTE(review): assumes `y` is a one-hot (batch, classes) tensor with
            # the same shape as the logits — confirm against the dataset.
            top_class = outputs.logits.argmax(dim=1, keepdim=True)
            predictions = torch.zeros_like(y).scatter_(1, top_class, 1)
            batch_accuracy = float(accuracy(predictions, y))

        batch_size = len(y)
        total_loss += loss.item() * batch_size
        total_accuracy += batch_accuracy * batch_size
        n_samples += batch_size

    if n_samples == 0:
        return float("nan"), float("nan")
    return total_loss / n_samples, total_accuracy / n_samples


In [14]:
# Create the set with the question-answer pairs
# NOTE(review): presumably `qa_path` holds the ISVQA annotations and `nuscenes_path`
# the nuScenes camera images — inferred from the variable names; confirm against
# isvqa_data_setup.ISVQA.

qa_path = "/home/nikostheodoridis/isvqa"
nuscenes_path = "/home/nikostheodoridis/nuscenes/samples"

# device="cpu": the following cells print the device of the returned tensors as "cpu".
isvqa = ISVQA(qa_path, nuscenes_path, device="cpu")



In [15]:
len(isvqa)

43484

In [16]:
print(isvqa[0][0]["pixel_values"].device)
print(isvqa[0][1].device)

cpu
cpu


In [17]:
dataloader = DataLoader(isvqa, batch_size=8)

In [18]:
batch = next(iter(dataloader))

In [19]:
print(batch[0]["pixel_values"].device)
print(batch[1].device)

cpu
cpu


In [10]:
len(isvqa)

43484

In [9]:
model = MultiviewViltForQuestionAnswering(6, 210, 768, True, True)

In [11]:
out = model(**batch[0], labels=batch[1])

ValueError: Target size (torch.Size([8, 640])) must be the same as input size (torch.Size([8, 3129]))

In [22]:
batch[0]["input_ids"]

tensor([[ 101, 2024, 2045,  ...,    0,    0,    0],
        [ 101, 2129, 2116,  ...,    0,    0,    0],
        [ 101, 2079, 2017,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2003, 2045,  ...,    0,    0,    0]])

In [5]:
len(trainval_qa)

28826

In [17]:
len(test_qa)

14658

In [19]:
test_qa[0]

{'answers': ['one', 'one', 'one'],
 'ocr_tokens': [],
 'image_names': ['CAM_FRONT_LEFT/n015-2018-08-02-17-28-51+0800__CAM_FRONT_LEFT__1533202622254844',
  'CAM_FRONT/n015-2018-08-02-17-28-51+0800__CAM_FRONT__1533202622262460',
  'CAM_FRONT_RIGHT/n015-2018-08-02-17-28-51+0800__CAM_FRONT_RIGHT__1533202622270339',
  'CAM_BACK_LEFT/n015-2018-08-02-17-28-51+0800__CAM_BACK_LEFT__1533202622297423',
  'CAM_BACK/n015-2018-08-02-17-28-51+0800__CAM_BACK__1533202622287525',
  'CAM_BACK_RIGHT/n015-2018-08-02-17-28-51+0800__CAM_BACK_RIGHT__1533202622277893'],
 'question_str': 'how many of the images can you see a boy holding a sign in',
 'question_tokens': ['how',
  'many',
  'of',
  'the',
  'images',
  'can',
  'you',
  'see',
  'a',
  'boy',
  'holding',
  'a',
  'sign',
  'in'],
 'feature_paths': ['CAM_FRONT_LEFT/n015-2018-08-02-17-28-51+0800__CAM_FRONT_LEFT__1533202622254844.npy',
  'CAM_FRONT/n015-2018-08-02-17-28-51+0800__CAM_FRONT__1533202622262460.npy',
  'CAM_FRONT_RIGHT/n015-2018-08-02-17

In [5]:
# Count how many images referenced by the QA entries are missing on disk.
cnt = 0
for data in isvqa:
    for image in data["image_names"]:
        try:
            # Open inside a context manager so the file handle is released
            # immediately; previously every opened image stayed open.
            with Image.open(f"/home/nikostheodoridis/nuscenes/samples/{image}.jpg"):
                pass
        except FileNotFoundError:
            cnt += 1

cnt

0

In [6]:
# Build the answer vocabulary: for each entry keep the answer given most
# often by the annotators, then count the distinct answers.
answers = set()
for data in isvqa:
    counter = Counter(data["answers"])
    answers |= {max(counter, key=lambda answer: counter[answer])}

len(answers)

640

In [8]:
from collections import Counter

my_list = [1, 2, 2, 3, 3, 3, 4, 5]

counter = Counter(my_list)
most_common_element = counter.most_common(1)[0][0]
print(f"The element that appears most frequently is {most_common_element}")

The element that appears most frequently is 3


In [9]:
counter

Counter({3: 3, 2: 2, 1: 1, 4: 1, 5: 1})

In [6]:
trainval_qa[0]

{'answers': ['yes', 'no', 'yes', 'yes'],
 'ocr_tokens': [],
 'image_names': ['CAM_FRONT_LEFT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_LEFT__1535659467004799',
  'CAM_FRONT/n008-2018-08-30-15-52-26-0400__CAM_FRONT__1535659467012404',
  'CAM_FRONT_RIGHT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_RIGHT__1535659467020486',
  'CAM_BACK_LEFT/n008-2018-08-30-15-52-26-0400__CAM_BACK_LEFT__1535659467047405',
  'CAM_BACK/n008-2018-08-30-15-52-26-0400__CAM_BACK__1535659467037558',
  'CAM_BACK_RIGHT/n008-2018-08-30-15-52-26-0400__CAM_BACK_RIGHT__1535659467028113'],
 'question_str': 'are there parking meters on the side of the street',
 'question_tokens': ['are',
  'there',
  'parking',
  'meters',
  'on',
  'the',
  'side',
  'of',
  'the',
  'street'],
 'feature_paths': ['CAM_FRONT_LEFT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_LEFT__1535659467004799.npy',
  'CAM_FRONT/n008-2018-08-30-15-52-26-0400__CAM_FRONT__1535659467012404.npy',
  'CAM_FRONT_RIGHT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_RIGHT__

In [None]:
def get_questions_from_sample(qa_set, sample):
    """Collect the questions in `qa_set` that refer to `sample`.

    Args:
        qa_set: mapping whose "questions" entry is an iterable of dicts with
            "sample_token" and "question" keys.
        sample: mapping with a "token" key identifying the sample.

    Returns:
        List of the "question" values whose "sample_token" equals
        `sample["token"]`, in their original order.
    """
    return [
        entry["question"]
        for entry in qa_set["questions"]
        if entry["sample_token"] == sample["token"]
    ]

In [95]:
len(trainval_qa)

33973

In [96]:
len(test_qa)

15644

In [98]:
trainval_qa = clean_data(trainval_qa)

In [99]:
test_qa = clean_data(test_qa)

In [100]:
len(trainval_qa)

28826

In [101]:
len(test_qa)

14658

In [57]:
# Drop every QA entry whose most frequent answer is the "<unk>" placeholder
# and count how many entries were dropped.
cnt = 0
kept = []

for data in trainval_qa["data"]:
    # Counter keeps answers in order of first appearance, so ties for the most
    # frequent answer are resolved deterministically (iterating a set was not).
    counts = Counter(data["answers"])

    if max(counts, key=counts.get) == "<unk>":
        cnt += 1
    else:
        kept.append(data)

# Slice assignment updates the existing list object in place, so other references
# to trainval_qa["data"] see the filtered result. Filtering in a single pass
# replaces the deep copy of the dataset and the quadratic list.remove calls.
trainval_qa["data"][:] = kept

cnt

0

In [58]:
len(trainval_qa["data"])

28826

In [24]:
trainval_qa["data"][0]

{'answers': ['yes', 'no', 'yes', 'yes'],
 'ocr_tokens': [],
 'image_names': ['CAM_FRONT_LEFT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_LEFT__1535659467004799',
  'CAM_FRONT/n008-2018-08-30-15-52-26-0400__CAM_FRONT__1535659467012404',
  'CAM_FRONT_RIGHT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_RIGHT__1535659467020486',
  'CAM_BACK_LEFT/n008-2018-08-30-15-52-26-0400__CAM_BACK_LEFT__1535659467047405',
  'CAM_BACK/n008-2018-08-30-15-52-26-0400__CAM_BACK__1535659467037558',
  'CAM_BACK_RIGHT/n008-2018-08-30-15-52-26-0400__CAM_BACK_RIGHT__1535659467028113'],
 'question_str': 'are there parking meters on the side of the street',
 'question_tokens': ['are',
  'there',
  'parking',
  'meters',
  'on',
  'the',
  'side',
  'of',
  'the',
  'street'],
 'feature_paths': ['CAM_FRONT_LEFT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_LEFT__1535659467004799.npy',
  'CAM_FRONT/n008-2018-08-30-15-52-26-0400__CAM_FRONT__1535659467012404.npy',
  'CAM_FRONT_RIGHT/n008-2018-08-30-15-52-26-0400__CAM_FRONT_RIGHT__