In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
from datasets import load_dataset
import torch
from transformers import ViTModel, BertModel, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch import nn
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
from datasets import load_dataset
from PIL import Image
import io
import json
import os
from torchvision import transforms
from huggingface_hub import HfApi, login


2025-04-26 03:26:53.530140: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745638013.737456      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745638013.795072      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
ds = load_dataset('khoadole/cars_8k_balance_dataset_full_augmented_v2')

README.md:   0%|          | 0.00/670 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/13.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/14.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15468 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4824 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5154 [00:00<?, ? examples/s]

In [3]:
color_list = ['blue', 'white', 'black', 'gray', 'silver']
brand_list = ['bentley', 'audi', 'bmw', 'acura']

def get_answer_type(answer):
    if answer in color_list:
        return 'color'
    elif answer in brand_list:
        return 'brand'
    else:
        return 'car_name'

In [4]:
class VQADataset(Dataset):
    def __init__(self, dataset, tokenizer, answer_to_idx):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.answer_to_idx = answer_to_idx
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        # Định nghĩa mapping cho answer_type
        self.answer_type_map = {'color': 0, 'brand': 1, 'car_name': 2}

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        image_bytes = sample['image']['bytes']
        image = Image.open(io.BytesIO(image_bytes)).convert('RGB')
        image_tensor = self.transform(image)
        question = sample['question']
        tokenized = self.tokenizer(question, padding='max_length', truncation=True, max_length=32, return_tensors='pt')
        input_ids = tokenized['input_ids'].squeeze(0)
        attention_mask = tokenized['attention_mask'].squeeze(0)
        answer = sample['answer']
        answer_idx = self.answer_to_idx.get(answer, -1)
        answer_type = torch.tensor(self.answer_type_map[get_answer_type(answer)], dtype=torch.long)  # Chuyển thành tensor
        return image_tensor, input_ids, attention_mask, answer_idx, answer_type

In [5]:
class VQAModel(nn.Module):
    def __init__(self, num_answers):
        super(VQAModel, self).__init__()
        self.vit = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            # nn.Linear(768 * 3, 2048),
            # nn.Linear(2048, 1024),
            # nn.GELU(),
            # nn.Linear(1024, 512),
            # nn.GELU(),
            nn.Linear(768, 512),
            nn.GELU(),
            nn.Linear(512, num_answers)
        )
        # for param in self.vit.encoder.layer[:6].parameters():
        #     param.requires_grad = False
        for param in self.bert.encoder.layer[:6].parameters():
            param.requires_grad = False

    def forward(self, image, input_ids, attention_mask):
        image_features = self.vit(image).last_hidden_state[:, 0, :]
        text_features = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        combined = image_features * text_features
        output = self.classifier(combined)
        return output

In [6]:
all_train_answers = list(set(sample['answer'] for sample in ds['train']))
answer_to_idx = {answer: idx for idx, answer in enumerate(all_train_answers)}

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = VQADataset(ds['train'], tokenizer, answer_to_idx)
val_dataset = VQADataset(ds['validation'], tokenizer, answer_to_idx)
test_dataset = VQADataset(ds['test'], tokenizer, answer_to_idx)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

# Khởi tạo mô hình và tối ưu
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = VQAModel(num_answers=len(answer_to_idx)).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scaler = GradScaler()
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True)
loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  scaler = GradScaler()


In [7]:
# Training loop
num_epochs = 30
best_val_acc = 0
patience = 5
patience_counter = 0
color_loss_weight = 2.0
answer_type_map_reverse = {0: 'color', 1: 'brand', 2: 'car_name'}  # Để map ngược lại từ số sang string

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_color_loss = 0
    total_brand_loss = 0
    total_car_name_loss = 0
    color_count = 0
    brand_count = 0
    car_name_count = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        image, input_ids, attention_mask, answer_idx, answer_type = [x.to(device) for x in batch]
        valid_mask = answer_idx != -1
        if not valid_mask.any():
            continue
        # Lọc các tensor bằng valid_mask
        image = image[valid_mask]
        input_ids = input_ids[valid_mask]
        attention_mask = attention_mask[valid_mask]
        answer_idx = answer_idx[valid_mask]
        answer_type = answer_type[valid_mask]  # answer_type giờ là tensor

        optimizer.zero_grad()
        with autocast():
            output = model(image, input_ids, attention_mask)
            loss = loss_fn(output, answer_idx)
            weighted_loss = torch.zeros_like(loss)
            for i in range(len(answer_type)):
                ans_type_str = answer_type_map_reverse[answer_type[i].item()]  # Map ngược lại thành string
                if ans_type_str == 'color':
                    weighted_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0)) * color_loss_weight
                    total_color_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0)).item()
                    color_count += 1
                elif ans_type_str == 'brand':
                    weighted_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0))
                    total_brand_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0)).item()
                    brand_count += 1
                else:
                    weighted_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0))
                    total_car_name_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0)).item()
                    car_name_count += 1
        scaler.scale(weighted_loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    avg_color_loss = total_color_loss / color_count if color_count > 0 else 0
    avg_brand_loss = total_brand_loss / brand_count if brand_count > 0 else 0
    avg_car_name_loss = total_car_name_loss / car_name_count if car_name_count > 0 else 0
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")
    print(f"Color Loss: {avg_color_loss:.4f}, Brand Loss: {avg_brand_loss:.4f}, Car Name Loss: {avg_car_name_loss:.4f}")

    # Validation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            image, input_ids, attention_mask, answer_idx, _ = [x.to(device) for x in batch]
            valid_mask = answer_idx != -1
            if not valid_mask.any():
                continue
            image = image[valid_mask]
            input_ids = input_ids[valid_mask]
            attention_mask = attention_mask[valid_mask]
            answer_idx = answer_idx[valid_mask]
            with autocast():
                output = model(image, input_ids, attention_mask)
            pred = output.argmax(dim=1)
            correct += (pred == answer_idx).sum().item()
            total += answer_idx.size(0)
    val_accuracy = correct / total if total > 0 else 0
    print(f"Epoch {epoch+1}, Validation Accuracy: {val_accuracy:.4f}")

    scheduler.step(val_accuracy)
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        patience_counter = 0
        torch.save(model.state_dict(), 'best_vqa_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

  with autocast():
Epoch 1/30 - Training: 100%|██████████| 484/484 [03:10<00:00,  2.54it/s]


Epoch 1, Train Loss: 1.8530
Color Loss: 1.4815, Brand Loss: 1.5419, Car Name Loss: 2.5377


  with autocast():
Epoch 1/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 1, Validation Accuracy: 0.7220


Epoch 2/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 2, Train Loss: 1.1332
Color Loss: 1.0599, Brand Loss: 0.7447, Car Name Loss: 1.5961


Epoch 2/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 2, Validation Accuracy: 0.7666


Epoch 3/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 3, Train Loss: 0.9684
Color Loss: 0.8721, Brand Loss: 0.6877, Car Name Loss: 1.3449


Epoch 3/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 3, Validation Accuracy: 0.8033


Epoch 4/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 4, Train Loss: 0.8535
Color Loss: 0.7502, Brand Loss: 0.6769, Car Name Loss: 1.1326


Epoch 4/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.94it/s]


Epoch 4, Validation Accuracy: 0.8259


Epoch 5/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.48it/s]


Epoch 5, Train Loss: 0.7689
Color Loss: 0.7004, Brand Loss: 0.6657, Car Name Loss: 0.9404


Epoch 5/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 5, Validation Accuracy: 0.8520


Epoch 6/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 6, Train Loss: 0.7099
Color Loss: 0.6744, Brand Loss: 0.6598, Car Name Loss: 0.7953


Epoch 6/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 6, Validation Accuracy: 0.8489


Epoch 7/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.48it/s]


Epoch 7, Train Loss: 0.6831
Color Loss: 0.6633, Brand Loss: 0.6534, Car Name Loss: 0.7320


Epoch 7/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.87it/s]


Epoch 7, Validation Accuracy: 0.8514


Epoch 8/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 8, Train Loss: 0.6680
Color Loss: 0.6559, Brand Loss: 0.6476, Car Name Loss: 0.7006


Epoch 8/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.89it/s]


Epoch 8, Validation Accuracy: 0.8682


Epoch 9/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 9, Train Loss: 0.6591
Color Loss: 0.6521, Brand Loss: 0.6464, Car Name Loss: 0.6790


Epoch 9/30 - Validation: 100%|██████████| 151/151 [00:18<00:00,  7.98it/s]


Epoch 9, Validation Accuracy: 0.8636


Epoch 10/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 10, Train Loss: 0.6758
Color Loss: 0.6674, Brand Loss: 0.6509, Car Name Loss: 0.7092


Epoch 10/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 10, Validation Accuracy: 0.8449


Epoch 11/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 11, Train Loss: 0.6851
Color Loss: 0.6898, Brand Loss: 0.6523, Car Name Loss: 0.7132


Epoch 11/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.83it/s]


Epoch 11, Validation Accuracy: 0.8520


Epoch 12/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 12, Train Loss: 0.6554
Color Loss: 0.6521, Brand Loss: 0.6425, Car Name Loss: 0.6715


Epoch 12/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.82it/s]


Epoch 12, Validation Accuracy: 0.8731


Epoch 13/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 13, Train Loss: 0.6471
Color Loss: 0.6445, Brand Loss: 0.6388, Car Name Loss: 0.6579


Epoch 13/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.85it/s]


Epoch 13, Validation Accuracy: 0.8700


Epoch 14/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 14, Train Loss: 0.6432
Color Loss: 0.6413, Brand Loss: 0.6363, Car Name Loss: 0.6521


Epoch 14/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 14, Validation Accuracy: 0.8777


Epoch 15/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 15, Train Loss: 0.6416
Color Loss: 0.6402, Brand Loss: 0.6352, Car Name Loss: 0.6495


Epoch 15/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 15, Validation Accuracy: 0.8704


Epoch 16/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 16, Train Loss: 0.6406
Color Loss: 0.6385, Brand Loss: 0.6344, Car Name Loss: 0.6488


Epoch 16/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.86it/s]


Epoch 16, Validation Accuracy: 0.8634


Epoch 17/30 - Training: 100%|██████████| 484/484 [03:14<00:00,  2.49it/s]


Epoch 17, Train Loss: 0.6405
Color Loss: 0.6400, Brand Loss: 0.6349, Car Name Loss: 0.6467


Epoch 17/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.82it/s]


Epoch 17, Validation Accuracy: 0.8744


Epoch 18/30 - Training: 100%|██████████| 484/484 [03:13<00:00,  2.50it/s]


Epoch 18, Train Loss: 0.7039
Color Loss: 0.7090, Brand Loss: 0.6532, Car Name Loss: 0.7497


Epoch 18/30 - Validation: 100%|██████████| 151/151 [00:19<00:00,  7.88it/s]


Epoch 18, Validation Accuracy: 0.8381


Epoch 19/30 - Training: 100%|██████████| 484/484 [03:13<00:00,  2.50it/s]


Epoch 19, Train Loss: 0.6533
Color Loss: 0.6504, Brand Loss: 0.6391, Car Name Loss: 0.6706


Epoch 19/30 - Validation: 100%|██████████| 151/151 [00:18<00:00,  7.95it/s]

Epoch 19, Validation Accuracy: 0.8615
Early stopping triggered!





In [8]:
from collections import defaultdict
# Test
model.load_state_dict(torch.load('best_vqa_model.pth', weights_only=True))
model.eval()
correct_color = 0
total_color = 0
correct_brand = 0
total_brand = 0
correct_car_name = 0
total_car_name = 0
total_correct = 0
total_samples = 0
total_color_loss = 0
total_brand_loss = 0
total_car_name_loss = 0
color_count = 0
brand_count = 0
car_name_count = 0
answer_type_map_reverse = {0: 'color', 1: 'brand', 2: 'car_name'}
idx_to_answer = {idx: answer for answer, idx in answer_to_idx.items()}  # Map ngược từ idx sang answer

# Theo dõi số lần xuất hiện, đúng, và nhầm lẫn
color_stats = defaultdict(lambda: {'correct': 0, 'total': 0, 'confusion': defaultdict(int)})
brand_stats = defaultdict(lambda: {'correct': 0, 'total': 0, 'confusion': defaultdict(int)})
car_name_stats = defaultdict(lambda: {'correct': 0, 'total': 0, 'confusion': defaultdict(int)})

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test"):
        image, input_ids, attention_mask, answer_idx, answer_type = [x.to(device) for x in batch]
        valid_mask = (answer_idx != -1).to(device)
        if not valid_mask.any():
            continue
        image = image[valid_mask]
        input_ids = input_ids[valid_mask]
        attention_mask = attention_mask[valid_mask]
        answer_idx = answer_idx[valid_mask]
        answer_type = answer_type[valid_mask]

        with autocast():
            output = model(image, input_ids, attention_mask)
            loss = loss_fn(output, answer_idx)
        pred = output.argmax(dim=1)
        total_correct += (pred == answer_idx).sum().item()
        total_samples += answer_idx.size(0)

        for i in range(len(answer_type)):
            ans_type_str = answer_type_map_reverse[answer_type[i].item()]
            true_answer = idx_to_answer[answer_idx[i].item()]
            pred_answer = idx_to_answer[pred[i].item()]
            is_correct = pred[i] == answer_idx[i]

            if ans_type_str == 'color':
                total_color += 1
                total_color_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0)).item()
                color_count += 1
                if is_correct:
                    correct_color += 1
                color_stats[true_answer]['total'] += 1
                if is_correct:
                    color_stats[true_answer]['correct'] += 1
                else:
                    color_stats[true_answer]['confusion'][pred_answer] += 1

            elif ans_type_str == 'brand':
                total_brand += 1
                total_brand_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0)).item()
                brand_count += 1
                if is_correct:
                    correct_brand += 1
                brand_stats[true_answer]['total'] += 1
                if is_correct:
                    brand_stats[true_answer]['correct'] += 1
                else:
                    brand_stats[true_answer]['confusion'][pred_answer] += 1

            else:
                total_car_name += 1
                total_car_name_loss += loss_fn(output[i].unsqueeze(0), answer_idx[i].unsqueeze(0)).item()
                car_name_count += 1
                if is_correct:
                    correct_car_name += 1
                car_name_stats[true_answer]['total'] += 1
                if is_correct:
                    car_name_stats[true_answer]['correct'] += 1
                else:
                    car_name_stats[true_answer]['confusion'][pred_answer] += 1

# Tính accuracy tổng và riêng
test_color_accuracy = correct_color / total_color if total_color > 0 else 0
test_brand_accuracy = correct_brand / total_brand if total_brand > 0 else 0
test_car_name_accuracy = correct_car_name / total_car_name if total_car_name > 0 else 0
test_total_accuracy = total_correct / total_samples if total_samples > 0 else 0
avg_color_loss = total_color_loss / color_count if color_count > 0 else 0
avg_brand_loss = total_brand_loss / brand_count if brand_count > 0 else 0
avg_car_name_loss = total_car_name_loss / car_name_count if car_name_count > 0 else 0

# Tính top 4 sai nhiều nhất và nhầm lẫn
def get_top_errors(stats, category_name):
    error_rates = []
    for answer, stat in stats.items():
        total = stat['total']
        correct = stat['correct']
        if total > 0:
            error_rate = 1 - (correct / total)
            error_rates.append((answer, error_rate, total, stat['confusion']))
    error_rates.sort(key=lambda x: (x[1], x[2]), reverse=True)
    top_4 = error_rates[:4]
    print(f"\nTop 4 {category_name} sai nhiều nhất:")
    for answer, error_rate, total, confusion in top_4:
        print(f"- {answer}: {error_rate:.4f} ({(error_rate * 100):.2f}%), xuất hiện {total} lần")
        print(f"  Nhầm lẫn với:")
        for pred_answer, count in confusion.items():
            print(f"    + {pred_answer}: {count} lần")

# In kết quả
print(f"Test Total Accuracy: {test_total_accuracy:.4f}")
print(f"Test Color Accuracy: {test_color_accuracy:.4f}, Brand Accuracy: {test_brand_accuracy:.4f}, Car Name Accuracy: {test_car_name_accuracy:.4f}")
print(f"Test Color Loss: {avg_color_loss:.4f}, Brand Loss: {avg_brand_loss:.4f}, Car Name Loss: {avg_car_name_loss:.4f}")

get_top_errors(color_stats, "màu")
get_top_errors(brand_stats, "hãng xe")
get_top_errors(car_name_stats, "tên xe")

  with autocast():
Test: 100%|██████████| 162/162 [00:23<00:00,  6.94it/s]

Test Total Accuracy: 0.8458
Test Color Accuracy: 0.8341, Brand Accuracy: 0.9377, Car Name Accuracy: 0.7654
Test Color Loss: 1.1617, Brand Loss: 0.8043, Car Name Loss: 1.3292

Top 4 màu sai nhiều nhất:
- gray: 0.2898 (28.98%), xuất hiện 245 lần
  Nhầm lẫn với:
    + white: 9 lần
    + black: 33 lần
    + silver: 27 lần
    + blue: 2 lần
- silver: 0.2500 (25.00%), xuất hiện 248 lần
  Nhầm lẫn với:
    + black: 20 lần
    + gray: 16 lần
    + blue: 8 lần
    + white: 18 lần
- blue: 0.1416 (14.16%), xuất hiện 339 lần
  Nhầm lẫn với:
    + black: 25 lần
    + silver: 19 lần
    + gray: 2 lần
    + white: 2 lần
- white: 0.1313 (13.13%), xuất hiện 518 lần
  Nhầm lẫn với:
    + gray: 19 lần
    + silver: 15 lần
    + black: 32 lần
    + blue: 2 lần

Top 4 hãng xe sai nhiều nhất:
- bmw: 0.1200 (12.00%), xuất hiện 425 lần
  Nhầm lẫn với:
    + audi: 43 lần
    + acura: 2 lần
    + bentley: 6 lần
- acura: 0.0485 (4.85%), xuất hiện 371 lần
  Nhầm lẫn với:
    + audi: 8 lần
    + bmw: 8 lần
    + b




In [9]:
save_path = "/kaggle/working/my-vqa-model"
os.makedirs(save_path, exist_ok=True)
torch.save(model.state_dict(), f"{save_path}/pytorch_model.bin")
config = {
    "vit_model": "google/vit-base-patch16-224",
    "bert_model": "bert-base-uncased",
    "num_answers": len(answer_to_idx),
    "architecture": "ViT+BERT with concatenation and multiplication fusion"
}
with open(f"{save_path}/config.json", "w") as f:
    json.dump(config, f)
tokenizer.save_pretrained(save_path)
with open(f"{save_path}/answer_list.json", "w") as f:
    json.dump(all_train_answers, f)