# Import Library

In [None]:
import os
import re
import json
import random
from collections import defaultdict
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from torch.nn.utils.rnn import pad_sequence
from torchvision import transforms, models

# Preprocessing

## Image

In [None]:
def resize_images(input_dir, output_dir, size=(224, 224)):
    """Resize every entry of ``input_dir`` and store it under the same name in ``output_dir``.

    Args:
        input_dir: Directory whose entries are opened with ``Image.open``.
        output_dir: Destination directory; created when it does not exist.
        size: Target size handed unchanged to ``Image.resize``.

    Any entry that fails to open, resize or save is reported on stdout and
    skipped, so one bad file does not abort the whole run.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file_name in os.listdir(input_dir):
        source_path = os.path.join(input_dir, file_name)
        target_path = os.path.join(output_dir, file_name)

        try:
            with Image.open(source_path) as original:
                resized = original.resize(size, Image.LANCZOS)
                # NOTE(review): the format is read from the resized copy, not
                # from the opened source image - confirm this selects the
                # intended output format.
                resized.save(target_path, resized.format)
        except Exception as e:
            print(f"Wrong at {file_name}: {e}")

    # The count is taken from the output directory, so it includes files that
    # were already there before this call.
    resized_count = len(os.listdir(output_dir))
    print(f"Resized: {resized_count} images.")

In [None]:

# Resize the training images from the Kaggle input dataset into the working
# directory (default target size of resize_images: 224x224).
input_dir = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/train"
output_dir = "/kaggle/working/resized_images_train"
resize_images(input_dir, output_dir)

In [None]:

# Same resizing step for the test images; input_dir/output_dir are rebound here.
input_dir = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/test"
output_dir = "/kaggle/working/resized_images_test"
resize_images(input_dir, output_dir)

## Text

### Create Vocab

In [None]:
def tokenizer(sentence):
    """Lower-case ``sentence`` and split it into word and punctuation tokens.

    The text is split on runs of non-word characters; because the pattern is a
    capturing group the separators themselves are kept. Every piece is
    stripped of surrounding whitespace and empty pieces are dropped.

    Returns:
        list[str]: tokens in their original order.
    """
    pieces = re.split(r'(\W+)', sentence.lower())
    stripped = (piece.strip() for piece in pieces)
    return [token for token in stripped if token]

In [None]:
def make_ans_vocab(annotation_file, save_path="/kaggle/working/answer_vocabs.txt"):
    """Build the answer vocabulary file from an annotation JSON.

    Every answer string found under ``data['annotations'][*]['answers'][*]['answer']``
    is tokenized with ``tokenizer``; the distinct tokens are sorted and written
    one per line, preceded by the special tokens ``<pad>``, ``<unk>``,
    ``<sos>`` and ``<eos>`` (line indices 0-3).

    Args:
        annotation_file: Path of the annotation JSON file.
        save_path: Path of the vocabulary text file to write.
    """
    # Read and write explicitly as UTF-8: load_vocab() reads the vocabulary
    # back as UTF-8, so relying on the platform default encoding here could
    # corrupt non-ASCII tokens.
    with open(annotation_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    tokens = set()
    for ann in data['annotations']:
        for ans in ann['answers']:
            tokens.update(tokenizer(ans['answer']))

    # Special tokens first so that their indices are fixed (0..3).
    answers = ['<pad>', '<unk>', '<sos>', '<eos>'] + sorted(tokens)

    with open(save_path, 'w', encoding="utf-8") as f:
        f.writelines(a + '\n' for a in answers)

    print(f"Generated answer vocab: {len(answers)} answers.")

In [None]:
# Build the answer vocabulary from the training annotations (written to the
# default save_path of make_ans_vocab).
annotation_file = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/train_annotations.json"
make_ans_vocab(annotation_file)

In [None]:
def make_q_vocab(question_file, save_path="/kaggle/working/question_vocabs.txt"):
    """Build the question vocabulary file from a question JSON.

    Every ``data['questions'][*]['question']`` string is tokenized with
    ``tokenizer``; the distinct tokens are sorted and written one per line,
    preceded by ``<pad>`` (index 0) and ``<unk>`` (index 1).

    Args:
        question_file: Path of the question JSON file.
        save_path: Path of the vocabulary text file to write.
    """
    # Explicit UTF-8 keeps this writer consistent with load_vocab(), which
    # reads the file back as UTF-8 regardless of the platform default.
    with open(question_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    tokens = set()
    for q in data['questions']:
        tokens.update(tokenizer(q['question']))

    words = ['<pad>', '<unk>'] + sorted(tokens)

    with open(save_path, 'w', encoding="utf-8") as f:
        f.writelines(w + '\n' for w in words)

    print(f"Generated question vocab: {len(words)} words.")


In [None]:
# Build the question vocabulary from the training questions (written to the
# default save_path of make_q_vocab).
question_file = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/train_questions.json"
make_q_vocab(question_file)

In [None]:
def load_vocab(vocab_file):
    """Load a one-token-per-line vocabulary file into a ``{token: index}`` dict.

    The index of a token is its zero-based line number; surrounding whitespace
    of each line is stripped. When a token occurs on several lines the last
    line number wins.
    """
    word_to_index = {}
    with open(vocab_file, encoding="utf-8") as handle:
        for position, raw_line in enumerate(handle):
            word_to_index[raw_line.strip()] = position
    return word_to_index

In [None]:
def find_image_file(image_id, image_dir):
    """Return the path of the first entry in ``image_dir`` named ``id_<image_id>.<ext>``.

    Entries are examined in ``os.listdir`` order. Returns ``None`` when no
    entry starts with ``id_<image_id>.``.
    """
    prefix = f"id_{image_id}."
    candidates = (name for name in os.listdir(image_dir) if name.startswith(prefix))
    match = next(candidates, None)
    if match is None:
        return None
    return os.path.join(image_dir, match)

In [None]:
def process_data(question_file, annotation_file, image_dir, output_path, labeled=True,
                 qu_vocab_file="/kaggle/working/question_vocabs.txt",
                 ans_vocab_file="/kaggle/working/answer_vocabs.txt"):
    """Join questions, images and (optionally) answers into one JSON sample list.

    For every entry of ``questions`` in ``question_file`` a dict is produced with
    the keys ``img_name``, ``img_path``, ``question``, ``qu_tokens`` (question
    token indices) and ``qu_id``. When ``labeled`` is true the first answer of
    the matching annotation is added as ``answer`` (text) and ``ans_tokens``
    (token indices). Tokens missing from a vocabulary map to its ``<unk>`` index.

    Args:
        question_file: JSON file with a top-level ``questions`` list.
        annotation_file: JSON file with a top-level ``annotations`` list; only
            read when ``labeled`` is true.
        image_dir: Directory searched for ``id_<image_id>.*`` files.
        output_path: Path of the JSON file to write.
        labeled: Whether answers are available and should be attached.
        qu_vocab_file: Question vocabulary file (one token per line).
        ans_vocab_file: Answer vocabulary file; only read when ``labeled`` is true.

    Raises:
        FileNotFoundError: If no image file exists for a question's image id.
    """
    with open(question_file, 'r', encoding="utf-8") as f:
        questions = json.load(f)['questions']

    vocab2idx = load_vocab(qu_vocab_file)
    qu_unk = vocab2idx['<unk>']

    if labeled:
        with open(annotation_file, 'r', encoding="utf-8") as f:
            annotations = json.load(f)['annotations']
        q_dict = {ann['question_id']: ann for ann in annotations}
        ans_vocab2idx = load_vocab(ans_vocab_file)
        ans_unk = ans_vocab2idx['<unk>']

    # Several questions may refer to the same image; remember each lookup so
    # the image directory is scanned once per image instead of once per question.
    img_path_cache = {}
    dataset = []

    for q in questions:
        qu_id = q['question_id']
        img_id = q['image_id']
        qu_sentence = q['question']
        qu2idx = [vocab2idx.get(token, qu_unk) for token in tokenizer(qu_sentence)]

        if img_id not in img_path_cache:
            img_path_cache[img_id] = find_image_file(img_id, image_dir)
        img_path = img_path_cache[img_id]
        if img_path is None:
            # Fail with a clear message instead of a TypeError from
            # os.path.basename(None) further down.
            raise FileNotFoundError(
                f"No image file 'id_{img_id}.*' found in {image_dir} "
                f"(needed by question {qu_id})"
            )

        info = {
            'img_name': os.path.basename(img_path),
            'img_path': img_path,
            'question': qu_sentence,
            'qu_tokens': qu2idx,
            'qu_id': qu_id
        }

        if labeled:
            # Only the first listed answer is used as the target.
            first_answer = q_dict[qu_id]['answers'][0]['answer']
            info['answer'] = first_answer
            info['ans_tokens'] = [ans_vocab2idx.get(token, ans_unk)
                                  for token in tokenizer(first_answer)]

        dataset.append(info)

    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)

    print(f"Saved JSON at {output_path} | Total samples: {len(dataset)}")


In [None]:
# Build /kaggle/working/train.json from the training questions, annotations
# and the resized training images.
image_dir = "/kaggle/working/resized_images_train"
question_file = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/train_questions.json"
annotation_file = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/train_annotations.json"

process_data(question_file, annotation_file, image_dir, '/kaggle/working/train.json')

In [None]:
# Same preprocessing for the test split; labeled defaults to True, so the test
# annotations are attached as well.
image_dir = "/kaggle/working/resized_images_test"
question_file = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/test_questions.json"
annotation_file = "/kaggle/input/dl-mt-sq2sq-data/data_seq2seq/test_annotations.json"

process_data(question_file, annotation_file, image_dir, '/kaggle/working/test.json')

# Dataset


In [None]:
class VQADataset(Dataset):
    """Dataset over the JSON sample list written by ``process_data``.

    Each item is ``(image, question_tokens, answer_input, answer_target)``:
    the transformed RGB image, the question indices padded/truncated to
    ``max_qu_len``, and the ``<sos> answer <eos>`` sequence padded to
    ``max_ans_len`` and shifted into decoder input (all but the last position)
    and target (all but the first position).

    Args:
        json_path: Path of the sample JSON file.
        transform: Callable applied to the PIL image; defaults to
            ``ToTensor`` followed by a fixed mean/std normalization.
        max_qu_len: Fixed length of the question tensor.
        max_ans_len: Fixed length of the answer sequence including
            ``<sos>``/``<eos>`` (input and target each have ``max_ans_len - 1``).
        typeData: ``'train'``, ``'valid'`` or ``'test'``. The file is read in
            consecutive groups of 210 samples: ``'train'`` keeps the first 200
            of each group, ``'valid'`` the remaining 10, ``'test'`` everything.
        pad_idx, sos_idx, eos_idx: Indices of the special answer tokens.

    Raises:
        ValueError: If ``typeData`` is not one of the three supported values.
    """

    def __init__(self, json_path, transform=None,
                 max_qu_len=20, max_ans_len=20,
                 typeData='train',
                 pad_idx=0, sos_idx=2, eos_idx=3):
        # Reject unknown splits up front; previously they silently produced an
        # empty dataset, which only failed later (e.g. division by zero when
        # averaging the loss over zero batches).
        if typeData not in ('train', 'valid', 'test'):
            raise ValueError(
                f"typeData must be 'train', 'valid' or 'test', got {typeData!r}"
            )

        self.max_qu_len = max_qu_len
        self.max_ans_len = max_ans_len
        self.typeData = typeData

        self.PAD_IDX = pad_idx
        self.SOS_IDX = sos_idx
        self.EOS_IDX = eos_idx

        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if typeData == 'test':
            self.data = data
        else:
            # Offsets of the kept slice inside every group of 210 samples.
            lo, hi = {'train': (0, 200), 'valid': (200, 210)}[typeData]
            self.data = []
            for start in range(0, len(data), 210):
                self.data.extend(data[start + lo:start + hi])

        self.transform = transform if transform else transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.data)

    def _pad_to(self, tokens, length):
        """Right-pad the 1-D ``tokens`` tensor with ``PAD_IDX`` up to ``length``."""
        missing = length - len(tokens)
        if missing <= 0:
            return tokens
        filler = torch.full((missing,), self.PAD_IDX, dtype=torch.long)
        return torch.cat([tokens, filler])

    def __getitem__(self, idx):
        sample = self.data[idx]
        img_path = sample["img_path"]
        question_tokens = torch.tensor(sample["qu_tokens"], dtype=torch.long)
        answer_tokens = torch.tensor(sample["ans_tokens"], dtype=torch.long)

        # Question: truncate, then pad to exactly max_qu_len.
        question_tokens = self._pad_to(question_tokens[:self.max_qu_len],
                                       self.max_qu_len)

        # Answer: leave room for <sos> and <eos>, wrap, then pad to max_ans_len.
        answer_tokens = torch.cat([
            torch.tensor([self.SOS_IDX], dtype=torch.long),
            answer_tokens[:self.max_ans_len - 2],
            torch.tensor([self.EOS_IDX], dtype=torch.long)
        ])
        answer_tokens = self._pad_to(answer_tokens, self.max_ans_len)

        # Teacher forcing: the target is the input shifted left by one step.
        answer_input = answer_tokens[:-1]
        answer_target = answer_tokens[1:]

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        return image, question_tokens, answer_input, answer_target


# Model

In [None]:
class ImgEncoder(nn.Module):
    """Image encoder: backbone features -> linear projection -> L2 normalization.

    Args:
        model: Backbone module mapping an image batch to a feature batch.
        in_features: Width of the backbone output (input size of ``fc``).
        img_feature_size: Width of the returned image feature.
        is_train: When true the backbone is fine-tuned; otherwise it is used
            as a frozen feature extractor (no gradients, always in eval mode).
    """

    def __init__(self, model, in_features, img_feature_size, is_train=False):
        super(ImgEncoder, self).__init__()
        self.model = model
        self.fc = nn.Linear(in_features, img_feature_size)
        self.is_train = is_train
        if not self.is_train:
            self.model.eval()

    def train(self, mode=True):
        """Switch train/eval mode while keeping a frozen backbone in eval mode.

        Running the backbone under ``torch.no_grad()`` only disables gradients;
        without this override ``model.train()`` would still let the frozen
        backbone update its BatchNorm running statistics and apply dropout.
        """
        super().train(mode)
        if not self.is_train:
            self.model.eval()
        return self

    def forward(self, image):
        if self.is_train:
            img_feature = self.model(image)
        else:
            # Frozen backbone: skip building the autograd graph.
            with torch.no_grad():
                img_feature = self.model(image)

        img_feature = self.fc(img_feature)
        # Unit L2 norm along the feature dimension (dim=1).
        l2_norm = F.normalize(img_feature, p=2, dim=1)
        return l2_norm

In [None]:
class SelfAttention(nn.Module):
    """Attention pooling with one learned linear scoring layer.

    Every position of the input receives a scalar score; the scores are
    softmax-normalized over ``dim=1`` and used to form a weighted sum of the
    input along that dimension.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Linear(hidden_size, 1)

    def forward(self, lstm_out):
        # assumes lstm_out is (batch, seq_len, hidden_size) - TODO confirm
        scores = self.attention(lstm_out)
        weights = scores.softmax(dim=1)
        return (weights * lstm_out).sum(dim=1)



In [None]:
class QuEncoder(nn.Module):
    """Question encoder: embedding -> tanh -> multi-layer LSTM -> pooled feature.

    Args:
        qu_vocab_size: Size of the question vocabulary.
        word_embed: Embedding width.
        hidden_size: LSTM hidden width.
        num_hidden: Number of stacked LSTM layers.
        qu_feature_size: Output width of the ``fc`` layer (see the note in ``__init__``).
        with_att: Pool the LSTM outputs with ``SelfAttention`` when true;
            otherwise use the concatenated final hidden and cell states.

    ``forward`` returns ``(qu_feature, hidden, cell)`` where ``qu_feature`` has
    width ``hidden_size`` with attention and ``2 * num_hidden * hidden_size``
    without, and ``hidden``/``cell`` are the final LSTM states.
    """

    def __init__(self, qu_vocab_size, word_embed, hidden_size, num_hidden, qu_feature_size, with_att=False):
        super(QuEncoder, self).__init__()
        self.with_att = with_att
        self.word_embedding = nn.Embedding(qu_vocab_size, word_embed)
        self.tanh = nn.Tanh()
        self.lstm = nn.LSTM(word_embed, hidden_size, num_hidden, batch_first=True)

        # NOTE: ``fc`` is not applied in forward() - the caller consumes the
        # unprojected feature. The layer is kept so the module's state_dict
        # keys stay unchanged for previously saved checkpoints.
        if self.with_att:
            self.attention = SelfAttention(hidden_size)
            self.fc = nn.Linear(hidden_size, qu_feature_size)
        else:
            self.fc = nn.Linear(2 * num_hidden * hidden_size, qu_feature_size)

    def forward(self, question):
        qu_embedding = self.tanh(self.word_embedding(question))
        lstm_out, (hidden, cell) = self.lstm(qu_embedding)

        if self.with_att:
            qu_feature = self.attention(lstm_out)
        else:
            # (layers, batch, 2*hidden) -> (batch, layers * 2 * hidden)
            state = torch.cat((hidden, cell), dim=2)
            state = state.transpose(0, 1).reshape(state.size(1), -1)
            qu_feature = self.tanh(state)

        # The former ``self.fc(...)`` call was removed: its result was
        # discarded, so it only cost compute on every forward pass.
        return qu_feature, hidden, cell


In [None]:
class AnswerDecoder(nn.Module):
    """LSTM decoder producing answer-vocabulary logits for every input step.

    Args:
        ans_vocab_size: Size of the answer vocabulary (width of the logits).
        word_embed: Embedding width.
        hidden_size: LSTM hidden width.
        num_layers: Number of stacked LSTM layers.
        pad_idx: Index used as ``padding_idx`` of the embedding.
    """

    def __init__(self, ans_vocab_size, word_embed, hidden_size, num_layers=1, pad_idx=0):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(ans_vocab_size, word_embed, padding_idx=pad_idx)
        self.lstm = nn.LSTM(word_embed, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, ans_vocab_size)

    def forward(self, answer_input, hidden):
        # ``hidden`` is the (h0, c0) pair handed to the LSTM; the final LSTM
        # state is not returned, only the logits for every step.
        step_features, _ = self.lstm(self.embedding(answer_input), hidden)
        return self.fc(step_features)


In [None]:
## VQA MODEL ##
class VQAModel(nn.Module):
    """Seq2seq VQA model: image + question encoders feeding an LSTM answer decoder.

    The normalized image feature and the question feature are concatenated,
    projected to ``hidden_size`` and used as the decoder's initial hidden
    state; the decoder's initial cell state is the tanh of the question
    LSTM's last-layer cell state.

    Args:
        img_model: Backbone module passed to ``ImgEncoder``.
        img_in_features: Width of the backbone output.
        qu_vocab_size, ans_vocab_size: Vocabulary sizes.
        word_embed: Embedding width of question encoder and decoder.
        hidden_size: LSTM hidden width of question encoder and decoder.
        num_hidden: Number of LSTM layers in the question encoder.
        feature_size: Width of the projected image feature.
        with_att: Use attention pooling in the question encoder.
        is_train_image: Fine-tune the image backbone when true.
        pad_idx: Padding index of the decoder embedding.
    """

    def __init__(self, img_model, img_in_features,
                 qu_vocab_size, ans_vocab_size,
                 word_embed, hidden_size, num_hidden,
                 feature_size, with_att=False,
                 is_train_image=False, pad_idx=0):
        super(VQAModel, self).__init__()

        # Width of the question feature returned by QuEncoder.forward:
        # attention pooling yields hidden_size, otherwise the flattened
        # (hidden, cell) states of all layers. Derived from the arguments
        # instead of the former constants 768 / 1536, which were only valid
        # for feature_size=512, hidden_size=256, num_hidden=2.
        if with_att:
            qu_out_size = hidden_size
        else:
            qu_out_size = 2 * num_hidden * hidden_size
        output_size = feature_size + qu_out_size

        # The backbone is no longer moved to a global ``device`` here: it is a
        # registered submodule, so calling ``.to(device)`` on the whole model
        # (as train() and the evaluation cells do) moves it with everything else.
        self.img_encoder = ImgEncoder(img_model, img_in_features, feature_size, is_train_image)
        self.qu_encoder = QuEncoder(qu_vocab_size, word_embed, hidden_size, num_hidden, feature_size, with_att)
        self.feature_projection = nn.Linear(output_size, hidden_size)
        self.decoder = AnswerDecoder(ans_vocab_size, word_embed, hidden_size, pad_idx=pad_idx)

    def forward(self, image, question, answer_input):
        img_feature = self.img_encoder(image)
        qu_feature, hidden, cell = self.qu_encoder(question)
        combined_feature = torch.cat([img_feature, qu_feature], dim=1)
        projected_feature = torch.tanh(self.feature_projection(combined_feature))
        # The decoder LSTM has a single layer, so both initial states get a
        # leading layer dimension of size 1.
        h0 = projected_feature.unsqueeze(0)
        c0 = torch.tanh(cell[-1]).unsqueeze(0)
        outputs = self.decoder(answer_input, (h0, c0))
        return outputs

# Training & Testing

In [None]:
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs and pin the cuDNN flags.

    ``cudnn.deterministic`` is switched on and ``cudnn.benchmark`` off.
    """
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
import matplotlib.pyplot as plt

def plot_loss(history):
    """Plot the training and validation loss curves against the epoch number.

    Args:
        history: Dict with the equally long lists ``'train_loss'`` and
            ``'valid_loss'`` (one value per epoch), as returned by ``train``.
    """
    epoch_axis = range(1, len(history['train_loss']) + 1)
    curves = (('train_loss', 'Train Loss'), ('valid_loss', 'Valid Loss'))

    plt.figure(figsize=(8, 5))
    for key, label in curves:
        plt.plot(epoch_axis, history[key], label=label, marker='o')

    plt.title("Loss theo Epoch")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader`` and return the mean per-batch loss.

    With an ``optimizer`` every batch is followed by a backward pass and an
    optimizer step; without one the pass only evaluates the loss.
    """
    total_loss = 0.0
    for images, questions, answer_input, answer_target in dataloader:
        images = images.to(device)
        questions = questions.to(device)
        answer_input = answer_input.to(device)
        answer_target = answer_target.to(device)

        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(images, questions, answer_input)
        # Flatten (batch, steps, vocab) / (batch, steps) for the token-level loss.
        loss = criterion(outputs.reshape(-1, outputs.size(-1)),
                         answer_target.reshape(-1))
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)


def train(model, train_dataloader, valid_dataloader,
          criterion, optimizer, device,
          num_epochs=10, name="default", patience=10):
    """Train ``model`` with validation, LR scheduling, checkpoints and early stopping.

    After every epoch the model/optimizer state is written to
    ``last_<name>.pt``; whenever the validation loss improves it is also
    written to ``best_<name>.pt``. The learning rate is halved when the
    validation loss has not improved for 2 epochs (``ReduceLROnPlateau``), and
    training stops once it has not improved for ``patience`` epochs.

    Args:
        model: Module called as ``model(images, questions, answer_input)``.
        train_dataloader, valid_dataloader: Loaders yielding
            ``(images, questions, answer_input, answer_target)`` batches.
        criterion: Loss taking flattened logits and flattened targets.
        optimizer: Optimizer over the model parameters.
        device: Device the model and batches are moved to.
        num_epochs: Maximum number of epochs.
        name: Suffix of the checkpoint file names.
        patience: Epochs without improvement tolerated before stopping.

    Returns:
        dict: ``{'train_loss': [...], 'valid_loss': [...]}`` with one mean
        per-batch loss per completed epoch.
    """
    model.to(device)
    best_valid_loss = float('inf')
    epochs_no_improve = 0
    history = {'train_loss': [], 'valid_loss': []}

    # ``verbose=True`` is deprecated in recent PyTorch releases; learning-rate
    # changes are reported explicitly below instead.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    for epoch in range(num_epochs):
        model.train()
        train_loss = _run_epoch(model, train_dataloader, criterion, device, optimizer)

        model.eval()
        with torch.no_grad():
            valid_loss = _run_epoch(model, valid_dataloader, criterion, device)

        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(valid_loss)
        for group_idx, (old_lr, group) in enumerate(zip(lrs_before, optimizer.param_groups)):
            if group['lr'] != old_lr:
                print(f"Epoch {epoch+1}: reducing learning rate of group "
                      f"{group_idx} to {group['lr']:.4e}.")

        history['train_loss'].append(train_loss)
        history['valid_loss'].append(valid_loss)

        print(f"Epoch [{epoch+1}/{num_epochs}]: "
              f"Train Loss = {train_loss:.4f} | "
              f"Valid Loss = {valid_loss:.4f}")

        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'valid_loss': valid_loss
        }
        torch.save(checkpoint, f"last_{name}.pt")

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            epochs_no_improve = 0
            torch.save(checkpoint, f"best_{name}.pt")
            print(f"Best model saved at epoch {epoch+1} with validation loss: {valid_loss:.4f}")
        else:
            epochs_no_improve += 1

        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}, no improvement for {patience} epochs.")
            break

    print("Training finished.")
    return history


In [None]:
def _ids_to_words(token_ids, idx_to_word):
    """Map token ids to words, stopping before the first ``<eos>`` or ``<pad>``."""
    words = []
    for idx in token_ids:
        word = idx_to_word.get(idx, "<unk>")
        if word in ("<eos>", "<pad>"):
            break
        words.append(word)
    return words


def evaluate_bleu(model, dataloader, ans_vocab_file, device, max_len=20):
    """Greedy-decode answers for ``dataloader`` and report the corpus BLEU score.

    Decoding starts from ``<sos>`` and repeatedly appends the arg-max token of
    the last step, for at most ``max_len`` steps. Predictions and ground-truth
    targets are cut at the first ``<eos>``/``<pad>`` before scoring with
    ``corpus_bleu`` (smoothing method 4).

    Args:
        model: Module called as ``model(images, questions, answer_input)``.
        dataloader: Loader yielding ``(images, questions, _, answer_target)``.
        ans_vocab_file: Answer vocabulary file (one token per line).
        device: Device the inputs are moved to.
        max_len: Maximum number of generated tokens per sample.

    Returns:
        float: the corpus BLEU score (also printed).
    """
    ans_word_to_idx = load_vocab(ans_vocab_file)
    ans_idx_to_word = {idx: word for word, idx in ans_word_to_idx.items()}

    model.eval()
    references = []
    hypotheses = []

    start_token = ans_word_to_idx.get("<sos>", 2)
    # Ids that end a decoded sentence in _ids_to_words; once every sequence in
    # the batch has produced one, later tokens cannot change the result.
    stop_ids = torch.tensor(
        [ans_word_to_idx[w] for w in ("<eos>", "<pad>") if w in ans_word_to_idx],
        dtype=torch.long, device=device
    )

    smooth = SmoothingFunction().method4

    with torch.no_grad():
        for images, questions, _, answers_gt in dataloader:
            images = images.to(device)
            questions = questions.to(device)
            batch_size = images.size(0)

            answer_input = torch.full((batch_size, 1), start_token,
                                      dtype=torch.long, device=device)
            finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

            for _ in range(max_len):
                outputs = model(images, questions, answer_input)
                next_token = torch.argmax(outputs[:, -1, :], dim=-1).unsqueeze(1)
                answer_input = torch.cat([answer_input, next_token], dim=1)

                # (batch, 1) == (stops,) broadcasts to (batch, stops).
                finished |= (next_token == stop_ids).any(dim=1)
                if finished.all():
                    break

            for i in range(batch_size):
                # Drop the leading <sos> from the generated sequence.
                hypotheses.append(_ids_to_words(answer_input[i, 1:].tolist(),
                                                ans_idx_to_word))
                references.append([_ids_to_words(answers_gt[i].tolist(),
                                                 ans_idx_to_word)])

    bleu = corpus_bleu(references, hypotheses, smoothing_function=smooth)
    print(f"BLEU score: {bleu:.4f}")
    return bleu


In [None]:
# Sample files written by process_data.
json_path_train = "/kaggle/working/train.json"  
json_path_test = "/kaggle/working/test.json"  

# Train and validation samples are both carved out of train.json by
# VQADataset (typeData selects which slice of every 210-sample group is kept).
train_dataset = VQADataset(json_path_train, max_qu_len=20, typeData='train')
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

valid_dataset = VQADataset(json_path_train, max_qu_len=20, typeData='valid')
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=True)

# The test split uses every sample of test.json.
test_dataset = VQADataset(json_path_test, max_qu_len=20, typeData='test')
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [None]:
# Sanity check: print the first 5 samples of the first training batch, then stop.
for images, questions, answers_input, answers_target in train_dataloader:
    for i in range(5):
        print(f"\n--- Sample {i+1} ---")
        print("Image shape:", images[i].shape)
        print("Question token IDs:", questions[i])
        print("Answer input token IDs:", answers_input[i])
        print("Answer target token IDs:", answers_target[i])
    break  # only the first batch is inspected


In [None]:
# Epoch budget handed as num_epochs to the (currently commented-out) train(...) calls.
epochs = 50

In [None]:
# Reload both vocabularies to size the embedding and output layers.
vocab_qu = load_vocab("/kaggle/working/question_vocabs.txt")
vocab_ans = load_vocab("/kaggle/working/answer_vocabs.txt")

qu_vocab_size = len(vocab_qu)
ans_vocab_size = len(vocab_ans)

# Special-token indices taken from the answer vocabulary.
PAD_IDX = vocab_ans["<pad>"]
SOS_IDX = vocab_ans["<sos>"]
EOS_IDX = vocab_ans["<eos>"]

In [None]:
# Model hyper-parameters shared by every VQAModel built below.
feature_size = 512   # width of the projected image feature
word_embed = 128     # embedding width (question encoder and answer decoder)
hidden_size = 256    # LSTM hidden width
num_hidden = 2       # number of LSTM layers in the question encoder

## Pretrained Model

### With Attention

#### MobileNetV2

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
set_seed(42)
# Pretrained MobileNetV2 with its classifier replaced by Identity, so the
# backbone output reaches ImgEncoder.fc without a classification layer.
mobilenet = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.DEFAULT)
mobilenet.classifier = nn.Identity() 
# Width of the backbone output fed to ImgEncoder.fc
# (presumably MobileNetV2's pooled feature size - TODO confirm).
in_features_mobilenet = 1280  

# Attention variant with a frozen image backbone (is_train_image=False).
mobilenet_model = VQAModel(
    img_model=mobilenet,
    img_in_features=in_features_mobilenet,
    qu_vocab_size=qu_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed=word_embed,
    hidden_size=hidden_size,
    num_hidden=num_hidden,
    feature_size=feature_size,
    with_att=True,
    is_train_image=False,
    pad_idx=PAD_IDX  
)
  
# Padding positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = torch.optim.Adam(mobilenet_model.parameters(), lr=0.0005, weight_decay=1e-5)


In [None]:
# history = train(mobilenet_model, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='mobile_net')

In [None]:
# plot_loss(history)

In [None]:
# Load the best MobileNetV2 (attention) checkpoint; "model_state_dict" is the
# key under which train() stores the weights.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_mobile_net.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
mobilenet_model.load_state_dict(checkpoint["model_state_dict"])
mobilenet_model.to(device)
mobilenet_model.eval()


# Greedy decoding + corpus BLEU on the test split.
bleu_score = evaluate_bleu(
    model=mobilenet_model,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

#### ResNet50

In [None]:
set_seed(42)

# Pretrained ResNet50 with its final fc layer replaced by Identity.
resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
resnet.fc = nn.Identity()
# Width of the backbone output fed to ImgEncoder.fc
# (presumably ResNet50's pooled feature size - TODO confirm).
in_features_resnet = 2048   

# Attention variant with a frozen image backbone.
resnet_model = VQAModel(
    img_model=resnet,
    img_in_features=in_features_resnet,
    qu_vocab_size=qu_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed=word_embed,
    hidden_size=hidden_size,
    num_hidden=num_hidden,
    feature_size=feature_size,
    with_att=True,
    is_train_image=False,
    pad_idx=PAD_IDX
)

# criterion/optimizer are rebound for this model.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = torch.optim.Adam(resnet_model.parameters(), lr=0.0005, weight_decay=1e-5)

In [None]:
# history = train(resnet_model, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='resnet')

In [None]:
# plot_loss(history)

In [None]:
# Load the best ResNet50 (attention) checkpoint and score it on the test split.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_resnet.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
resnet_model.load_state_dict(checkpoint["model_state_dict"])
resnet_model.to(device)
resnet_model.eval()


bleu_score = evaluate_bleu(
    model=resnet_model,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

#### EfficientNetB3 

In [None]:
set_seed(42)
# Pretrained EfficientNet-B3 with its classifier replaced by Identity.
efficientnet = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.DEFAULT)
efficientnet.classifier = nn.Identity()

# Width of the backbone output fed to ImgEncoder.fc
# (presumably EfficientNet-B3's pooled feature size - TODO confirm).
in_features_efficientnet = 1536

# Attention variant with a frozen image backbone; the first eight arguments
# are passed positionally in VQAModel.__init__ order.
efficientnet_model = VQAModel(
    efficientnet, 
    in_features_efficientnet, 
    qu_vocab_size, 
    ans_vocab_size,
    word_embed, 
    hidden_size, 
    num_hidden, 
    feature_size,  
    with_att=True, 
    is_train_image=False,
    pad_idx=PAD_IDX
)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = torch.optim.Adam(efficientnet_model.parameters(), lr=0.0005, weight_decay=1e-5)

In [None]:
# history = train(efficientnet_model, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='efficientnet')

In [None]:
# plot_loss(history)

In [None]:
# Load the best EfficientNet-B3 (attention) checkpoint and score it on the test split.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_efficientnet.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
efficientnet_model.load_state_dict(checkpoint["model_state_dict"])
efficientnet_model.to(device)
efficientnet_model.eval()


bleu_score = evaluate_bleu(
    model=efficientnet_model,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

### Without Attention

#### MobileNetV2

In [None]:
set_seed(42)
# Fresh pretrained MobileNetV2 backbone (classifier replaced by Identity).
mobilenet = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.DEFAULT)
mobilenet.classifier = nn.Identity()  
in_features_mobilenet = 1280  

# No-attention variant: the question feature is built from the final LSTM
# hidden and cell states instead of attention pooling.
mobilenet_model_no_att = VQAModel(
    img_model=mobilenet,
    img_in_features=in_features_mobilenet,
    qu_vocab_size=qu_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed=word_embed,
    hidden_size=hidden_size,
    num_hidden=num_hidden,
    feature_size=feature_size,
    with_att=False,             
    is_train_image=False,
    pad_idx=PAD_IDX
)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = torch.optim.Adam(mobilenet_model_no_att.parameters(), lr=0.0005, weight_decay=1e-5)

In [None]:
# history = train(mobilenet_model_no_att, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='mobile_net_no_att')

In [None]:
# plot_loss(history)

In [None]:
# Load the best MobileNetV2 (no attention) checkpoint and score it on the test split.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_mobile_net_no_att.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
mobilenet_model_no_att.load_state_dict(checkpoint["model_state_dict"])
mobilenet_model_no_att.to(device)
mobilenet_model_no_att.eval()


bleu_score = evaluate_bleu(
    model=mobilenet_model_no_att,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

#### ResNet50

In [None]:
set_seed(42)
# Fresh pretrained ResNet50 backbone (final fc replaced by Identity).
resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
resnet.fc = nn.Identity()
in_features_resnet = 2048   

# No-attention variant with a frozen image backbone.
resnet_model_no_att = VQAModel(
    img_model=resnet,
    img_in_features=in_features_resnet,
    qu_vocab_size=qu_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed=word_embed,
    hidden_size=hidden_size,
    num_hidden=num_hidden,
    feature_size=feature_size,
    with_att=False,              
    is_train_image=False,
    pad_idx=PAD_IDX
)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = torch.optim.Adam(resnet_model_no_att.parameters(), lr=0.0005, weight_decay=1e-5)

In [None]:
# history = train(resnet_model_no_att, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='resnet_no_att')

In [None]:
# plot_loss(history)

In [None]:
# Load the best ResNet50 (no attention) checkpoint and score it on the test split.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_resnet_no_att.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
resnet_model_no_att.load_state_dict(checkpoint["model_state_dict"])
resnet_model_no_att.to(device)
resnet_model_no_att.eval()


bleu_score = evaluate_bleu(
    model=resnet_model_no_att,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

#### EfficientNetB3 

In [None]:
set_seed(42)
# Fresh pretrained EfficientNet-B3 backbone (classifier replaced by Identity).
efficientnet = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.DEFAULT)
efficientnet.classifier = nn.Identity()

in_features_efficientnet = 1536

# No-attention variant with a frozen image backbone.
efficientnet_model_no_att = VQAModel(
    img_model=efficientnet,
    img_in_features=in_features_efficientnet,
    qu_vocab_size=qu_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed=word_embed,
    hidden_size=hidden_size,
    num_hidden=num_hidden,
    feature_size=feature_size,
    with_att=False,               
    is_train_image=False,
    pad_idx=PAD_IDX
)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = torch.optim.Adam(efficientnet_model_no_att.parameters(), lr=0.0005, weight_decay=1e-5)


In [None]:
# history = train(efficientnet_model_no_att, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='efficientnet_no_att')

In [None]:
# plot_loss(history)

In [None]:
# Load the best EfficientNet-B3 (no attention) checkpoint and score it on the test split.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_efficientnet_no_att.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
efficientnet_model_no_att.load_state_dict(checkpoint["model_state_dict"])
efficientnet_model_no_att.to(device)
efficientnet_model_no_att.eval()


bleu_score = evaluate_bleu(
    model=efficientnet_model_no_att,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

## From-Scratch Model

In [None]:
class CNN_Self_Build(nn.Module):
    """VGG-style convolutional image encoder built without pretrained weights.

    Four convolutional stages (3 -> 64 -> 128 -> 256 -> 512 channels) with
    2, 2, 3 and 4 Conv3x3 + BatchNorm + ReLU units respectively, each stage
    closed by a 2x2 max-pool. The final feature map is globally average-pooled
    to a 512-vector and projected by dropout + linear to ``feature_dim``.

    Args:
        feature_dim: Width of the feature vector returned for each image.
        dropout_rate: Dropout probability applied before the final projection.

    NOTE(review): the default ``feature_dim=3096`` is unusual; every caller
    visible in this notebook passes 2048 - confirm the intended default.
    """

    def __init__(self, feature_dim=3096, dropout_rate=0.5):
        super().__init__()

        # Attribute names and layer order are kept as-is because they define
        # the state_dict keys that saved checkpoints are loaded against.
        self.block1 = self._make_stage(3, 64, 2)
        self.block2 = self._make_stage(64, 128, 2)
        self.block3 = self._make_stage(128, 256, 3)
        self.block4 = self._make_stage(256, 512, 4)

        # Collapse the spatial dimensions to 1x1 regardless of input size.
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        self.fc = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(512, feature_dim),
        )

    @staticmethod
    def _make_stage(in_channels, out_channels, num_convs):
        """Return ``num_convs`` Conv3x3-BN-ReLU units followed by a 2x2 max-pool."""
        layers = []
        channels = in_channels
        for _ in range(num_convs):
            layers.append(
                nn.Conv2d(in_channels=channels, out_channels=out_channels, kernel_size=3, padding=1)
            )
            layers.append(nn.BatchNorm2d(out_channels))
            layers.append(nn.ReLU(inplace=True))
            # Only the first conv of a stage changes the channel count.
            channels = out_channels
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode a batch of images into ``feature_dim``-wide feature vectors."""
        for stage in (self.block1, self.block2, self.block3, self.block4):
            x = stage(x)
        pooled = self.global_avg_pool(x)
        # Drop the trailing 1x1 spatial dims: (batch, 512, 1, 1) -> (batch, 512).
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)


### With Attention

In [None]:
# From-scratch CNN encoder + VQA model with attention.
set_seed(42)

# Declared once and reused so the encoder's output width and the width the
# VQA model is told to expect cannot drift apart.
in_features_cnn_self_build = 2048
cnn_self_build = CNN_Self_Build(
    feature_dim=in_features_cnn_self_build,
    dropout_rate=0.5,
)

cnn_model = VQAModel(
    img_model=cnn_self_build,
    img_in_features=in_features_cnn_self_build,
    qu_vocab_size=qu_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed=word_embed,
    hidden_size=hidden_size,
    num_hidden=num_hidden,
    feature_size=feature_size,
    with_att=True,
    is_train_image=True,
    pad_idx=PAD_IDX,
)

# Padding positions are excluded from the loss; targets are smoothed by 0.1.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = optim.Adam(
    cnn_model.parameters(),
    lr=0.0005,
    weight_decay=1e-5,
)

In [None]:
# history = train(cnn_model, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='default')

In [None]:
# plot_loss(history)

In [None]:
# Restore the best from-scratch CNN (with attention) checkpoint: read the file
# onto the active device and copy its "model_state_dict" entry into the model.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_default.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
cnn_model.load_state_dict(checkpoint["model_state_dict"])
cnn_model.to(device)
# Evaluation mode: dropout is disabled and BatchNorm uses running statistics.
cnn_model.eval()


# evaluate_bleu is defined elsewhere in this notebook.
# NOTE(review): max_len=20 presumably caps the decoded answer length - confirm.
bleu_score = evaluate_bleu(
    model=cnn_model,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

### Without Attention

In [None]:
# From-scratch CNN encoder + VQA model without attention.
set_seed(42)

# Declared once and reused so the encoder's output width and the width the
# VQA model is told to expect cannot drift apart.
in_features_cnn_self_build = 2048
cnn_self_build = CNN_Self_Build(
    feature_dim=in_features_cnn_self_build,
    dropout_rate=0.5,
)

cnn_model_no_att = VQAModel(
    img_model=cnn_self_build,
    img_in_features=in_features_cnn_self_build,
    qu_vocab_size=qu_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed=word_embed,
    hidden_size=hidden_size,
    num_hidden=num_hidden,
    feature_size=feature_size,
    with_att=False,
    is_train_image=True,
    pad_idx=PAD_IDX,
)

# Padding positions are excluded from the loss; targets are smoothed by 0.1.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX, label_smoothing=0.1)
optimizer = optim.Adam(
    cnn_model_no_att.parameters(),
    lr=0.0005,
    weight_decay=1e-5,
)

In [None]:
# history = train(cnn_model_no_att, train_dataloader, valid_dataloader, criterion, optimizer, device, num_epochs=epochs, name='default_no_att')

In [None]:
# plot_loss(history)

In [None]:
# Restore the best from-scratch CNN (no attention) checkpoint: read the file
# onto the active device and copy its "model_state_dict" entry into the model.
checkpoint_path = "/kaggle/input/dl_mt_model/pytorch/default/1/model/seq2seq_concat/best_default_no_att.pt"
checkpoint = torch.load(checkpoint_path, map_location=device)
cnn_model_no_att.load_state_dict(checkpoint["model_state_dict"])
cnn_model_no_att.to(device)
# Evaluation mode: dropout is disabled and BatchNorm uses running statistics.
cnn_model_no_att.eval()


# evaluate_bleu is defined elsewhere in this notebook.
# NOTE(review): max_len=20 presumably caps the decoded answer length - confirm.
bleu_score = evaluate_bleu(
    model=cnn_model_no_att,
    dataloader=test_dataloader,     
    ans_vocab_file="/kaggle/working/answer_vocabs.txt",
    device=device,
    max_len=20
)

print(f"BLEU Score on test set: {bleu_score:.4f}")

# Implement

In [None]:
# cnn_self_build = CNN_Self_Build(feature_dim=2048, dropout_rate=0.5)

# in_features_cnn_self_build = 2048

# cnn_model_no_att = VQAModel(
#     img_model=cnn_self_build,
#     img_in_features=in_features_cnn_self_build,
#     qu_vocab_size=qu_vocab_size,
#     ans_vocab_size=ans_vocab_size,
#     word_embed=word_embed,
#     hidden_size=hidden_size,
#     num_hidden=num_hidden,
#     feature_size=feature_size,
#     with_att=False,              
#     is_train_image=True,       
#     pad_idx=PAD_IDX
# )

# checkpoint_path = '/kaggle/input/dl_demo_df_na/pytorch/default/1/best_default_no_att (1).pt'

# checkpoint = torch.load(checkpoint_path, map_location=device)
# cnn_model_no_att.load_state_dict(checkpoint['model_state_dict'])
# cnn_model_no_att.to(device)
# cnn_model_no_att.eval()

In [None]:
# mobilenet = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.DEFAULT)
# mobilenet.classifier = nn.Identity()
# in_features_mobilenet = 1280

# mobilenet_model_with_att = VQAModel(
#     mobilenet, 
#     in_features_mobilenet, 
#     qu_vocab_size, 
#     ans_vocab_size,
#     word_embed, 
#     hidden_size, 
#     num_hidden, 
#     feature_size, 
#     with_att=True,       
#     is_train_image=False  
# )

# checkpoint_path = '/kaggle/input/dl_demo_mb_na/pytorch/default/1/best_mobile_net (3).pt'

# checkpoint = torch.load(checkpoint_path, map_location=device)
# mobilenet_model_with_att.load_state_dict(checkpoint['model_state_dict'])
# mobilenet_model_with_att.to(device)
# mobilenet_model_with_att.eval()


In [None]:
# resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
# resnet.fc = nn.Identity()
# in_features_resnet = 2048   

# resnet_model_no_att = VQAModel(
#     img_model=resnet,
#     img_in_features=in_features_resnet,
#     qu_vocab_size=qu_vocab_size,
#     ans_vocab_size=ans_vocab_size,
#     word_embed=word_embed,
#     hidden_size=hidden_size,
#     num_hidden=num_hidden,
#     feature_size=feature_size,
#     with_att=False,            
#     is_train_image=False,
#     pad_idx=PAD_IDX
# )
# checkpoint_path = '/kaggle/input/dl_demo_rn_na/pytorch/default/1/best_resnet_no_att.pt'

# checkpoint = torch.load(checkpoint_path, map_location=device)
# resnet_model_no_att.load_state_dict(checkpoint['model_state_dict'])
# resnet_model_no_att.to(device)
# resnet_model_no_att.eval()

In [None]:
# def tokenize_question(question, ques_vocab, max_qu_len=20):
#     tokens = [ques_vocab.get(word, ques_vocab.get("<unk>", 0)) for word in question.lower().split()]
#     if len(tokens) > max_qu_len:
#         tokens = tokens[:max_qu_len]
#     else:
#         tokens += [0] * (max_qu_len - len(tokens))
#     return tokens

In [None]:
# def idx_to_answer(index, ans_vocab_path):
#     with open(ans_vocab_path, 'r', encoding='utf-8') as f:
#         vocab = f.read().splitlines()
#     if 0 <= index < len(vocab):
#         return vocab[index]
#     else:
#         return "<unk>"

In [None]:
# def visualize_result(image_path, question, predicted_answer):
#     image = Image.open(image_path).convert("RGB")
#     plt.figure(figsize=(8, 6))
#     plt.imshow(image)
#     plt.axis('off')
#     plt.gca().add_patch(patches.Rectangle((0, 0), image.width, 100, linewidth=0, facecolor='black', alpha=0.6))
#     plt.text(5, 20, f"Câu hỏi: {question}", color='white', fontsize=12, verticalalignment='top')
#     plt.text(5, 60, f"Dự đoán: {predicted_answer}", color='yellow', fontsize=12, verticalalignment='top')
#     plt.show()


In [None]:
# def implement(model, image_path, ques_vocab_path, ans_vocab_path, transform, device, max_qu_len=20, max_ans_len=20):
#     model.to(device)
#     model.eval()

#     ques_vocab = load_vocab(ques_vocab_path)
#     with open(ans_vocab_path, 'r', encoding='utf-8') as f:
#         ans_vocab = f.read().splitlines()
#     ans_idx_to_word = {idx: word for idx, word in enumerate(ans_vocab)}
#     ans_word_to_idx = {word: idx for idx, word in enumerate(ans_vocab)}

#     pad_token_id = ques_vocab.get('<pad>', 0)
#     unk_token_id = ques_vocab.get('<unk>', 1)
#     start_token_id = ans_word_to_idx.get('<start>', 1)
#     end_token_id = ans_word_to_idx.get('<end>', 2)

#    
#     image = Image.open(image_path).convert("RGB")
#     image_tensor = transform(image).unsqueeze(0).to(device)

#     question = input("Nhập câu hỏi: ").strip().lower()
#     tokens = question.split()
#     question_ids = [ques_vocab.get(token, unk_token_id) for token in tokens]
#     question_ids = question_ids[:max_qu_len] + [pad_token_id] * max(0, max_qu_len - len(question_ids))
#     question_tensor = torch.tensor(question_ids, dtype=torch.long).unsqueeze(0).to(device)

#     answer_input = [start_token_id]

#     with torch.no_grad():
#         for _ in range(max_ans_len):
#             input_tensor = torch.tensor(answer_input, dtype=torch.long).unsqueeze(0).to(device)
#             output = model(image_tensor, question_tensor, input_tensor)  # [1, seq_len, vocab_size]
#             next_token_logits = output[0, -1]  # lấy ra token cuối cùng
#             next_token = torch.argmax(next_token_logits).item()

#             if next_token == end_token_id:
#                 break

#             answer_input.append(next_token)

#     predicted_ids = answer_input[1:]
#     predicted_words = []
#     for idx in predicted_ids:
#         word = ans_idx_to_word.get(idx, "<unk>")
#         if word in ["<pad>", "<eos>"]:
#             break
#         predicted_words.append(word)

#     predicted_answer = ' '.join(predicted_words)

#     print("\n===== KẾT QUẢ DỰ ĐOÁN =====")
#     print("Câu hỏi:", question)
#     print("Trả lời:", predicted_answer)

#     visualize_result(image_path, question, predicted_answer)
#     return predicted_answer


In [None]:
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# ])

----

In [None]:
# ques_vocab_path = "/kaggle/working/question_vocabs.txt"
# ans_vocab_path = "/kaggle/working/answer_vocabs.txt"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# image_path = "/kaggle/input/vqa-test/data/places/id_751.png"
# predicted_answer = implement(resnet_model_no_att, image_path, ques_vocab_path, ans_vocab_path, transform, device)