In [None]:
import json
import os
import tempfile

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import wandb
from huggingface_hub import HfApi, login
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BeitModel, BertModel, BeitImageProcessor, AutoTokenizer, get_linear_schedule_with_warmup

In [None]:
# Pull the service credentials out of a local JSON file so they never have to be
# typed into a cell.
with open("account_config.json", "r") as config_file:
    config = json.load(config_file)

# Each value is None when its key is absent from the file.
wandb_key, hf_token = config.get("wandb_key"), config.get("hf_token")

In [5]:
# Authenticate with the credentials loaded from account_config.json rather than
# literals: anything pasted into a cell is stored in the notebook file and leaks
# as soon as the notebook is shared or committed.
# NOTE(review): the W&B key and Hugging Face token that used to be hardcoded in
# this cell must be treated as compromised -- revoke and re-issue them.
if not wandb_key or not hf_token:
    raise ValueError(
        "account_config.json must define both 'wandb_key' and 'hf_token'."
    )
wandb.login(key=wandb_key)
login(hf_token)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mppdddd00123[0m ([33mppddddpp[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
# Dataset locations on the Kaggle input mount.
# Question/answer tables, one CSV per split:
TRAIN_CSV = "/kaggle/input/final-vqa-dataset/train.csv"
VAL_CSV = "/kaggle/input/final-vqa-dataset/val.csv"
# Root directory that the relative image paths in the CSVs are resolved against:
IMAGE_FOLDER = "/kaggle/input/vqa-dataset/vqa_dataset/images"

In [7]:
def load_data(csv_path):
    """Read the CSV file at ``csv_path`` and return its contents as a DataFrame."""
    return pd.read_csv(csv_path)

# Materialise the training and validation tables.
train_df, val_df = load_data(TRAIN_CSV), load_data(VAL_CSV)

In [8]:
# Build the label space from the distinct answers seen in the training split.
# Sorting keeps the answer -> index assignment stable between runs.
unique_answers = sorted(set(train_df["answer"].tolist()))
answer2id = dict(zip(unique_answers, range(len(unique_answers))))
id2answer = dict(enumerate(unique_answers))
num_labels = len(answer2id)

# Text side: BERT tokenizer. Image side: BEiT (base, patch16, 224) image processor.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
image_processor = BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

  return func(*args, **kwargs)


In [9]:
class VQADataset(Dataset):
    """Map-style dataset that yields one image/question/answer sample per table row.

    Args:
        dataframe: Table with ``image_path``, ``question`` and ``answer`` columns.
        image_processor: Callable invoked as ``image_processor(image, return_tensors="pt")``
            whose result exposes a ``"pixel_values"`` tensor.
        tokenizer: Callable invoked on the question string whose result exposes
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        answer2id: Mapping from answer value to integer class index.
        base_image_dir: Directory that the relative ``image_path`` values are
            resolved against.
        max_question_length: Length every tokenized question is padded/truncated
            to. Defaults to 50.
    """

    def __init__(self, dataframe, image_processor, tokenizer, answer2id, base_image_dir,
                 max_question_length=50):
        self.data = dataframe
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.answer2id = answer2id
        self.base_image_dir = base_image_dir
        self.max_question_length = max_question_length

    def __len__(self):
        """Return the number of rows (samples) in the underlying table."""
        return len(self.data)

    def _resolve_image_path(self, image_path):
        """Join a '/'-separated relative image path onto ``base_image_dir``.

        Works for any nesting depth ("category/name.jpg", "a/b/name.jpg",
        "name.jpg"); empty segments produced by stray slashes are ignored.
        """
        segments = [segment for segment in image_path.split("/") if segment]
        return os.path.join(self.base_image_dir, *segments)

    def __getitem__(self, idx):
        """Load and preprocess the sample stored at positional index ``idx``.

        Returns:
            dict with keys ``"image"``, ``"input_ids"``, ``"attention_mask"``
            (each with the processor's leading batch dimension squeezed away)
            and ``"label"`` (a scalar ``torch.long`` tensor).
        """
        row = self.data.iloc[idx]
        full_image_path = self._resolve_image_path(row["image_path"])

        # Close the file handle deterministically; convert() returns an
        # independent in-memory image, so it stays valid after the file is closed.
        with Image.open(full_image_path) as raw_image:
            image = raw_image.convert("RGB")

        pixel_values = self.image_processor(image, return_tensors="pt")["pixel_values"]
        encoded_question = self.tokenizer(
            row["question"],
            padding="max_length",
            truncation=True,
            max_length=self.max_question_length,
            return_tensors="pt",
        )

        # NOTE(review): answers missing from answer2id silently fall back to
        # class 0, which is also a real answer class -- validation rows with
        # unseen answers are therefore scored against the wrong label.
        label = torch.tensor(self.answer2id.get(row["answer"], 0), dtype=torch.long)

        return {
            "image": pixel_values.squeeze(0),
            "input_ids": encoded_question["input_ids"].squeeze(0),
            "attention_mask": encoded_question["attention_mask"].squeeze(0),
            "label": label,
        }

In [10]:
# Wrap each split in a VQADataset, then batch it; only the training split is shuffled.
dataset_train = VQADataset(
    dataframe=train_df,
    image_processor=image_processor,
    tokenizer=tokenizer,
    answer2id=answer2id,
    base_image_dir=IMAGE_FOLDER,
)
dataset_val = VQADataset(
    dataframe=val_df,
    image_processor=image_processor,
    tokenizer=tokenizer,
    answer2id=answer2id,
    base_image_dir=IMAGE_FOLDER,
)
train_loader = DataLoader(dataset=dataset_train, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=dataset_val, batch_size=16, shuffle=False)

In [13]:
class BEiTForVQA(nn.Module):
    """Answer classifier that fuses a BEiT image encoder with a BERT text encoder.

    The feature at position 0 of each encoder's ``last_hidden_state`` is
    projected to a shared width, the two projections are summed, passed through
    ReLU and fed to a linear classifier over the answer vocabulary.

    Args:
        num_labels: Number of answer classes (size of the output layer).
        fusion_dim: Width of the shared space both modalities are projected
            into. Defaults to 512.
    """

    def __init__(self, num_labels, fusion_dim=512):
        super().__init__()
        self.beit = BeitModel.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        # Read the encoder widths from their configs instead of hard-coding 768,
        # so the projections always match the checkpoints that were loaded.
        self.image_encoder = nn.Linear(self.beit.config.hidden_size, fusion_dim)
        self.text_encoder_linear = nn.Linear(self.text_encoder.config.hidden_size, fusion_dim)
        self.classifier = nn.Linear(fusion_dim, num_labels)

    def forward(self, image, input_ids, attention_mask):
        """Return unnormalised answer logits for a batch.

        Args:
            image: Pixel tensor passed straight to the BEiT encoder.
            input_ids: Token ids passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.

        Returns:
            Tensor of logits whose last dimension has ``num_labels`` entries.
        """
        # Position 0 of the sequence is used as the summary feature of each modality.
        image_features = self.beit(image).last_hidden_state[:, 0, :]
        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state[:, 0, :]
        fusion = torch.relu(self.image_encoder(image_features) + self.text_encoder_linear(text_features))
        return self.classifier(fusion)

    def save_pretrained(self, save_directory):
        """Write the model weights to ``save_directory/pytorch_model.bin``.

        A plain ``nn.Module`` has no ``save_pretrained``; this method provides
        it so code that calls ``model.save_pretrained(path)`` works. The
        directory is created if needed. Reload with
        ``model.load_state_dict(torch.load(<file>))``.
        """
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))

In [14]:
# Training loop
def train(model, train_loader, optimizer, criterion, scheduler, num_epochs=3):
    """Fine-tune ``model`` and write a checkpoint after every epoch.

    Args:
        model: Module called as ``model(**inputs)`` on every batch key except ``"label"``.
        train_loader: Iterable of dict batches that include a ``"label"`` tensor.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        scheduler: LR scheduler stepped once per batch, right after the optimizer.
        num_epochs: Number of passes over ``train_loader``. Defaults to 3.

    Raises:
        ValueError: If ``train_loader`` yields no batches.

    Side effects: logs the mean epoch loss to wandb, prints it, and calls
    ``save_checkpoint`` (with the notebook-level ``tokenizer``) after each epoch.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail up front instead of dividing by zero when averaging the loss.
        raise ValueError("train_loader is empty; nothing to train on.")

    # Move batches to wherever the model lives rather than depending on a
    # notebook global `device` that is only defined in a later cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            optimizer.zero_grad()
            inputs = {k: v.to(target_device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(target_device)
            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            batch_loss = loss.item()
            total_loss += batch_loss
            progress_bar.set_postfix({"loss": batch_loss})
        mean_loss = total_loss / num_batches
        wandb.log({"train_loss": mean_loss})
        print(f"Epoch {epoch+1}: Train Loss = {mean_loss:.4f}")
        save_checkpoint(model, tokenizer, optimizer, scheduler, epoch)

# Checkpoint functions
def save_checkpoint(model, tokenizer, optimizer, scheduler, epoch, output_dir="/kaggle/working/beit3-vqa-checkpoints"):
    """Persist model, tokenizer, optimizer and scheduler state for one epoch.

    Everything is written to ``<output_dir>/checkpoint-epoch-<epoch>``.

    Args:
        model: Module to save. Its own ``save_pretrained`` is used when it has
            one; otherwise its ``state_dict`` is written to ``pytorch_model.bin``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        optimizer: Optimizer whose ``state_dict`` goes to ``optimizer.pt``.
        scheduler: Scheduler whose ``state_dict`` goes to ``scheduler.pt``.
        epoch: Epoch identifier embedded in the directory name.
        output_dir: Parent directory for all checkpoints.
    """
    checkpoint_path = os.path.join(output_dir, f"checkpoint-epoch-{epoch}")
    # Create the checkpoint directory itself: torch.save below does not create
    # missing parent directories.
    os.makedirs(checkpoint_path, exist_ok=True)

    if hasattr(model, "save_pretrained"):
        model.save_pretrained(checkpoint_path)
    else:
        # Plain nn.Module instances have no save_pretrained; store raw weights.
        torch.save(model.state_dict(), os.path.join(checkpoint_path, "pytorch_model.bin"))

    tokenizer.save_pretrained(checkpoint_path)
    torch.save(optimizer.state_dict(), os.path.join(checkpoint_path, "optimizer.pt"))
    torch.save(scheduler.state_dict(), os.path.join(checkpoint_path, "scheduler.pt"))
    print(f"Checkpoint saved at {checkpoint_path}")

In [15]:
# Set up the model, optimiser, learning-rate schedule, loss and experiment tracking.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = BEiTForVQA(num_labels).to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=5e-5, weight_decay=0.01)
# The linear decay spans 3 passes over the training loader, which has to match
# the number of epochs train() actually runs (its default is 3).
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * 3,
)
criterion = nn.CrossEntropyLoss()
wandb.init(project="beit3-vqa", name="beit3-vqa-run")

Some weights of BeitModel were not initialized from the model checkpoint at microsoft/beit-base-patch16-224-pt22k and are newly initialized: ['beit.pooler.layernorm.bias', 'beit.pooler.layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Start fine-tuning; the epoch count is left at train()'s default.
train(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
)

Epoch 1/3:   1%|          | 56/10331 [28:06<86:15:10, 30.22s/it, loss=1.88]

In [None]:
# Save the final model weights and tokenizer, then close the wandb run.
final_model_path = "/kaggle/working/beit3-vqa-model"
os.makedirs(final_model_path, exist_ok=True)
if hasattr(model, "save_pretrained"):
    model.save_pretrained(final_model_path)
else:
    # A plain nn.Module has no save_pretrained; store its weights directly.
    torch.save(model.state_dict(), os.path.join(final_model_path, "pytorch_model.bin"))
tokenizer.save_pretrained(final_model_path)
wandb.finish()
print("Training complete and model saved!")

In [None]:
# Push the model weights and tokenizer files to a private Hugging Face Hub repo.
HUGGINGFACE_MODEL_ID = "ppdddd/beit3-vqa-finetuned"

# Re-authenticate only when a token is supplied through the environment;
# otherwise the credentials stored by the earlier login cell are reused
# (calling login(token=None) would block on an interactive prompt).
hub_token = os.getenv("HUGGINGFACE_TOKEN")
if hub_token:
    login(token=hub_token)

# The model is a plain nn.Module and has no push_to_hub, so export the
# artefacts to a scratch directory and upload that folder in a single commit.
hub_api = HfApi()
hub_api.create_repo(repo_id=HUGGINGFACE_MODEL_ID, private=True, exist_ok=True)
with tempfile.TemporaryDirectory() as export_dir:
    torch.save(model.state_dict(), os.path.join(export_dir, "pytorch_model.bin"))
    tokenizer.save_pretrained(export_dir)
    hub_api.upload_folder(repo_id=HUGGINGFACE_MODEL_ID, folder_path=export_dir)
print(f"Model pushed to Hugging Face Hub: {HUGGINGFACE_MODEL_ID}")