```
pandas==2.2.2
torch==2.6.0+cu124
torchaudio==2.6.0+cu124
turchvision==0.21.0+cu124
pillow==11.2.1
transformers==4.53.2
numpy==2.0.2
tqdm==4.67.1
scikit-learn==1.6.1
peft==0.16.0
```
python==3.11.13

OS==Ubuntu 22.04.4 LTS

In [None]:
!unzip -qq scpc_data.zip

In [None]:
!unzip -qq visual7w_images.zip

In [None]:
!unzip -qq dataset_v7w_telling.zip

In [5]:

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from tqdm.notebook import tqdm
import os
import logging
from sklearn.model_selection import train_test_split
import random
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from torch.amp import autocast, GradScaler
import warnings
warnings.filterwarnings('ignore')


In [6]:
seed = 41

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [7]:
class Config:
    def __init__(self):
        self.model_name = "laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
        self.processor_name = "laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
        self.train_csv_path = 'train_combined.csv'
        self.test_csv_path = 'test.csv'
        self.sample_submission_path = 'sample_submission.csv'
        self.output_submission_path = 'submission_pluto.csv'
        self.image_base_path = './'
        self.batch_size = 64  # Î∞∞Ïπò ÌÅ¨Í∏∞ Ï¶ùÍ∞Ä
        self.num_epochs = 6
        self.learning_rate = 1e-4  # ÌïôÏäµÎ•† Í∞êÏÜå
        self.weight_decay = 1.5e-2
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.num_choices = 4
        self.validation_split = 0.15
        self.patience = 3  # Early stopping
        self.save_best_model = True
        self.model_save_path = 'best_clip_vqa_model.pth'
        # LoRA ÏÑ§Ï†ï
        self.lora_r = 32  # rank
        self.lora_alpha = 64  # alpha
        self.lora_dropout = 0.1

        self.lora_target_modules = [
            "q_proj",
            "v_proj",
            #"k_proj",
            "out_proj",  # attention layers
           # "fc1", "fc2",  # feed forward layers
           # "visual_projection", #"text_projection"  # projection layers
        ]

In [8]:
config = Config()
print(f"Using device: {config.device}")
print(f"Model: {config.model_name}")
print(f"Batch size: {config.batch_size}")
print(f"Learning rate: {config.learning_rate}")

Using device: cuda
Model: laion/CLIP-ViT-L-14-laion2B-s32B-b82K
Batch size: 64
Learning rate: 0.0001


In [9]:
df = pd.read_csv(config.test_csv_path)
max_length = df['Question'].astype(str).str.len().max()
print("ÏµúÎåÄ Í∏ÄÏûê Ïàò(Q):", max_length)

max_length = df['A'].astype(str).str.len().max()
print("ÏµúÎåÄ Í∏ÄÏûê Ïàò(A):", max_length)

max_length = df['B'].astype(str).str.len().max()
print("ÏµúÎåÄ Í∏ÄÏûê Ïàò(B):", max_length)

max_length = df['C'].astype(str).str.len().max()
print("ÏµúÎåÄ Í∏ÄÏûê Ïàò(C):", max_length)

max_length = df['D'].astype(str).str.len().max()
print("ÏµúÎåÄ Í∏ÄÏûê Ïàò(D):", max_length)

ÏµúÎåÄ Í∏ÄÏûê Ïàò(Q): 117
ÏµúÎåÄ Í∏ÄÏûê Ïàò(A): 91
ÏµúÎåÄ Í∏ÄÏûê Ïàò(B): 102
ÏµúÎåÄ Í∏ÄÏûê Ïàò(C): 86
ÏµúÎåÄ Í∏ÄÏûê Ïàò(D): 93


In [10]:
max_word_count = df['Question'].astype(str).str.split().str.len().max()
print("ÏµúÎåÄ Îã®Ïñ¥ Ïàò(Question):", max_word_count)

max_word_count = df['A'].astype(str).str.split().str.len().max()
print("ÏµúÎåÄ Îã®Ïñ¥ Ïàò(A):", max_word_count)

max_word_count = df['B'].astype(str).str.split().str.len().max()
print("ÏµúÎåÄ Îã®Ïñ¥ Ïàò(B):", max_word_count)

max_word_count = df['C'].astype(str).str.split().str.len().max()
print("ÏµúÎåÄ Îã®Ïñ¥ Ïàò(C):", max_word_count)

max_word_count = df['D'].astype(str).str.split().str.len().max()
print("ÏµúÎåÄ Îã®Ïñ¥ Ïàò(D):", max_word_count)

ÏµúÎåÄ Îã®Ïñ¥ Ïàò(Question): 24
ÏµúÎåÄ Îã®Ïñ¥ Ïàò(A): 16
ÏµúÎåÄ Îã®Ïñ¥ Ïàò(B): 15
ÏµúÎåÄ Îã®Ïñ¥ Ïàò(C): 16
ÏµúÎåÄ Îã®Ïñ¥ Ïàò(D): 17


In [11]:
import json
def append_to_existing_csv_pandas(json_file_path, existing_csv_path, starting_id=None):
    """
    Í∏∞Ï°¥ CSV ÌååÏùºÏóê ÏÉàÎ°úÏö¥ Îç∞Ïù¥ÌÑ∞Î•º Ï∂îÍ∞ÄÌïòÎäî Ìï®Ïàò (pandas ÏÇ¨Ïö©)
    """

    # Í∏∞Ï°¥ CSV ÌååÏùº ÏùΩÍ∏∞
    existing_df = pd.read_csv(existing_csv_path, encoding='utf-8')

    # starting_idÍ∞Ä ÏßÄÏ†ïÎêòÏßÄ ÏïäÏúºÎ©¥ Í∏∞Ï°¥ Îç∞Ïù¥ÌÑ∞Ïùò ÎßàÏßÄÎßâ ID + 1Î°ú ÏÑ§Ï†ï
    if starting_id is None:
        starting_id =100

    # JSON ÌååÏùº ÏùΩÍ∏∞
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    rows = []
    current_id = starting_id

    # Í∞Å Ïù¥ÎØ∏ÏßÄÏôÄ QA ÏåçÏùÑ Ï≤òÎ¶¨
    for image in data['images']:
        for qa in image['qa_pairs']:
            choices = qa['multiple_choices'].copy()
            answer = qa['answer']

            if answer in choices:
                choices.remove(answer)

            while len(choices) < 3:
                choices.append("")

            choices = choices[:3]
            random.shuffle(choices)
            choices.append(answer)

            row = {
                'ID': current_id,
                'img_path': f'./images/{image["filename"]}',
                'Question': qa['question'],
                'A': choices[0] if choices[0] else "",
                'B': choices[1] if choices[1] else "",
                'C': choices[2] if choices[2] else "",
                'D': choices[3],
                'answer': 'D'
            }

            rows.append(row)
            current_id += 1

    # ÏÉàÎ°úÏö¥ DataFrame ÏÉùÏÑ±
    new_df = pd.DataFrame(rows)

    # Í∏∞Ï°¥ Îç∞Ïù¥ÌÑ∞ÏôÄ Ìï©ÏπòÍ∏∞
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)

    # CSV ÌååÏùºÎ°ú Ï†ÄÏû•
    combined_df.to_csv('train_combined.csv', index=False, encoding='utf-8')

    print(f"Ï∂îÍ∞Ä ÏôÑÎ£å: {len(new_df)}Í∞úÏùò ÌñâÏù¥ Í∏∞Ï°¥ CSVÏóê Ï∂îÍ∞ÄÎêòÏóàÏäµÎãàÎã§.")
    print(f"Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞: {len(combined_df)}Í∞ú Ìñâ")


In [12]:
append_to_existing_csv_pandas('dataset_v7w_telling.json','train.csv')

Ï∂îÍ∞Ä ÏôÑÎ£å: 139868Í∞úÏùò ÌñâÏù¥ Í∏∞Ï°¥ CSVÏóê Ï∂îÍ∞ÄÎêòÏóàÏäµÎãàÎã§.
Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞: 139928Í∞ú Ìñâ


In [13]:
class VQADataset(Dataset):
    def __init__(self, df, processor, image_base_path, is_train=True):
        self.df = df.reset_index(drop=True)
        self.processor = processor
        self.image_base_path = image_base_path
        self.is_train = is_train
        self.num_choices = config.num_choices

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_base_path, row["img_path"])
        question = row["Question"]
        choices = [row["A"], row["B"], row["C"], row["D"]]

        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            #logger.error(f"Error loading image {image_path}: {e}")
            # Í∏∞Î≥∏ Ïù¥ÎØ∏ÏßÄ ÏÉùÏÑ± (Í≤ÄÏùÄÏÉâ Ïù¥ÎØ∏ÏßÄ)
            print('error: Unable to open image')
            image = Image.new('RGB', (224, 224), color='black')

        # ÏßàÎ¨∏Í≥º Í∞Å ÏÑ†ÌÉùÏßÄÎ•º Í≤∞Ìï©Ìïú ÌÖçÏä§Ìä∏ ÏÉùÏÑ±
        texts = [f"Question: {question} Answer: {choice}" for choice in choices]

        # Ïù¥ÎØ∏ÏßÄÏôÄ ÌÖçÏä§Ìä∏ Ïù∏ÏΩîÎî©
        try:
            # Ïù¥ÎØ∏ÏßÄ Ïù∏ÏΩîÎî©
            image_inputs = self.processor(
                images=image,
                return_tensors="pt",
                do_rescale=True,
                do_normalize=True
            )

            # ÌÖçÏä§Ìä∏ Ïù∏ÏΩîÎî©
            text_inputs = self.processor(
                text=texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=77
            )

            result = {
                'pixel_values': image_inputs.pixel_values.squeeze(0),
                'input_ids': text_inputs.input_ids,
                'attention_mask': text_inputs.attention_mask,
                'id': row["ID"]
            }

            if self.is_train:
                label_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
                result['labels'] = torch.tensor(label_to_index[row["answer"]], dtype=torch.long)

            return result

        except Exception as e:

            raise e

In [14]:
def collate_fn(batch):
    """Î∞∞Ïπò Îç∞Ïù¥ÌÑ∞Î•º Ï†ÅÏ†àÌûà Í≤∞Ìï©"""
    if not batch:
        return None

    # Î∞∞Ïπò ÎÇ¥ Î™®Îì† ÌÖêÏÑúÏùò ÌÅ¨Í∏∞Î•º ÎßûÏ∂§
    pixel_values = torch.stack([item['pixel_values'] for item in batch])

    # ÌÖçÏä§Ìä∏ ÏûÖÎ†•Îì§Ïùò ÏµúÎåÄ Í∏∏Ïù¥ Ï∞æÍ∏∞
    max_length = 0
    for item in batch:
        max_length = max(max_length, item['input_ids'].shape[1])

    # Ìå®Îî©ÏùÑ Ï†ÅÏö©ÌïòÏó¨ Î™®Îì† ÌÖçÏä§Ìä∏ ÏûÖÎ†•Ïùò Í∏∏Ïù¥Î•º ÎßûÏ∂§
    padded_input_ids = []
    padded_attention_mask = []

    for item in batch:
        input_ids = item['input_ids']  # shape: (4, seq_len)
        attention_mask = item['attention_mask']  # shape: (4, seq_len)

        current_length = input_ids.shape[1]
        if current_length < max_length:
            # Ìå®Îî© Ï†ÅÏö©
            pad_size = max_length - current_length
            input_ids = torch.nn.functional.pad(input_ids, (0, pad_size), value=0)
            attention_mask = torch.nn.functional.pad(attention_mask, (0, pad_size), value=0)

        padded_input_ids.append(input_ids)
        padded_attention_mask.append(attention_mask)

    # Î∞∞Ïπò Ï∞®ÏõêÏúºÎ°ú Í≤∞Ìï©
    all_input_ids = torch.cat(padded_input_ids, dim=0)  # shape: (batch_size * 4, max_length)
    all_attention_mask = torch.cat(padded_attention_mask, dim=0)  # shape: (batch_size * 4, max_length)

    ids = [item['id'] for item in batch]

    result = {
        'pixel_values': pixel_values,
        'input_ids': all_input_ids,
        'attention_mask': all_attention_mask,
        'ids': ids
    }

    if 'labels' in batch[0]:
        labels = torch.stack([item['labels'] for item in batch])
        result['labels'] = labels

    return result

In [15]:
class CLIPVQAModel(nn.Module):
    """CLIP Í∏∞Î∞ò VQA Î™®Îç∏"""
    def __init__(self, clip_model_name, num_choices=4, lora_config=None):
        super().__init__()
        self.clip_model = CLIPModel.from_pretrained(
            clip_model_name,
            use_safetensors=False,
            #load_in_4bit=True,  # ÎòêÎäî load_in_8bit=True
            #device_map="auto",
            )
        self.num_choices = num_choices

        #self.clip_model = prepare_model_for_kbit_training(self.clip_model)

            # LoRA Ï†ÅÏö©
        self.clip_model = get_peft_model(self.clip_model, lora_config)

        print("LoRA Î™®Îç∏ Ï†ïÎ≥¥:")
        self.clip_model.print_trainable_parameters()

    def forward(self, pixel_values, input_ids, attention_mask):
        batch_size = pixel_values.shape[0]

        # Ïù¥ÎØ∏ÏßÄ ÌäπÏÑ± Ï∂îÏ∂ú
        image_features = self.clip_model.get_image_features(pixel_values)

        # ÌÖçÏä§Ìä∏ ÌäπÏÑ± Ï∂îÏ∂ú
        text_features = self.clip_model.get_text_features(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # ÌäπÏÑ± Ï†ïÍ∑úÌôî
        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

        # ÌÖçÏä§Ìä∏ ÌäπÏÑ±ÏùÑ (batch_size, num_choices, embedding_dim)Î°ú Ïû¨Íµ¨ÏÑ±
        text_features = text_features.view(batch_size, self.num_choices, -1)

        # Ïú†ÏÇ¨ÎèÑ Í≥ÑÏÇ∞
        # image_features: (batch_size, embedding_dim)
        # text_features: (batch_size, num_choices, embedding_dim)
        logits = torch.bmm(
            image_features.unsqueeze(1),
            text_features.transpose(1, 2)
        ).squeeze(1)

        # Ïò®ÎèÑ Ïä§ÏºÄÏùºÎßÅ Ï†ÅÏö©
        logit_scale = self.clip_model.logit_scale.exp()
        logits = logit_scale * logits

        return logits

In [16]:
scaler = GradScaler('cuda')
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Ìïú ÏóêÌè¨ÌÅ¨ ÌõàÎ†®"""
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    progress_bar = tqdm(dataloader, desc="Training")

    for batch in progress_bar:
        if batch is None:
            continue

        # Îç∞Ïù¥ÌÑ∞Î•º ÎîîÎ∞îÏù¥Ïä§Î°ú Ïù¥Îèô
        pixel_values = batch['pixel_values'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # ÏàúÏ†ÑÌåå
        #logits = model(pixel_values, input_ids, attention_mask)
        #loss = criterion(logits, labels)

        # Ïó≠Ï†ÑÌåå
        #loss.backward()
        #optimizer.step()

        with autocast('cuda'):
          logits=model(pixel_values, input_ids, attention_mask)
          loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # ÌÜµÍ≥Ñ ÏóÖÎç∞Ïù¥Ìä∏
        total_loss += loss.item()
        pred = logits.argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)

        # ÏßÑÌñâÎ•† ÌëúÏãúÏ§Ñ ÏóÖÎç∞Ïù¥Ìä∏
        progress_bar.set_postfix({
            'loss': total_loss / (progress_bar.n + 1),
            'acc': 100. * correct / total
        })

    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / total

    return avg_loss, accuracy

def validate_epoch(model, dataloader, criterion, device):
    """Ìïú ÏóêÌè¨ÌÅ¨ Í≤ÄÏ¶ù"""
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Validation")

        for batch in progress_bar:
            if batch is None:
                continue

            pixel_values = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            #logits = model(pixel_values, input_ids, attention_mask)
            #loss = criterion(logits, labels)

            with autocast('cuda'):
              logits = model(pixel_values, input_ids, attention_mask)
              loss = criterion(logits, labels)

            total_loss += loss.item()
            pred = logits.argmax(dim=1)
            correct += (pred == labels).sum().item()
            total += labels.size(0)

            progress_bar.set_postfix({
                'loss': total_loss / (progress_bar.n + 1),
                'acc': 100. * correct / total
            })

    avg_loss = total_loss / len(dataloader)
    accuracy = 100. * correct / total

    return avg_loss, accuracy

In [17]:
def create_lora_config(config):
    """LoRA ÏÑ§Ï†ï ÏÉùÏÑ±"""
    lora_config = LoraConfig(
        r=config.lora_r,
        lora_alpha=config.lora_alpha,
        target_modules=config.lora_target_modules,
        lora_dropout=config.lora_dropout,
        bias="none",
        task_type=TaskType.FEATURE_EXTRACTION,
    )
    return lora_config

In [None]:
def main():
    # Îç∞Ïù¥ÌÑ∞ Î°úÎìú
    train_df = pd.read_csv(config.train_csv_path)
    test_df = pd.read_csv(config.test_csv_path)

    # ÌõàÎ†®/Í≤ÄÏ¶ù Î∂ÑÌï†
    train_data, val_data = train_test_split(
        train_df,
        test_size=config.validation_split,
        random_state=seed,
        stratify=train_df['answer']
    )

    print(f"Training samples: {len(train_data)}")
    print(f"Validation samples: {len(val_data)}")
    print(f"Test samples: {len(test_df)}")

    # ÌîÑÎ°úÏÑ∏ÏÑú Î°úÎìú
    processor = CLIPProcessor.from_pretrained(config.processor_name)

    # Îç∞Ïù¥ÌÑ∞ÏÖã ÏÉùÏÑ±
    train_dataset = VQADataset(train_data, processor, config.image_base_path, is_train=True)
    val_dataset = VQADataset(val_data, processor, config.image_base_path, is_train=True)
    test_dataset = VQADataset(test_df, processor, config.image_base_path, is_train=False)

    # Îç∞Ïù¥ÌÑ∞Î°úÎçî ÏÉùÏÑ±
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2
    )
    lora_config = create_lora_config(config)
    # Î™®Îç∏ ÏÉùÏÑ±
    model = CLIPVQAModel(config.model_name, config.num_choices,lora_config).to(config.device)

    # ÏòµÌã∞ÎßàÏù¥Ï†Ä Î∞è ÏÜêÏã§ Ìï®Ïàò
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Ïä§ÏºÄÏ§ÑÎü¨ (ÏÑ†ÌÉùÏÇ¨Ìï≠)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=2,
        verbose=True
    )

    # ÌõàÎ†® Î£®ÌîÑ
    best_val_acc = 0
    patience_counter = 0

    print("=" * 60)
    print("Starting Training...")
    print("=" * 60)

    for epoch in range(config.num_epochs):
        print(f"\nEpoch {epoch + 1}/{config.num_epochs}")
        print("-" * 40)

        # ÌõàÎ†®
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, config.device)

        # Í≤ÄÏ¶ù
        val_loss, val_acc = validate_epoch(model, val_loader, criterion, config.device)

        # Ïä§ÏºÄÏ§ÑÎü¨ ÏóÖÎç∞Ïù¥Ìä∏
        scheduler.step(val_loss)

        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

        # Î≤†Ïä§Ìä∏ Î™®Îç∏ Ï†ÄÏû•
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0

            if config.save_best_model:
                torch.save(model.state_dict(), config.model_save_path)
                print(f"New best model saved with validation accuracy: {val_acc:.2f}%")
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= config.patience:
            print(f"Early stopping triggered after {epoch + 1} epochs")
            break
    print('Train finish...')
    '''
    # Î≤†Ïä§Ìä∏ Î™®Îç∏ Î°úÎìú
    if config.save_best_model and os.path.exists(config.model_save_path):
        model.load_state_dict(torch.load(config.model_save_path))
        print(f"Loaded best model with validation accuracy: {best_val_acc:.2f}%")

    # ÌÖåÏä§Ìä∏ Ï∂îÎ°†
    print("\n" + "=" * 60)
    print("Starting Inference...")
    print("=" * 60)

    model.eval()
    predictions = []
    ids = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            if batch is None:
                continue

            pixel_values = batch['pixel_values'].to(config.device)
            input_ids = batch['input_ids'].to(config.device)
            attention_mask = batch['attention_mask'].to(config.device)

            logits = model(pixel_values, input_ids, attention_mask)
            pred_indices = logits.argmax(dim=1).cpu().numpy()

            label_map = {0: "A", 1: "B", 2: "C", 3: "D"}
            batch_predictions = [label_map[idx] for idx in pred_indices]

            predictions.extend(batch_predictions)
            ids.extend(batch['ids'])

    # Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ±
    submission_df = pd.DataFrame({'ID': ids, 'answer': predictions})
    submission_df.to_csv(config.output_submission_path, index=False)

    print(f"‚úÖ Submission saved to {config.output_submission_path}")
    print(f"üìä Final Results:")
    print(f"   - Best Validation Accuracy: {best_val_acc:.2f}%")
    print(f"   - Total Predictions: {len(predictions)}")
    '''
if __name__ == "__main__":
    main()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Training samples: 118938
Validation samples: 20990
Test samples: 852


preprocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

LoRA Î™®Îç∏ Ï†ïÎ≥¥:
trainable params: 6,488,064 || all params: 434,104,577 || trainable%: 1.4946
Starting Training...

Epoch 1/6
----------------------------------------


Training:   0%|          | 0/1859 [00:00<?, ?it/s]

Validation:   0%|          | 0/328 [00:00<?, ?it/s]

Train Loss: 0.8134, Train Acc: 75.19%
Val Loss: 0.7728, Val Acc: 77.59%
New best model saved with validation accuracy: 77.59%

Epoch 2/6
----------------------------------------


Training:   0%|          | 0/1859 [00:00<?, ?it/s]

Validation:   0%|          | 0/328 [00:00<?, ?it/s]

Train Loss: 0.6921, Train Acc: 82.73%
Val Loss: 0.7643, Val Acc: 78.37%
New best model saved with validation accuracy: 78.37%

Epoch 3/6
----------------------------------------


Training:   0%|          | 0/1859 [00:00<?, ?it/s]

Validation:   0%|          | 0/328 [00:00<?, ?it/s]

Train Loss: 0.5872, Train Acc: 89.71%
Val Loss: 0.7929, Val Acc: 77.78%

Epoch 4/6
----------------------------------------


Training:   0%|          | 0/1859 [00:00<?, ?it/s]

Validation:   0%|          | 0/328 [00:00<?, ?it/s]

Train Loss: 0.5012, Train Acc: 95.25%
Val Loss: 0.8320, Val Acc: 76.60%

Epoch 5/6
----------------------------------------


Training:   0%|          | 0/1859 [00:00<?, ?it/s]

Validation:   0%|          | 0/328 [00:00<?, ?it/s]

Train Loss: 0.4523, Train Acc: 97.84%
Val Loss: 0.8511, Val Acc: 76.16%
Early stopping triggered after 5 epochs
Train finish...
Loaded best model with validation accuracy: 78.37%

Starting Inference...


Testing:   0%|          | 0/14 [00:00<?, ?it/s]

‚úÖ Submission saved to submission_pluto.csv
üìä Final Results:
   - Best Validation Accuracy: 78.37%
   - Total Predictions: 852


CLIP paper: https://arxiv.org/abs/2103.00020

VIsual7W paper: https://arxiv.org/abs/1511.03416

https://github.com/yukezhu/visual7w-toolkit

https://ai.stanford.edu/~yukez/visual7w/