In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
!pip install wandb



In [None]:
import wandb

# Credentials must never be written into the notebook. Called without a key,
# wandb.login() uses the WANDB_API_KEY environment variable or an already stored
# login when available and otherwise asks for the key interactively.
# Any key that was ever committed in this cell has to be revoked in the W&B settings.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import argparse
import numpy as np
from datasets import load_dataset
from transformers import (
    ViltForImagesAndTextClassification,
    ViltForQuestionAnswering,
    ViltProcessor,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)

from sklearn.metrics import f1_score
import torch
from datasets import ClassLabel
from functools import partial
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence


# Root folder for everything this notebook persists. The path sits under Colab's
# /content/drive mount point, so the Drive has to be mounted before running.
MOUNT_PATH = "/content/drive/MyDrive/stat-sw-proj"

# Static run configuration. Paths are derived from MOUNT_PATH so the root folder
# is only spelled out once.
CONFIG = {
    "model_name": "dandelin/vilt-b32-mlm",  # checkpoint passed to from_pretrained
    "do_train": True,
    "do_eval": True,
    "learning_rate": 2e-5,
    "train_batch_size": 64,  # used as the per-device train batch size
    "eval_batch_size": 64,   # used as the per-device eval batch size
    "num_train_epochs": 3,
    "logging_steps": 10,
    "save_steps": 100,
    "output_dir": f"{MOUNT_PATH}/vilt_model",
    "cache_dir": f"{MOUNT_PATH}/cache_dir",
}

def custom_collate_fn(examples):
    """
    Collate preprocessed examples into a single batch of tensors.

    Each example is a mapping with the keys "pixel_values", "input_ids",
    "attention_mask" and "labels". A value may already be a tensor or may be
    anything `torch.as_tensor` accepts (nested lists, NumPy arrays, Python ints).

    Images are zero-padded at the bottom/right to the largest height and width in
    the batch, so examples whose images differ in size can share a batch. A
    "pixel_mask" marking real pixels with 1 and padding with 0 is returned with
    them. Token sequences are right-padded with 0 to the longest sequence.

    Assumes every image is laid out as (channels, height, width) with the same
    number of channels -- TODO confirm against the processor output.

    Args:
        examples: non-empty sequence of example mappings.

    Returns:
        dict with "pixel_values" (B, C, H, W), "pixel_mask" (B, H, W),
        "input_ids" (B, T), "attention_mask" (B, T) and "labels" (stacked).

    Raises:
        ValueError: if `examples` is empty.
    """
    if not examples:
        raise ValueError("custom_collate_fn received an empty list of examples")

    def _as_tensors(key):
        # as_tensor leaves tensors untouched and converts lists / arrays / scalars.
        return [torch.as_tensor(example[key]) for example in examples]

    images = _as_tensors("pixel_values")
    input_ids = _as_tensors("input_ids")
    attention_mask = _as_tensors("attention_mask")
    labels = _as_tensors("labels")

    # Pad every image into a common canvas instead of torch.stack, which only
    # works when all images already share one shape.
    batch_size = len(images)
    channels = images[0].shape[0]
    max_height = max(image.shape[-2] for image in images)
    max_width = max(image.shape[-1] for image in images)
    pixel_values = images[0].new_zeros((batch_size, channels, max_height, max_width))
    pixel_mask = torch.zeros((batch_size, max_height, max_width), dtype=torch.long)
    for row, image in enumerate(images):
        height, width = image.shape[-2:]
        pixel_values[row, :, :height, :width] = image
        pixel_mask[row, :height, :width] = 1

    # Create the batch
    batch = {
        "pixel_values": pixel_values,
        "pixel_mask": pixel_mask,
        "input_ids": pad_sequence(input_ids, batch_first=True),
        "attention_mask": pad_sequence(attention_mask, batch_first=True),
        "labels": torch.stack(labels),
    }
    return batch

def preprocess(example, label2id):
    """
    Turn one raw example into model-ready features.

    The module-level `processor` encodes the example's 'question' and 'image'
    (truncating text to at most 36 tokens), the leading batch axis is squeezed
    out of every returned tensor, and the example's 'answer' is mapped to its
    id through `label2id` and stored under 'labels'.

    Raises:
        KeyError: if the answer has no entry in `label2id`.
    """
    encoded = processor(
        text=example['question'],
        images=example['image'],
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=36,  # explicit cap on the text length
    )

    # The processor returns tensors with a batch axis of size one; drop it.
    features = {}
    for name, tensor in encoded.items():
        features[name] = tensor.squeeze(0)

    features['labels'] = label2id[example['answer']]
    return features

def compute_metrics(pred):
    """
    Compute the weighted F1 score for an evaluation pass.

    `pred.predictions` holds per-class scores; the arg-max along axis 1 is taken
    as the predicted class and compared with `pred.label_ids`.

    Returns:
        dict with the single key "f1".
    """
    true_ids = pred.label_ids
    predicted_ids = np.argmax(pred.predictions, axis=1)
    return {"f1": f1_score(true_ids, predicted_ids, average="weighted")}

class CustomTrainer(Trainer):
    """Trainer that computes its own cross-entropy loss from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Cross-entropy loss between the model's logits and the batch labels.

        "labels" is removed from `inputs` before the forward pass, so the model
        is called without labels. Extra keyword arguments (for example
        num_items_in_batch) are accepted and ignored.

        Returns:
            The loss, or a (loss, outputs) tuple when `return_outputs` is true.
        """
        targets = inputs.pop("labels")
        model_outputs = model(**inputs)
        loss = nn.functional.cross_entropy(model_outputs.logits, targets)
        if return_outputs:
            return loss, model_outputs
        return loss



args = CONFIG

# Load SLAKE dataset
slake_data = load_dataset("mdwiratathya/SLAKE-vqa-english")

# Initialize the processor. preprocess() reads it as a module-level name, so it
# must exist before the datasets are mapped.
processor = ViltProcessor.from_pretrained(args['model_name'], cache_dir=args['cache_dir'])

# Collect every distinct answer across the three splits and sort them, so each
# answer gets the same id on every run. Iterating a bare set of strings does not
# guarantee a stable order between interpreter sessions, while the preprocessed
# splits below are reloaded from fixed cache files by name; an unstable order would
# let cached label ids disagree with the model's id2label.
# key=str keeps the sort from failing should a split contain a missing answer.
unique_answers = sorted(
    set(slake_data['train'].unique('answer'))
    | set(slake_data['validation'].unique('answer'))
    | set(slake_data['test'].unique('answer')),
    key=str,
)

label2id = {label: idx for idx, label in enumerate(unique_answers)}
id2label = {idx: label for label, idx in label2id.items()}

# Initialize model with correct number of labels
model = ViltForQuestionAnswering.from_pretrained(
    args['model_name'],
    num_labels=len(unique_answers),
    id2label=id2label,
    label2id=label2id,
    cache_dir=args['cache_dir']
)
model.config.problem_type = "single_label_classification"

# Preprocess datasets
# NOTE(review): an existing cache file is reused by name. Cache files written by a
# run that used a different label mapping should be deleted before re-running.
preprocess_fn = partial(preprocess, label2id=label2id)
train_data = slake_data['train'].map(
    preprocess_fn,
    load_from_cache_file=True,
    cache_file_name="/content/drive/MyDrive/stat-sw-proj/train",
    remove_columns=slake_data['train'].column_names
)

test_data = slake_data['test'].map(
    preprocess_fn,
    load_from_cache_file=True,
    cache_file_name="/content/drive/MyDrive/stat-sw-proj/test",
    remove_columns=slake_data['test'].column_names
)

# Define training arguments
# NOTE(review): recent transformers releases name the first strategy argument
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir=args["output_dir"],
    per_device_train_batch_size=args['train_batch_size'],
    per_device_eval_batch_size=args['eval_batch_size'],
    num_train_epochs=args['num_train_epochs'],
    evaluation_strategy="epoch" if args['do_eval'] else "no",
    save_steps=args['save_steps'],
    logging_steps=args['logging_steps'],
    learning_rate=args['learning_rate'],
    # The collate function expects every preprocessed column, so none are dropped.
    remove_unused_columns=False,
    # Mixed precision is only requested when a CUDA device is present.
    fp16=torch.cuda.is_available()
)

# Initialize trainer with custom collate function
# NOTE(review): the test split doubles as the per-epoch evaluation set; the
# validation split is loaded but only used to collect answers.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data if args['do_train'] else None,
    eval_dataset=test_data if args['do_eval'] else None,
    compute_metrics=compute_metrics,
    data_collator=custom_collate_fn
)

# Train and evaluate
if args['do_train']:
    trainer.train()
    # Save the fine-tuned model
    trainer.save_model(args['output_dir'])

    # Save the processor (tokenizer + feature extractor) to the same directory
    processor.save_pretrained(args['output_dir'])

if args['do_eval']:
    eval_result = trainer.evaluate(eval_dataset=test_data)
    print(f"Test Evaluation results: {eval_result}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-mlm and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.1.bias', 'classifier.1.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmj868[0m ([33mmj868-rutgers-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1
1,1.3573,1.499743,0.622789
2,1.7313,1.147463,0.693393
3,0.4168,1.061808,0.716681


Test Evaluation results: {'eval_loss': 1.0618702173233032, 'eval_f1': 0.7166807502841986, 'eval_runtime': 37.0933, 'eval_samples_per_second': 2.696, 'eval_steps_per_second': 2.696, 'epoch': 3.0}


In [None]:
# Write the model held by `trainer` to the configured output directory.
# The training cell performs the same two saves when `do_train` is true; this cell
# repeats them and relies on `trainer`, `args` and `processor` left in the kernel
# by that cell.
trainer.save_model(args['output_dir'])

# Save the processor (tokenizer + feature extractor) to the same directory
processor.save_pretrained(args['output_dir'])

[]