In [24]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoImageProcessor,
    CLIPModel,
    TrainingArguments,
    Trainer,
    set_seed,
)
from peft import LoraConfig, get_peft_model
from torchvision.transforms import (
    Resize, CenterCrop, ToTensor, Normalize, Compose, 
    InterpolationMode, RandomResizedCrop
)
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

Device: cuda
GPU: Tesla T4


In [25]:
EXPERIMENT_MODE = "full" 

DATASET_NAME = "AnyModal/flickr30k"
IMAGE_COLUMN = "image"
CAPTION_COLUMN = "original_alt_text"
MODEL_NAME = "openai/clip-vit-base-patch32"
OUTPUT_DIR = "./clip-flickr30k-lora"
SEED = 42

CONFIGS = {
    "fast": {
        "desc": "Quick test",
        "use_subset": True,
        "subset_size": 10000,
        "batch_size": 64,
        "grad_accum": 1,
        "lr": 5e-5,
        "epochs": 1,
        "lora_r": 16,
        "lora_modules": ["q_proj", "v_proj"],
        "save_steps": 100,
        "eval_steps": 100,
    },
    "full": {
        "desc": "Best quality",
        "use_subset": False,
        "subset_size": None,
        "batch_size": 32,
        "grad_accum": 2,
        "lr": 1e-4,
        "epochs": 3,
        "lora_r": 16,
        "lora_modules": ["q_proj", "v_proj"],
        "save_steps": 100,
        "eval_steps": 100,
    }
}

config = CONFIGS[EXPERIMENT_MODE]
print(f"\n{'='*60}")
print(f"MODE: {EXPERIMENT_MODE.upper()}")
print(f"Description: {config['desc']}")
print(f"{'='*60}\n")


USE_SUBSET = config["use_subset"]
SUBSET_SIZE = config["subset_size"]
PER_DEVICE_TRAIN_BATCH_SIZE = config["batch_size"]
PER_DEVICE_EVAL_BATCH_SIZE = config["batch_size"]
GRADIENT_ACCUMULATION_STEPS = config["grad_accum"]
LEARNING_RATE = config["lr"]
NUM_TRAIN_EPOCHS = config["epochs"]
LORA_R = config["lora_r"]
LORA_TARGET_MODULES = config["lora_modules"]
SAVE_STEPS = config["save_steps"]
EVAL_STEPS = config["eval_steps"]


WEIGHT_DECAY = 0.01
WARMUP_STEPS = 200
MAX_SEQ_LENGTH = 77
LOGGING_STEPS = 25
LORA_ALPHA = LORA_R * 2
LORA_DROPOUT = 0.2

print(f"Config Summary:")
print(f"  Batch Size: {PER_DEVICE_TRAIN_BATCH_SIZE} (effective: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS})")
print(f"  Learning Rate: {LEARNING_RATE}")
print(f"  LoRA Rank: {LORA_R}")
print(f"  LoRA Modules: {LORA_TARGET_MODULES}")
print(f"  Epochs: {NUM_TRAIN_EPOCHS}")
if USE_SUBSET:
    print(f"  Dataset Size: {SUBSET_SIZE} examples (subset)")
else:
    print(f"  Dataset Size: Full dataset")
print()



MODE: FULL
Description: Best quality - 5-7 hours on T4 GPU

Config Summary:
  Batch Size: 32 (effective: 64)
  Learning Rate: 0.0001
  LoRA Rank: 16
  LoRA Modules: ['q_proj', 'v_proj']
  Epochs: 3
  Dataset Size: Full dataset



In [26]:
set_seed(SEED)

dataset = load_dataset(DATASET_NAME)
print("Original dataset:", dataset)

if USE_SUBSET and "train" in dataset:
    print(f"\nUsing subset: {SUBSET_SIZE} training examples")
    dataset["train"] = dataset["train"].select(range(min(SUBSET_SIZE, len(dataset["train"]))))
   
    val_size = min(int(SUBSET_SIZE * 0.1), len(dataset["validation"]))
    dataset["validation"] = dataset["validation"].select(range(val_size))

print(f"\nFinal dataset sizes:")
print(f"  Train: {len(dataset['train'])} examples")
print(f"  Validation: {len(dataset['validation'])} examples")
print(f"  Test: {len(dataset['test'])} examples")

Original dataset: DatasetDict({
    train: Dataset({
        features: ['image', 'alt_text', 'sentids', 'split', 'img_id', 'filename', 'original_alt_text'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['image', 'alt_text', 'sentids', 'split', 'img_id', 'filename', 'original_alt_text'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['image', 'alt_text', 'sentids', 'split', 'img_id', 'filename', 'original_alt_text'],
        num_rows: 1000
    })
})

Final dataset sizes:
  Train: 29000 examples
  Validation: 1014 examples
  Test: 1000 examples


In [27]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
image_processor = AutoImageProcessor.from_pretrained(MODEL_NAME)

image_size = image_processor.size.get("shortest_edge", 224)
image_mean = image_processor.image_mean
image_std = image_processor.image_std

print(f"Image size: {image_size}, Mean: {image_mean}, Std: {image_std}")

Image size: 224, Mean: [0.48145466, 0.4578275, 0.40821073], Std: [0.26862954, 0.26130258, 0.27577711]


In [28]:
train_transform = Compose([
    Resize(int(image_size * 1.15), interpolation=InterpolationMode.BICUBIC),
    RandomResizedCrop(image_size, scale=(0.8, 1.0), interpolation=InterpolationMode.BICUBIC),
    ToTensor(),
    Normalize(mean=image_mean, std=image_std),
])

eval_transform = Compose([
    Resize(int(image_size * 1.15), interpolation=InterpolationMode.BICUBIC),
    CenterCrop(image_size),
    ToTensor(),
    Normalize(mean=image_mean, std=image_std),
])


In [29]:

import random
def transform_train_combined(examples):
    images = examples[IMAGE_COLUMN]
    captions = examples[CAPTION_COLUMN]
    
    pixel_values = []
    for img in images:
        if isinstance(img, str):
            img = Image.open(img).convert("RGB")
        elif isinstance(img, dict) and "path" in img:
            img = Image.open(img["path"]).convert("RGB")
        pixel_values.append(train_transform(img))
    
    
    caption_texts = []
    for c in captions:
        if isinstance(c, list) and len(c) > 0:
            caption_texts.append(random.choice(c))
        else:
            caption_texts.append(c if isinstance(c, str) else c[0])
    
   
    tokens = tokenizer(
        caption_texts,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
        truncation=True
    )
    
    return {
        "pixel_values": pixel_values,
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

def transform_eval_combined(examples):
   
    images = examples[IMAGE_COLUMN]
    captions = examples[CAPTION_COLUMN]

    pixel_values = []
    for img in images:
        if isinstance(img, str):
            img = Image.open(img).convert("RGB")
        elif isinstance(img, dict) and "path" in img:
            img = Image.open(img["path"]).convert("RGB")
        pixel_values.append(eval_transform(img))
    
    caption_texts = [c[0] if isinstance(c, list) else c for c in captions]
    
    tokens = tokenizer(
        caption_texts,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
        truncation=True
    )
    
    return {
        "pixel_values": pixel_values,
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }


dataset["train"].set_transform(transform_train_combined)
dataset["validation"].set_transform(transform_eval_combined)

print("\nCombined transforms configured:")
print("  Training: RANDOM caption per epoch")
print("  Validation: First caption only")


âœ“ Combined transforms configured:
  Training: RANDOM caption per epoch (data augmentation!)
  Validation: First caption only (deterministic)


In [30]:
def collate_fn(examples):
    return {
        "pixel_values": torch.stack([ex["pixel_values"] for ex in examples]),
        "input_ids": torch.tensor([ex["input_ids"] for ex in examples], dtype=torch.long),
        "attention_mask": torch.tensor([ex["attention_mask"] for ex in examples], dtype=torch.long),
        "return_loss": True
    }

In [31]:
print("\nLoading model...")
base_model = CLIPModel.from_pretrained(MODEL_NAME)
base_model.to(device)


for p in base_model.parameters():
    p.requires_grad = False

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

model = get_peft_model(base_model, lora_config)
model.to(device)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"\nModel loaded!")
print(f"Trainable params: {trainable:,} ({100*trainable/total:.2f}%)")
print(f"Total params: {total:,}")


Loading model...

Model loaded!
Trainable params: 983,040 (0.65%)
Total params: 152,260,353


In [38]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import EarlyStoppingCallback


training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    
    
    logging_steps=LOGGING_STEPS,
    logging_first_step=True,
    
    save_steps=SAVE_STEPS,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    save_total_limit=2,
    
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    fp16=True,  
    dataloader_num_workers=4, 
    dataloader_pin_memory=True,
    
    report_to="none",
    remove_unused_columns=False,
    push_to_hub=False,

    logging_strategy="steps",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=collate_fn,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

print("\nTrainer configured")
print(f"Total training steps: {len(dataset['train']) // (PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS}")
print(f"Logging every {LOGGING_STEPS} steps")
print(f"Evaluating every {EVAL_STEPS} steps")


Trainer configured!
Total training steps: 1359
Logging every 25 steps
Evaluating every 100 steps


In [39]:
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60 + "\n")

train_result = trainer.train()


trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
image_processor.save_pretrained(OUTPUT_DIR)

print("\n" + "="*60)
print("TRAINING COMPLETE")
print("="*60)
print("\nTraining metrics:")
for key, value in train_result.metrics.items():
    print(f"  {key}: {value}")



STARTING TRAINING



ValueError: The model did not return a loss from the inputs, only the following keys: logits_per_image,logits_per_text,text_embeds,image_embeds,text_model_output,vision_model_output. For reference, the inputs it received are pixel_values,input_ids,attention_mask.

In [34]:
print("\nRunning final evaluation")
eval_metrics = trainer.evaluate()

print("\nFinal validation metrics:")
for key, value in eval_metrics.items():
    print(f"  {key}: {value}")


Running final evaluation...





Final validation metrics:
  eval_loss: 0.11009262502193451
  eval_runtime: 6.0484
  eval_samples_per_second: 167.647
  eval_steps_per_second: 2.645
  epoch: 3.0


In [None]:
from pathlib import Path
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
print(f"\nLoRA adapters saved to: {OUTPUT_DIR}")