In [1]:
!pip install transformers datasets

from IPython import display

display.clear_output()

In [2]:
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel

## Download data

In [3]:
# Fetch the DeepFashion multimodal dataset by its Hugging Face Hub identifier.
dataset = load_dataset(path="Marqo/deepfashion-multimodal")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [33]:
# Work on a small subset: the first 500 rows of the "data" split.
subset_indices = range(500)
train_dataset = dataset["data"].select(subset_indices)

In [34]:
# Display the subset's summary (feature names and row count).
train_dataset

Dataset({
    features: ['image', 'category1', 'category2', 'category3', 'text', 'item_ID'],
    num_rows: 500
})

In [35]:
# Keep only the image/text pair; every other column is dropped.
kept_columns = ("image", "text")
dropped_columns = [name for name in train_dataset.column_names if name not in kept_columns]
train_dataset = train_dataset.remove_columns(dropped_columns)

In [36]:
# Display the summary again to confirm that only "image" and "text" remain.
train_dataset

Dataset({
    features: ['image', 'text'],
    num_rows: 500
})

## Load pretrained model

In [37]:
import torch
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel
from torch.optim import AdamW

# Checkpoint identifier shared by the model and its processor.
model_name = "openai/clip-vit-large-patch14"

# The processor and the model are loaded from the same checkpoint.
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

## Data loader

In [38]:
# Collate function used by the DataLoader
def collate_fn(batch):
    """Turn a list of dataset rows into one batch of processor outputs.

    Each item of ``batch`` is indexed with the "text" and "image" keys. The
    collected values are handed to the module-level ``processor`` with padding
    and truncation enabled, requesting PyTorch ("pt") tensors.

    Args:
        batch: Sequence of rows, each supporting ``row["text"]`` and
            ``row["image"]``.

    Returns:
        Whatever ``processor(...)`` returns for the collected texts and images.
    """
    texts, images = [], []
    for item in batch:
        texts.append(item["text"])
        images.append(item["image"])
    return processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

In [42]:
# The contrastive objective compares every image with every text *inside a batch*.
# With a single pair per batch the similarity matrix is 1x1, the cross-entropy over
# one class is exactly 0, and no gradient flows -- so the batch must hold more than
# one pair for training to have any effect.
BATCH_SIZE = 8  # lower this if GPU memory is tight, but keep it above 1

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=5e-6)

# Switch the model to training mode.
model.train()

## Train

In [None]:
cross_entropy = torch.nn.functional.cross_entropy

for epoch in range(20):
    total_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Start each step from clean gradients.
        optimizer.zero_grad()

        outputs = model(**batch)
        image_logits = outputs.logits_per_image
        text_logits = outputs.logits_per_text

        # Build the labels: the i-th image is matched with the i-th text.
        targets = torch.arange(image_logits.shape[0], device=device)

        # Symmetric contrastive loss: average of the image-side and text-side
        # cross-entropy terms.
        # NOTE: with a single pair per batch the logits are 1x1 and both terms are
        # exactly zero, so batches need more than one pair to give a training signal.
        image_loss = cross_entropy(image_logits, targets)
        text_loss = cross_entropy(text_logits, targets)
        loss = (image_loss + text_loss) / 2

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Avg Loss: {total_loss / len(train_dataloader):.4f}")

## Save model

In [None]:
from transformers import CLIPModel, CLIPProcessor

import os

# Local directory that receives both the model and the processor files.
SAVE_DIR = "./clip-finetuned"
os.makedirs(name=SAVE_DIR, exist_ok=True)

# Write the model first, then the processor, into the same directory.
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)

## Push to HuggingFace

In [None]:
from huggingface_hub import notebook_login

# Run the Hugging Face Hub login helper for notebooks before pushing anything.
notebook_login()

In [None]:
# Hub repository that receives both the model and the processor.
model_repo = "FreddyFazbear0209/CLIP_for_visual_recognition"

# Upload the model first, then the processor, to the same repository.
model.push_to_hub(repo_id=model_repo)
processor.push_to_hub(repo_id=model_repo)

## Load model from HuggingFace

In [None]:
# Reload the model and its processor from the Hub repository.
hub_repo = "FreddyFazbear0209/CLIP_for_visual_recognition"

model = CLIPModel.from_pretrained(hub_repo)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(hub_repo)