<a href="https://colab.research.google.com/github/matteomrz/20242R0136COSE47402/blob/main/final/final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install openai-clip
!pip install datasets
!pip install torch
!pip install tqdm



In [2]:
from datasets import load_dataset
from torch.utils.data import random_split

import torch  # only needed in this cell for the seeded split generator below

# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42
# Share of the filtered samples used for training; the remainder is validation.
TRAIN_FRACTION = 0.8

ds = load_dataset("bazyl/GTSRB")

# Map used Street Sign IDs to text descriptions
id_to_description = {
    18: "General caution",
    19: "Dangerous curve left",
    20: "Dangerous curve right",
    21: "Winding road",
    22: "Bumpy road",
    23: "Slippery road",
    24: "Road narrows on the right",
    25: "Road work",
    26: "Traffic lights",
    27: "Pedestrians",
    28: "Children crossing",
    29: "Bike crossing",
    30: "Beware of ice/snow",
    31: "Wild animals crossing",
}

# Keep only the warning-sign classes and attach each sample's text description.
# ClassId is kept as-is; 'Description' is added as an extra key next to it.
train_full = [
    {**example, 'Description': id_to_description[example['ClassId']]}
    for example in ds['train']
    if example['ClassId'] in id_to_description
]

# Seeded split: an unseeded random_split would put different samples into
# train/val on every run, making the reported accuracy non-reproducible.
len_train = int(TRAIN_FRACTION * len(train_full))
train, val = random_split(
    train_full,
    [len_train, len(train_full) - len_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
import clip
import torch

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the ViT-B/32 CLIP model with JIT disabled; `preprocess` is the second
# value returned by clip.load.
model, preprocess = clip.load("ViT-B/32", jit=False)

# Last expression of the cell: moves the model to `device` and displays its architecture.
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [4]:
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image

In [5]:
from torchvision import transforms
from torch.utils.data import Dataset

class WarningSignDataset(Dataset):
    """Dataset yielding ``(image_tensor, label)`` pairs for the warning-sign records.

    Args:
        data: Indexable, sized collection of records. Each record must provide
            ``record['Path']['bytes']`` (the encoded image) and ``record['ClassId']``.
        label_offset: Value subtracted from ``ClassId`` to obtain the label.
            Defaults to 18, the smallest ClassId of the warning-sign subset (18-31),
            so labels start at 0.
    """

    def __init__(self, data, label_offset=18):
        self.data = data
        self.label_offset = label_offset
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            # Per-channel mean / std passed to Normalize.
            # NOTE(review): presumably CLIP's normalization statistics - confirm.
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the transformed image and the offset class label for ``idx``."""
        item = self.data[idx]
        # Force three channels: Normalize above is configured with 3-channel
        # statistics, so a grayscale / palette / RGBA image would otherwise fail.
        image = Image.open(BytesIO(item['Path']['bytes'])).convert("RGB")
        # The label is the ClassId shifted by label_offset, not the text description.
        return self.transform(image), item['ClassId'] - self.label_offset

In [6]:
from torch.utils.data import DataLoader

# Wrap each split in WarningSignDataset and batch it.
# Only the training loader shuffles; validation keeps a fixed order.
BATCH_SIZE = 32

train_dataset = WarningSignDataset(train)
val_dataset = WarningSignDataset(val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [7]:
import torch.nn as nn

# Wrap the CLIP model with a linear classifier for the sign subcategories
class CLIPFineTuner(nn.Module):
    """Linear classification head on top of a CLIP image encoder.

    The encoder runs under ``torch.no_grad()`` in ``forward``, so no gradients
    reach it; only ``self.classifier`` can be trained through this module.

    Args:
        model: Backbone exposing ``encode_image(x)`` and ``visual.output_dim``.
        num_classes: Number of output logits of the classifier.
    """

    def __init__(self, model, num_classes):
        super().__init__()
        self.model = model
        self.classifier = nn.Linear(model.visual.output_dim, num_classes)
        # The backbone is only used as a fixed feature extractor.
        self.model.eval()

    def train(self, mode=True):
        """Switch the classifier's mode but keep the backbone in eval mode.

        ``nn.Module.train`` recurses into every submodule, which would otherwise
        put the backbone into training mode as well. Mode-dependent layers
        (dropout, batch norm statistics) must not change for a feature extractor
        that is never optimized.
        """
        super().train(mode)
        self.model.eval()
        return self

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        with torch.no_grad():
            # Cast to float32 so the features match the classifier's parameter dtype.
            features = self.model.encode_image(x).float()
        return self.classifier(features)

In [18]:
# One output logit per entry of id_to_description.
num_classes = len(id_to_description)

# Build the classifier wrapper around the loaded CLIP model, then move it to `device`.
model_ft = CLIPFineTuner(model, num_classes)
model_ft = model_ft.to(device)

In [19]:
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# Optimization hyperparameters.
LEARNING_RATE = 5e-4
LR_STEP_SIZE = 10  # scheduler steps between learning-rate decays
LR_GAMMA = 0.1     # multiplicative decay factor

# Cross-entropy loss over the class logits.
criterion = nn.CrossEntropyLoss()

# Only the classifier's parameters are handed to the optimizer.
optimizer = optim.Adam(model_ft.classifier.parameters(), lr=LEARNING_RATE)
scheduler = StepLR(optimizer, step_size=LR_STEP_SIZE, gamma=LR_GAMMA)

In [20]:
from tqdm import tqdm

# Number of epochs for training
num_epochs = 20

# Training loop: one pass over train_loader followed by one pass over val_loader per epoch
for epoch in range(num_epochs):
    model_ft.train()  # Set the model to training mode
    running_loss = 0.0  # Sum of the per-batch losses of the current epoch
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}, Loss: 0.0000")  # Initialize progress bar

    # batch_idx counts the batches processed so far (1-based)
    for batch_idx, (images, labels) in enumerate(pbar, start=1):
        images, labels = images.to(device), labels.to(device)  # Move the batch to the device (GPU or CPU)
        optimizer.zero_grad()  # Clear gradients left over from the previous step
        outputs = model_ft(images)  # Forward pass: class logits
        loss = criterion(outputs, labels)  # Calculate the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Parameter update

        running_loss += loss.item()
        # Divide by the batches seen so far. Dividing by len(train_loader) here
        # would understate the displayed loss until the final batch of the epoch.
        pbar.set_description(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/batch_idx:.4f}")

    # Average per-batch loss over the whole epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

    scheduler.step()  # Advance the learning-rate schedule once per epoch

    # Validation
    model_ft.eval()  # Set the model to evaluation mode
    correct = 0  # Correctly classified validation samples
    total = 0  # Validation samples seen

    with torch.no_grad():  # No gradients are needed for validation
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)  # Move the batch to the device
            outputs = model_ft(images)  # Forward pass: class logits
            predicted = outputs.argmax(dim=1)  # Index of the largest logit per sample
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Validation Accuracy: {100 * correct / total}%')  # Validation accuracy for the epoch

# Save the state dictionary of the model after the final epoch
torch.save(model_ft.state_dict(), 'clip_finetuned.pth')


Epoch 1/20, Loss: 2.1131: 100%|██████████| 192/192 [00:18<00:00, 10.28it/s]


Epoch [1/20], Loss: 2.1131
Validation Accuracy: 50.78431372549019%


Epoch 2/20, Loss: 1.5761: 100%|██████████| 192/192 [00:18<00:00, 10.12it/s]


Epoch [2/20], Loss: 1.5761
Validation Accuracy: 66.40522875816994%


Epoch 3/20, Loss: 1.2589: 100%|██████████| 192/192 [00:18<00:00, 10.18it/s]


Epoch [3/20], Loss: 1.2589
Validation Accuracy: 73.20261437908496%


Epoch 4/20, Loss: 1.0587: 100%|██████████| 192/192 [00:18<00:00, 10.35it/s]


Epoch [4/20], Loss: 1.0587
Validation Accuracy: 77.51633986928104%


Epoch 5/20, Loss: 0.9193: 100%|██████████| 192/192 [00:18<00:00, 10.57it/s]


Epoch [5/20], Loss: 0.9193
Validation Accuracy: 81.17647058823529%


Epoch 6/20, Loss: 0.8150: 100%|██████████| 192/192 [00:18<00:00, 10.44it/s]


Epoch [6/20], Loss: 0.8150
Validation Accuracy: 83.39869281045752%


Epoch 7/20, Loss: 0.7352: 100%|██████████| 192/192 [00:18<00:00, 10.28it/s]


Epoch [7/20], Loss: 0.7352
Validation Accuracy: 84.37908496732027%


Epoch 8/20, Loss: 0.6715: 100%|██████████| 192/192 [00:18<00:00, 10.43it/s]


Epoch [8/20], Loss: 0.6715
Validation Accuracy: 85.49019607843137%


Epoch 9/20, Loss: 0.6171: 100%|██████████| 192/192 [00:17<00:00, 10.69it/s]


Epoch [9/20], Loss: 0.6171
Validation Accuracy: 87.58169934640523%


Epoch 10/20, Loss: 0.5721: 100%|██████████| 192/192 [00:18<00:00, 10.51it/s]


Epoch [10/20], Loss: 0.5721
Validation Accuracy: 87.7124183006536%


Epoch 11/20, Loss: 0.5475: 100%|██████████| 192/192 [00:19<00:00,  9.78it/s]


Epoch [11/20], Loss: 0.5475
Validation Accuracy: 88.16993464052288%


Epoch 12/20, Loss: 0.5410: 100%|██████████| 192/192 [00:18<00:00, 10.49it/s]


Epoch [12/20], Loss: 0.5410
Validation Accuracy: 88.16993464052288%


Epoch 13/20, Loss: 0.5364: 100%|██████████| 192/192 [00:17<00:00, 10.76it/s]


Epoch [13/20], Loss: 0.5364
Validation Accuracy: 88.16993464052288%


Epoch 14/20, Loss: 0.5323: 100%|██████████| 192/192 [00:18<00:00, 10.60it/s]


Epoch [14/20], Loss: 0.5323
Validation Accuracy: 88.30065359477125%


Epoch 15/20, Loss: 0.5281: 100%|██████████| 192/192 [00:18<00:00, 10.49it/s]


Epoch [15/20], Loss: 0.5281
Validation Accuracy: 88.23529411764706%


Epoch 16/20, Loss: 0.5260: 100%|██████████| 192/192 [00:18<00:00, 10.36it/s]


Epoch [16/20], Loss: 0.5260
Validation Accuracy: 88.36601307189542%


Epoch 17/20, Loss: 0.5215: 100%|██████████| 192/192 [00:17<00:00, 10.85it/s]


Epoch [17/20], Loss: 0.5215
Validation Accuracy: 88.49673202614379%


Epoch 18/20, Loss: 0.5170: 100%|██████████| 192/192 [00:17<00:00, 10.68it/s]


Epoch [18/20], Loss: 0.5170
Validation Accuracy: 88.49673202614379%


Epoch 19/20, Loss: 0.5135: 100%|██████████| 192/192 [00:18<00:00, 10.37it/s]


Epoch [19/20], Loss: 0.5135
Validation Accuracy: 88.69281045751634%


Epoch 20/20, Loss: 0.5090: 100%|██████████| 192/192 [00:18<00:00, 10.25it/s]


Epoch [20/20], Loss: 0.5090
Validation Accuracy: 88.69281045751634%
