In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-dm85orbn
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-dm85orbn
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Othercomputers/lucas-yoga/UiB/INFO381/code/INFO381-GitHub


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import clip
import time
import os

# Connect to personal files if using Google Colab
def connect_to_drive(computer_name, subpath):
    """Mount Google Drive and change into a synced project folder when on Colab.

    Args:
        computer_name: Folder name below ``/content/drive/Othercomputers``.
        subpath: Path of the project directory relative to that folder.

    Returns:
        bool: ``True`` when ``google.colab`` can be imported, ``False`` otherwise.
    """
    try:
        from google.colab import drive
    except ImportError:
        # Only a missing google.colab package means we are not running on Colab.
        print("Not using Google Colab")
        return False

    target_path = os.path.join("/content/drive/Othercomputers", computer_name, subpath)
    try:
        drive.mount('/content/drive')
        os.chdir(target_path)
    except Exception as err:
        # Still best effort (no crash), but report the real cause instead of
        # mislabelling a bad path or failed mount as "Not using Google Colab".
        print(f"Running in Google Colab, but could not switch to {target_path!r}: {err}")
    return True
using_colab = connect_to_drive("lucas-yoga", "Current/INFO381/code/INFO381-GitHub")

# Local imports
from utils import get_dataloaders

In [4]:
# 1) Pick the compute device (GPU when available, CPU otherwise) and load CLIP ViT-B/32 onto it.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()  # Keep CLIP in eval mode.

# 2) Freeze CLIP: turn off gradient tracking for every one of its parameters
#    (only a separately defined head will be trained).
for param in clip_model.parameters():
    param.requires_grad = False

### Image preprocessing

In [5]:

# 3) Define your image transform. You can use clip_preprocess or your own.
#    For best results, the official clip_preprocess often works well:
#    NOTE(review): `transform` is not passed to get_dataloaders(...) anywhere in the
#    visible cells — confirm that the loaders actually apply CLIP's preprocessing.
transform = clip_preprocess


### Define train and test path

### Create the datasets with ImageFolder

Automatically reads the REAL and FAKE folders and labels them.

In [8]:
# 4) Build the train and test loaders with the project helper imported from the local `utils` module.
#    NOTE(review): presumably each batch is an (images, labels) pair, as the training loop
#    unpacks it that way — verify against utils.get_dataloaders.
train_loader, test_loader = get_dataloaders(zip_path="cifar_train_test.zip")
print(train_loader)

### Defining the model

Using CLIP with a linear classifier on top

In [11]:
# 5) Create a simple classifier head for the CLIP embeddings.
#    CLIP's ViT-B/32 has a default embedding dimension of 512.
class CLIPClassifier(nn.Module):
    """Frozen CLIP image encoder followed by a trainable linear classification head.

    Args:
        clip_model: Module exposing ``encode_image(images)``; used purely as a
            feature extractor (its forward pass runs under ``torch.no_grad()``).
        embed_dim: Size of the embedding fed to the linear head (512 for ViT-B/32).
        num_classes: Number of output logits.
    """

    def __init__(self, clip_model, embed_dim=512, num_classes=2):
        super().__init__()
        self.clip_model = clip_model
        self.classifier = nn.Linear(embed_dim, num_classes)

    def train(self, mode=True):
        """Set the training mode of the head while keeping the backbone in eval mode.

        ``nn.Module.train`` recurses into every submodule, so a plain
        ``model.train()`` would also flip the frozen CLIP backbone into
        training mode. Layers such as dropout or batch norm behave differently
        in that mode, even when gradients are disabled. The backbone is
        therefore forced back to eval mode after every mode change.

        Args:
            mode: ``True`` for training mode, ``False`` for evaluation mode.

        Returns:
            CLIPClassifier: ``self``, matching ``nn.Module.train``.
        """
        super().train(mode)
        self.clip_model.eval()
        return self

    def forward(self, images):
        """Return class logits for a batch of preprocessed images."""
        # Encode images with CLIP (frozen), no gradient needed.
        with torch.no_grad():
            image_embeddings = self.clip_model.encode_image(images)
        # Cast to float32 so the embeddings match the dtype of the linear head.
        image_embeddings = image_embeddings.float()
        # Trainable classification head
        return self.classifier(image_embeddings)

# 6) Build the full model (CLIP backbone + linear head) and move it to the chosen device.
model = CLIPClassifier(clip_model, embed_dim=512, num_classes=2)
model = model.to(device)

# 7) Cross-entropy loss, and an Adam optimizer that is handed only the head's
#    parameters, so the optimizer never touches the CLIP backbone.
criterion = nn.CrossEntropyLoss()
head_parameters = model.classifier.parameters()
optimizer = optim.Adam(head_parameters, lr=0.001)


### Training the model

In [12]:
# 8) Training loop (similar to your ResNet loop).
num_epochs = 1
for epoch in range(num_epochs):
    start_time = time.time()

    model.train()
    # Keep the frozen CLIP backbone in eval mode whatever model.train() did to
    # its submodules; only the classifier head is meant to be in training mode.
    model.clip_model.eval()

    running_loss = 0.0  # sum of per-sample losses seen this epoch
    correct = 0         # number of correctly classified samples
    total = 0           # number of samples seen

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)

        # Clear old gradients
        optimizer.zero_grad()

        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss
        loss.backward()                     # Backprop
        optimizer.step()                    # Update classifier head

        # The criterion returns the mean over the batch; weight it by the batch
        # size so a smaller final batch does not skew the epoch average.
        running_loss += loss.item() * batch_size

        # Compute accuracy
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += batch_size

    # Guard against an empty loader instead of raising ZeroDivisionError.
    epoch_loss = running_loss / total if total else float("nan")
    epoch_accuracy = correct / total * 100 if total else float("nan")
    end_time = time.time()

    print(f"Epoch [{epoch+1}/{num_epochs}], "
        f"Loss: {epoch_loss:.4f}, "
        f"Accuracy: {epoch_accuracy:.2f}%, "
        f"Time: {end_time - start_time:.2f} s")

# 9) Save the final classifier state...
# Note: model.state_dict() covers the whole module, i.e. the CLIP backbone
# weights as well as the classifier head, not just the head.
torch.save(model.state_dict(), "clip_vit_classifier.pth")
print("CLIP-based model saved successfully!")

Epoch [1/1], Loss: 0.2163, Accuracy: 91.92%, Time: 210.93 s
CLIP-based model saved successfully!
