In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

In [None]:
try:
    # Comment out if not using colab
    from google.colab import drive
    drive.mount('/content/drive')

    # Specific for luca's computer
    %cd "/content/drive/Othercomputers/lucas-yoga/UiB/INFO381/code/INFO381-GitHub"
except:
    print("Not using Google Colab")

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import clip
import time


In [None]:

# 1) Select the compute device (GPU when one is available), then load the
#    CLIP ViT-B/32 checkpoint together with its preprocessing callable.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
clip_model.eval()  # CLIP is only used for inference here.

# 2) Freeze every CLIP parameter so no gradients are tracked for the backbone.
for param in clip_model.parameters():
    param.requires_grad_(False)


### Image preprocessing

In [5]:

# 3) Image transform handed to the datasets below.
#    This reuses `clip_preprocess`, the preprocessing callable returned by
#    `clip.load` for the chosen checkpoint; assign a custom transform here
#    instead if different preprocessing is wanted.
transform = clip_preprocess


### Define train and test path

In [6]:

# 4) Locations of the training and test image folders.
#    Both are relative paths, so they resolve against the notebook's current
#    working directory. They are read with ImageFolder further down.
path_train = "cifar_train_test/train"
path_test = "cifar_train_test/test"

### Create the datasets with ImageFolder

Automatically reads the REAL and FAKE folders and labels them.

In [None]:
def get_or_create_dataloaders(path_train, path_test, transform, train_loader_path="dataloaders/clip_train_loader.pt", test_loader_path="dataloaders/clip_test_loader.pt", batch_size=32):
    """Return ``(train_loader, test_loader)``, reusing cached loaders if present.

    When both cache files exist they are loaded and returned as-is. Otherwise
    the datasets are built with ``ImageFolder``, wrapped in ``DataLoader``
    objects, written to the cache paths and returned.

    Args:
        path_train: Root directory of the training images.
        path_test: Root directory of the test images.
        transform: Transform applied by ``ImageFolder`` to each image.
        train_loader_path: Cache file for the training ``DataLoader``.
        test_loader_path: Cache file for the test ``DataLoader``.
        batch_size: Batch size used when new loaders are created. It is
            ignored when the loaders come from the cache, because the cached
            objects already carry their own batch size.

    Returns:
        Tuple ``(train_loader, test_loader)``. The training loader shuffles,
        the test loader does not.
    """
    import os, torch

    if os.path.exists(train_loader_path) and os.path.exists(test_loader_path):
        # The cache files are full pickles of DataLoader objects, not tensors,
        # so they must be loaded with weights_only=False (the default became
        # True in recent PyTorch releases and rejects such objects).
        # SECURITY: unpickling executes arbitrary code -- only load cache
        # files that this notebook wrote itself.
        train_loader = torch.load(train_loader_path, weights_only=False)
        test_loader = torch.load(test_loader_path, weights_only=False)
    else:
        train_dataset = ImageFolder(root=path_train, transform=transform)
        test_dataset = ImageFolder(root=path_test, transform=transform)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # torch.save does not create missing parent directories, so make sure
        # they exist (a bare file name has an empty dirname -> current dir).
        for cache_path in (train_loader_path, test_loader_path):
            os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)

        torch.save(train_loader, train_loader_path)
        torch.save(test_loader, test_loader_path)

    return train_loader, test_loader

# Build the train/test loaders, or reload them from the default cache files
# under "dataloaders/" when both already exist.
train_loader, test_loader = get_or_create_dataloaders(path_train, path_test, transform)

### Defining the model

Using CLIP with a linear classifier on top

In [8]:
# 5) Create a simple classifier head for the CLIP embeddings.
#    CLIP's ViT-B/32 has a default embedding dimension of 512.
class CLIPClassifier(nn.Module):
    """Frozen CLIP image encoder followed by a trainable linear head.

    Args:
        clip_model: Module exposing ``encode_image(images)``; it is only ever
            run without gradient tracking.
        embed_dim: Size of the embeddings returned by ``encode_image``.
        num_classes: Number of output logits.
    """

    def __init__(self, clip_model, embed_dim=512, num_classes=2):
        super().__init__()
        self.clip_model = clip_model
        self.classifier = nn.Linear(embed_dim, num_classes)

    def train(self, mode=True):
        """Switch the head's mode while keeping the backbone in eval mode.

        ``nn.Module.train`` recurses into every sub-module, which would also
        flip the frozen backbone into training mode. The backbone is reset to
        eval afterwards so only the classifier head follows ``mode``.
        """
        super().train(mode)
        self.clip_model.eval()
        return self

    def forward(self, images):
        """Return class logits of shape ``(batch, num_classes)`` for ``images``."""
        # Encode images with CLIP's ViT (frozen), no gradient needed.
        with torch.no_grad():
            image_embeddings = self.clip_model.encode_image(images)
        # CLIP runs in half precision when loaded on a GPU, so the embeddings
        # can be float16 while the head's weights are float32. Cast to the
        # head's dtype to avoid a dtype-mismatch error in the linear layer.
        image_embeddings = image_embeddings.to(self.classifier.weight.dtype)
        # Trainable classification head
        return self.classifier(image_embeddings)

# 6) Wrap the CLIP backbone with a fresh linear head and move it to the device.
model = CLIPClassifier(clip_model, embed_dim=512, num_classes=2)
model = model.to(device)

# 7) Cross-entropy objective. Adam receives only the head's parameters,
#    so the optimizer never updates the backbone.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.classifier.parameters(), lr=1e-3)


### Training the model

In [None]:
# 8) Training loop (similar to your ResNet loop).
#    Only the classifier head is optimised; the CLIP backbone stays frozen.
num_epochs = 1
for epoch in range(num_epochs):
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    model.train()
    # model.train() recurses into sub-modules; put the frozen backbone back
    # into eval mode so only the head is in training mode.
    model.clip_model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear old gradients
        optimizer.zero_grad()

        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss (mean over the batch)
        loss.backward()                     # Backprop
        optimizer.step()                    # Update classifier head

        n_samples = labels.size(0)

        # Track training loss, weighted by batch size so that a smaller final
        # batch does not distort the epoch average.
        running_loss += loss.item() * n_samples

        # Compute accuracy
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += n_samples

    # Per-sample averages over the whole epoch.
    epoch_loss = running_loss / total
    epoch_accuracy = correct / total * 100
    end_time = time.perf_counter()

    print(f"Epoch [{epoch+1}/{num_epochs}], "
        f"Loss: {epoch_loss:.4f}, "
        f"Accuracy: {epoch_accuracy:.2f}%, "
        f"Time: {end_time - start_time:.2f} s")

# 9) Save the final model state. Note that this state dict contains the
#    frozen CLIP backbone weights as well as the classifier head.
torch.save(model.state_dict(), "clip_vit_classifier.pth")
print("CLIP-based model saved successfully!")