In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-3bhxvl54
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-3bhxvl54
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import clip
import time
import os

# Connect to personal files if using Google Colab
def connect_to_drive(computer_name, subpath):
    """Mount Google Drive on Colab and switch the working directory to the project folder.

    Args:
        computer_name: Folder name below ``/content/drive/Othercomputers``.
        subpath: Path of the project folder relative to ``computer_name``.

    Returns:
        True when Drive was mounted and the working directory was changed to the
        target folder; False otherwise (``google.colab`` could not be imported,
        or mounting / changing directory failed).
    """
    try:
        from google.colab import drive
    except ImportError:
        # The google.colab package could not be imported: treat this as a
        # non-Colab environment and leave the working directory untouched.
        print("Not using Google Colab")
        return False

    target_path = os.path.join("/content/drive/Othercomputers", computer_name, subpath)
    try:
        drive.mount('/content/drive')
        os.chdir(target_path)
    except Exception as exc:
        # Stay best-effort (no crash), but report the real cause instead of
        # claiming that Colab is not in use. KeyboardInterrupt/SystemExit are
        # deliberately not caught so the user can still abort the mount prompt.
        print(f"Google Colab detected, but could not switch to {target_path!r}: {exc}")
        return False
    return True


using_colab = connect_to_drive("lucas-yoga", "Current/INFO381/code/INFO381-GitHub")

# Local imports
from utils import get_dataloaders
from model_definitions import CLIPClassifier

Mounted at /content/drive


In [3]:
# 1) Select the compute device (GPU when one is available) and load the
#    pretrained CLIP ViT-B/32 model together with its preprocessing pipeline.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# CLIP is used as a fixed feature extractor, so keep it in inference mode.
clip_model.eval()

# 2) Freeze every CLIP weight so that no gradients are tracked for the backbone.
#    (To fine-tune part of CLIP, leave the relevant parameters unfrozen here.)
for param in clip_model.parameters():
    param.requires_grad_(False)

100%|███████████████████████████████████████| 338M/338M [00:09<00:00, 36.1MiB/s]


### Image preprocessing

In [4]:

# 3) Image transform handed to the data loading helper.
#    `clip_preprocess` is the preprocessing pipeline that clip.load() returned
#    together with the model, so it is reused as-is; a custom transform could be
#    assigned here instead.
transform = clip_preprocess


### Define train and test path

### Create the datasets with ImageFolder

Automatically reads the REAL and FAKE folders and labels them

In [7]:
# Build the train and test DataLoaders from the zipped dataset using the project
# helper `get_dataloaders` (imported from utils; its implementation is not shown here).
# NOTE(review): batch size, shuffling, the train/test split and how `transform` is
# applied are all decided inside the helper — confirm there.
train_loader, test_loader = get_dataloaders(zip_path="fake_vs_real.zip", transform=transform)
# Sanity check only: this prints the DataLoader's default repr, not its contents.
print(train_loader)

Running in Google Colab
<torch.utils.data.dataloader.DataLoader object at 0x798651c96c50>


In [13]:
# Basic sanity statistics for the train and test loaders.

def print_loader_stats(split_name, loader):
    """Print the number of batches and the tensor shapes of the first batch.

    Args:
        split_name: Label used in the heading, e.g. "Training" or "Testing".
        loader: Iterable with a length that yields ``(images, labels)`` batches.

    Only the first batch is fetched. If the loader yields nothing, just the
    heading and the batch count are printed.
    """
    print(f"{split_name} dataset stats:")
    print(f"Number of batches: {len(loader)}")

    first_batch = next(iter(loader), None)
    if first_batch is None:
        return

    # Kept local to the helper so that no stray `images`/`labels` variables
    # linger in the notebook namespace after this cell has run.
    images, labels = first_batch
    print("Batch 0:")
    print(f"Images shape: {images.shape}")
    print(f"Labels shape: {labels.shape}")


print_loader_stats("Training", train_loader)
print_loader_stats("Testing", test_loader)


Training dataset stats:
Number of batches: 189
Batch 0:
Images shape: torch.Size([32, 3, 224, 224])
Labels shape: torch.Size([32])
Testing dataset stats:
Number of batches: 48
Batch 0:
Images shape: torch.Size([32, 3, 224, 224])
Labels shape: torch.Size([32])


### Defining the model

Using CLIP with a linear classifier on top

In [14]:
# 6) Build the classifier (CLIP backbone plus a linear head: embed_dim=512,
#    num_classes=2) and move it onto the selected device.
model = CLIPClassifier(clip_model, embed_dim=512, num_classes=2)
model = model.to(device)

# 7) Cross-entropy objective. Adam receives only the parameters of the
#    classifier head, so the optimizer never updates the backbone.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.classifier.parameters(), lr=1e-3)


### Training the model

In [15]:
# 8) Training loop: only the classifier head is optimised; CLIP stays frozen.
num_epochs = 1
for epoch in range(num_epochs):
    # perf_counter is a monotonic clock, which is the right tool for elapsed time.
    start_time = time.perf_counter()

    model.train()
    # model.train() switches every registered submodule to training mode. If the
    # CLIP backbone is registered inside CLIPClassifier (not visible here), that
    # would undo the earlier clip_model.eval(); re-assert inference mode so the
    # frozen feature extractor behaves the same in every epoch.
    clip_model.eval()

    running_loss = 0.0  # sum of per-sample losses over the epoch
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(inputs)            # Forward pass
        loss = criterion(outputs, labels)  # Mean loss over this batch
        loss.backward()                    # Backprop
        optimizer.step()                   # Update classifier head

        # criterion averages over the batch, so scale by the batch size to get
        # a sum; this keeps a shorter final batch from being over-weighted.
        running_loss += loss.item() * batch_size

        # Predicted class = index of the largest logit.
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += batch_size

    # Guard against an empty loader instead of raising ZeroDivisionError.
    epoch_loss = running_loss / total if total else float("nan")
    epoch_accuracy = correct / total * 100 if total else float("nan")
    end_time = time.perf_counter()

    print(f"Epoch [{epoch+1}/{num_epochs}], "
        f"Loss: {epoch_loss:.4f}, "
        f"Accuracy: {epoch_accuracy:.2f}%, "
        f"Time: {end_time - start_time:.2f} s")



Epoch [1/1], Loss: 0.3019, Accuracy: 89.68%, Time: 213.28 s


In [16]:
# 9) Persist the trained weights.
# NOTE(review): model.state_dict() contains every parameter registered on `model`;
# whether that includes the CLIP backbone or only the head depends on
# CLIPClassifier — confirm before relying on the checkpoint size or contents.
save_path = "models/clip_vit_classifier_fake_vs_real.pth"
# torch.save does not create missing directories, so make sure the folder exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model.state_dict(), save_path)
print("CLIP-based model saved successfully!")

CLIP-based model saved successfully!
