In [1]:
!pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-fju3zfz9
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-fju3zfz9
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369497 sha256=1c8f876c116dfe77648ba23008f4d634c766c834c1214f1f5c8717d29eea2185
  Stored in directory: /tmp/pip-ephem-wheel-cache-8p4as3wy/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Inst

In [2]:
!pip install torch torchvision




In [3]:


# Import necessary libraries
import torch
import torchvision
import torchvision.transforms as transforms
import clip
from PIL import Image

# Check if CUDA is available and set device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP model
model, preprocess = clip.load("ViT-B/32", device=device)

# Define the transformation for CIFAR-10 images
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to fit CLIP's input dimensions
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load the CIFAR-10 dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False)

# Text descriptions for CIFAR-10 classes
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
text_descriptions = [f"a photo of a {cls}" for cls in classes]
text_tokens = clip.tokenize(text_descriptions).to(device)

# Inference loop
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in testloader:
        images = images.to(device)
        labels = labels.to(device)

        # Calculate image and text features
        image_features = model.encode_image(images)
        text_features = model.encode_text(text_tokens)

        # Determine similarity and predict class
        similarities = (image_features @ text_features.T).softmax(dim=-1)
        predicted_classes = similarities.argmax(dim=-1)

        # Evaluate predictions
        correct += (predicted_classes == labels).sum().item()
        total += labels.size(0)

# Calculate accuracy
accuracy = 100 * correct / total
print(f'Accuracy of CLIP on the CIFAR-10 test images: {accuracy}%')


100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 109MiB/s]


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 29852227.67it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Accuracy of CLIP on the CIFAR-10 test images: 78.08%


In [7]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import clip
from PIL import Image

# Custom Classifier Definition
class CustomCLIPClassifier(nn.Module):
    def __init__(self, clip_model, num_classes):
        super(CustomCLIPClassifier, self).__init__()
        self.clip_model = clip_model
        self.classifier = nn.Sequential(
            nn.Linear(1024, 512),  # Adjust dimensions to concatenate image and text features
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, image, text, labels):
        with torch.no_grad():
            image_features = self.clip_model.encode_image(image).float()  # Convert to float32
            text_features = self.clip_model.encode_text(text)[labels].float()  # Convert to float32 and select relevant text features
        features = torch.cat((image_features, text_features), dim=1)
        return self.classifier(features)

# Initialize device, model, and text descriptions
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Text descriptions for CIFAR-10 classes
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
text_descriptions = [f"a photo of a {cls}" for cls in classes]
text_tokens = clip.tokenize(text_descriptions).to(device)

# Instantiate the custom classifier model
num_classes = 10  # CIFAR-10 has 10 classes
model = CustomCLIPClassifier(clip_model, num_classes).to(device)

# Define the transformation for CIFAR-10 images
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to fit CLIP's input dimensions
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load the CIFAR-10 dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

# Set CLIP model parameters to not require gradients
for param in model.clip_model.parameters():
    param.requires_grad = False

# Training loop
num_epochs = 5  # Set the number of epochs

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs, text_tokens, labels)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 100 == 99:  # Print every 100 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

print('Finished Training')

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)
        outputs = model(images, text_tokens, labels)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the Custom CLIP Classifier on the CIFAR-10 test images: {accuracy}%')


Files already downloaded and verified
Files already downloaded and verified
[1,   100] loss: 0.060
[1,   200] loss: 0.002
[1,   300] loss: 0.001
[1,   400] loss: 0.001
[1,   500] loss: 0.000
[1,   600] loss: 0.000
[1,   700] loss: 0.000
[2,   100] loss: 0.000
[2,   200] loss: 0.000
[2,   300] loss: 0.000
[2,   400] loss: 0.000
[2,   500] loss: 0.000
[2,   600] loss: 0.000
[2,   700] loss: 0.000
[3,   100] loss: 0.000
[3,   200] loss: 0.000
[3,   300] loss: 0.000
[3,   400] loss: 0.000
[3,   500] loss: 0.000
[3,   600] loss: 0.000
[3,   700] loss: 0.000
[4,   100] loss: 0.000
[4,   200] loss: 0.000
[4,   300] loss: 0.000
[4,   400] loss: 0.000
[4,   500] loss: 0.000
[4,   600] loss: 0.000
[4,   700] loss: 0.000
[5,   100] loss: 0.000
[5,   200] loss: 0.000
[5,   300] loss: 0.000
[5,   400] loss: 0.000
[5,   500] loss: 0.000
[5,   600] loss: 0.000
[5,   700] loss: 0.000
Finished Training
Accuracy of the Custom CLIP Classifier on the CIFAR-10 test images: 100.0%


In [8]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import clip
from sklearn.model_selection import KFold

# Custom Classifier Definition
class CustomCLIPClassifier(nn.Module):
    def __init__(self, clip_model, num_classes):
        super(CustomCLIPClassifier, self).__init__()
        self.clip_model = clip_model
        self.classifier = nn.Sequential(
            nn.Linear(1024, 512),  # Adjust dimensions to concatenate image and text features
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, image, text, labels):
        with torch.no_grad():
            image_features = self.clip_model.encode_image(image).float()  # Convert to float32
            text_features = self.clip_model.encode_text(text)[labels].float()  # Convert to float32 and select relevant text features
        features = torch.cat((image_features, text_features), dim=1)
        return self.classifier(features)

# Initialize device, model, and text descriptions
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
num_classes = 10  # CIFAR-10 has 10 classes

# Text descriptions for CIFAR-10 classes
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
text_descriptions = [f"a photo of a {cls}" for cls in classes]
text_tokens = clip.tokenize(text_descriptions).to(device)

# Transformation for CIFAR-10 images
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to fit CLIP's input dimensions
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load the entire CIFAR-10 dataset (without splitting into train and test)
dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

# K-Fold Cross-Validation setup
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True)

# Cross-validation loop
for fold, (train_ids, val_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_ids)

    # Define data loaders for training and validation data in this fold
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=64, sampler=train_subsampler)
    valloader = torch.utils.data.DataLoader(dataset, batch_size=64, sampler=val_subsampler)

    # Init the neural network
    model = CustomCLIPClassifier(clip_model, num_classes).to(device)

    # Initialize optimizer, loss function, etc.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

    # Set CLIP model parameters to not require gradients
    for param in model.clip_model.parameters():
        param.requires_grad = False

    # Train the model for this fold
    num_epochs = 5  # Set the number of epochs
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs, text_tokens, labels)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 100 == 99:  # Print every 100 mini-batches
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
                running_loss = 0.0

    # Evaluate the model on validation set for this fold
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in valloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = model(images, text_tokens, labels)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Print accuracy for this fold
    accuracy = 100 * correct / total
    print(f'Accuracy for fold {fold}: {accuracy}% \n')

    # [Record performance metrics for each fold if necessary]


Files already downloaded and verified
FOLD 0
--------------------------------
[1,   100] loss: 0.059
[1,   200] loss: 0.002
[1,   300] loss: 0.001
[1,   400] loss: 0.001
[1,   500] loss: 0.000
[1,   600] loss: 0.000
[2,   100] loss: 0.000
[2,   200] loss: 0.000
[2,   300] loss: 0.000
[2,   400] loss: 0.000
[2,   500] loss: 0.000
[2,   600] loss: 0.000
[3,   100] loss: 0.000
[3,   200] loss: 0.000
[3,   300] loss: 0.000
[3,   400] loss: 0.000
[3,   500] loss: 0.000
[3,   600] loss: 0.000
[4,   100] loss: 0.000
[4,   200] loss: 0.000
[4,   300] loss: 0.000
[4,   400] loss: 0.000
[4,   500] loss: 0.000
[4,   600] loss: 0.000
[5,   100] loss: 0.000
[5,   200] loss: 0.000
[5,   300] loss: 0.000
[5,   400] loss: 0.000
[5,   500] loss: 0.000
[5,   600] loss: 0.000
Accuracy for fold 0: 100.0% 

FOLD 1
--------------------------------
[1,   100] loss: 0.060
[1,   200] loss: 0.002
[1,   300] loss: 0.001
[1,   400] loss: 0.001
[1,   500] loss: 0.000
[1,   600] loss: 0.000
[2,   100] loss: 0.000
[