# Task 1: CLIP Predictions
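- In this notebook we use CLIP to predict a class for each image of the CUB-200-2011 dataset.
- The embeddings are generated using the OpenAI CLIP model (ViT-B/32).
- We save the predictions in a text file (`clip_predictions.txt`), one line per image: image id, predicted class id, and a 1/0 flag indicating whether the prediction matches the ground truth.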

In [None]:
import torch
import clip
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from tqdm import tqdm
from utils import *

In [None]:
# Prefer the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Bare expression so the notebook displays the selected device.
DEVICE

'cuda'

### Dataset

In [None]:
# Load the CUB-200-2011 dataset

data_dir = 'data/CUB_200_2011'

# The loader returns eleven values; this notebook only uses the first three
# (the images table, the per-image labels and the class list).
images, labels, classes, _, _, _, _, _, _, _, _ = load_cub_dataset(data_dir)

# Preview the first rows of each object, in the order images / labels / classes.
for table in (images, labels, classes):
    print(table.head())

# Then report their shapes in the same order.
for table in (images, labels, classes):
    print(table.shape)

### CLIP

In [None]:
# Load the ViT-B/32 CLIP checkpoint onto DEVICE as a regular (non-JIT) model.
# clip.load returns the model together with its companion preprocessing object.
clip_model, clip_preprocess = clip.load(name="ViT-B/32", device=DEVICE, jit=False)
# Put the model in evaluation mode; as the last expression this also displays the model.
clip_model.eval()

In [None]:
# Sanity check on a single image: run the classifier once and compare the
# predicted class with the ground truth.
img_id = 1  # positional row index into `images` / `labels` (used with .iloc)
image_name = images.iloc[img_id]['file_path']
gt_class = labels.iloc[img_id]['class_id']

# Use .iloc[0] to take the first matching row by position. Plain `[0]` on the
# filtered Series is a label lookup, which on an integer index only works when
# the match happens to sit at index label 0 and raises KeyError otherwise.
gt_class_name = classes.loc[classes['class_id'] == gt_class, 'class_name'].iloc[0]

# Pass `classes` the same way the full-dataset cell does so both cells call the
# helper identically.
# NOTE(review): recognize_bird_species comes from `utils`; confirm its signature
# takes the class table as the second argument.
bird_species = recognize_bird_species(os.path.join(data_dir, 'images', image_name), classes)
print(bird_species)

pred_class = classes.loc[classes['class_name'] == bird_species, 'class_id'].iloc[0]
print(f'Predicted class: {pred_class}, class name: {bird_species}')
print(f"GT class: {gt_class}, class name: {gt_class_name}")

#### Run CLIP on all images

In [None]:
# Classify every image and log one line per image to clip_predictions.txt:
#   "<image_id> <predicted class_id> <1 if the prediction is correct else 0>"
images_dir = os.path.join(data_dir, 'images')
num_correct = 0
with open('clip_predictions.txt', 'w') as f:
    # Iterate by row position. The `image_id` values must not be used as .iloc
    # positions: in the raw CUB-200-2011 files they start at 1, which would
    # shift every lookup by one row and overrun the frame on the last id.
    for row_idx in tqdm(range(len(images))):
        image_row = images.iloc[row_idx]
        img_id = image_row['image_id']
        image_name = image_row['file_path']
        # assumes `labels` is row-aligned with `images` — TODO confirm in load_cub_dataset
        gt_class = labels.iloc[row_idx]['class_id']

        bird_species = recognize_bird_species(os.path.join(images_dir, image_name), classes)
        # .iloc[0] selects the first match by position; plain `[0]` would be a
        # label lookup and raise KeyError unless the match has index label 0.
        pred_class = classes.loc[classes['class_name'] == bird_species, 'class_id'].iloc[0]

        is_correct = int(gt_class == pred_class)
        f.write(f'{img_id} {pred_class} {is_correct}\n')
        num_correct += is_correct

# Fraction of correct predictions; an empty dataset yields 0.0 rather than a
# ZeroDivisionError.
accuracy = num_correct / len(images) if len(images) else 0.0
print(f'Accuracy: {accuracy}')

In [22]:
# Recompute accuracy from a saved predictions file. Each non-empty line is
# expected to end with a correctness flag whose first character is 0 or 1.
predictions_path = 'clip_predictions_1_3000.txt'
num_correct = 0
total = 0
with open(predictions_path, 'r') as f:
    # Stream the file line by line instead of materialising it with readlines().
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        num_correct += int(fields[-1][0])
        total += 1

if total == 0:
    # Fail with a clear message instead of a ZeroDivisionError.
    raise ValueError(f'No predictions found in {predictions_path}')

accuracy = num_correct / total
print(f'Accuracy: {np.round(accuracy*100, 2)}%')

Accuracy: 39.81%
