# Task 3: Bird Species Classifier
- In this notebook, we train classifiers on top of CLIP embeddings.
- Logistic regression, MLP, and Random Forest classifiers are trained on CLIP image and text embeddings of the CUB-200-2011 bird dataset.
- Each classifier predicts the bird species from the embeddings, and we evaluate its accuracy on the training split and on a held-out test split.
- The trained model is saved for use in the bird species exploration and retrieval application.

In [1]:
# !pip install ftfy regex tqdm
# !pip install git+https://github.com/openai/CLIP.git
# !pip install catboost
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# import clip
import joblib
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os, json
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# !pip install numpy==1.23
# from catboost import CatBoostClassifier

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device  # last expression of the cell: display the chosen device

'cpu'

In [None]:
# Load the CUB-200-2011 dataset tables from the local data directory.
data_dir = './data/CUB_200_2011'
images_dir, parts_dir = (os.path.join(data_dir, sub) for sub in ('images', 'parts'))

# NOTE(review): load_cub_dataset is not defined or imported in the visible cells —
# confirm where it comes from so this cell survives Restart & Run All.
(
    images, labels_df, classes, bounding_boxes, parts, part_locs,
    parts_click_locs, attributes, certainties, image_attribute_labels,
    llava_captions,
) = load_cub_dataset(data_dir)

# Preview the first rows of the three core tables, then print their shapes.
print(*(table.head() for table in (images, labels_df, classes)), sep="\n")
print(*(table.shape for table in (images, labels_df, classes)), sep="\n")

  image_attribute_labels = pd.read_csv(os.path.join(data_dir, 'attributes/image_attribute_labels.txt'),


   image_id                                          file_path
0         1  001.Black_footed_Albatross/Black_Footed_Albatr...
1         2  001.Black_footed_Albatross/Black_Footed_Albatr...
2         3  001.Black_footed_Albatross/Black_Footed_Albatr...
3         4  001.Black_footed_Albatross/Black_Footed_Albatr...
4         5  001.Black_footed_Albatross/Black_Footed_Albatr...
   image_id  class_id
0         1         1
1         2         1
2         3         1
3         4         1
4         5         1
   class_id                  class_name
0         1  001.Black_footed_Albatross
1         2        002.Laysan_Albatross
2         3         003.Sooty_Albatross
3         4       004.Groove_billed_Ani
4         5          005.Crested_Auklet
(11788, 2)
(11788, 2)
(200, 2)


In [5]:
# Read the precomputed CLIP image and text embeddings stored under data_dir.
clip_embeds_imgs, clip_embeds_text = (
    np.load(os.path.join(data_dir, fname))
    for fname in ('clip_embeds_imgs.npy', 'clip_embeds_text.npy')
)
print(clip_embeds_imgs.shape, clip_embeds_text.shape)

(11788, 512) (11788, 512)


In [6]:
# Fuse the two modalities with an element-wise mean of the image and text embeddings.
# Alternative kept for reference: concatenate along the feature axis instead of averaging.
# clip_embeds = np.concatenate((clip_embeds_imgs, clip_embeds_text), axis=1)
clip_embeds = np.add(clip_embeds_imgs, clip_embeds_text)
clip_embeds = clip_embeds / 2.0
print(np.shape(clip_embeds))

(11788, 512)


In [7]:
# Read the label array stored next to the embeddings and report its shape.
labels = np.load(
    os.path.join(data_dir, 'labels.npy')
)
print(np.shape(labels))

(11788,)


## CLIP

In [None]:
# `import clip` is commented out in the imports cell, so bring the package into scope
# here; without it this cell raises NameError on a fresh kernel.
import clip

# Load the ViT-B/32 CLIP model together with its image preprocessing transform on `device`.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

100%|███████████████████████████████████████| 338M/338M [00:19<00:00, 17.8MiB/s]


## Classifier

## Logistic Regression Classifier

In [None]:
# Logistic regression on the averaged image/text CLIP embeddings.

# Nominal 80/20 sizes (informational only; train_test_split performs the actual split).
dataset_size = len(clip_embeds)
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

X_train, X_test, y_train, y_test = train_test_split(
    clip_embeds,
    labels,
    test_size=0.2,
    random_state=42,
)

# C=0.1: in scikit-learn a smaller C means stronger regularization.
clf = LogisticRegression(
    C=0.1,
    solver="lbfgs",
    multi_class="multinomial",
    max_iter=1000,
).fit(X_train, y_train)

# Predict on both splits, then score each against its ground truth.
train_preds, test_preds = clf.predict(X_train), clf.predict(X_test)
train_acc, test_acc = (
    accuracy_score(y_train, train_preds),
    accuracy_score(y_test, test_preds),
)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")



Train Accuracy: 0.9317
Test Accuracy: 0.7867


In [None]:
# Save the trained model
joblib.dump(clf, os.path.join(data_dir, 'classifier.pkl'))

['/content/drive/MyDrive/Bird-Species-Exploration-and-Retrieval/Dataset/CUB_200_2011/classifier_concat.pkl']

In [None]:
# Display the path of the 'images' sub-directory under data_dir (quick sanity check).
os.path.join(data_dir, 'images')

'/content/drive/MyDrive/Bird-Species-Exploration-and-Retrieval/Dataset/CUB_200_2011/images'

In [None]:
# Logistic regression on the CLIP image embeddings only.

# Nominal 80/20 sizes (informational only; train_test_split performs the actual split).
dataset_size = len(clip_embeds)
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

# Same test fraction and random_state as the other logistic-regression cells.
X_train, X_test, y_train, y_test = train_test_split(
    clip_embeds_imgs, labels,
    test_size=0.2, random_state=42,
)

clf_img = LogisticRegression(
    solver="lbfgs",
    multi_class="multinomial",
    C=0.1,
    max_iter=1000,
)
clf_img.fit(X_train, y_train)

# Accuracy on the data the model was fitted on, then on the held-out split.
train_preds = clf_img.predict(X_train)
train_acc = accuracy_score(y_train, train_preds)
test_preds = clf_img.predict(X_test)
test_acc = accuracy_score(y_test, test_preds)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")



Train Accuracy: 0.9123
Test Accuracy: 0.7502


In [None]:
# Persist the image-only classifier as classifier_img.pkl under data_dir.
joblib.dump(value=clf_img, filename=os.path.join(data_dir, 'classifier_img.pkl'))

['/content/drive/MyDrive/Bird-Species-Exploration-and-Retrieval/Dataset/CUB_200_2011/classifier_img.pkl']

In [None]:
# Logistic regression on the CLIP text embeddings only.

# Nominal 80/20 sizes (informational only; train_test_split performs the actual split).
dataset_size = len(clip_embeds)
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

X_train, X_test, y_train, y_test = train_test_split(
    clip_embeds_text,
    labels,
    test_size=0.2,
    random_state=42,
)

# No C is passed here, so scikit-learn's default regularization strength applies.
clf_text = LogisticRegression(
    solver="lbfgs",
    multi_class="multinomial",
    max_iter=1000,
).fit(X_train, y_train)

train_preds, test_preds = clf_text.predict(X_train), clf_text.predict(X_test)

# Score each split against its ground-truth labels.
train_acc, test_acc = (
    accuracy_score(y_train, train_preds),
    accuracy_score(y_test, test_preds),
)

# Report both accuracies.
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")



Train Accuracy: 0.8670
Test Accuracy: 0.7918


In [None]:
# Persist the text-only classifier as classifier_text.pkl under data_dir.
joblib.dump(value=clf_text, filename=os.path.join(data_dir, 'classifier_text.pkl'))

['/content/drive/MyDrive/Bird-Species-Exploration-and-Retrieval/Dataset/CUB_200_2011/classifier_text.pkl']

In [None]:

# Reload the classifier stored as classifier.pkl under data_dir and bind it to `clf`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load files you produced or trust.
clf_path = os.path.join(data_dir, 'classifier.pkl')
clf = joblib.load(clf_path)

In [None]:
def _embed_image(img_path):
    """Return the CLIP image embedding of the file at ``img_path`` as a NumPy array."""
    # Close the file handle as soon as the pixels are converted to RGB.
    with Image.open(img_path) as raw:
        img = raw.convert('RGB')
    batch = clip_preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        return clip_model.encode_image(batch).cpu().numpy()


def _embed_text(text):
    """Return the CLIP text embedding of ``text`` as a NumPy array."""
    tokens = clip.tokenize(text).to(device)
    with torch.no_grad():
        return clip_model.encode_text(tokens).cpu().numpy()


def baseline(img_path=None, text=None):
    """Predict a bird class from an image, a text description, or both.

    The query is embedded with CLIP and passed to the logistic-regression
    classifier trained on the matching kind of embedding:

    * image only   -> ``clf_img``
    * text only    -> ``clf_text``
    * image + text -> ``clf``, on the average of the two embeddings

    Relies on the notebook-level names ``clip``, ``clip_model``,
    ``clip_preprocess``, ``device``, ``clf``, ``clf_img`` and ``clf_text``.

    Args:
        img_path: Path to an image file, or None.
        text: Text query, or None.

    Returns:
        The first element of the chosen classifier's ``predict`` output.

    Raises:
        ValueError: If both ``img_path`` and ``text`` are None.
    """
    if img_path is None and text is None:
        raise ValueError("Both img_path and text cannot be None")

    if text is None:
        return clf_img.predict(_embed_image(img_path))[0]

    if img_path is None:
        # Use the text-only classifier, mirroring the image-only branch; `clf` was
        # fitted on averaged image/text embeddings, not on raw text embeddings.
        return clf_text.predict(_embed_text(text))[0]

    # Both inputs given: average the embeddings, matching how `clip_embeds` is built.
    features = (_embed_image(img_path) + _embed_text(text)) / 2.0
    return clf.predict(features)[0]

## MLP Classifier

In [20]:
class MLPClassifier(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear, returning raw class scores.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the hidden layer (default 256).
        num_classes: Number of output scores (default 200).
    """

    def __init__(self, input_size, hidden_size=256, num_classes=200):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map a batch of feature vectors to unnormalised class scores."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
# Build the train/test tensors and datasets for the MLP from the averaged embeddings.

# Nominal 80/20 sizes (informational only; train_test_split performs the actual split).
dataset_size = len(clip_embeds)
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

X_train, X_test, y_train, y_test = train_test_split(
    clip_embeds,
    labels,
    test_size=0.2,
    random_state=42,
)

# Features become float32 tensors; labels become int64 (torch.long) tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long) for targets in (y_train, y_test)
)

# Pair each feature row with its label.
train_dataset, test_dataset = (
    TensorDataset(X_train_tensor, y_train_tensor),
    TensorDataset(X_test_tensor, y_test_tensor),
)

In [22]:
# Seed torch so weight initialisation and batch shuffling are reproducible.
torch.manual_seed(42)

# Sizes are passed by keyword: positionally, the second argument of MLPClassifier is
# hidden_size, not num_classes. hidden_size=200 keeps the layer widths of the
# positional call MLPClassifier(512, 200).
# The model is moved to `device` so it matches the batches sent there below.
mlp_model = MLPClassifier(input_size=512, hidden_size=200, num_classes=200).to(device)
batch_size = 32

train_loader_mlp = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader_mlp = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    mlp_model.train()
    total_loss = 0  # sum of per-batch losses over the epoch (not a mean)

    for X_batch, y_batch in train_loader_mlp:
        # Batches must live on the same device as the model.
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = mlp_model(X_batch)
        # Shift labels down by one so they index the model's outputs from 0.
        # assumes labels.npy holds 1-based class ids — TODO confirm
        loss = criterion(outputs, y_batch - 1)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - MLP Loss: {total_loss:.4f}")


def _predict_mlp(loader):
    """Run `mlp_model` over `loader`; return (predicted labels, true labels) as lists.

    Predictions are shifted up by one to undo the label shift used during training.
    """
    mlp_model.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for features, targets in loader:
            scores = mlp_model(features.to(device))
            predicted.extend(torch.argmax(scores, dim=1).cpu().numpy() + 1)
            actual.extend(targets.numpy())
    return predicted, actual


# Accuracy on the training split.
y_pred_mlp, y_true = _predict_mlp(train_loader_mlp)
mlp_accuracy = accuracy_score(y_true, y_pred_mlp)
print(f"MLP Train Accuracy: {mlp_accuracy:.4f}")

# Accuracy on the held-out test split.
y_pred_mlp_test, y_true_test = _predict_mlp(test_loader_mlp)
mlp_accuracy = accuracy_score(y_true_test, y_pred_mlp_test)
print(f"MLP Test Accuracy: {mlp_accuracy:.4f}")

Epoch 1/10 - MLP Loss: 1195.4645
Epoch 2/10 - MLP Loss: 554.5718
Epoch 3/10 - MLP Loss: 371.9142
Epoch 4/10 - MLP Loss: 301.1002
Epoch 5/10 - MLP Loss: 262.7152
Epoch 6/10 - MLP Loss: 234.3354
Epoch 7/10 - MLP Loss: 214.1758
Epoch 8/10 - MLP Loss: 196.3562
Epoch 9/10 - MLP Loss: 182.9171
Epoch 10/10 - MLP Loss: 171.4317
MLP Train Accuracy: 0.8423
MLP Test Accuracy: 0.7358


## Random Forests

In [9]:
# Recreate the train/test split of the averaged embeddings for the Random Forest.

# Nominal 80/20 sizes (informational only; train_test_split performs the actual split).
dataset_size = len(clip_embeds)
train_size = int(dataset_size * 0.8)
test_size = dataset_size - train_size

X_train, X_test, y_train, y_test = train_test_split(
    clip_embeds,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest on the current X_train / y_train split.
# The model is bound to `rf_clf` rather than `clf`: `clf` is the logistic-regression
# model that baseline() reads, so rebinding it here would silently change what
# baseline() predicts with.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,          # limit tree depth (regularization)
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
)
rf_clf.fit(X_train, y_train)

# Predict on both splits.
y_train_pred = rf_clf.predict(X_train)
y_test_pred = rf_clf.predict(X_test)

# Score each split against its ground-truth labels.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Train Accuracy: {train_accuracy:.2f}')
print(f'Test Accuracy: {test_accuracy:.2f}')

Train Accuracy: 0.96
Test Accuracy: 0.58
