In [None]:
# Fine-tune CLIP with LoRA on image / LLaVA-description pairs, then use the
# model for image-to-text and text-to-image retrieval.

In [None]:
!pip install ftfy regex tqdm
# !pip install git+https://github.com/openai/CLIP.git
!pip install --upgrade -q accelerate bitsandbytes
!pip install git+https://github.com/huggingface/transformers.git
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import BitsAndBytesConfig
import torch
import torch.nn as nn
from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader, random_split
# import clip
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os, json
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from peft import LoraConfig, get_peft_model
from transformers import CLIPProcessor, CLIPModel

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
[31mERROR: Operation cancelled by user[0m[31m
[0m^C
^C


In [None]:
# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# The same checkpoint name is used for the preprocessor and the model weights.
model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name)
clip_model = clip_model.to(device)

In [None]:

# Wrap CLIP with LoRA adapters on the modules listed in target_modules.
config = LoraConfig(
    target_modules=["visual_projection", "text_projection"],  # vision & text projections
    r=8,  # low-rank factor
    lora_alpha=32,
    lora_dropout=0.1,
)
# NOTE: this rebinds clip_model to the wrapped model, so the cell is meant to
# run once per freshly loaded model.
clip_model = get_peft_model(clip_model, config)

In [None]:


# Load the sample records from dataset.json. Later cells read each entry's
# "image_path" and "llava_text" fields. The encoding is pinned to UTF-8 so
# the descriptions decode the same way on every platform.
with open("dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Custom Dataset class that pairs every sample with a randomly drawn negative text
class BirdDataset(Dataset):
    """Image/description pairs, each served with one mismatched description.

    Args:
        dataset: Mapping whose values are dicts providing an ``"image_path"``
            and a ``"llava_text"`` entry. Only the values are kept.
    """

    def __init__(self, dataset):
        self.data = list(dataset.values())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, text, negative_text)`` for the sample at ``idx``.

        ``image`` is the sample's image converted to RGB, ``text`` is its own
        description and ``negative_text`` is the description of a different,
        uniformly chosen sample. The negative is only guaranteed to come from
        another sample; its text can still coincide if two samples share the
        same description.

        Raises:
            ValueError: If the dataset holds fewer than two samples, because
                no distinct negative exists.
        """
        num_samples = len(self.data)
        if num_samples < 2:
            raise ValueError(
                "BirdDataset needs at least two samples to draw a negative text"
            )

        positive_sample = self.data[idx]

        # Load the positive image; the context manager closes the file handle
        # once the RGB copy has been materialised.
        with Image.open(positive_sample["image_path"]) as raw_image:
            image = raw_image.convert("RGB")

        # Text description (LLaVA-generated)
        text = positive_sample["llava_text"]

        # A non-zero offset modulo the dataset size can never land on the
        # positive sample itself, so no retry loop is needed (this also holds
        # for negative indices).
        offset = int(np.random.randint(1, num_samples))
        neg_idx = (idx + offset) % num_samples
        negative_text = self.data[neg_idx]["llava_text"]

        return image, text, negative_text




In [None]:
def collate_triplets(batch):
    """Collate ``(image, text, negative_text)`` samples into three lists.

    The DataLoader's default collate function cannot batch PIL images, so the
    images are kept as a plain list for the CLIP processor to convert later.
    """
    images, texts, negative_texts = zip(*batch)
    return list(images), list(texts), list(negative_texts)


# DataLoader
train_dataset = BirdDataset(dataset)
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_triplets,
)

# Training Configuration
# Only parameters that require gradients are handed to the optimizer.
trainable_params = [p for p in clip_model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=5e-5)
loss_fn = torch.nn.CosineEmbeddingLoss()  # Contrastive loss
epochs = 5



In [None]:
def retrieve_text(image_path, batch_size=32):
    """Return the dataset description that best matches an image.

    Args:
        image_path: Path of the query image.
        batch_size: Number of descriptions scored per forward pass; must be
            a positive integer.

    Returns:
        The ``"llava_text"`` of the dataset entry whose text embedding has the
        highest cosine similarity with the image embedding.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if not dataset:
        raise ValueError("dataset is empty; there is no text to retrieve")

    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    keys = list(dataset)
    texts = [dataset[key]["llava_text"] for key in keys]

    # Score in eval mode so dropout does not perturb the similarities, then
    # restore whatever mode the model was in.
    was_training = clip_model.training
    clip_model.eval()
    similarities = []
    try:
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                # CLIPModel.forward needs both modalities, so the query image
                # is passed along with every chunk of texts. truncation=True
                # keeps long descriptions within the text encoder's limit.
                inputs = processor(
                    text=texts[start:start + batch_size],
                    images=image,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                ).to(device)
                outputs = clip_model(**inputs)
                # The single image embedding broadcasts against the chunk.
                scores = torch.nn.functional.cosine_similarity(
                    outputs.image_embeds, outputs.text_embeds
                )
                similarities.extend(scores.tolist())
    finally:
        clip_model.train(was_training)

    best_position = max(range(len(keys)), key=similarities.__getitem__)
    return dataset[keys[best_position]]["llava_text"]

# Example test
test_image = "path/to/test/image.jpg"  # placeholder: replace with a real image path
# Skip the demo while the placeholder is in place so that running the whole
# notebook does not stop here on a missing file.
if os.path.exists(test_image):
    print(retrieve_text(test_image))
else:
    print(f"Skipping retrieve_text example: {test_image!r} does not exist")


In [None]:
# Training Loop
clip_model.train()

for epoch in range(epochs):
    # Running totals for the epoch summary printed after the inner loop.
    total_loss, total_correct, total_samples, num_batches = 0.0, 0, 0, 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)

    for images, texts, negative_texts in progress_bar:
        images, texts, negative_texts = list(images), list(texts), list(negative_texts)
        batch_size = len(texts)

        # One forward pass covers the images, their own texts and the negative
        # texts, so the images are encoded once per step. truncation=True
        # keeps long descriptions within the text encoder's limit.
        inputs = processor(
            text=texts + negative_texts,
            images=images,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)

        # Forward pass
        outputs = clip_model(**inputs)

        image_features = outputs.image_embeds
        # The first half of the text embeddings belongs to the positive texts,
        # the second half to the negatives (same order as the concatenation).
        text_features = outputs.text_embeds[:batch_size]
        neg_text_features = outputs.text_embeds[batch_size:]

        # Compute loss (positive pair should be close, negative should be far)
        target_pos = torch.ones(batch_size, device=device)  # Positive pair target
        target_neg = -target_pos  # Negative pair target

        loss_pos = loss_fn(image_features, text_features, target_pos)
        loss_neg = loss_fn(image_features, neg_text_features, target_neg)

        loss = loss_pos + loss_neg  # Combine losses

        # Accuracy: fraction of samples whose positive similarity beats the
        # negative one. The comparison is made per sample on tensors; the
        # batch means are only used for the progress display.
        with torch.no_grad():
            pos_sims = torch.nn.functional.cosine_similarity(image_features, text_features)
            neg_sims = torch.nn.functional.cosine_similarity(image_features, neg_text_features)
            batch_correct = (pos_sims > neg_sims).sum().item()
        pos_sim = pos_sims.mean().item()
        neg_sim = neg_sims.mean().item()
        accuracy = batch_correct / batch_size

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Logging
        total_loss += loss.item()
        total_correct += batch_correct
        total_samples += batch_size
        num_batches += 1

        progress_bar.set_postfix({"Loss": f"{loss.item():.4f}", "Acc": f"{accuracy:.2%}", "Pos Sim": f"{pos_sim:.3f}", "Neg Sim": f"{neg_sim:.3f}"})

    # Loss is averaged over batches, accuracy over individual samples; the
    # max(..., 1) guards keep an empty loader from dividing by zero.
    avg_loss = total_loss / max(num_batches, 1)
    avg_acc = total_correct / max(total_samples, 1)
    print(f"Epoch {epoch+1}: Avg Loss = {avg_loss:.4f}, Avg Accuracy = {avg_acc:.2%}")

# Save the fine-tuned model
clip_model.save_pretrained("clip_lora_bird_retrieval")

In [None]:
def retrieve_image(query_text, batch_size=32):
    """Return the path of the dataset image that best matches a text query.

    Args:
        query_text: Free-text description to search with.
        batch_size: Number of images loaded and scored per forward pass; must
            be a positive integer.

    Returns:
        The ``"image_path"`` of the dataset entry whose image embedding has
        the highest cosine similarity with the query's text embedding.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if not dataset:
        raise ValueError("dataset is empty; there is no image to retrieve")

    keys = list(dataset)

    # Score in eval mode so dropout does not perturb the similarities, then
    # restore whatever mode the model was in.
    was_training = clip_model.training
    clip_model.eval()
    similarities = []
    try:
        with torch.no_grad():
            for start in range(0, len(keys), batch_size):
                images = []
                for key in keys[start:start + batch_size]:
                    with Image.open(dataset[key]["image_path"]) as raw_image:
                        images.append(raw_image.convert("RGB"))

                # CLIPModel.forward needs both modalities, so the query text
                # is passed along with every chunk of images.
                inputs = processor(
                    text=[query_text],
                    images=images,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                ).to(device)
                outputs = clip_model(**inputs)
                # The single text embedding broadcasts against the chunk.
                scores = torch.nn.functional.cosine_similarity(
                    outputs.text_embeds, outputs.image_embeds
                )
                similarities.extend(scores.tolist())
    finally:
        clip_model.train(was_training)

    best_position = max(range(len(keys)), key=similarities.__getitem__)
    return dataset[keys[best_position]]["image_path"]

# Demo: look up the dataset image that best matches a free-text query.
query_text = (
    "A cliff swallow with a red forehead perched on a wooden post."
)
print(retrieve_image(query_text=query_text))
