In [None]:
import torch
import clip
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd


from transformers import BlipProcessor, BlipForConditionalGeneration, LlavaProcessor, LlavaForConditionalGeneration

### Hyperparameters

In [2]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

### Dataset

In [3]:
# Load the CUB-200-2011 dataset
def load_cub_dataset(data_dir):
    images = pd.read_csv(os.path.join(data_dir, 'images.txt'), sep=' ', names=['image_id', 'file_path'])
    labels = pd.read_csv(os.path.join(data_dir, 'image_class_labels.txt'), sep=' ', names=['image_id', 'class_id'])
    classes = pd.read_csv(os.path.join(data_dir, 'classes.txt'), sep=' ', names=['class_id', 'class_name'])
    return images, labels, classes
data_dir = 'data'

images, labels, classes = load_cub_dataset(data_dir)

print(images.head())
print(labels.head())
print(classes.head())

print(images.shape)
print(labels.shape)
print(classes.shape)

   image_id                                          file_path
0         1  001.Black_footed_Albatross/Black_Footed_Albatr...
1         2  001.Black_footed_Albatross/Black_Footed_Albatr...
2         3  001.Black_footed_Albatross/Black_Footed_Albatr...
3         4  001.Black_footed_Albatross/Black_Footed_Albatr...
4         5  001.Black_footed_Albatross/Black_Footed_Albatr...
   image_id  class_id
0         1         1
1         2         1
2         3         1
3         4         1
4         5         1
   class_id                  class_name
0         1  001.Black_footed_Albatross
1         2        002.Laysan_Albatross
2         3         003.Sooty_Albatross
3         4       004.Groove_billed_Ani
4         5          005.Crested_Auklet
(11788, 2)
(11788, 2)
(200, 2)


## Task 1

### CLIP

In [None]:
clip_model, clip_preprocess = clip.load("ViT-B/32", device=DEVICE, jit=False)
clip_model.eval()

def get_clip_features(img_path):
    img = Image.open(img_path)
    img_input = clip_preprocess(img).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        img_features = clip_model.encode_image(img_input)
    return img_features

def get_clip_text_features(text):
    text_input = clip.tokenize([text]).to(DEVICE)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_input)
    return text_features

def get_clip_similarity_score(img_features, text_features):
    img_features /= img_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity_score = (100.0 * img_features @ text_features.T).softmax(dim=-1)
    return similarity_score.item()

def recognize_bird_species(img_path):
    img_features = get_clip_features(img_path)
    similarities = []
    for class_name in classes['class_name']:
        text_features = get_clip_text_features(class_name)
        similarity_score = get_clip_similarity_score(img_features, text_features)
        similarities.append((class_name, similarity_score))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[0][0]

In [45]:
img_id = 1
image_name = images.iloc[img_id]['file_path']
gt_class = labels.iloc[img_id]['class_id']
bird_species = recognize_bird_species(os.path.join(os.path.join(data_dir, 'images'), image_name))
print(f'Predicted class: {bird_species}')
print(f"GT class: {classes[classes['class_id'] == gt_class]['class_name'][0]}")

Predicted class: 001.Black_footed_Albatross
GT class: 001.Black_footed_Albatross


#### Run CLIP on all images

In [None]:
accuracy = 0
with open('clip_predictions.txt', 'w') as f:
    for img_id in tqdm(images['image_id']):
        image_name = images.iloc[img_id]['file_path']
        gt_class = labels.iloc[img_id]['class_id']
        gt_class = classes[classes['class_id'] == gt_class]['class_name'][0]
        bird_species = recognize_bird_species(os.path.join(os.path.join(data_dir, 'images'), image_name))
        f.write(f'{img_id} {int(gt_class == bird_species)}\n')
        accuracy += int(gt_class == bird_species)
    accuracy /= len(images)
    print(f'Accuracy: {accuracy}')

## Task 2

In [None]:
img_path = 'data/images/001.Black_footed_Albatross/Black_Footed_Albatross_0001_796111.jpg'
img = Image.open(img_path).convert('RGB')

### BLIP-2

In [None]:
blip_model_path = "Salesforce/blip2-opt-2.7b"

blip_processor = BlipProcessor.from_pretrained(blip_model_path, trust_remote_code=True)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_path, trust_remote_code=True)

def generate_blip_caption(image):
    inputs = blip_processor(image, return_tensors="pt")
    output = blip_model.generate(**inputs)
    return blip_processor.decode(output[0], skip_special_tokens=True)

  else:
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'BertTokenizerFast'.


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\miret\miniconda3\envs\cv\lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\miret\AppData\Local\Temp\ipykernel_19016\2979974320.py", line 3, in <module>
    blip_processor = BlipProcessor.from_pretrained(blip_model_path, trust_remote_code=True)
  File "c:\Users\miret\miniconda3\envs\cv\lib\site-packages\transformers\processing_utils.py", line 466, in from_pretrained
  File "c:\Users\miret\miniconda3\envs\cv\lib\site-packages\transformers\processing_utils.py", line 512, in _get_arguments_from_pretrained
  File "c:\Users\miret\miniconda3\envs\cv\lib\site-packages\transformers\tokenization_utils_base.py", line 2029, in from_pretrained
  File "c:\Users\miret\miniconda3\envs\cv\lib\site-packages\transformers\tokenization_utils_base.py", line 2261, in _from_pretrained
    return tokenizer
  File "c:\Users\miret\miniconda3\envs\cv\lib\site-package

In [None]:

blip_caption = generate_blip_caption(img)

print("BLIP-2 Caption:", blip_caption)


NameError: name 'generate_blip_caption' is not defined

### Llava

In [None]:
llava_processor = LlavaProcessor.from_pretrained("liuhaotian/llava-v1.5-7b")
llava_model = LlavaForConditionalGeneration.from_pretrained("liuhaotian/llava-v1.5-7b")

def generate_llava_caption(image):
    inputs = llava_processor(image, return_tensors="pt")
    output = llava_model.generate(**inputs)
    return llava_processor.decode(output[0], skip_special_tokens=True)

In [None]:
llava_caption = generate_llava_caption(img)
print("LLaVA Caption:", llava_caption)
