In [1]:
# Install necessary libraries
!pip install transformers peft datasets nltk tqdm pillow matplotlib

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim import AdamW
from PIL import Image
import os
import nltk
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm

nltk.download('punkt')

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# 1. Set up the environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
# Load captions
captions_file = "/content/drive/MyDrive/LLM Projects/flickr_dataset/captions.txt"
image_dir = "/content/drive/MyDrive/LLM Projects/flickr_dataset/Images"

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
captions_dict = {}
with open(captions_file, 'r') as f:
  lines = f.readlines()
  for line in lines[1:20000]:
        parts = line.strip().split(',', 1)
        if len(parts) == 2:
            image_file, caption = parts
            if image_file not in captions_dict:
                captions_dict[image_file] = caption
            # captions_dict[image_file].append(caption)

In [10]:
len(captions_dict)

4000

In [None]:
captions_dict

{'1000268201_693b08cb0e.jpg': 'A child in a pink dress is climbing up a set of stairs in an entry way .',
 '1001773457_577c3a7d70.jpg': 'A black dog and a spotted dog are fighting',
 '1002674143_1b742ab4b8.jpg': 'A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .',
 '1003163366_44323f5815.jpg': 'A man lays on a bench while his dog sits by him .',
 '1007129816_e794419615.jpg': 'A man in an orange hat starring at something .',
 '1007320043_627395c3d8.jpg': 'A child playing on a rope net .',
 '1009434119_febe49276a.jpg': 'A black and white dog is running in a grassy garden surrounded by a white fence .',
 '1012212859_01547e3f17.jpg': '"A dog shakes its head near the shore , a red ball next to it ."',
 '1015118661_980735411b.jpg': 'A boy smiles in front of a stony wall in a city .',
 '1015584366_dfcec3c85a.jpg': 'A black dog leaps over a log .',
 '101654506_8eb26cfb60.jpg': 'A brown and white dog is running through the snow .',
 '101669240_b2d3e7f1

In [19]:
# Load the fine-tuned model and processor
model_id = "Salesforce/blip-image-captioning-base"
model_path = '/content/drive/MyDrive/LLM Projects/model'
finetuned_model = AutoModelForVision2Seq.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_id)

In [20]:
import matplotlib.pyplot as plt
import torch
from PIL import Image
import os

def visualize_image_with_caption(image_path, caption):
    image = Image.open(image_path).convert('RGB')
    plt.imshow(image)
    plt.title(caption)
    plt.axis('off')
    plt.show()

def generate_and_plot_captions(model, processor, image_dir, captions_dict, device, num_images=5):
    model.eval()
    images = list(captions_dict.keys())[:num_images]
    with torch.no_grad():
        for image_file in images:
            image_path = os.path.join(image_dir, image_file)
            image = Image.open(image_path).convert('RGB')
            pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
            generated_output = model.generate(pixel_values=pixel_values, max_new_tokens=64)
            generated_caption = processor.batch_decode(generated_output, skip_special_tokens=True)[0]
            visualize_image_with_caption(image_path, generated_caption)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the device
finetuned_model.to(device)

# Generate and plot captions after finetuning
print("Generating captions with finetuned model:")
generate_and_plot_captions(finetuned_model, processor, image_dir, captions_dict, device)


Output hidden; open in https://colab.research.google.com to view.