In [15]:
import numpy as np
import pandas as pd
import os
import torch
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, GPT2TokenizerFast
from PIL import Image
from torch.utils.data import DataLoader, Dataset

# Define dataset paths.
# The dataset root can be overridden through the CAPTION_DATASET_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset (or empty) the original local path is used.
dataset_path = os.environ.get("CAPTION_DATASET_DIR") or r"C:\Users\nalla\Desktop\AD\dataset"
image_folder = os.path.join(dataset_path, "Images")
captions_file = os.path.join(dataset_path, "captions.txt")  # Ensure correct filename

# Fail early, naming the exact path, when the captions file is missing.
if not os.path.isfile(captions_file):
    raise FileNotFoundError(f"Captions file not found: {captions_file}")

# Load dataset
df = pd.read_csv(captions_file)
# Rename the columns to the names used by the rest of the notebook; this
# assignment raises if the file does not have exactly two columns.
df.columns = ["image", "caption"]

# Preprocess text captions
def preprocess_text(text):
    """Normalise a raw caption and wrap it in sequence markers.

    The caption is lower-cased, every character that is neither alphanumeric
    nor whitespace is dropped, and runs of whitespace are collapsed to a
    single space so that removed punctuation does not leave doubled or
    trailing spaces behind.

    Args:
        text: Raw caption string.

    Returns:
        The cleaned caption as ``"startseq <words> endseq"`` with exactly one
        space between tokens.
    """
    kept = "".join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())
    # split() with no argument discards leading/trailing whitespace and
    # collapses interior runs, e.g. the gap left by a removed " ." suffix.
    return " ".join(["startseq", *kept.split(), "endseq"])

# Clean every caption element-wise with preprocess_text.
df["caption"] = df["caption"].map(preprocess_text)

# Load Pretrained ViT-GPT2 Model
# The image preprocessor and the tokenizer are loaded from the same checkpoint
# as the model, so the preprocessing settings and vocabulary are the ones the
# checkpoint was published with rather than those of unrelated base models.
MODEL_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CHECKPOINT)
feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL_CHECKPOINT)
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_CHECKPOINT)
# Only fall back to the EOS token when the checkpoint defines no pad token,
# so a pad token shipped with the checkpoint is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Function to generate captions
def generate_caption(image_name):
    """Generate a caption for one image stored in ``image_folder``.

    Args:
        image_name: File name of the image, joined onto ``image_folder``.

    Returns:
        The decoded caption with special tokens skipped and surrounding
        whitespace stripped.

    Errors raised while opening or decoding the image are not caught here and
    propagate to the caller.
    """
    image_path = os.path.join(image_folder, image_name)  # Get full image path
    # Open inside a context manager so the file handle is closed as soon as
    # the RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    # Keep the input on the same device as the model weights.
    pixel_values = pixel_values.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model.generate(pixel_values)
    caption = tokenizer.decode(output[0], skip_special_tokens=True)

    # Decoding can leave a trailing space on the caption; remove it.
    return caption.strip()

# Smoke test: caption a single known image from the image folder.
sample_image_name = "10815824_2997e03d76.jpg"
predicted_caption = generate_caption(image_name=sample_image_name)
print("Generated Caption:", predicted_caption)

# Model Evaluation
def evaluate_model(sample_size=100):
    """Print reference and predicted captions for the first rows of ``df``.

    Each image is captioned at most once per call: rows that share an image
    name reuse the cached prediction instead of running the model again.

    Args:
        sample_size: Maximum number of rows of ``df`` to print. Values larger
            than the dataset are clamped to its length; non-positive values
            print no rows.

    Returns:
        None. Results are written to standard output.
    """
    # Ensure we don't exceed dataset size (and never slice with a negative bound).
    total_samples = max(0, min(sample_size, len(df)))
    subset = df.iloc[:total_samples]

    # NOTE(review): caching assumes model.generate is deterministic for a given
    # image (no sampling enabled) - confirm against the generation config.
    predictions = {}
    for image_name, actual_caption in zip(subset["image"], subset["caption"]):
        if image_name not in predictions:
            predictions[image_name] = generate_caption(image_name)
        predicted_caption = predictions[image_name]
        print(f"Actual: {actual_caption}\nPredicted: {predicted_caption}\n")
    print("Evaluation complete.")

# Run the evaluation with the default sample size stated explicitly.
evaluate_model(sample_size=100)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared de

Generated Caption: a man is holding a hose to a horse 
Actual: startseq a child in a pink dress is climbing up a set of stairs in an entry way  endseq
Predicted: a little girl standing next to a wooden fence 

Actual: startseq a girl going into a wooden building  endseq
Predicted: a little girl standing next to a wooden fence 

Actual: startseq a little girl climbing into a wooden playhouse  endseq
Predicted: a little girl standing next to a wooden fence 

Actual: startseq a little girl climbing the stairs to her playhouse  endseq
Predicted: a little girl standing next to a wooden fence 

Actual: startseq a little girl in a pink dress going into a wooden cabin  endseq
Predicted: a little girl standing next to a wooden fence 

Actual: startseq a black dog and a spotted dog are fighting endseq
Predicted: a dog and a cat walking down a street 

Actual: startseq a black dog and a tricolored dog playing with each other on the road  endseq
Predicted: a dog and a cat walking down a street 

A

KeyboardInterrupt: 

In [14]:
import os
# Check whether a captions file with a .csv extension is present on disk.
captions_file = r"C:\Users\nalla\Desktop\AD\dataset\captions.csv"
file_found = os.path.exists(captions_file)
print("File exists:", file_found)


File exists: False
