## Image Caption Generator

In [2]:

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torchvision.datasets import CocoDetection
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, Subset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import os
from PIL import Image
import numpy as np
import requests
from io import BytesIO
from datasets import load_dataset
from tqdm import tqdm
import cv2
import pycocoevalcap
from ultralytics import YOLO
from pycocotools.coco import COCO
from pycocoevalcap.eval import COCOEvalCap
from collections import Counter
# for metrics, COCO API metrics is used
import json
from pycocoevalcap import bleu, meteor, rouge, spice
#from pycocoevalcap.evalcap import COCOEvalCap

# Load YOLOv5 for feature extraction
import torch.hub

In [3]:
# setting the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Load pre-trained YOLOv5 model from Torch Hub
yolov5 = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).eval().to(device)

Using cache found in /home/pokepe/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-12-17 Python-3.10.15 torch-2.0.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3070, 8192MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [5]:
# Define transformations for image preprocessing
# !!!!! NOTE chatgpt says this is not needed in yolov5
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [6]:
# Vocabulary, will be used later.
vocabulary = []

In [7]:
# "annotations": [{"image_id": 179765,"id": 38,"caption": "A black Honda motorcycle parked in front of a garage."},...}
class CocoDataset(Dataset): # <start> cat sat on the mat <end> -> {0 32 24 34 3 3 1 }
    def __init__(self, root_dir, annotation_file, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        with open(annotation_file, 'r') as f:
            self.coco_data = json.load(f)
        self.tokenizer = get_tokenizer("basic_english")  # Tokenizer from torchtext
        self.annotations = self.coco_data['annotations']
        # Build vocabulary
        self.vocab = self.build_vocab()
        self.image_id_to_file = {img['id']: img['file_name'] for img in self.coco_data['images']}
        self.image_id_to_img = {}


    def build_vocab(self):
        counter = Counter()
        for annotation in tqdm(self.annotations):
            caption = annotation['caption']
            tokens = self.tokenizer(caption.lower())
            counter.update(tokens)
        vocab = build_vocab_from_iterator([counter], specials=["<pad>", "<unk>", "<bos>", "<eos>"])
        vocab.set_default_index(vocab["<unk>"])  # Out-of-vocabulary words will be mapped to <unk>
        global vocabulary
        vocabulary = vocab
        return vocab

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        image_id = self.annotations[idx]['image_id']

        file_name = self.image_id_to_file.get(image_id)
        img_path = os.path.join(self.root_dir, file_name)

        try:
          image = Image.open(img_path).convert('RGB')
          self.image_id_to_img[image_id] = image # NOTE
        except Exception as e:
          print(f"Error opening image: {file_name}")
          raise e

        caption = self.annotations[idx]['caption']
        tokens = self.tokenizer(caption.lower())

        # Convert caption tokens to indices
        caption_indices = [self.vocab['<bos>']] + [self.vocab[token] for token in tokens] + [self.vocab['<eos>']]

        if self.transform:
            image = self.transform(image)

        return image, {'image_id': image_id,'captions': torch.tensor(caption_indices, dtype=torch.long)}


In [8]:
#MAINNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
import torch
import torch.nn as nn
import torchvision.models as models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        self.cnn = models.resnet50(pretrained=True).to(device)
        self.backbone = nn.Sequential(*list(self.cnn.children())[:-1])  # Get everything except the last detection layer
        self.fc = nn.Linear(self.cnn.fc.in_features, embed_size)  # Linear layer to resize

    def forward(self, images):
        with torch.no_grad():  # Freeze CNN parameters
            features = self.backbone(images)
        features = features.view(features.size(0), -1)  # Flatten
        features = self.fc(features)
        return features

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        embeddings = self.embedding(captions)
        inputs = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        lstm_out, _ = self.lstm(inputs)
        output = self.fc(lstm_out)
        return output

class ImageCaptioningModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs

In [11]:
import torch
import torch.nn as nn
import torchvision.models as models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        self.cnn = models.resnet50(pretrained=True).to(device)
        self.backbone = nn.Sequential(*list(self.cnn.children())[:-1])  # Get everything except the last detection layer
        self.fc = nn.Linear(self.cnn.fc.in_features, embed_size)  # Linear layer to resize
        self.dropout = nn.Dropout(0.3)  # Add dropout to the encoder

    def forward(self, images):
        with torch.no_grad():  # Freeze CNN parameters
            features = self.backbone(images)
        features = features.view(features.size(0), -1)  # Flatten
        features = self.fc(features)
        features = self.dropout(features)  # Apply dropout to the feature vector
        return features

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1, dropout_prob=0.3):
        super(DecoderRNN, self).__init__()
        # Embedding layer to convert words to a vector representation
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Bidirectional LSTM with dropout
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=dropout_prob)
        # Linear layer: multiply hidden_size by 2 because of the bidirectional LSTM
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        # Add a dropout layer after the LSTM output before the final linear layer
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, features, captions):
        # Convert captions to embeddings
        embeddings = self.embedding(captions)
        # Concatenate features and embeddings along the sequence dimension
        inputs = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        # Pass through LSTM
        lstm_out, _ = self.lstm(inputs)
        # Apply dropout after LSTM output
        lstm_out = self.dropout(lstm_out)
        # Pass through fully connected layer to generate vocabulary scores
        output = self.fc(lstm_out)
        return output

class ImageCaptioningModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs


In [9]:
from torch.utils.data import Subset
import random 

train_image_number = 20000
val_image_number = 4000

train_images_path = './coco/train2017'
train_annotations_path = './coco/annotations/captions_train2017.json'
val_images_path = './coco/val2017'
val_annotations_path = './coco/annotations/captions_val2017.json'

# Load COCO Dataset (captions and images) using
train_dataset = CocoDataset(train_images_path, train_annotations_path, transform=transform)

# Example: Fetch an image and its caption
image, caption = train_dataset[0]
print(image)  # Prints the caption token indices

val_dataset = CocoDataset(val_images_path, val_annotations_path, transform=transform)

subset_indices = [i for i in range(4000)]

train_subset = Subset(train_dataset, subset_indices)
val_subset = Subset(val_dataset, subset_indices)


100%|██████████| 591753/591753 [00:03<00:00, 185451.68it/s]


tensor([[[2.24891, 2.24891, 2.24891,  ..., 2.24891, 2.24891, 2.24891],
         [2.24891, 2.24891, 2.24891,  ..., 2.24891, 2.24891, 2.24891],
         [2.24891, 2.24891, 2.24891,  ..., 2.24891, 2.24891, 2.24891],
         ...,
         [2.24891, 2.24891, 2.24891,  ..., 2.24891, 2.24891, 2.24891],
         [2.24891, 2.24891, 2.24891,  ..., 2.24891, 2.24891, 2.24891],
         [2.24891, 2.24891, 2.24891,  ..., 2.24891, 2.24891, 2.24891]],

        [[2.42857, 2.42857, 2.42857,  ..., 2.42857, 2.42857, 2.42857],
         [2.42857, 2.42857, 2.42857,  ..., 2.42857, 2.42857, 2.42857],
         [2.42857, 2.42857, 2.42857,  ..., 2.42857, 2.42857, 2.42857],
         ...,
         [2.42857, 2.42857, 2.42857,  ..., 2.42857, 2.42857, 2.42857],
         [2.42857, 2.42857, 2.42857,  ..., 2.42857, 2.42857, 2.42857],
         [2.42857, 2.42857, 2.42857,  ..., 2.42857, 2.42857, 2.42857]],

        [[2.64000, 2.64000, 2.64000,  ..., 2.64000, 2.64000, 2.64000],
         [2.64000, 2.64000, 2.64000,  ..., 2.

100%|██████████| 25014/25014 [00:00<00:00, 187834.58it/s]


In [10]:
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    """Collates a batch of (image, caption) pairs and pads captions."""
    images, targets = zip(*batch)
    image_ids = [target["image_id"] for target in targets]
    captions = [target["captions"] for target in targets]
    # Pad captions to have the same length in the batch
    padded_captions = pad_sequence(captions, batch_first=True, padding_value=0)  # <pad> token is 0

    # Stack images into a batch
    images = torch.stack(images, 0) # dim: The dimension along which to stack. If dim=0, it adds a new first dimension to the tensors.
   # NO NEED FOR padded_captions = torch.stack(padded_captions, 0) , padded_captions are already stacked

    return images, image_ids, padded_captions

In [11]:
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    for images, image_ids, captions in dataloader:
        #captions = targets['captions']
        images = images.to(device)
        captions = captions.to(device)
        #print(len(captions))
        #print(len(images))
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])  # Exclude last token in captions

        batch_size, seq_len, vocab_size = outputs.size()
        outputs = outputs.view(-1, vocab_size)  # Flattens outputs
        captions = captions.view(-1)
        loss = criterion(outputs, captions)
        loss.backward()
        optimizer.step()
        # Print the current loss for monitoring the training progress
        print(f"Loss: {loss.item()}")

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for images, image_ids, captions in dataloader:
            #captions = targets['captions']
            images = images.to(device)
            captions = captions.to(device)

            #TODO DIMENSIONALITY PROBLEM
            outputs = model(images, captions[:, :-1])  # Exclude last token in captions
            batch_size, seq_len, vocab_size = outputs.size()
            outputs = outputs.view(-1, vocab_size)  # Flattens outputs
            captions = captions.view(-1)
            loss = criterion(outputs, captions)  # Teacher forcing
            total_loss += loss.item()

    return total_loss / len(dataloader)


In [12]:
# create a directory for checkpoints -> this can be used for storing state of the model in the midst of the training.
checkpoint_dir = 'checkpoints'
os.makedirs(checkpoint_dir, exist_ok= True)

def save_checkpoint(model, optimizer, epoch, loss, checkpoint_path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")


In [13]:
# Hyperparameters
embed_size = 256
hidden_size = 512
num_epochs = 20
learning_rate = 0.001
batch_size = 32
num_layers = 1

# Data Loaders
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Vocab
train_dataset.vocab = train_dataset.build_vocab()
# Instantiate Encoder, Decoder, and Model
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, len(train_dataset.vocab), num_layers=num_layers).to(device) # TODO
model = ImageCaptioningModel(encoder, decoder).to(device)

# Optimizer and Loss Function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()



100%|██████████| 591753/591753 [00:03<00:00, 180057.09it/s]


In [14]:
# Training Loop
for epoch in tqdm(range(num_epochs)):
    train(model, train_loader, optimizer, criterion, device)

    print(f"Training for epoch {epoch+1}/{num_epochs} completed.")

    # Commented out - quicker
    #val_loss = evaluate(model, val_loader, criterion, device)
    #print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss}")

# Save the trained model
torch.save(model.state_dict(), 'image_captioning_model.pth')

  0%|          | 0/20 [00:00<?, ?it/s]

Loss: 10.303236961364746
Loss: 9.765588760375977
Loss: 9.63361644744873
Loss: 8.469826698303223
Loss: 8.934879302978516
Loss: 8.609504699707031
Loss: 7.357395648956299
Loss: 3.458204507827759
Loss: 5.4030303955078125
Loss: 5.59335470199585
Loss: 3.741607666015625
Loss: 4.528093338012695
Loss: 3.055696964263916
Loss: 3.767695903778076
Loss: 4.262363910675049
Loss: 3.697359085083008
Loss: 3.659416913986206
Loss: 3.507753610610962
Loss: 3.8023314476013184
Loss: 2.6519644260406494
Loss: 2.4620091915130615
Loss: 3.209965944290161
Loss: 2.9807045459747314
Loss: 2.321748733520508
Loss: 3.3059980869293213
Loss: 3.7758474349975586
Loss: 3.577812910079956
Loss: 3.366856098175049
Loss: 2.9022982120513916
Loss: 3.3959171772003174
Loss: 2.109205722808838
Loss: 3.089293956756592
Loss: 3.2200818061828613
Loss: 2.431415319442749
Loss: 3.783979892730713
Loss: 3.1795654296875
Loss: 3.107797145843506
Loss: 3.337613344192505
Loss: 3.935781955718994
Loss: 3.344294548034668
Loss: 2.5701756477355957
Loss: 2.

  5%|▌         | 1/20 [00:57<18:17, 57.74s/it]

Loss: 2.903203248977661
Training for epoch 1/20 completed.
Loss: 2.283644199371338
Loss: 2.926236391067505
Loss: 1.938520908355713
Loss: 2.3469133377075195
Loss: 2.7200067043304443
Loss: 2.664088249206543
Loss: 2.7759318351745605
Loss: 2.4772658348083496
Loss: 2.664966583251953
Loss: 2.2136499881744385
Loss: 1.8992018699645996
Loss: 2.3496780395507812
Loss: 2.274320363998413
Loss: 2.446026086807251
Loss: 2.324126958847046
Loss: 2.1978394985198975
Loss: 2.5261402130126953
Loss: 2.349794626235962
Loss: 2.6232335567474365
Loss: 2.4349141120910645
Loss: 1.8419959545135498
Loss: 2.1446473598480225
Loss: 2.6375203132629395
Loss: 2.434880495071411
Loss: 2.520979166030884
Loss: 2.477773666381836
Loss: 2.5511157512664795
Loss: 1.643232822418213
Loss: 2.7238476276397705
Loss: 2.4706547260284424
Loss: 2.8832101821899414
Loss: 2.2659056186676025
Loss: 2.6295204162597656
Loss: 2.3754796981811523
Loss: 2.525576591491699
Loss: 2.821744441986084
Loss: 2.236917734146118
Loss: 2.3979809284210205
Loss: 2

 10%|█         | 2/20 [01:54<17:08, 57.12s/it]

Loss: 2.6155879497528076
Training for epoch 2/20 completed.
Loss: 2.135232448577881
Loss: 1.4719494581222534
Loss: 1.4447453022003174
Loss: 2.145150899887085
Loss: 2.4998040199279785
Loss: 2.1977577209472656
Loss: 2.3376100063323975
Loss: 2.3836827278137207
Loss: 2.1681954860687256
Loss: 2.279686450958252
Loss: 2.5275020599365234
Loss: 2.42924427986145
Loss: 2.029277801513672
Loss: 2.0721263885498047
Loss: 2.041142225265503
Loss: 1.9536534547805786
Loss: 1.950884222984314
Loss: 1.8921024799346924
Loss: 1.6811708211898804
Loss: 2.514660120010376
Loss: 2.316126823425293
Loss: 1.520200490951538
Loss: 2.097724676132202
Loss: 2.088435649871826
Loss: 2.0740814208984375
Loss: 1.6526646614074707
Loss: 2.169065475463867
Loss: 1.910798192024231
Loss: 2.180478572845459
Loss: 2.18224835395813
Loss: 1.406362533569336
Loss: 1.8615151643753052
Loss: 1.5467772483825684
Loss: 1.9669660329818726
Loss: 2.7429232597351074
Loss: 1.7559759616851807
Loss: 2.1327197551727295
Loss: 2.101102590560913
Loss: 2.42

 15%|█▌        | 3/20 [02:51<16:13, 57.29s/it]

Loss: 2.2554898262023926
Training for epoch 3/20 completed.
Loss: 1.7696681022644043
Loss: 2.0841763019561768
Loss: 1.7276802062988281
Loss: 1.0350818634033203
Loss: 2.071413993835449
Loss: 1.5036183595657349
Loss: 2.038670778274536
Loss: 2.3130295276641846
Loss: 1.8785425424575806
Loss: 1.9975738525390625
Loss: 1.5842801332473755
Loss: 1.4596294164657593
Loss: 2.166879653930664
Loss: 2.017914056777954
Loss: 1.730704426765442
Loss: 2.115539073944092
Loss: 1.7619354724884033
Loss: 1.8080488443374634
Loss: 1.8648821115493774
Loss: 2.4638123512268066
Loss: 1.4483741521835327
Loss: 1.9282692670822144
Loss: 1.3949732780456543
Loss: 1.7495168447494507
Loss: 1.500714659690857
Loss: 1.880820393562317
Loss: 1.6420012712478638
Loss: 1.3934117555618286
Loss: 2.0787439346313477
Loss: 2.0282955169677734
Loss: 2.0675411224365234
Loss: 1.5672264099121094
Loss: 1.6363236904144287
Loss: 1.3549281358718872
Loss: 1.6020143032073975
Loss: 1.986546516418457
Loss: 2.271732807159424
Loss: 1.7830485105514526


 20%|██        | 4/20 [03:49<15:17, 57.32s/it]

Loss: 2.4539144039154053
Training for epoch 4/20 completed.
Loss: 1.9471547603607178
Loss: 1.8197799921035767
Loss: 1.857600450515747
Loss: 1.7659292221069336
Loss: 1.6599135398864746
Loss: 1.4964687824249268
Loss: 1.5009453296661377
Loss: 1.6548056602478027
Loss: 1.7211157083511353
Loss: 1.6107888221740723
Loss: 1.8777656555175781
Loss: 1.7438950538635254
Loss: 1.5801095962524414
Loss: 1.2517006397247314
Loss: 1.6474435329437256
Loss: 1.1039535999298096
Loss: 1.5770840644836426
Loss: 1.9414658546447754
Loss: 2.039097785949707
Loss: 1.7698482275009155
Loss: 1.773969054222107
Loss: 1.9833285808563232
Loss: 1.3790072202682495
Loss: 1.8081508874893188
Loss: 2.1207756996154785
Loss: 2.0475635528564453
Loss: 1.6014577150344849
Loss: 1.6102700233459473
Loss: 2.0546483993530273
Loss: 1.3837813138961792
Loss: 1.7449028491973877
Loss: 1.9524307250976562
Loss: 1.0001447200775146
Loss: 1.6193443536758423
Loss: 1.8289375305175781
Loss: 1.961186408996582
Loss: 1.7525615692138672
Loss: 1.52983522415

 25%|██▌       | 5/20 [04:46<14:21, 57.42s/it]

Loss: 1.437455415725708
Training for epoch 5/20 completed.
Loss: 1.6122967004776
Loss: 1.3551443815231323
Loss: 1.7806012630462646
Loss: 1.5500743389129639
Loss: 1.3419013023376465
Loss: 1.902029275894165
Loss: 1.3262566328048706
Loss: 1.551301121711731
Loss: 1.3603719472885132
Loss: 1.767556071281433
Loss: 1.7359060049057007
Loss: 1.8900842666625977
Loss: 1.697970986366272
Loss: 1.535747766494751
Loss: 1.4168975353240967
Loss: 1.4873687028884888
Loss: 1.2438056468963623
Loss: 1.48582124710083
Loss: 1.7005817890167236
Loss: 1.5185723304748535
Loss: 1.4996329545974731
Loss: 1.647178053855896
Loss: 1.3181856870651245
Loss: 1.6992663145065308
Loss: 1.6895904541015625
Loss: 1.0334407091140747
Loss: 1.64697265625
Loss: 1.4826563596725464
Loss: 1.858628273010254
Loss: 1.5483884811401367
Loss: 1.6166282892227173
Loss: 1.722256064414978
Loss: 1.306017518043518
Loss: 1.786978840827942
Loss: 1.7673736810684204
Loss: 1.7118313312530518
Loss: 1.9943206310272217
Loss: 1.6068626642227173
Loss: 1.630

 30%|███       | 6/20 [05:44<13:23, 57.39s/it]

Loss: 1.6624833345413208
Training for epoch 6/20 completed.
Loss: 1.3264330625534058
Loss: 1.4279636144638062
Loss: 1.610304355621338
Loss: 1.2838664054870605
Loss: 1.5177114009857178
Loss: 1.2408463954925537
Loss: 1.4480712413787842
Loss: 1.7518548965454102
Loss: 1.6398365497589111
Loss: 1.3939167261123657
Loss: 1.6035652160644531
Loss: 1.2232953310012817
Loss: 1.0007364749908447
Loss: 1.463964581489563
Loss: 1.4084689617156982
Loss: 1.2034322023391724
Loss: 1.2888566255569458
Loss: 1.4936527013778687
Loss: 1.100463628768921
Loss: 1.1568061113357544
Loss: 1.5811781883239746
Loss: 1.5344425439834595
Loss: 1.126524567604065
Loss: 1.5811363458633423
Loss: 1.3262921571731567
Loss: 1.2304400205612183
Loss: 0.9674520492553711
Loss: 1.2364797592163086
Loss: 1.368921160697937
Loss: 1.6670787334442139
Loss: 1.6584367752075195
Loss: 1.523141622543335
Loss: 0.9713083505630493
Loss: 1.0072708129882812
Loss: 1.1731629371643066
Loss: 1.6153396368026733
Loss: 1.5087814331054688
Loss: 1.5717412233352

 35%|███▌      | 7/20 [06:43<12:33, 57.93s/it]

Loss: 1.4920684099197388
Training for epoch 7/20 completed.
Loss: 0.8348223567008972
Loss: 1.335086703300476
Loss: 1.240159511566162
Loss: 1.3457976579666138
Loss: 1.3619840145111084
Loss: 1.074265718460083
Loss: 1.1825796365737915
Loss: 1.3839818239212036
Loss: 1.4781519174575806
Loss: 1.3032886981964111
Loss: 1.3959436416625977
Loss: 1.2174861431121826
Loss: 0.9714825749397278
Loss: 1.3115917444229126
Loss: 1.464302659034729
Loss: 1.3618404865264893
Loss: 0.9795985817909241
Loss: 1.2726407051086426
Loss: 1.243410587310791
Loss: 1.3986691236495972
Loss: 1.4551721811294556
Loss: 1.1959834098815918
Loss: 1.2643851041793823
Loss: 1.3267383575439453
Loss: 1.459643006324768
Loss: 1.2884000539779663
Loss: 1.2041434049606323
Loss: 1.34604811668396
Loss: 0.9686623215675354
Loss: 1.399109125137329
Loss: 1.185205101966858
Loss: 1.1417046785354614
Loss: 1.28189218044281
Loss: 1.1929852962493896
Loss: 1.1195076704025269
Loss: 1.2141902446746826
Loss: 1.5317553281784058
Loss: 1.0702526569366455
Lo

 40%|████      | 8/20 [07:40<11:33, 57.80s/it]

Loss: 1.3970311880111694
Training for epoch 8/20 completed.
Loss: 0.9038370251655579
Loss: 1.0662157535552979
Loss: 1.2349399328231812
Loss: 1.2796319723129272
Loss: 0.9843257665634155
Loss: 1.116329550743103
Loss: 1.458113193511963
Loss: 1.35286283493042
Loss: 1.2838107347488403
Loss: 1.1197211742401123
Loss: 1.0994386672973633
Loss: 1.3183056116104126
Loss: 0.9087726473808289
Loss: 0.9840123653411865
Loss: 1.3307558298110962
Loss: 0.7631388902664185
Loss: 1.0958775281906128
Loss: 1.392417073249817
Loss: 1.4192967414855957
Loss: 1.2631064653396606
Loss: 1.2753623723983765
Loss: 1.1114391088485718
Loss: 1.2083463668823242
Loss: 0.9434276819229126
Loss: 1.1279581785202026
Loss: 0.9445687532424927
Loss: 1.2080062627792358
Loss: 1.1316207647323608
Loss: 1.1118632555007935
Loss: 1.2372629642486572
Loss: 1.1299630403518677
Loss: 1.159907579421997
Loss: 1.131993055343628
Loss: 1.3990144729614258
Loss: 1.2150505781173706
Loss: 1.480776071548462
Loss: 1.3357268571853638
Loss: 1.260773181915283

 45%|████▌     | 9/20 [08:37<10:32, 57.53s/it]

Loss: 1.5144495964050293
Training for epoch 9/20 completed.
Loss: 1.2245491743087769
Loss: 1.2552987337112427
Loss: 1.198824167251587
Loss: 0.8747392892837524
Loss: 1.03768789768219
Loss: 0.5882848501205444
Loss: 1.1621941328048706
Loss: 1.1988184452056885
Loss: 1.0728538036346436
Loss: 1.1622657775878906
Loss: 0.9663501977920532
Loss: 1.0447447299957275
Loss: 0.8703034520149231
Loss: 1.145530104637146
Loss: 0.6845554709434509
Loss: 1.2733697891235352
Loss: 0.9908967018127441
Loss: 0.8269729614257812
Loss: 0.9640638828277588
Loss: 1.0510482788085938
Loss: 0.9717980027198792
Loss: 0.9689153432846069
Loss: 1.0026711225509644
Loss: 0.9483258724212646
Loss: 1.0367181301116943
Loss: 1.2308608293533325
Loss: 0.8665789365768433
Loss: 0.8863940834999084
Loss: 1.0916087627410889
Loss: 1.0487347841262817
Loss: 1.0094618797302246
Loss: 0.9486490488052368
Loss: 1.2387429475784302
Loss: 1.2478731870651245
Loss: 1.277565836906433
Loss: 1.2094942331314087
Loss: 1.157456636428833
Loss: 1.0731070041656

 50%|█████     | 10/20 [09:35<09:34, 57.47s/it]

Loss: 1.1137278079986572
Training for epoch 10/20 completed.
Loss: 0.8735116720199585
Loss: 0.969809353351593
Loss: 0.766646683216095
Loss: 1.0660628080368042
Loss: 0.9033779501914978
Loss: 1.1410820484161377
Loss: 0.7732274532318115
Loss: 0.9580475687980652
Loss: 0.8015791177749634
Loss: 0.9698168039321899
Loss: 0.8904579877853394
Loss: 0.9262308478355408
Loss: 0.7234194874763489
Loss: 0.9022526741027832
Loss: 0.9537334442138672
Loss: 0.7884479761123657
Loss: 0.9100027680397034
Loss: 1.0469768047332764
Loss: 0.6260789632797241
Loss: 1.094545602798462
Loss: 1.0231691598892212
Loss: 0.9903561472892761
Loss: 0.9914488792419434
Loss: 0.9678760170936584
Loss: 0.8026531934738159
Loss: 1.0027824640274048
Loss: 1.0967947244644165
Loss: 1.0951868295669556
Loss: 1.0293279886245728
Loss: 0.9095494151115417
Loss: 0.9618173241615295
Loss: 0.835662841796875
Loss: 1.02528715133667
Loss: 0.939609169960022
Loss: 0.8960994482040405
Loss: 1.2548985481262207
Loss: 1.0363342761993408
Loss: 0.9237062931060

 55%|█████▌    | 11/20 [10:31<08:35, 57.26s/it]

Loss: 0.9178515076637268
Training for epoch 11/20 completed.
Loss: 0.8312145471572876
Loss: 0.980207622051239
Loss: 0.8121345639228821
Loss: 0.786509096622467
Loss: 0.9155299663543701
Loss: 1.0179243087768555
Loss: 0.7286046743392944
Loss: 0.6594154834747314
Loss: 0.9426072239875793
Loss: 0.9201928377151489
Loss: 0.7364159226417542
Loss: 0.8295145630836487
Loss: 0.8715540170669556
Loss: 0.8062383532524109
Loss: 0.7366161942481995
Loss: 0.9110913872718811
Loss: 0.8512378931045532
Loss: 0.7534313201904297
Loss: 0.7988302707672119
Loss: 0.9630758762359619
Loss: 0.9426370859146118
Loss: 0.883995532989502
Loss: 0.9587497711181641
Loss: 0.6680932641029358
Loss: 0.6626421213150024
Loss: 0.8082929253578186
Loss: 0.9081289172172546
Loss: 0.9040498733520508
Loss: 0.8911315202713013
Loss: 0.6590617895126343
Loss: 0.7358447313308716
Loss: 0.8455418944358826
Loss: 0.7819992899894714
Loss: 1.0076119899749756
Loss: 0.8803702592849731
Loss: 1.050864577293396
Loss: 0.8175356388092041
Loss: 0.8202269673

 60%|██████    | 12/20 [11:28<07:36, 57.05s/it]

Loss: 1.0065834522247314
Training for epoch 12/20 completed.
Loss: 0.5088280439376831
Loss: 0.611611545085907
Loss: 0.9065206050872803
Loss: 0.7046988606452942
Loss: 0.733473539352417
Loss: 0.9169486165046692
Loss: 0.8093717694282532
Loss: 0.8161312341690063
Loss: 0.6357094049453735
Loss: 0.965101957321167
Loss: 0.8318338394165039
Loss: 0.8707458972930908
Loss: 0.856526792049408
Loss: 0.5536171793937683
Loss: 0.8559227585792542
Loss: 0.7247717380523682
Loss: 0.8093843460083008
Loss: 0.6051395535469055
Loss: 0.7692160606384277
Loss: 0.6118781566619873
Loss: 0.63841313123703
Loss: 0.7397124767303467
Loss: 0.9665504693984985
Loss: 0.890221893787384
Loss: 0.8451602458953857
Loss: 0.7002806067466736
Loss: 0.6865993142127991
Loss: 0.7694782018661499
Loss: 0.7367793917655945
Loss: 0.7379729747772217
Loss: 0.893670380115509
Loss: 0.7251977920532227
Loss: 0.8438502550125122
Loss: 0.7130379676818848
Loss: 0.7018370628356934
Loss: 0.8150073289871216
Loss: 0.5966081023216248
Loss: 0.93538981676101

 65%|██████▌   | 13/20 [12:25<06:39, 57.04s/it]

Loss: 0.9462502598762512
Training for epoch 13/20 completed.
Loss: 0.706486701965332
Loss: 0.646699845790863
Loss: 0.7967525124549866
Loss: 0.4504604935646057
Loss: 0.4728926420211792
Loss: 0.6845601797103882
Loss: 0.6817191243171692
Loss: 0.9026537537574768
Loss: 0.7999178171157837
Loss: 0.6506499648094177
Loss: 0.7307261228561401
Loss: 0.819068193435669
Loss: 0.594143807888031
Loss: 0.528573215007782
Loss: 0.9359986186027527
Loss: 0.6913900375366211
Loss: 0.542145311832428
Loss: 0.4886474311351776
Loss: 0.7650473117828369
Loss: 0.8277038931846619
Loss: 0.8013221621513367
Loss: 0.705682098865509
Loss: 0.6742092967033386
Loss: 0.772634744644165
Loss: 0.7562857270240784
Loss: 0.691334068775177
Loss: 0.7007625699043274
Loss: 0.8544331192970276
Loss: 0.6727510690689087
Loss: 0.736060380935669
Loss: 0.6207637190818787
Loss: 0.8753424882888794
Loss: 0.6282373666763306
Loss: 0.9247085452079773
Loss: 0.8549631237983704
Loss: 0.8086532354354858
Loss: 0.6425784230232239
Loss: 0.6352203488349915

 70%|███████   | 14/20 [13:22<05:43, 57.20s/it]

Loss: 0.5702903270721436
Training for epoch 14/20 completed.
Loss: 0.594262957572937
Loss: 0.4040023386478424
Loss: 0.6184923648834229
Loss: 0.8171519041061401
Loss: 0.7337793707847595
Loss: 0.7189821004867554
Loss: 0.6124460101127625
Loss: 0.6874768733978271
Loss: 0.6483497619628906
Loss: 0.6769912838935852
Loss: 0.5473714470863342
Loss: 0.6190892457962036
Loss: 0.8183560967445374
Loss: 0.662368655204773
Loss: 0.4823402762413025
Loss: 0.5098415613174438
Loss: 0.7209383249282837
Loss: 0.4367368519306183
Loss: 0.5037466883659363
Loss: 0.57541424036026
Loss: 0.5377852320671082
Loss: 0.5386519432067871
Loss: 0.5852541327476501
Loss: 0.6513530015945435
Loss: 0.6986162066459656
Loss: 0.3931148052215576
Loss: 0.5374906659126282
Loss: 0.49530354142189026
Loss: 0.6746878027915955
Loss: 0.47646480798721313
Loss: 0.6613060235977173
Loss: 0.6496604084968567
Loss: 0.6649014353752136
Loss: 0.6805303692817688
Loss: 0.7554482817649841
Loss: 0.5402862429618835
Loss: 0.6348165273666382
Loss: 0.61243730

 75%|███████▌  | 15/20 [14:19<04:44, 56.99s/it]

Loss: 0.6221263408660889
Training for epoch 15/20 completed.
Loss: 0.5557713508605957
Loss: 0.5769983530044556
Loss: 0.5272520184516907
Loss: 0.6533177495002747
Loss: 0.7448930144309998
Loss: 0.7013372778892517
Loss: 0.5944256782531738
Loss: 0.5302544832229614
Loss: 0.5755730271339417
Loss: 0.5118703246116638
Loss: 0.5469719171524048
Loss: 0.5117856860160828
Loss: 0.6123226284980774
Loss: 0.5674293041229248
Loss: 0.7028522491455078
Loss: 0.514717161655426
Loss: 0.4598684012889862
Loss: 0.6634965538978577
Loss: 0.44643205404281616
Loss: 0.6977081894874573
Loss: 0.6162223219871521
Loss: 0.620979905128479
Loss: 0.64249187707901
Loss: 0.49491333961486816
Loss: 0.5962654948234558
Loss: 0.7043567895889282
Loss: 0.6753487586975098
Loss: 0.6524402499198914
Loss: 0.44908761978149414
Loss: 0.5729437470436096
Loss: 0.6004294753074646
Loss: 0.636594295501709
Loss: 0.5413053631782532
Loss: 0.464236855506897
Loss: 0.6071351766586304
Loss: 0.5497872233390808
Loss: 0.5573252439498901
Loss: 0.466741651

 80%|████████  | 16/20 [15:16<03:47, 56.85s/it]

Loss: 0.44602566957473755
Training for epoch 16/20 completed.
Loss: 0.6308956146240234
Loss: 0.631583571434021
Loss: 0.5479777455329895
Loss: 0.5257661938667297
Loss: 0.5560868382453918
Loss: 0.5615657567977905
Loss: 0.46143361926078796
Loss: 0.4682648181915283
Loss: 0.587077260017395
Loss: 0.3758333921432495
Loss: 0.5504461526870728
Loss: 0.5755209922790527
Loss: 0.3013230562210083
Loss: 0.49715760350227356
Loss: 0.41956186294555664
Loss: 0.5269845724105835
Loss: 0.6260808110237122
Loss: 0.5577893257141113
Loss: 0.6854280233383179
Loss: 0.4375481903553009
Loss: 0.5338658094406128
Loss: 0.635415256023407
Loss: 0.5003035664558411
Loss: 0.33763667941093445
Loss: 0.4663233160972595
Loss: 0.5508321523666382
Loss: 0.5950608253479004
Loss: 0.5303947329521179
Loss: 0.5516262650489807
Loss: 0.36119017004966736
Loss: 0.6324134469032288
Loss: 0.5597353577613831
Loss: 0.5755230188369751
Loss: 0.46198445558547974
Loss: 0.5815666317939758
Loss: 0.503441333770752
Loss: 0.7526502013206482
Loss: 0.478

 85%|████████▌ | 17/20 [16:13<02:51, 57.05s/it]

Loss: 0.5396349430084229
Training for epoch 17/20 completed.
Loss: 0.6161797642707825
Loss: 0.4035584628582001
Loss: 0.5075555443763733
Loss: 0.4963674545288086
Loss: 0.535139262676239
Loss: 0.37108930945396423
Loss: 0.4082365334033966
Loss: 0.5585090517997742
Loss: 0.5114308595657349
Loss: 0.4842778742313385
Loss: 0.5863111019134521
Loss: 0.5568817257881165
Loss: 0.34671908617019653
Loss: 0.4977073073387146
Loss: 0.5894880890846252
Loss: 0.5642631649971008
Loss: 0.4849368929862976
Loss: 0.38924071192741394
Loss: 0.46960899233818054
Loss: 0.3146933615207672
Loss: 0.4168439209461212
Loss: 0.5116803646087646
Loss: 0.5541884303092957
Loss: 0.5836654901504517
Loss: 0.5645782947540283
Loss: 0.482900470495224
Loss: 0.599270761013031
Loss: 0.5166884064674377
Loss: 0.5089482069015503
Loss: 0.591561496257782
Loss: 0.5199607014656067
Loss: 0.44378146529197693
Loss: 0.6200149059295654
Loss: 0.4431939125061035
Loss: 0.6219499707221985
Loss: 0.40320703387260437
Loss: 0.3737047016620636
Loss: 0.4491

 90%|█████████ | 18/20 [17:13<01:55, 57.79s/it]

Loss: 0.4487525224685669
Training for epoch 18/20 completed.
Loss: 0.4672106206417084
Loss: 0.5506153702735901
Loss: 0.3618931770324707
Loss: 0.28554290533065796
Loss: 0.5415396094322205
Loss: 0.45187491178512573
Loss: 0.5154229998588562
Loss: 0.41303393244743347
Loss: 0.5730336904525757
Loss: 0.4838342070579529
Loss: 0.49102285504341125
Loss: 0.47674936056137085
Loss: 0.5744675397872925
Loss: 0.4146616458892822
Loss: 0.5081989169120789
Loss: 0.4972776174545288
Loss: 0.5610988140106201
Loss: 0.528190016746521
Loss: 0.5436177849769592
Loss: 0.40899458527565
Loss: 0.47426751255989075
Loss: 0.43283215165138245
Loss: 0.35027503967285156
Loss: 0.519888699054718
Loss: 0.5689070820808411
Loss: 0.40842950344085693
Loss: 0.43705084919929504
Loss: 0.4790489971637726
Loss: 0.482715904712677
Loss: 0.4859209954738617
Loss: 0.4964331090450287
Loss: 0.5019394755363464
Loss: 0.32184457778930664
Loss: 0.4433518350124359
Loss: 0.4760717451572418
Loss: 0.5729319453239441
Loss: 0.558347761631012
Loss: 0.5

 95%|█████████▌| 19/20 [18:09<00:57, 57.41s/it]

Loss: 0.6126670241355896
Training for epoch 19/20 completed.
Loss: 0.5819835066795349
Loss: 0.3393446207046509
Loss: 0.4014557898044586
Loss: 0.48826828598976135
Loss: 0.3481905162334442
Loss: 0.5199215412139893
Loss: 0.3674403429031372
Loss: 0.4801744520664215
Loss: 0.4589276909828186
Loss: 0.40401801466941833
Loss: 0.4250420033931732
Loss: 0.4739600718021393
Loss: 0.42321306467056274
Loss: 0.4779113531112671
Loss: 0.4708516597747803
Loss: 0.4043266177177429
Loss: 0.45678529143333435
Loss: 0.3687330484390259
Loss: 0.487093061208725
Loss: 0.38777753710746765
Loss: 0.45918211340904236
Loss: 0.4636353552341461
Loss: 0.5059917569160461
Loss: 0.4799322485923767
Loss: 0.4955994486808777
Loss: 0.4503340721130371
Loss: 0.4498043656349182
Loss: 0.41796061396598816
Loss: 0.4137883484363556
Loss: 0.37119805812835693
Loss: 0.38375282287597656
Loss: 0.5257906913757324
Loss: 0.2793300747871399
Loss: 0.3580341041088104
Loss: 0.3538447916507721
Loss: 0.4970168471336365
Loss: 0.4973238408565521
Loss: 

100%|██████████| 20/20 [19:07<00:00, 57.35s/it]

Loss: 0.5843541622161865
Training for epoch 20/20 completed.





In [15]:
def generate_caption(image, model, vocab, device):

    model.eval()
    print(image.shape)
    image = image.unsqueeze(0).to(device) # unsqueezing for encapsulating it inside a batch
    print(image.shape)
    features = model.encoder(image)

    # Generate caption
    caption = ['<bos>']
    for _ in range(50):  # Maximum caption length
        input_caption = torch.tensor([vocab[token] for token in caption]).unsqueeze(0).to(device)
        outputs = model.decoder(features, input_caption)
        _, predicted = outputs.max(2)  # outputs.shape = (batch,seq_len,vocab_size), vocab_size needed because we need to calculate each word's probabilty and pick the best of them
        predicted_word = vocab.get_itos()[predicted[0, -1].item()] # get the first batch (only one), last predicted word in lstm(next word) convert it to string instead of an id
        caption.append(predicted_word)
        if predicted_word == '<eos>':
            break
    print(caption)
    return ' '.join(caption[1:-1])  # Removing <bos> and <eos>

# Load trained model and generate a caption
model.load_state_dict(torch.load('image_captioning_model.pth'))
#image = Image.open("path_to_image.jpg")  # Replace with your image path
#caption = generate_caption(image, model, train_dataset.vocab, device)
#print(f"Generated Caption: {caption}")


<All keys matched successfully>

In [16]:
def generate_captions_for_coco(val_loader,model,vocab, max_length= 50, num_samples=5):
    all_captions = {}
    model.eval()
    with torch.no_grad():
        for i, (images, image_ids,captions) in enumerate(val_loader):
            #image_ids = targets['image_id']
            images = images.to(device)
            for image_id, image in zip(image_ids,images):
              generated_caption = generate_caption(image, model, vocab, device)
              print(f"type(generated_caption) = {type(generated_caption)} , {type(image_id)}")
              all_captions[image_id] = generated_caption
              print(f"Generated Caption for Image {image_id}: {generated_caption}")
            if i >= num_samples:
                break
    print(all_captions)
    return all_captions



In [17]:

def evaluate_captioning_model(generated_captions, coco_annotation_file=val_annotations_path,coco_image_dir=val_images_path):
    """
    Evaluate the image captioning model using COCO evaluation metrics: BLEU, METEOR, ROUGE, CIDEr.

    Parameters:
        generated_captions (dict): Dictionary of generated captions with image_ids as keys.
        coco_annotation_file (str): Path to COCO annotations file.

    Returns:
        dict: Dictionary containing BLEU, METEOR, ROUGE, CIDEr scores.
    """
    # Load the COCO dataset annotations (reference captions)
    coco = COCO(coco_annotation_file)

    # Create a dictionary for the generated captions (image_id -> caption)
    coco_results = [{'image_id': image_id, 'caption': caption} for image_id, caption in generated_captions.items()]


    # Save the generated captions in a temporary file
    with open('generated_captions.json', 'w') as f:
        json.dump(coco_results, f)

    # Load the results into COCO's evaluation API
    coco_results = coco.loadRes('generated_captions.json')
    print(coco_results)
    print("Generated Captions Image IDs:", generated_captions.keys())
    print("Ground Truth Image IDs:", coco.getImgIds())


    # since we filtered the images to contain only first 1000 images, lets filter the metric
    all_image_ids = coco.getImgIds()
    filtered_image_ids = [image_id for image_id in generated_captions.keys()]

    # we need to revise the filtered version of the actual annotations
    gts = {}
    for image_id in filtered_image_ids:
        caption_ids = coco.getAnnIds(imgIds=image_id)
        annotations = coco.loadAnns(caption_ids)
        gts[image_id] = [annotation['caption'] for annotation in annotations]


    # Set up the evaluation
    print("gts.keys(): ",gts.keys())
    print("coco_results.keys(): ", generated_captions.keys())
    #assert(gts.keys() == coco_results.keys())

    coco_eval = COCOEvalCap(coco, coco_results)
    coco_eval.params['image_id'] = filtered_image_ids
    coco_eval.evaluate()

    # Extract and return the metrics (BLEU, METEOR, ROUGE, CIDEr)
    metrics = coco_eval.eval
    return metrics

generated_captions = generate_captions_for_coco(val_loader,model,vocabulary)
# Evaluate the generated captions
metrics = evaluate_captioning_model(generated_captions)


print("Evaluation Metrics:", metrics)

torch.Size([3, 224, 224])
torch.Size([1, 3, 224, 224])
['<bos>', 'a', 'motorcycle', 'parked', 'in', 'a', 'parking', 'lot', 'with', 'a', 'helmet', 'on', 'the', 'seat', '.', '<eos>']
type(generated_caption) = <class 'str'> , <class 'int'>
Generated Caption for Image 179765: a motorcycle parked in a parking lot with a helmet on the seat .
torch.Size([3, 224, 224])
torch.Size([1, 3, 224, 224])
['<bos>', 'a', 'motorcycle', 'parked', 'in', 'a', 'parking', 'lot', 'with', 'a', 'helmet', 'on', 'the', 'seat', '.', '<eos>']
type(generated_caption) = <class 'str'> , <class 'int'>
Generated Caption for Image 179765: a motorcycle parked in a parking lot with a helmet on the seat .
torch.Size([3, 224, 224])
torch.Size([1, 3, 224, 224])
['<bos>', 'a', 'large', 'black', 'cat', 'sits', 'inside', 'of', 'an', 'empty', 'bathtub', '.', '<eos>']
type(generated_caption) = <class 'str'> , <class 'int'>
Generated Caption for Image 190236: a large black cat sits inside of an empty bathtub .
torch.Size([3, 224, 2

PTBTokenizer tokenized 2696 tokens at 39480.90 tokens per second.
PTBTokenizer tokenized 576 tokens at 10368.56 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 498, 'reflen': 469, 'guess': [498, 453, 408, 363], 'correct': [227, 71, 14, 6]}
ratio: 1.0618336886970963
Bleu_1: 0.456
Bleu_2: 0.267
Bleu_3: 0.135
Bleu_4: 0.080
computing METEOR score...
METEOR: 0.131
computing Rouge score...
ROUGE_L: 0.350
computing CIDEr score...
CIDEr: 0.262
computing SPICE score...


Parsing reference captions
Parsing test captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.5 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.9

SPICE evaluation took: 7.512 s
SPICE: 0.056
Evaluation Metrics: {'Bleu_1': 0.45582329317177545, 'Bleu_2': 0.26728730404406575, 'Bleu_3': 0.13483671952061507, 'Bleu_4': 0.07978427257672112, 'METEOR': 0.13071145719065927, 'ROUGE_L': 0.34994471888266043, 'CIDEr': 0.2622720325163794, 'SPICE': 0.05623738523704123}


In [18]:
import matplotlib.pyplot as plt
from PIL import Image
from pycocotools.coco import COCO

def evaluate_captioning_model(generated_captions, coco_annotation_file=val_annotations_path, coco_image_dir=val_images_path):
    """
    Evaluate the image captioning model using COCO evaluation metrics: BLEU, METEOR, ROUGE, CIDEr.

    Parameters:
        generated_captions (dict): Dictionary of generated captions with image_ids as keys.
        coco_annotation_file (str): Path to COCO annotations file.

    Returns:
        dict: Dictionary containing BLEU, METEOR, ROUGE, CIDEr scores.
    """
    # Load the COCO dataset annotations (reference captions)
    coco = COCO(coco_annotation_file)

    # Create a dictionary for the generated captions (image_id -> caption)
    coco_results = [{'image_id': image_id, 'caption': caption} for image_id, caption in generated_captions.items()]

    # Save the generated captions in a temporary file
    with open('generated_captions.json', 'w') as f:
        json.dump(coco_results, f)

    # Load the results into COCO's evaluation API
    coco_results = coco.loadRes('generated_captions.json')
    print(coco_results)
    print("Generated Captions Image IDs:", generated_captions.keys())
    print("Ground Truth Image IDs:", coco.getImgIds())

    # Filter the image IDs for evaluation
    filtered_image_ids = [image_id for image_id in generated_captions.keys()]

    # Get the reference annotations for evaluation
    gts = {}
    for image_id in filtered_image_ids:
        caption_ids = coco.getAnnIds(imgIds=image_id)
        annotations = coco.loadAnns(caption_ids)
        gts[image_id] = [annotation['caption'] for annotation in annotations]

    # Set up the evaluation
    print("gts.keys(): ", gts.keys())
    print("coco_results.keys(): ", generated_captions.keys())

    coco_eval = COCOEvalCap(coco, coco_results)
    coco_eval.params['image_id'] = filtered_image_ids
    coco_eval.evaluate()

    # Extract and return the metrics (BLEU, METEOR, ROUGE, CIDEr)
    metrics = coco_eval.eval
    return metrics

def show_images_with_captions(generated_captions, coco_annotation_file, coco_image_dir):
    """
    Display images with their generated captions.

    Parameters:
        generated_captions (dict): Dictionary of generated captions with image_ids as keys.
        coco_annotation_file (str): Path to COCO annotations file.
        coco_image_dir (str): Path to the directory containing COCO images.
    """
    # Load the COCO dataset annotations
    coco = COCO(coco_annotation_file)

    for image_id, caption in generated_captions.items():
        # Get image information from COCO
        try:
            image_info = coco.loadImgs(image_id)[0]
            image_path = f"{coco_image_dir}/{image_info['file_name']}"
            print(f"Image path: {image_path}")  # Debug: Print the image path
        except KeyError:
            print(f"Image ID {image_id} not found in the dataset.")
            continue

        # Load and display the image using PIL and matplotlib
        try:
            image = Image.open(image_path)
        except FileNotFoundError:
            print(f"Image file not found at {image_path}")
            continue

        # Print the caption
        print(f"Generated Caption: {caption}")

        # Display the image
        plt.figure(figsize=(8, 8))
        plt.imshow(image)
        plt.axis('off')
        plt.title(f"Generated Caption: {caption}", fontsize=14, wrap=True)
        plt.show()

# Generate captions for COCO validation images
generated_captions = generate_captions_for_coco(val_loader, model, vocabulary)

# Evaluate the generated captions
metrics = evaluate_captioning_model(generated_captions)

# Print evaluation metrics
print("Evaluation Metrics:", metrics)

# Display images and their captions
show_images_with_captions(generated_captions, val_annotations_path, val_images_path)


torch.Size([3, 224, 224])
torch.Size([1, 3, 224, 224])
['<bos>', 'a', 'motorcycle', 'parked', 'in', 'a', 'parking', 'lot', 'with', 'a', 'helmet', 'on', 'the', 'seat', '.', '<eos>']
type(generated_caption) = <class 'str'> , <class 'int'>
Generated Caption for Image 179765: a motorcycle parked in a parking lot with a helmet on the seat .
torch.Size([3, 224, 224])
torch.Size([1, 3, 224, 224])
['<bos>', 'a', 'motorcycle', 'parked', 'in', 'a', 'parking', 'lot', 'with', 'a', 'helmet', 'on', 'the', 'seat', '.', '<eos>']
type(generated_caption) = <class 'str'> , <class 'int'>
Generated Caption for Image 179765: a motorcycle parked in a parking lot with a helmet on the seat .
torch.Size([3, 224, 224])
torch.Size([1, 3, 224, 224])
['<bos>', 'a', 'large', 'black', 'cat', 'sits', 'inside', 'of', 'an', 'empty', 'bathtub', '.', '<eos>']
type(generated_caption) = <class 'str'> , <class 'int'>
Generated Caption for Image 190236: a large black cat sits inside of an empty bathtub .
torch.Size([3, 224, 2

PTBTokenizer tokenized 2696 tokens at 46768.05 tokens per second.
PTBTokenizer tokenized 576 tokens at 13652.71 tokens per second.


setting up scorers...
computing Bleu score...
{'testlen': 498, 'reflen': 469, 'guess': [498, 453, 408, 363], 'correct': [227, 71, 14, 6]}
ratio: 1.0618336886970963
Bleu_1: 0.456
Bleu_2: 0.267
Bleu_3: 0.135
Bleu_4: 0.080
computing METEOR score...
METEOR: 0.131
computing Rouge score...
ROUGE_L: 0.350
computing CIDEr score...
CIDEr: 0.262
computing SPICE score...


Parsing reference captions
Parsing test captions


SPICE evaluation took: 814.4 ms
SPICE: 0.056
Evaluation Metrics: {'Bleu_1': 0.45582329317177545, 'Bleu_2': 0.26728730404406575, 'Bleu_3': 0.13483671952061507, 'Bleu_4': 0.07978427257672112, 'METEOR': 0.13071145719065927, 'ROUGE_L': 0.34994471888266043, 'CIDEr': 0.2622720325163794, 'SPICE': 0.05623738523704123}
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!
Image path: ./coco/val2017/000000179765.jpg
Generated Caption: a motorcycle parked in a parking lot with a helmet on the seat .
Image path: ./coco/val2017/000000190236.jpg
Generated Caption: a large black cat sits inside of an empty bathtub .
Image path: ./coco/val2017/000000331352.jpg
Generated Caption: a toilet with a hello kitty seat sitting in a room .
Image path: ./coco/val2017/000000517069.jpg
Generated Caption: a man is riding a bike down a road .
Image path: ./coco/val2017/000000182417.jpg
Generated Caption: a computer desk with a triple monitor computer setup .
Image path: ./coco/val2017/0

  plt.figure(figsize=(8, 8))


Image path: ./coco/val2017/000000117425.jpg
Generated Caption: a man with a knife and sharpener instructing a group of women .
Image path: ./coco/val2017/000000159977.jpg
Generated Caption: a giraffe standing on a dirt floor with rocks and trees in the background
Image path: ./coco/val2017/000000232538.jpg
Generated Caption: a large boat full of men is sitting on a cart
Image path: ./coco/val2017/000000177934.jpg
Generated Caption: a motorcycle parked in a parking lot with a helmet on the seat .
Image path: ./coco/val2017/000000096549.jpg
Generated Caption: a large jetliner flying over a traffic filled street .
Image path: ./coco/val2017/000000469192.jpg
Generated Caption: a large group of sheep are grazing in a field .
Image path: ./coco/val2017/000000044652.jpg
Generated Caption: a large jetliner flying over a traffic filled street .
Image path: ./coco/val2017/000000210502.jpg
Generated Caption: a large boat full of men is sitting on a cart
Image path: ./coco/val2017/000000298396.jpg