In [10]:
# -*- coding: utf-8 -*-
"""
Created on Mon May 27 16:57:52 2024

@author: Naitik
"""
import torch
import json

from pycocoevalcap.cider import cider
import pycocoevalcap.meteor as meteor
from pycocoevalcap.rouge import rouge
from pycocoevalcap.bleu import bleu
from Model import *
from get_loader import get_loader
import torchvision.transforms as transforms
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize
from torch.utils.tensorboard import SummaryWriter

In [11]:
def caption_generate(model,dataset,image,device,max_length = 50):
    outputs=[dataset.vocab.stoi["<SOS>"]]
    for i in range(max_length):
        trg_tensor =torch.LongTensor(outputs).unsqueeze(1).to(device)
        image = image.to(device)
        
        with torch.no_grad():
            output = model(image,trg_tensor)
            
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)
        
        if best_guess == dataset.vocab.stoi["<EOS>"]:
            break
    caption = [dataset.vocab.itos[idx] for idx in outputs]
    
    return caption[1:]

In [12]:
def run_validation(model, validation_dataloader, validation_dataset, max_len, device, writer):
    model.eval()
    count = 0

    expected = []
    predicted = []
    results_dict = {}

    with torch.no_grad():
        for idx , (image , caption) in enumerate(validation_dataloader):
            
            count += 1
            image = image.to(device)
            #encoder_mask = batch["decoder_mask"].to(device) # (b, 1, 1, seq_len)
            
            # Check that the batch size is 1
            assert image.size(0) == 1, "Batch size must be 1 for validation"

            print("Processing Image:", count)
            model_out = caption_generate(model, validation_dataset , image , device , max_len)

            # Convert PyTorch tensors to NumPy arrays
            target_text = caption.detach().cpu().numpy().tolist()
            target_text_flat = [token for sublist in target_text for token in sublist]
            
            # Initialize strings to store the predicted and target text
            model_out_text = ""
            target_text_2 = ""

            # Iterate over the predicted tokens
            for i in model_out:
                token = i
                if token == '<EOS>':
                    break
                model_out_text += token + " "

            # Iterate over the target tokens
            for i in target_text_flat:
                token = validation_dataset.vocab.itos[i]
                if token == '<EOS>':
                    break
                target_text_2 += token + " "


            expected.append(target_text_2.strip())
            predicted.append(model_out_text.strip())

            # Alternatively, if you have a loop for multiple pairs, you can use:
            print("Expected :- ", target_text_2)
            print("Predicted :- ", model_out_text)
    results_dict["expected"] = expected
    results_dict["predicted"] = predicted

    with open("results.json", "w") as json_file:
        json.dump(results_dict, json_file, indent=4)
        

In [None]:

transform = transforms.Compose([transforms.Resize((350,350)),
                            transforms.RandomCrop((256,256)),
                            transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

images_path , caption_path = r"D:\ML\Korea\Jishu\Jishu\rsicd\images" , r"D:\ML\Korea\Jishu\Jishu\rsicd\captions.csv"

BATCH_SIZE = 32
validation_dataloader , validation_dataset = get_loader(images_path,caption_path ,transform,batch_size = BATCH_SIZE,num_workers=4 , train = False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 15
learning_rate = 3e-4
trg_vocab_size = len(validation_dataset.vocab)

embedding_size = 512
num_heads = 8
num_decoder_layers = 4
dropout = 0.20
pad_index=validation_dataset.vocab.stoi["<PAD>"]
save_model = True
writer =SummaryWriter("runs/loss_plot")
step = 0
max_len = 50


# Now we load the model
model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Original_Image_Captioning_Model\original_unpruned_model.pth')
model = model.to(device)

# Initialize the tensorboard
logs_dir = "logs"
writer = SummaryWriter(logs_dir)

# Now we have to send these dataset and dataloaders to the run_validation function
run_validation(model, validation_dataloader, validation_dataset, max_len, device , writer)

  model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Original_Image_Captioning_Model\original_unpruned_model.pth')
  model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Original_Image_Captioning_Model\original_unpruned_model.pth')


Processing Image: 1


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Expected :-  <SOS> many trees and green buildings are close to a viaduct . 
Predicted :-  many green trees and several buildings are in a resort with a pond . 
Processing Image: 2
Expected :-  <SOS> many aircraft are parked in an airport . 
Predicted :-  many planes are parked near several terminals in an airport with several parking lots . 
Processing Image: 3
Expected :-  <SOS> a rectangular square with some green trees lies between a road with cars and several buildings . 
Predicted :-  many buildings and some green trees are in a dense residential area . 
Processing Image: 4
Expected :-  <SOS> it is an exactly round stadium with plenty of bleachers in it surrounding the soccer field . 
Predicted :-  a playground is surrounded by many green trees and several buildings . 
Processing Image: 5
Expected :-  <SOS> this is a square gym . 
Predicted :-  a football field is in a white oval stadium . 
Processing Image: 6
Expected :-  <SOS> round the square is surrounded by rows of houses . 


In [17]:
with open("results.json", "r") as file:
    data = json.load(file)  # Convert JSON to Python dictionary

In [21]:
expected = data["4542"][0]
predicted = data["4542"][1]

In [22]:
len(expected)

4543

In [24]:
scorer = rouge.Rouge()
cider_scorer = cider.Cider()
bleu_scorer = bleu.Bleu(4)
ciders = 0

In [25]:
gts = {}
res = {}
met = 0

for idx, (ref, pred) in enumerate(zip(expected, predicted)):
    if isinstance(ref, str):
        ref = [ref]
    if isinstance(pred, str):
        pred = [pred]

    # Sanity check
    assert(type(ref) is list)
    assert(len(ref) > 0)
    assert(type(pred) is list)
    assert(len(pred) == 1)

    # Fill the gts and res dictionaries for CIDEr scoring
    gts[idx] = ref
    res[idx] = pred
    met += meteor_score([word_tokenize(ref[0].replace("<SOS" , ""))], word_tokenize(pred[0]))

In [26]:
met/len(expected)

0.3173358242124882

In [27]:
cider_score, cider_scores = cider_scorer.compute_score(gts, res)
rougeL, rouge_scores = scorer.compute_score(gts, res)
bleu_score, bleu_scores = bleu_scorer.compute_score(gts, res)

{'testlen': 55406, 'reflen': 58475, 'guess': [55406, 50863, 46320, 41777], 'correct': [22848, 8144, 4378, 2528]}
ratio: 0.947516032492502


In [28]:
ciders = cider_score
logs_dir = "logs"
writer = SummaryWriter(logs_dir)
writer.add_scalar('validation RougeL', rougeL)
writer.add_scalar('validation Bleu', bleu_score[0])
writer.add_scalar('validation Cider', ciders)
writer.flush()

print("ROUGE-L = ", rougeL)
print("Cider = ", ciders)
print("Bleu = ", bleu_score)

ROUGE-L =  0.35568647810979576
Cider =  0.9987754388360597
Bleu =  [0.39015334772089255, 0.24311263918155132, 0.17418954146338497, 0.13189019128871057]


In [30]:
logs_dir = "logs"
writer = SummaryWriter(logs_dir)
writer.add_scalar('validation Meteor', met/len(expected))
writer.flush()

print("Meteor = ", met/len(expected))

Meteor =  0.3173358242124882
