In [9]:
# -*- coding: utf-8 -*-
"""
Created on Mon May 27 16:57:52 2024

@author: Naitik
"""
import torch # type: ignore
import json

from pycocoevalcap.cider import cider
import pycocoevalcap.meteor as meteor
from pycocoevalcap.rouge import rouge
from pycocoevalcap.bleu import bleu
from get_loader_test import get_loader
import torchvision.transforms as transforms # type: ignore
from  New_Pruned_Model import EncodertoDecoder as pruned_model
from nltk.translate.meteor_score import meteor_score # type: ignore
from nltk.tokenize import word_tokenize # type: ignore
from torch.utils.tensorboard import SummaryWriter # type: ignore

In [2]:
import sys
import os

gl = sys.stdout

# Suppress print statements
def suppress_stdout():
    sys.stdout = open(os.devnull, 'w')

# Restore print statements
def restore_stdout():
    sys.stdout = gl

In [3]:
def caption_generate_2(model,dataset,image,device,max_length = 50):
    outputs=[dataset.vocab.stoi["<SOS>"]]
    for i in range(max_length):
        trg_tensor =torch.LongTensor(outputs).unsqueeze(1).to(device)
        print(trg_tensor.shape)
        image = image.to(device)
        
        
        with torch.no_grad():
            output = model(image,trg_tensor)
                
        print("From here" , output.shape)
            
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)
        
        if best_guess == dataset.vocab.stoi["<EOS>"]:
            break
    caption = [dataset.vocab.itos[idx] for idx in outputs]
    
    return caption[1:]

def caption_generate(model,dataset,image,device,max_length = 50):
    outputs=[dataset.vocab.stoi["<SOS>"]]
    for i in range(max_length):
        trg_tensor =torch.LongTensor(outputs).unsqueeze(1).to(device)
        print(trg_tensor.shape)
        image = image.to(device)
        
        
        with torch.no_grad():
            output, _ = model(image,trg_tensor)
                
        print("From here" , output.shape)
            
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)
        
        if best_guess == dataset.vocab.stoi["<EOS>"]:
            break
    caption = [dataset.vocab.itos[idx] for idx in outputs]
    
    return caption[1:]

In [4]:
import shutil

In [5]:
import os
import json
import torch
import torchvision.utils as vutils

def run_validation(pruned_model, unpruned_model, validation_dataloader, validation_dataset, max_len, device):
    pruned_model.eval()
    unpruned_model.eval()
    
    results_dict = {}

    # Create logs directory
    logs_dir = "rscid_logs"
    os.makedirs(logs_dir, exist_ok=True)

    with torch.no_grad():
        for idx, (image, caption, image_name) in enumerate(validation_dataloader):
            image = image.to(device)

            # Ensure batch size is 1
            assert image.size(0) == 1, "Batch size must be 1 for validation"

            print(f"Processing Image {idx + 1}: {image_name[0]}")

            # Generate captions for both models
            suppress_stdout()  # Disable print
            pruned_out = caption_generate(pruned_model, validation_dataset, image, device, max_len)
            unpruned_out = caption_generate_2(unpruned_model, validation_dataset, image, device, max_len)
            restore_stdout()

            # Convert target captions to text
            target_text = caption.detach().cpu().numpy().tolist()
            target_text_flat = [token for sublist in target_text for token in sublist]

            # Convert predicted tokens to text
            def tokens_to_text(tokens):
                text = ""
                for token in tokens:
                    if token == '<EOS>':
                        break
                    text += token + " "
                return text.strip()

            expected_text = " ".join(validation_dataset.vocab.itos[i] for i in target_text_flat if validation_dataset.vocab.itos[i] != '<EOS>')
            pruned_text = tokens_to_text(pruned_out)
            unpruned_text = tokens_to_text(unpruned_out)

            # Store results in dictionary
            image_name_str = image_name[0]  # Assuming image_name is a list with one string
            if image_name_str not in results_dict:
                results_dict[image_name_str] = []
            
            results_dict[image_name_str].append({
                "Expected": expected_text,
                "Pruned_Predicted": pruned_text,
                "Unpruned_Predicted": unpruned_text
            })
            
            print({
                "Expected": expected_text,
                "Pruned_Predicted": pruned_text,
                "Unpruned_Predicted": unpruned_text
            })

            # Create image folder inside logs
            image_folder = os.path.join(logs_dir, "".join(image_name_str.split(".")[:-1]))
            os.makedirs(image_folder, exist_ok=True)

            # Save image inside the corresponding folder
            image_path = os.path.join(image_folder, "image.png")
            shutil.copy(os.path.join('D:\ML\Korea\Jishu\Jishu\\rsicd\images\\', image_name_str), image_path)

            # Save results in text format for easy access
            with open(os.path.join(image_folder, "captions.json"), "w") as f:
                json.dump(results_dict[image_name_str], f, indent=4)

            print(f"✅ Saved results for {image_name_str}")

    # Save full results dictionary as JSON
    with open(os.path.join(logs_dir, "results.json"), "w") as json_file:
        json.dump(results_dict, json_file, indent=4)

    print("✅ All results saved successfully in logs folder!")


  shutil.copy(os.path.join('D:\ML\Korea\Jishu\Jishu\\rsicd\images\\', image_name_str), image_path)


In [6]:
print("Hello")

Hello


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:

transform = transforms.Compose([transforms.Resize((256,256)),
                            transforms.ToTensor(),
                            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

images_path , caption_path = r"D:\ML\Korea\Jishu\Jishu\rsicd\images" , r"D:\ML\Korea\Jishu\Jishu\rsicd\captions.csv"
validation_dataloader , validation_dataset = get_loader(images_path,caption_path ,transform,num_workers=4 , train = False)
trg_vocab_size = len(validation_dataset.vocab)
max_len = 50

# Our Approach

In [11]:

pruned_model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Complete_Model_Pruning\model_final_4_2_T.pth')
pruned_model = pruned_model.to(device)

# Initialize the tensorboard
logs_dir = "logs"
writer = SummaryWriter(logs_dir)

  pruned_model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Complete_Model_Pruning\model_final_4_2_T.pth')
  pruned_model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Complete_Model_Pruning\model_final_4_2_T.pth')


# Unpruned Model

In [8]:
import torch
import sys
sys.path.append("../Original_Image_Captioning_Model")  # Add the parent folder
from Model import *  # Now import works

unpruned_model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Original_Image_Captioning_Model\original_unpruned_model.pth')
unpruned_model = unpruned_model.to(device)

  unpruned_model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Original_Image_Captioning_Model\original_unpruned_model.pth')
  unpruned_model = torch.load('D:\ML\Korea\Jishu\Jishu\Final_Docs\Original_Image_Captioning_Model\original_unpruned_model.pth')


In [12]:
print("Hello")

Hello


In [13]:
run_validation(pruned_model,unpruned_model, validation_dataloader, validation_dataset, max_len, device)

Processing Image 1: resort_145.jpg


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


{'Expected': '<SOS> several buildings and many green trees are near a river .', 'Pruned_Predicted': 'several buildings with swimming pool are semi - surrounded by a piece of water .', 'Unpruned_Predicted': 'many buildings and green trees are in a school .'}
✅ Saved results for resort_145.jpg
Processing Image 2: mediumresidential_259.jpg
{'Expected': '<SOS> some buildings and many green trees are located in an average residential area .', 'Pruned_Predicted': 'some buildings and many green trees are in a medium residential area .', 'Unpruned_Predicted': 'many green trees are around a building with a swimming pool .'}
✅ Saved results for mediumresidential_259.jpg
Processing Image 3: playground_106.jpg
{'Expected': '<SOS> a football field is close to a road and several cars .', 'Pruned_Predicted': 'a football field is near a road with cars .', 'Unpruned_Predicted': 'a football field is near several buildings and some green trees .'}
✅ Saved results for playground_106.jpg
Processing Image 4

# Calculate Metric Score

In [18]:
with open("results.json", "r") as file:
    data = json.load(file)  # Convert JSON to Python dictionary

In [19]:
expected = data["expected"]
predicted = data["predicted"]

In [23]:
scorer = rouge.Rouge()
cider_scorer = cider.Cider()
bleu_scorer = bleu.Bleu(4)
ciders = 0

In [24]:
gts = {}
res = {}
met = 0

for idx, (ref, pred) in enumerate(zip(expected, predicted)):
    if isinstance(ref, str):
        ref = [ref]
    if isinstance(pred, str):
        pred = [pred]

    # Sanity check
    assert(type(ref) is list)
    assert(len(ref) > 0)
    assert(type(pred) is list)
    assert(len(pred) == 1)

    # Fill the gts and res dictionaries for CIDEr scoring
    gts[idx] = ref
    res[idx] = pred
    met += meteor_score([word_tokenize(ref[0].replace("<SOS" , ""))], word_tokenize(pred[0]))

In [26]:
cider_score, cider_scores = cider_scorer.compute_score(gts, res)
rougeL, rouge_scores = scorer.compute_score(gts, res)
bleu_score, bleu_scores = bleu_scorer.compute_score(gts, res)

{'testlen': 52343, 'reflen': 58475, 'guess': [52343, 47800, 43257, 38714], 'correct': [26786, 12228, 7146, 4460]}
ratio: 0.8951346729371373


In [None]:
print("ROUGE-L = ", rougeL)
print("Cider = ", ciders)
print("Bleu = ", bleu_score)

ROUGE-L =  0.43039683333350276
Cider =  1.7229689709756495
Bleu =  [0.4551677779184838, 0.3218182357591058, 0.24780879933616404, 0.19871690288396454]


In [None]:
print("Meteor = ", met/len(expected))

Meteor =  0.39987502808888814


# 