In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        os.path.join(dirname, filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Prepare for Evaluating

## Import libraries

In [2]:
!pip install bert-score
!pip install rouge_score
!pip install pycocoevalcap

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->be

In [3]:
import os
import torch
import json
import numpy as np
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from transformers import AutoModel, AutoTokenizer
from pycocoevalcap.cider.cider import Cider

## Evaluation

In [4]:
def load_generated_captions(captions_file):
    captions_dict = {}
    with open(captions_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                print(f"Warning: Invalid line format in {captions_file}: {line.strip()}")
                continue
            img_name, caption = parts
            captions_dict[img_name] = caption.strip()
    return captions_dict

def load_gt_captions(captions_file):
    gt_captions = {}
    with open(captions_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                print(f"Warning: Invalid line format in {captions_file}: {line.strip()}")
                continue
            img_name, caption = parts
            if img_name not in gt_captions:
                gt_captions[img_name] = []
            gt_captions[img_name].append(caption.strip())
    return gt_captions

# Function to get n-grams from a string
def get_ngrams(text, n):
    tokens = text.split()
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

# Function to debug common n-grams
def debug_ngrams(gts, hyp, max_n=4):
    for n in range(1, max_n+1):
        gts_ngrams = [get_ngrams(gt, n) for gt in gts]
        hyp_ngrams = get_ngrams(hyp, n)
        # Find common n-grams between the hypothesis and each ground truth
        # Note: We're looking for n-grams that appear in at least one GT and in the hypothesis
        common_ngrams = set()
        for gt_ngram_set in gts_ngrams:
            common_ngrams.update(set(gt_ngram_set).intersection(set(hyp_ngrams)))
        print(f"{n}-gram common: {common_ngrams}")

# Evaluation function
def evaluate_captions(generated_captions, gt_captions, dataset_name="Unknown"):
    bleu_scores = {f"BLEU-{i}": [] for i in range(1, 5)}
    rouge_l_scores = []
    bert_p_scores = []
    bert_r_scores = []
    bert_f1_scores = []
    all_gts = {}  # For CIDEr
    all_hyp = {}  # For CIDEr
    smoothing = SmoothingFunction().method1
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
    cider_scorer = Cider()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialize PhoBERT for BERTScore (Vietnamese language model)
    phobert = AutoModel.from_pretrained("vinai/phobert-base")
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    from bert_score.utils import model2layers
    if "vinai/phobert-base" not in model2layers:
        model2layers["vinai/phobert-base"] = 12

    for idx, img_name in enumerate(tqdm(gt_captions.keys(), desc=f"Evaluating {dataset_name}")):
        if img_name not in generated_captions:
            print(f"Warning: No generated caption for {img_name}")
            continue
            
        ground_truths = gt_captions[img_name]  # List of reference captions
        generated_caption = generated_captions.get(img_name, "")
        generated_tokens = generated_caption.split()

        # Skip verbose printing for every image to improve performance
        # print(f"Image: {img_name}")
        # print(f"Generated Caption: {generated_caption}")
        # print(f"Ground Truth Captions: {ground_truths}")
        
        if not generated_caption.strip() or not any(gt.strip() for gt in ground_truths):
            print(f"Warning: Empty caption or ground truth detected for {img_name}!")
            continue
            
        # debug_ngrams(ground_truths, generated_caption)  # Comment out for performance

        # Store for CIDEr - properly format for multiple references
        all_gts[idx] = [gt for gt in ground_truths if gt.strip()]
        all_hyp[idx] = [generated_caption if generated_caption.strip() else ""]

        # BLEU-1 to BLEU-4 with multiple references
        reference_tokenized = [gt.split() for gt in ground_truths]
        for i in range(1, 5):
            weights = tuple(1.0/i if j < i else 0 for j in range(4))
            # Calculate BLEU against all references at once (not individually)
            bleu = sentence_bleu(
                reference_tokenized,  # Pass all references at once
                generated_tokens,
                weights=weights,
                smoothing_function=smoothing
            )
            bleu_scores[f"BLEU-{i}"].append(bleu)

        # ROUGE-L with best match among references
        best_rouge_l = 0.0
        for gt in ground_truths:
            rouge_scores = scorer.score(gt, generated_caption)
            best_rouge_l = max(best_rouge_l, rouge_scores['rougeL'].fmeasure)
        rouge_l_scores.append(best_rouge_l)

        # BERTScore with all references
        # bert-score package handles multiple references internally and returns the best scores
        P, R, F1 = bert_score(
            [generated_caption],  # Single hypothesis
            [ground_truths],      # List of references for one image
            model_type="vinai/phobert-base",
            lang="vi",
            device=device,
            use_fast_tokenizer=True,
            rescale_with_baseline=False
        )
        bert_p_scores.append(P.item())
        bert_r_scores.append(R.item())
        bert_f1_scores.append(F1.item())

    # Compute CIDEr
    try:
        cider_score, cider_scores_per_image = cider_scorer.compute_score(all_gts, all_hyp)
        cider_scores = list(cider_scores_per_image)
    except Exception as e:
        print(f"Error computing CIDEr: {e}")
        cider_scores = [0.0] * len(all_gts)

    # Compute average scores
    avg_bleu_scores = {f"BLEU-{i}": np.mean(bleu_scores[f"BLEU-{i}"]) for i in range(1, 5)}
    avg_rouge_l = np.mean(rouge_l_scores)
    avg_bert_p = np.mean(bert_p_scores)
    avg_bert_r = np.mean(bert_r_scores)
    avg_bert_f1 = np.mean(bert_f1_scores)
    avg_cider = np.mean(cider_scores)

    # Print results
    print(f"\nEvaluation Results for {dataset_name}:")
    for i in range(1, 5):
        print(f"Average BLEU-{i}: {avg_bleu_scores[f'BLEU-{i}']:.4f}")
    print(f"Average ROUGE-L: {avg_rouge_l:.4f}")
    print(f"Average BERTScore Precision: {avg_bert_p:.4f}")
    print(f"Average BERTScore Recall: {avg_bert_r:.4f}")
    print(f"Average BERTScore F1: {avg_bert_f1:.4f}")
    print(f"Average CIDEr: {avg_cider:.4f}")

    results = {
        **avg_bleu_scores,
        "ROUGE-L": avg_rouge_l,
        "BERTScore_P": avg_bert_p,
        "BERTScore_R": avg_bert_r,
        "BERTScore_F1": avg_bert_f1,
        "CIDEr": avg_cider
    }
    return results

# Main

## CNN + LSTM (Attention)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Running on device: {device}")

# define datasets
datasets = [
    {
        "name": "capydata-ic",
        "gt_captions_file": "/kaggle/input/capydata-ic/data/test/captions.txt",
        "generated_captions_file": "/kaggle/input/test-generated-captions/cnn_lstm/cnn_lstm/us_generated_captions.txt"
    },
    {
        "name": "test_uit_viic",
        "gt_captions_file": "/kaggle/input/uit-viic-test/test/captions.txt",
        "generated_captions_file": "/kaggle/input/test-generated-captions/cnn_lstm/cnn_lstm/uit_generated_captions.txt"
    }
]

results = {}
for dataset in datasets:
    # Load captions
    generated_captions = load_generated_captions(dataset["generated_captions_file"])
    gt_captions = load_gt_captions(dataset["gt_captions_file"])
    
    # Evaluate
    results[dataset["name"]] = evaluate_captions(
        generated_captions, gt_captions, dataset["name"]
    )
    
# Compare results
print("\nComparison of Results:")
print(f"{'Metric':<20} {'capydata-ic':<15} {'test_uit_viic':<15}")
for metric in results["capydata-ic"]:
    print(f"{metric:<20} {results['capydata-ic'][metric]:<15.4f} {results['test_uit_viic'][metric]:<15.4f}")

Running on device: cuda


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

2025-05-08 05:44:34.333969: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746683074.567152      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746683074.629384      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]


Evaluating capydata-ic:   0%|          | 0/279 [00:00<?, ?it/s][A
Evaluating capydata-ic:   0%|          | 1/279 [00:02<09:55,  2.14s/it][A
Evaluating capydata-ic:   1%|          | 2/279 [00:03<07:34,  1.64s/it][A
Evaluating capydata-ic:   1%|          | 3/279 [00:04<06:43,  1.46s/it][A
Evaluating capydata-ic:   1%|▏         | 4/279 [00:05<06:25,  1.40s/it][A
Evaluating capydata-ic:   2%|▏         | 5/279 [00:07<06:30,  1.42s/it][A
Evaluating capydata-ic:   2%|▏         | 6/279 [00:08<06:16,  1.38s/it][A
Evaluating capydata-ic:   3%|▎         | 7/279 [00:10<06:23,  1.41s/it][A
Evaluating capydata-ic:   3%|▎         | 8/279 [00:12<07:00,  1.55s/it][A
Evaluating capydata-ic:   3%|▎         | 9/279 [00:13<06:37,  1.47s/it][A
Evaluating capydata-ic:   4%|▎         | 10/279 [00:14<06:21,  1.42s/it][A
Evaluating capydata-ic:   4%|▍         | 11/279 [00:15<06:07,  1.37s/it][A
Evaluating capydata-ic:   4%|▍         | 12/279 [00:17<06:03,  1.36s/it][A
Evaluating capydata-ic:   5%|


Evaluation Results for capydata-ic:
Average BLEU-1: 0.5873
Average BLEU-2: 0.3728
Average BLEU-3: 0.2227
Average BLEU-4: 0.1413
Average ROUGE-L: 0.4859
Average BERTScore Precision: 0.5960
Average BERTScore Recall: 0.5779
Average BERTScore F1: 0.5853
Average CIDEr: 0.3922


Evaluating test_uit_viic: 100%|██████████| 578/578 [09:56<00:00,  1.03s/it]



Evaluation Results for test_uit_viic:
Average BLEU-1: 0.3962
Average BLEU-2: 0.1623
Average BLEU-3: 0.0715
Average BLEU-4: 0.0455
Average ROUGE-L: 0.4052
Average BERTScore Precision: 0.5075
Average BERTScore Recall: 0.5480
Average BERTScore F1: 0.5241
Average CIDEr: 0.0952

Comparison of Results:
Metric               capydata-ic     test_uit_viic  
BLEU-1               0.5873          0.3962         
BLEU-2               0.3728          0.1623         
BLEU-3               0.2227          0.0715         
BLEU-4               0.1413          0.0455         
ROUGE-L              0.4859          0.4052         
BERTScore_P          0.5960          0.5075         
BERTScore_R          0.5779          0.5480         
BERTScore_F1         0.5853          0.5241         
CIDEr                0.3922          0.0952         


## CNN + T5 

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Running on device: {device}")

# define datasets
datasets = [
    {
        "name": "capydata-ic",
        "gt_captions_file": "/kaggle/input/capydata-ic/data/test/captions.txt",
        "generated_captions_file": "/kaggle/input/test-generated-captions/cnn_t5/cnn_t5/us_generated_captions.txt"
    },
    {
        "name": "test_uit_viic",
        "gt_captions_file": "/kaggle/input/uit-viic-test/test/captions.txt",
        "generated_captions_file": "/kaggle/input/test-generated-captions/cnn_t5/cnn_t5/uit_generated_captions.txt"
    }
]

results = {}
for dataset in datasets:
    # Load captions
    generated_captions = load_generated_captions(dataset["generated_captions_file"])
    gt_captions = load_gt_captions(dataset["gt_captions_file"])
    
    # Evaluate
    results[dataset["name"]] = evaluate_captions(
        generated_captions, gt_captions, dataset["name"]
    )
    
# Compare results
print("\nComparison of Results:")
print(f"{'Metric':<20} {'capydata-ic':<15} {'test_uit_viic':<15}")
for metric in results["capydata-ic"]:
    print(f"{metric:<20} {results['capydata-ic'][metric]:<15.4f} {results['test_uit_viic'][metric]:<15.4f}")

Running on device: cuda


Evaluating capydata-ic: 100%|██████████| 279/279 [04:48<00:00,  1.03s/it]



Evaluation Results for capydata-ic:
Average BLEU-1: 0.5640
Average BLEU-2: 0.3263
Average BLEU-3: 0.1849
Average BLEU-4: 0.1163
Average ROUGE-L: 0.4910
Average BERTScore Precision: 0.5739
Average BERTScore Recall: 0.5577
Average BERTScore F1: 0.5643
Average CIDEr: 0.3244


Evaluating test_uit_viic: 100%|██████████| 578/578 [09:58<00:00,  1.04s/it]



Evaluation Results for test_uit_viic:
Average BLEU-1: 0.3516
Average BLEU-2: 0.1577
Average BLEU-3: 0.0707
Average BLEU-4: 0.0459
Average ROUGE-L: 0.4394
Average BERTScore Precision: 0.5141
Average BERTScore Recall: 0.5571
Average BERTScore F1: 0.5312
Average CIDEr: 0.1001

Comparison of Results:
Metric               capydata-ic     test_uit_viic  
BLEU-1               0.5640          0.3516         
BLEU-2               0.3263          0.1577         
BLEU-3               0.1849          0.0707         
BLEU-4               0.1163          0.0459         
ROUGE-L              0.4910          0.4394         
BERTScore_P          0.5739          0.5141         
BERTScore_R          0.5577          0.5571         
BERTScore_F1         0.5643          0.5312         
CIDEr                0.3244          0.1001         
