# **Model Evaluation & Testing**

In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# that packages living there can be imported by the cells below.
parent_of_cwd = Path.cwd().parent.absolute()
root_dir = str(parent_of_cwd)
if root_dir not in sys.path:
    # Only insert once, so re-running this cell does not add duplicates.
    sys.path.insert(0, root_dir)

In [2]:
import tensorflow as tf
import pickle
import numpy as np

import matplotlib.pyplot as plt

from src.evaluation.metrics import BLEUScore
from src.evaluation.beam_search import BeamSearchDecoder
from src.data.preprocessing import DataPreprocessor
from src.utils import load_tokenizer
from config import Config

2025-10-22 13:07:37.575504: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-22 13:07:37.634629: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-22 13:07:38.974964: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


## **1. Load Models & Tokenizers**

In [3]:
# Locations of the saved artifacts, all relative to Config.ARTIFACT_PATH.
bilstm_model_path = f"{Config.ARTIFACT_PATH}/bilstm/final_bilstm_model.keras"
lstm_model_path = f"{Config.ARTIFACT_PATH}/lstm/final_lstm_model.keras"
tokenizer_en_path = f"{Config.ARTIFACT_PATH}/tokenizers/tokenizer_en.pkl"
tokenizer_vi_path = f"{Config.ARTIFACT_PATH}/tokenizers/tokenizer_vi.pkl"

# Both models are loaded with compile=False: this notebook only runs
# inference, so no optimizer/loss configuration is requested from Keras.
bilstm_model = tf.keras.models.load_model(bilstm_model_path, compile=False)
lstm_model = tf.keras.models.load_model(lstm_model_path, compile=False)

# English and Vietnamese tokenizers (the `.pkl` suffix suggests pickled
# objects - only load files produced by this project).
tokenizer_en = load_tokenizer(tokenizer_en_path)
tokenizer_vi = load_tokenizer(tokenizer_vi_path)

print("Models and tokenizers loaded")

I0000 00:00:1761113259.937701 1450830 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3470 MB memory:  -> device: 0, name: NVIDIA GeForce MX230, pci bus id: 0000:01:00.0, compute capability: 6.1


Tokenizer loaded from .../artifacts/tokenizers/tokenizer_en.pkl
Tokenizer loaded from .../artifacts/tokenizers/tokenizer_vi.pkl
Models and tokenizers loaded


## **2. Setup Decoders**

In [4]:
# Everything except the model is identical for the two decoders, so the
# shared positional arguments are collected once and unpacked into each call.
shared_decoder_args = (
    tokenizer_en,
    tokenizer_vi,
    Config.MAX_LENGTH_SRC,
    Config.MAX_LENGTH_TRG,
)

bilstm_decoder = BeamSearchDecoder(bilstm_model, *shared_decoder_args)
lstm_decoder = BeamSearchDecoder(lstm_model, *shared_decoder_args)

# Four equal weights of 0.25 each, passed to the project's BLEUScore
# (presumably one weight per n-gram order 1..4 - confirm in src.evaluation.metrics).
bleu_scorer = BLEUScore(weights=[0.25, 0.25, 0.25, 0.25])

## **3. Test Translations**

In [5]:
# Hand-picked English sentences for a quick qualitative comparison.
test_sentences = [
    "Hello, how are you?",
    "I love machine learning.",
    "The weather is beautiful today.",
    "Can you help me with this problem?",
    "Thank you for your time."
]

# Decoders in the order their translations are printed for every sentence.
labelled_decoders = (
    ("BiLSTM", bilstm_decoder),
    ("LSTM", lstm_decoder),
)

heavy_rule = "="*80
light_rule = "-"*80

print(heavy_rule)
print("TRANSLATION TESTS")
print(heavy_rule)

for sentence in test_sentences:
    print(f"\nInput: {sentence}")

    # Translate with each model in turn and print the result right away.
    for model_label, decoder in labelled_decoders:
        translation = decoder.translate(sentence)
        print(f"{model_label}: {translation}")

    print(light_rule)

TRANSLATION TESTS

Input: Hello, how are you?


2025-10-22 13:07:42.992544: E tensorflow/core/util/util.cc:131] oneDNN supports DT_HALF only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


BiLSTM: <UNK> <UNK> bạn <UNK> như <UNK>
LSTM: <UNK> <UNK> <UNK> bạn
--------------------------------------------------------------------------------

Input: I love machine learning.
BiLSTM: tôi <UNK> <UNK> <UNK>
LSTM: tôi <UNK> <UNK> <UNK>
--------------------------------------------------------------------------------

Input: The weather is beautiful today.
BiLSTM: <UNK> <UNK> <UNK> <UNK>
LSTM: <UNK> <UNK> <UNK> <UNK>
--------------------------------------------------------------------------------

Input: Can you help me with this problem?
BiLSTM: bạn có_thể <UNK> tôi <UNK> <UNK> này không
LSTM: bạn có_thể <UNK> tôi với <UNK> này không
--------------------------------------------------------------------------------

Input: Thank you for your time.
BiLSTM: <UNK> bạn đã <UNK> <UNK> của <UNK>
LSTM: <UNK> bạn <UNK> <UNK> của bạn
--------------------------------------------------------------------------------


## **4. BLEU Score Evaluation**

In [None]:
# Build the preprocessor with the vocabulary limits from Config.
preprocessor = DataPreprocessor(
    max_vocab_src=Config.MAX_VOCAB_SIZE_SRC,
    max_vocab_trg=Config.MAX_VOCAB_SIZE_TRG,
)

# Raw text files under Config.DATA_PATH: en.txt is passed as the source side,
# vi.txt as the target side.
raw_src_path = f"{Config.DATA_PATH}/raw/en.txt"
raw_trg_path = f"{Config.DATA_PATH}/raw/vi.txt"

# The same length limits used by the decoders are passed to the loader.
df = preprocessor.load_data(
    src_path=raw_src_path,
    trg_path=raw_trg_path,
    max_length_src=Config.MAX_LENGTH_SRC,
    max_length_trg=Config.MAX_LENGTH_TRG,
)

In [None]:
# Keep only the third partition returned by split_data as the test set; the
# first two (presumably train and validation - confirm in DataPreprocessor)
# are not used in this notebook.
# Named throwaways are used instead of `_`: assigning to `_` in an IPython
# kernel shadows the "last output" variable and IPython stops updating it.
_first_split, _second_split, test_df = preprocessor.split_data(df)

In [None]:
# Score both models on a random subset of the test set (capped for speed).
EVAL_SAMPLE_SIZE = 100
# Fixed seed: without it every run scores a different subset, so the BLEU
# numbers reported below would not be reproducible across kernel restarts.
EVAL_SEED = 42

test_sample = test_df.sample(
    n=min(EVAL_SAMPLE_SIZE, len(test_df)),
    random_state=EVAL_SEED,
)

# One BLEU score per sampled sentence, in the same order for both models.
bilstm_bleu_scores = []
lstm_bleu_scores = []

# Iterate over the two text columns directly; the row index is not needed.
for en_text, vi_target in zip(test_sample['src'], test_sample['trg']):
    # Drop the START/END markers from the reference before scoring.
    # NOTE(review): str.replace removes these substrings anywhere in the
    # sentence, not only at the ends - confirm they cannot occur in real text.
    vi_ref = vi_target.replace('START ', '').replace(' END', '')

    # BiLSTM
    bilstm_trans = bilstm_decoder.translate(en_text)
    bilstm_bleu_scores.append(bleu_scorer.compute(vi_ref, bilstm_trans))

    # LSTM
    lstm_trans = lstm_decoder.translate(en_text)
    lstm_bleu_scores.append(bleu_scorer.compute(vi_ref, lstm_trans))

In [None]:
# Report the number of sentences that were actually scored: the sample is
# capped by the test-set size, so it is not always 100.
n_scored = len(bilstm_bleu_scores)

# Compute each mean once and reuse it for the difference.
bilstm_mean_bleu = np.mean(bilstm_bleu_scores)
lstm_mean_bleu = np.mean(lstm_bleu_scores)

print("\n" + "="*80)
print(f"BLEU SCORE EVALUATION ({n_scored} samples)")
print("="*80)
print(f"BiLSTM - Average BLEU: {bilstm_mean_bleu:.2f}")
print(f"LSTM - Average BLEU: {lstm_mean_bleu:.2f}")
# Signed difference: a negative value means the LSTM scored higher.
print(f"Improvement: {bilstm_mean_bleu - lstm_mean_bleu:.2f} points")

## **5. Visualization**

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Use one set of bin edges for both models. With bins=20 passed separately,
# each histogram derives its edges from its own min/max, so overlaid bars
# would cover different score intervals and could not be compared directly.
shared_bins = np.histogram_bin_edges(
    np.concatenate([bilstm_bleu_scores, lstm_bleu_scores]),
    bins=20,
)

ax.hist(bilstm_bleu_scores, bins=shared_bins, alpha=0.7, label='BiLSTM', edgecolor='black')
ax.hist(lstm_bleu_scores, bins=shared_bins, alpha=0.7, label='LSTM', edgecolor='black')

ax.set_xlabel('BLEU Score')
ax.set_ylabel('Frequency')
ax.set_title('BLEU Score Distribution Comparison')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
# plt.savefig(f'{Config.ASSETS_PATH}/bleu_comparison.png', dpi=300)
plt.show()

## **6. Summary Report**

In [None]:
# (label, per-sentence BLEU scores, model) for each system, in report order.
summary_rows = (
    ("BiLSTM", bilstm_bleu_scores, bilstm_model),
    ("LSTM", lstm_bleu_scores, lstm_model),
)

print("\n" + "="*80)
print("FINAL EVALUATION SUMMARY")
print("="*80)

# Same statistics for every model: mean/min/max BLEU and parameter count.
for model_label, scores, model in summary_rows:
    print(f"\n{model_label} Model:")
    print(f"  - Average BLEU: {np.mean(scores):.2f}")
    print(f"  - Min BLEU: {np.min(scores):.2f}")
    print(f"  - Max BLEU: {np.max(scores):.2f}")
    print(f"  - Parameters: {model.count_params():,}")

# Name the model that actually scored higher instead of always crediting the
# BiLSTM; the sign of the difference decides the wording.
bleu_gap = np.mean(bilstm_bleu_scores) - np.mean(lstm_bleu_scores)

print("\nConclusion:")
if bleu_gap > 0:
    print(f"  BiLSTM performs {bleu_gap:.2f} BLEU points better")
elif bleu_gap < 0:
    print(f"  LSTM performs {-bleu_gap:.2f} BLEU points better")
else:
    print("  BiLSTM and LSTM reach the same average BLEU")
print("="*80)