# **Model Evaluation & Testing**

In [None]:
import sys
from pathlib import Path

# Put the directory two levels above the current working directory on the
# import path (presumably the project root, so that the `src` and `config`
# imports in the following cell resolve). The membership check keeps the
# entry from being inserted again when the cell is re-run.
root_dir = str(Path.cwd().parent.parent.absolute())
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

In [None]:
import tensorflow as tf
import pickle
import numpy as np

import matplotlib.pyplot as plt

from src.evaluation.metrics import BLEUScore
from src.evaluation.beam_search import BeamSearchDecoder
from src.data.preprocessing import DataPreprocessor
from src.utils import load_tokenizer
from config import Config

# Materialize the project settings as a mapping; the cells below read paths,
# vocabulary sizes and sequence-length limits from it by key.
config = Config.to_dict()

## **1. Load Models & Tokenizers**

In [None]:
# Restore both trained translation models and the two tokenizers from disk.
#
# The models are loaded with compile=False: this notebook only runs
# translation and parameter counting, never training.
#
# NOTE: the config keys inside the f-strings use single quotes on purpose.
# Reusing the enclosing double quote inside a replacement field is only
# accepted from Python 3.12 onwards and is a SyntaxError on older versions.

# Load BiLSTM model
bilstm_model = tf.keras.models.load_model(
    f"{config['model_save_path']}/bilstm_model.h5",
    compile=False
)

# Load LSTM model
lstm_model = tf.keras.models.load_model(
    f"{config['model_save_path']}/lstm_model.h5",
    compile=False
)

# Load tokenizers (English and Vietnamese)
tokenizer_en = load_tokenizer(f"{config['tokenizer_path']}/tokenizer_en.pkl")
tokenizer_vi = load_tokenizer(f"{config['tokenizer_path']}/tokenizer_vi.pkl")

print("Models and tokenizers loaded")

## **2. Setup Decoders**

In [None]:
# Both decoders are built from the same tokenizers and length limits; only
# the wrapped model differs, so the common arguments are collected once.
_shared_decoder_args = (
    tokenizer_en,
    tokenizer_vi,
    config['max_length_src'],
    config['max_length_trg'],
)

bilstm_decoder = BeamSearchDecoder(bilstm_model, *_shared_decoder_args)
lstm_decoder = BeamSearchDecoder(lstm_model, *_shared_decoder_args)

# Scorer used by the evaluation cells to compare translations to references.
bleu_scorer = BLEUScore()

## **3. Test Translations**

In [None]:
# A handful of English prompts for a quick side-by-side look at what each
# decoder produces.
test_sentences = [
    "Hello, how are you?",
    "I love machine learning.",
    "The weather is beautiful today.",
    "Can you help me with this problem?",
    "Thank you for your time."
]

# Banner: rule, title, rule — one per line.
print("="*80, "TRANSLATION TESTS", "="*80, sep="\n")

for text in test_sentences:
    print(f"\nInput: {text}")

    # BiLSTM output first ...
    bilstm_trans = bilstm_decoder.translate(text)
    print(f"BiLSTM: {bilstm_trans}")

    # ... then the LSTM output, closed by a divider line.
    lstm_trans = lstm_decoder.translate(text)
    print(f"LSTM: {lstm_trans}", "-"*80, sep="\n")

## **4. BLEU Score Evaluation**

In [None]:
# Load the parallel English/Vietnamese corpus through the project's
# preprocessor, configured with the vocabulary sizes and sequence-length
# limits from the config.
#
# NOTE: the config keys inside the f-strings use single quotes on purpose.
# Reusing the enclosing double quote inside a replacement field is only
# accepted from Python 3.12 onwards and is a SyntaxError on older versions.
preprocessor = DataPreprocessor(
    max_vocab_src=config['max_vocab_size_src'],
    max_vocab_trg=config['max_vocab_size_trg']
)

df = preprocessor.load_data(
    src_path=f"{config['data_path']}/raw/en.txt",
    trg_path=f"{config['data_path']}/raw/vi.txt",
    max_length_src=config['max_length_src'],
    max_length_trg=config['max_length_trg']
)

In [None]:
# Get test set: split_data returns three parts and only the last one is kept
# here (the first two are presumably the train/validation splits — confirm
# against DataPreprocessor.split_data).
_, _, test_df = preprocessor.split_data(df)

In [None]:
# Evaluate on a random subset of the test set (at most EVAL_SAMPLE_SIZE rows,
# for speed) and collect one BLEU score per sentence for each model.
EVAL_SAMPLE_SIZE = 100
# Fixed seed so that re-running the notebook scores the same sentences and
# the reported BLEU numbers are reproducible.
EVAL_SEED = 42

test_sample = test_df.sample(
    n=min(EVAL_SAMPLE_SIZE, len(test_df)),
    random_state=EVAL_SEED,
)

bilstm_bleu_scores = []
lstm_bleu_scores = []

# Only the two text columns are needed, so iterate them directly instead of
# materializing a Series per row with iterrows().
for en_text, vi_raw in zip(test_sample['english'], test_sample['vietnamese']):
    # Drop the 'START ' / ' END' marker text from the reference.
    vi_ref = vi_raw.replace('START ', '').replace(' END', '')

    # BiLSTM
    bilstm_trans = bilstm_decoder.translate(en_text)
    bilstm_bleu_scores.append(bleu_scorer.compute(vi_ref, bilstm_trans))

    # LSTM
    lstm_trans = lstm_decoder.translate(en_text)
    lstm_bleu_scores.append(bleu_scorer.compute(vi_ref, lstm_trans))

In [None]:
# Report the mean BLEU of each model. The header shows the number of
# sentences actually scored, which is smaller than 100 whenever the test set
# itself has fewer than 100 rows.
n_scored = len(bilstm_bleu_scores)
bilstm_mean = np.mean(bilstm_bleu_scores)
lstm_mean = np.mean(lstm_bleu_scores)

print("\n" + "="*80)
print(f"BLEU SCORE EVALUATION ({n_scored} samples)")
print("="*80)
print(f"BiLSTM - Average BLEU: {bilstm_mean:.2f}")
print(f"LSTM - Average BLEU: {lstm_mean:.2f}")
# Signed difference: positive when BiLSTM scores higher on average.
print(f"Improvement: {bilstm_mean - lstm_mean:.2f} points")

## **5. Visualization**

In [None]:
# Overlay the per-sentence BLEU histograms of both models on a single axis,
# save the figure under the configured assets directory, then display it.
fig, ax = plt.subplots(figsize=(10, 6))

for scores, label in (
    (bilstm_bleu_scores, 'BiLSTM'),
    (lstm_bleu_scores, 'LSTM'),
):
    ax.hist(scores, bins=20, alpha=0.7, label=label, edgecolor='black')

ax.set(
    xlabel='BLEU Score',
    ylabel='Frequency',
    title='BLEU Score Distribution Comparison',
)
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.savefig(f'{config["assets_path"]}/bleu_comparison.png', dpi=300)
plt.show()

## **6. Summary Report**

In [None]:
# Print a per-model recap (BLEU mean/min/max and parameter count) followed by
# a conclusion naming whichever model has the higher average BLEU.
print("\n" + "="*80)
print("FINAL EVALUATION SUMMARY")
print("="*80)

for model_name, scores, model in (
    ("BiLSTM", bilstm_bleu_scores, bilstm_model),
    ("LSTM", lstm_bleu_scores, lstm_model),
):
    print(f"\n{model_name} Model:")
    print(f"  - Average BLEU: {np.mean(scores):.2f}")
    print(f"  - Min BLEU: {np.min(scores):.2f}")
    print(f"  - Max BLEU: {np.max(scores):.2f}")
    print(f"  - Parameters: {model.count_params():,}")

# Signed gap: positive when BiLSTM scores higher on average. The winner is
# picked from the sign so the sentence never claims BiLSTM is "better" by a
# negative amount.
bleu_gap = np.mean(bilstm_bleu_scores) - np.mean(lstm_bleu_scores)
better_model = "BiLSTM" if bleu_gap >= 0 else "LSTM"

print("\nConclusion:")
print(f"  {better_model} performs {abs(bleu_gap):.2f} BLEU points better")
print("="*80)