# **Model Training - LSTM & BiLSTM**

In [15]:
import sys
from pathlib import Path

# Make the project root (two levels above the notebook's working directory)
# importable so that `config` and the `src.*` packages resolve.
root_dir = str(Path.cwd().parent.parent.absolute())
# Guard the insert so re-running this cell does not add duplicate entries.
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)

In [16]:
import os
import matplotlib.pyplot as plt

from config import Config
from src.utils.gpu_utils import GPUMemoryManager
from src.data.preprocessing import DataPreprocessor
from src.models.bilstm_attention import BiLSTMAttentionModel
from src.models.lstm_attention import LSTMAttentionModel
from src.training.trainer import ModelTrainer
from src.utils.helpers import save_tokenizer

# Settings mapping shared by every cell below (read via `config[...]` / `config.get(...)`).
# Note: the BiLSTM training cell overwrites `config['total_steps']` in place.
config = Config.to_dict()

## **1. GPU Setup**

In [3]:
# Reset the session first, then apply the GPU memory settings taken from config.
GPUMemoryManager.clear_session()

# NOTE(review): the recorded output reports a 4096 MB device while the limit is
# 15000 MB, and allocation failures are logged when the model is built — confirm
# that `gpu_memory_limit` matches the target GPU.
gpu_limit_mb = config.get("gpu_memory_limit", 15000)
gpu_growth = config.get("gpu_memory_growth", True)
GPUMemoryManager.setup_gpu(memory_limit_mb=gpu_limit_mb, allow_growth=gpu_growth)

# Mixed precision is opt-out: it is enabled unless config explicitly disables it.
mixed_precision_requested = config.get("use_mixed_precision", True)
if mixed_precision_requested:
    GPUMemoryManager.enable_mixed_precision()

GPUMemoryManager.get_memory_info()

Session cleared
GPU memory growth enabled
GPU memory limited to 15000 MB
Found 1 GPU(s)
Mixed precision (FP16) enabled
Compute: float16, Variable: float32
GPU 0: 36 MB / 4096 MB (0.9%)


## **2. Configuration**

In [5]:
# Keys that are left out of the printed summary (directory / path settings).
skip_keys = ('log_dir', 'assets_path', 'model_save_path', 'tokenizer_path', 'checkpoint_path', 'data_path')
visible_settings = {name: setting for name, setting in config.items() if name not in skip_keys}

print("Configuration:")
for name, setting in visible_settings.items():
    print(f"   {name}: {setting}")

# Second section: whatever Config.estimate_memory() reports, one entry per line.
print("\nMemory Estimate:")
memory_estimate = Config.estimate_memory()
for name, amount in memory_estimate.items():
    print(f"   {name}: {amount}")

Configuration:
   batch_size: 256
   epochs: 100
   validation_split: 0.1
   learning_rate: 0.001
   beta_1: 0.9
   beta_2: 0.98
   epsilon: 1e-09
   use_lr_scheduler: True
   warmup_steps: 4000
   total_steps: 100000
   early_stopping_patience: 5
   reduce_lr_patience: 3
   reduce_lr_factor: 0.5
   min_lr: 1e-06
   save_best_only: True
   monitor: val_loss
   log_dir: /home/qctrung/Projects/nlp-projects/machine-translation/logs
   tensorboard_update_freq: epoch
   max_vocab_size_src: 25000
   max_vocab_size_trg: 20000
   min_word_frequency: 2
   embedding_dim: 64
   lstm_units: 128
   attention_heads: 2
   max_length_src: 40
   max_length_trg: 50
   use_mixed_precision: True
   gpu_memory_growth: True
   gpu_memory_limit: 15000
   label_smoothing: 0.1
   use_layer_norm: True
   beam_width: 5
   use_beam_search: True
   data_path: /home/qctrung/Projects/nlp-projects/machine-translation/data
   model_save_path: /home/qctrung/Projects/nlp-projects/machine-translation/models/saved_models


## **3. Data Preprocessing**

In [8]:
# Create the preprocessor with the vocabulary limits from config; its log goes to
# <log_dir>/data_preprocessing.log.
# The f-strings below use single quotes inside the replacement fields so they also
# parse on Python versions before 3.12 (and match the quoting used in later cells).
preprocessor = DataPreprocessor(
    max_vocab_src=config['max_vocab_size_src'],
    max_vocab_trg=config['max_vocab_size_trg'],
    min_frequency=config.get("min_word_frequency", 2),
    name_logger="data_preprocessing",
    filename_logger=f"{config['log_dir']}/data_preprocessing.log"
)

# Load the parallel English/Vietnamese text files, passing the configured
# maximum source/target lengths to the loader.
df = preprocessor.load_data(
    src_path=f"{config['data_path']}/raw/en.txt",
    trg_path=f"{config['data_path']}/raw/vi.txt",
    max_length_src=config['max_length_src'],
    max_length_trg=config['max_length_trg']
)

print(f"Dataset: {df.shape}")

2025-10-19 17:22:48 - data_preprocessing - INFO - Filtered: 0.9826271998248352 pairs kept
2025-10-19 17:22:48 - data_preprocessing - INFO - Memory save: 1.7%
Dataset: (143609, 2)


In [9]:
# Partition the loaded pairs into train / validation / test frames and report their sizes.
splits = preprocessor.split_data(df)
train_df, val_df, test_df = splits
print("Train: {}, Val: {}, Test: {}".format(len(train_df), len(val_df), len(test_df)))

Train: 114888, Val: 14360, Test: 14361


In [10]:
# Build the source (en) and target (vi) tokenizers from the training frame.
tokenizer_en, tokenizer_vi = preprocessor.build_tokenizers(train_df)

# Persist both tokenizers under the configured tokenizer directory.
tokenizer_dir = config["tokenizer_path"]
os.makedirs(tokenizer_dir, exist_ok=True)
for lang_code, tokenizer in (("en", tokenizer_en), ("vi", tokenizer_vi)):
    save_tokenizer(tokenizer, f"{tokenizer_dir}/tokenizer_{lang_code}.pkl")

2025-10-19 17:26:32 - data_preprocessing - INFO - Filtering rare words in source dataset...
2025-10-19 17:26:32 - data_preprocessing - INFO - Vocab reduced: 37536 → 23934
2025-10-19 17:26:32 - data_preprocessing - INFO - Reduction: 36.2%
2025-10-19 17:26:32 - data_preprocessing - INFO - Filtering rare words in target dataset...
2025-10-19 17:26:33 - data_preprocessing - INFO - Vocab reduced: 27105 → 14679
2025-10-19 17:26:33 - data_preprocessing - INFO - Reduction: 45.8%
Tokenizer saved to /home/qctrung/Projects/nlp-projects/machine-translation/models/tokenizers/tokenizer_en.pkl
Tokenizer saved to /home/qctrung/Projects/nlp-projects/machine-translation/models/tokenizers/tokenizer_vi.pkl


In [11]:
# Convert the train and validation frames into model inputs using the configured
# maximum source / target lengths; each call yields (source, target-in, target-out).
src_len, trg_len = config['max_length_src'], config['max_length_trg']

en_train, vi_in_train, vi_out_train = preprocessor.prepare_sequences(train_df, src_len, trg_len)
en_val, vi_in_val, vi_out_val = preprocessor.prepare_sequences(val_df, src_len, trg_len)

print(f"Training sequences: {en_train.shape}")

2025-10-19 17:27:45 - data_preprocessing - INFO - Sequences memory: 61.36 MB
2025-10-19 17:27:45 - data_preprocessing - INFO - Sequences memory: 7.67 MB
Training sequences: (114888, 40)


## **4. Build BiLSTM Model**

In [12]:
# Builder for the BiLSTM + attention model; it logs to <log_dir>/bilstm_attention.log.
bilstm_log_file = f"{config['log_dir']}/bilstm_attention.log"
model_builder = BiLSTMAttentionModel(
    config=config,
    name_logger="bilstm_attention",
    filename_logger=bilstm_log_file,
)

# Vocabulary sizes and sequence lengths come straight from config (the configured
# maxima, not the sizes of the tokenizers built above).
bilstm_dims = dict(
    vocab_size_src=config['max_vocab_size_src'],
    vocab_size_trg=config['max_vocab_size_trg'],
    max_len_src=config['max_length_src'],
    max_len_trg=config['max_length_trg'],
)
bilstm_model = model_builder.build(**bilstm_dims)

bilstm_model.summary()

I0000 00:00:1760869762.582917   15763 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15000 MB memory:  -> device: 0, name: NVIDIA GeForce MX230, pci bus id: 0000:01:00.0, compute capability: 6.1
I0000 00:00:1760869762.602794   15763 cuda_executor.cc:508] failed to allocate 14.65GiB (15728640000 bytes) from device: RESOURCE_EXHAUSTED: : CUDA_ERROR_OUT_OF_MEMORY: out of memory
I0000 00:00:1760869762.602976   15763 cuda_executor.cc:508] failed to allocate 13.18GiB (14155776000 bytes) from device: RESOURCE_EXHAUSTED: : CUDA_ERROR_OUT_OF_MEMORY: out of memory
I0000 00:00:1760869762.603185   15763 cuda_executor.cc:508] failed to allocate 11.87GiB (12740198400 bytes) from device: RESOURCE_EXHAUSTED: : CUDA_ERROR_OUT_OF_MEMORY: out of memory
I0000 00:00:1760869762.603377   15763 cuda_executor.cc:508] failed to allocate 10.68GiB (11466178560 bytes) from device: RESOURCE_EXHAUSTED: : CUDA_ERROR_OUT_OF_MEMORY: out of memory
I0000 00:00:1760869762.603525   157

## **5. Train BiLSTM**

In [None]:
# Replace the configured `total_steps` with a value derived from the actual data:
# whole batches per epoch (floor division) times the number of epochs.
# This mutates the shared `config` in place, so later cells see the new value.
steps_per_epoch = len(en_train) // config['batch_size']
config['total_steps'] = steps_per_epoch * config['epochs']

# Bundle the inputs as (source, target-in, target-out) triples for the trainer.
train_arrays = (en_train, vi_in_train, vi_out_train)
val_arrays = (en_val, vi_in_val, vi_out_val)

trainer = ModelTrainer(model=bilstm_model, config=config)
bilstm_history = trainer.train(train_data=train_arrays, val_data=val_arrays)

In [None]:
# Save the trained BiLSTM model, creating the target directory if needed.
# Single quotes inside the f-string keep it parseable on Python < 3.12.
os.makedirs(config["model_save_path"], exist_ok=True)
trainer.save_model(f"{config['model_save_path']}/bilstm_model.h5")

## **6. Build LSTM Model**

In [13]:
# Reset the session before building the second model.
GPUMemoryManager.clear_session()

# Builder for the LSTM + attention model; it logs to <log_dir>/lstm_attention.log.
lstm_log_file = f"{config['log_dir']}/lstm_attention.log"
lstm_builder = LSTMAttentionModel(
    config=config,
    name_logger="lstm_attention",
    filename_logger=lstm_log_file,
)

# Same vocabulary sizes and sequence lengths as the BiLSTM build, read from config.
lstm_dims = dict(
    vocab_size_src=config['max_vocab_size_src'],
    vocab_size_trg=config['max_vocab_size_trg'],
    max_len_src=config['max_length_src'],
    max_len_trg=config['max_length_trg'],
)
lstm_model = lstm_builder.build(**lstm_dims)

lstm_model.summary()

Session cleared


## **7. Train LSTM**

In [None]:
# Recompute `total_steps` here (same formula as the BiLSTM training cell) so this
# cell does not depend on that cell having been executed first.
config['total_steps'] = (len(en_train) // config['batch_size']) * config['epochs']

# Construct the trainer the same way as the BiLSTM one. The empty `logger_name=""`
# placeholder was dropped.
# NOTE(review): if ModelTrainer forwards the name to logging.getLogger, an empty
# string selects the root logger; pass an explicit name if a dedicated LSTM
# training log is wanted.
lstm_trainer = ModelTrainer(model=lstm_model, config=config)

# Train on the same (source, target-in, target-out) arrays used for the BiLSTM.
lstm_history = lstm_trainer.train(
    train_data=(en_train, vi_in_train, vi_out_train),
    val_data=(en_val, vi_in_val, vi_out_val)
)

In [None]:
# Save the trained LSTM model. The directory is created here as well so the cell
# works even when the BiLSTM save cell was skipped.
# Single quotes inside the f-string keep it parseable on Python < 3.12.
os.makedirs(config["model_save_path"], exist_ok=True)
lstm_trainer.save_model(f"{config['model_save_path']}/lstm_model.h5")

## **8. Compare Results**

In [None]:
# Side-by-side comparison of the two training runs: loss on the left panel,
# accuracy on the right. BiLSTM curves are solid, LSTM curves are dashed.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (legend prefix, Keras-style history object, line style)
runs = (
    ("BiLSTM", bilstm_history, "-"),
    ("LSTM", lstm_history, "--"),
)
# (axis, history key, human-readable metric name)
panels = (
    (axes[0], "loss", "Loss"),
    (axes[1], "accuracy", "Accuracy"),
)

for ax, metric, metric_title in panels:
    for model_name, run_history, line_style in runs:
        # Training curve first, then the matching `val_<metric>` curve.
        ax.plot(run_history.history[metric], label=f"{model_name} Train", linestyle=line_style)
        ax.plot(run_history.history[f"val_{metric}"], label=f"{model_name} Val", linestyle=line_style)
    ax.set_title(f"Model {metric_title} Comparison")
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# Create the assets directory before saving; savefig does not create missing folders.
os.makedirs(config["assets_path"], exist_ok=True)
plt.savefig(f"{config['assets_path']}/comparison.png", dpi=300)
plt.show()

In [None]:
# Report the validation loss of the last recorded epoch for each model
# (the last epoch, which is not necessarily the best one).
for model_name, run_history in (("BiLSTM", bilstm_history), ("LSTM", lstm_history)):
    final_val_loss = run_history.history['val_loss'][-1]
    print(f"{model_name} - Final Val Loss: {final_val_loss:.4f}")