## Setup

In [1]:
import numpy as np

import einops

import tensorflow as tf
import tensorflow_text as tf_text

import random

In [2]:
CONFIG = {
    # Reproducibility
    "SEED": 42,  # Controls randomization for reproducible results
    
    # Data parameters
    "BATCH_SIZE": 64,  # Batch size for training and evaluation
    "MAX_VOCAB_SIZE": 5000,  # Maximum vocabulary size for text processors
    "TRAIN_SPLIT": 0.8,  # 80% training, 20% validation split
    
    # Model architecture
    "UNITS": 256,  # Dimension of embeddings and hidden RNN units
    
    # Training parameters
    "OPTIMIZER": "adam",  # Optimizer algorithm
    "EPOCHS": 20,  # Number of training epochs
    "STEPS_PER_EPOCH": 100,  # Number of steps per epoch
    "VALIDATION_STEPS": 20,  # Number of validation steps per epoch
    "EARLY_STOPPING_PATIENCE": 3,  # Number of epochs with no improvement before stopping
    
    # Translation/Testing parameters
    "MAX_LENGTH": 50,  # Maximum length of generated translations
    "TEMPERATURE": 0.0,  # Controls randomness in generation (0.0 = deterministic)
    "TEST_SAMPLE_COUNT": 5,  # Number of examples to display during testing
    "BLEU_SAMPLE_SIZE": 100  # Number of samples to evaluate for BLEU score
}

In [3]:
# Set seeds for all random number generators
random.seed(CONFIG["SEED"])
np.random.seed(CONFIG["SEED"])
tf.random.set_seed(CONFIG["SEED"])

print(f"Random seeds set to {CONFIG['SEED']} for reproducible results")

Random seeds set to 42 for reproducible results


## Prepare dataset

In [4]:
import pathlib
import os
import requests
import zipfile
import io

# URL for the dataset
url = "https://www.manythings.org/anki/ind-eng.zip"

# Define a directory to store the downloaded files
data_dir = pathlib.Path(os.path.join(os.getcwd(), 'dataset'))

# Create the directory if it doesn't exist
data_dir.mkdir(exist_ok=True)

# Path for the extracted text file
path_to_file = data_dir / 'ind.txt'

# Download and extract only if the file doesn't exist
if not path_to_file.exists():
    print(f"Downloading dataset from {url}...")
    
    # Download the zip file with headers that mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5'
    }
    response = requests.get(url, headers=headers)
        
    # Check if the download was successful
    if response.status_code == 200:
        # Extract the zip file
        print("Extracting files...")
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
            # Extract only the ind.txt file
            for file in zip_ref.namelist():
                if file.endswith('ind.txt'):
                    zip_ref.extract(file, data_dir)
                    # Move the file if it's in a subdirectory
                    extracted_path = data_dir / file
                    if str(extracted_path) != str(path_to_file):
                        os.rename(extracted_path, path_to_file)
                    break
        print("Dataset extracted successfully.")
    else:
        raise Exception(f"Failed to download the dataset. Status code: {response.status_code}")
else:
    print(f"Dataset file already exists at {path_to_file}")

# Verify the file exists
if not path_to_file.exists():
  raise FileNotFoundError(f"Dataset file not found at {path_to_file}")

Downloading dataset from https://www.manythings.org/anki/ind-eng.zip...
Extracting files...
Dataset extracted successfully.


In [40]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  # The format is: English<tab>Indonesian<tab>Attribution info
  # We only need the first two columns
  pairs = [line.split('\t')[:2] for line in lines]

  # English is the source language (context), Indonesian is the target
  context = np.array([source for source, target in pairs])
  target = np.array([target for source, target in pairs])

  return target, context

In [41]:
target_raw, context_raw = load_data(path_to_file)
print(context_raw[-1])

If a person has not had a chance to acquire his target language by the time he's an adult, he's unlikely to be able to reach native speaker level in that language.


In [42]:
print(target_raw[-1])

Jika seseorang tidak berkesempatan untuk menguasai bahasa yang diinginkannya ketika menginjak dewasa, maka kecil kemungkinan ia akan bisa mencapai tingkatan penutur asli dalam bahasa tersebut.


In [43]:
# Set buffer size dynamically and store in CONFIG
CONFIG["BUFFER_SIZE"] = len(context_raw)

is_train = np.random.uniform(size=(len(target_raw),)) < CONFIG["TRAIN_SPLIT"]

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
    .shuffle(CONFIG["BUFFER_SIZE"])
    .batch(CONFIG["BATCH_SIZE"]))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(CONFIG["BUFFER_SIZE"])
    .batch(CONFIG["BATCH_SIZE"]))

In [44]:
for example_context_strings, example_target_strings in train_raw.take(1):
  print(example_context_strings[:5])
  print()
  print(example_target_strings[:5])
  break

tf.Tensor(
[b"She's unconscious." b'I ran into an old friend.'
 b'In the near future, we will be able to put an end to AIDS.'
 b'Tom recommended that I apply for the job.'
 b'When did you learn to make pizza?'], shape=(5,), dtype=string)

tf.Tensor(
[b'Dia tidak sadar.' b'Aku berjumpa dengan kawan lamaku.'
 b'Dalam waktu dekat ini kita akan mampu mengobati AIDS'
 b'Tom merekomendasikan saya untuk melamar pekerjaan.'
 b'Kapan kamu belajar membuat pizza'], shape=(5,), dtype=string)


In [45]:
def tf_lower_and_split_punct(text):
  # Split accented characters.
  text = tf_text.normalize_utf8(text, 'NFKD')
  text = tf.strings.lower(text)
  # Keep space, a to z, and select punctuation.
  # This regex should work fine for Indonesian
  text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
  # Add spaces around punctuation.
  text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
  # Strip whitespace.
  text = tf.strings.strip(text)

  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text

In [46]:
context_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=CONFIG["MAX_VOCAB_SIZE"],
    ragged=True)

In [47]:
context_text_processor.adapt(train_raw.map(lambda context, target: context))

# Here are the first 10 words from the vocabulary:
context_text_processor.get_vocabulary()[:10]

2025-04-22 21:16:08.089489: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


['',
 '[UNK]',
 np.str_('[START]'),
 np.str_('[END]'),
 np.str_('.'),
 np.str_('i'),
 np.str_('tom'),
 np.str_('the'),
 np.str_('you'),
 np.str_('to')]

In [48]:
target_text_processor = tf.keras.layers.TextVectorization(
    standardize=tf_lower_and_split_punct,
    max_tokens=CONFIG["MAX_VOCAB_SIZE"],
    ragged=True)

target_text_processor.adapt(train_raw.map(lambda context, target: target))
target_text_processor.get_vocabulary()[:10]

['',
 '[UNK]',
 np.str_('[START]'),
 np.str_('[END]'),
 np.str_('.'),
 np.str_('tom'),
 np.str_('aku'),
 np.str_('?'),
 np.str_('tidak'),
 np.str_('yang')]

In [49]:
example_tokens = context_text_processor(example_context_strings)
example_tokens[:3, :]

<tf.RaggedTensor [[2, 236, 1510, 4, 3], [2, 5, 264, 209, 79, 159, 244, 4, 3],
 [2, 14, 7, 582, 1077, 16, 32, 60, 34, 338, 9, 237, 79, 807, 9, 4478, 4, 3]]>

In [50]:
def process_text(context, target):
  context = context_text_processor(context).to_tensor()
  target = target_text_processor(target)
  targ_in = target[:,:-1].to_tensor()
  targ_out = target[:,1:].to_tensor()
  return (context, targ_in), targ_out


train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

## Define Model

In [51]:
class ShapeChecker():
  def __init__(self):
    # Keep a cache of every axis-name seen
    self.shapes = {}

  def __call__(self, tensor, names, broadcast=False):
    if not tf.executing_eagerly():
      return

    parsed = einops.parse_shape(tensor, names)

    for name, new_dim in parsed.items():
      old_dim = self.shapes.get(name, None)
      
      if (broadcast and new_dim == 1):
        continue

      if old_dim is None:
        # If the axis name is new, add its length to the cache.
        self.shapes[name] = new_dim
        continue

      if new_dim != old_dim:
        raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                         f"    found: {new_dim}\n"
                         f"    expected: {old_dim}\n")

In [52]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, text_processor, units):
    super(Encoder, self).__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()
    self.units = units
    
    # The embedding layer converts tokens to vectors
    self.embedding = tf.keras.layers.Embedding(self.vocab_size, units,
                                               mask_zero=True)

    self.rnn = tf.keras.layers.SimpleRNN(units,
                        # Return the sequence and state
                        return_sequences=True,
                        recurrent_initializer='glorot_uniform')

  def call(self, x):
    shape_checker = ShapeChecker()
    shape_checker(x, 'batch s')

    # 2. The embedding layer looks up the embedding vector for each token.
    x = self.embedding(x)
    shape_checker(x, 'batch s units')

    # 3. The GRU processes the sequence of embeddings.
    x = self.rnn(x)
    shape_checker(x, 'batch s units')

    # 4. Returns the new sequence of embeddings.
    return x

  def convert_input(self, texts):
    texts = tf.convert_to_tensor(texts)
    if len(texts.shape) == 0:
      texts = tf.convert_to_tensor(texts)[tf.newaxis]
    context = self.text_processor(texts).to_tensor()
    context = self(context)
    return context

In [53]:
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
  print(ex_context_tok[0, :10].numpy()) 
  print()
  print(ex_tar_in[0, :10].numpy()) 
  print(ex_tar_out[0, :10].numpy()) 

[   2    5 1017  102 3452  210    4    3    0    0]

[   2    6  630   45  198 1218    4    0    0    0]
[   6  630   45  198 1218    4    3    0    0    0]


In [54]:
# Encode the input sequence.
encoder = Encoder(context_text_processor, CONFIG["UNITS"])
ex_context = encoder(ex_context_tok)

print(f'Context tokens, shape (batch, s): {ex_context_tok.shape}')
print(f'Encoder output, shape (batch, s, units): {ex_context.shape}')

Context tokens, shape (batch, s): (64, 19)
Encoder output, shape (batch, s, units): (64, 19, 256)


In [55]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, text_processor, units):
    super(Decoder, self).__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()
    self.word_to_id = tf.keras.layers.StringLookup(
        vocabulary=text_processor.get_vocabulary(),
        mask_token='', oov_token='[UNK]')
    self.id_to_word = tf.keras.layers.StringLookup(
        vocabulary=text_processor.get_vocabulary(),
        mask_token='', oov_token='[UNK]',
        invert=True)
    self.start_token = self.word_to_id('[START]')
    self.end_token = self.word_to_id('[END]')

    self.units = units

    # 1. The embedding layer converts token IDs to vectors
    self.embedding = tf.keras.layers.Embedding(self.vocab_size,
                                             units, mask_zero=True)

    # 2. The RNN keeps track of what's been generated so far.
    self.rnn = tf.keras.layers.SimpleRNN(units,
                             return_sequences=True,
                             return_state=True,
                             recurrent_initializer='glorot_uniform')

    # 3. This fully connected layer produces the logits for each output token.
    self.output_layer = tf.keras.layers.Dense(self.vocab_size)

  def call(self,
         context, x,
         state=None,
         return_state=False):  
    shape_checker = ShapeChecker()
    shape_checker(x, 'batch t')
    shape_checker(context, 'batch s units')

    # 1. Lookup the embeddings
    x = self.embedding(x)
    shape_checker(x, 'batch t units')

    # 2. Process the target sequence.
    x, state = self.rnn(x, initial_state=state)
    shape_checker(x, 'batch t units')

    # 3. Generate logit predictions for the next token.
    logits = self.output_layer(x)
    shape_checker(logits, 'batch t target_vocab_size')

    if return_state:
      return logits, state
    else:
      return logits
  
  def get_initial_state(self, context):
    batch_size = tf.shape(context)[0]
    start_tokens = tf.fill([batch_size, 1], self.start_token)
    done = tf.zeros([batch_size, 1], dtype=tf.bool)
    embedded = self.embedding(start_tokens)
    
    # Create initial state directly with the correct shape
    initial_state = tf.zeros([batch_size, self.units])
    
    return start_tokens, done, initial_state
  
  def tokens_to_text(self, tokens):
    words = self.id_to_word(tokens)
    result = tf.strings.reduce_join(words, axis=-1, separator=' ')
    result = tf.strings.regex_replace(result, '^ *\\[START\\] *', '')
    result = tf.strings.regex_replace(result, ' *\\[END\\] *$', '')
    # Add this line to fix punctuation spacing
    result = tf.strings.regex_replace(result, ' ([.?!,])', '\\1')
    return result
  
  def get_next_token(self, context, next_token, done, state, temperature = 0.0):
    logits, state = self(
      context, next_token,
      state = state,
      return_state=True) 
    
    if temperature == 0.0:
      next_token = tf.argmax(logits, axis=-1)
    else:
      logits = logits[:, -1, :]/temperature
      next_token = tf.random.categorical(logits, num_samples=1)

    # If a sequence produces an `end_token`, set it `done`
    done = done | (next_token == self.end_token)
    # Once a sequence is done it only produces 0-padding.
    next_token = tf.where(done, tf.constant(0, dtype=tf.int64), next_token)
    
    return next_token, done, state

In [56]:
decoder = Decoder(target_text_processor, CONFIG["UNITS"])

In [57]:
logits = decoder(ex_context, ex_tar_in)

print(f'encoder output shape: (batch, s, units) {ex_context.shape}')
print(f'input target tokens shape: (batch, t) {ex_tar_in.shape}')
print(f'logits shape shape: (batch, target_vocabulary_size) {logits.shape}')

encoder output shape: (batch, s, units) (64, 19, 256)
input target tokens shape: (batch, t) (64, 18)
logits shape shape: (batch, target_vocabulary_size) (64, 18, 5000)




In [58]:
# Setup the loop variables.
next_token, done, state = decoder.get_initial_state(ex_context)
tokens = []

for n in range(10):
  # Run one step.
  next_token, done, state = decoder.get_next_token(
      ex_context, next_token, done, state, temperature=1.0)
  # Add the token to the output.
  tokens.append(next_token)

# Stack all the tokens together.
tokens = tf.concat(tokens, axis=-1) # (batch, t)

# Convert the tokens back to a a string
result = decoder.tokens_to_text(tokens)
result[:3].numpy()

array([b'ikutilah jawabnya romantis saatnya letak hadir seusia hanya terkaya membuatkanmu',
       b'hatihati berbagai dituntaskan energi dipermalukan rhythm dididik surga skorpio tenaga',
       b'italia membelikanmu perubahan piringnya perhatiannya sekelasku depanmu mewawancarai merayakan kanada'],
      dtype=object)

In [59]:
class Translator(tf.keras.Model):
  def __init__(self, units,
               context_text_processor,
               target_text_processor):
    super().__init__()
    # Build the encoder and decoder
    encoder = Encoder(context_text_processor, units)
    decoder = Decoder(target_text_processor, units)

    self.encoder = encoder
    self.decoder = decoder

  def call(self, inputs):
    context, x = inputs
    context = self.encoder(context)
    logits = self.decoder(context, x)

    try:
      # Delete the keras mask, so keras doesn't scale the loss+accuracy. 
      del logits._keras_mask
    except AttributeError:
      pass

    return logits
  
  def translate(self,
              texts, *,
              max_length=CONFIG["MAX_LENGTH"],
              temperature=CONFIG["TEMPERATURE"]):
    # Process the input texts
    context = self.encoder.convert_input(texts)
    batch_size = tf.shape(texts)[0]

    # Setup the loop inputs
    tokens = []
    next_token, done, state = self.decoder.get_initial_state(context)

    for _ in range(max_length):
      # Generate the next token
      next_token, done, state = self.decoder.get_next_token(
          context, next_token, done, state, temperature)
          
      # Collect the generated tokens
      tokens.append(next_token)
      
      if tf.executing_eagerly() and tf.reduce_all(done):
        break

    # Stack the lists of tokens
    tokens = tf.concat(tokens, axis=-1)   # t*[(batch 1)] -> (batch, t)

    result = self.decoder.tokens_to_text(tokens)
    return result

In [60]:
model = Translator(CONFIG["UNITS"], context_text_processor, target_text_processor)

logits = model((ex_context_tok, ex_tar_in))

print(f'Context tokens, shape: (batch, s, units) {ex_context_tok.shape}')
print(f'Target tokens, shape: (batch, t) {ex_tar_in.shape}')
print(f'logits, shape: (batch, t, target_vocabulary_size) {logits.shape}')

Context tokens, shape: (batch, s, units) (64, 19)
Target tokens, shape: (batch, t) (64, 18)
logits, shape: (batch, t, target_vocabulary_size) (64, 18, 5000)




In [61]:
model.summary()

## Train Model

In [62]:
def masked_loss(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)

    # Mask off the losses on padding.
    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask

    # Return the total.
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

In [63]:
def masked_acc(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    
    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)
    
    return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [64]:
model.compile(optimizer=CONFIG["OPTIMIZER"],
              loss=masked_loss, 
              metrics=[masked_acc, masked_loss])

In [65]:
model.evaluate(val_ds, steps=20, return_dict=True)

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - loss: 8.5191 - masked_acc: 0.0056 - masked_loss: 8.5191


{'loss': 8.518777847290039,
 'masked_acc': 0.0020025845151394606,
 'masked_loss': 8.518777847290039}

In [66]:
history = model.fit(
    train_ds.repeat(), 
    epochs=CONFIG["EPOCHS"],
    steps_per_epoch=CONFIG["STEPS_PER_EPOCH"],
    validation_data=val_ds,
    validation_steps=CONFIG["VALIDATION_STEPS"],
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=CONFIG["EARLY_STOPPING_PATIENCE"])])

Epoch 1/20




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 152ms/step - loss: 6.4979 - masked_acc: 0.1765 - masked_loss: 6.4979 - val_loss: 4.7472 - val_masked_acc: 0.2859 - val_masked_loss: 4.7472
Epoch 2/20
[1m 48/100[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m7s[0m 153ms/step - loss: 4.9233 - masked_acc: 0.2859 - masked_loss: 4.9233

KeyboardInterrupt: 

## Evaluation

In [None]:
# Define constants
SEPARATOR_LINE = "=" * 50

# Create test pairs from the validation data
is_test = np.random.uniform(size=(len(target_raw),)) >= 0.8
test_context = context_raw[is_test]
test_target = target_raw[is_test]
test_pairs = list(zip(test_context, test_target))

# Get the English texts
test_eng_texts = [pair[0] for pair in test_pairs]

# Display both input and expected translation
print(f"Testing {CONFIG['TEST_SAMPLE_COUNT']} random samples from the test set:")
for _ in range(CONFIG["TEST_SAMPLE_COUNT"]):
    # Select a random English sentence
    input_sentence = random.choice(test_eng_texts)
    
    # Get the corresponding Indonesian translation from our model
    translation = model.translate([input_sentence])[0].numpy().decode()
    
    # Print results
    print(SEPARATOR_LINE)
    print(f"Input: {input_sentence}")
    print(f"Translation: {translation}")

print(SEPARATOR_LINE)

Testing 5 random samples from the test set:
Input: I am curious.
Translation: aku tidak tahu apa yang bisa terjadi. 
Input: I'm not an idiot.
Translation: aku tidak tahu apa yang bisa terjadi. 
Input: One of my teeth hurts.
Translation: aku tidak tahu apa yang bisa terjadi. 
Input: How can we change that?
Translation: aku tidak tahu apa yang bisa terjadi. 
Input: I wonder why Tom is staring at me.
Translation: aku tidak tahu apa yang bisa terjadi. 


In [None]:
# Create test dataset from the same split used earlier
test_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .batch(CONFIG["BATCH_SIZE"])
)

test_ds = test_raw.map(process_text, tf.data.AUTOTUNE)

# Evaluate the model on the test set
print("Evaluating model on test set...")
test_results = model.evaluate(test_ds, return_dict=True)

print(f"\nTest accuracy: {test_results['masked_acc']:.4f} ({test_results['masked_acc']*100:.2f}%)")
print(f"Test loss: {test_results['loss']:.4f}")

# Compare with validation metrics
print("\nComparison with validation metrics:")
val_results = model.evaluate(val_ds, steps=20, return_dict=True, verbose=0)
print(f"Validation accuracy: {val_results['masked_acc']:.4f} ({val_results['masked_acc']*100:.2f}%)")
print(f"Test accuracy:       {test_results['masked_acc']:.4f} ({test_results['masked_acc']*100:.2f}%)")

Evaluating model on test set...
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - loss: 3.6665 - masked_acc: 0.4044 - masked_loss: 3.6677

Test accuracy: 0.3510 (35.10%)
Test loss: 3.9590

Comparison with validation metrics:
Validation accuracy: 0.3395 (33.95%)
Test accuracy:       0.3510 (35.10%)


In [None]:
# BLEU evaluation cell
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_bleu(model, context_raw, target_raw, 
                 sample_size=CONFIG["BLEU_SAMPLE_SIZE"], 
                 verbose=True):
    """
    Evaluate BLEU score on a sample of test pairs
    
    Args:
        model: Trained Translator model
        context_raw: Array of English sentences
        target_raw: Array of Indonesian sentences
        sample_size: Number of examples to evaluate (None = all)
        verbose: Whether to print detailed results
    
    Returns:
        Average BLEU score
    """
    # Create test pairs from the validation data
    is_test = np.random.uniform(size=(len(target_raw),)) >= 0.8
    test_context = context_raw[is_test]
    test_target = target_raw[is_test]
    test_pairs = list(zip(test_context, test_target))
    
    # Create a sample if specified
    if sample_size and sample_size < len(test_pairs):
        sampled_pairs = random.sample(test_pairs, sample_size)
        if verbose:
            print(f"Evaluating on {sample_size} samples (out of {len(test_pairs)} total)")
    else:
        sampled_pairs = test_pairs
        if verbose:
            print(f"Evaluating on full test set ({len(test_pairs)} examples)")
    
    test_eng_texts = [pair[0].lower() for pair in sampled_pairs]
    test_ind_texts = [pair[1].lower() for pair in sampled_pairs]
    smoother = SmoothingFunction().method1
    
    # Display examples if verbose
    if verbose:
        print("\nShowing example translations:")
        for i in range(min(len(sampled_pairs), 5)):  # Show up to 5 examples
            # Use model.translate method for translation
            candidate = model.translate([test_eng_texts[i]])[0].numpy().decode()
            reference = test_ind_texts[i]
            
            # Clean up tokens for fair comparison
            reference_tokens = reference.strip().split()
            candidate_tokens = candidate.strip().split()
            
            print(f"Input: {test_eng_texts[i]}")
            print(f"Candidate: {candidate}")
            print(f"Reference: {reference}")
            
            if len(candidate_tokens) == 0:
                score = 0.0
                print(f"BLEU Score: {score:.4f} (empty translation)")
            else:
                # Choose weights based on reference sentence length
                if len(reference_tokens) < 4:
                    # Use BLEU-1 for short sentences (unigrams only)
                    weights = (1.0, 0.0, 0.0, 0.0)
                    metric_type = "BLEU-1 (short sentence)"
                else:
                    # Use standard BLEU-4 for longer sentences
                    weights = (0.25, 0.25, 0.25, 0.25)
                    metric_type = "BLEU-4"
                    
                score = sentence_bleu([reference_tokens], candidate_tokens,
                                    weights=weights,
                                    smoothing_function=smoother)
                print(f"BLEU Score: {score:.4f} using {metric_type}\n")
    
    # Calculate BLEU on entire sample
    print("Calculating score...")
    total_bleu = 0
    count = len(sampled_pairs)
    
    for i in range(count):
        progress = (i + 1) / count * 100
        if verbose:
            print(f"Progress: {progress:.1f}% ({i+1}/{count})", end="\r")
            
        # Use model.translate method for translation
        candidate = model.translate([test_eng_texts[i]])[0].numpy().decode()
        reference = test_ind_texts[i]
        
        # Clean up tokens for fair comparison
        reference_tokens = reference.strip().split()
        candidate_tokens = candidate.strip().split()
        
        if len(candidate_tokens) == 0:
            score = 0.0
        else:
            # Choose weights based on sentence length
            if len(reference_tokens) < 4:
                # Use BLEU-1 for short sentences (unigrams only)
                weights = (1.0, 0.0, 0.0, 0.0)
            else:
                # Use standard BLEU-4 for longer sentences
                weights = (0.25, 0.25, 0.25, 0.25)
                
            score = sentence_bleu([reference_tokens], candidate_tokens,
                                 weights=weights,
                                 smoothing_function=smoother)
        
        total_bleu += score
    
    avg_bleu = total_bleu / count
    print("\nCalculation complete!                ")
    print(f"Average BLEU score: {avg_bleu:.4f}")
    return avg_bleu

# Evaluate with a sample of test pairs
quick_bleu = evaluate_bleu(model, context_raw, target_raw, sample_size=CONFIG["BLEU_SAMPLE_SIZE"])

# For final evaluation with more samples
# full_bleu = evaluate_bleu(model, context_raw, target_raw, sample_size=500)

Evaluating on 100 samples (out of 2681 total)

Showing example translations:
Input: she asked me for an unreasonable sum of money.
Candidate: aku tidak tahu apa yang bisa terjadi. 
Reference: dia meminta uang kepadaku dalam jumlah yang tidak masuk akal.
BLEU Score: 0.0256 using BLEU-4

Input: do you agree?
Candidate: aku tidak tahu apa yang bisa terjadi. 
Reference: apakah anda setuju?
BLEU Score: 0.0000 using BLEU-1 (short sentence)

Input: this soup is really delicious, isn't it?
Candidate: aku tidak tahu apa yang bisa terjadi. 
Reference: sup ini benar-benar lezat, ya?
BLEU Score: 0.0000 using BLEU-4

Input: don't forget to knock.
Candidate: aku tidak tahu apa yang bisa terjadi. 
Reference: jangan lupa mengetuk pintu.
BLEU Score: 0.0000 using BLEU-4

Input: the surgeon who did tom's operation is very experienced and highly regarded.
Candidate: aku tidak tahu apa yang bisa terjadi. 
Reference: dokter bedah yang menangani operasi tom sangat berpengalaman dan dihormati.
BLEU Score: 0.0