In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Embedding, Dropout, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import re
import os


In [4]:
# --- Hugging Face Dataset Import ---
from datasets import load_dataset

# --- 1. Data Acquisition and Preprocessing ---

# Recommended Lightweight Dataset: 'small_parallel_en-fr' from Hugging Face
# This dataset is specifically designed for English-French translation.
# We'll load a small portion of the training set to keep it manageable for a "simple" LLM.
try:
    print("Loading 'sethjsa/medline_en_fr_parallel_doc' dataset from Hugging Face...")
    # Load the full DatasetDict first
    # This will return {'train': Dataset object}
    # dataset = load_dataset('sethjsa/wmt_en_fr_parallel', split='train[:5000]') 
    dataset_dict = load_dataset("sethjsa/medline_en_fr_parallel_doc")
    # Select the 'train' split and then slice it.
    train_split_size = len(dataset_dict['train'])
    num_examples_to_load = min(800, train_split_size) # Load up to 100
    dataset = dataset_dict['train'].select(range(num_examples_to_load))
    print(f"Loading {num_examples_to_load} examples from the dataset.")
    
    

     # --- DEBUGGING START ---
    print("\n--- Inspecting first example structure from Hugging Face dataset ---")
    if len(dataset) > 0:
        first_example = dataset[0]
        print(f"Type of first_example: {type(first_example)}")
        print(f"Keys of first_example: {first_example.keys()}")
        
        if 'translation' in first_example:
            print(f"Type of first_example['translation']: {type(first_example['translation'])}")
            if isinstance(first_example['translation'], dict):
                print(f"Keys of first_example['translation']: {first_example['translation'].keys()}")
                print(f"Sample English: {first_example['translation']['en']}")
                print(f"Sample French: {first_example['translation']['fr']}")
            else:
                print(f"first_example['translation'] is NOT a dict. Its value: {first_example['translation']}")
        else:
            print("'translation' key NOT found in first_example.")
    else:
        print("Dataset is empty.")
    print("--- End Inspection ---")
    # --- DEBUGGING END ---
    
    raw_en_sentences = [example['en'] for example in dataset]
    raw_fr_sentences = [example['fr'] for example in dataset]
    print(f"Loaded {len(raw_en_sentences)} parallel sentences.")
    
except Exception as e:
    print(f"Error loading dataset from Hugging Face: {e}")
    print("Falling back to a very small illustrative dataset.")
    # Fallback to a tiny manual dataset if Hugging Face dataset loading fails
    raw_en_sentences = [
        "I am a student.",
        "How are you?",
        "She likes cats.",
        "We are learning machine translation.",
        "The dog is brown.",
        "He reads books.",
        "Where is the library?",
        "What is your name?",
        "This is a test sentence.",
        "Please translate this.",
        "Hello world.",
        "I love deep learning."
    ]
    raw_fr_sentences = [
        "Je suis étudiant.",
        "Comment allez-vous ?",
        "Elle aime les chats.",
        "Nous apprenons la traduction automatique.",
        "Le chien est marron.",
        "Il lit des livres.",
        "Où est la bibliothèque ?",
        "Quel est votre nom ?",
        "Ceci est une phrase test.",
        "Veuillez traduire ceci.",
        "Bonjour le monde.",
        "J'adore l'apprentissage profond."
    ]


Loading 'sethjsa/medline_en_fr_parallel_doc' dataset from Hugging Face...


README.md:   0%|          | 0.00/302 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.30M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/800 [00:00<?, ? examples/s]

Loading 800 examples from the dataset.

--- Inspecting first example structure from Hugging Face dataset ---
Type of first_example: <class 'dict'>
Keys of first_example: dict_keys(['fr', 'en'])
'translation' key NOT found in first_example.
--- End Inspection ---
Loaded 800 parallel sentences.


In [5]:

# Add special tokens
SOS_TOKEN = "<sos>" # Start of Sequence
EOS_TOKEN = "<eos>" # End of Sequence
UNK_TOKEN = "<unk>" # Unknown Token
PAD_TOKEN = "<pad>" # Padding Token

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    # Add space before punctuation (adjust as needed for specific languages/punctuation)
    sentence = re.sub(r'([.!?])', r' \1', sentence) 
    # Remove characters that are not letters, punctuation, or spaces
    sentence = re.sub(r'[^a-zA-Z0-9.!?\s]', '', sentence) # Allow numbers for more general text
    sentence = sentence.strip()
    return sentence

# Preprocess raw sentences
en_sentences = [preprocess_sentence(s) for s in raw_en_sentences]
fr_sentences = [preprocess_sentence(s) for s in raw_fr_sentences]

# Add SOS and EOS tokens to French sentences for decoder input/target
fr_sentences_in = [f"{SOS_TOKEN} {s}" for s in fr_sentences]
fr_sentences_out = [f"{s} {EOS_TOKEN}" for s in fr_sentences]

print("\nExample processed sentences:")
print("English:", en_sentences[0])
print("French input:", fr_sentences_in[0])
print("French output:", fr_sentences_out[0])




Example processed sentences:
English: pmid 20847962  tl sedlak a pu e aymong m gao n khan h quan kh humphries sex differences in coronary catheterization and revascularization following acute myocardial infarction time trends from 1994 to 2003 in british columbia .  background  studies before the turn of the century reported sex differences in procedure rates . it is unknown whether these differences persist . objectives  to examine time trends and sex differences in coronary catheterization and revascularization following acute myocardial infarction ami . methods  a retrospective analysis was performed of all patients 20 years of age or older who were admitted to hospital in british columbia with an ami between april 1 1994 and march 31 2003 . segmented regression analysis was used to examine the inflection point of the time trend in 90day catheterization rates postami . multivariable cox regression modelling was used to evaluate sex differences in receiving catheterization and revas

In [6]:
# Tokenization
# Keras Tokenizer by default filters punctuation, which we've handled,
# so we pass an empty filter string to keep our preprocessed tokens.
# We also ensure UNK_TOKEN is handled.
en_tokenizer = Tokenizer(filters='', oov_token=UNK_TOKEN)
fr_tokenizer = Tokenizer(filters='', oov_token=UNK_TOKEN)

# Fit on processed sentences including special tokens
en_tokenizer.fit_on_texts(en_sentences)
# Fit French tokenizer on both input and output sequences to ensure comprehensive vocabulary
fr_tokenizer.fit_on_texts(fr_sentences_in + fr_sentences_out) 

# Ensure PAD_TOKEN is in the vocabulary and assigned ID 0 for easier masking
# Adjust existing indices if 0 is already taken
if PAD_TOKEN not in en_tokenizer.word_index:
    en_tokenizer.word_index[PAD_TOKEN] = 0 # Assign ID 0 to PAD_TOKEN
    # Shift other indices if 0 was previously assigned to another word
    for word, idx in list(en_tokenizer.word_index.items()):
        if idx == 0 and word != PAD_TOKEN:
            en_tokenizer.word_index[word] = len(en_tokenizer.word_index)
            break
if PAD_TOKEN not in fr_tokenizer.word_index:
    fr_tokenizer.word_index[PAD_TOKEN] = 0 # Assign ID 0 to PAD_TOKEN
    for word, idx in list(fr_tokenizer.word_index.items()):
        if idx == 0 and word != PAD_TOKEN:
            fr_tokenizer.word_index[word] = len(fr_tokenizer.word_index)
            break

# Rebuild index_word maps after potentially adjusting word_index
en_tokenizer.index_word = {idx: word for word, idx in en_tokenizer.word_index.items()}
fr_tokenizer.index_word = {idx: word for word, idx in fr_tokenizer.word_index.items()}


# Convert text to sequences of integers
encoder_input_sequences = en_tokenizer.texts_to_sequences(en_sentences)
decoder_input_sequences = fr_tokenizer.texts_to_sequences(fr_sentences_in)
decoder_target_sequences = fr_tokenizer.texts_to_sequences(fr_sentences_out)

# Calculate vocabulary sizes (including 0 for padding)
en_vocab_size = len(en_tokenizer.word_index)
fr_vocab_size = len(fr_tokenizer.word_index)

print(f"\nEnglish Vocab Size (including PAD): {en_vocab_size}")
print(f"French Vocab Size (including PAD): {fr_vocab_size}")

# Padding
# It's important for Transformer models to have fixed-length inputs for batching.
# We use 'post' padding, adding zeros at the end.
max_en_len = max(len(s) for s in encoder_input_sequences) if encoder_input_sequences else 1
max_fr_len = max(len(s) for s in decoder_input_sequences) if decoder_input_sequences else 1

encoder_input_data = pad_sequences(encoder_input_sequences, maxlen=max_en_len, padding='post', value=0)
decoder_input_data = pad_sequences(decoder_input_sequences, maxlen=max_fr_len, padding='post', value=0)
decoder_target_data = pad_sequences(decoder_target_sequences, maxlen=max_fr_len, padding='post', value=0)

print(f"Max English sequence length: {max_en_len}")
print(f"Max French sequence length: {max_fr_len}")
print(f"Encoder Input Shape: {encoder_input_data.shape}")
print(f"Decoder Input Shape: {decoder_input_data.shape}")
print(f"Decoder Target Shape: {decoder_target_data.shape}")





English Vocab Size (including PAD): 16980
French Vocab Size (including PAD): 19938
Max English sequence length: 615
Max French sequence length: 541
Encoder Input Shape: (800, 615)
Decoder Input Shape: (800, 541)
Decoder Target Shape: (800, 541)


In [7]:
# --- 2. Define the Transformer Architecture (Encoder-Decoder) ---

class PositionalEncoding(Layer):
    def __init__(self, position, d_model, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.position = position
        self.d_model = d_model
        # self.pos_encoding = self._get_angles(np.arange(position)[np.newaxis, :],
        #                                      np.arange(d_model)[np.newaxis, :])
        self.pos_encoding = self._get_angles(np.arange(position)[:, np.newaxis], # Make it a column vector
                                     np.arange(d_model)[np.newaxis, :])  # Make it a row vector
        self.pos_encoding = tf.cast(self.pos_encoding, dtype=tf.float32)

    def _get_angles(self, position, i):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(self.d_model))
        return position * angle_rates

    def call(self, inputs):
        seq_len = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:seq_len, :]

    def get_config(self):
        config = super().get_config()
        config.update({
            "position": self.position,
            "d_model": self.d_model,
        })
        return config


def create_padding_mask(seq):
    # This mask is for padding tokens (value 0). It will be 1 where there is padding.
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(seq_len):
    # This mask is to prevent a token from attending to future tokens in the sequence.
    # It will be 1 for future tokens.
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    return look_ahead_mask  # (seq_len, seq_len)

def create_masks(inp, tar):
    # Encoder padding mask (for self-attention in encoder)
    enc_padding_mask = create_padding_mask(inp)
    
    # Decoder look-ahead mask (for masked self-attention in decoder)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    # Decoder target padding mask (for masked self-attention in decoder, combines with look-ahead)
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask) # Combine to mask both future and padding
    
    # Decoder padding mask (for cross-attention in decoder, attends to encoder output)
    # This is the same as enc_padding_mask because it masks the *encoder output* (source sequence)
    # based on its padding.
    dec_padding_mask = create_padding_mask(inp) 
  
    return enc_padding_mask, combined_mask, dec_padding_mask


class EncoderLayer(Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(dff, activation='relu'),
            Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        
        # Store parameters for get_config
        self.d_model_param = d_model
        self.num_heads_param = num_heads
        self.dff_param = dff
        self.rate_param = rate

    def call(self, x, training, mask):
        # Self-attention
        attn_output = self.mha(x, x, x, attention_mask=mask) 
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        # Feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model_param,
            "num_heads": self.num_heads_param,
            "dff": self.dff_param,
            "rate": self.rate_param,
        })
        return config


class DecoderLayer(Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super(DecoderLayer, self).__init__(**kwargs)
        self.mha1 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads) # Masked self-attention
        self.mha2 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads) # Cross-attention
        self.ffn = tf.keras.Sequential([
            Dense(dff, activation='relu'),
            Dense(d_model)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        self.dropout3 = Dropout(rate)
        
        # Store parameters for get_config
        self.d_model_param = d_model
        self.num_heads_param = num_heads
        self.dff_param = dff
        self.rate_param = rate

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # Masked self-attention (attends to its own previous outputs)
        attn1 = self.mha1(x, x, x, attention_mask=look_ahead_mask) 
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)

        # Cross-attention (attends to encoder output)
        attn2 = self.mha2(out1, enc_output, enc_output, attention_mask=padding_mask) 
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)

        # Feed-forward network
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "d_model": self.d_model_param,
            "num_heads": self.num_heads_param,
            "dff": self.dff_param,
            "rate": self.rate_param,
        })
        return config


class Encoder(Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(maximum_position_encoding, d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = Dropout(rate)
        
        # Store parameters for get_config
        self.input_vocab_size = input_vocab_size
        self.maximum_position_encoding = maximum_position_encoding
        self.rate = rate

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32)) # Scale embeddings
        x = self.pos_encoding(x)
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)
        return x
    
    def get_config(self):
        config = super().get_config()
        config.update({
            "num_layers": self.num_layers,
            "d_model": self.d_model,
            "num_heads": self.enc_layers[0].num_heads_param, # Access from first layer's stored param
            "dff": self.enc_layers[0].dff_param,
            "input_vocab_size": self.input_vocab_size,
            "maximum_position_encoding": self.maximum_position_encoding,
            "rate": self.rate,
        })
        return config


class Decoder(Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(maximum_position_encoding, d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = Dropout(rate)
        
        # Store parameters for get_config
        self.target_vocab_size = target_vocab_size
        self.maximum_position_encoding = maximum_position_encoding
        self.rate = rate

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32)) # Scale embeddings
        x = self.pos_encoding(x)
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_output=enc_output, training=training, look_ahead_mask=look_ahead_mask, padding_mask=padding_mask)
        return x
        
    def get_config(self):
        config = super().get_config()
        config.update({
            "num_layers": self.num_layers,
            "d_model": self.d_model,
            "num_heads": self.dec_layers[0].num_heads_param,
            "dff": self.dec_layers[0].dff_param,
            "target_vocab_size": self.target_vocab_size,
            "maximum_position_encoding": self.maximum_position_encoding,
            "rate": self.rate,
        })
        return config

In [8]:
class Transformer(Model):
    def __init__(self, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size,
                 pe_input, pe_target, rate=0.1, **kwargs):
        super(Transformer, self).__init__(**kwargs)
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, rate)
        self.final_layer = Dense(target_vocab_size)

    def call(self, inputs, training=False):
        inp, tar = inputs # inp = encoder_input, tar = decoder_input
        enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp, tar)

        enc_output = self.encoder(inp, training=training, mask=enc_padding_mask)  
        dec_output = self.decoder(tar, enc_output, training=training, look_ahead_mask=look_ahead_mask, padding_mask=dec_padding_mask)
        
        final_output = self.final_layer(dec_output) 
        return final_output

    def get_config(self):
        config = super().get_config()
        config.update({
            "num_layers": self.encoder.num_layers, 
            "d_model": self.encoder.d_model,
            "num_heads": self.encoder.enc_layers[0].num_heads_param,
            "dff": self.encoder.enc_layers[0].dff_param,
            "input_vocab_size": self.encoder.input_vocab_size,
            "target_vocab_size": self.decoder.target_vocab_size,
            "pe_input": self.encoder.maximum_position_encoding,
            "pe_target": self.decoder.maximum_position_encoding,
            "rate": self.encoder.rate,
        })
        return config


# --- 3. Training Steps ---

# Hyperparameters (keep small for "simple" LLM)
num_layers = 2
d_model = 64 # Embedding dimension
num_heads = 4
dff = 128 # Hidden layer size in feed forward network
dropout_rate = 0.1

# Instantiate the Transformer model
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=en_vocab_size,
    target_vocab_size=fr_vocab_size,
    pe_input=max_en_len,
    pe_target=max_fr_len,
    rate=dropout_rate
)

# Custom learning rate schedule (often used with Transformers)
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # arg1 = tf.math.rsqrt(step)
        arg1 = tf.math.rsqrt(tf.cast(step, tf.float32))
        # arg2 = step * (self.warmup_steps ** -1.5)
        arg2 = tf.cast(step, tf.float32) * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        return {
            "d_model": self.d_model.numpy(),
            "warmup_steps": self.warmup_steps,
        }

learning_rate = CustomSchedule(d_model)
optimizer = Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9,clipnorm=1.0)

# Loss function: SparseCategoricalCrossentropy as targets are integer IDs
loss_object = SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    # Create a mask to ignore padding tokens (ID 0) from the loss calculation
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
    # Apply the mask to the loss
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    # Return the mean loss, considering only non-padded tokens
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

def accuracy_function(real, pred):
    # Calculate accuracy by comparing predicted token (argmax) with real token
    # accuracies = tf.cast(tf.math.equal(real, tf.argmax(pred, axis=2)), tf.float32)
    accuracies = tf.cast(tf.math.equal(real, tf.cast(tf.argmax(pred, axis=2), tf.int32)), tf.float32)
    # Create a mask to ignore padding tokens (ID 0) from accuracy calculation
    mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), tf.float32)
    accuracies *= mask
    return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)

transformer.compile(optimizer=optimizer, loss=loss_function, metrics=[accuracy_function])

# Training
epochs = 50 # Adjusted epochs for faster demo with larger dataset
batch_size = 16 # Batch size can be larger now due to more data

print("\n--- Starting Training ---")
transformer.fit(
    x=(encoder_input_data, decoder_input_data),
    y=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1
)
print("\n--- Training Complete ---")


# --- 4. Inference (Translation) ---

def evaluate(input_sentence):
    input_sentence = preprocess_sentence(input_sentence)
    
    # Prepare encoder input
    encoder_input = en_tokenizer.texts_to_sequences([input_sentence])
    # Pad to the max_en_len used during training
    encoder_input = pad_sequences(encoder_input, maxlen=max_en_len, padding='post', value=0)
    
    # Prepare decoder input (start with SOS token)
    decoder_input = tf.expand_dims([fr_tokenizer.word_index[SOS_TOKEN]], 0)
    
    output_tokens = []

    # Iterate up to max French length (or until EOS token is predicted)
    for i in range(max_fr_len): 
        predictions = transformer([encoder_input, decoder_input], training=False)
        
        # Select the last token's prediction (the one currently being generated)
        predictions = predictions[:, -1, :] 

        predicted_id = tf.argmax(predictions, axis=-1).numpy()[0]
        
        # Check if EOS token is predicted
        if predicted_id == fr_tokenizer.word_index[EOS_TOKEN]:
            break 

        # Append predicted ID to output tokens
        output_tokens.append(predicted_id)
        
        # Concatenate the predicted ID to the decoder input for the next step
        decoder_input = tf.concat([decoder_input, tf.expand_dims([predicted_id], 0)], axis=-1)

    # Convert predicted IDs back to words
    translated_sentence = []
    for token_id in output_tokens:
        word = fr_tokenizer.index_word.get(token_id, UNK_TOKEN)
        # Avoid adding PAD_TOKEN itself to the translated sentence
        if word != PAD_TOKEN: 
            translated_sentence.append(word)
    
    return " ".join(translated_sentence)

print("\n--- Testing Translation ---")

test_sentences = [
    "I am a student.",
    "How are you?",
    "She likes cats.",
    "We are learning machine translation.",
    "The dog is brown.",
    "He reads books.",
    "Where is the library?",
    "What is your name?",
    "This is a test sentence.",
    "Please translate this.",
    "Hello world.",
    "I love deep learning.",
    "The quick brown fox." # A sentence likely not in the training data
]

for sentence in test_sentences:
    translated = evaluate(sentence)
    print(f"English: '{sentence}'")
    print(f"French (Translated): '{translated}'\n")



--- Starting Training ---
Epoch 1/50


2025-06-15 11:48:20.795306: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2025-06-15 11:48:31.961391: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 74926336 exceeds 10% of free system memory.
2025-06-15 11:48:31.961482: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 96825600 exceeds 10% of free system memory.
2025-06-15 11:48:32.182884: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 96825600 exceeds 10% of free system memory.
2025-06-15 11:48:32.338631: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 85175040 exceeds 10% of free system memory.
2025-06-15 11:48:32.476400: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 74926336 exceeds 10% of free system memory.


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 3s/step - accuracy_function: 7.5377e-06 - loss: 9.8987
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 3s/step - accuracy_function: 0.0034 - loss: 9.8389
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 3s/step - accuracy_function: 0.0431 - loss: 9.6923
Epoch 4/50
[1m12/50[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m2:53[0m 5s/step - accuracy_function: 0.0486 - loss: 9.5157

KeyboardInterrupt: 

In [None]:
print("\n--- Testing Translation ---")

test_sentences = [
    "it is unknown whether these differences persist ."
]

for sentence in test_sentences:
    translated = evaluate(sentence)
    print(f"English: '{sentence}'")
    print(f"French (Translated): '{translated}'\n")



--- Testing Translation ---
English: 'it is unknown whether these differences persist .'
French (Translated): 'e e pmid pmid pmid 24082400 la da la deux la prise la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la 

In [None]:
from IPython.display import display, HTML
import ipywidgets as widgets

# Create UI components
input_box = widgets.Textarea(
    value="Type an English sentence here...",
    description="English:",
    layout=widgets.Layout(width='100%', height='100px')
)

output_box = widgets.Output(layout=widgets.Layout(width='100%', border='1px solid black'))

translate_button = widgets.Button(description="Translate", button_style='primary')

def on_translate_click(b):
    output_box.clear_output()
    user_input = input_box.value
    translation = evaluate(user_input)
    with output_box:
        print(f"Translated: {translation}")

translate_button.on_click(on_translate_click)

# --- Display the UI ---
display(HTML("<h3>English to French Translator</h3>"))
display(input_box)
display(translate_button)
display(HTML("<b>Translation:</b>"))
display(output_box)

Textarea(value='Type an English sentence here...', description='English:', layout=Layout(height='100px', width…

Button(button_style='primary', description='Translate', style=ButtonStyle())

Output(layout=Layout(border='1px solid black', width='100%'))