In [1]:
# ==============================================================================
# 0. SETUP AND IMPORTS
# ==============================================================================
import os
import time
import re
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Record the TensorFlow release in the cell output so results can be tied to a runtime.
print(f"TensorFlow version: {tf.__version__}")


# ==============================================================================
# 1. CONFIGURATION AND HYPERPARAMETERS
# ==============================================================================
# --- Data Configuration ---
# Location of the parallel corpus and the names of its two text columns.
CSV_PATH = "/kaggle/input/language-translation-englishfrench/eng_-french.csv"
EN_COL = "English words/sentences"
FR_COL = "French words/sentences"
NUM_SAMPLES = None # Set to an integer for testing, or None for full dataset

# --- Reproducibility ---
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout and dataset shuffling repeat across
# "Restart Kernel -> Run All" executions.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# --- Model Hyperparameters ---
MAX_LENGTH = 40       # token length every sequence is padded/truncated to; also the positional-encoding table size
BATCH_SIZE = 64       # sentence pairs per batch
BUFFER_SIZE = 20000   # shuffle-buffer size for the training dataset
EPOCHS = 20           # passes over the training set
num_layers = 4        # encoder layers and decoder layers (same count for both stacks)
d_model = 128         # embedding / model width
dff = 512             # hidden width of the position-wise feed-forward network
num_heads = 8         # attention heads; must divide d_model
dropout_rate = 0.1    # dropout rate used throughout the model

# ==============================================================================
# CHECKPOINT CLEANUP
# Remove any './checkpoints' directory left behind by an earlier run so this run
# cannot pick up stale model/optimizer state.
# NOTE(review): the checkpoint manager created in section 8 writes below this
# same directory, so after this cleanup its "restore latest checkpoint" branch
# has nothing to restore on a normal top-to-bottom run.
# ==============================================================================
if os.path.exists('./checkpoints'):
    shutil.rmtree('./checkpoints')
    print("Old checkpoints directory has been removed.")


# ==============================================================================
# 2. DATA LOADING AND PREPARATION
# ==============================================================================
# Fail fast, with an actionable message, when the dataset file is not present.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV file not found at {CSV_PATH}. Please update the CSV_PATH variable.")

print("Loading data...")
df = pd.read_csv(CSV_PATH)

# Optional truncation for quick experiments: keep only the first NUM_SAMPLES rows.
df = df.head(NUM_SAMPLES) if NUM_SAMPLES else df

# Keep the two text columns only and discard pairs where either side is missing.
df = df.loc[:, [EN_COL, FR_COL]].dropna().reset_index(drop=True)
print(f"Loaded {len(df)} sentence pairs.")


# ==============================================================================
# 3. TEXT PREPROCESSING
# ==============================================================================
def preprocess_sentence(sentence):
    """Normalise one sentence and wrap it in start/end markers.

    The input is converted to a lower-case string, the punctuation marks
    ``? . ! , ¿`` are separated from neighbouring words, every run of other
    characters outside ``a-z``, ``0-9`` and ``à-ž`` is collapsed into a single
    space, and the result is returned as ``"sos <text> eos"``.

    Args:
        sentence: Any object; it is passed through ``str()`` first.

    Returns:
        The cleaned sentence as a ``str``.
    """
    text = str(sentence).lower()
    # Surround each kept punctuation mark with spaces so it becomes its own token.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    # Collapse runs of spaces and double quotes into one space.
    text = re.sub(r'[" "]+', " ", text)
    # Anything that is not a letter, digit or kept punctuation mark becomes a space.
    text = re.sub(r"[^a-z0-9à-ž?.!,¿]+", " ", text, flags=re.IGNORECASE)
    return f"sos {text.strip()} eos"

print("Preprocessing sentences...")
# Clean both language columns with the same routine; the cleaned text goes into
# new columns so the raw text stays available.
df["eng_proc"] = df[EN_COL].astype(str).map(preprocess_sentence)
df["fre_proc"] = df[FR_COL].astype(str).map(preprocess_sentence)


# ==============================================================================
# 4. TOKENIZATION AND PADDING
# ==============================================================================
print("Tokenizing and padding sequences...")

# English side. filters='' keeps every character of the already-cleaned text;
# words not seen during fitting map to the '<unk>' token.
eng_tokenizer = Tokenizer(filters='', oov_token='<unk>')
eng_tokenizer.fit_on_texts(df["eng_proc"].tolist())
# +1 because id 0 is not in word_index; it is the padding value used below.
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_sequences = eng_tokenizer.texts_to_sequences(df["eng_proc"].tolist())

# French side, configured the same way.
fre_tokenizer = Tokenizer(filters='', oov_token='<unk>')
fre_tokenizer.fit_on_texts(df["fre_proc"].tolist())
fre_vocab_size = len(fre_tokenizer.word_index) + 1
fre_sequences = fre_tokenizer.texts_to_sequences(df["fre_proc"].tolist())

print("English vocab size:", eng_vocab_size)
print("French vocab size:", fre_vocab_size)

# Bring every sequence to exactly MAX_LENGTH ids: zeros are appended to short
# sequences and long ones lose their tail.
eng_padded = pad_sequences(
    eng_sequences,
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
)
fre_padded = pad_sequences(
    fre_sequences,
    maxlen=MAX_LENGTH,
    padding='post',
    truncating='post',
)


# ==============================================================================
# 5. CREATE TF.DATA DATASETS
# ==============================================================================
print("Creating training and validation datasets...")
# 80/20 split with a fixed random_state so the partition is repeatable.
eng_train, eng_val, fre_train, fre_val = train_test_split(
    eng_padded,
    fre_padded,
    test_size=0.2,
    random_state=42,
)
print("Total train samples:", len(eng_train))
print("Total validation samples:", len(eng_val))

# Training pairs are shuffled and then batched.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((eng_train, fre_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Validation pairs are only batched.
val_dataset = tf.data.Dataset.from_tensor_slices((eng_val, fre_val))
val_dataset = val_dataset.batch(BATCH_SIZE)


# ==============================================================================
# 6. TRANSFORMER MODEL DEFINITION
# ==============================================================================
print("Defining Transformer model architecture...")

def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Width of the encoding (columns).

    Returns:
        A float32 tensor of shape ``(1, position, d_model)``; even columns hold
        sines and odd columns hold cosines of the position-dependent angles.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    # Columns 2i and 2i+1 share the same wavelength, hence the integer division.
    divisors = np.power(10000, (2 * (dims // 2)) / np.float32(d_model))
    angle_rads = positions / divisors
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    # Leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

def create_padding_mask(seq):
    """Return a float32 mask that is 1.0 wherever ``seq`` holds the padding id 0.

    Two singleton axes are inserted after the first axis, so a
    ``(batch, seq_len)`` input yields a ``(batch, 1, 1, seq_len)`` mask.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """Return a ``(size, size)`` mask with 1.0 strictly above the diagonal.

    Entry ``[i, j]`` is 1.0 when ``j > i`` and 0.0 otherwise.
    """
    # band_part(-1, 0) keeps the diagonal and everything below it.
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle

def create_masks(inp, tar_inp):
    """Build the three attention masks used by the Transformer.

    Args:
        inp: Encoder input token ids.
        tar_inp: Decoder input token ids.

    Returns:
        ``(enc_padding_mask, combined_mask, dec_padding_mask)`` where
        ``combined_mask`` is the element-wise maximum of the target padding
        mask and the look-ahead mask, and the other two are padding masks
        computed from ``inp``.
    """
    # Decoder self-attention: block padded target positions and later positions.
    target_padding = create_padding_mask(tar_inp)
    future_positions = create_look_ahead_mask(tf.shape(tar_inp)[1])
    combined_mask = tf.maximum(target_padding, future_positions)

    # Both the encoder self-attention and the decoder's attention over the
    # encoder output hide padded source positions.
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    return enc_padding_mask, combined_mask, dec_padding_mask

class MultiHeadAttention(tf.keras.layers.Layer):
    """Scaled dot-product attention evaluated over ``num_heads`` heads in parallel.

    ``d_model`` must be divisible by ``num_heads``; each head works on
    ``d_model // num_heads`` features.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0
        self.depth = d_model // num_heads
        # Learned projections for queries, keys and values, plus the output projection.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape the last axis into (num_heads, depth) and move heads to axis 1."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            v, k, q: Value, key and query tensors (note the argument order).
            mask: Tensor broadcastable to the attention logits in which 1.0
                marks positions to hide, or ``None`` for no masking.

        Returns:
            ``(output, attention_weights)``: the projected attention output and
            the per-head softmax weights.
        """
        batch_size = tf.shape(q)[0]

        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        # Scale the logits by sqrt(depth of the keys).
        key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
        logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

        # A large negative offset drives masked positions to ~0 after softmax.
        if mask is not None:
            logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(logits, axis=-1)
        context = tf.matmul(attention_weights, v)

        # Undo split_heads: heads back next to depth, then merge into d_model.
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.d_model))
        return self.dense(merged), attention_weights

def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer MLP: Dense(dff, relu) followed by Dense(d_model)."""
    expand = tf.keras.layers.Dense(dff, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([expand, project])

class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, *, training, mask):
        """Run the block on ``x``; ``mask`` is forwarded to the self-attention."""
        # Sub-layer 1: self-attention (the attention weights are discarded).
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        after_attention = self.layernorm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.dropout2(self.ffn(after_attention), training=training)
        return self.layernorm2(after_attention + transformed)

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder
    output, then feed-forward; each sub-layer is followed by dropout, a
    residual connection and layer normalisation."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, *, enc_output, training, look_ahead_mask, padding_mask):
        """Run the block.

        Returns:
            ``(output, self_attention_weights, cross_attention_weights)``.
        """
        # Sub-layer 1: self-attention over the target sequence.
        self_attn, self_attn_weights = self.mha1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        after_self = self.layernorm1(self_attn + x)

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        cross_attn, cross_attn_weights = self.mha2(enc_output, enc_output, after_self, padding_mask)
        cross_attn = self.dropout2(cross_attn, training=training)
        after_cross = self.layernorm2(cross_attn + after_self)

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.dropout3(self.ffn(after_cross), training=training)
        result = self.layernorm3(transformed + after_cross)
        return result, self_attn_weights, cross_attn_weights

class Encoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding followed by a stack of EncoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, max_seq_len, rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        # Fixed table covering positions 0 .. max_seq_len - 1.
        self.pos_encoding = positional_encoding(max_seq_len, d_model)
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, *, training, mask):
        """Encode token ids ``x``; ``mask`` is passed to every encoder layer."""
        seq_len = tf.shape(x)[1]

        # Embeddings are scaled by sqrt(d_model) before the positions are added.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.embedding(x) * scale
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        for layer in self.enc_layers:
            x = layer(x, training=training, mask=mask)
        return x

class Decoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding followed by a stack of DecoderLayers."""

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, max_seq_len, rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        # Fixed table covering positions 0 .. max_seq_len - 1.
        self.pos_encoding = positional_encoding(max_seq_len, d_model)
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, *, enc_output, training, look_ahead_mask, padding_mask):
        """Decode target token ids ``x`` against ``enc_output``.

        Returns:
            ``(output, attention_weights)`` where ``attention_weights`` maps
            ``'decoder_layer{n}_block1'`` / ``'decoder_layer{n}_block2'``
            (n starting at 1) to each layer's self- and cross-attention weights.
        """
        seq_len = tf.shape(x)[1]

        # Embeddings are scaled by sqrt(d_model) before the positions are added.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.embedding(x) * scale
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        attention_weights = {}
        for i, layer in enumerate(self.dec_layers):
            x, self_weights, cross_weights = layer(
                x,
                enc_output=enc_output,
                training=training,
                look_ahead_mask=look_ahead_mask,
                padding_mask=padding_mask,
            )
            attention_weights[f'decoder_layer{i+1}_block1'] = self_weights
            attention_weights[f'decoder_layer{i+1}_block2'] = cross_weights
        return x, attention_weights

class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer that maps source token ids to target-vocabulary logits."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        # Projects decoder features to one logit per target-vocabulary entry.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training=False):
        """Run a forward pass.

        Args:
            inputs: ``(inp, tar_inp)`` pair of source ids and decoder-input ids.
            training: Forwarded to the encoder and decoder (controls dropout).

        Returns:
            ``(logits, attention_weights)``; the second item is the decoder's
            attention-weight dictionary.
        """
        inp, tar_inp = inputs
        # Masks are derived from the token ids themselves on every call.
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

        enc_output = self.encoder(inp, training=training, mask=enc_padding_mask)
        dec_output, attention_weights = self.decoder(
            x=tar_inp,
            enc_output=enc_output,
            training=training,
            look_ahead_mask=combined_mask,
            padding_mask=dec_padding_mask,
        )
        return self.final_layer(dec_output), attention_weights

# ==============================================================================
# 7. OPTIMIZER, LOSS, AND METRICS
# ==============================================================================
print("Setting up optimizer, loss, and metrics...")

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with warm-up followed by inverse-square-root decay.

    ``lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``

    The rate therefore grows linearly while ``step < warmup_steps`` and decays
    as ``1 / sqrt(step)`` afterwards.

    Args:
        d_model: Model width used to scale the rate.
        warmup_steps: Number of steps over which the rate ramps up.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        # Keep the value as passed in so get_config() can return it unchanged;
        # the float32 tensor below is what __call__ uses.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        # At step 0 arg1 is inf and arg2 is 0, so the minimum (and the rate) is 0.
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized.

        Without this override the base class raises ``NotImplementedError``
        whenever an optimizer that uses the schedule is serialized.
        """
        return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}

# Adam driven by the warm-up schedule; gradients are clipped to norm 1.0.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
    clipnorm=1.0,
)

# reduction='none' keeps one loss value per token so padding can be masked out later.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none',
)

def loss_function(real, pred):
    """Mean cross-entropy over the non-padding target positions.

    Args:
        real: Target token ids; id 0 marks padding.
        pred: Logits for each target position.

    Returns:
        Sum of the per-token losses at non-padding positions divided by the
        number of such positions.
    """
    per_token_loss = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)
    return tf.reduce_sum(per_token_loss * keep) / tf.reduce_sum(keep)

def masked_accuracy(real, pred):
    """Fraction of non-padding target positions whose arg-max prediction is correct.

    Args:
        real: Target token ids; id 0 marks padding.
        pred: Logits for each target position.
    """
    # The arg-max is produced in the dtype of `real` so the comparison is like-for-like.
    predicted_ids = tf.argmax(pred, axis=-1, output_type=real.dtype)
    keep = tf.cast(tf.not_equal(real, 0), tf.float32)
    correct = tf.cast(tf.equal(real, predicted_ids), tf.float32) * keep
    return tf.reduce_sum(correct) / tf.reduce_sum(keep)

# Running means of the per-batch loss and masked accuracy. train_step feeds one
# value per batch into each, and the training loop resets both at the start of
# every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

# ==============================================================================
# 8. INSTANTIATE MODEL AND SET UP CHECKPOINTS
# ==============================================================================
print("Instantiating the Transformer model...")
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=eng_vocab_size,
    target_vocab_size=fre_vocab_size,
    # Both positional tables cover exactly the padded sequence length.
    pe_input=MAX_LENGTH,
    pe_target=MAX_LENGTH,
    rate=dropout_rate,
)

# Model and optimizer state are checkpointed together; only the five most
# recent checkpoints are kept on disk.
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the newest checkpoint when one exists.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    print('Latest checkpoint restored!')

# ==============================================================================
# 9. TRAINING
# ==============================================================================
@tf.function
def train_step(inp, tar):
    """Run one optimisation step on a batch and update the running metrics.

    Args:
        inp: Batch of source token ids.
        tar: Batch of target token ids.
    """
    # Teacher forcing: the decoder sees the target without its last token and
    # is scored against the target shifted left by one.
    tar_inp, tar_real = tar[:, :-1], tar[:, 1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer((inp, tar_inp), training=True)
        loss = loss_function(tar_real, predictions)

    variables = transformer.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    train_loss(loss)
    train_accuracy(masked_accuracy(tar_real, predictions))

# Main training loop: EPOCHS passes over train_dataset.
# NOTE(review): val_dataset is never evaluated here, so only training metrics
# are reported.
print("\n--- Starting Training ---")
for epoch in range(EPOCHS):
    start = time.time()
    # Metrics are running means, so clear them to report per-epoch figures.
    train_loss.reset_state()
    train_accuracy.reset_state()

    for (batch, (inp, tar)) in enumerate(train_dataset):
        train_step(inp, tar)
        # Progress line every 200 batches; values are means since the epoch began.
        if batch % 200 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    # Checkpoint model + optimizer every fifth epoch.
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

print("--- Training Finished ---")



2025-09-25 18:59:26.138999: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758826766.360872      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758826766.428998      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


TensorFlow version: 2.18.0
Loading data...
Loaded 175621 sentence pairs.
Preprocessing sentences...
Tokenizing and padding sequences...
English vocab size: 14120
French vocab size: 24188
Creating training and validation datasets...
Total train samples: 140496
Total validation samples: 35125


I0000 00:00:1758826789.677392      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Defining Transformer model architecture...
Setting up optimizer, loss, and metrics...
Instantiating the Transformer model...

--- Starting Training ---
Epoch 1 Batch 0 Loss 10.0715 Accuracy 0.0000
Epoch 1 Batch 200 Loss 9.5905 Accuracy 0.0874
Epoch 1 Batch 400 Loss 8.5194 Accuracy 0.1270
Epoch 1 Batch 600 Loss 7.5519 Accuracy 0.1535
Epoch 1 Batch 800 Loss 6.8601 Accuracy 0.1876
Epoch 1 Batch 1000 Loss 6.3499 Accuracy 0.2173
Epoch 1 Batch 1200 Loss 5.9558 Accuracy 0.2428
Epoch 1 Batch 1400 Loss 5.6401 Accuracy 0.2643
Epoch 1 Batch 1600 Loss 5.3825 Accuracy 0.2828
Epoch 1 Batch 1800 Loss 5.1623 Accuracy 0.2992
Epoch 1 Batch 2000 Loss 4.9723 Accuracy 0.3137
Epoch 1 Loss 4.8059 Accuracy 0.3273
Time taken for 1 epoch: 298.17 secs

Epoch 2 Batch 0 Loss 2.8719 Accuracy 0.5017
Epoch 2 Batch 200 Loss 2.9074 Accuracy 0.4935
Epoch 2 Batch 400 Loss 2.8336 Accuracy 0.5043
Epoch 2 Batch 600 Loss 2.7608 Accuracy 0.5155
Epoch 2 Batch 800 Loss 2.6913 Accuracy 0.5271
Epoch 2 Batch 1000 Loss 2.6293 Accur

In [2]:
# ==============================================================================
# 10. EVALUATION AND TRANSLATION (CORRECTED)
# ==============================================================================

# Greedy decoding of a single sentence.
def evaluate(inp_sentence):
    """Translate one English sentence into French token ids by greedy decoding.

    The sentence is preprocessed, tokenised and padded like the training data.
    Starting from the 'sos' id, the most likely next id is appended one step at
    a time until 'eos' is produced or MAX_LENGTH steps have run.

    Args:
        inp_sentence: Raw English sentence.

    Returns:
        A 1-D int32 tensor of French token ids that starts with the 'sos' id.
    """
    inp_sentence_proc = preprocess_sentence(inp_sentence)
    source_ids = eng_tokenizer.texts_to_sequences([inp_sentence_proc])
    inp_tensor = pad_sequences(source_ids, maxlen=MAX_LENGTH, padding='post', truncating='post')

    start_id = fre_tokenizer.word_index['sos']
    end_id = fre_tokenizer.word_index['eos']
    output = tf.expand_dims([start_id], 0)

    for _step in range(MAX_LENGTH):
        predictions, _ = transformer((inp_tensor, output), training=False)

        # Only the logits of the last position matter for the next token.
        # output_type=int32 matches the dtype of `output` so the concat is valid.
        last_step_logits = predictions[:, -1:, :]
        predicted_id = tf.argmax(last_step_logits, axis=-1, output_type=tf.int32)
        output = tf.concat([output, predicted_id], axis=-1)

        if predicted_id[0][0] == end_id:
            break

    return tf.squeeze(output, axis=0)

# Translate a sentence and print the result.
def translate(sentence):
    """Translate ``sentence`` to French and print the input and the prediction.

    Args:
        sentence: Raw English sentence.

    Returns:
        None. The input and the predicted translation are printed.
    """
    result = evaluate(sentence)

    # Remove the start/end markers by token id rather than by substring:
    # replacing the text 'sos ' / ' eos' would also cut into ordinary words
    # that happen to end in "sos" or start with "eos".
    marker_ids = {fre_tokenizer.word_index['sos'], fre_tokenizer.word_index['eos']}
    token_ids = [int(token_id) for token_id in result.numpy() if int(token_id) not in marker_ids]

    predicted_sentence = fre_tokenizer.sequences_to_texts([token_ids])[0].strip()
    print(f'Input: {sentence}')
    print(f'Predicted translation: {predicted_sentence}')

# --- Test your model ---
# A few hand-picked sentences as a quick qualitative check of the trained model.
print("\n--- Testing Translations ---")
for sample_sentence in (
    "hello world",
    "this is a test",
    "I love to learn new things.",
    "She is a great writer.",
):
    translate(sample_sentence)



--- Testing Translations ---
Input: hello world
Predicted translation: bonjour .
Input: this is a test
Predicted translation: c est un contrôle .
Input: I love to learn new things.
Predicted translation: j adore apprendre de nouvelles choses .
Input: She is a great writer.
Predicted translation: elle est un bon écrivain .


In [3]:
import pickle
import os

# All artefacts are written to one folder below Kaggle's working directory.
SAVE_DIR = '/kaggle/working/my_translation_model_assets'
os.makedirs(SAVE_DIR, exist_ok=True)


# 1. Save the whole Transformer model. The filename carries the '.keras'
#    extension so it is written as a single native-Keras archive.
print("Saving the Transformer model...")
model_path = os.path.join(SAVE_DIR, 'transformer_model.keras')
transformer.save(model_path)
print(f"Model saved successfully to: {model_path}")


# 2. Save both tokenizers; they are needed to encode inputs and decode outputs
#    at inference time.
#    NOTE: these are pickle files. Only unpickle them from a trusted source.
print("Saving the tokenizers...")
tokenizer_eng_path = os.path.join(SAVE_DIR, 'eng_tokenizer.pkl')
tokenizer_fre_path = os.path.join(SAVE_DIR, 'fre_tokenizer.pkl')
for pickle_path, tokenizer_obj in (
    (tokenizer_eng_path, eng_tokenizer),
    (tokenizer_fre_path, fre_tokenizer),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(tokenizer_obj, f)

print(f"Tokenizers saved successfully to: {tokenizer_eng_path} and {tokenizer_fre_path}")
print("\nIMPORTANT: Your model is now saved. To make it permanent, click the 'Save Version' button and choose 'Save & Run All (Commit)'.")

Saving the Transformer model...
Model saved successfully to: /kaggle/working/my_translation_model_assets/transformer_model.keras
Saving the tokenizers...
Tokenizers saved successfully to: /kaggle/working/my_translation_model_assets/eng_tokenizer.pkl and /kaggle/working/my_translation_model_assets/fre_tokenizer.pkl

IMPORTANT: Your model is now saved. To make it permanent, click the 'Save Version' button and choose 'Save & Run All (Commit)'.
