In [1]:
!pip install "tensorflow>=2.12" transformers datasets sentencepiece sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, fsspec, sacrebleu
  Attempting uninstall: fsspec
    Found e

In [2]:
import os, random
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFXLMRobertaModel, PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, trainers, normalizers
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

2025-08-28 19:29:19.368985: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756409359.569844      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756409359.631069      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

max_source_length = 128
max_target_length = 128
target_vocab_size = 16000
hindi_bpe_json = "hindi_bpe.json"

d_model = 768
num_layers = 4
num_heads = 8
dff = 2048
dropout = 0.1

batch_size = 96
epochs = 1
warmup_steps = 4000
lr_factor = 1.0

In [4]:
print("loading dataset")
ds = load_dataset("cfilt/iitb-english-hindi")

english_train = [ex["translation"]["en"] for ex in ds["train"]]
hindi_train = [ex["translation"]["hi"] for ex in ds["train"]]
english_val = [ex["translation"]["en"] for ex in ds["validation"]]
hindi_val = [ex["translation"]["hi"] for ex in ds["validation"]]
english_test = [ex["translation"]["en"] for ex in ds["test"]]
hindi_test = [ex["translation"]["hi"] for ex in ds["test"]]

loading dataset


README.md: 0.00B [00:00, ?B/s]

dataset_infos.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [5]:
print("train/val/test:", len(english_train), len(english_val), len(english_test))

train/val/test: 1659083 520 2507


In [6]:
if not os.path.exists(hindi_bpe_json):
    print("training hindi bpe tokenizer")
    base_tok = Tokenizer(models.BPE(unk_token="[UNK]"))
    base_tok.normalizer = normalizers.Sequence([normalizers.NFKC()])
    base_tok.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=target_vocab_size, min_frequency=2, special_tokens=["[PAD]","[BOS]","[EOS]","[UNK]"], show_progress=True)
    def hindi_iter():
        for t in (hindi_train + hindi_val):
            yield t.strip()
    base_tok.train_from_iterator(hindi_iter(), trainer=trainer)
    base_tok.save(hindi_bpe_json)
else:
    print("using hindi bpe tokenizer")

training hindi bpe tokenizer





In [7]:
tok_hi = PreTrainedTokenizerFast(
    tokenizer_file=hindi_bpe_json,
    bos_token="[BOS]",
    eos_token="[EOS]",
    unk_token="[UNK]",
    pad_token="[PAD]",
)
pad_id = tok_hi.pad_token_id
bos_id = tok_hi.bos_token_id
eos_id = tok_hi.eos_token_id
unk_id = tok_hi.unk_token_id
vocab_size_hi = tok_hi.vocab_size
print("hindi vocab:", vocab_size_hi, "| ids:", {"pad":pad_id,"bos":bos_id,"eos":eos_id,"unk":unk_id})

print("loading xlm-r")
tok_xlmr = AutoTokenizer.from_pretrained("xlm-roberta-base", use_fast=True)
enc_xlmr = TFXLMRobertaModel.from_pretrained("xlm-roberta-base")
enc_xlmr.trainable = False

hindi vocab: 16000 | ids: {'pad': 0, 'bos': 1, 'eos': 2, 'unk': 3}
loading xlm-r


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

I0000 00:00:1756409491.267933      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your ta

In [8]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_len=10000, d_model=512):
        super().__init__()
        pos = np.arange(max_len)[:, None]
        i = np.arange(d_model)[None, :]
        rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        angles = pos * rates
        pe = np.zeros((max_len, d_model), dtype=np.float32)
        pe[:, 0::2] = np.sin(angles[:, 0::2])
        pe[:, 1::2] = np.cos(angles[:, 1::2])
        self.pe = tf.constant(pe)[None, ...]
    def call(self, x):
        return x + self.pe[:, :tf.shape(x)[1], :]

class TokenEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab, d_model):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(vocab, d_model)
        self.scale = tf.math.sqrt(tf.cast(d_model, tf.float32))
    def call(self, x):
        return self.emb(x) * self.scale

def look_ahead_mask(length: int):
    return tf.linalg.band_part(tf.ones((length, length), dtype=tf.int32), -1, 0)

In [9]:


class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, heads, dff, drop):
        super().__init__()
        self.self_mha  = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=d_model // heads)
        self.cross_mha = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=d_model // heads)
        self.ffn = tf.keras.Sequential([tf.keras.layers.Dense(dff, activation="relu"), tf.keras.layers.Dense(d_model)])
        self.n1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.n2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.n3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.d1 = tf.keras.layers.Dropout(drop)
        self.d2 = tf.keras.layers.Dropout(drop)
        self.d3 = tf.keras.layers.Dropout(drop)
    def call(self, y, enc_out, look_mask, cross_mask, training):
        a1 = self.self_mha(y, y, attention_mask=look_mask, training=training)
        y  = self.n1(y + self.d1(a1, training=training))
        a2 = self.cross_mha(y, enc_out, enc_out, attention_mask=cross_mask, training=training)
        y  = self.n2(y + self.d2(a2, training=training))
        f  = self.ffn(y)
        y  = self.n3(y + self.d3(f, training=training))
        return y

class TransformerDecoder(tf.keras.Model):
    def __init__(self, vocab, d_model, num_layers, heads, dff, drop, pad_id):
        super().__init__()
        self.pad_id = pad_id
        self.emb = TokenEmbedding(vocab, d_model)
        self.pos = PositionalEncoding(10000, d_model)
        self.decoder_layers = [DecoderLayer(d_model, heads, dff, drop) for _ in range(num_layers)]
        self.drop = tf.keras.layers.Dropout(drop)
        self.proj = tf.keras.layers.Dense(vocab)
    def call(self, dec_inp, enc_out, enc_mask_1d, training=False):
        tgt_valid = tf.cast(tf.not_equal(dec_inp, self.pad_id), tf.int32)
        la = look_ahead_mask(tf.shape(dec_inp)[1])
        look_mask = tf.minimum(tf.einsum('bi,bj->bij', tgt_valid, tf.ones_like(tgt_valid)), la)
        cross_mask = tf.einsum('bi,bj->bij', tgt_valid, enc_mask_1d)
        y = self.emb(dec_inp)
        y = self.pos(y)
        y = self.drop(y, training=training)
        for layer in self.decoder_layers:
            y = layer(y, enc_out, look_mask, cross_mask, training)
        return self.proj(y)

def encode_hi_ids(text: str):
    ids = tok_hi.encode(text, add_special_tokens=False)
    ids = [bos_id] + ids[: (max_target_length - 2)] + [eos_id]
    return np.array(ids, dtype=np.int32)

def tf_encode_hi(text):
    ids = tf.numpy_function(lambda s: encode_hi_ids(s.decode("utf-8")), [text], tf.int32)
    ids.set_shape([None])
    return ids

def tf_tokenize_en_batch(strings):
    def _batch(np_batch: np.ndarray):
        sents = [s.decode("utf-8") for s in np_batch]
        enc = tok_xlmr(sents, padding="max_length", truncation=True, max_length=max_source_length, return_tensors="np")
        return enc["input_ids"].astype(np.int32), enc["attention_mask"].astype(np.int32)
    input_ids, attention_mask = tf.numpy_function(_batch, [strings], [tf.int32, tf.int32])
    input_ids.set_shape([None, max_source_length])
    attention_mask.set_shape([None, max_source_length])
    return input_ids, attention_mask

def make_ds(src_texts, tgt_texts, shuffle):
    ds = tf.data.Dataset.from_tensor_slices((src_texts, tgt_texts))
    if shuffle:
        ds = ds.shuffle(100000, seed=42, reshuffle_each_iteration=True)
    def map_targets(src, tgt):
        tgt_ids = tf_encode_hi(tgt)
        dec_inp = tf.cast(tgt_ids[:-1], tf.int32)
        labels  = tf.cast(tgt_ids[1:],  tf.int32)
        return {"src_text": src, "dec_inp": dec_inp, "labels": labels}
    ds = ds.map(map_targets, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.padded_batch(
        batch_size,
        padded_shapes={"src_text": [], "dec_inp": [None], "labels": [None]},
        padding_values={"src_text": tf.constant("", dtype=tf.string), "dec_inp": tf.constant(pad_id, tf.int32), "labels": tf.constant(pad_id, tf.int32)},
        drop_remainder=False,
    )
    def tokenize_src(batch):
        input_ids, attention_mask = tf_tokenize_en_batch(batch["src_text"])
        x = {"input_ids": input_ids, "attention_mask": attention_mask, "dec_inp": batch["dec_inp"]}
        y = batch["labels"]
        return x, y
    return ds.map(tokenize_src, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

ds_train = make_ds(english_train, hindi_train, shuffle=True)
ds_val = make_ds(english_val, hindi_val, shuffle=False)
ds_test = make_ds(english_test, hindi_test, shuffle=False)


In [10]:
class EncDec(tf.keras.Model):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    def call(self, inputs, training=False):
        enc_out = self.encoder(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], training=False).last_hidden_state
        enc_mask_1d = tf.cast(inputs["attention_mask"], tf.int32)
        return self.decoder(inputs["dec_inp"], enc_out, enc_mask_1d, training=training)

decoder = TransformerDecoder(vocab=vocab_size_hi, d_model=d_model, num_layers=num_layers, heads=num_heads, dff=dff, drop=dropout, pad_id=pad_id)
model = EncDec(enc_xlmr, decoder)

In [11]:
class Noam(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup=4000, factor=1.0):
        super().__init__()
        self.d = tf.cast(d_model, tf.float32)
        self.w = warmup
        self.f = factor
    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        return (self.f * tf.math.rsqrt(self.d) * tf.math.minimum(tf.math.rsqrt(step), step * (self.w ** -1.5)))

def masked_loss(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    mask = tf.cast(tf.not_equal(y_true, pad_id), loss.dtype)
    return tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)

def masked_acc(y_true, y_pred):
    y_true = tf.cast(y_true, tf.int32)
    pred = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
    match = tf.cast(tf.equal(y_true, pred), tf.float32)
    mask = tf.cast(tf.not_equal(y_true, pad_id), tf.float32)
    return tf.reduce_sum(match * mask) / tf.reduce_sum(mask)

In [12]:

lr = Noam(d_model, warmup_steps, lr_factor)
opt = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

model.compile(optimizer=opt, loss=masked_loss, metrics=[masked_acc])

print("training")
history = model.fit(ds_train, validation_data=ds_val, epochs=epochs)


training


I0000 00:00:1756409540.653819     108 service.cc:148] XLA service 0x7811f2c78220 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1756409540.654292     108 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1756409540.722244     108 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1756409540.893806     108 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




In [13]:
print("testing")
print(model.evaluate(ds_test, return_dict=True))

@tf.function
def _encode_batch(input_ids, attention_mask):
    return enc_xlmr(input_ids=input_ids, attention_mask=attention_mask, training=False).last_hidden_state

testing
{'loss': 3.284698963165283, 'masked_acc': 0.4195406436920166}


In [14]:

def translate(en_sentence: str) -> str:
    enc = tok_xlmr([en_sentence], padding="max_length", truncation=True, max_length=max_source_length, return_tensors="tf")
    enc_out = _encode_batch(enc["input_ids"], enc["attention_mask"])
    enc_mask = tf.cast(enc["attention_mask"], tf.int32)
    out_ids = [bos_id]
    for _ in range(max_target_length):
        dec = tf.constant([out_ids], dtype=tf.int32)
        logits = decoder(dec, enc_out, enc_mask, training=False)
        next_id = int(tf.argmax(logits[:, -1, :], axis=-1).numpy()[0])
        if next_id == eos_id:
            break
        out_ids.append(next_id)
    return tok_hi.decode(out_ids[1:], skip_special_tokens=True)

for s in ["How are you?", "Please open the window.", "Machine translation is challenging.", "Where is the nearest hospital?"]:
    print("en:", s)
    print("hi:", translate(s))
    print("//////////////////////////////")


en: How are you?
hi: आप कैसे हैं ?
//////////////////////////////
en: Please open the window.
hi: कृपया खिड़की खोल दें ।
//////////////////////////////
en: Machine translation is challenging.
hi: अनुवाद में सुधार किया गया है ।
//////////////////////////////
en: Where is the nearest hospital?
hi: निकटतम अस्पताल कहां है ?
//////////////////////////////
