In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("preetviradiya/english-hindi-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/preetviradiya/english-hindi-dataset?dataset_version_number=1...


100%|██████████| 13.9M/13.9M [00:00<00:00, 61.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/preetviradiya/english-hindi-dataset/versions/1


In [2]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.layers import Input,Embedding,Add,Dense,Attention,LayerNormalization,Multiply

BATCH_SIZE=8
EPOCHS=1

In [3]:
base="/root/.cache/kagglehub/datasets/preetviradiya/english-hindi-dataset/versions/1"

In [4]:
df=pd.read_csv(os.path.join(base,"Dataset_English_Hindi.csv"))

In [5]:
df.head()

Unnamed: 0,English,Hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.
3,Jump.,छलांग.
4,Hello!,नमस्ते।


In [6]:
df.dropna(inplace=True)

# Train Test split

In [7]:
from sklearn.model_selection import train_test_split

train_english,test_english,train_hindi,test_hindi=train_test_split(df['English'],df['Hindi'],test_size=0.1)
train_english,val_english,train_hindi,val_hindi=train_test_split(train_english,train_hindi,test_size=0.1)

In [8]:
train_english.shape,test_english.shape,val_english.shape

((105430,), (13017,), (11715,))

In [9]:
english_sentences_train=train_english.values
hindi_sentences_train=train_hindi.values

english_sentences_test=test_english.values
hindi_sentences_test=test_hindi.values

english_sentences_val=val_english.values
hindi_sentences_val=val_hindi.values

# Tokenizer

In [10]:
START="<start>"
END="<end>"

In [11]:
english_Tokenizer=Tokenizer(oov_token="<oov>")
english_Tokenizer.fit_on_texts(english_sentences_train)

hindi_Tokenizer=Tokenizer(oov_token="<oov>")
hindi_Tokenizer.fit_on_texts([START+" "+sentence+" "+END for sentence in hindi_sentences_train])

In [12]:
def convert_to_seq(source,target_sent_main,encoder_tok,decoder_tok):
  source_sent_main=[]
  target_input=[]
  target_output=[]

  for source_sent,target_sent in tqdm(zip(source,target_sent_main)):
    source_sent_seq=encoder_tok.texts_to_sequences([source_sent])[0]
    target_sent_seq=decoder_tok.texts_to_sequences([START+" "+target_sent+" "+END])[0]

    source_sent_main.append(source_sent_seq)
    target_input.append(target_sent_seq[:-1])
    target_output.append(target_sent_seq[1:])

  return source_sent_main,target_input,target_output

In [13]:
train_encoder_input,train_decoder_input,train_decoder_output=convert_to_seq(english_sentences_train,hindi_sentences_train,english_Tokenizer,hindi_Tokenizer)

105430it [00:04, 21938.00it/s]


In [14]:
val_encoder_input,val_decoder_input,val_decoder_output=convert_to_seq(english_sentences_val,hindi_sentences_val,english_Tokenizer,hindi_Tokenizer)
test_encoder_input,test_decoder_input,test_decoder_output=convert_to_seq(english_sentences_test,hindi_sentences_test,english_Tokenizer,hindi_Tokenizer)

11715it [00:00, 24373.60it/s]
13017it [00:00, 14897.27it/s]


# Padding

In [15]:
max_len_encoder=max([len(sent) for sent in train_encoder_input])
max_len_decoder=max([len(sent) for sent in train_decoder_input])

max_len_encoder,max_len_decoder

(370, 418)

In [16]:
train_enc=pad_sequences(train_encoder_input,maxlen=max_len_encoder,padding="post")
train_dec_in=pad_sequences(train_decoder_input,maxlen=max_len_decoder,padding="post")
train_dec_out=pad_sequences(train_decoder_output,maxlen=max_len_decoder,padding="post")

val_enc=pad_sequences(val_encoder_input,maxlen=max_len_encoder,padding="post")
val_dec_in=pad_sequences(val_decoder_input,maxlen=max_len_decoder,padding="post")
val_dec_out=pad_sequences(val_decoder_output,maxlen=max_len_decoder,padding="post")

test_enc=pad_sequences(test_encoder_input,maxlen=max_len_encoder,padding="post")
test_dec_in=pad_sequences(test_decoder_input,maxlen=max_len_decoder,padding="post")
test_dec_out=pad_sequences(test_decoder_output,maxlen=max_len_decoder,padding="post")

# Dataset

In [17]:
train_df=(tf.data.Dataset.from_tensor_slices(((train_enc,train_dec_in),train_dec_out)).shuffle(5000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE))

In [18]:
val_df=(tf.data.Dataset.from_tensor_slices(((val_enc,val_dec_in),val_dec_out)).shuffle(5000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE))

test_df=(tf.data.Dataset.from_tensor_slices(((test_enc,test_dec_in),test_dec_out)).shuffle(5000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE))

# Model

In [19]:
# length is the seq_len , depth is the embeding dim
def positional_encoding(length, depth):
    positions=np.arange(length)[:,np.newaxis]  # seq_len , 1

    depths=np.arange(depth)[np.newaxis , : ]  # 1, depth

    angle_rates=1/np.power(10000,(2 * depths // 2)) / depth
    angles=positions*angle_rates    # pos , depth
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])

    return tf.cast(angles,tf.float32)

In [20]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,vocab_size,d_model):
        super().__init__()
        self.d_model=d_model
        self.embedding=Embedding(vocab_size,d_model,mask_zero=True)
        # here pos_encoding are generated with length (seq_length) = 2048
        self.pos_encoding=positional_encoding(length=2048,depth=d_model)

    def compute_mask(self, *args, **kwargs):
       return self.embedding.compute_mask(*args, **kwargs)

    def call(self,x):
        # x is input its shape will be  batch_size,seq_len
        # print("yes")
        length=tf.shape(x)[1]
        # print(length)
        x=self.embedding(x)

        # print("yes")
        # now x will be of shape batch_size,seq_len,embed_dim

        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # print("yes")

        x=x+self.pos_encoding[tf.newaxis,:length,:]
        # print("yes")
        return x

In [21]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [22]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [23]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True
        )

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [24]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [25]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x


In [26]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [27]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.
    # print(x)

    # Add dropout.
    x = self.dropout(x)
    # print(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

In [28]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self,num_heads,d_model,dff,dropout_rate=0.1):
        super().__init__()

        self.cross_attention=CrossAttention(num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.masked_attention=CausalSelfAttention(num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.ff=FeedForward(d_model, dff, dropout_rate)

    def call(self,x,encoder_x):
        x=self.masked_attention(x)
        x=self.cross_attention(x,encoder_x)
        x=self.ff(x)
        return x

In [29]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,num_layers,vocab_size,num_heads,d_model,dff,dropout_rate=0.1):
        super().__init__()

        self.positional_embedding=PositionalEmbedding(vocab_size,d_model)

        self.decoder_layers=[DecoderLayer(num_heads=num_heads,d_model=d_model,dff=dff,dropout_rate=dropout_rate) for i in range(num_layers)]


        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self,x,encoder_x):
        pe=self.positional_embedding(x)

        pe=self.dropout(pe)

        for decoder_layer in self.decoder_layers:
            pe=decoder_layer(pe,encoder_x)

        return pe

In [30]:
class Transformer(tf.keras.Model):
    def __init__(self,input_vocab_size,target_vocab_size,num_layers,num_heads,d_model,dff,dropout_rate=0.1):
        super().__init__()

        self.encoder=Encoder(num_layers=num_layers,num_heads=num_heads,d_model=d_model,vocab_size=input_vocab_size,dff=dff,dropout_rate=dropout_rate)

        self.decoder=Decoder(num_layers=num_layers,vocab_size=target_vocab_size,num_heads=num_heads,d_model=d_model,dff=dff,dropout_rate=dropout_rate)

        self.output_layer=Dense(target_vocab_size,activation="softmax")

    def call(self,inputs):
        context,x=inputs

        context=self.encoder(context)
        x=self.decoder(x,context)

        out=self.output_layer(x)
        try:
        # Drop the keras mask, so it doesn't scale the losses/metrics.
        # b/250038731
            del out._keras_mask
        except AttributeError:
            pass

        return out

In [31]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [32]:
input_vocab_size=len(english_Tokenizer.word_index)+1
target_vocab_size=len(hindi_Tokenizer.word_index)+1

In [33]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    dropout_rate=dropout_rate)

  angle_rates=1/np.power(10000,(2 * depths // 2)) / depth
  angles=positions*angle_rates    # pos , depth
  angles[:, 0::2] = np.sin(angles[:, 0::2])
  angles[:, 1::2] = np.cos(angles[:, 1::2])


In [34]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self,d_model,warmup_steps=4000):
        super().__init__()

        self.d_model=tf.cast(d_model,tf.float32)
        self.warmup_steps=warmup_steps

    def __call__(self,step):
        step = tf.cast(step, dtype=tf.float32)
        lrate=tf.math.rsqrt(self.d_model)*tf.math.minimum(tf.math.sqrt(step),step*(self.warmup_steps ** -1.5))
        return lrate

In [35]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [40]:
def masked_loss(label,pred):
    mask=label!=0
    loss_object=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction=None)
    loss=loss_object(label,pred)

    mask=tf.cast(mask,dtype=loss.dtype)
    loss*=mask

    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

def masked_accuracy(label,pred):
    pred=tf.argmax(pred,axis=2)
    label=tf.cast(label,pred.dtype)
    mask=label!=0

    match=label==pred
    match=match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)

    return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [41]:
transformer.compile(optimizer=optimizer,loss=masked_loss,metrics=[masked_accuracy])

In [42]:
transformer.fit(train_df,epochs=EPOCHS,validation_data=val_df)

[1m 4117/13179[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m32:17[0m 214ms/step - loss: nan - masked_accuracy: 0.0000e+00

KeyboardInterrupt: 

In [45]:
def translate(sentence, transformer, encoder_tok, decoder_tok, max_len=200):
    # 1. Tokenize encoder input (Python-side)
    enc_seq = encoder_tok.texts_to_sequences([sentence])
    enc_seq = tf.ragged.constant(enc_seq).to_tensor()

    # 2. Decoder start/end ids
    start_id = decoder_tok.word_index["start"]
    end_id   = decoder_tok.word_index["end"]

    # 3. Initial decoder input: <start>
    dec_input = tf.constant([[start_id]], dtype=tf.int32)

    for _ in range(max_len):
        # Model forward pass
        preds = transformer([enc_seq, dec_input], training=False)

        # Last step logits → token id
        next_id = tf.argmax(preds[:, -1:, :], axis=-1, output_type=tf.int32)

        # Append to decoder input
        dec_input = tf.concat([dec_input, next_id], axis=1)

        # Stop if end token appears
        if next_id[0][0].numpy() == end_id:
            break

    # Remove the start token
    output_seq = dec_input.numpy()[0][1:]

    # Convert back to text
    text = decoder_tok.sequences_to_texts([output_seq])[0]
    return text

In [46]:
translate("My name is pankaj",transformer,english_Tokenizer,hindi_Tokenizer)

tf.Tensor([[ 95 164   8   1]], shape=(1, 4), dtype=int32)
tf.Tensor([[3]], shape=(1, 1), dtype=int32)




'<oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oov> <oo

In [47]:
english_Tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x7f7908b38890>

In [48]:
import json

def save_tokenizer(tok, path):
    with open(path, "w") as f:
        f.write(tok.to_json())

In [49]:
save_tokenizer(english_Tokenizer,"english_Tokenizer.json")
save_tokenizer(hindi_Tokenizer,"hindi_Tokenizer.json")

In [56]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

def load_tokenizer(path):
    with open(path) as f:
      return tokenizer_from_json(f.read())


In [57]:
encoder_tokenizer = load_tokenizer("english_Tokenizer.json")
decoder_tokenizer = load_tokenizer("hindi_Tokenizer.json")