In [3]:
!pip install tensorflow




In [1]:
# IMPORTS
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# UPLOAD FILE (hi.translit.sampled.train.tsv)
from google.colab import files
uploaded = files.upload()
filename = list(uploaded.keys())[0]

# LOAD A SMALL SUBSET OF DATA
source_texts, target_texts = [], []
with open(filename, "r", encoding="utf-8") as f:
    for line in f.readlines()[:10000]:  # Only 10k rows for RAM safety
        parts = line.strip().split("\t")
        if len(parts) >= 2:
            source = parts[1].lower()
            target = '\t' + parts[0] + '\n'
            source_texts.append(source)
            target_texts.append(target)

# BUILD CHAR INDEX
input_chars = sorted(set("".join(source_texts)))
target_chars = sorted(set("".join(target_texts)))
input_token_index = {char: i + 1 for i, char in enumerate(input_chars)}  # +1 for padding
target_token_index = {char: i + 1 for i, char in enumerate(target_chars)}
reverse_target_index = {i: char for char, i in target_token_index.items()}

max_encoder_seq_length = max(len(txt) for txt in source_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

# INTEGER ENCODING + PADDING
def encode(texts, token_index, max_len):
    return pad_sequences([[token_index.get(c, 0) for c in text] for text in texts], maxlen=max_len, padding='post')

encoder_input_data = encode(source_texts, input_token_index, max_encoder_seq_length)
decoder_input_data = encode(target_texts, target_token_index, max_decoder_seq_length)

# Decoder target (shifted left by 1)
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

#  TRAIN/VAL SPLIT
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    encoder_input_data, decoder_input_data, decoder_target_data, test_size=0.2)

# MODEL PARAMETERS
vocab_in = len(input_token_index) + 1
vocab_out = len(target_token_index) + 1
embed_dim = 64
hidden_units = 128

# MODEL BUILDER
def build_seq2seq(cell='LSTM'):
    # Encoder
    encoder_inputs = Input(shape=(None,))
    x = Embedding(vocab_in, embed_dim)(encoder_inputs)
    if cell == 'GRU':
        _, state = GRU(hidden_units, return_state=True)(x)
        encoder_states = [state]
    elif cell == 'RNN':
        _, state = SimpleRNN(hidden_units, return_state=True)(x)
        encoder_states = [state]
    else:  # LSTM
        _, state_h, state_c = LSTM(hidden_units, return_state=True)(x)
        encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(None,))
    y = Embedding(vocab_out, embed_dim)(decoder_inputs)
    if cell == 'GRU':
        y = GRU(hidden_units, return_sequences=True)(y, initial_state=encoder_states)
    elif cell == 'RNN':
        y = SimpleRNN(hidden_units, return_sequences=True)(y, initial_state=encoder_states)
    else:
        y = LSTM(hidden_units, return_sequences=True)(y, initial_state=encoder_states)

    decoder_dense = Dense(vocab_out, activation='softmax')
    decoder_outputs = decoder_dense(y)

    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

# COMPILE & TRAIN
model = build_seq2seq('LSTM')
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit([X1_train, X2_train], np.expand_dims(y_train, -1),
                    batch_size=64, epochs=10, validation_data=([X1_val, X2_val], np.expand_dims(y_val, -1)))

# PREDICT FUNCTION
def transliterate(text):
    seq = encode([text.lower()], input_token_index, max_encoder_seq_length)
    decoder_input = np.zeros((1, max_decoder_seq_length), dtype=int)
    decoder_input[0, 0] = target_token_index['\t']

    result = ''
    for t in range(1, max_decoder_seq_length):
        preds = model.predict([seq, decoder_input], verbose=0)
        sampled_token_index = np.argmax(preds[0, t - 1, :])
        if sampled_token_index == 0:
            break
        sampled_char = reverse_target_index.get(sampled_token_index, '')
        if sampled_char == '\n':
            break
        result += sampled_char
        decoder_input[0, t] = sampled_token_index
    return result

# sample outputs
test_words= ["jabki", "yah", "jainon", "se", "se"]
for word in test_words:
    print(f"{word} → {transliterate(word)}")


Saving hi.romanized.rejoined.aligned.cased_nopunct.tsv to hi.romanized.rejoined.aligned.cased_nopunct.tsv
Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 120ms/step - accuracy: 0.7716 - loss: 1.7488 - val_accuracy: 0.8434 - val_loss: 0.6679
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 110ms/step - accuracy: 0.8434 - loss: 0.6553 - val_accuracy: 0.8495 - val_loss: 0.6130
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 116ms/step - accuracy: 0.8475 - loss: 0.6169 - val_accuracy: 0.8556 - val_loss: 0.5617
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 123ms/step - accuracy: 0.8585 - loss: 0.5532 - val_accuracy: 0.8678 - val_loss: 0.5080
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 113ms/step - accuracy: 0.8669 - loss: 0.5070 - val_accuracy: 0.8728 - val_loss: 0.4714
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1