In [3]:
import os
import random
import collections
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, TimeDistributed, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [39]:
# 2. Load full data
def load_data(path):
    with open(path, 'r') as f:
        return f.read().strip().split('\n')


In [40]:
eng_all = load_data('data\\small_vocab_en.txt')
fr_all  = load_data('data\\small_vocab_fr.txt')
print(f"Total sentence-pairs: {len(eng_all)}")

Total sentence-pairs: 137860


In [41]:
# 3. Filter to <25 tokens each
pairs = list(zip(eng_all, fr_all))
filtered = [
    (e, f) for e, f in pairs
    if len(e.split()) < 200 and len(f.split()) < 200
]
print(f"Pairs under 200 tokens: {len(filtered)}")

Pairs under 200 tokens: 137860


In [42]:
# 4. Sample 40%
random.seed(1)
n_sample = int(1 * len(filtered))
sampled = random.sample(filtered, n_sample)
eng_sentences, fr_sentences = zip(*sampled)
print(f"Using {len(eng_sentences)} pairs (100%)")

Using 137860 pairs (100%)


In [43]:
# 5. Tokenize + pad + preprocess
def tokenize(sentences):
    tk = Tokenizer(char_level=False)
    tk.fit_on_texts(sentences)
    seqs = tk.texts_to_sequences(sentences)
    return seqs, tk

In [44]:
def preprocess(x, y):
    x_seq, x_tk = tokenize(x)
    y_seq, y_tk = tokenize(y)
    x_pad = pad_sequences(x_seq, padding='post')
    y_pad = pad_sequences(y_seq, padding='post')
    # expand y for sparse loss: (batch, seq_len, 1)
    y_pad = y_pad.reshape(*y_pad.shape, 1)
    return x_pad, y_pad, x_tk, y_tk


In [45]:
X, Y, eng_tk, fr_tk = preprocess(eng_sentences, fr_sentences)
max_eng_len = X.shape[1]
max_fr_len  = Y.shape[1]
eng_vocab   = len(eng_tk.word_index) + 1
fr_vocab    = len(fr_tk.word_index) + 1

print("Max English len:", max_eng_len)
print("Max French  len:", max_fr_len)
print("English vocab size:", eng_vocab)
print("French  vocab size:", fr_vocab)


Max English len: 15
Max French  len: 21
English vocab size: 200
French  vocab size: 346


In [46]:
# 6. Build model
def embed_model(input_shape, output_seq_len, eng_vocab_size, fr_vocab_size):
    model = Sequential()
    model.add(Embedding(input_dim=eng_vocab_size, output_dim=64,
                        input_length=input_shape[1]))
    model.add(GRU(64, return_sequences=True, activation='tanh'))
    model.add(TimeDistributed(Dense(fr_vocab_size, activation='softmax')))
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy']
    )
    return model

In [47]:
# pad English up to French length for 1:1 timesteps
X_train = pad_sequences(X, maxlen=max_fr_len, padding='post')

In [48]:
model = embed_model(X_train.shape, max_fr_len, eng_vocab, fr_vocab)
model.summary()

In [49]:
# 7. Train
history = model.fit(
    X_train, Y,
    batch_size=1024,
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 556ms/step - accuracy: 0.4534 - loss: 3.7949 - val_accuracy: 0.6104 - val_loss: 1.9488
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 520ms/step - accuracy: 0.6368 - loss: 1.7255 - val_accuracy: 0.6801 - val_loss: 1.2814
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 519ms/step - accuracy: 0.6875 - loss: 1.2009 - val_accuracy: 0.7205 - val_loss: 1.0060
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 520ms/step - accuracy: 0.7336 - loss: 0.9480 - val_accuracy: 0.7745 - val_loss: 0.8082
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 521ms/step - accuracy: 0.7845 - loss: 0.7705 - val_accuracy: 0.8083 - val_loss: 0.6900
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 535ms/step - accuracy: 0.8148 - loss: 0.6612 - val_accuracy: 0.8275 - val_loss: 0.6099
Epoch 7/10

In [50]:
# 8. Train again for 5 epochs
history = model.fit(
    X_train, Y,
    batch_size=1024,
    epochs=5,
    validation_split=0.2
)

Epoch 1/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 523ms/step - accuracy: 0.8778 - loss: 0.4253 - val_accuracy: 0.8819 - val_loss: 0.4140
Epoch 2/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 518ms/step - accuracy: 0.8848 - loss: 0.3994 - val_accuracy: 0.8872 - val_loss: 0.3934
Epoch 3/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 521ms/step - accuracy: 0.8895 - loss: 0.3824 - val_accuracy: 0.8918 - val_loss: 0.3769
Epoch 4/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 519ms/step - accuracy: 0.8948 - loss: 0.3658 - val_accuracy: 0.8956 - val_loss: 0.3631
Epoch 5/5
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 518ms/step - accuracy: 0.8982 - loss: 0.3498 - val_accuracy: 0.8968 - val_loss: 0.3544


In [51]:
# 8. Inference helper
def logits_to_text(logits, tokenizer):
    idx2w = {i: w for w, i in tokenizer.word_index.items()}
    idx2w[0] = '<PAD>'
    tokens = np.argmax(logits, axis=1)
    return ' '.join(idx2w[t] for t in tokens)

# Quick test
sample_x = X_train[:1]
pred = model.predict(sample_x)[0]   # shape: (seq_len, fr_vocab)
print("Source:   ", eng_sentences[0])
print("Predicted:", logits_to_text(pred, fr_tk))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Source:    the united states is usually busy during september , and it is usually freezing in november .
Predicted: les ã©tats unis est gã©nã©ralement occupã© en septembre et il est gã©nã©ralement en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [52]:
# 1. Save model, tokenizers & config
import pickle
from keras.models import load_model

# — assume `model`, `eng_tk`, `fr_tk`, and `max_fr_len` are in your workspace from training
model.save('translator_model.h5')

with open('eng_tokenizer.pkl', 'wb') as f:
    pickle.dump(eng_tk, f)
with open('fr_tokenizer.pkl', 'wb') as f:
    pickle.dump(fr_tk, f)
with open('config.pkl', 'wb') as f:
    # only need to save the max french sequence length
    pickle.dump({'max_fr_len': max_fr_len}, f)



In [5]:

# 3. Translation function
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import pickle
from keras.models import load_model

def translate_sentence(
    sentence,
    model_path='translator_model_j1.h5',
    eng_tok_path='eng_tokenizer.pkl',
    fr_tok_path='fr_tokenizer.pkl',
    config_path='config.pkl'
):
    """
    Load the saved model and tokenizers, preprocess `sentence` (English),
    and return the predicted French translation as a string.
    """
    # load model & artifacts
    model = load_model(model_path)
    with open(eng_tok_path, 'rb') as f:
        eng_tk = pickle.load(f)
    with open(fr_tok_path, 'rb') as f:
        fr_tk = pickle.load(f)
    with open(config_path, 'rb') as f:
        cfg = pickle.load(f)
    max_fr_len = cfg['max_fr_len']

    # tokenize & pad
    seq = eng_tk.texts_to_sequences([sentence])
    seq_pad = pad_sequences(seq, maxlen=max_fr_len, padding='post')

    # predict
    logits = model.predict(seq_pad)[0]          # shape: (max_fr_len, fr_vocab)
    token_ids = np.argmax(logits, axis=1).tolist()

    # map back to words, dropping any trailing <PAD>
    idx2word = {i: w for w, i in fr_tk.word_index.items()}
    idx2word[0] = '<PAD>'
    # stop at first pad (optional)
    if 0 in token_ids:
        token_ids = token_ids[:token_ids.index(0)]

    return ' '.join(idx2word[idx] for idx in token_ids)


In [17]:
# 4. Example usage:
print(translate_sentence("india is rainy during june "))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
l' inde est pluvieux juin juin


In [16]:

print(translate_sentence("and it is sometimes warm in november ."))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
et il est est chaud en en novembre


In [18]:
# 4. Example usage:
print(translate_sentence("india is rainy during june , and it is sometimes warm in november ."))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
l' inde est pluvieux juin et il est parfois chaud en novembre


In [24]:
# 4. Example usage:
#some words are not in data like "name" so while prediction it will show nothing inplace of that word
print(translate_sentence("my name is"))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
mon est
