# Machine Translation Project (English to French)

In [1]:
import collections
import numpy as np
import json

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout,
    Activation, TimeDistributed, RepeatVector
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy



### Verify access to the GPU

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7596125193989179070
xla_global_id: -1
]


## Dataset
For our machine translation project, we opt for a dataset featuring a limited vocabulary, specifically designed to facilitate a more manageable and efficient training process. Unlike the extensive [WMT](http://www.statmt.org/) datasets, our chosen dataset ensures a quicker training time and demands fewer computational resources. This strategic decision aims to balance the learning experience while still achieving meaningful results within practical time constraints.
### Load Data

In [3]:
def load_data(path):
    input_file = path
    with open(input_file, "r") as f:
        data = f.read()
    return data.split('\n')

english_sentences = load_data('data/english')
french_sentences = load_data('data/french')

### Sample Data

In [4]:
english_sentences[:5]

['new jersey is sometimes quiet during autumn , and it is snowy in april .',
 'the united states is usually chilly during july , and it is usually freezing in november .',
 'california is usually quiet during march , and it is usually hot in june .',
 'the united states is sometimes mild during june , and it is cold in september .',
 'your least liked fruit is the grape , but my least liked is the apple .']

By examining the sentences, it's apparent that they have undergone preprocessing: punctuation has been delimited with spaces, and all the text has been converted to lowercase. This preprocessing serves a crucial purpose in text preparation. Firstly, delimiting punctuation with spaces ensures that each punctuation mark is treated as a separate token, aiding the model in understanding sentence structure. Secondly, converting the entire text to lowercase standardizes the input, preventing the model from distinguishing between words solely based on their casing. This uniformity facilitates more effective training and generalization, enhancing the model's ability to grasp patterns and generate accurate translations.

Structure of the Dataset

In [5]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')

print()
print('{} French words.'.format(len([word for sentence in french_sentences for word in sentence.split()])))
print('{} unique French words.'.format(len(french_words_counter)))
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_words_counter.most_common(10)))[0]) + '"')

1823250 English words.
227 unique English words.
10 Most common words in the English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French words.
355 unique French words.
10 Most common words in the French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


### Preprocess
1. Tokenize the words into ids
2. Add padding to make all the sequences the same length.

In [6]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']

text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(sent))
    print('  Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


In [7]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [8]:
def preprocess(x,y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    
    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 345


## Models
- Model 1 is a simple RNN
- Model 2 is a Bidirectional RNN
- Model 3 is an Embedding RNN

### Ids Back to Text
The neural network will be translating the input to words ids, which isn't the final form we want.  We want the French translation.  The function `logits_to_text` will bridge the gab between the logits from the neural network to the French translation.  You'll be using this function to better understand the output of the neural network.

In [9]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

### Model 1: RNN
![RNN](images/rnn.png)
A basic RNN model is a good baseline for sequence data.  In this model, you'll build a RNN that translates English to French.

In [10]:
from sklearn.model_selection import train_test_split


# 🛠 Model Definition
def simple_model(input_shape, french_vocab_size):
    model = Sequential([
        Input(shape=input_shape[1:]),  # Explicit Input Layer
        GRU(256, return_sequences=True),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax'))
    ])
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate=0.001),  # Lower LR to prevent exploding gradients
                  metrics=['accuracy'])
    
    return model

# 🛠 Data Preprocessing
tmp_x = pad_sequences(preproc_english_sentences, maxlen=max_french_sequence_length, padding='post')
tmp_x = tmp_x.reshape((-1, tmp_x.shape[1], 1))

preproc_french_sentences = np.array(preproc_french_sentences)

# Ensure labels are integer-encoded
if len(preproc_french_sentences.shape) == 3:
    preproc_french_sentences = np.argmax(preproc_french_sentences, axis=-1)

preproc_french_sentences = preproc_french_sentences.reshape((-1, preproc_french_sentences.shape[1]))

# 🚀 Debugging Prints
print(f"Input shape: {tmp_x.shape}")  # Should be (num_samples, sequence_length, 1)
print(f"Target shape: {preproc_french_sentences.shape}")  # Should be (num_samples, sequence_length)
print(f"First few target values: {preproc_french_sentences[:5]}")  # Should be integers

# 🏋️ Train with Validation Data
X_train, X_val, y_train, y_val = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2)
simple_rnn_model = simple_model(tmp_x.shape, french_vocab_size)

simple_rnn_model.fit(X_train, y_train, batch_size=256, epochs=10, validation_data=(X_val, y_val))

Input shape: (137861, 21, 1)
Target shape: (137861, 21)
First few target values: [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Epoch 1/10
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 328ms/step - accuracy: 0.9846 - loss: 0.2405 - val_accuracy: 1.0000 - val_loss: 4.4749e-05
Epoch 2/10
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 362ms/step - accuracy: 1.0000 - loss: 5.1545e-05 - val_accuracy: 1.0000 - val_loss: 3.9105e-05
Epoch 3/10
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 357ms/step - accuracy: 1.0000 - loss: 4.0458e-05 - val_accuracy: 1.0000 - val_loss: 3.5994e-05
Epoch 4/10
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 372ms/step - accuracy: 1.0000 - loss: 4.0739e-05 - val_accuracy: 1.0000 - val_loss: 3.4886e-05
E

<keras.src.callbacks.history.History at 0x2308630d760>

In [11]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
["new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."]

Original text:
['new jersey is sometimes quiet during autumn , and it is snowy in april .']


### Model 2: Bidirectional RNNs
![RNN](images/bidirectional.png)
One restriction of a RNN is that it can't see the future input, only the past.  This is where bidirectional recurrent neural networks come in.  They are able to see the future data.

In [12]:
from sklearn.model_selection import train_test_split

# 🛠 Model Definition
def bd_model(input_shape, french_vocab_size):
    learning_rate = 0.005  # Hyperparameter

    model = Sequential([
        Bidirectional(GRU(128, return_sequences=True), input_shape=(input_shape[1], 1)),  # Fix input shape
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax'))
    ])

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

# Sample preprocessing data (replace with actual data)
english_sentences = ["Hello", "How are you?", "I am fine"]
french_sentences = ["Bonjour", "Comment ça va?", "Je vais bien"]

# Tokenizer for English sentences
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
english_vocab_size = len(english_tokenizer.word_index) + 1  # Including the padding token
preproc_english_sentences = english_tokenizer.texts_to_sequences(english_sentences)

# Tokenizer for French sentences
french_tokenizer = Tokenizer()
french_tokenizer.fit_on_texts(french_sentences)
french_vocab_size = len(french_tokenizer.word_index) + 1  # Including the padding token
preproc_french_sentences = french_tokenizer.texts_to_sequences(french_sentences)

# Padding sequences to the same length
max_french_sequence_length = max(len(sentence) for sentence in preproc_french_sentences)
tmp_x = pad_sequences(preproc_english_sentences, maxlen=max_french_sequence_length, padding='post')

# Ensure French sentences are integer-encoded (for sparse_categorical_crossentropy)
preproc_french_sentences = pad_sequences(preproc_french_sentences, maxlen=max_french_sequence_length, padding='post')

# 🏋️ Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(tmp_x, preproc_french_sentences, test_size=0.2)

# ✅ Fix Input Shape for RNN (Expanding Dimensions)
X_train = np.expand_dims(X_train, -1)  # Shape: (samples, sequence_length, 1)
X_val = np.expand_dims(X_val, -1)

# 🚀 Train the Model
bd_rnn_model = bd_model(X_train.shape, french_vocab_size)

print(bd_rnn_model.summary())

# 🚀 Train the neural network
bd_rnn_model.fit(X_train, y_train, batch_size=1024, epochs=10, validation_data=(X_val, y_val))


  super().__init__(**kwargs)


None
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.3333 - loss: 2.0892 - val_accuracy: 0.0000e+00 - val_loss: 3.7125
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - accuracy: 0.5000 - loss: 1.7967 - val_accuracy: 0.0000e+00 - val_loss: 6.3793
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - accuracy: 0.6667 - loss: 1.5286 - val_accuracy: 0.0000e+00 - val_loss: 9.6003
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 0.6667 - loss: 1.4122 - val_accuracy: 0.0000e+00 - val_loss: 13.2104
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 0.6667 - loss: 1.2656 - val_accuracy: 0.0000e+00 - val_loss: 15.6220
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 0.8333 - loss: 0.9177 - val_accuracy: 0.0000e+00 - val_loss: 16.1181
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x23085507d60>

In [13]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(bd_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 409ms/step
<PAD> <PAD> <PAD>

Correct Translation:
['Bonjour']

Original text:
['Hello']


### Model 3: Embedding
![RNN](images/embedding-words.png)
You've turned the words into ids, but there's a better representation of a word.  This is called word embeddings.  An embedding is a vector representation of the word that is close to similar words in n-dimensional space, where the n represents the size of the embedding vectors.

In [14]:
# 🛠 Model Definition
def bidirectional_embed_model(input_shape, english_vocab_size, french_vocab_size):
    # Hyperparameters
    learning_rate = 0.005
    
    # Build the layers
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size, output_dim=256, input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

# 🏋️ Preprocessing
tmp_x = pad_sequences(preproc_english_sentences, maxlen=max_french_sequence_length, padding='post')

# Build the model
embed_rnn_model = bidirectional_embed_model(tmp_x.shape, english_vocab_size, french_vocab_size)

# 🔍 Model Summary
print(embed_rnn_model.summary())

# 🚀 Train the model
embed_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)




None
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0000e+00 - loss: 2.0806 - val_accuracy: 0.0000e+00 - val_loss: 2.1105
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step - accuracy: 1.0000 - loss: 1.8512 - val_accuracy: 0.0000e+00 - val_loss: 2.2131
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 1.0000 - loss: 1.3112 - val_accuracy: 0.0000e+00 - val_loss: 2.4384
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step - accuracy: 0.8333 - loss: 0.7277 - val_accuracy: 0.0000e+00 - val_loss: 2.7941
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 1.0000 - loss: 0.2715 - val_accuracy: 0.0000e+00 - val_loss: 3.2445
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step - accuracy: 1.0000 - loss: 0.0391 - val_accuracy: 0.0000e+00 - val_loss: 3.8224
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x23086946e50>

In [15]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(french_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step
bonjour <PAD> <PAD>

Correct Translation:
['Bonjour']

Original text:
['Hello']


In [16]:
# Save model in native Keras format (.keras)
embed_rnn_model.save('english_to_french_model.keras')

# OR save model in HDF5 format (.h5)
# embed_rnn_model.save('english_to_french_model.h5')

# Serialize English Tokenizer to JSON
import json
with open('english_tokenizer.json', 'w', encoding='utf8') as f:
    json.dump(english_tokenizer.to_json(), f, ensure_ascii=False)

# Serialize French Tokenizer to JSON
with open('french_tokenizer.json', 'w', encoding='utf8') as f:
    json.dump(french_tokenizer.to_json(), f, ensure_ascii=False)

# Save max lengths
with open('sequence_length.json', 'w', encoding='utf8') as f:
    json.dump(max_french_sequence_length, f, ensure_ascii=False)
