In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
print("TensorFlow GPU Device Name:", tf.test.gpu_device_name())

TensorFlow GPU Device Name: /device:GPU:0


2024-06-27 21:19:24.102118: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-06-27 21:19:24.102216: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2024-06-27 21:19:24.102237: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2024-06-27 21:19:24.102311: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-27 21:19:24.102367: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
df = pd.read_csv('data.csv')

In [4]:
df

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [5]:
english_sentences = df['English words/sentences'].values
french_sentences = df['French words/sentences'].values

In [6]:
# Tokenization
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_sentences)
english_sequences = english_tokenizer.texts_to_sequences(english_sentences)
english_maxlen = max(len(seq) for seq in english_sequences)
english_vocab_size = len(english_tokenizer.word_index) + 1

french_tokenizer = Tokenizer()
french_tokenizer.fit_on_texts(french_sentences)
french_sequences = french_tokenizer.texts_to_sequences(french_sentences)
french_maxlen = max(len(seq) for seq in french_sequences)
french_vocab_size = len(french_tokenizer.word_index) + 1

In [7]:
# Padding sequences
english_sequences = pad_sequences(english_sequences, maxlen=english_maxlen, padding='post')
french_sequences = pad_sequences(french_sequences, maxlen=french_maxlen, padding='post')

In [8]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(english_sequences, french_sequences, test_size=0.2)

### Model

In [10]:
latent_dim = 256

In [11]:
encoder_inputs = Input(shape=(english_maxlen,))
encoder_embedding = Embedding(english_vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


2024-06-27 21:30:06.779317: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-27 21:30:06.779362: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
# Decoder
decoder_inputs = Input(shape=(french_maxlen,))
decoder_embedding = Embedding(french_vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(french_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [13]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [15]:
model.summary()

In [16]:
# Train the model
model.fit([X_train, y_train], y_train, epochs=10, validation_data=([X_val, y_val], y_val))

Epoch 1/10


2024-06-27 21:38:39.107434: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m4391/4391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 127ms/step - accuracy: 0.9296 - loss: 0.6913 - val_accuracy: 0.9884 - val_loss: 0.1057
Epoch 2/10
[1m4391/4391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m555s[0m 126ms/step - accuracy: 0.9910 - loss: 0.0775 - val_accuracy: 0.9948 - val_loss: 0.0484
Epoch 3/10
[1m4391/4391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m556s[0m 127ms/step - accuracy: 0.9963 - loss: 0.0266 - val_accuracy: 0.9967 - val_loss: 0.0329
Epoch 4/10
[1m4391/4391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m555s[0m 126ms/step - accuracy: 0.9985 - loss: 0.0092 - val_accuracy: 0.9974 - val_loss: 0.0273
Epoch 5/10
[1m4391/4391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m571s[0m 130ms/step - accuracy: 0.9994 - loss: 0.0034 - val_accuracy: 0.9977 - val_loss: 0.0253
Epoch 6/10
[1m4391/4391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 128ms/step - accuracy: 0.9999 - loss: 0.0010 - val_accuracy: 0.9978 - val_loss: 0.0234
Epo

<keras.src.callbacks.history.History at 0x37ae6f430>

In [17]:
model.save('model.keras')

### This model achieved 100% accuracy.

### Predictions

In [55]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Example new English and French sentences to predict
new_english_sentences = [
    "How are you today?",
    "What time is it?",
    "I enjoy learning new things.",
    "Is it going to rain?",
    "Goodbye, see you later!"
]

new_french_sentences = [
    "Comment vas-tu aujourd'hui ?",
    "Quelle heure est-il ?",
    "J'aime apprendre de nouvelles choses.",
    "Va-t-il pleuvoir ?",
    "Au revoir, à plus tard !"
]

# Tokenize and pad sequences if necessary
new_english_sequences = english_tokenizer.texts_to_sequences(new_english_sentences)
new_french_sequences = french_tokenizer.texts_to_sequences(new_french_sentences)

# Pad sequences to match the model's input shape if necessary
padded_english_sequences = pad_sequences(new_english_sequences, maxlen=44, padding='post')
padded_french_sequences = pad_sequences(new_french_sequences, maxlen=55, padding='post')

# Make predictions using your trained model
predictions = model.predict([padded_english_sequences, padded_french_sequences])

# Convert predictions from indices to actual words using the tokenizer
predicted_sequences = french_tokenizer.sequences_to_texts(predictions.argmax(axis=-1))

# Debug: Print predicted and actual sequences for comparison
for actual, predicted in zip(new_french_sentences, predicted_sequences):
    print(f"Actual:   {actual.strip().lower()}")
    print(f"Predicted: {predicted.strip().lower()}")
    print()  # Print a blank line for clarity




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Actual:   comment vas-tu aujourd'hui ?
Predicted: comment vas tu aujourd'hui

Actual:   quelle heure est-il ?
Predicted: quelle heure est il

Actual:   j'aime apprendre de nouvelles choses.
Predicted: j'aime apprendre de nouvelles choses

Actual:   va-t-il pleuvoir ?
Predicted: va t il pleuvoir

Actual:   au revoir, à plus tard !
Predicted: au revoir à plus tard



### In the predictions, all the predictions were correct with 100% accuracy.