In [44]:
import pandas as pd

# df = pd.read_csv('data/en-fr.csv', names=['en', 'fr'], usecols=['en', 'fr'])
df = pd.read_csv('data/en-fr.txt', names=['en', 'fr', 'attr'], usecols=['en', 'fr'], sep='\t')
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,en,fr
0,You're very clever.,Vous êtes fort ingénieuse.
1,Are there kids?,Y a-t-il des enfants ?
2,Come in.,Entrez !
3,Where's Boston?,Où est Boston ?
4,You see what I mean?,Vous voyez ce que je veux dire ?


In [45]:
# Clean the text by removing punctuation symbols and numbers, converting characters to lowercase,
# and replacing Unicode characters with their ASCII equivalents. 
# For the French samples, insert [start] and [end] tokens at the beginning and end of each phrase
import re
from unicodedata import normalize

def clean_text(text):
    text = normalize('NFD', text.lower())
    text = re.sub('[^A-Za-z ]+', '', text)
    return text

def clean_and_prepare_text(text):
    text = '[start] ' + clean_text(text) + ' [end]'
    return text

df['en'] = df['en'].apply(lambda row: clean_text(row))
df['fr'] = df['fr'].apply(lambda row: clean_and_prepare_text(row))
df.head()

Unnamed: 0,en,fr
0,youre very clever,[start] vous etes fort ingenieuse [end]
1,are there kids,[start] y atil des enfants [end]
2,come in,[start] entrez [end]
3,wheres boston,[start] ou est boston [end]
4,you see what i mean,[start] vous voyez ce que je veux dire [end]


In [46]:
# scan the phrases and determine the maximum length of the English phrases and then of the French phrases. 
# These lengths will determine the lengths of the sequences input to and output from the model

en = df['en']
fr = df['fr']

en_max_len = max(len(line.split()) for line in en)
fr_max_len = max(len(line.split()) for line in fr)
sequence_len = max(en_max_len, fr_max_len)

print(f'Max phrase length (English): {en_max_len}')
print(f'Max phrase length (French): {fr_max_len}')
print(f'Sequence length: {sequence_len}')

Max phrase length (English): 7
Max phrase length (French): 16
Sequence length: 16


In [81]:
# fit one Tokenizer to the English phrases and another Tokenizer to their French equivalents, 
# and generate padded sequences for all the phrases:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(en)
en_sequences = en_tokenizer.texts_to_sequences(en)
en_x = pad_sequences(en_sequences, maxlen=sequence_len, padding='post')

fr_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
fr_tokenizer.fit_on_texts(fr)
fr_sequences = fr_tokenizer.texts_to_sequences(fr)
fr_y = pad_sequences(fr_sequences, maxlen=sequence_len + 1, padding='post')

In [82]:
# Compute the vocabulary sizes from the Tokenizer instances:
en_vocab_size = len(en_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1

print(f'Vocabulary size (English): {en_vocab_size}')
print(f'Vocabulary size (French): {fr_vocab_size}')

Vocabulary size (English): 6033
Vocabulary size (French): 12197


In [83]:
# Finally, create the features and the labels the model will be trained with.
# The features are the padded English sequences and the padded French sequences minus the [end] tokens.
# The labels are the padded French sequences minus the [start] tokens. 
# Package the features in a dictionary so they can be input to a model that accepts multiple inputs.

inputs = { 'encoder_input': en_x, 'decoder_input': fr_y[:, :-1] }
outputs = fr_y[:, 1:]

In [87]:
# Now use Keras's functional API to define a model that includes a transformer encoder and a transformer decoder. 
# The model accepts two inputs: padded English sequences for the encoder, and padded French sequences for the decoder. 
# The output from the decoder is fed to a softmax output layer for classification.

import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from keras_nlp.layers import TokenAndPositionEmbedding, TransformerEncoder
from keras_nlp.layers import TransformerDecoder

np.random.seed(42)
tf.random.set_seed(42)

num_heads = 8
embed_dim = 256

encoder_input = Input(shape=(None,), dtype='int64', name='encoder_input')
x = TokenAndPositionEmbedding(en_vocab_size, sequence_len, embed_dim)(encoder_input)
encoder_output = TransformerEncoder(embed_dim, num_heads)(x)
encoded_seq_input = Input(shape=(None, embed_dim))

decoder_input = Input(shape=(None,), dtype='int64', name='decoder_input')
x = TokenAndPositionEmbedding(fr_vocab_size, sequence_len, embed_dim, mask_zero=True)(decoder_input)
x = TransformerDecoder(embed_dim, num_heads)(x, encoded_seq_input)
x = Dropout(0.4)(x)

decoder_output = Dense(fr_vocab_size, activation='softmax')(x)
decoder = Model([decoder_input, encoded_seq_input], decoder_output)
decoder_output = decoder([decoder_input, encoder_output])

model = Model([encoder_input, decoder_input], decoder_output)
model.compile('adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary(line_length=120)

In [88]:
# Train the model, and use an EarlyStopping callback to end training if the validation accuracy fails 
# to improve for three consecutive epochs:

from tensorflow.keras.callbacks import EarlyStopping

callback = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
hist = model.fit(inputs, outputs, epochs=50, validation_split=0.2, callbacks=[callback])

Epoch 1/50


2024-08-18 20:53:23.388650: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[24,2] = 9018 is not in [0, 6033)
	 [[{{function_node __inference_one_step_on_data_118004}}{{node functional_38_1/token_and_position_embedding_40_1/token_embedding_1/GatherV2}}]]


InvalidArgumentError: Graph execution error:

Detected at node functional_38_1/token_and_position_embedding_40_1/token_embedding_1/GatherV2 defined at (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 595, in run_forever

  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1881, in _run_once

  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/4m/qtpm281n73j20b82dbnh980w0000gq/T/ipykernel_62857/3041662429.py", line 7, in <module>

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 51, in train_step

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/models/functional.py", line 175, in call

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/models/functional.py", line 560, in call

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras_nlp/src/layers/modeling/token_and_position_embedding.py", line 138, in call

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras_nlp/src/layers/modeling/reversible_embedding.py", line 141, in call

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/layers/core/embedding.py", line 140, in call

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/ops/numpy.py", line 4918, in take

  File "/Users/enyiomaosondu/personal/final-year-project/env/lib/python3.10/site-packages/keras/src/backend/tensorflow/numpy.py", line 1967, in take

indices[24,2] = 9018 is not in [0, 6033)
	 [[{{node functional_38_1/token_and_position_embedding_40_1/token_embedding_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_118311]