# Data loading

In [1]:
import pandas as pd
import numpy as np

chunk_generator = pd.read_csv("en-fr.csv", chunksize=20000)
df = next(chunk_generator)

In [2]:
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)

In [3]:
import re, string
from nltk.stem.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords

MAX_SEQUENCE_LENGTH = 64  # Maximum length of padded sequences

class Preprocessor:
    def __init__(self):
        self.stemmer = PorterStemmer() # stemmer

    def apply_preprocessing(self, text, sos = True, eos = True):
        text = text.lower() # convert to lowercase
        text = self.remove_characters(text)
        text = remove_stopwords(text) # remove stopwords
        # text = self.stemming(text)
        text = self.truncate(text)
        text = self.add_os_tokens(text, sos, eos)
        return text
    
    def stemming(self, text):
        return " ".join([self.stemmer.stem(word) for word in text.split()])

    def remove_characters(self, text):
        emojis = r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+"  # regex for emojis
        html = r'http\S+|www\S+' # regex for urls
        mentions = r"@\w+" # regex for mentions
        hashtags = r"#" # regex for hashtags
        text = re.sub(f'{emojis}|{html}|{mentions}|{hashtags}','',text) # remove emojis, urls, mentions and hashtags
        text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
        return text


    def truncate(self, text):
        if len(text.split())>MAX_SEQUENCE_LENGTH:
            text = text[:MAX_SEQUENCE_LENGTH-2]
        return text

    def add_os_tokens(self, text, sos = True, eos = True): # os is of sentence
        if sos:
            text = '<sos> ' + text
        if eos:
            text += ' <eos>'
        return text
    
preprocessor = Preprocessor()


In [4]:
df['en'] = df['en'].apply(preprocessor.apply_preprocessing)

df['fr_input'] = df['fr'].apply(lambda x: preprocessor.apply_preprocessing(x, sos=True, eos=False))
df['fr_output'] = df['fr'].apply(lambda x: preprocessor.apply_preprocessing(x, sos=False, eos=True))

df.drop('fr', axis=1, inplace=True)
df.head()

Unnamed: 0,en,fr_input,fr_output
0,<sos> changing lives changing society works te...,<sos> il transformé notre vie il transformé la...,il transformé notre vie il transformé la socié...
1,<sos> site map <eos>,<sos> plan du site,plan du site <eos>
2,<sos> feedback <eos>,<sos> rétroaction,rétroaction <eos>
3,<sos> credits <eos>,<sos> crédits,crédits <eos>
4,<sos> français <eos>,<sos> english,english <eos>


# Tokenizer

In [5]:
VOCAB_SIZE = 20000
EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 32

In [6]:
from tokenizers import Tokenizer, models, decoders, trainers
from tokenizers.pre_tokenizers import Whitespace

trainer_en = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=["<pad>", "<sos>", "<eos>", "<unk>"])
trainer_fr = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=["<pad>", "<sos>", "<eos>", "<unk>"])
    
tokenizer_en = Tokenizer(models.BPE(vocab_size=VOCAB_SIZE, unk_token="<unk>"))
tokenizer_fr = Tokenizer(models.BPE(vocab_size=VOCAB_SIZE, unk_token="<unk>"))


# Add pre-tokenizers (e.g., split on whitespace)
tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_fr.pre_tokenizer = Whitespace()

# Add decoders (e.g., convert tokens back to strings)
tokenizer_en.decoder = decoders.BPEDecoder()
tokenizer_fr.decoder = decoders.BPEDecoder()

tokenizer_en.train_from_iterator(df['en'], trainer_en)
tokenizer_fr.train_from_iterator(df['fr_input'], trainer_fr)

# Configure padding
tokenizer_en.enable_padding(
    pad_id=tokenizer_en.token_to_id("<pad>"),  # ID of the <pad> token
    pad_token="<pad>",                      # The padding token
    length=MAX_SEQUENCE_LENGTH,                              # Pad sequences to this length (optional)
    direction="left"                       # Pad on the right (default)
)

tokenizer_fr.enable_padding(
    pad_id=tokenizer_fr.token_to_id("<pad>"),  # ID of the <pad> token
    pad_token="<pad>",                      # The padding token
    length=MAX_SEQUENCE_LENGTH,                              # Pad sequences to this length (optional)
    direction="right"                       # Pad on the right (default)
)
# Enable truncation
tokenizer_en.enable_truncation(max_length=MAX_SEQUENCE_LENGTH, direction="left")  # Set desired max length
tokenizer_fr.enable_truncation(max_length=MAX_SEQUENCE_LENGTH, direction="right")  # Set desired max length

tokenizer_en.save("tokenizer_en.json")
tokenizer_fr.save("tokenizer_fr.json")

In [7]:
tokenizer_en.get_vocab()['<pad>'], tokenizer_fr.get_vocab()['<pad>']

(0, 0)

In [8]:
# Load Pre-trained GloVe Embeddings
embedding_index = {} # Dictionary to store word embeddings

with open('glove.6B.100d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector
        
# Initialize embedding matrix
num_words = tokenizer_en.get_vocab_size()
embedding_matrix_en = np.zeros((num_words, EMBEDDING_DIM))

# Populate embedding matrix
for word, index in tokenizer_en.get_vocab().items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_en[index] = embedding_vector

# Initialize embedding matrix
num_words = tokenizer_fr.get_vocab_size()
embedding_matrix_fr = np.zeros((num_words, EMBEDDING_DIM))

# Populate embedding matrix
for word, index in tokenizer_en.get_vocab().items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_fr[index] = embedding_vector

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

class Seq2SeqModel:
    def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, latent_dim, max_sequence_length):
        self.input_vocab_size = input_vocab_size # maximum number of unique words in input
        self.output_vocab_size = output_vocab_size # maximum number of unique words in output
        self.embedding_dim = embedding_dim # dimension of word embeddings
        self.latent_dim = latent_dim # dimension of latent space
        self.max_sequence_length = max_sequence_length # maximum length of input and output sequences

        # Build models
        self.encoder = self.build_encoder()
        self.decoder_training = self.build_decoder_training()
        self.model = self.build_model()
        
    def build_encoder(self):
        # Define the encoder
        encoder_inputs = Input(shape=(None,), name="encoder_input") # input sequence
        encoder_embedding = Embedding(input_dim=self.input_vocab_size, # input vocabulary
                                      output_dim=self.embedding_dim,  # dimension of word embeddings
                                      mask_zero=True # Ignore Padding Tokens
                                    )(encoder_inputs) # mask padding
        encoder_outputs, state_h, state_c= LSTM(self.latent_dim, 
                                                dtype='float32',
                                                return_state=True, # To return states
                                                name="encoder_lstm")(encoder_embedding) # LSTM cells

        encoder_model = Model(encoder_inputs, [ state_h, state_c], name="encoder") # Model for encoder
        return encoder_model
    
    def build_decoder_training(self):
        
        decoder_inputs = Input(shape=(None,), name="decoder_input") # input sequence from french
        encoder_state_h = Input(shape=(self.latent_dim,), name="decoder_state_h") # hidden long term memory state from encoder
        encoder_state_c = Input(shape=(self.latent_dim,), name="decoder_state_c") # hidden short term memor state from encoder

        decoder_embedding = Embedding(input_dim=self.output_vocab_size, output_dim=self.embedding_dim, mask_zero=True)(decoder_inputs) # word embeddings
        
        # mask_zero=True Ignore Padding Tokens
        decoder_outputs= LSTM(self.latent_dim, return_sequences=True, name="decoder_lstm", dtype='float32',)\
            (decoder_embedding, initial_state=[encoder_state_h, encoder_state_c]) # LSTM cells

        decoder_dense = Dense(self.output_vocab_size, activation='softmax')(decoder_outputs) # output layer

        decoder_model = Model([decoder_inputs, encoder_state_h, encoder_state_c], 
                              decoder_dense, name="decoder") # Model for decoder
        return decoder_model
    
    def build_decoder_inference(self):
        decoder_inputs = Input(shape=(None,), name="decoder_input") # input sequence from french
        decoder_state_input_h = Input(shape=(self.latent_dim,), name="decoder_state_h") # previous hidden long term memory state
        decoder_state_input_c = Input(shape=(self.latent_dim,), name="decoder_state_c") # previous hidden short term memory state
        
        decoder_embedding = Embedding(input_dim=self.output_vocab_size, output_dim=self.embedding_dim, 
                                      mask_zero=True, # Ignore Padding Tokens
                                      weights=[embedding_matrix_fr]
                                    )(decoder_inputs) # word embeddings
        
        decoder_outputs, state_h, state_c = LSTM(self.latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")\
            (decoder_embedding, initial_state=[decoder_state_input_h, decoder_state_input_c]) # LSTM cells

        decoder_dense = Dense(self.output_vocab_size, activation='softmax')(decoder_outputs) # output layer

        decoder_model = Model([decoder_inputs, decoder_state_input_h, decoder_state_input_c], 
                              [decoder_dense, state_h, state_c], 
                              name="decoder") # Model for decoder
        return decoder_model
    
    def build_model(self):
        
        encoder_inputs = self.encoder.input  # Extract encoder input
        state_h, state_c = self.encoder.output  # Extract encoder states
        
        decoder_inputs = self.decoder_training.input[0]  # Extract decoder input
        decoder_outputs = self.decoder_training([decoder_inputs, state_h, state_c])  # Pass encoder states to decoder

        model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name="seq2seq_model") # Create model
        return model
        
    def compile(self, optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']):
        self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics)


    def infer(self, input_sequence, start_token_id, end_token_id, preprocess, tokenizer_en, tokenizer_fr, max_length=10):
        # Preprocess the input
        input_sequence = preprocess(input_sequence)
        
        # Tokenize the input sequence
        input_sequence = tokenizer_en.encode(input_sequence).ids
        input_sequence = np.array(input_sequence).reshape(1, -1)  # Reshape for model input
        
        # Build the inference decoder if not already built
        if not hasattr(self, 'decoder_inference'):
            self.decoder_inference = self.build_decoder_inference()
        
        # Encode the input sequence
        states_value = self.encoder.predict(input_sequence)  # Get encoder states
        
        # Initialize the target sequence with the start token ID (must be integer, not string)
        target_seq = np.array([[start_token_id]])  # Start with the start token ID
        
        generated_sequence = []

        for _ in range(max_length):
            # Predict the next token
            decoder_outputs, h, c = self.decoder_inference.predict([target_seq] + states_value, verbose=0)
            predicted_token = tf.argmax(decoder_outputs, axis=-1).numpy()[0, 0]  # Extract token ID

            # Append the predicted token to the generated sequence
            generated_sequence.append(predicted_token)
            
            # Stop if the end token is predicted
            if predicted_token == end_token_id:  # Stop on EOS token
                break
            
            # Update the target sequence and states
            target_seq = np.array([[predicted_token]])  # Feed back into decoder
            states_value = [h, c]  # Update states
        generated_text = ""
        # Decode the generated sequence back into a string
        for i in generated_sequence:
            generated_text += str(tokenizer_fr.decode([i], skip_special_tokens=False)) + ' '
        return generated_text

In [10]:
# callbakcs
# Save best model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="seq2seq_model.h5",
    monitor="val_loss",
    mode="min",
    save_best_only=True
)
# Learning rate scheduler
learning_rate_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=2,
    min_lr=1e-6
)

# tdqm keras
from tqdm.keras import TqdmCallback

In [18]:
# Define hyperparameters
input_vocab_size = tokenizer_en.get_vocab_size()
output_vocab_size = tokenizer_fr.get_vocab_size()
CONTEXT_DIM = 300

# Initialize model
seq2seq = Seq2SeqModel(input_vocab_size, output_vocab_size, EMBEDDING_DIM, CONTEXT_DIM, MAX_SEQUENCE_LENGTH)
seq2seq.model.summary(expand_nested=True)

Model: "seq2seq_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 100)    2000000     ['encoder_input[0][0]']          
                                                                                                  
 decoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 300),        481200      ['embedding_4[0][0]']            
                                 (None, 300),                                         

In [19]:
def wrapper_tokenize(text):
    text = preprocessor.apply_preprocessing(text)
    text = tokenizer_en.encode(text, add_special_tokens=True).ids
    return np.array(text)
    

# Tokenize sequences
encoder_input_sequence = np.zeros((len(df), MAX_SEQUENCE_LENGTH), dtype=int)
decoder_input_sequence = np.zeros((len(df), MAX_SEQUENCE_LENGTH), dtype=int)
decoder_output_sequence = np.zeros((len(df), MAX_SEQUENCE_LENGTH), dtype=int)


for j in range(len(df['en'])):
    encoder_input_sequence[j] = wrapper_tokenize(df['en'][j])
    decoder_input_sequence[j] = wrapper_tokenize(df['fr_input'][j])
    decoder_output_sequence[j] = wrapper_tokenize(df['fr_output'][j])

In [20]:
encoder_input_sequence.shape, decoder_input_sequence.shape, decoder_output_sequence.shape

((19999, 32), (19999, 32), (19999, 32))

In [21]:
tdqm_callback = TqdmCallback()
# Compile model
seq2seq.compile()

# Train model
seq2seq.model.fit([encoder_input_sequence, decoder_input_sequence], decoder_output_sequence, 
                  validation_split=0.1, batch_size=64, epochs=50, verbose=0,
              callbacks=[checkpoint_callback, learning_rate_scheduler, tdqm_callback])

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

<keras.callbacks.History at 0x21e70a2b5e0>

# Testing

In [22]:
from tokenizers import Tokenizer
import tensorflow as tf


CONTEXT_DIM = 100


tokenizer_en = Tokenizer.from_file("tokenizer_en.json")
tokenizer_fr = Tokenizer.from_file("tokenizer_fr.json")

# Define hyperparameters
input_vocab_size = tokenizer_en.get_vocab_size()
output_vocab_size = tokenizer_fr.get_vocab_size()


seq2seq = Seq2SeqModel(input_vocab_size, output_vocab_size, EMBEDDING_DIM, CONTEXT_DIM, MAX_SEQUENCE_LENGTH)
seq2seq.model = tf.keras.models.load_model('seq2seq_model.h5')
seq2seq.model.summary()

Model: "seq2seq_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 100)    2000000     ['encoder_input[0][0]']          
                                                                                                  
 decoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 300),        481200      ['embedding_4[0][0]']            
                                 (None, 300),                                         

In [23]:
import pandas as pd

chunk_generator = pd.read_csv("en-fr.csv", chunksize=25000)
df = next(chunk_generator)
df = df.iloc[-5000:]

df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)

df['en'] = df['en'].apply(preprocessor.apply_preprocessing)
df['fr_input'] = df['fr'].apply(lambda x: preprocessor.apply_preprocessing(x, sos=True, eos=False))
df['fr_output'] = df['fr'].apply(lambda x: preprocessor.apply_preprocessing(x, sos=False, eos=True))

df.drop('fr', axis=1, inplace=True)
df.head()

Unnamed: 0,en,fr_input,fr_output
0,<sos> première moisson operating industrial sc...,<sos> première moisson œuvre maintenant sur un...,première moisson œuvre maintenant sur une éche...
1,<sos> boulangerie stméthode selling products c...,<sos> la boulangerie stméthode vend ses produi...,la boulangerie stméthode vend ses produits aux...
2,<sos> fact boulangerie stméthode won 2004 priz...,<sos> la boulangerie stméthode d’ailleurs gagn...,la boulangerie stméthode d’ailleurs gagné le p...
3,<sos> breaking health food market… address hea...,<sos> …percée du volet santé… pour r,…percée du volet santé… pour r <eos>
4,<sos> boulangerie gadoua ltée introduced gusta...,<sos> quant à elle la boulangerie ga,quant à elle la boulangerie ga <eos>


In [30]:


# Get the token IDs for <sos> and <eos> from your French tokenizer
sos_token_id = tokenizer_fr.token_to_id("<sos>")
eos_token_id = tokenizer_fr.token_to_id("<eos>")

test_en = "he"
# Then call infer with these IDs
seq2seq.infer(test_en, sos_token_id, eos_token_id,
              preprocess=lambda x: preprocessor.apply_preprocessing(x, sos=True, eos=True), 
              tokenizer_en=tokenizer_en, 
              tokenizer_fr=tokenizer_fr,
              max_length=MAX_SEQUENCE_LENGTH)



'ü pôle clés fairchil demballage ciation ciation demballage ciation ciation bromure 00 concentration disposés compte compte ndnca secouent cholestéro étang ppe darachides darachides darachides chemise né né insur insur insur insur insur '