## Character-level recurrent sequence to sequence model

In [1]:
import numpy as np
import keras
import os
from pathlib import Path

### Configuration

In [2]:
# Mini-batch size passed to model.fit.
BATCH_SIZE = 64
# Number of training epochs passed to model.fit.
EPOCHS = 100
# Number of units in both the encoder and the decoder LSTM layers.
LATENT_DIM = 256
# Upper bound on the number of lines read from DATA_PATH.
NUM_SAMPLES = 10000
# Tab-separated parallel corpus; each line is split on "\t" when it is loaded.
DATA_PATH = "./data/fra.txt"

### Preparing the data

In [3]:
# Parallel lists of source sentences and decoder targets, plus the set of
# distinct characters seen on each side (the character-level vocabularies).
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(DATA_PATH, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")

for line in lines[: min(NUM_SAMPLES, len(lines) - 1)]:
    # Only the first two tab-separated fields are used. Taking them by index
    # (instead of unpacking exactly three values) lets rows with or without
    # extra trailing columns load, and blank/malformed rows are skipped rather
    # than aborting the whole cell with a ValueError.
    fields = line.split("\t")
    if len(fields) < 2:
        continue
    input_text, target_text = fields[0], fields[1]

    # "\t" is the start-of-sequence marker and "\n" the end-of-sequence marker.
    target_text = "\t" + target_text + "\n"

    input_texts.append(input_text)
    target_texts.append(target_text)

    # Build the character-level vocabularies; set.update adds every character
    # of the string and ignores the ones that are already present.
    input_characters.update(input_text)
    target_characters.update(target_text)

In [4]:
# Peek at the first ten source sentences and their "\t"...."\n"-wrapped targets.
input_texts[:10], target_texts[:10]

(['Go.', 'Go.', 'Go.', 'Go.', 'Hi.', 'Hi.', 'Run!', 'Run!', 'Run!', 'Run!'],
 ['\tVa !\n',
  '\tMarche.\n',
  '\tEn route !\n',
  '\tBouge !\n',
  '\tSalut !\n',
  '\tSalut.\n',
  '\tCours\u202f!\n',
  '\tCourez\u202f!\n',
  '\tPrenez vos jambes à vos cous !\n',
  '\tFile !\n'])

#### Concept: sorted

In [5]:
# Concept: sorted() returns a new sorted list and leaves its argument untouched.
numbers = [5, 2, 9, 1, 5, 6]
sorted_numbers = sorted(numbers)
print(sorted_numbers)

# key=len orders the words by their length instead of alphabetically.
words = ["apple", "banana", "cherry", "date", "fig", "grape"]
sorted_words = sorted(words, key=len)
print(f"Sorted words by length: {sorted_words}")

# Sorting dict.items() with a key on x[1] orders the (key, value) pairs by value;
# the result is a list of tuples, not a dict.
dictionary = {"apple": 1, "banana": 10, "cherry": 93, "date": 3, "fig": 5, "grape": 6}
sorted_dictionary = sorted(dictionary.items(), key=lambda x: x[1])
print(f"Sorted dictionary by value: {sorted_dictionary}")

[1, 2, 5, 5, 6, 9]
Sorted words by length: ['fig', 'date', 'apple', 'grape', 'banana', 'cherry']
Sorted dictionary by value: [('apple', 1), ('date', 3), ('fig', 5), ('grape', 6), ('banana', 10), ('cherry', 93)]


In [6]:
# Freeze each vocabulary into a sorted list so every character gets a stable
# position; sorted() accepts any iterable and always returns a new list.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

In [7]:
# Vocabulary sizes: these become the width of the one-hot vectors on each side.
num_encoder_tokens, num_decoder_tokens = (
    len(input_characters),
    len(target_characters),
)

In [8]:
# Longest sentence (in characters) on each side; used as the padded time dimension.
max_encoder_seq_length = max(len(text) for text in input_texts)
max_decoder_seq_length = max(len(text) for text in target_texts)

In [9]:
# Report the dataset dimensions, one statistic per line, with a single print call.
print(
    f"Number of samples: {len(input_texts)}",
    f"Number of unique input tokens: {num_encoder_tokens}",
    f"Number of unique output tokens: {num_decoder_tokens}",
    f"Maximum sequence length for inputs: {max_encoder_seq_length}",
    f"Maximum sequence length for outputs: {max_decoder_seq_length}",
    sep="\n",
)

Number of samples: 10000
Number of unique input tokens: 70
Number of unique output tokens: 91
Maximum sequence length for inputs: 14
Maximum sequence length for outputs: 59


In [10]:
# Allocate the all-zero one-hot tensors, laid out as
# (number of sentence pairs, padded sequence length, vocabulary size).
num_pairs = len(input_texts)
decoder_shape = (num_pairs, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(
    shape=(num_pairs, max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32,
)

# Decoder input and decoder target share the same shape.
decoder_input_data = np.zeros(shape=decoder_shape, dtype=np.float32)
decoder_target_data = np.zeros(shape=decoder_shape, dtype=np.float32)

In [11]:
# Sanity check: shapes of the three freshly allocated one-hot tensors.
encoder_input_data.shape, decoder_input_data.shape, decoder_target_data.shape

((10000, 14, 70), (10000, 59, 91), (10000, 59, 91))

In [12]:
# First timestep of the first sample; still all zeros because nothing has been encoded yet.
encoder_input_data[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.], dtype=float32)

In [13]:
# Map every character to its position in the sorted vocabulary list.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

In [14]:
# Tiny 3-D example: 2 samples x 3 timesteps x 4 features, with a single cell switched on.
a = np.zeros((2, 3, 4))
a[0, 0, 3] = 1.0
a

array([[[0., 0., 0., 1.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

#### Concept: one-hot encoding (OHE), with padding applied after the inner `for` loop

In [39]:
# Toy one-hot encoding: each word becomes a (max word length, vocab size) matrix,
# and positions past the end of the word are filled with the space character.
sample_words = ["a", "bc", "bcd"]
sample_vocab = {" ": 0, "a": 1, "b": 2, "c": 3, "d": 4}

# Derive the dimensions from the data instead of hard-coding them:
# 3 words, longest word has 3 characters, vocabulary has 5 entries.
max_word_length = max(len(word) for word in sample_words)
ohe = np.zeros(shape=(len(sample_words), max_word_length, len(sample_vocab)))

for index, word in enumerate(sample_words):
    for t, character in enumerate(word):
        ohe[index, t, sample_vocab[character]] = 1.0

    # Pad from the end of the word onwards. len(word) is used rather than the
    # loop variable `t`, which would be stale (or undefined) for an empty word.
    ohe[index, len(word):, sample_vocab[" "]] = 1.0

ohe # [1, 0, 0, 0, 0] is space  [0, 1, 0, 0, 0] is a, [0, 0, 1, 0, 0] is b, [0, 0, 0, 1, 0] is c, [0, 0, 0, 0, 1] is d

array([[[0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0.]],

       [[0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]]])

In [15]:
# Fill the one-hot tensors. Every position after the end of a sentence is
# padded with the space character so that each timestep has exactly one 1.0.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Use len(input_text) rather than the leaked loop variable `t`: for an empty
    # source sentence `t` would still hold the index left over from the previous
    # pair's target loop and the padding would start at the wrong place.
    encoder_input_data[i, len(input_text):, input_token_index[" "]] = 1.0

    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one timestep
            # and therefore never contains the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0

    # The target holds one character fewer than the input (no start character),
    # so its padding begins one step earlier.
    decoder_input_data[i, len(target_text):, target_token_index[" "]] = 1.0
    decoder_target_data[i, len(target_text) - 1:, target_token_index[" "]] = 1.0

In [16]:
# The first source sentence, for comparison with its one-hot encoding below.
input_texts[0]

'Go.'

In [17]:
# One-hot matrix of the first encoder sample: one row per timestep, one column per input character.
encoder_input_data[0] # first input for encoder

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0

In [18]:
# The first target sentence, including its "\t" start and "\n" end markers.
target_texts[0]

'\tVa !\n'

In [21]:
# First decoder-input timestep of sample 0 and the character it decodes to (the "\t" start marker).
decoder_input_data[0][0], target_characters[np.argmax(decoder_input_data[0][0])]

(array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.], dtype=float32),
 '\t')

In [29]:
# First decoder-target timestep of sample 0: one step ahead of the input, so it is the first real character.
decoder_target_data[0][0], target_characters[np.argmax(decoder_target_data[0][0])]

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.], dtype=float32),
 'V')

In [34]:
# Index 0 of the sorted input vocabulary is the space character, which is also used for padding.
input_characters[0]

' '

In [32]:
# The first three entries of the sorted target vocabulary are the special characters:
target_characters[0], target_characters[1], target_characters[2]
# 0 : "\t", the start-of-sequence token
# 1 : "\n", the end-of-sequence token
# 2 : " ", the space character, also used for padding

('\t', '\n', ' ')

In [25]:
# Walk the first sample timestep by timestep to show that the target is the
# input shifted one character ahead.
for decoder_input, decoder_target in zip(decoder_input_data[0], decoder_target_data[0]):
    input_char = target_characters[np.argmax(decoder_input)]
    target_char = target_characters[np.argmax(decoder_target)]
    print(f"Input character: {input_char}, Target character: {target_char}")

Input character: 	, Target character: V
Input character: V, Target character: a
Input character: a, Target character:  
Input character:  , Target character: !
Input character: !, Target character: 

Input character: 
, Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  
Input character:  , Target character:  


In [37]:
# The raw first sentence pair that the tensors inspected above were built from.
input_texts[0], target_texts[0]

('Go.', '\tVa !\n')

# Building the model

In [40]:
# Encoder input: one-hot character sequences; the time dimension is None (variable length).
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
# return_state=True makes the layer hand back its final hidden and cell states
# alongside its output (three values are unpacked below).
encoder = keras.layers.LSTM(LATENT_DIM, return_state=True)
# output from the encoder is not needed, only the states are needed
encoder_outputs, state_hidden, state_cell = encoder(encoder_inputs)

# The [hidden, cell] pair is what the decoder is initialised with.
encoder_states = [state_hidden, state_cell]

In [41]:
# Decoder input: one-hot target character sequences; the time dimension is None (variable length).
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# return_sequences=True yields an output for every timestep; the returned states
# are discarded here during training but are used again at inference time.
decoder_lstm = keras.layers.LSTM(LATENT_DIM, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax over the target vocabulary at each timestep.
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [42]:
# Training model: [encoder input, decoder input] -> per-timestep softmax over target characters.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [43]:
# Print an overview of the model's layers.
model.summary()

In [44]:
# Categorical cross-entropy matches the one-hot encoded decoder targets.
# NOTE(review): the targets are padded with the space character, so the reported
# accuracy presumably also counts padded timesteps — confirm before reading it
# as translation quality.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

In [45]:
# Train on (encoder input, decoder input) -> decoder target, where the target is
# the decoder input shifted one timestep ahead.
# NOTE(review): Keras documents validation_split as holding out the last fraction
# of the samples; the data is not shuffled in this notebook, so confirm that the
# tail of the file is an acceptable validation set.
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 87ms/step - accuracy: 0.7051 - loss: 1.5626 - val_accuracy: 0.7147 - val_loss: 1.0956
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 83ms/step - accuracy: 0.7458 - loss: 0.9596 - val_accuracy: 0.7332 - val_loss: 0.9549
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 80ms/step - accuracy: 0.7629 - loss: 0.8591 - val_accuracy: 0.7515 - val_loss: 0.8689
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 83ms/step - accuracy: 0.7885 - loss: 0.7634 - val_accuracy: 0.7813 - val_loss: 0.7779
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 85ms/step - accuracy: 0.8013 - loss: 0.7021 - val_accuracy: 0.7862 - val_loss: 0.7303
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 90ms/step - accuracy: 0.8137 - loss: 0.6429 - val_accuracy: 0.7997 - val_loss: 0.6864
Epoch 7/10

<keras.src.callbacks.history.History at 0x21d6be4ad80>

# Inference
- Encode the input and get the initial decoder state from the encoder layer.
- Run one step of the decoder with this initial state and the "start sequence" token as input (in our case "\t").
- Repeat with the newly predicted token and the updated states until the end token ("\n") is produced or the maximum length is reached.

In [46]:
# Save the trained model to "eng2fre.keras" (a relative path, so it lands in the current working directory).
model.save("eng2fre.keras")

In [53]:
# Build a standalone encoder for inference out of the trained model's tensors.
encoder_inputs = model.input[0]  # input_1
# NOTE(review): the layer is looked up by position; index 2 is assumed to be the
# encoder LSTM — confirm against model.summary() if the architecture changes.
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
# Maps an input sequence to the encoder's [hidden, cell] state pair.
encoder_model = keras.Model(encoder_inputs, encoder_states)

In [54]:
# Build a standalone decoder for inference that takes its LSTM states as explicit
# inputs, so it can be stepped one character at a time.
decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(LATENT_DIM,))
decoder_state_input_c = keras.Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# NOTE(review): layers are looked up by position; indices 3 and 4 are assumed to
# be the decoder LSTM and the softmax Dense layer — confirm against model.summary().
decoder_lstm = model.layers[3]
# Re-apply the trained LSTM, this time seeded from the state inputs instead of the encoder.
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [one-hot character(s), hidden state, cell state].
# Outputs: [character probabilities, new hidden state, new cell state].
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

In [48]:
# Inverse lookups (index -> character) used to turn predictions back into text.
reverse_input_char_index = {index: char for char, index in input_token_index.items()}
reverse_target_char_index = {index: char for char, index in target_token_index.items()}

In [55]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    The encoder states seed the decoder, which is then stepped one character at
    a time starting from the "\\t" start character. At each step the most likely
    character (argmax) is appended and fed back in as the next input.

    Args:
        input_seq: one-hot encoded input passed to ``encoder_model.predict``
            (a batch of size 1 is assumed).

    Returns:
        The decoded string. It ends with "\\n" when the stop character was
        produced; otherwise decoding stops once the string is longer than
        ``max_decoder_seq_length``.
    """

    def one_hot(token_index):
        # Single-timestep, single-sample one-hot input for the decoder.
        step_input = np.zeros((1, 1, num_decoder_tokens))
        step_input[0, 0, token_index] = 1.0
        return step_input

    # The encoder's state vectors initialise the decoder.
    states_value = encoder_model.predict(input_seq, verbose=0)
    target_seq = one_hot(target_token_index["\t"])

    decoded_sentence = ""
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Greedy choice: take the most probable character of the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the end character or once the maximum length is exceeded.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            break

        # Feed the prediction and the updated states into the next step.
        target_seq = one_hot(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

In [57]:
# Translate every 20th sentence among the first 1000 training samples. The upper
# bound is clamped to the number of loaded sentences so the loop cannot index
# past the data when fewer than 1000 samples are available.
for seq_index in range(0, min(1000, len(input_texts)), 20):
    # Slice (rather than index) so the sample keeps its leading batch dimension of 1.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("\n")
    print(f"Input word: {input_texts[seq_index]}")
    print(f"Output word: {decoded_sentence}")



Input word: Go.
Output word: Pars !



Input word: Run.
Output word: Fuyez !



Input word: Wait!
Output word: Attendez !



Input word: I won!
Output word: Je vais bon coup.



Input word: Attack!
Output word: Attaquez !



Input word: Got it!
Output word: Fais-le !



Input word: I paid.
Output word: Je payai.



Input word: No way!
Output word: C'est attaque !



Input word: We won.
Output word: Nous l'avons emporté.



Input word: Be fair.
Output word: Soyez crudent !



Input word: Beat it.
Output word: Firez vetint !



Input word: Come in.
Output word: Entre.



Input word: Fold it.
Output word: Plie-le.



Input word: Get out!
Output word: Dégage !



Input word: Go away!
Output word: Dégage !



Input word: Go home.
Output word: Va te cami !



Input word: Help me.
Output word: Aide-toi à Tom.



Input word: How sad!
Output word: Comme c'est tris le hemmente.



Input word: I tried.
Output word: J'ai essayé Tom.



Input word: Join me.
Output word: Ellaot !



Input word: Lo