# task 2: seq2seq machine translation with attention

In [1]:
# all the datasets that will be needed
# Lists the Kaggle input directory; the cells below read from the
# english-to-french and glove-embeddings folders under this path.
!ls /kaggle/input/

english-to-french  glove-embeddings


In [2]:
import pandas as pd

# Each file is read as a single text column. The tab separator presumably keeps
# the commas inside the sentences from being treated as delimiters (TODO confirm
# that the files contain no tab characters).
df_english = pd.read_csv(
    "/kaggle/input/english-to-french/small_vocab_en.csv",
    sep="\t",
    names=["english"],
)
df_french = pd.read_csv(
    "/kaggle/input/english-to-french/small_vocab_fr.csv",
    sep="\t",
    names=["french"],
)

# Put the two single-column frames side by side: row i holds a sentence pair.
df = pd.concat([df_english, df_french], axis=1)
df.head(2)

Unnamed: 0,english,french
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...


In [3]:
# Show the column dtypes and non-null counts of the combined english/french frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137860 entries, 0 to 137859
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  137860 non-null  object
 1   french   137860 non-null  object
dtypes: object(2)
memory usage: 2.1+ MB


# Data pre-processing/cleaning

In [6]:
import string
import re

# Character class matching any single ASCII punctuation character. Kept as a plain
# string because it is also handed to pandas `.str.replace(..., regex=True)`.
punctuation_pattern = f"[{re.escape(string.punctuation)}]"
# Matches every character that is NOT in `string.printable` (e.g. accented letters).
printable_pattern = re.compile(f"[^{re.escape(string.printable)}]")
# Compiled once at definition time instead of on every `clean_sentences` call.
_punctuation_regex = re.compile(punctuation_pattern)


def clean_sentences(sentence):
    """Normalise one sentence the same way the English training column is cleaned.

    The value is converted to ``str``, characters outside ``string.printable``
    and ASCII punctuation are removed, and the result is lower-cased and
    stripped of surrounding whitespace.

    Args:
        sentence: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned, lower-cased, stripped string.
    """
    clean = str(sentence)
    clean = printable_pattern.sub('', clean)
    clean = _punctuation_regex.sub('', clean)

    # Lower-casing and stripping mirror the cleaning applied to df['english'].
    return clean.lower().strip()

In [7]:
# English: drop punctuation, lower-case, drop characters outside string.printable, strip.
df['english'] = (
    df['english']
    .astype(str)
    .str.replace(punctuation_pattern, '', regex=True)
    .str.lower()
    .apply(lambda text: printable_pattern.sub('', text))
    .str.strip()
)

# French: same steps except the printable filter, so accented letters are kept.
df['french'] = (
    df['french']
    .astype(str)
    .str.replace(punctuation_pattern, '', regex=True)
    .str.lower()
    .str.strip()
)

In [8]:
maximum_input_length  = 20 # maximum words for input (english)
maximum_output_length = 20 # maximum words for output (french)


def _truncate_words(sentence, limit):
    """Return the first `limit` whitespace-separated words of `sentence`, single-spaced.

    `str.split()` is used without an argument so that runs of spaces (left behind
    when punctuation is removed) do not produce empty "words" that would count
    toward the limit.
    """
    return " ".join(sentence.split()[:limit])


sentences_en = df['english'].apply(lambda x: _truncate_words(x, maximum_input_length))
# The French side is capped with the output length, not the input length.
sentences_fr = df['french'].apply(lambda x: _truncate_words(x, maximum_output_length))

# One slot is reserved for the <start> / <end> marker, hence `maximum_output_length - 1`.
sentences_fr_input  = df['french'].apply(lambda x: "<start> " + _truncate_words(x, maximum_output_length - 1))
sentences_fr_output = df['french'].apply(lambda x: _truncate_words(x, maximum_output_length - 1) + " <end>")

print(f"cleaned english sentence         : {sentences_en[0]}")
print(f"cleaned french sentence          : {sentences_fr[0]}")
print(f"cleaned french (start) sentence  : {sentences_fr_input[0]}")
print(f"cleaned french (end) sentence    : {sentences_fr_output[0]}")

cleaned english sentence         : new jersey is sometimes quiet during autumn  and it is snowy in april
cleaned french sentence          : new jersey est parfois calme pendant l automne  et il est neigeux en avril
cleaned french (start) sentence  : <start> new jersey est parfois calme pendant l automne  et il est neigeux en avril
cleaned french (end) sentence    : new jersey est parfois calme pendant l automne  et il est neigeux en avril <end>


In [10]:
from tensorflow.keras.preprocessing.text import  Tokenizer

# The French tokenizer disables character filtering (filters='') so that the
# angle brackets of the `<start>` / `<end>` markers are kept as part of the token.
tokenizer_en = Tokenizer(oov_token='oov')
tokenizer_fr = Tokenizer(oov_token='oov', filters='')

tokenizer_en.fit_on_texts(sentences_en)
# Fitted on both variants so the vocabulary contains `<start>` as well as `<end>`.
tokenizer_fr.fit_on_texts(sentences_fr_input)
tokenizer_fr.fit_on_texts(sentences_fr_output)

tokenized_en = tokenizer_en.texts_to_sequences(sentences_en)
tokenized_fr_input  = tokenizer_fr.texts_to_sequences(sentences_fr_input)
tokenized_fr_output = tokenizer_fr.texts_to_sequences(sentences_fr_output)

# +1 leaves room for index 0, which the padded sequences use as the padding value.
vocab_en = len(tokenizer_en.word_index) + 1
vocab_fr = len(tokenizer_fr.word_index) + 1

# The target language is French; the labels previously said "hindi".
print(f"vocab english : {vocab_en}")
print(f"vocab french  : {vocab_fr}")

# These are the configured caps, not lengths measured from the data.
print(f"longest english sentence : {maximum_input_length}")
print(f"longest french sentence  : {maximum_output_length}")

vocab english : 201
vocab hindi   : 348
longest english sentence : 20
longest hindi sentence   : 20


# why pre-padding input, and post-padding output?

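- An LSTM carries its state forward one step at a time, so the final state is influenced most by the latest inputs. Pre-padding the input places the real words at the end of the sequence, right before the final state is read, so more of the sentence is remembered.
- The output is generated word by word from left to right, so the useful words must come first; post-padding keeps the real tokens at the start of the sequence and the padding at the end.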

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# English inputs are pre-padded: zeros first, real token ids last.
padded_en = pad_sequences(tokenized_en, maxlen=maximum_input_length, padding='pre')

# Decoder input (starts with <start>) and decoder target (ends with <end>) are
# post-padded: real token ids first, zeros after.
padded_fr_input  = pad_sequences(tokenized_fr_input, maxlen=maximum_output_length, padding='post')
padded_fr_output = pad_sequences(tokenized_fr_output, maxlen=maximum_output_length, padding='post')

# repr() is used so the line break inside the printed array shows up as \n.
print(repr(f"english padded       : {padded_en[0]}"))
print(repr(f"french input padded  : {padded_fr_input[0]}"))
print(repr(f"french output padded : {padded_fr_output[0]}"))

'english padded       : [ 0  0  0  0  0  0  0 18 24  2  9 68  5 40  8  4  2 56  3 45]'
'french input padded  : [  5  37  36   2  11  69  39  14  27   9   4   2 114   3  52   0   0   0\n   0   0]'
'french output padded : [ 37  36   2  11  69  39  14  27   9   4   2 114   3  52   6   0   0   0\n   0   0]'


# Train-Test split

In [12]:
from sklearn.model_selection import train_test_split

# The three arrays are split together so row i of each still belongs to the same
# sentence pair. A fixed random_state makes the 80/20 split reproducible across
# kernel restarts.
x_train_en, x_test_en, x_train_fr, x_test_fr, y_train_fr, y_test_fr = train_test_split(
    padded_en, padded_fr_input, padded_fr_output,
    test_size=0.2,
    random_state=42,
)

# GloVe embeddings: each word is represented by a 200-dimensional vector
- every word in the GloVe file maps to a vector of 200 values
- the word → vector pairs are stored in the `glove_embedding` dict

In [13]:
import numpy as np

# Maps each word in the GloVe file to its vector as a float32 NumPy array.
glove_embedding = dict()

with open("/kaggle/input/glove-embeddings/glove.6B.200d.txt", encoding='utf-8') as f:
    for line in f:
        values = line.split()

        # Skip blank lines instead of failing on values[0].
        if not values:
            continue

        word = values[0]                                    # word
        # Parse the numbers now: without an explicit dtype the array would hold
        # strings, which is far larger in memory and defers parsing to later cells.
        vectors = np.asarray(values[1:], dtype="float32")   # vector representation of that word

        glove_embedding[word] = vectors

In [14]:
# Row i of the matrix is the GloVe vector of the English token with index i.
# Tokens that are missing from GloVe (and the unused index 0) stay all-zero.
embedding_matrix = np.zeros((vocab_en, 200))

for token, row in tokenizer_en.word_index.items():
    if token in glove_embedding:
        embedding_matrix[row] = glove_embedding[token]

# Model architecture
- **encoder module**: reads the input words and produces states that summarise the meaning of the sentence
- **decoder module**: starts from those states and the special token `<start>`, generates the output words one at a time, and finishes with `<end>`

In [23]:
from tensorflow.keras.layers import Input, Embedding, LSTM

# Encoder
#   - input: a fixed-length sequence of `maximum_input_length` token ids
#   - embedding: 200-d vectors, initialised from `embedding_matrix`
#   - LSTM: returns the output at every timestep (used by attention) plus the
#     final hidden and cell states (used to initialise the decoder)
encoder_inputs = Input(
    shape=(maximum_input_length,),
    name="encoder_input",
)
encoder_embedding = Embedding(
    input_dim=vocab_en,
    output_dim=200,
    weights=[embedding_matrix],
    input_length=maximum_input_length,
    name="encoder_embedding",
)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(
    256,
    return_sequences=True,
    return_state=True,
    name="encoder_lstm",
)(encoder_embedding)
encoder_states = [state_h, state_c]

In [24]:
# Decoder
#   - input: a fixed-length sequence of `maximum_output_length` token ids
#   - embedding: each token becomes a 200-d vector (learned, no pretrained weights)
#   - LSTM: starts from the encoder's final states and returns one output per
#     timestep; a dense layer added in a later cell turns these into word
#     probabilities
decoder_inputs = Input(
    shape=(maximum_output_length,),
    name="decoder_input",
)
decoder_embedding = Embedding(
    input_dim=vocab_fr,
    output_dim=200,
    input_length=maximum_output_length,
    name="decoder_embedding",
)(decoder_inputs)
# The decoder's own final states are not needed while training, so they are discarded.
decoder_lstm_outputs, _, _ = LSTM(
    256,
    return_sequences=True,
    return_state=True,
    name="decoder_lstm",
)(decoder_embedding, initial_state=encoder_states)

In [28]:
from tensorflow.keras.layers import Attention, Concatenate, TimeDistributed, Dense
from tensorflow.keras.models import Model

# Luong (dot-product) attention
#
#   What attention does:
#     - for every word the decoder produces, it can look at the encoder output of
#       every input timestep, not only at the encoder's last state
#     - so each input word gets its own weight when an output word is produced
#
#   Here the query is the decoder LSTM output and the value is the encoder output
#   sequence; the layer returns one context vector per decoder timestep.
#
#   Luong attention    : the weight comes from a dot product of the two vectors
#   Bahdanau attention : the two vectors are combined additively through extra
#                        trainable weights (slightly more work to train)
attention = Attention(name="luong_attention")
context_vector = attention([decoder_lstm_outputs, encoder_outputs])

# Join the context vector and the decoder output along the feature axis.
decoder_combined_context = Concatenate(
    axis=-1,
    name="attention_concatenate",
)([context_vector, decoder_lstm_outputs])

# Output layer: a softmax over the French vocabulary at every decoder timestep.
output = TimeDistributed(
    Dense(vocab_fr, activation="softmax"),
    name="decoder_output",
)(decoder_combined_context)

In [29]:
# Training model: (encoder tokens, decoder input tokens) -> per-timestep word probabilities.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)

# Sparse categorical cross-entropy is used because the targets are integer token
# ids, not one-hot vectors.
# NOTE(review): neither embedding sets mask_zero and no sample weights are passed,
# so padded positions appear to count toward the loss and accuracy -- confirm.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [34]:
# Teacher forcing: the decoder receives the <start>-prefixed French sentence and is
# trained to produce the same sentence ending in <end>.
history = model.fit(
    x=[x_train_en, x_train_fr],
    y=y_train_fr,
    # the held-out 20% split is used for validation after every epoch
    validation_data=([x_test_en, x_test_fr], y_test_fr),
    # weights are updated after every batch of 16 sentence pairs
    batch_size=16,
    epochs=2,
)

Epoch 1/2
[1m6893/6893[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 10ms/step - accuracy: 0.9671 - loss: 0.1670 - val_accuracy: 0.9927 - val_loss: 0.0242
Epoch 2/2
[1m6893/6893[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 10ms/step - accuracy: 0.9935 - loss: 0.0212 - val_accuracy: 0.9940 - val_loss: 0.0197


# creating a prediction model

In [None]:
# Inference encoder
#   Reuses the trained encoder graph: takes a padded, tokenized English sequence
#   and returns the per-timestep encoder outputs plus the final hidden and cell
#   states.
#   The model is named "encoder_model"; it was previously named "encoder_input",
#   which is also the name of the Input layer it contains.
encoder_model = Model(
    encoder_inputs,
    [encoder_outputs, state_h, state_c],
    name="encoder_model",
)

In [35]:
# Inference decoder (single step)
#
#   Inputs: the previous hidden/cell state, the encoder output sequence (consumed
#   by the attention layer in the next cell) and one token id.
#   The trained embedding and LSTM layers are fetched from `model` by name, so
#   their learned weights are reused.
#   The translation loop calls this step repeatedly, feeding back the predicted
#   token and the returned states, until <end> appears or the maximum output
#   length is reached.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_hidden_state_input = Input(shape=(maximum_input_length, 256))

decoder_single_input = Input(shape=(1,))
decoder_single_embed = model.get_layer("decoder_embedding")(decoder_single_input)

decoder_outputs, dec_h, dec_c = model.get_layer("decoder_lstm")(
    decoder_single_embed,
    initial_state=[decoder_state_input_h, decoder_state_input_c],
)

In [36]:
# Attention at inference time
#
#   The trained attention, concatenate and output layers are reused by name. The
#   single-step decoder output is the query and the encoder output sequence is
#   the value, giving one context vector that weights every input word for the
#   word being generated.
context = model.get_layer("luong_attention")(
    [decoder_outputs, decoder_hidden_state_input]
)
concat = model.get_layer("attention_concatenate")([context, decoder_outputs])
output_tokens = model.get_layer("decoder_output")(concat)

# One decoding step: (token, encoder outputs, h, c) -> (word probabilities, new h, new c).
decoder_model = Model(
    [
        decoder_single_input,
        decoder_hidden_state_input,
        decoder_state_input_h,
        decoder_state_input_c,
    ],
    [output_tokens, dec_h, dec_c],
)

In [37]:
def build_idx2word(tokenizer):
    """Invert `tokenizer.word_index` into an index -> word dictionary."""
    vocabulary = tokenizer.word_index
    return dict(zip(vocabulary.values(), vocabulary.keys()))

# Reverse lookups (token id -> word) for the English and French tokenizers;
# the translation function uses the French table to decode predicted ids.
idx2word_input  = build_idx2word(tokenizer_en)
idx2word_output = build_idx2word(tokenizer_fr)

In [42]:
def translate_english_to_french(input_seq):
    """Translate one English sentence to French with greedy decoding.

    The sentence is cleaned, tokenized and padded, then encoded once with
    `encoder_model`. Decoding starts from the `<start>` token and repeatedly
    feeds the most probable token back into `decoder_model` until `<end>` is
    predicted or `maximum_output_length` steps have run.

    Args:
        input_seq: The English sentence (any object accepted by `clean_sentences`).

    Returns:
        The predicted French words joined by single spaces (may be empty).
    """
    seq = clean_sentences(input_seq)
    seq = tokenizer_en.texts_to_sequences([seq])
    # Mirror the training data: it kept the FIRST `maximum_input_length` words and
    # was pre-padded. The pad_sequences default (truncating='pre') would instead
    # drop the beginning of an over-long sentence.
    seq = pad_sequences(seq, maxlen=maximum_input_length, padding='pre', truncating='post')

    enc_outs, h, c = encoder_model.predict(seq, verbose=False)

    # Both markers are looked up the same way, straight from the vocabulary.
    start_id = tokenizer_fr.word_index["<start>"]
    end_id = tokenizer_fr.word_index["<end>"]

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_id

    output_sentence = []

    for _ in range(maximum_output_length):
        output_tokens, h, c = decoder_model.predict([target_seq, enc_outs, h, c], verbose=False)
        idx = int(np.argmax(output_tokens[0, 0, :]))

        if idx == end_id:
            break

        # Index 0 is the padding value and has no entry in idx2word_output.
        if idx > 0:
            output_sentence.append(idx2word_output[idx])

        # Feed the predicted token back in as the next decoder input.
        target_seq[0, 0] = idx

    return " ".join(output_sentence)

In [44]:
# Translate one sample sentence and print the input next to the prediction.
english_sentence = "california is usually quiet during march  and it is usually hot in june"
translation = translate_english_to_french(english_sentence)

print(f"english (given)     : {english_sentence}")
print(f"french (prediction) : {translation}")

english (given)     : california is usually quiet during march  and it is usually hot in june
french (prediction) : california est généralement calme en mars et il est généralement chaud en juin


# BLEU score

In [45]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [46]:
# Pick up to 100 distinct sentences for the BLEU check. A seeded generator makes
# the sample reproducible, and replace=False stops the same sentence from being
# drawn more than once.
# NOTE(review): the sample is drawn from the full corpus, so it can include
# sentences the model was trained on; the score is not a held-out estimate.
sentences_idx = np.random.default_rng(42).choice(
    sentences_en.index,
    size=min(100, len(sentences_en)),
    replace=False,
)

actual_en = sentences_en[sentences_idx]
actual_fr = sentences_fr[sentences_idx]
predicted_fr = [translate_english_to_french(sentence) for sentence in actual_en]

In [47]:
import sacrebleu

# corpus_bleu takes the hypotheses and a list of reference streams; there is a
# single reference per sentence here, hence the one-element outer list.
bleu = sacrebleu.corpus_bleu(predicted_fr, [actual_fr.to_list()])
# The metric is BLEU; the label previously misspelled it as "belu".
print(f"BLEU score: {bleu.score:.2f}")

belu score: 98.16
