<a href="https://colab.research.google.com/github/nijatmaharramov/NLP_Projects/blob/main/translator_en_es.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this project we will create a translator from English to Spanish using different methods (RNNs, a Transformer model, etc.).

# An Encoder-Decoder Network for Neural Machine Translation

In [None]:
# First we download the data
from pathlib import Path
import tensorflow as tf

url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
path = tf.keras.utils.get_file("spa-eng.zip", origin=url, cache_dir="datasets", extract=True)

# The location of the unpacked archive appears to depend on the installed
# Keras version (TODO confirm against the get_file docs): either a sibling
# "spa-eng_extracted" directory or a plain "spa-eng" directory next to the
# download. Probe both layouts instead of hard-coding a single one.
download_dir = Path(path).parent
candidate_paths = [
    download_dir / "spa-eng_extracted" / "spa-eng" / "spa.txt",
    download_dir / "spa-eng" / "spa.txt",
]
# If neither exists, fall back to the first candidate so that read_text()
# below still raises FileNotFoundError for the originally expected location.
spa_txt_path = next((p for p in candidate_paths if p.is_file()), candidate_paths[0])

# Read the whole corpus into memory and show the first 500 characters as a
# quick sanity check.
text = spa_txt_path.read_text(encoding='utf-8')
print(text[:500])

In [None]:
import numpy as np

# Drop the inverted Spanish punctuation marks in a single pass.
text = text.translate(str.maketrans("", "", "¡¿"))

# Each line is split on tabs into its fields.
pairs = [row.split("\t") for row in text.splitlines()]

# Fixed seed so the in-place shuffle (and therefore the later train/valid
# split) is reproducible.
np.random.seed(42)
np.random.shuffle(pairs)

# Separate the shuffled pairs into two parallel tuples.
sentences_en, sentences_es = zip(*pairs)

In [None]:
# Show the first three sentence pairs after shuffling.
for i in range(3):
    print(f"{sentences_en[i]} => {sentences_es[i]}")

In [None]:
vocab_size = 1000
max_length = 50

# One vectorizer per language; both cap the vocabulary at vocab_size tokens
# and emit sequences of exactly max_length ids.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

# The Spanish vectorizer is adapted on sentences wrapped in the
# startofseq/endofseq markers so that both markers become vocabulary entries.
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt(
    ['startofseq ' + s + ' endofseq' for s in sentences_es]
)

In [None]:
# Show the first 10 entries of the English vocabulary learned by adapt().
text_vec_layer_en.get_vocabulary()[:10]  # most used words in the English vocabulary

In [None]:
# Same peek for the Spanish vocabulary, which was adapted on sentences
# wrapped in the startofseq/endofseq markers.
text_vec_layer_es.get_vocabulary()[:10]

In [None]:
# The first 100,000 shuffled pairs are used for training, the rest for validation.
train_size = 100_000
es_train, es_valid = sentences_es[:train_size], sentences_es[train_size:]

# Encoder inputs: raw English sentences.
X_train = tf.constant(sentences_en[:train_size])
X_valid = tf.constant(sentences_en[train_size:])

# Decoder inputs: Spanish sentences prefixed with the start marker.
X_train_dec = tf.constant(['startofseq ' + s for s in es_train])
X_valid_dec = tf.constant(['startofseq ' + s for s in es_valid])

# Targets: the same Spanish sentences followed by the end marker, as token
# ids, i.e. the decoder input shifted by one token.
Y_train = text_vec_layer_es([s + ' endofseq' for s in es_train])
Y_valid = text_vec_layer_es([s + ' endofseq' for s in es_valid])

In [None]:
tf.random.set_seed(42)

# Both model inputs are scalar raw strings (one sentence per sample);
# tokenization happens inside the model.
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

In [None]:
embed_size = 128

# Raw strings -> integer token ids, one vectorizer per language.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_inputs_ids = text_vec_layer_es(decoder_inputs)

# Separate embedding tables for the two languages; mask_zero=True is set on
# both layers.
encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)

# Token ids -> embed_size-dimensional vectors.
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_inputs_ids)

In [None]:
# With return_state=True the call returns the output followed by the LSTM
# states; the starred target collects those states into a list.
encoder = tf.keras.layers.LSTM(units=512, return_state=True)
encoder_outputs, *encoder_state = encoder(encoder_embeddings)

In [None]:
# The decoder starts from the encoder's final states and emits one output
# per time step (return_sequences=True).
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [None]:
# Softmax over vocab_size units, applied to the decoder outputs.
output_layer = tf.keras.layers.Dense(units=vocab_size, activation='softmax')
Y_proba = output_layer(decoder_outputs)

In [None]:
# Assemble the encoder-decoder graph into a trainable model.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)

# The targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    (X_train, X_train_dec),
    Y_train,
    epochs=3,
    validation_data=((X_valid, X_valid_dec), Y_valid),
)

In [None]:
def translate(sentence_en):
    """Greedily translate an English sentence with the global ``model``.

    The decoder is re-run once per generated word: at each step the
    translation built so far (prefixed with ``startofseq``) is fed back in,
    and the most probable word at the current position is appended.

    Args:
        sentence_en: The English sentence to translate.

    Returns:
        The translation as a space-separated string, without the
        ``startofseq``/``endofseq`` markers. Decoding stops at ``endofseq``
        or after ``max_length`` words, whichever comes first.
    """
    # The vocabulary does not change during decoding, so fetch it once
    # instead of rebuilding the list at every step.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = tf.constant([sentence_en])
    translation = ''

    for word_idx in range(max_length):
        # `translation` is empty or starts with a space, so this yields
        # "startofseq", "startofseq w1", "startofseq w1 w2", ...
        X_dec = tf.constant(['startofseq' + translation])
        # Only the prediction at the current position is needed.
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word = vocabulary[np.argmax(y_proba)]
        if predicted_word == 'endofseq':
            break
        translation += ' ' + predicted_word
    return translation.strip()


In [None]:
# Quick sanity check: greedy-decode a short sentence with the model trained above.
translate('i like soccer')

# Bidirectional RNNs

In [None]:
tf.random.set_seed(42)

# The wrapped LSTM has 256 units and returns its states; the Bidirectional
# wrapper runs it over the sequence in both directions.
encoder = tf.keras.layers.Bidirectional(
    layer=tf.keras.layers.LSTM(units=256, return_state=True)
)

In [None]:
class ConcatenateStates(tf.keras.layers.Layer):
    """Merge the states of a bidirectional LSTM into a single state pair.

    The layer expects four state tensors, interleaved as short-term (even
    positions 0 and 2) and long-term (odd positions 1 and 3). It returns
    ``[short_term, long_term]``, each concatenated along the last axis, so
    the result can be used as the ``initial_state`` of a unidirectional LSTM
    that is twice as wide.
    """

    def __init__(self, **kwargs):
        # Forward the standard Layer arguments (name, dtype, trainable, ...)
        # so the layer can be named and re-created from its config.
        super().__init__(**kwargs)

    def call(self, encoder_state):
        return [tf.concat(encoder_state[::2], axis=-1),   # short-term (0 & 2)
                tf.concat(encoder_state[1::2], axis=-1)]  # long-term (1 & 3)


# Run the bidirectional encoder, collect its states and merge them into the
# pair of states that the decoder takes as its initial state.
encoder_outputs, *encoder_state = encoder(encoder_embeddings)
concat_states = ConcatenateStates()
encoder_state = concat_states(encoder_state)

In [None]:
# The 512-unit decoder is initialized with the concatenated encoder states.
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

# Per-step softmax over vocab_size units.
output_layer = tf.keras.layers.Dense(units=vocab_size, activation='softmax')
Y_proba = output_layer(decoder_outputs)

model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    (X_train, X_train_dec),
    Y_train,
    epochs=4,
    validation_data=((X_valid, X_valid_dec), Y_valid),
)

# Beam Search

In [None]:
def beam_search(sentence_en, beam_width, verbose=False):
    """Translate an English sentence with beam search over the global ``model``.

    At every step each unfinished candidate is extended with every
    vocabulary word, scored by the sum of log-probabilities, and only the
    ``beam_width`` best candidates are kept.

    Args:
        sentence_en: The English sentence to translate.
        beam_width: Number of candidate translations kept after each step.
        verbose: If True, print the candidates kept after each step.

    Returns:
        The best-scoring translation with the ``endofseq`` marker removed.
        If ``max_length`` is reached before every candidate is finished, the
        best candidate found so far is returned.
    """
    # Loop invariants: the vocabulary list and the encoder input.
    vocabulary = text_vec_layer_es.get_vocabulary()
    X = tf.constant([sentence_en])  # encoder input

    X_dec = tf.constant(["startofseq"])  # decoder input
    y_proba = model.predict((X, X_dec))[0, 0]  # first token's probas
    top_k = tf.math.top_k(y_proba, k=beam_width)
    top_translations = [  # list of best (log_proba, translation)
        (np.log(word_proba), vocabulary[word_id])
        for word_proba, word_id in zip(top_k.values, top_k.indices)
    ]

    # displays the top first words in verbose mode
    if verbose:
        print("Top first words:", top_translations)

    for idx in range(1, max_length):
        candidates = []
        for log_proba, translation in top_translations:
            if translation.endswith("endofseq"):
                # translation is finished, so keep it but don't extend it
                candidates.append((log_proba, translation))
                continue
            X_dec = tf.constant(["startofseq " + translation])  # decoder input
            y_proba = model.predict((X, X_dec))[0, idx]  # last token's probas
            # Extend this candidate with every vocabulary word.
            candidates.extend(
                (log_proba + np.log(word_proba), f"{translation} {word}")
                for word, word_proba in zip(vocabulary, y_proba)
            )
        top_translations = sorted(candidates, reverse=True)[:beam_width]

        # displays the top translations so far in verbose mode
        if verbose:
            print("Top translations so far:", top_translations)

        if all(tr.endswith("endofseq") for _, tr in top_translations):
            break

    # Also reached when max_length runs out before every beam has finished,
    # so the function always returns a string rather than None.
    return top_translations[0][1].replace("endofseq", "").strip()

In [None]:
# A longer test sentence, decoded greedily first as a baseline for the
# beam-search call that follows.
sentence_en = 'I like soccer and going to the beach'
translate(sentence_en)

In [None]:
# Decode the same sentence keeping 3 candidates per step; verbose=True
# prints the candidates kept after each step.
beam_search(sentence_en, beam_width = 3, verbose=True)

# Attention Mechanisms

In [None]:
tf.random.set_seed(42)

# Fresh scalar-string inputs for the attention model.
encoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)

In [None]:
embed_size = 128

# Raw strings -> integer token ids.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

# New embedding tables for this model; unlike the earlier RNN models, these
# are created without mask_zero.
encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size
)

encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [None]:
# return_sequences=True keeps the encoder output at every time step, which
# the attention layer consumes later; return_state=True additionally
# returns the states used to initialize the decoder.
encoding = tf.keras.layers.Bidirectional(
    layer=tf.keras.layers.LSTM(
        units=256, return_sequences=True, return_state=True
    )
)

encoder_outputs, *encoder_state = encoding(encoder_embeddings)
# Reuse the ConcatenateStates instance created earlier to merge the states.
encoder_state = concat_states(encoder_state)

In [None]:
# 512-unit decoder, initialized with the merged encoder states.
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

In [None]:
# The attention layer receives the decoder outputs first and the encoder
# outputs second.
attention_layer = tf.keras.layers.Attention()
attention_outputs = attention_layer([decoder_outputs, encoder_outputs])

# The word probabilities are computed from the attention outputs.
output_layer = tf.keras.layers.Dense(units=vocab_size, activation='softmax')
Y_proba = output_layer(attention_outputs)

model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    (X_train, X_train_dec),
    Y_train,
    epochs=3,
    validation_data=((X_valid, X_valid_dec), Y_valid),
)

In [None]:
# Greedy-decode the test sentence again; translate() reads the global
# `model`, which now refers to the attention model built above.
sentence_en = 'I like soccer and going to the beach'
translate(sentence_en)

# Transformer Model

In [None]:
import tensorflow as tf

# Hyperparameters for the Transformer section. Note that vocab_size is
# rebound here (it was 1000 for the RNN models above).
vocab_size = 10000
max_length = 50
embed_size = 128
num_heads = 5
ff_dim = 512

# Input layers: unlike the RNN models above, this model takes int32 token
# ids of unspecified length rather than raw strings.
encoder_inputs = tf.keras.Input(shape=(None,), dtype=tf.int32, name="encoder_inputs")
decoder_inputs = tf.keras.Input(shape=(None,), dtype=tf.int32, name="decoder_inputs")



# Embedding layers: one token-embedding table per language.
encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                                    output_dim = embed_size,
                                                    mask_zero = True)

decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                                    output_dim = embed_size,
                                                    mask_zero = True)

encoder_embeddings = encoder_embedding_layer(encoder_inputs)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)


# Positional embeddings: a single embedding table with max_length rows,
# shared by the encoder and the decoder. Position indices 0..seq_len-1 are
# derived from the second dimension of each input.
pos_embedding_layer = tf.keras.layers.Embedding(max_length, embed_size)
positions_encoder = tf.keras.layers.Lambda(lambda x: tf.range(start = 0, limit = tf.shape(x)[1], delta = 1))(encoder_inputs)
positions_decoder = tf.keras.layers.Lambda(lambda x: tf.range(start = 0, limit = tf.shape(x)[1], delta = 1))(decoder_inputs)
pos_embed_enc = pos_embedding_layer(positions_encoder)
pos_embed_dec = pos_embedding_layer(positions_decoder)

# Add token and positional embeddings.
# NOTE(review): the positional tensors are built from a 1-D range, so this
# presumably relies on broadcasting over the batch axis - confirm.
encoder_embed = encoder_embeddings + pos_embed_enc
decoder_embed = decoder_embeddings + pos_embed_dec


# Encoder self-attention: the encoder embeddings are passed as both
# arguments, followed by a residual connection and layer normalization.
encoder_attention = tf.keras.layers.MultiHeadAttention(num_heads = num_heads, key_dim = embed_size)(encoder_embed, encoder_embed)
encoder_attention = tf.keras.layers.LayerNormalization(epsilon = 1e-6)(encoder_embed + encoder_attention)

# Encoder feed-forward: Dense(ff_dim, relu) then Dense(embed_size), again
# followed by a residual connection and layer normalization.
encoder_ff = tf.keras.layers.Dense(ff_dim, activation = 'relu')(encoder_attention)
encoder_ff = tf.keras.layers.Dense(embed_size)(encoder_ff)
encoder_outputs = tf.keras.layers.LayerNormalization(epsilon = 1e-6)(encoder_attention + encoder_ff)


# Decoder self-attention with a causal mask: band_part(ones, -1, 0) keeps
# the lower triangle of a (seq_len, seq_len) matrix of ones, so position i
# may only attend to positions j <= i.
causal_mask = tf.keras.layers.Lambda(
    lambda x: tf.linalg.band_part(tf.ones((tf.shape(x)[1], tf.shape(x)[1])), -1, 0)
)(decoder_inputs)
decoder_attention = tf.keras.layers.MultiHeadAttention(num_heads = num_heads, key_dim=embed_size)(decoder_embed, decoder_embed, attention_mask = causal_mask)
decoder_attention = tf.keras.layers.LayerNormalization(epsilon= 1e-6)(decoder_embed + decoder_attention)

# Encoder-decoder cross-attention: the decoder states are the first
# argument and the encoder outputs are passed as the second and third.
cross_attention = tf.keras.layers.MultiHeadAttention(num_heads = num_heads, key_dim = embed_size)(decoder_attention, encoder_outputs, encoder_outputs)
decoder_cross = tf.keras.layers.LayerNormalization(epsilon = 1e-6)(decoder_attention + cross_attention)



# Decoder feed-forward: same Dense(ff_dim, relu) -> Dense(embed_size)
# pattern as the encoder, with a residual connection and layer normalization.
decoder_ff = tf.keras.layers.Dense(ff_dim, activation = 'relu')(decoder_cross)
decoder_ff = tf.keras.layers.Dense(embed_size)(decoder_ff)
decoder_outputs = tf.keras.layers.LayerNormalization(epsilon = 1e-6)(decoder_cross + decoder_ff)


# Final output layer.
# NOTE: despite the variable name, the softmax activation means these are
# probabilities over vocab_size units, not raw logits.
output_logits = tf.keras.layers.Dense(vocab_size, activation = 'softmax')(decoder_outputs)

# Model mapping (encoder token ids, decoder token ids) to the softmax outputs.
transformer = tf.keras.Model([encoder_inputs, decoder_inputs], output_logits)

In [None]:
# Same training setup as the RNN models: Nadam optimizer, sparse
# categorical cross-entropy loss, accuracy metric.
transformer.compile(optimizer='nadam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

In [None]:
vocab_size = 10000
max_length = 50

# Rebuild both vectorizers with the larger 10,000-token vocabulary used by
# the Transformer; this rebinds the global names used by the earlier cells.
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
    pad_to_max_tokens=True,
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
    pad_to_max_tokens=True,
)

# As before, the Spanish side is adapted on sentences wrapped in the
# startofseq/endofseq markers.
text_vec_layer_en.adapt(sentences_en)
text_vec_layer_es.adapt(
    ['startofseq ' + s + ' endofseq' for s in sentences_es]
)

In [None]:
def _to_padded_ids(vectorizer, texts):
    """Vectorize `texts` and return post-padded token ids as a tensor."""
    token_ids = vectorizer(texts).numpy()
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, padding='post', maxlen=max_length
    )
    return tf.constant(padded)


# Encoder inputs are English, decoder inputs are Spanish; each side must go
# through the vectorizer of its own language. The validation decoder inputs
# were previously encoded with the English vectorizer by mistake.
X_train_padded = _to_padded_ids(text_vec_layer_en, X_train)
X_train_dec_padded = _to_padded_ids(text_vec_layer_es, X_train_dec)
X_valid_padded = _to_padded_ids(text_vec_layer_en, X_valid)
X_valid_dec_padded = _to_padded_ids(text_vec_layer_es, X_valid_dec)

# Y_train / Y_valid were produced by the earlier 1,000-token Spanish
# vectorizer. Re-encode the targets with the current vectorizer so that
# target ids, decoder-input ids and the 10,000-unit output layer all share
# one vocabulary. Separate names keep the RNN targets untouched.
Y_train_transformer = text_vec_layer_es(
    [f'{s} endofseq' for s in sentences_es[:100_000]]
)
Y_valid_transformer = text_vec_layer_es(
    [f'{s} endofseq' for s in sentences_es[100_000:]]
)

transformer.fit(
    (X_train_padded, X_train_dec_padded),
    Y_train_transformer, epochs=3,
    validation_data=((X_valid_padded, X_valid_dec_padded), Y_valid_transformer))

In [None]:
import numpy as np

def translate(sentence_en):
    """Greedily translate an English sentence with the global ``transformer``.

    The decoder input starts as just the ``startofseq`` token id. At each
    step the model is run on the ids generated so far (post-padded to
    ``max_length``) and the most probable id at the last filled position is
    appended.

    Args:
        sentence_en: The English sentence to translate.

    Returns:
        The translation as a space-separated string, without the
        ``startofseq``/``endofseq`` markers. Decoding stops at ``endofseq``
        or after ``max_length`` steps, whichever comes first.
    """
    # Tokenize and pad the encoder input.
    X = text_vec_layer_en(tf.constant([sentence_en]))
    X = tf.keras.preprocessing.sequence.pad_sequences(
        X.numpy(), padding="post", maxlen=max_length
    )

    # Keep marker ids as plain Python ints so the decoder list and the
    # stop comparison below never mix tensors with NumPy scalars.
    start_token = int(text_vec_layer_es(['startofseq'])[0][0])
    end_token = int(text_vec_layer_es(['endofseq'])[0][0])

    # Decoder input initialized with just the start token.
    decoder_input = [start_token]

    for _ in range(max_length):
        decoder_input_padded = tf.keras.preprocessing.sequence.pad_sequences(
            [decoder_input], maxlen=max_length, padding="post"
        )

        # Read the prediction at the last filled decoder position.
        last_position = len(decoder_input) - 1
        y_proba = transformer.predict(
            (X, decoder_input_padded), verbose=0
        )[0, last_position]
        predicted_word_id = int(np.argmax(y_proba))

        if predicted_word_id == end_token:
            break

        decoder_input.append(predicted_word_id)

    # Map token ids back to words, skipping the start token.
    vocab = text_vec_layer_es.get_vocabulary()
    return ' '.join(vocab[token] for token in decoder_input[1:])

In [None]:
# Try the Transformer-based translate() defined above on the same test sentence.
sentence_en = 'I like soccer and going to the beach'
translate(sentence_en)