In [None]:

# Task 1: Train LSTM NMT Model + Apply Beam Search


import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import sentence_bleu




In [None]:


# 1️ Create a tiny dataset (English → Spanish)

# Parallel corpus: each entry is an (English, Spanish) sentence pair.
train_data = [
  ("good morning", "buenos dias"),
  ("good night", "buenas noches"),
  ("how are you", "como estas"),
  ("thank you", "gracias"),
  ("i love you", "te amo"),
  ("see you soon", "hasta pronto"),
  ("where are you", "donde estas"),
  ("have a nice day", "que tengas un buen dia")
]


# Split the pairs into two aligned lists (same index = same sentence pair).
# The Spanish side is wrapped in start/end markers so the decoder has an
# explicit first input token and an explicit stop token.
eng_texts = [english for english, _ in train_data]
spa_texts = ['<start> ' + spanish + ' <end>' for _, spanish in train_data]

In [None]:
# 2️ Tokenize text

# Fit one vocabulary per language.
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(eng_texts)
tokenizer_spa = Tokenizer()
tokenizer_spa.fit_on_texts(spa_texts)

# Sentences as lists of integer token ids.
eng_sequences = tokenizer_eng.texts_to_sequences(eng_texts)
spa_sequences = tokenizer_spa.texts_to_sequences(spa_texts)

# Longest sentence on each side determines the padded width.
max_eng_len = max(map(len, eng_sequences))
max_spa_len = max(map(len, spa_sequences))

# Right-pad every sentence with zeros up to the longest one.
encoder_input_data = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
decoder_input_data = pad_sequences(spa_sequences, maxlen=max_spa_len, padding='post')

# Prepare decoder output (shifted by 1): the one-hot target at step t is the
# decoder input token at step t + 1. Positions past the end of a sentence stay all-zero.
num_spa_tokens = len(tokenizer_spa.word_index) + 1
decoder_output_data = np.zeros((len(spa_texts), max_spa_len, num_spa_tokens))
for row, token_ids in enumerate(spa_sequences):
    for step, token_id in enumerate(token_ids[1:]):
        decoder_output_data[row, step, token_id] = 1.0


In [None]:
# 3️ Build Encoder-Decoder Model

# Width shared by the embedding vectors and the LSTM hidden/cell states.
latent_dim = 256

# Encoder
# Reads a variable-length sequence of English token ids and keeps only the
# final LSTM states (h, c); `encoder_outputs` is not used by the visible cells.
encoder_inputs = Input(shape=(None,))
# input_dim is vocabulary size + 1 (presumably because Tokenizer ids start at 1
# and 0 is the padding value — see pad_sequences above).
enc_emb = Embedding(input_dim=len(tokenizer_eng.word_index) + 1, output_dim=latent_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
# The layer objects (dec_emb_layer, decoder_lstm, decoder_dense) are kept in
# variables because the inference decoder built later calls the same objects.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(input_dim=len(tokenizer_spa.word_index) + 1, output_dim=latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
# return_sequences=True: one output per decoder time step, matching the
# per-step one-hot targets in decoder_output_data.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states.
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# Softmax over the Spanish vocabulary at every time step.
decoder_dense = Dense(len(tokenizer_spa.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (English ids, Spanish input ids) -> per-step token distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# categorical_crossentropy matches the one-hot encoded targets.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()


In [None]:

# 4️ Train

# The decoder is fed the reference sentence and trained to predict the same
# sentence shifted one step ahead (decoder_output_data).
training_inputs = [encoder_input_data, decoder_input_data]
model.fit(
    training_inputs,
    decoder_output_data,
    batch_size=4,
    epochs=300,
    verbose=0,
)

# Persist the trained weights and architecture next to the notebook.
model.save("nmt_lstm_model.h5")
print(" Model trained and saved as nmt_lstm_model.h5")


# 5️ Create inference model

# Inference encoder: English token ids -> [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# At inference time the decoder is run one step at a time, so its LSTM states
# are passed in explicitly and returned after each step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
dec_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Call the same layer objects used by the training model (dec_emb_layer,
# decoder_lstm, decoder_dense) rather than creating new ones.
dec_emb2 = dec_emb_layer(decoder_inputs)
dec_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states_inputs)
dec_outputs2 = decoder_dense(dec_outputs2)
# Inference decoder: (token ids, h, c) -> (token distribution, new h, new c).
decoder_model = Model([decoder_inputs] + dec_states_inputs, [dec_outputs2, state_h2, state_c2])




 Model trained and saved as nmt_lstm_model.h5


In [None]:


# 6️ Beam Search Implementation

def beam_search_decode(input_seq, beam_width=3):
    """Decode one encoded source sentence with beam search.

    Relies on the notebook globals ``encoder_model``, ``decoder_model``,
    ``tokenizer_spa`` and ``max_spa_len``.

    Args:
        input_seq: Padded source token ids for a single sentence, in the form
            accepted by ``encoder_model.predict``.
        beam_width: Number of hypotheses kept after every decoding step.

    Returns:
        A list of at most ``beam_width`` entries ``[token_ids, score, states]``
        ordered best first. ``token_ids`` excludes the start token and, for a
        finished hypothesis, ends with the ``end`` token id. ``score`` is the
        summed negative log-probability of the tokens (lower is better) and
        ``states`` is the decoder ``[h, c]`` after the last generated token.
    """
    start_id = tokenizer_spa.word_index['start']
    end_id = tokenizer_spa.word_index['end']

    # verbose=0 keeps Keras from printing one progress bar per predict call.
    states_value = list(encoder_model.predict(input_seq, verbose=0))
    sequences = [[[], 0.0, states_value]]

    for _ in range(max_spa_len):
        all_candidates = []
        for seq, score, states in sequences:
            if seq and seq[-1] == end_id:
                # A finished hypothesis is carried forward unchanged; expanding
                # it would append tokens after the end marker.
                all_candidates.append([seq, score, states])
                continue

            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = seq[-1] if seq else start_id

            output_tokens, h, c = decoder_model.predict([target_seq] + states, verbose=0)
            log_probs = np.log(output_tokens[0, -1, :] + 1e-9)

            # Only the beam_width most likely continuations of a hypothesis can
            # make it into the global top beam_width, so the rest are skipped.
            # The stable sort keeps the lower token id first on ties.
            best_tokens = np.argsort(-log_probs, kind='stable')[:beam_width]
            for j in best_tokens:
                candidate = [seq + [int(j)], score - float(log_probs[j]), [h, c]]
                all_candidates.append(candidate)

        sequences = sorted(all_candidates, key=lambda cand: cand[1])[:beam_width]

        # Nothing left to expand once every surviving hypothesis has ended.
        if all(seq and seq[-1] == end_id for seq, _, _ in sequences):
            break
    return sequences



In [None]:
# 7️ Test translation

def translate(sentence, beam_width=3):
    """Translate an English sentence and print the beam-search candidates.

    Relies on the notebook globals ``tokenizer_eng``, ``tokenizer_spa``,
    ``max_eng_len`` and ``beam_search_decode``.

    Args:
        sentence: English input text.
        beam_width: Number of hypotheses requested from ``beam_search_decode``.

    Returns:
        None. Prints the input followed by one line per candidate with its
        score (summed negative log-probability, lower is better).
    """
    encoded = tokenizer_eng.texts_to_sequences([sentence])
    encoded = pad_sequences(encoded, maxlen=max_eng_len, padding='post')
    beams = beam_search_decode(encoded, beam_width)

    end_id = tokenizer_spa.word_index.get('end')
    print(f"\nInput: {sentence}")
    for token_ids, score, _ in beams:
        token_ids = list(token_ids)
        # The end marker is a control token, not part of the translation:
        # drop it together with anything that was decoded after it.
        if end_id in token_ids:
            token_ids = token_ids[:token_ids.index(end_id)]
        words = [tokenizer_spa.index_word.get(i, '') for i in token_ids]
        print("Candidate:", ' '.join(words), "| Score:", round(float(score), 3))

# Smoke test on a sentence taken from train_data; prints the candidate translations with their scores.
translate("good morning")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step

Input: good morning
Candidate: buenos dias end end | Score: 1.574
Candidate: buenos dias end noches | Score: 2.737
Candidate: buenos dias end dias | Score: 2.75


In [None]:
# 8️ BLEU Score Example

ref = ['buenos', 'dias']
cand = ['buenos', 'dias']

# sentence_bleu defaults to equal weights over 1- to 4-grams. A 2-token
# sentence contains no 3-grams or 4-grams, so those precisions are zero and the
# default score collapses to ~1e-154 even for a perfect match. Restrict the
# weights to the n-gram orders both sentences can actually contain.
max_order = max(1, min(4, len(ref), len(cand)))
bleu_weights = tuple(1.0 / max_order for _ in range(max_order))
print("BLEU Score:", sentence_bleu([ref], cand, weights=bleu_weights))

BLEU Score: 1.491668146240062e-154
