In [1]:
# Encoder-Decoder with Attention: Educational Notebook

# 📘 Introduction
# In this notebook, we implement a simple encoder-decoder model with attention
# to understand how sequence-to-sequence models work in NLP.

# 🛠️ Requirements
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 🧾 Sample Data (English to French)
english_sentences = ["i love you", "he is smart", "she is nice"]
french_sentences = ["je t'aime", "il est intelligent", "elle est gentille"]

# 🔠 Tokenization
input_tokenizer = Tokenizer()
target_tokenizer = Tokenizer()

input_tokenizer.fit_on_texts(english_sentences)
target_tokenizer.fit_on_texts(["<start> "+s+" <end>" for s in french_sentences])

input_sequences = input_tokenizer.texts_to_sequences(english_sentences)
target_sequences = target_tokenizer.texts_to_sequences(["<start> "+s+" <end>" for s in french_sentences])

max_input_len = max([len(seq) for seq in input_sequences])
max_target_len = max([len(seq) for seq in target_sequences])

encoder_input_data = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
decoder_input_data = pad_sequences([seq[:-1] for seq in target_sequences], maxlen=max_target_len-1, padding='post')
decoder_target_data = pad_sequences([seq[1:] for seq in target_sequences], maxlen=max_target_len-1, padding='post')

# 🧱 Parameters
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
embedding_dim = 64
lstm_units = 128

# 🔧 Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = tf.keras.layers.Embedding(input_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True, return_sequences=True)(enc_emb)
encoder_states = [state_h, state_c]

# 🔧 Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = tf.keras.layers.Embedding(target_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm_output, _, _ = LSTM(lstm_units, return_sequences=True, return_state=True)(dec_emb, initial_state=encoder_states)

# 🔍 Attention
attention_layer = Attention()
context_vector = attention_layer([decoder_lstm_output, encoder_lstm])
concat = tf.keras.layers.Concatenate(axis=-1)([decoder_lstm_output, context_vector])
decoder_dense = Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(concat)

# ✅ Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

# 🧪 Dummy Training (for illustration only)
y = np.expand_dims(decoder_target_data, -1)
model.fit([encoder_input_data, decoder_input_data], y, epochs=100, verbose=0)

# 📌 Notes:
# - This model learns to translate short English sentences to French using attention.
# - Attention allows the decoder to focus on relevant parts of the input sequence.
# - This notebook is meant for educational purposes and is not production-grade.

# 🧠 Next: You can expand this to use real datasets like many-to-many translation tasks using datasets like Multi30k or WMT.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 64)             576       ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 64)             640       ['input_2[0][0]']             
                                                                                              

2025-06-29 21:50:39.835021: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 12 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


<keras.src.callbacks.History at 0x357d1a940>