<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Simple%20Grammatical%20Correction%20(Seq2Seq)/simple_grammar_correction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [21]:
import pandas as pd
import numpy as np
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, GRU, Dense, RepeatVector, TimeDistributed, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Synthetic Data
*Ground Truth*

In [4]:
CORRECT_SENTENCES = [
    "the cat sat on the mat",
    "i love to watch machine learning lectures",
    "this is a very simple test sentence",
    "the quick brown fox jumps over",
    "neural networks are very powerful tools"
]

# Add noise (Grammatical Errors)
def inject_noise(sentences):
    noisy_corpus = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) < 2:
            noisy_corpus.append(sentence)
            continue

        # Random indices
        swap_index = random.randint(0, len(words) - 2)
        # Swap
        words[swap_index], words[swap_index + 1] = words[swap_index + 1], words[swap_index]
        noisy_corpus.append(' '.join(words))
    return noisy_corpus

# Generate dataset
X_train_noisy = inject_noise(CORRECT_SENTENCES)
Y_train_correct = CORRECT_SENTENCES

print(f"Input (Noisy) {X_train_noisy[0]}")
print(f"Target (Correct) {Y_train_correct[0]}")

Input (Noisy) the cat sat on mat the
Target (Correct) the cat sat on the mat


# Tokenization
*For Seq2Seq models we must vectorize both Input & Output (Convert to integers) and pad them*

In [6]:
# Tokenization
tokenizer = Tokenizer(oov_token="<unk>")
tokenizer.fit_on_texts(X_train_noisy + Y_train_correct)

VOCAB_SIZE = len(tokenizer.word_index) + 1
print(f"Vocabulary Size: {VOCAB_SIZE}")

Vocabulary Size: 31


In [11]:
# Convert to int
X_train_seq = tokenizer.texts_to_sequences(X_train_noisy)
Y_train_seq = tokenizer.texts_to_sequences(Y_train_correct)
max_len = max([len(x) for x in X_train_seq + Y_train_seq])

print(f"Input Sequence: {X_train_seq[0]}")
print(f"Target Sequence: {Y_train_seq[0]}\n\n")

# Padding
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
Y_train_padded = pad_sequences(Y_train_seq, maxlen=max_len, padding='post')

print(f"X_train_padded: {X_train_padded[0]}")
print(f"Y_train_padded: {Y_train_padded[0]}\n\n")

# Add special tokens
SOS_TOKEN_ID = 0
Y_decoder_input = np.zeros((len(Y_train_padded), max_len))
Y_decoder_input[:, 1:] = Y_train_padded[:, :-1]

print(f"Y_decoder_input: {Y_decoder_input[0]}")

Input Sequence: [2, 4, 5, 6, 7, 2]
Target Sequence: [2, 4, 5, 6, 2, 7]


X_train_padded: [2 4 5 6 7 2 0]
Y_train_padded: [2 4 5 6 2 7 0]


Y_decoder_input: [0. 2. 4. 5. 6. 2. 7.]


# Seq2Seq (Encoder-Decoder) Model Definition

In [34]:
EMBEDDING_DIM = 32
LSTM_UNITS = 32
TARGET_LENGTH = max_len

# Encoder
encoder_inputs = Input(shape=(max_len, ), name="encoder_input")
encoder_emb = Embedding(
    VOCAB_SIZE, EMBEDDING_DIM, name="encoder_embedding")(encoder_inputs)
_, encoder_state = GRU(LSTM_UNITS, return_state=True, name="encoder_gru")(encoder_emb)

# Decoder
decoder_inputs = Input(shape=(TARGET_LENGTH, 1)) # Teacher forcing input
decoder_gru = GRU(
    LSTM_UNITS, return_sequences=True)(decoder_inputs, initial_state=encoder_state)
output_layer = TimeDistributed(
    Dense(VOCAB_SIZE, activation='softmax'))(decoder_gru)

# Final model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output_layer)

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Training

In [35]:
EPOCHS = 100
BATCH_SIZE = 32

callbacks = [
    ModelCheckpoint('best_reversal_model.keras', save_best_only=True, monitor='val_accuracy', mode='max'),
    # EarlyStopping(monitor='val_loss', patience=35, restore_best_weights=True)
]

print("\n--- Starting Model Training with Teacher Forcing ---")
history = model.fit(
    [X_train_padded, Y_decoder_input],
    Y_train_padded,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks=callbacks
)


--- Starting Model Training with Teacher Forcing ---
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.0357 - loss: 3.5987 - val_accuracy: 0.0000e+00 - val_loss: 3.7777
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - accuracy: 0.0357 - loss: 3.5793 - val_accuracy: 0.0000e+00 - val_loss: 3.7760
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.1071 - loss: 3.5604 - val_accuracy: 0.0000e+00 - val_loss: 3.7745
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - accuracy: 0.1429 - loss: 3.5419 - val_accuracy: 0.0000e+00 - val_loss: 3.7733
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.1429 - loss: 3.5239 - val_accuracy: 0.0000e+00 - val_loss: 3.7722
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.1429 - loss: 3.5063 - val_a

# Inference

In [36]:
from tensorflow.keras.layers import Reshape

encoder_model = Model(encoder_inputs, encoder_state)

decoder_state_input = Input(shape=(LSTM_UNITS,))
decoder_input_single = Input(shape=(1,)) # Input shape for single token

# Reshape the input to be 3D (batch_size, timesteps, features)
decoder_input_reshaped = Reshape((1, 1))(decoder_input_single)

dec_out, dec_state = GRU(LSTM_UNITS, return_sequences=True, return_state=True)(decoder_input_reshaped, initial_state=decoder_state_input)
decoder_output_single = Dense(VOCAB_SIZE, activation='softmax')(dec_out)

decoder_model = Model([decoder_input_single, decoder_state_input], [decoder_output_single, dec_state])

# Decoding Function

In [37]:
def decode_sequence(input_seq_padded):
    # 1. Get the initial state from the Encoder
    state_value = encoder_model.predict(np.expand_dims(input_seq_padded, 0))

    # 2. Start with the start-of-sequence token (SOS - assuming ID 1, if 0 is padding)
    # Based on the tokenization and padding, the SOS token is not explicitly added
    # and padding is 0. We should start with the first token of the target sequence during inference
    # or use a dedicated SOS token if added during data preparation.
    # For this example, let's assume we start with an arbitrary token and rely on the model's learning.
    # A better approach would be to have a dedicated SOS token with a known ID.
    # Given the current setup, let's start with the token ID for '<unk>' which is 1 based on the tokenizer.
    next_token_id = tokenizer.word_index.get('<unk>', 1) # Use 1 as a fallback if <unk> not found

    decoded_sequence_ids = []

    for _ in range(max_len):
        # Current input for the Decoder: (1, 1, 1) - a single token
        decoder_input_step = np.array([[next_token_id]])

        # Predict the current step
        output, state_value = decoder_model.predict([decoder_input_step, state_value])

        # Get the ID of the token with the highest probability
        sampled_token_id = np.argmax(output[0, -1, :])

        # Stop if the end-of-sequence token is generated (assuming EOS is ID)
        # Based on padding=post and max_len, 0 is used for padding.
        # We can assume 0 as an implicit EOS token if the model learns to output it after the sequence.
        if sampled_token_id == 0:
            break

        decoded_sequence_ids.append(sampled_token_id)

        # Update the input for the next step
        next_token_id = sampled_token_id

    # Convert IDs to text
    final_text = tokenizer.sequences_to_texts([decoded_sequence_ids])[0]
    return final_text

# --- Test Execution ---
test_input = X_train_padded[0] # A training sample for testing
test_input_text = X_train_noisy[0] # Get the original noisy text

print("Original Noisy Text:", test_input_text)
print("Predicted Correct Text:", decode_sequence(test_input))

Original Noisy Text: the cat sat on mat the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Predicted Correct Text: over over the test test test test
