<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Vowel_Detection/vowel_detection_many_to_many_aligned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [35]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import random
import string

# Config

In [30]:
INPUT_SEQ_LENGTH = 15      # Sequence length for the model (Must match output length for this task)
MAX_WORD_LENGTH = 20       # Maximum length for synthetic word generation
NUM_SAMPLES = 5000
VOCAB_SIZE = 30
EMBEDDING_DIM = 32
LSTM_UNITS = 64
EPOCHS = 20
BATCH_SIZE = 32
PATIENCE = 5

# Special Token IDs
PAD_ID = 0
SOS_ID = 1 # Start of Sequence Token (NOT used in this specific Many-to-Many Aligned task, but kept for concept)
EOS_ID = 2 # End of Sequence Token (Used to signify end of the actual word)
# Actual letters will map from ID 3 onwards

# Data Generation

In [44]:
# Define Alphabet and Mappings
all_letters = list(string.ascii_lowercase)
char_to_int = {char: i + 3 for i, char in enumerate(all_letters)} # Letters start from ID 3
char_to_int['<UNK>'] = 1
char_to_int['<PAD>'] = 0

def get_char_id(char):
    return char_to_int.get(char, char_to_int["<UNK>"])

def generate_synthetic_word_data(num_samples, max_len):
    """Generates words and their corresponding Vowel/Consonant labels."""
    X_sequences, Y_sequences = [], []
    VOWELS = set('aeiou')

    for _ in range(num_samples):
        word_len = random.randint(5, max_len)
        word = ''.join(random.choices(all_letters, k=word_len))

        # Target Y: The label sequence (0 or 1) for EACH character
        labels = [1 if char in VOWELS else 0 for char in word]

        # Input X: The sequence of character IDs
        word_ids = [get_char_id(char) for char in word]

        X_sequences.append(word_ids)
        Y_sequences.append(labels)

    # Pad sequences to a fixed length (MAX_WORD_LENGTH)
    X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post', dtype='int32', value=PAD_ID)
    # Y must also be padded, but with PAD_ID (0), as it corresponds to an input character
    Y_padded = pad_sequences(Y_sequences, maxlen=max_len, padding='post', value=PAD_ID)

    return X_padded, Y_padded

# Generate Data
X, Y = generate_synthetic_word_data(NUM_SAMPLES, MAX_WORD_LENGTH)
Y = np.expand_dims(Y, axis=-1)  # ✅ تبدیل (N, 20) → (N, 20, 1)
print(f"Generated data shape: X={X.shape}, Y={Y.shape}")

# Split into Train/Test sets (STRATIFY is difficult here, so we skip it for simplicity/speed)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
print(f"Train set size: {len(X_train)}, Test set size: {len(X_test)}")

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("y_train dtype:", y_train.dtype)
print("Model output shape:", model.output_shape)

Generated data shape: X=(5000, 20), Y=(5000, 20, 1)
Train set size: 4000, Test set size: 1000
X_train shape: (4000, 20)
y_train shape: (4000, 20, 1)
y_train dtype: int32
Model output shape: (None, 20, 1)


# Model Definition

In [45]:
# تعریف DROPOUT_RATE در Config (اگر ندارید)
DROPOUT_RATE = 0.5

input_layer = Input(shape=(MAX_WORD_LENGTH,), name='input_layer')
embedding_layer = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(input_layer)

bilstm_layer = Bidirectional(
    GRU(LSTM_UNITS, recurrent_dropout=0.2, return_sequences=True),
    name='bilstm_layer'
)(embedding_layer)

dropout_layer = Dropout(DROPOUT_RATE)(bilstm_layer)
output_layer = Dense(1, activation='sigmoid', name='output_layer')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

# Model Training

In [46]:
callbacks = [
    ModelCheckpoint(
        'best_vowel_model.keras', save_best_only=True, monitor='val_accuracy', mode='max'),
    EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True)
]

# Note: X_train and y_train are both (N, MAX_WORD_LENGTH)
history = model.fit(
    X_train, y_train, # Input and Target have the same sequence length
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.15,
    callbacks=callbacks
)


Epoch 1/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 62ms/step - accuracy: 0.9090 - loss: 0.3811 - val_accuracy: 1.0000 - val_loss: 0.0031
Epoch 2/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - accuracy: 1.0000 - loss: 0.0022 - val_accuracy: 1.0000 - val_loss: 4.6174e-04
Epoch 3/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 1.0000 - loss: 5.5309e-04 - val_accuracy: 1.0000 - val_loss: 2.0161e-04
Epoch 4/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 52ms/step - accuracy: 1.0000 - loss: 2.7603e-04 - val_accuracy: 1.0000 - val_loss: 1.1692e-04
Epoch 5/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 1.0000 - loss: 1.7198e-04 - val_accuracy: 1.0000 - val_loss: 7.6478e-05
Epoch 6/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 40ms/step - accuracy: 1.0000 - loss: 1.1555e-04 - val_accuracy: 1.0000 - val_l

# Evaluation

In [48]:
from sklearn.metrics import classification_report

best_model = tf.keras.models.load_model('best_vowel_model.keras')
test_loss, test_accuracy = best_model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Generate detailed classification report
y_pred_probs = best_model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

# Flatten y_test and y_pred before passing to classification_report
print("\nClassification Report:")
print(classification_report(y_test.flatten(), y_pred.flatten(), target_names=['Consonant (0)', 'Vowel (1)']))

# Example Inference
test_word = "programming" # A word NOT seen during training
# Must manually tokenize/pad the new word
test_ids = [get_char_id(c) for c in test_word]
test_padded = pad_sequences([test_ids], maxlen=MAX_WORD_LENGTH, padding='post')

prediction_probs = best_model.predict(test_padded)
# For sequence output, we check the probability for each position
predicted_labels = (prediction_probs[0] > 0.5).astype(int)

print("\n--- Inference Example ---")
print(f"Word: {test_word}")
print(f"Prediction Shape (One per letter): {prediction_probs.shape}")
print(f"Predicted Tags (0=C, 1=V): {predicted_labels.flatten()[:len(test_word)]}")

Test Accuracy: 100.00%
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step

Classification Report:
               precision    recall  f1-score   support

Consonant (0)       1.00      1.00      1.00     17588
    Vowel (1)       1.00      1.00      1.00      2412

     accuracy                           1.00     20000
    macro avg       1.00      1.00      1.00     20000
 weighted avg       1.00      1.00      1.00     20000

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step

--- Inference Example ---
Word: programming
Prediction Shape (One per letter): (1, 20, 1)
Predicted Tags (0=C, 1=V): [0 0 1 0 0 1 0 0 1 0 0]
