# 1. Data prep & CNN pipeline for the Arabic **words** model  

This block mounts Google Drive, extracts the words dataset, preprocesses raw WAV files into fixed-length normalized waveforms, encodes labels, and defines a 1D CNN architecture for training a multi-class Arabic word classifier.  

- **Mount & extract:** Mounts Drive, locates `arabic_words_dataset.zip`, unzips into `/content/arabic_words_dataset`, and lists top-level directories to confirm dataset structure.  
- **Dataset layout & classes:** Scans each subfolder (one per word class). Prints number of `.wav` files per class to verify balance.  
- **Audio preprocessing:**  
  - **Loading:** Loads each file with `librosa.load()`, resampled to **16 kHz** (`SR_LOAD=16000`).  
  - **Resampling:** Downsamples to **8 kHz** (`SR_MODEL=8000`) for lighter models.  
  - **Fixed length:** Pads/truncates each clip to exactly **8000 samples** (`FIX_LEN=8000` ≈ 1 sec).  
  - **Normalization:** Standardizes each clip independently: `(x - mean) / (std + 1e-8)`.  
  - **Final shape:** Stores as `(num_samples, 8000, 1)` for Conv1D input.  
- **Labels & encoding:**  
  - Collects `all_label` list, encodes with `LabelEncoder`, and converts to **one-hot vectors** via `to_categorical`.  
  - Prints full class list (truncated if long).  
- **Train/validation split:** Uses `train_test_split` with **stratification** to maintain class balance. Splits **80/20** with reproducible seed (`SEED=777`). Reports shapes of `X_train`, `X_val`.  
- **Model architecture (Conv1D CNN):**  
  - **Input:** (8000, 1).  
  - **Conv blocks:** Four Conv1D layers with filters `[8, 16, 32, 64]` and kernel sizes `[13, 11, 9, 7]`. Each block has: Conv → BatchNorm → MaxPool1D → Dropout(0.3).  
  - **Pooling:** GlobalAveragePooling1D for temporal aggregation.  
  - **Dense layers:** Dense(256) → Dropout(0.3) → Dense(128) → Dropout(0.3).  
  - **Output:** Dense(#classes, softmax).  
- **Optimizer & compile:**  
  - Attempts AdamW (`lr=1e-3, weight_decay=1e-4`), falls back to Adam if unavailable.  
  - Loss: categorical crossentropy.  
  - Metric: accuracy.  
  - Model summary confirms layer stack and parameter counts.  

**Key hyperparameters:** `SR_LOAD=16000`, `SR_MODEL=8000`, `FIX_LEN=8000`, `SEED=777`, conv kernel sizes `[13,11,9,7]`, dropout rate `0.3`, Dense layers `[256,128]`, optimizer `AdamW/Adam (lr=1e-3)`.  

**Purpose:** Converts Arabic word recordings into standardized 8 kHz waveforms and defines a compact but deep 1D CNN for multi-class word recognition (trained in the next section), forming the basis for the app’s higher-level recitation feedback.  

In [None]:
from google.colab import drive, files
from IPython.display import Audio, display

import os
import re
import time
import random
import glob
import pathlib
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile

import librosa
import librosa.display

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, BatchNormalization, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

In [None]:
# One shared seed for every random source used in this notebook.
SEED = 777
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed Python's `random`, NumPy and TensorFlow in turn.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

print("✅ Libraries imported successfully")


In [None]:
drive.mount('/content/drive')

zip_path = "/content/drive/MyDrive/QariAI/arabic_words_dataset.zip"

!mkdir -p /content/arabic_words_dataset
!unzip -q "$zip_path" -d /content/arabic_words_dataset

DATASET_PATH = "/content/arabic_words_dataset"
print("📂 Unzipped dataset contents:")
print(os.listdir(DATASET_PATH)[:10])


In [None]:
# Audio front-end settings used by the preprocessing code in this notebook.
SR_LOAD = 16_000   # sampling rate (Hz) requested from librosa.load
SR_MODEL = 8_000   # sampling rate (Hz) of the waveforms fed to the model
FIX_LEN = 8_000    # samples per clip after padding/truncation (1 s at SR_MODEL)

def norm_wave(w):
    """Standardize one waveform to zero mean and (approximately) unit variance.

    Args:
        w: Array-like of audio samples; converted to ``float32``.

    Returns:
        NumPy array of the same shape holding ``(w - mean) / (std + 1e-8)``.
    """
    samples = np.asarray(w, dtype=np.float32)
    centered = samples - samples.mean()
    # The epsilon keeps a constant (e.g. silent) clip from dividing by zero.
    return centered / (samples.std() + 1e-8)

# Every sub-directory of the dataset root is treated as one word class.
class_audio_train = sorted([d for d in os.listdir(DATASET_PATH) if os.path.isdir(os.path.join(DATASET_PATH, d))])
print(f"🔎 Found {len(class_audio_train)} classes:")
print(class_audio_train)

all_wave = []
all_label = []

for label in class_audio_train:
    folder = os.path.join(DATASET_PATH, label)
    # os.listdir returns entries in arbitrary order; sorting fixes the sample
    # order so the seeded train/validation split is the same on every run.
    wavs = sorted(f for f in os.listdir(folder) if f.lower().endswith(".wav"))
    print(f"⏳ Loading '{label}' ({len(wavs)} files) ...")

    for fname in wavs:
        fpath = os.path.join(folder, fname)
        try:
            # Load at SR_LOAD, then downsample to the model rate.
            y16, _ = librosa.load(fpath, sr=SR_LOAD)
            y8 = librosa.resample(y16, orig_sr=SR_LOAD, target_sr=SR_MODEL)

            # Force every clip to exactly FIX_LEN samples: right-pad with zeros or truncate.
            if len(y8) < FIX_LEN:
                pad = FIX_LEN - len(y8)
                y8 = np.pad(y8, (0, pad), mode="constant")
            elif len(y8) > FIX_LEN:
                y8 = y8[:FIX_LEN]

            y8 = norm_wave(y8)
            all_wave.append(y8)
            all_label.append(label)
        except Exception as e:
            # Best effort: one unreadable file should not abort the whole load.
            print(f"   ⚠️ Skipped {fname}: {e}")

if not all_wave:
    raise RuntimeError(f"No usable .wav files were loaded from {DATASET_PATH}")

# Stack into (num_samples, FIX_LEN, 1), the layout the Conv1D input expects.
all_wave = np.array(all_wave, dtype=np.float32).reshape(-1, FIX_LEN, 1)

# Integer-encode the folder names, then one-hot them for categorical crossentropy.
le = LabelEncoder()
y_idx = le.fit_transform(all_label)
classes = list(le.classes_)
y = to_categorical(y_idx, num_classes=len(classes))

# Stratify on the integer labels so both splits keep the class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    all_wave, y, stratify=y_idx, test_size=0.20, random_state=SEED, shuffle=True
)

print("\n✅ Preprocess complete:")
print(f" - Total samples: {len(all_wave)}")
print(f" - Train: {X_train.shape}, Val: {X_val.shape}")
print(f" - Classes: {len(classes)} -> {classes[:10]}{' ...' if len(classes) > 10 else ''}")


In [None]:
print("🔧 TF version:", tf.__version__)
print("🧠 GPU:", tf.config.list_physical_devices('GPU'))

# Reset global Keras state left over from a previous run of this cell.
tf.keras.backend.clear_session()

# Input length follows the preprocessing constant so the two cannot drift apart.
inputs = Input(shape=(FIX_LEN, 1))

# Four feature blocks with the same layout (Conv -> BatchNorm -> MaxPool -> Dropout);
# only the filter count and kernel width change from block to block.
x = inputs
for n_filters, kernel_size in [(8, 13), (16, 11), (32, 9), (64, 7)]:
    x = Conv1D(n_filters, kernel_size, padding='valid', activation='relu', strides=1)(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(3)(x)
    x = Dropout(0.3)(x)

# Collapse the time axis to one vector per clip.
x = GlobalAveragePooling1D()(x)

x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)

outputs = Dense(len(classes), activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# AdamW lives at `optimizers.AdamW` in newer TF/Keras and at
# `optimizers.experimental.AdamW` in older releases; check both before giving
# up, otherwise weight decay is silently lost on newer versions.
adamw_cls = getattr(tf.keras.optimizers, "AdamW", None)
if adamw_cls is None:
    adamw_cls = getattr(getattr(tf.keras.optimizers, "experimental", None), "AdamW", None)

try:
    if adamw_cls is None:
        raise AttributeError("no AdamW class found in tf.keras.optimizers")
    optimizer = adamw_cls(learning_rate=1e-3, weight_decay=1e-4)
except Exception as e:
    print("ℹ️ AdamW not available, falling back to Adam:", e)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()


# 2. Training & evaluation for the Arabic **words** model  

This block trains the Conv1D CNN using the prepared waveform dataset, applying callbacks for early stopping, dynamic learning rate adjustment, and checkpointing.  

- **Early stopping:**  
  - Monitors `val_loss`, mode = `min`.  
  - Stops training when `val_loss` has not improved by at least `1e-4` for **10 consecutive epochs**.  
  - Restores the best weights automatically.  

- **Learning rate scheduling (ReduceLROnPlateau):**  
  - Monitors `val_loss`.  
  - Reduces learning rate by factor **0.5** if no improvement for 4 epochs.  
  - Minimum LR enforced at `1e-5`.  

- **Checkpointing:**  
  - Saves the model to `/content/best_model.h5`.  
  - Uses `val_accuracy` as criterion (highest value kept).  

- **Model training:**  
  - Runs up to **50 epochs** with `batch_size=64`.  
  - Trains on `(X_train, y_train)` with validation on `(X_val, y_val)`.  
  - All three callbacks (`es`, `plateau`, `ckpt`) applied.  
  - Prints confirmation of completion and path of best saved model.  

- **Best epoch tracking:**  
  - Reports the epoch index with highest `val_accuracy`.  
  - Displays `val_accuracy` and `val_loss` values at that epoch.  

**Purpose:** Provides robust model training with automatic prevention of overfitting, dynamic learning rate scheduling, and safe saving of the best model for deployment.  


In [None]:
# Stop once validation loss stalls, and roll back to the best-loss weights.
es = EarlyStopping(
    monitor='val_loss', mode='min',
    patience=10, min_delta=1e-4,
    restore_best_weights=True, verbose=1,
)

# Halve the learning rate after 4 stagnant epochs, never going below 1e-5.
plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-5, verbose=1)

# Keep on disk only the epoch with the highest validation accuracy.
ckpt_path = "/content/best_model.h5"
ckpt = ModelCheckpoint(ckpt_path, monitor='val_accuracy', mode='max', save_best_only=True, verbose=1)

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=50,
    callbacks=[es, plateau, ckpt],
    verbose=1,
)

print("\n✅ Training complete. Best model saved to:", ckpt_path)

# Locate the epoch (1-based) with the highest validation accuracy.
val_acc_curve = history.history['val_accuracy']
val_loss_curve = history.history['val_loss']
best_idx = int(np.argmax(val_acc_curve))
best_epoch = best_idx + 1
print(f"🔝 Best epoch: {best_epoch} | val_accuracy={val_acc_curve[best_idx]:.4f} | val_loss={val_loss_curve[best_idx]:.4f}")


# 3. Evaluation & Latency for the Arabic **words** model  

This block reloads the best saved checkpoint, evaluates the CNN on the validation set, and measures runtime performance.  

- **Validation metrics:**  
  - Computes predictions on the held-out validation set.  
  - Reports global accuracy, confusion matrix, and classification report (precision, recall, F1).  
  - Plots per-class accuracy to highlight strengths and weaknesses.  
  - Prints the 5 lowest-performing classes.  
  - Calculates Top-1 and Top-3 accuracy to assess near-miss performance.  

- **Latency benchmarking:**  
  - Measures **single-sample latency** (ms per utterance) to simulate real-time app use.  
  - Measures **batched latency** (per-sample at batch size 64) for throughput analysis.  
  - Used to check whether inference is fast enough for interactive pronunciation feedback on ~1-second clips.  

**Purpose:** Provides a comprehensive assessment of both model accuracy and speed, ensuring the Arabic words classifier is effective and responsive enough for integration into the mobile learning application.  


In [None]:
# Evaluate the checkpoint written during training, not the in-memory model.
best_model = tf.keras.models.load_model("/content/best_model.h5")

Y_pred = best_model.predict(X_val, batch_size=128, verbose=0)
y_pred = np.argmax(Y_pred, axis=1)   # predicted class index per clip
y_true = np.argmax(y_val, axis=1)    # one-hot targets back to class indices

# Pass the full label set explicitly: without it, a class that never appears in
# y_true or y_pred is dropped and the rows/columns no longer line up with `classes`.
label_ids = np.arange(len(classes))

cm = confusion_matrix(y_true, y_pred, labels=label_ids)
disp = ConfusionMatrixDisplay(cm, display_labels=classes)

fig, ax = plt.subplots(figsize=(12,12))
disp.plot(ax=ax, cmap="Blues", xticks_rotation=90, colorbar=False)
plt.title("Confusion Matrix (Validation)")
plt.show()

print("\nClassification Report")
print(classification_report(y_true, y_pred, labels=label_ids, target_names=classes, digits=3))

acc = (y_pred == y_true).mean()
print(f"\n✅ Validation Accuracy (recomputed): {acc:.4f}")


In [None]:
# Row sums of the confusion matrix are the true-sample counts per class;
# the diagonal holds the correctly classified ones.
per_class_total = cm.sum(axis=1)
per_class_correct = np.diag(cm)
# Guard the denominator so a class with no validation samples gives 0 instead of NaN.
per_class_acc = per_class_correct / np.maximum(per_class_total, 1)

# Ascending order: weakest classes first.
order = np.argsort(per_class_acc)
sorted_acc = per_class_acc[order]
sorted_names = [classes[i] for i in order]

plt.figure(figsize=(10, 14))
plt.barh(sorted_names, sorted_acc)
plt.xlabel("Per-class Accuracy")
plt.title("Per-class Accuracy (Validation Set)")
plt.xlim(0, 1.0)
plt.tight_layout()
plt.show()

print("🔎 5 Worst-performing classes:")
# Slicing (rather than range(5)) avoids an IndexError when there are fewer than 5 classes.
for idx in order[:5]:
    print(f" - {classes[idx]}: {per_class_acc[idx]:.3f} (correct {cm[idx, idx]}/{per_class_total[idx]})")

def top_k_accuracy(probs, true_idx, k=3):
    """Fraction of rows whose true class is among the ``k`` highest-scoring classes.

    Args:
        probs: 2-D array of scores, one row per sample and one column per class.
        true_idx: 1-D NumPy array of true class indices, one per row of ``probs``.
        k: Number of top-scoring classes that count as a hit.

    Returns:
        Mean hit rate as a NumPy float.
    """
    # argsort is ascending, so the k best classes sit in the last k columns.
    ranked = np.argsort(probs, axis=1)
    candidates = ranked[:, -k:]
    target_column = true_idx[:, np.newaxis]
    return (candidates == target_column).any(axis=1).mean()

# Top-1 is plain argmax accuracy; Top-3 also counts a true class ranked second or third.
top1 = np.mean(y_pred == y_true)
top3 = top_k_accuracy(Y_pred, y_true, k=3)

print(f"\nTop-1 Accuracy: {top1:.4f}\nTop-3 Accuracy: {top3:.4f}")


In [None]:
# Reload the saved checkpoint from disk; the latency benchmarks below run against this model.
best = tf.keras.models.load_model("/content/best_model.h5")

def bench_single_sample(model, X, warmup=20, iters=200):
    """Return the mean wall-clock seconds of one single-sample ``model.predict`` call.

    Args:
        model: Object exposing ``predict(x, batch_size=..., verbose=...)``.
        X: Sliceable batch of inputs; samples are taken along the first axis.
        warmup: Number of leading samples sent through one untimed ``predict``
            call before measuring.
        iters: Number of timed single-sample calls.

    Returns:
        Average seconds per call as a float.

    Raises:
        ValueError: If ``X`` is empty or ``iters`` is not positive.
    """
    n_samples = len(X)
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if iters <= 0:
        raise ValueError("iters must be positive")

    # Untimed pass so one-off setup cost is not charged to the measurement.
    if warmup > 0:
        model.predict(X[:warmup], batch_size=64, verbose=0)

    # perf_counter is monotonic and high-resolution, which suits short intervals.
    t0 = time.perf_counter()
    for i in range(iters):
        # Wrap around so a short X never produces an empty slice.
        j = i % n_samples
        model.predict(X[j:j + 1], batch_size=1, verbose=0)
    elapsed = time.perf_counter() - t0
    return elapsed / iters

def bench_batch(model, X, batch_size=64, iters=10):
    """Return the mean per-sample wall-clock seconds of batched ``model.predict``.

    Args:
        model: Object exposing ``predict(x, batch_size=..., verbose=...)``.
        X: Sliceable batch of inputs; the first ``batch_size`` samples are used.
        batch_size: Requested number of samples per timed call.
        iters: Number of timed calls.

    Returns:
        Average seconds per sample as a float.

    Raises:
        ValueError: If ``X`` is empty or ``iters`` is not positive.
    """
    batch = X[:batch_size]
    n_in_batch = len(batch)
    if n_in_batch == 0:
        raise ValueError("X must contain at least one sample")
    if iters <= 0:
        raise ValueError("iters must be positive")

    # Untimed pass so one-off setup cost is not charged to the measurement.
    model.predict(batch, batch_size=batch_size, verbose=0)

    t0 = time.perf_counter()
    for _ in range(iters):
        model.predict(batch, batch_size=batch_size, verbose=0)
    per_batch = (time.perf_counter() - t0) / iters

    # Divide by the samples actually processed; X may hold fewer than batch_size.
    return per_batch / n_in_batch

# Benchmark on a fixed slice of the validation set.
Xslice = X_val[:256]

ss = bench_single_sample(best, Xslice, warmup=32, iters=128)
bs = bench_batch(best, Xslice, batch_size=64, iters=20)

# Convert seconds to milliseconds for display; throughput is the inverse of latency.
single_ms, batched_ms = ss * 1000, bs * 1000
throughput = 1.0 / ss

print(f"🕒 Single-sample avg latency: {single_ms:.2f} ms  (~{throughput:.1f} samples/sec)")
print(f"🕒 Batched avg latency (per sample @64): {batched_ms:.2f} ms")
print("Note: GPU on Colab can vary; CPU typically higher latency but similar ordering.")


In [None]:
BEST_PATH = "/content/best_model.h5"
best = tf.keras.models.load_model(BEST_PATH)

# Reuse the class list from training when this session still has it; otherwise
# rebuild it from the dataset folders.
try:
    classes
except NameError:
    DATASET_PATH = "/content/arabic_words_dataset"
    # Training derives the classes from loaded samples, so a folder without any
    # .wav file never became a class; skip such folders to keep indices aligned.
    classes = sorted(
        d for d in os.listdir(DATASET_PATH)
        if os.path.isdir(os.path.join(DATASET_PATH, d))
        and any(f.lower().endswith(".wav") for f in os.listdir(os.path.join(DATASET_PATH, d)))
    )

# A class list whose length differs from the model's output width would map
# predictions to the wrong labels, so surface that immediately.
out_shape = getattr(best, "output_shape", None)
if out_shape is not None and out_shape[-1] != len(classes):
    print(f"⚠️ Class list has {len(classes)} entries but the model outputs {out_shape[-1]}; predicted labels may be wrong.")

SR_LOAD = 16000   # sampling rate (Hz) requested from librosa.load
SR_MODEL = 8000   # sampling rate (Hz) the model was trained on
FIX_LEN = 8000    # samples per clip after padding/truncation

def norm_wave(w):
    """Standardize one waveform to zero mean and (approximately) unit variance.

    Args:
        w: Array-like of audio samples; converted to ``float32``.

    Returns:
        NumPy array of the same shape holding ``(w - mean) / (std + 1e-8)``.
    """
    samples = np.asarray(w, dtype=np.float32)
    centered = samples - samples.mean()
    # The epsilon keeps a constant (e.g. silent) clip from dividing by zero.
    return centered / (samples.std() + 1e-8)

def preprocess_wave_path(path, sr_load=SR_LOAD, sr_model=SR_MODEL, fix_len=FIX_LEN):
    """Turn one audio file into a model-ready array of shape ``(1, fix_len, 1)``.

    The file is loaded as mono at ``sr_load``, resampled to ``sr_model``,
    zero-padded or truncated to ``fix_len`` samples, and standardized with
    ``norm_wave``.

    Args:
        path: Path of the audio file to load.
        sr_load: Sampling rate passed to ``librosa.load``.
        sr_model: Sampling rate the waveform is resampled to.
        fix_len: Exact number of samples in the returned clip.

    Returns:
        Array of shape ``(1, fix_len, 1)``.
    """
    audio, loaded_sr = librosa.load(path, sr=sr_load, mono=True)
    audio = librosa.resample(audio, orig_sr=loaded_sr, target_sr=sr_model)

    # Truncate first (a no-op for short clips), then right-pad whatever is missing.
    audio = audio[:fix_len]
    missing = fix_len - len(audio)
    if missing > 0:
        audio = np.pad(audio, (0, missing), mode="constant")

    return norm_wave(audio).reshape(1, fix_len, 1)

def predict_file(path):
    """Classify one ``.wav`` file with the loaded model.

    Args:
        path: Path of the audio file to classify.

    Returns:
        Tuple ``(pred_label, pred_conf, top5)``: the winning class name, its
        probability as a float, and a dict mapping up to five class names to
        probabilities, ordered best first.
    """
    probs = best.predict(preprocess_wave_path(path), verbose=0)[0]
    winner = int(np.argmax(probs))

    # argsort is ascending: take the tail and flip it to get best-first order.
    leaders = np.flip(np.argsort(probs)[-5:])
    top5 = {}
    for class_id in leaders:
        top5[classes[class_id]] = float(probs[class_id])

    return classes[winner], float(probs[winner]), top5

# Report how many class labels predictions will be mapped onto.
print("✅ Model & helpers ready. Classes:", len(classes))


In [None]:
print("📤 Select one or more .wav files to test (ideally ~1 second).")
uploaded = files.upload()

# Only the uploaded file names are needed here; the byte contents are ignored.
for fname in uploaded:
    path = f"/content/{fname}"
    try:
        label, conf, top5 = predict_file(path)
        print("\n────────────")
        print(f"🎧 {fname}")
        display(Audio(path))
        print(f"🔮 Prediction: {label}  ({conf:.2%})")
        print("🏅 Top‑5:")
        for word, prob in top5.items():
            print(f"   - {word:10s}: {prob:.2%}")
    except Exception as e:
        # Keep going with the remaining uploads if one file fails.
        print(f"⚠️ Error on {fname}: {e}")
