<a href="https://colab.research.google.com/github/nourm77/reader/blob/main/arabic_chars_mnist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# List the current working directory (long format, human-readable sizes).
!ls -lh


In [None]:
!unzip -q "archive (4).zip" -d /content/arabic-chars-mnist


In [None]:
# List the top level of the directory the archive was extracted into.
!ls /content/arabic-chars-mnist


In [None]:
import os

# Root of the extracted dataset and its two split sub-directories
BASE_DIR = '/content/arabic-chars-mnist'
TRAIN_DIR, TEST_DIR = (os.path.join(BASE_DIR, split) for split in ('train', 'test'))

# Report how many directory entries each split contains
for caption, split_dir in (("Train files:", TRAIN_DIR), ("Test  files:", TEST_DIR)):
    print(caption, len(os.listdir(split_dir)))


In [None]:
import os
import glob
import re
import pandas as pd

# 1-2. Collect every "name.ext" entry in each split folder and keep only the
#      common image formats (the extension is compared case-insensitively)
train_paths = [
    candidate
    for candidate in glob.glob(os.path.join(TRAIN_DIR, '*.*'))
    if candidate.lower().endswith(('.png','.jpg','.jpeg'))
]
test_paths = [
    candidate
    for candidate in glob.glob(os.path.join(TEST_DIR,  '*.*'))
    if candidate.lower().endswith(('.png','.jpg','.jpeg'))
]

print(f"Found {len(train_paths)} train images and {len(test_paths)} test images")

# 3. Extract labels from filenames (e.g. 'alef12.png' → 'alef')
def get_label(path):
    """Return the class label encoded at the start of an image filename.

    The label is the part of the file's base name that precedes the first
    digit, e.g. ``'/data/train/alef12.png'`` -> ``'alef'``.

    The extension is removed before searching for digits, so a name with no
    numeric suffix yields a clean label (``'alef.png'`` -> ``'alef'``) and
    digits inside the extension are never mistaken for the sample index.

    Parameters
    ----------
    path : str
        Path (or bare filename) of the image.

    Returns
    -------
    str
        The label; an empty string if the name starts with a digit.
    """
    stem, _ext = os.path.splitext(os.path.basename(path))
    # maxsplit=1: only the text before the first digit is needed.
    return re.split(r'\d', stem, maxsplit=1)[0]

# 4. Build your DataFrames
# glob returns files in a filesystem-dependent order, and the row order of
# df_train becomes the sample order of x_train / y_train.  Keras'
# `validation_split` takes the *last* rows without shuffling, so the rows are
# first sorted (reproducible starting point) and then shuffled with a fixed
# seed (classes mixed throughout) to make that tail a reproducible random sample.
df_train = (
    pd.DataFrame({
        'path':  train_paths,
        'label': [get_label(p) for p in train_paths]
    })
    .sort_values('path')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Test images only need a stable order so predictions line up between runs.
df_test = pd.DataFrame({'path': sorted(test_paths)})
df_test['label'] = None  # unknown for test

# 5. Inspect
print(df_train.shape, df_test.shape)
df_train.head()


In [None]:
# Recursively list everything under the extracted dataset directory.
!ls -R /content/arabic-chars-mnist


In [None]:
import cv2
import matplotlib.pyplot as plt

# 1. Class names in sorted order
labels = sorted(df_train['label'].unique())

# 2. A single row of axes, one per class
fig, axes = plt.subplots(1, len(labels), figsize=(30, 12), squeeze=False)

# 3. Draw one randomly chosen training image under each class name
for ax, lbl in zip(axes[0], labels):
    class_paths = df_train.loc[df_train['label'] == lbl, 'path']
    sample_path = class_paths.sample(1).iloc[0]
    # OpenCV decodes to BGR; convert so matplotlib shows the true colours
    img = cv2.cvtColor(cv2.imread(sample_path), cv2.COLOR_BGR2RGB)
    ax.imshow(img)
    ax.set_title(lbl)
    ax.axis('off')

plt.tight_layout()
plt.show()


In [None]:
import cv2
import numpy as np

def load_images(df, size=(32,32)):
    """Load every image listed in ``df['path']`` into one uint8 array.

    Each file is read with ``cv2.imread`` (3-channel, BGR order) and resized
    to ``size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'path'`` column of image file paths.
    size : tuple of int, default (32, 32)
        Output ``(height, width)`` in pixels.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), size[0], size[1], 3)`` and dtype ``uint8``,
        in the same order as the rows of ``df``.

    Raises
    ------
    ValueError
        If a file is missing or cannot be decoded as an image.
    """
    height, width = size
    N = len(df)
    X = np.empty((N, height, width, 3), dtype=np.uint8)
    for i, path in enumerate(df['path']):
        img = cv2.imread(path)
        if img is None:
            # imread signals failure by returning None rather than raising.
            raise ValueError(f"Could not read image: {path!r}")
        # cv2.resize expects (width, height), the reverse of the array layout.
        X[i] = cv2.resize(img, (width, height))
    return X

# Decode both splits into image arrays, then report their shapes
x_train, x_test = load_images(df_train), load_images(df_test)

for caption, images in (('x_train shape:', x_train), ('x_test  shape:', x_test)):
    print(caption, images.shape)


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Learn the class-name -> integer mapping, apply it, then expand the integer
# ids into one-hot rows (one column per class)
le = LabelEncoder().fit(df_train['label'])
y_train_int = le.transform(df_train['label'])
y_train = to_categorical(y_train_int, num_classes=len(le.classes_))

print('Found classes:', le.classes_)
print('y_train shape:', y_train.shape)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, BatchNormalization,
    Dropout, Flatten, Dense
)

def create_model(input_shape=(32,32,3), n_classes=28):
    """Build and compile the CNN classifier.

    Architecture: two Conv2D(3x3, 'same', ReLU) -> MaxPooling2D ->
    BatchNormalization stages (32 then 64 filters, dropout 0.2 after the
    second), followed by Flatten, an L2-regularised Dense(128, ReLU) with
    BatchNormalization and dropout 0.2, and a softmax output layer.

    Parameters
    ----------
    input_shape : tuple of int, default (32, 32, 3)
        Shape of one input image (height, width, channels).
    n_classes : int, default 28
        Number of output classes (units in the softmax layer).

    Returns
    -------
    tensorflow.keras.Model
        A ``Sequential`` model compiled with the Adam optimizer,
        categorical cross-entropy loss and an accuracy metric.
    """
    # Imported here so the function does not depend on edits to the import
    # cell; Input is the recommended way to declare a Sequential's input shape
    # (passing input_shape= to the first layer is deprecated in Keras 3).
    from tensorflow.keras.layers import Input

    model = Sequential([
        Input(shape=input_shape),

        Conv2D(32, (3,3), padding='same', activation='relu'),
        MaxPooling2D(2,2),
        BatchNormalization(),

        Conv2D(64, (3,3), padding='same', activation='relu'),
        MaxPooling2D(2,2),
        BatchNormalization(),
        Dropout(0.2),

        Flatten(),
        Dense(128, activation='relu', kernel_regularizer='l2'),
        BatchNormalization(),
        Dropout(0.2),

        Dense(n_classes, activation='softmax')
    ])
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',  # expects one-hot encoded targets
        metrics=['accuracy']
    )
    return model

# Build the network with one output unit per encoded class and print its layers
num_labels = len(le.classes_)
model = create_model(n_classes=num_labels)
model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop when val_loss has not improved for 8 consecutive epochs and roll the
# model back to the best weights seen
early_stop = EarlyStopping(monitor='val_loss', patience=8,
                           restore_best_weights=True, verbose=1)

# Training options kept together so they are easy to scan and tweak
fit_options = dict(
    validation_split=0.3,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)


In [None]:
import matplotlib.pyplot as plt

# One figure per metric (accuracy, then loss), each with train and validation curves
for train_key, val_key, caption in (
    ('accuracy', 'val_accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Loss'),
):
    plt.plot(history.history[train_key], label='train')
    plt.plot(history.history[val_key], label='val')
    plt.title(caption)
    plt.xlabel('Epoch')
    plt.ylabel(caption)
    plt.legend()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Accuracy per epoch, drawn through an explicit Axes object
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='val')
ax.set_title('Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

# Loss per epoch
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='val')
ax.set_title('Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
import random
# Class probabilities for every test image -> most likely index -> class name
preds = model.predict(x_test)
idxs = preds.argmax(axis=1)
labels_pred = le.inverse_transform(idxs)

# Show six randomly picked test images, each titled with its predicted label
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
for ax in axes.ravel():
    i = random.randrange(len(x_test))
    # Arrays hold OpenCV's BGR order; convert so colours display correctly
    img = cv2.cvtColor(x_test[i], cv2.COLOR_BGR2RGB)
    ax.imshow(img)
    ax.set_title(labels_pred[i])
    ax.axis('off')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Evaluate on rows the model was NOT trained on.
#
# Every row of df_train was passed to `model.fit` above, so a fresh random
# split of df_train would mostly contain images used for gradient updates
# and would report an inflated score.  With `validation_split`, Keras holds
# out the LAST fraction of x_train / y_train (taken before shuffling); those
# are the only rows excluded from training, so they are evaluated here.
# Caveats: they were still used for early stopping, so this is a validation
# score rather than a truly independent test score, and it is only valid if
# this cell runs before any further training on x_train.
VAL_FRACTION = 0.3  # must match `validation_split` in the model.fit call
split_at = int(len(x_train) * (1.0 - VAL_FRACTION))  # same cut point Keras uses

x_eval = x_train[split_at:]
y_eval = y_train[split_at:]

# Evaluate
loss, acc = model.evaluate(x_eval, y_eval, verbose=1)
print(f"Held-out accuracy: {acc:.4f}")


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Continue training the same model on augmented images.
#
# The validation rows must be the ones the earlier
# `model.fit(..., validation_split=0.3)` call held out, i.e. the LAST 30% of
# x_train.  ImageDataGenerator's own `validation_split` / `subset` mechanism
# takes the FIRST 30% as validation instead, which would validate on images
# the model has already been trained on, and would also randomly augment the
# validation batches.  The split is therefore done by hand.
VAL_FRACTION = 0.3  # must match `validation_split` in the first model.fit call
split_at = int(len(x_train) * (1.0 - VAL_FRACTION))  # same cut point Keras uses

x_fit, y_fit = x_train[:split_at], y_train[:split_at]
x_val, y_val = x_train[split_at:], y_train[split_at:]

# Random rotations, shifts and zooms, applied to training batches only
gen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1
)

train_gen = gen.flow(x_fit, y_fit, batch_size=64)
# A default generator applies no transformations; fixed order keeps the
# validation metrics comparable from epoch to epoch.
val_gen   = ImageDataGenerator().flow(x_val, y_val, batch_size=64, shuffle=False)

history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[early_stop]
)


In [None]:
import os

# Check whether one specific test image exists and peek at its folder
img_path = '/content/arabic-chars-mnist/test/zain86.jpg'
parent_dir = os.path.dirname(img_path)

print("Exists?", os.path.exists(img_path))
print("Directory listing:", os.listdir(parent_dir)[:10])


In [None]:
def predict_char(img_path, model, label_encoder, size=(32, 32)):
    """Predict the character class of a single image file.

    The image goes through the same preprocessing as ``load_images``
    (``cv2.imread`` in BGR order, resize to ``size``, raw uint8 pixels) so
    that it matches what the model saw during training.

    Parameters
    ----------
    img_path : str
        Path of the image to classify.
    model : tensorflow.keras.Model
        Trained classifier that outputs one probability per class.
    label_encoder : sklearn.preprocessing.LabelEncoder
        Encoder fitted on the training labels; maps the predicted class
        index back to its name.
    size : tuple of int, default (32, 32)
        ``(height, width)`` the image is resized to before prediction.

    Returns
    -------
    The class name with the highest predicted probability.

    Raises
    ------
    ValueError
        If the file is missing or cannot be decoded as an image.
    """
    img = cv2.imread(img_path)
    if img is None:
        # imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read image: {img_path!r}")
    # cv2.resize expects (width, height).
    img = cv2.resize(img, (size[1], size[0]))
    # The model expects a batch, so add a leading axis of length 1.
    probs = model.predict(img[np.newaxis, ...], verbose=0)
    class_idx = int(np.argmax(probs, axis=1)[0])
    return label_encoder.inverse_transform([class_idx])[0]

# pick one that actually shows up in the list
test_file = '/content/arabic-chars-mnist/train/feh3518.jpg'
print(predict_char(test_file, model, le))


In [None]:
# Same prediction as the previous cell, with the image path passed inline
result = predict_char('/content/arabic-chars-mnist/train/feh3518.jpg', model, le)
print(result)
