IMPORT LIBRARIES

In [10]:
import os
import shutil
import random

**FER2013 PRE-PROCESSING**
1. Split the original dataset into 3 splits: val, test, train
2. Handle class imbalance using a stratified split
3. (Maybe train the model using only 4 classes if time is an issue -- decision based on the class imbalance)

In [11]:
# Paths for FER2013 dataset
# Everything is resolved relative to the current working directory, which for a
# notebook is normally the folder the kernel was started in. `__file__` is not
# defined inside a notebook, and resolving the literal string "__file__" (as was
# done before) just yields this same directory in a more confusing way.
notebook_dir = os.getcwd()

# Source data: the original FER2013 train/test folders.
FER_original_train = os.path.join(notebook_dir, "FER2013_original", "train")
FER_original_test = os.path.join(notebook_dir, "FER2013_original", "test")
# Destination root for the re-split dataset.
FER_dataset_root = os.path.join(notebook_dir, "FER2013_split")

train_folder = os.path.join(FER_dataset_root, "train")
val_folder = os.path.join(FER_dataset_root, "val")
test_folder = os.path.join(FER_dataset_root, "test")

# Create output folders
for folder in (train_folder, val_folder, test_folder):
    os.makedirs(folder, exist_ok=True)

# Split percentages
train_pct = 0.8
val_pct = 0.1
test_pct = 0.1

# Fail fast on a mistyped fraction: the test split is whatever is left after
# train and val are taken, so an inconsistent test_pct would otherwise go
# unnoticed.
if min(train_pct, val_pct, test_pct) < 0 or abs(train_pct + val_pct + test_pct - 1.0) > 1e-9:
    raise ValueError(
        f"Split fractions must be non-negative and sum to 1, got "
        f"train={train_pct}, val={val_pct}, test={test_pct}"
    )

random.seed(42)  # reproducible shuffling

In [12]:
# -----------------------------
# Combine original train + test for each class and split
# -----------------------------
# For every class folder, the images of the original train and test sets are
# pooled, shuffled, and copied into train/val/test destination folders using
# the train_pct / val_pct fractions (test receives the remainder).
#
# Safeguards:
#   * Directory listings are sorted and the shuffle uses its own seeded RNG,
#     because os.listdir() returns entries in arbitrary order and the global
#     `random` state depends on which cells ran before this one.
#   * Each destination class folder is emptied before copying. Without this,
#     re-running the cell adds a second, differently shuffled copy on top of
#     the first and the same image can end up in more than one split.
#   * If the same file name occurs in both source folders, the later one is
#     prefixed with its origin folder name instead of overwriting the earlier.
split_rng = random.Random(42)

classes = sorted(os.listdir(FER_original_train))

for cls in classes:
    cls_train_src = os.path.join(FER_original_train, cls)
    cls_test_src = os.path.join(FER_original_test, cls)

    if not os.path.isdir(cls_train_src):
        continue  # skip hidden files

    # Collect all images from train + test as (source path, destination name)
    all_imgs = []
    taken_names = set()
    for folder in [cls_train_src, cls_test_src]:
        if not os.path.isdir(folder):
            continue
        # Name of the folder that contains the class folder ("train" / "test").
        origin = os.path.basename(os.path.dirname(folder))
        for f in sorted(os.listdir(folder)):
            src_path = os.path.join(folder, f)
            if f.startswith(".") or not os.path.isfile(src_path):
                continue
            dst_name = f
            while dst_name in taken_names:
                dst_name = f"{origin}_{dst_name}"
            taken_names.add(dst_name)
            all_imgs.append((src_path, dst_name))

    split_rng.shuffle(all_imgs)
    total = len(all_imgs)
    # int() truncates, so train and val are rounded down and test absorbs
    # the leftover images.
    train_end = int(total * train_pct)
    val_end = train_end + int(total * val_pct)

    # Define destination folders
    cls_train_dst = os.path.join(train_folder, cls)
    cls_val_dst = os.path.join(val_folder, cls)
    cls_test_dst = os.path.join(test_folder, cls)
    for folder in [cls_train_dst, cls_val_dst, cls_test_dst]:
        # Rebuild from scratch so files from an earlier run cannot linger.
        if os.path.isdir(folder):
            shutil.rmtree(folder)
        os.makedirs(folder)

    # Copy images
    split_plan = [
        (cls_train_dst, all_imgs[:train_end]),
        (cls_val_dst, all_imgs[train_end:val_end]),
        (cls_test_dst, all_imgs[val_end:]),
    ]
    for dst_dir, chunk in split_plan:
        for src_path, dst_name in chunk:
            shutil.copy(src_path, os.path.join(dst_dir, dst_name))

print("✅ Dataset successfully split into exact 80/10/10 stratified train/val/test")

✅ Dataset successfully split into exact 80/10/10 stratified train/val/test


In [13]:
# CHECK SPLITS
# Print, for every split, the number of directory entries found in each class
# sub-folder of FER_dataset_root/<split>.
for split in ("train", "val", "test"):
    print(f"\n{split.upper()} counts:")
    split_path = os.path.join(FER_dataset_root, split)
    for cls in os.listdir(split_path):
        cls_path = os.path.join(split_path, cls)
        if not os.path.isdir(cls_path):
            continue  # only class folders are reported
        n_entries = len(os.listdir(cls_path))
        print(f"{cls}: {n_entries} images")



TRAIN counts:
happy: 7191 images
sad: 4861 images
fear: 4096 images
surprise: 3201 images
neutral: 4958 images
angry: 3962 images
disgust: 437 images

VAL counts:
happy: 898 images
sad: 607 images
fear: 512 images
surprise: 400 images
neutral: 619 images
angry: 495 images
disgust: 54 images

TEST counts:
happy: 900 images
sad: 609 images
fear: 513 images
surprise: 401 images
neutral: 621 images
angry: 496 images
disgust: 56 images
