In [4]:
import os

# Base dataset path
BASE_DIR = "/cluster/home/miolate21/FER_biasmitigation1/data/"
datasets = ['fer2013', 'ckplus', 'rafdb']

for dataset in datasets:
    for split in ['train', 'test']:
        split_path = os.path.join(BASE_DIR, dataset, split)
        if not os.path.exists(split_path):
            continue

        for emotion in os.listdir(split_path):
            emotion_path = os.path.join(split_path, emotion)
            if not os.path.isdir(emotion_path):
                continue

            images = sorted(os.listdir(emotion_path))  # Sort for reproducibility

            for idx, img_name in enumerate(images):
                src = os.path.join(emotion_path, img_name)

                # Skip non-files (e.g., directories like .ipynb_checkpoints)
                if not os.path.isfile(src) or img_name.startswith('.'):
                    continue

                dst_name = f"{emotion}_{dataset}_{split}_{idx:05d}.jpg"
                dst = os.path.join(emotion_path, dst_name)

                os.rename(src, dst)

print("✅ All files renamed successfully.")

✅ All files renamed successfully.


In [None]:
# ====== Cell 1: Load Libraries ======
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from tqdm import tqdm

# ====== Cell 2: Define Paths and Setup ======
BASE_DIR = "/cluster/home/miolate21/FER_biasmitigation1/data"
datasets = ['fer2013', 'ckplus', 'rafdb']
splits = ['train', 'test']

IMG_SIZE = (224, 224)  # ResNet50 input size
emotion_to_idx = {
    'angry': 0,
    'disgust': 1,
    'fear': 2,
    'happy': 3,
    'neutral': 4,
    'sad': 5,
    'surprise': 6
}

# ====== Cell 3: Load Images and Labels ======
def load_dataset(dataset, split):
    images = []
    labels = []

    split_dir = os.path.join(BASE_DIR, dataset, split)
    if not os.path.exists(split_dir):
        return np.array(images), np.array(labels)  # Empty arrays

    for emotion in os.listdir(split_dir):
        if emotion.startswith('.'):
            continue  

        emotion_path = os.path.join(split_dir, emotion)
        if not os.path.isdir(emotion_path):
            continue

        img_files = os.listdir(emotion_path)

        for img_file in tqdm(img_files, desc=f"{dataset} {split} {emotion}"):
            img_path = os.path.join(emotion_path, img_file)

            if not os.path.isfile(img_path):
                print(f"Skipping non-file {img_path}")
                continue

            try:
                img = image.load_img(img_path, color_mode='rgb', target_size=IMG_SIZE)
                img_array = image.img_to_array(img)

                images.append(img_array)
                labels.append(emotion_to_idx.get(emotion.split('_')[0], 0))  
            except Exception as e:
                print(f"Skipping file {img_path}: {e}")

    return np.array(images), np.array(labels)

# ====== Cell 4: Load all datasets ======
X_train, y_train = [], []
X_test, y_test = [], []

for dataset in datasets:
    X_tr, y_tr = load_dataset(dataset, "train")
    X_te, y_te = load_dataset(dataset, "test")

    X_train.append(X_tr)
    y_train.append(y_tr)
    X_test.append(X_te)
    y_test.append(y_te)

# Concatenate everything
X_train = np.concatenate(X_train, axis=0)
y_train = np.concatenate(y_train, axis=0)
X_test = np.concatenate(X_test, axis=0)
y_test = np.concatenate(y_test, axis=0)

print(f"Training samples: {X_train.shape}")
print(f"Testing samples: {X_test.shape}")

# ====== Cell 5: Preprocess for ResNet50 ======
X_train = preprocess_input(X_train)
X_test = preprocess_input(X_test)

print("✅ Preprocessing complete.")

# ====== Cell 6: Save final arrays ======
output_dir = '/cluster/home/miolate21/FER_biasmitigation1/data_preprocessed/'
os.makedirs(output_dir, exist_ok=True)  # Make sure the folder exists

np.save(os.path.join(output_dir, 'X_train.npy'), X_train)
np.save(os.path.join(output_dir, 'y_train.npy'), y_train)
np.save(os.path.join(output_dir, 'X_test.npy'), X_test)
np.save(os.path.join(output_dir, 'y_test.npy'), y_test)

print("✅ All datasets saved successfully.")

fer2013 train neutral: 100%|██████████| 4965/4965 [00:03<00:00, 1337.44it/s]
fer2013 train happy:  88%|████████▊ | 6376/7216 [00:04<00:00, 1344.56it/s]

Skipping non-file /cluster/home/miolate21/FER_biasmitigation1/data/fer2013/train/happy/happy_fer2013_train_00000.jpg


fer2013 train happy: 100%|██████████| 7216/7216 [00:05<00:00, 1346.76it/s]
fer2013 train surprise: 100%|██████████| 3171/3171 [00:02<00:00, 1340.77it/s]
fer2013 train fear: 100%|██████████| 4097/4097 [00:03<00:00, 1336.31it/s]
fer2013 train disgust: 100%|██████████| 436/436 [00:00<00:00, 1347.68it/s]
fer2013 train sad: 100%|██████████| 4830/4830 [00:03<00:00, 1343.52it/s]
fer2013 train angry:  37%|███▋      | 1474/3996 [00:01<00:01, 1335.39it/s]

Skipping non-file /cluster/home/miolate21/FER_biasmitigation1/data/fer2013/train/angry/angry_fer2013_train_00000.jpg


fer2013 train angry: 100%|██████████| 3996/3996 [00:02<00:00, 1333.26it/s]


In [None]:
# ===== Auto-create processed_data.npz safely =====

import numpy as np
import os

# Paths
preprocessed_dir = '/cluster/home/miolate21/FER_biasmitigation1/data_preprocessed/'
npz_save_path = '/cluster/home/miolate21/FER_biasmitigation1/data_preprocessed/processed_data.npz'

# Step 1: Check if variables exist
need_reload = False
try:
    _ = X_train.shape
    _ = y_train.shape
    _ = X_test.shape
    _ = y_test.shape
    print("✅ X_train, y_train, X_test, y_test already loaded.")
except NameError:
    need_reload = True
    print("⚠️ Variables not found in memory. Will reload from .npy files.")

# Step 2: Reload if needed
if need_reload:
    print("🔄 Reloading arrays from preprocessed .npy files...")
    X_train = np.load(os.path.join(preprocessed_dir, 'X_train.npy'))
    y_train = np.load(os.path.join(preprocessed_dir, 'y_train.npy'))
    X_test = np.load(os.path.join(preprocessed_dir, 'X_test.npy'))
    y_test = np.load(os.path.join(preprocessed_dir, 'y_test.npy'))
    print(f"✅ Reloaded! Shapes: X_train {X_train.shape}, X_test {X_test.shape}")

# Step 3: Save into .npz
os.makedirs(os.path.dirname(npz_save_path), exist_ok=True)  

np.savez(
    npz_save_path,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

print(f"✅ Saved processed_data.npz at: {npz_save_path}")

✅ X_train, y_train, X_test, y_test already loaded.
