<a href="https://colab.research.google.com/github/megmarv/PsychoAI-/blob/Emotion-Identification2/preprocessing_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os

import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

# Define paths and parameters
# Root folder of the dataset; it is expected to contain one sub-folder per entry in `emotions`.
data_dir = "/content/drive/MyDrive/CNN model/new_dataset_for_custom_model"  # Replace with your dataset path
# Class names; each one is used both as the sub-folder name under `data_dir` and as the label string.
emotions = ["anger", "happy", "sad", "neutral", "surprise", "fear", "disgust"]  # Add your emotion classes
# Size every image is resized to; handed unchanged to cv2.resize as its `dsize` argument.
target_size = (48, 48)
# When True, images are converted from BGR to single-channel grayscale before resizing.
grayscale = True

# Function to preprocess images
def preprocess_image(image_path, target_size, grayscale):
    """Load one image from disk, optionally convert it to grayscale, and resize it.

    Args:
        image_path: Path of the image file to read.
        target_size: Output size passed to ``cv2.resize`` as ``dsize``,
            i.e. ``(width, height)``.
        grayscale: If true, the BGR image is converted to a single-channel
            grayscale image before resizing.

    Returns:
        The resized image array returned by ``cv2.resize``.

    Raises:
        ValueError: If ``cv2.imread`` could not read or decode ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an opaque OpenCV assertion error.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")
    if grayscale:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.resize(image, target_size)

# Load and preprocess dataset
image_list = []
label_list = []
skipped_paths = []

for emotion in emotions:
    emotion_dir = os.path.join(data_dir, emotion)

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order stable so the seeded resampling and splitting further down give the
    # same result on every run and machine.
    for image_name in sorted(os.listdir(emotion_dir)):
        image_path = os.path.join(emotion_dir, image_name)

        # Nested folders cannot be decoded as images.
        if not os.path.isfile(image_path):
            skipped_paths.append(image_path)
            continue

        try:
            image = preprocess_image(image_path, target_size, grayscale)
        except (cv2.error, ValueError):
            # Not a decodable image (e.g. a stray metadata file): leave it out
            # instead of aborting the whole load.
            skipped_paths.append(image_path)
            continue

        image_list.append(image)
        label_list.append(emotion)

if skipped_paths:
    print(f"Skipped {len(skipped_paths)} unreadable entries, e.g. {skipped_paths[0]}")
if not image_list:
    raise RuntimeError(f"No readable images found under {data_dir}")

# Convert to numpy arrays
labels = np.array(label_list)

# Normalize pixel values to [0, 1]
images = np.array(image_list) / 255.0

# Handle class imbalance
def balance_classes(images, labels):
    """Down-sample every class to the size of the smallest class.

    Args:
        images: Array of samples, indexed along its first axis.
        labels: Array of class labels, one per sample.

    Returns:
        A ``(balanced_images, balanced_labels)`` pair of arrays in which each
        class contributes the same number of samples. Classes appear in the
        sorted order returned by ``np.unique``.
    """
    class_names, class_sizes = np.unique(labels, return_counts=True)
    smallest = min(class_sizes)

    # For each class, draw `smallest` sample indices without replacement.
    picked = [
        resample(
            np.nonzero(labels == name)[0],
            n_samples=smallest,
            replace=False,
            random_state=42,
        )
        for name in class_names
    ]

    # Index once with all chosen positions instead of growing Python lists.
    keep = np.concatenate(picked)
    return images[keep], labels[keep]

# Down-sample every class to the size of the rarest one.
balanced_images, balanced_labels = balance_classes(images, labels)

# Encode labels to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(balanced_labels)

# Split data into training and testing sets
# `stratify` keeps the per-class balance created above in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    balanced_images,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)

# Add the channel dimension for grayscale model input.
# cv2.resize takes (width, height), so the arrays are laid out as
# (height, width); appending an axis avoids assuming either order, which a
# reshape to (target_size[0], target_size[1]) would get wrong for non-square sizes.
if grayscale:
    X_train = X_train[..., np.newaxis]
    X_test = X_test[..., np.newaxis]

# Save preprocessed data
np.save("X_train.npy", X_train)
np.save("X_test.npy", X_test)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)
# Persist the integer -> emotion-name mapping so predictions can be decoded later.
np.save("label_classes.npy", label_encoder.classes_)

print("Preprocessing complete!")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Preprocessing complete!
Training data shape: (13871, 48, 48, 1)
Testing data shape: (3468, 48, 48, 1)
