In [1]:
# data augmentation
import os
import numpy as np
import scipy
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical, img_to_array, load_img


In [2]:
# Root folders of the training and test images (paths are relative to the notebook's working directory)
train_dir, test_dir = 'data/train', 'data/test'

In [3]:
# Load images and labels

# Function to load images and labels from directory
def load_images_from_directory(directory, target_size=(48, 48)):
    """Load every image below ``directory`` together with one-hot labels.

    Each sub-folder of ``directory`` is treated as one class; the folder
    name is the class name.

    Args:
        directory: Path of the folder that contains one sub-folder per class.
        target_size: ``(height, width)`` every image is resized to on load.

    Returns:
        A tuple ``(images, labels, label_map)`` where ``images`` is a NumPy
        array stacking all loaded images, ``labels`` is the one-hot matrix
        produced by ``to_categorical`` (one column per class), and
        ``label_map`` maps class name -> integer class index.
    """
    # os.listdir returns entries in arbitrary order, so sort to make the
    # class -> index mapping deterministic (and therefore identical for the
    # train and test folders). Non-directory entries such as '.DS_Store'
    # are not classes and are skipped.
    class_names = sorted(
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )
    label_map = {name: idx for idx, name in enumerate(class_names)}

    images = []
    labels = []
    for label, idx in label_map.items():
        class_dir = os.path.join(directory, label)
        # Sorted so the sample order is reproducible across runs/machines.
        for img_name in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_name)
            if not os.path.isfile(img_path):
                # Nested folders cannot be opened as images; ignore them.
                continue
            img = load_img(img_path, target_size=target_size)
            images.append(img_to_array(img))
            labels.append(idx)

    images = np.array(images)
    labels = to_categorical(labels, num_classes=len(label_map))
    return images, labels, label_map

# Load training data; its label_map is the reference class -> index mapping
X_train, y_train, label_map = load_images_from_directory(train_dir)

# Load test data and keep its mapping so it can be checked against training
X_test, y_test, test_label_map = load_images_from_directory(test_dir)

# The one-hot columns of y_test are only comparable with y_train when both
# folders produced the same class -> index mapping; fail loudly otherwise.
assert test_label_map == label_map, (
    f'Train/test class mappings differ: {label_map} vs {test_label_map}'
)

In [4]:
# Make sure both splits share the same dtype. copy=False skips the copy when
# an array is already float32, so re-running this cell is cheap.
X_train = X_train.astype('float32', copy=False)
X_test = X_test.astype('float32', copy=False)
X_train.shape

(28709, 48, 48, 3)

In [5]:
# Display the one-hot training labels returned by to_categorical (one column per class)
y_train

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [6]:
# Display the class-name -> integer-index mapping built while loading the training set
label_map

{'angry': 0,
 'disgust': 1,
 'fear': 2,
 'happy': 3,
 'neutral': 4,
 'sad': 5,
 'surprise': 6}

In [7]:
import pandas as pd

# Build one emotion name per *loaded* training image directly from the
# one-hot labels, instead of re-listing the folders: this cannot drift from
# what was actually loaded (file counts or listing order may differ on disk).
index_to_label = {idx: label for label, idx in label_map.items()}
labels = [index_to_label[idx] for idx in y_train.argmax(axis=1).tolist()]
df = pd.DataFrame({'emotion': labels})
df

Unnamed: 0,emotion
0,angry
1,angry
2,angry
3,angry
4,angry
...,...
28704,surprise
28705,surprise
28706,surprise
28707,surprise


In [8]:
# Directory that receives the augmented training images; data_augmentation
# creates one sub-folder per class label inside it.
output_dir = 'data/preview_train'

In [9]:

def data_augmentation(X, y, df, output_dir, target_count=None, class_to_index=None):
    """Oversample every class up to ``target_count`` with random augmentations.

    For each class listed in ``df['emotion']`` the function generates randomly
    transformed copies of that class's images until the class holds
    ``target_count`` samples, writes every generated image as a JPEG below
    ``output_dir/<class_label>/``, and returns the generated images together
    with the originals. Classes that already hold ``target_count`` or more
    images are returned unchanged (they are never down-sampled).

    Args:
        X: Array of images; indexed along axis 0 by sample.
        y: One-hot label matrix aligned with ``X`` (column ``i`` is class ``i``).
        df: DataFrame with an ``'emotion'`` column of class names; it defines
            which classes are processed and the default ``target_count``.
        output_dir: Folder under which one sub-folder per class is created.
        target_count: Desired number of samples per class. ``None`` (or 0)
            means "size of the largest class in ``df``".
        class_to_index: Mapping class name -> column index in ``y``. Defaults
            to the notebook-level ``label_map`` for backward compatibility.

    Returns:
        ``(balanced_X, balanced_y, distribution)``: the stacked images, their
        one-hot labels, and the list of original per-class image counts in
        the order of ``df['emotion'].unique()``.
    """
    import warnings
    from itertools import cycle

    if class_to_index is None:
        # Fall back to the mapping created when the training set was loaded.
        class_to_index = label_map

    # Create an ImageDataGenerator object
    datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        zoom_range=0.2,
        brightness_range=(0.95, 1.05),
        horizontal_flip=True,
        vertical_flip=False,
        fill_mode='nearest'
    )

    balanced_X = []
    balanced_y = []
    # Set the target count to the maximum number of images in a class if not provided
    target_count = target_count if target_count else max(df['emotion'].value_counts())

    distribution = []

    for class_label in df['emotion'].unique():
        class_index = class_to_index[class_label]
        class_indices = np.where(y[:, class_index] == 1)[0]
        class_images = X[class_indices]
        class_labels = y[class_indices]
        num_images = class_images.shape[0]
        distribution.append(num_images)

        # Create a directory for each class to save augmented images
        class_output_dir = os.path.join(output_dir, str(class_label))
        os.makedirs(class_output_dir, exist_ok=True)

        if num_images == 0:
            # Nothing to augment from; looping here would never terminate.
            warnings.warn(
                f"Class {class_label!r} has no images in X/y; skipping augmentation."
            )
            continue

        augmentations_needed = target_count - num_images

        # Walk round-robin over the class's images until enough copies exist.
        source = cycle(zip(class_images, class_labels))
        serial = 0
        while augmentations_needed > 0:
            img, label = next(source)
            batch = img.reshape((1,) + img.shape)
            # A one-image flow always numbers its file 0 and appends a small
            # random hash, so different images could overwrite each other on
            # disk; a per-image serial in the prefix keeps file names unique.
            augmented = next(datagen.flow(
                batch,
                batch_size=1,
                save_to_dir=class_output_dir,
                save_prefix=f'aug_{serial:06d}',
                save_format='jpeg',
            ))
            # Drop only the batch axis; squeeze() would also remove a
            # singleton channel axis and break shape parity with originals.
            balanced_X.append(augmented[0])
            balanced_y.append(label)
            augmentations_needed -= 1
            serial += 1

        balanced_X.extend(class_images)
        balanced_y.extend(class_labels)

    balanced_X = np.array(balanced_X)
    balanced_y = np.array(balanced_y)

    return balanced_X, balanced_y, distribution


In [10]:
# Oversample the training set so every emotion reaches the size of the
# largest class; generated images are also written below output_dir.
balanced_X_train, balanced_y_train, distribution = data_augmentation(
    X=X_train,
    y=y_train,
    df=df,
    output_dir=output_dir,
)

# Report the resulting array shapes and the original per-class counts.
print(
    f'Balanced X_train shape: {balanced_X_train.shape}',
    f'Balanced y_train shape: {balanced_y_train.shape}',
    f'Class distribution: {distribution}',
    sep='\n',
)

Balanced X_train shape: (50505, 48, 48, 3)
Balanced y_train shape: (50505, 7)
Class distribution: [3995, 436, 4097, 7215, 4965, 4830, 3171]


In [11]:
# Build one emotion name per *loaded* test image from the one-hot labels
# rather than re-listing the test folders, so the frame always matches y_test.
# NOTE: this rebinds `labels` and `df`, which previously described the
# training set; re-run the training cells before reusing them for training.
index_to_label = {idx: label for label, idx in label_map.items()}
labels = [index_to_label[idx] for idx in y_test.argmax(axis=1).tolist()]
df = pd.DataFrame({'emotion': labels})
df

Unnamed: 0,emotion
0,angry
1,angry
2,angry
3,angry
4,angry
...,...
7173,surprise
7174,surprise
7175,surprise
7176,surprise


In [12]:
# Destination for augmented test images (this rebinds output_dir, which
# pointed at the training preview folder before this cell ran).
# NOTE(review): oversampling the test split with synthetic images changes
# what evaluation metrics measure — confirm this is intended.
output_dir = 'data/preview_test'
balanced_X_test, balanced_y_test, distribution = data_augmentation(
    X=X_test,
    y=y_test,
    df=df,
    output_dir=output_dir,
)

In [13]:
# Report the balanced test array shapes and the original per-class counts.
print(
    f'Balanced X_test shape: {balanced_X_test.shape}',
    f'Balanced y_test shape: {balanced_y_test.shape}',
    f'Class distribution: {distribution}',
    sep='\n',
)

Balanced X_test shape: (12418, 48, 48, 3)
Balanced y_test shape: (12418, 7)
Class distribution: [958, 111, 1024, 1774, 1233, 1247, 831]


(12418, 7)