# 💾 EDA

In [None]:
import os
import math
import warnings

import numpy as np
import pandas as pd
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json
import pickle
import seaborn as sns
import scikitplot
from keras.layers import Conv2D, MaxPool2D, Flatten,Dense,Dropout,BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2
from tensorflow.keras.applications import VGG16, InceptionResNetV2
from keras import regularizers
from tensorflow.keras.optimizers import Adam,RMSprop,SGD,Adamax
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator

# Ensure warnings are ignored.
# NOTE(review): the 'ignore' filter has no category restriction, so it also
# hides deprecation warnings issued by the imported libraries; re-enable
# warnings when debugging version-related problems.
warnings.filterwarnings('ignore')

# Print TensorFlow version (useful when comparing runs across environments)
print(tf.__version__)


In [None]:
# Root directories of the image datasets; each is read with one sub-directory per class.
train_dir = "/kaggle/input/fer2013/train" # Directory containing the training data
test_dir = "/kaggle/input/fer2013/test"  # Directory containing the test split; it also feeds the validation generator
# Directory of the generated images used by the "GAN" training variant
# (presumably produced by a GAN -- confirm against the dataset description).
gen_dir = "/kaggle/input/generated-fer/fer2013-300/fer2013-300"

In [None]:
# Define parameters
# NOTE(review): num_classes is not referenced in the visible part of the notebook.
num_classes = 7  # Number of classes in your dataset, adjust as needed
num_images_per_class = 5  # Number of images to visualize per class

# 1. Basic Information about Dataset
def dataset_info(directory):
    """Print and return the number of files in each class sub-directory.

    Args:
        directory: Path of a dataset root whose immediate sub-directories
            are the classes.

    Returns:
        A ``(class_names, num_samples)`` tuple. ``class_names`` is the
        alphabetically sorted list of class sub-directory names and
        ``num_samples`` maps each of those names to the number of entries
        found inside its sub-directory.
    """
    # ``os.listdir`` returns entries in arbitrary order, so sort for
    # reproducible output. Keep only directories: a stray file in the root
    # would otherwise make the per-class ``os.listdir`` call below fail.
    class_names = sorted(
        entry
        for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )
    num_samples = {
        class_name: len(os.listdir(os.path.join(directory, class_name)))
        for class_name in class_names
    }

    print("Classes and Sample Counts:")
    for class_name, count in num_samples.items():
        print(f"{class_name}: {count} images")
    return class_names, num_samples

In [None]:
# 2. Class Distribution Plot
def plot_class_distribution(num_samples):
    """Draw a bar chart with one bar per class showing its sample count.

    Args:
        num_samples: Mapping of class name to number of samples; the keys
            become the x-axis categories and the values the bar heights.
    """
    labels = list(num_samples.keys())
    counts = list(num_samples.values())

    plt.figure(figsize=(10, 6))
    sns.barplot(x=labels, y=counts, palette="viridis")

    # Axis decoration; the tick labels are rotated so they have room to fit.
    plt.xticks(rotation=45)
    plt.xlabel("Class")
    plt.ylabel("Number of Samples")
    plt.title("Class Distribution")
    plt.show()

In [None]:
# 3. Visualize Sample Images from Each Class
def visualize_sample_images(directory, class_names, num_images_per_class=5):
    """Show a grid of randomly chosen images, one row per class.

    Args:
        directory: Dataset root containing one sub-directory per class.
        class_names: Names of the class sub-directories to display, one
            grid row each.
        num_images_per_class: Number of grid columns, i.e. the maximum
            number of images drawn per class. A class holding fewer files
            shows all of them and leaves the rest of its row empty.
    """
    if not class_names or num_images_per_class <= 0:
        # Nothing to draw; avoids asking matplotlib for a zero-sized figure.
        return

    n_rows = len(class_names)
    n_cols = num_images_per_class
    plt.figure(figsize=(n_cols * 2, n_rows * 2))

    for i, class_name in enumerate(class_names):
        class_dir = os.path.join(directory, class_name)
        images = os.listdir(class_dir)
        if not images:
            # An empty class has nothing to sample from.
            continue

        # Sampling without replacement raises ValueError when more items are
        # requested than exist, so cap the request at the class size.
        sample_size = min(n_cols, len(images))
        selected_images = np.random.choice(images, sample_size, replace=False)

        for j, img_name in enumerate(selected_images):
            img_path = os.path.join(class_dir, img_name)
            img = load_img(img_path, target_size=(48, 48))

            # Subplot indices are 1-based and run row by row.
            ax = plt.subplot(n_rows, n_cols, i * n_cols + j + 1)
            ax.imshow(img)
            ax.axis("off")
            if j == 0:
                # Label each row once, above its first image.
                ax.set_title(class_name, fontsize=10)
    plt.tight_layout()
    plt.show()

In [None]:
# Training data info and EDA:
# print the per-class counts, then show the class-distribution bar chart and
# a grid of random sample images for the training split.
print("Training Data EDA:")
train_class_names, train_num_samples = dataset_info(train_dir)
plot_class_distribution(train_num_samples)
visualize_sample_images(train_dir, train_class_names, num_images_per_class)

In [None]:
# Testing data info and EDA:
# per-class counts, class-distribution bar chart and random sample grid for
# the test split.
print("\nTesting Data EDA:")
test_class_names, test_num_samples = dataset_info(test_dir)
plot_class_distribution(test_num_samples)
visualize_sample_images(test_dir, test_class_names, num_images_per_class)

In [None]:
# Generated data info and EDA:
# per-class counts, class-distribution bar chart and random sample grid for
# the generated images. The header starts with "\n" so a blank line separates
# it from earlier output, matching the test-data header.
print("\nGenerated Data EDA:")
gen_class_names, gen_num_samples = dataset_info(gen_dir)
plot_class_distribution(gen_num_samples)
visualize_sample_images(gen_dir, gen_class_names, num_images_per_class)

# Without any data augmentation


# Modeling 

In [None]:
# Hyperparameters shared by the data generators and the training loop.
img_size = 48 # original size of the image; used as both height and width of the generators' target_size
batch_size = 16  # number of images per batch produced by each data generator
epochs = 60  # upper bound on training epochs; early stopping may end a run sooner

In [None]:
# Define ImageDataGenerators for non-augmented and augmented datasets.
# The baseline and the evaluation data are only rescaled; the "augmented" and
# "GAN" variants additionally get small random shifts and horizontal flips.
_augmentation_options = dict(
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)

train_datagen_no_aug = ImageDataGenerator(rescale=1./255)
validation_datagen = ImageDataGenerator(rescale=1./255)
train_datagen_aug = ImageDataGenerator(rescale=1./255, **_augmentation_options)
train_datagen_gan = ImageDataGenerator(rescale=1./255, **_augmentation_options)


# Loading options common to every directory iterator below.
_flow_options = dict(
    target_size=(img_size, img_size),
    batch_size=batch_size,
    color_mode="grayscale",
    class_mode="categorical",
)

# Training iterator over the original training images, without augmentation
train_generator_no_aug = train_datagen_no_aug.flow_from_directory(
    directory=train_dir, **_flow_options
)

# Training iterator over the original training images, with augmentation
train_generator_aug = train_datagen_aug.flow_from_directory(
    directory=train_dir, **_flow_options
)

# Training iterator over the generated images, with augmentation
train_generator_gan = train_datagen_gan.flow_from_directory(
    directory=gen_dir, **_flow_options
)

# Evaluation iterator over the test split; shuffling is disabled so the
# prediction order stays aligned with the iterator's `classes` attribute.
validation_generator = validation_datagen.flow_from_directory(
    directory=test_dir, shuffle=False, **_flow_options
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import clone_model

# Define your model as a function to re-instantiate it fresh for each dataset
def create_model(input_shape=(48, 48, 1), num_classes=7, learning_rate=0.0001):
    """Build and compile a fresh CNN classifier.

    Calling this once per experiment guarantees that no weights are shared
    between runs. The defaults reproduce the original hard-coded setup.

    Args:
        input_shape: ``(height, width, channels)`` of the input images.
            The network halves the spatial size four times (48 -> 3 with
            the default), so very small inputs are not suitable.
        num_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss
        and reporting accuracy.
    """
    model = tf.keras.models.Sequential()

    # Block 1: two convolutions before the first down-sampling.
    model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu', input_shape=input_shape))
    model.add(Conv2D(64, (3,3), padding='same', activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    # Block 2: wider 5x5 kernels.
    model.add(Conv2D(128, (5,5), padding='same', activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    # Blocks 3 and 4: the two widest convolutions carry an L2 weight penalty.
    model.add(Conv2D(512, (3,3), padding='same', activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(512, (3,3), padding='same', activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(BatchNormalization())
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    # Classifier head: two dense layers followed by the softmax output.
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.25))

    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.25))

    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [None]:


# Reset Keras' global state once, before any model of this experiment is built.
tf.keras.backend.clear_session()

# Training variants to compare, as (name, training generator, validation generator).
# All three variants are validated on the same `validation_generator`.
datasets = [
    ("Non-Augmented", train_generator_no_aug, validation_generator),
    ("Augmented", train_generator_aug, validation_generator),
    ("GAN", train_generator_gan, validation_generator)
]

# Create directories to store models and histories (no error if they already exist)
os.makedirs("models", exist_ok=True)
os.makedirs("histories", exist_ok=True)

# Store training histories: one (dataset_name, history dict, weighted F1) tuple per variant
training_histories = []

# Function to compute F1 score and plot confusion matrix
def compute_f1_and_plot_heatmap(model, validation_generator, dataset_name):
    """Evaluate a model: print a classification report and plot the confusion matrix.

    Args:
        model: Trained model exposing ``predict``.
        validation_generator: Directory iterator exposing ``classes`` (true
            integer labels) and ``class_indices`` (class name -> index).
            NOTE(review): it must iterate in a fixed order (``shuffle=False``)
            so that the predictions line up with ``classes``.
        dataset_name: Name of the training variant, used in the plot title.

    Returns:
        The weighted F1 score of the predictions.
    """
    # Get true labels and the class names ordered by their integer index,
    # so that position i in `class_names` names label i.
    true_labels = validation_generator.classes
    class_names = [
        name
        for name, _ in sorted(
            validation_generator.class_indices.items(), key=lambda item: item[1]
        )
    ]
    label_ids = list(range(len(class_names)))

    # Predict on the validation data
    predictions = model.predict(validation_generator, verbose=1)
    predicted_classes = np.argmax(predictions, axis=1)

    # Calculate F1 score
    f1 = f1_score(true_labels, predicted_classes, average='weighted')
    print("\nClassification Report:\n")
    # Passing `labels` explicitly keeps the report aligned with `class_names`
    # even when a class appears in neither the true nor the predicted labels;
    # without it scikit-learn infers the label set from the data and the
    # number of names no longer matches.
    print(
        classification_report(
            true_labels,
            predicted_classes,
            labels=label_ids,
            target_names=class_names,
        )
    )
    print(f"\nWeighted F1 Score: {f1}\n")

    # Plot confusion matrix as heatmap; `labels` fixes its size to one
    # row/column per class so the tick labels always match.
    cm = confusion_matrix(true_labels, predicted_classes, labels=label_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f"Confusion Matrix for {dataset_name}")
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    return f1

# Loop through datasets to train on each one
# Train one fresh model per dataset variant, then save and evaluate it.
for dataset_name, train_generator, validation_generator in datasets:
    print(f"Training on {dataset_name} dataset...")

    # Create a fresh model instance so no weights carry over between variants
    model = create_model()

    # Stop once val_loss has not improved for 5 epochs and roll back to the
    # best weights seen.
    # NOTE(review): the validation generator reads the test split, so the
    # stopping point is chosen on the same data the final scores are computed on.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model on the current dataset. `batch_size` is deliberately not
    # passed: the generators already yield batches, and Keras documents that
    # the argument must not be specified for generator/dataset input.
    history = model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=epochs,
        callbacks=[early_stopping],
        verbose=1
    )

    # Save the model
    model.save(f"models/{dataset_name}_model.h5")

    # Save the training history. Values are cast to built-in floats so that
    # json can serialise them even if Keras hands back NumPy scalars.
    serializable_history = {
        metric: [float(value) for value in values]
        for metric, values in history.history.items()
    }
    with open(f"histories/{dataset_name}_history.json", 'w') as f:
        json.dump(serializable_history, f)

    # Compute F1 score and plot confusion matrix
    f1 = compute_f1_and_plot_heatmap(model, validation_generator, dataset_name)

    # Append history and F1 score for comparison
    training_histories.append((dataset_name, history.history, f1))


In [None]:
# Summarise the weighted F1 score reached by each training variant.
for dataset_name, _, f1 in training_histories:
    print(f"{dataset_name} Dataset - Weighted F1 Score: {f1}")

# One figure per variant: accuracy curves on the left, loss curves on the right.
for dataset_name, history, _ in training_histories:
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    accuracy_axis, loss_axis = ax[0], ax[1]

    # Left panel: training vs validation accuracy per epoch
    accuracy_axis.plot(history['accuracy'], label='Train Accuracy')
    accuracy_axis.plot(history['val_accuracy'], label='Validation Accuracy')
    accuracy_axis.set_title(f'{dataset_name} - Training Accuracy vs Validation Accuracy')
    accuracy_axis.set_xlabel('Epoch')
    accuracy_axis.set_ylabel('Accuracy')
    accuracy_axis.legend(loc='upper left')

    # Right panel: training vs validation loss per epoch
    loss_axis.plot(history['loss'], label='Train Loss')
    loss_axis.plot(history['val_loss'], label='Validation Loss')
    loss_axis.set_title(f'{dataset_name} - Training Loss vs Validation Loss')
    loss_axis.set_xlabel('Epoch')
    loss_axis.set_ylabel('Loss')
    loss_axis.legend(loc='upper left')

    plt.show()

# GAN