instalation and imports

## Przygotowanie danych

In [None]:
!pip install librosa numpy scikit-learn joblib



In [None]:
!pip install datasets librosa numpy pandas tqdm tensorflow



In [None]:
D

Data analisys

In [None]:
# Sistema e warnings
import os
import sys
import warnings

# Dados e manipulação
import numpy as np
import pandas as pd
from tqdm import tqdm

# Áudio e visualização
import librosa
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
Audios = "/kaggle/input/ravdess-emotional-speech-audio"

dir_rav = os.listdir(Audios)

file_emotion = []
file_path = []

for dir_name in dir_rav:
    actor_path = os.path.join(Audios, dir_name)
    for file in os.listdir(actor_path):
        if file.endswith('.wav'):
            parts = file.split('.')[0].split('-')
            emotion_code = int(parts[2])
            file_emotion.append(emotion_code)
            file_path.append(os.path.join(actor_path, file))

emotion = pd.DataFrame(file_emotion, columns=['Emotions'])
path = pd.DataFrame(file_path, columns=['Path'])

data = pd.concat([emotion, path], axis=1)

emotion_map = {
    1: 'neutral',
    2: 'calm',
    3: 'happy',
    4: 'sad',
    5: 'angry',
    6: 'fear',
    7: 'disgust',
    8: 'surprise'
}

data['Emotions'] = data['Emotions'].map(emotion_map)

EmotionData = "/kaggle/working/EmotionData"
os.makedirs(EmotionData, exist_ok=True)

data.to_csv(os.path.join(EmotionData, "data.csv"), index=False)

data.head()

In [None]:
sns.set_theme(context='notebook', style='darkgrid', palette='mako', font='sans-serif', font_scale=1, color_codes=True, rc=None)
plt.figure(figsize=(14, 8))
sns.countplot(x='Emotions', data=data, order=data['Emotions'].value_counts().index)
plt.title('Count of Emotions', size=16)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
sns.despine(top=True, right=True)
plt.savefig(os.path.join(EmotionData, "emotions_plot.png"))
plt.show()

In [None]:
def create_spectrogram(data, sr, e):
    X = librosa.stft(data)
    Xdb = librosa.amplitude_to_db(abs(X))
    plt.figure(figsize=(12, 3))
    plt.title('Spectrogram for audio with {} emotion'.format(e), size=15)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()

In [None]:
def create_spectrogram_grid(data, emotions_list):
    fig, axes = plt.subplots(4, 2, figsize=(12, 12))
    fig.suptitle("spectrograms by Emotion", fontsize=18)
    axes = axes.flatten()

    for idx, emotion in enumerate(emotions_list[:8]):
        try:
            path = np.array(data.Path[data.Emotions == emotion])[0]
            audio_data, sr = librosa.load(path, duration=2.5, offset=0.6)
            stft = librosa.stft(audio_data)
            db = librosa.amplitude_to_db(np.abs(stft))

            librosa.display.specshow(db, sr=sr, x_axis='time', y_axis='hz', ax=axes[idx])
            axes[idx].set_title(emotion.capitalize())
            axes[idx].label_outer()
        except Exception as e:
            axes[idx].set_visible(False)
            print(f"Erro com emoção '{emotion}': {e}")

    for i in range(len(emotions_list), len(axes)):
        axes[i].set_visible(False)


    fig.tight_layout(rect=[0, 0.03, 1, 0.95])

    plt.show()

emotions = data.Emotions.unique()[:8]
create_spectrogram_grid(data, emotions)

## Approach 1 - Custom Model

To solve the task of classifying emotions based on the speech signal, three different architectural approaches to modeling were designed and compared: a convolutional neural network (CNN), a hybrid model combining a CNN with a bidirectional LSTM network (CNN+BiLSTM), and an ensemble model combining the predictions of two earlier models.

### 1. Feature Extraction and Augmentation for Audio Classification
This section prepares audio files for use in classification models by extracting meaningful, hand-crafted features that capture the temporal, spectral, and perceptual characteristics of sound. These features are essential for enabling machine learning models to distinguish between audio classes such as speech commands, music genres, or environmental sounds.


The extratced key features are:
- **Zero-Crossing Rate (ZCR):** detects signal changes — useful for distinguishing voiced/unvoiced sounds.
- **Root Mean Square Energy (RMSE):** measures energy — helps detect sound intensity and silence.
- **MFCCs (Mel-Frequency Cepstral Coefficients):** represent audio timbre — essential for capturing human-perceived sound.
- **Chroma Features:** encode pitch class information — useful for music and tonal analysis.

Each audio file is processed in 4 ways:
1. Original
2. With Gaussian noise
3. Pitch-shifted
4. Noise + pitch

These augmentations increase dataset diversity and improve model robustness.


In [None]:
import os
import glob
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from joblib import dump, load

# Configuration
SR = 22050
FRAME_LENGTH = 2048
HOP_LENGTH = 512
CLIP_DURATION = 2.5  # in seconds
OFFSET = 0.6
NOISE_LEVEL = 0.005
PITCH_FACTOR = 0.7
N_MFCC = 13
N_CHROMA = 12

# Augmentation
def add_gaussian_noise(audio, noise_level=NOISE_LEVEL):
    noise = np.random.normal(0, 1, len(audio))
    audio_noise = audio + noise_level * noise
    return np.clip(audio_noise, -1.0, 1.0)

def apply_pitch_shift(audio, sr=SR, pitch_factor=PITCH_FACTOR):
    return librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=pitch_factor)

def extract_features(y, sr):
    # Trim and pad the audio to 2.5 seconds
    desired_length = int(CLIP_DURATION * sr)
    if len(y) > desired_length:
        y = y[:desired_length]
    else:
        y = np.pad(y, (0, max(0, desired_length - len(y))))

    zcr = librosa.feature.zero_crossing_rate(y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH).squeeze()
    rmse = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH).squeeze()
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, hop_length=HOP_LENGTH)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=HOP_LENGTH, n_chroma=N_CHROMA)

    # Flatten and concatenate
    feature_vector = np.concatenate([
        zcr.flatten(),
        rmse.flatten(),
        mfcc.flatten(),
        chroma.flatten()
    ])
    return feature_vector

def process_file(file_path):
    y, _ = librosa.load(file_path, sr=SR, offset=OFFSET)
    features = []

    # Original audio (O.A.)
    features.append(extract_features(y, SR))

    # Noise-augmented (N.A.)
    features.append(extract_features(add_gaussian_noise(y), SR))

    # Pitch-shifted (P.A.)
    features.append(extract_features(apply_pitch_shift(y), SR))

    # Noise + Pitch (N.A. + P.A.)
    features.append(extract_features(apply_pitch_shift(add_gaussian_noise(y)), SR))

    return np.array(features)

def load_dataset(audio_dir, label_fn):
    all_features = []
    all_labels = []

    for audio_file in glob.glob(os.path.join(audio_dir, "*.wav")):
        label = label_fn(audio_file)  # user-defined function
        file_feats = process_file(audio_file)
        for feat in file_feats:
            all_features.append(feat)
            all_labels.append(label)

    X = np.array(all_features)
    y = np.array(all_labels)

    # Replace NaNs with 0
    X = np.nan_to_num(X)

    # Normalize
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y, scaler

def split_and_save(X, y, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

    dump((X_train, y_train), os.path.join(output_dir, "train.joblib"))
    dump((X_val, y_val), os.path.join(output_dir, "val.joblib"))
    dump((X_test, y_test), os.path.join(output_dir, "test.joblib"))

    print(f"Saved to {output_dir}")

def prepare_dataset(audio_dir, label_fn, output_dir, force_rebuild=False):
    if not force_rebuild and os.path.exists(os.path.join(output_dir, "train.joblib")):
        print(f"Using cached data from {output_dir}")
        return
    # print("Number of audio files:", len(audio_dir))
    # print("Number of extracted features:", len(audio_dir))
    os.makedirs(output_dir, exist_ok=True)
    X, y, scaler = load_dataset(audio_dir, label_fn)
    dump(scaler, os.path.join(output_dir, "scaler.joblib"))
    split_and_save(X, y, output_dir)


### 2. Datasets preparation

##### EmoDB Dataset

In [None]:
import zipfile
import os
import shutil
from pathlib import Path

# Emotions mapping
emodb_emotion_map = {
    "W": "anger",
    "L": "boredom",
    "E": "disgust",
    "A": "fear",
    "F": "happiness",
    "T": "sadness",
    "N": "neutral"
}

# Label base on filename
def emodb_label_fn(filename):
    emotion_code = Path(filename).stem[5]  # 6. znak
    return emodb_emotion_map.get(emotion_code, "unknown")

# Process data from zip file
def process_emodb_zip(zip_path):
    extract_dir = "emodb_extracted"

    # Delete if exists
    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # Data exsists inside wav folder in extracted directory
    wav_dir = os.path.join(extract_dir, "wav")

    if not os.path.exists(wav_dir):
        raise FileNotFoundError(f"Expected folder 'wav' not found in {extract_dir}")

    # Cal preparation function
    prepare_dataset(wav_dir, emodb_label_fn, "processed/emodb")


In [None]:
process_emodb_zip("/content/archive (2).zip")


Saved to processed/emodb


In [None]:
import shutil
shutil.make_archive("emodb_data", 'zip', "processed/emodb")


'/content/emodb_data.zip'

In [None]:
## We download prepared data in case the session will disconnect - we can import it instead of redoing preparation
from google.colab import files
files.download("emodb_data.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### Ravdess dataset

In [None]:
import os
import zipfile
from pathlib import Path
import shutil

def process_ravdess_zip(zip_path, extract_to="ravdess_extracted", output_dir="processed/ravdess", force_rebuild=False):
    """
    Extract RAVDESS zip, filter for vocal_channel=01, and run prepare_dataset.

    Args:
        zip_path (str): Path to the RAVDESS zip file.
        extract_to (str): Temporary folder to extract contents.
        output_dir (str): Where to store processed data.
        force_rebuild (bool): Force reprocessing even if cache exists.
    """
    # 1. Extract zip
    print(f"Extracting {zip_path} to {extract_to}...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # 2. Filter for vocal_channel == 01
    print("Filtering for vocal_channel = 01...")
    extracted_path = Path(extract_to)
    valid_audio_files = []
    for wav_path in extracted_path.rglob("*.wav"):
        filename = wav_path.stem  # e.g. 03-01-01-01-01-01-01
        parts = filename.split("-")
        if len(parts) >= 3:
            vocal_channel = parts[1]
            if vocal_channel == "01":
                valid_audio_files.append(str(wav_path))

    if not valid_audio_files:
        raise ValueError("No valid audio files found with vocal_channel=01")

    # 3. Create a temporary directory with only the filtered audio
    filtered_audio_dir = "ravdess_filtered"
    if os.path.exists(filtered_audio_dir):
        shutil.rmtree(filtered_audio_dir)
    os.makedirs(filtered_audio_dir)

    for path in valid_audio_files:
        target = os.path.join(filtered_audio_dir, os.path.basename(path))
        shutil.copy(path, target)

    # 4. Define the label extraction function
    def ravdess_label_fn(filename):
        # example: 03-01-01-01-01-01-01.wav
        parts = Path(filename).stem.split("-")
        if len(parts) >= 3:
            emotion_id = int(parts[2])
            return emotion_id
        else:
            raise ValueError(f"Filename format not recognized: {filename}")

    # 5. Call prepare_dataset
    prepare_dataset(filtered_audio_dir, ravdess_label_fn, output_dir, force_rebuild=force_rebuild)

    print("RAVDESS data prepared successfully.")


In [None]:
process_ravdess_zip("/content/archive (1).zip")

In [None]:
import shutil
shutil.make_archive("ravdess_data", 'zip', "processed/ravdess")


In [None]:
from google.colab import files
files.download("ravdess_data.zip")


#### Model Architectures for Emotion Classification

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv1D, MaxPooling1D, BatchNormalization,
    Dropout, Flatten, Dense, Bidirectional, LSTM, Average
)
from tensorflow.keras.models import Model


##### **1. CNN Model**
A stack of 1D convolutional layers extracts local patterns in the audio features. It includes:
- Conv1D + BatchNorm + MaxPooling layers
- Dropout layers to prevent overfitting
- Final Dense layer with softmax for given class prediction

This baseline architecture relies exclusively on one-dimensional convolutional layers (`Conv1D`) to extract local acoustic patterns from the input signal.  This configuration is particularly effective at capturing frequency-localized features, which are essential for emotion-related prosodic and timbral cues. The resulting feature maps are flattened and passed through a dense output layer with a `softmax` activation function to produce a probability distribution over the target emotion classes.

In [None]:
def build_cnn_model(input_shape=(7344, 1), num_classes=10):
    inputs = Input(shape=input_shape)

    x = Conv1D(128, kernel_size=3, padding='same', activation='relu')(inputs)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)

    x = Conv1D(128, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)

    x = Conv1D(64, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)

    x = Conv1D(64, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)

    x = Conv1D(32, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)

    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    return Model(inputs, outputs, name="CNN_Model")


##### **2. CNN + BiLSTM Model**
To incorporate long-range temporal dependencies often present in emotional speech, the CNN architecture is extended with a `Bidirectional LSTM` layer. This recurrent component is inserted after the initial convolutional blocks and is capable of learning sequential patterns in both forward and backward temporal directions. This hybrid setup allows the model to preserve the advantages of convolutional feature extraction while also modeling the dynamic evolution of emotional expression over time. The remaining structure — including additional convolutional layers, dropout, and the final dense classifier — mirrors the CNN baseline.

In [None]:
def build_cnn_bilstm_model(input_shape=(7344, 1), num_classes=7):
    inputs = Input(shape=input_shape)

    x = Conv1D(128, kernel_size=3, padding='same', activation='relu')(inputs)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)

    x = Conv1D(128, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)

    x = Conv1D(64, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)

    x = Conv1D(64, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)

    x = Bidirectional(LSTM(64, return_sequences=True))(x)

    x = Conv1D(32, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)

    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    return Model(inputs, outputs, name="CNN_BiLSTM_Model")


##### **3. Ensemble Model (CNN + CNN-BiLSTM):**  
The third architecture employs an ensemble approach by combining the outputs of the CNN and CNN+BiLSTM models. Both subnetworks process the same input in parallel, and their prediction vectors are fused using an `Average` layer. This strategy aims to harness the complementary strengths of the individual models — CNN's ability to capture fine-grained spectral features and BiLSTM's capacity for modeling temporal context — to yield a more stable and potentially more accurate classification output.

In [None]:
def build_ensemble_model(input_shape=(7344, 1), num_classes=7):
    input_layer = Input(shape=input_shape)

    cnn_model = build_cnn_model(input_shape, num_classes)
    bilstm_model = build_cnn_bilstm_model(input_shape, num_classes)

    cnn_output = cnn_model(input_layer)
    bilstm_output = bilstm_model(input_layer)

    merged_output = Average()([cnn_output, bilstm_output])

    return Model(inputs=input_layer, outputs=merged_output, name="Ensemble_Model")


In [None]:
model = build_ensemble_model()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


Training loop that will be reused by each of the datasets.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

import numpy as np

from tensorflow.keras.optimizers import Adam

def train_and_evaluate(model, model_name, X_train, y_train, X_val, y_val, X_test, y_test,
                       learning_rate=0.0001, batch_size=64, epochs=50):
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

    y_pred = model.predict(X_test)
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)

    acc = accuracy_score(y_true, y_pred_labels)
    prec = precision_score(y_true, y_pred_labels, average='weighted')
    rec = recall_score(y_true, y_pred_labels, average='weighted')
    f1 = f1_score(y_true, y_pred_labels, average='weighted')

    try:
        auc_roc = roc_auc_score(y_test, y_pred, average='weighted', multi_class='ovr')
    except ValueError:
        auc_roc = None

    try:
        auc_prc = average_precision_score(y_test, y_pred, average='weighted')
    except ValueError:
        auc_prc = None

    return {
        'model_name': model_name,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'auc_roc': auc_roc,
        'auc_prc': auc_prc
    }



#### **Approach 1 for EmoDB dataset**

1. Read and transform the data

In [None]:
from joblib import load

X_train, y_train = load("processed/emodb/train.joblib")
X_val, y_val = load("processed/emodb/val.joblib")
X_test, y_test = load("processed/emodb/test.joblib")

scaler = load("processed/emodb/scaler.joblib")


In [None]:
print("Shape before reshape:", X_train.shape)
X_train = np.expand_dims(X_train, axis=-1)
print("Shape after reshape:", X_train.shape)


Shape before reshape: (1712, 2916)
Shape after reshape: (1712, 2916, 1)


In [None]:
# X_train = np.expand_dims(X_train, axis=-1) # comment because above we checked the size after the reshaping
X_val = np.expand_dims(X_val, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)


In [None]:
input_shape = X_train.shape[1:]

if len(y_train.shape) == 1:
    num_classes = len(np.unique(y_train))
else:
    num_classes = y_train.shape[1]
print(num_classes)

7


In [None]:
print(input_shape)

(2916, 1)


2. Encode labels

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode labels (there were strings before)

le = LabelEncoder()
le.fit(y_train)

y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)


num_classes = len(le.classes_)

y_train_cat = to_categorical(y_train_enc, num_classes)
y_val_cat = to_categorical(y_val_enc, num_classes)
y_test_cat = to_categorical(y_test_enc, num_classes)


3. Training loop

In [None]:
import pandas as pd
results = []

# CNN model
cnn_model = build_cnn_model(input_shape=input_shape, num_classes=num_classes)
results.append(train_and_evaluate(cnn_model, "CNN_Model", X_train, y_train_cat, X_val, y_val_cat, X_test, y_test_cat))

# CNN+BiLSTM model
cnn_bilstm_model = build_cnn_bilstm_model(input_shape=input_shape, num_classes=num_classes)
results.append(train_and_evaluate(cnn_bilstm_model, "CNN_BiLSTM_Model",  X_train, y_train_cat, X_val, y_val_cat, X_test, y_test_cat))

# Ensemble model
ensemble_model = build_ensemble_model(input_shape=input_shape, num_classes=num_classes)
results.append(train_and_evaluate(ensemble_model, "Ensemble_Model",  X_train, y_train_cat, X_val, y_val_cat, X_test, y_test_cat))


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 2s/step - accuracy: 0.1422 - loss: 2.4933 - val_accuracy: 0.1589 - val_loss: 2.0228
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - accuracy: 0.3096 - loss: 1.9688 - val_accuracy: 0.1215 - val_loss: 2.3106
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - accuracy: 0.4271 - loss: 1.5500 - val_accuracy: 0.1262 - val_loss: 2.6813
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - accuracy: 0.4587 - loss: 1.4362 - val_accuracy: 0.1776 - val_loss: 3.0106
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2s/step - accuracy: 0.5491 - loss: 1.2598 - val_accuracy: 0.1589 - val_loss: 3.1951
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - accuracy: 0.5988 - loss: 1.0802 - val_accuracy: 0.1542 - val_loss: 3.3955
Epoch 7/50
[1m27/27[0m [32m━━━━━━━━━━



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 208ms/step
Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 3s/step - accuracy: 0.1753 - loss: 2.4609 - val_accuracy: 0.1168 - val_loss: 1.9503
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3s/step - accuracy: 0.3270 - loss: 1.9067 - val_accuracy: 0.1542 - val_loss: 1.9590
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 3s/step - accuracy: 0.4293 - loss: 1.5604 - val_accuracy: 0.1542 - val_loss: 2.0174
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 3s/step - accuracy: 0.5193 - loss: 1.3080 - val_accuracy: 0.1542 - val_loss: 2.1109
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 3s/step - accuracy: 0.5658 - loss: 1.1585 - val_accuracy: 0.1542 - val_loss: 2.1934
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 3s/step - accuracy: 0.6103 - loss: 1.0151 - val_acc

4. Save results

In [None]:
df_results = pd.DataFrame(results)
df_results.to_csv("model_metrics_emodb_v2.csv", index=False)

In [None]:
from google.colab import files
files.download("model_metrics_emodb_v2.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##### **Analisys of the results**
**Learning Behavior**

For each of the models the training dynamics over the 50 epochs shows that they began with low accuracy and high loss, but steadily improved as training progressed. Key stages:

1. Initial Phase (Epochs 1–10):

  - Training accuracy was climbing from ~15% to ~70%, while validation accuracy remained low (around 15–18%).

  - The training loss dropped from ~2.49 to ~0.75, indicating the model was learning the training distribution.

  - However, the high validation loss and plateaued validation accuracy pointed to potential overfitting or early instability.

2. Middle Phase (Epochs 11–25):
  - Validation loss consistently decreased, showing that the model was beginning to generalize better.

  - This phase marks a transition from underfitting to generalization.

3. Final Phase (Epochs 26–50):

  - Training accuracy reached >92%, while validation accuracy approached 90%.

  - Validation loss plateaued around ~0.29–0.31, suggesting stable convergence.

  - Training remained stable, and the learning rate of 0.0001 seems appropriate for this phase of training.

  **Comparision of models**
- The CNN model started with very low accuracy but quickly learned to capture important features from the audio data. We observed that once it surpassed 90% accuracy, the results began to stabilize. An important observation is that not only did the accuracy increase, but the loss decreased as well, which may suggest that the model is not overfitting. Final metrics: Accuracy: 0.9693, Loss: 0.1487, Validation Accuracy: 0.9486, Validation Loss: 0.1889.
- The CNN + BiLSTM Model achieved better performance from the very beginning. It effectively captured important acoustic features and continuously improved its results. One notable observation is the large gap between training and validation accuracy. Although the training accuracy was higher than that of the CNN model, the validation accuracy was lower. Final best metrics: Accuracy: 0.9686, loss: 0.1350, Validation Accuracy: 0.9206, Validation Loss: 0.2105
- The ensemble model achieved the best results of all three. By combining the strengths of both approaches—CNNs for effectively capturing frequency-localized features and BiLSTMs for learning sequential patterns in both temporal directions—it produced superior performance. Final best results were 98% accuracy on training and 95.80 accuracy on test set with loss of 0.18.

#### **Approach 1 for Ravdness Dataset**


1. Read and transform data

In [None]:
import os
import zipfile
from pathlib import Path
import shutil

zip_path = "/content/ravdess_data.zip"
extract_to = "processed/ravdess"
output_dir = "processed/ravdess"
print(f"Extracting {zip_path} to {extract_to}...")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
     zip_ref.extractall(extract_to)

Extracting /content/ravdess_data.zip to processed/ravdess...


In [None]:
from joblib import load

X_train, y_train = load("processed/ravdess/train.joblib")
X_val, y_val = load("processed/ravdess/val.joblib")
X_test, y_test = load("processed/ravdess/test.joblib")

scaler = load("processed/ravdess/scaler.joblib")


In [None]:
print(X_train.shape)

(4608, 2916)


In [None]:
import numpy as np
print("Shape before reshape:", X_train.shape)
X_train = np.expand_dims(X_train, axis=-1)
print("Shape after reshape:", X_train.shape)

Shape before reshape: (4608, 2916)
Shape after reshape: (4608, 2916, 1)


In [None]:
# X_train = np.expand_dims(X_train, axis=-1)
X_val = np.expand_dims(X_val, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

In [None]:
input_shape = X_train.shape[1:]  # np. (7344, 1)

if len(y_train.shape) == 1:
    num_classes = len(np.unique(y_train))
else:
    num_classes = y_train.shape[1]
print(num_classes)

8


2. Encode labels

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


le = LabelEncoder()
le.fit(y_train)

y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)

num_classes = len(le.classes_)

y_train_cat = to_categorical(y_train_enc, num_classes)
y_val_cat = to_categorical(y_val_enc, num_classes)
y_test_cat = to_categorical(y_test_enc, num_classes)

3. Training loop

In [None]:
results = []

# CNN model
cnn_model = build_cnn_model(input_shape=input_shape, num_classes=num_classes)
results.append(train_and_evaluate(cnn_model, "CNN_Model", X_train, y_train_cat, X_val, y_val_cat, X_test, y_test_cat))

# CNN+BiLSTM model
cnn_bilstm_model = build_cnn_bilstm_model(input_shape=input_shape, num_classes=num_classes)
results.append(train_and_evaluate(cnn_bilstm_model, "CNN_BiLSTM_Model",  X_train, y_train_cat, X_val, y_val_cat, X_test, y_test_cat))

# Ensemble model
ensemble_model = build_ensemble_model(input_shape=input_shape, num_classes=num_classes)
results.append(train_and_evaluate(ensemble_model, "Ensemble_Model",  X_train, y_train_cat, X_val, y_val_cat, X_test, y_test_cat))


Epoch 1/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 71ms/step - accuracy: 0.1497 - loss: 2.6381 - val_accuracy: 0.1337 - val_loss: 2.4597
Epoch 2/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 50ms/step - accuracy: 0.2803 - loss: 2.0638 - val_accuracy: 0.1354 - val_loss: 3.0067
Epoch 3/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 48ms/step - accuracy: 0.3374 - loss: 1.8677 - val_accuracy: 0.1840 - val_loss: 3.3912
Epoch 4/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 49ms/step - accuracy: 0.4056 - loss: 1.6489 - val_accuracy: 0.2066 - val_loss: 3.3524
Epoch 5/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 48ms/step - accuracy: 0.4420 - loss: 1.5277 - val_accuracy: 0.2170 - val_loss: 3.1157
Epoch 6/50
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 48ms/step - accuracy: 0.4748 - loss: 1.4145 - val_accuracy: 0.2465 - val_loss: 2.7521
Epoch 7/50
[1m72/72[0m [32m━━

4. Save results

In [None]:
df_results = pd.DataFrame(results)
df_results.to_csv("model_metrics_ravdess.csv", index=False)

In [None]:
from google.colab import files
files.download("model_metrics_ravdess.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##### **Analisys of the results**
**Learning behavior**

1. Initial Learning Phase (Epochs 1–6):
- Training accuracy grew from 15% to ~47%, while validation accuracy remained low (~13–25%).

- Validation loss initially increased (overfitting warning) before beginning to decrease from epoch 6.

Suggests the model needed several epochs to begin generalizing beyond the training data.

2. Transition Phase (Epochs 7–15):
- Marked improvement in validation performance:

- Training accuracy kept increasing, suggesting effective learning with minimal overfitting.

- The model started to converge around epoch 15.

3. Stable Convergence Phase (Epochs 16–30):

- Validation loss stabilized around 0.65–0.75, indicating a well-generalizing model.

- Training accuracy also grew consistently, surpassing 80%.

Clear improvement in generalization and reduced gap between training and validation metrics.

4. Final Optimization Phase (Epochs 31–43+):
- Strong performance: Validation accuracy plateaued around 90%, but model may be approaching its optimal performance ceiling, What is more, slight signs of overfitting have started to re-emerge post-epoch 42 (e.g., val_loss stabilizing/increasing slightly while training accuracy continues rising).

**Comparision of models**

- CNN model - The training process for the CNN model took longer, and the improvements in accuracy were less pronounced compared to the previous dataset. Despite a gradual increase in accuracy, the loss remained relatively high throughout training. The final performance metrics were:
Training Accuracy: 0.9073 | Training Loss: 0.3180
Validation Accuracy: 0.8438 | Validation Loss: 0.4609

- The CNN + BiLSTM Model - This model exhibited a slower start than the baseline CNN, with low validation accuracy and high validation loss during the early epochs, indicating initial difficulties in generalization. However, from around epoch 8 onward, the model began to show steady improvements. Nevertheless, it was unable to reduce the training loss below 0.3 or the validation loss below 0.5. While a validation accuracy of approximately 82% is a solid result, there remains room for improvement. The best recorded metrics were:
Training Accuracy: 0.9091 | Training Loss: 0.3057
Validation Accuracy: 0.8229 | Validation Loss: 0.5349
- Ensemble model - In this case, combining multiple approaches did not lead to a significant improvement in performance on the given dataset. The ensemble model's results remained comparable to those of the individual models, with validation accuracy plateauing around 80%. The final metrics were:
Training Accuracy: 0.9195 | Training Loss: 0.3401
Validation Accuracy: 0.8264 | Validation Loss: 0.6194

## Approach 2 - Fine tuning wav2vec
To improve the performance of emotion classification from speech signals, a transfer learning approach was tested using the Wav2Vec2 model, which is one of the most advanced architectures in the field of audio signal processing. Wav2Vec2 has been trained on huge collections of raw speech data, learning unsupervised acoustic representations. Such representations are then fine-tuned (fine-tuning) for a specific supervised task - in this case, emotion classification.

In [None]:
from sklearn.model_selection import train_test_split
import torch

train_test = dataset.train_test_split(test_size=0.2)
train_ds = train_test['train']
test_ds = train_test['test']

from transformers import DataCollatorWithPadding
from transformers import Trainer

class DataCollatorCTCWithPadding:
    def __init__(self, processor, padding=True):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        # features is a list of dicts with keys: input_values, attention_mask, label
        input_features = [{"input_values": f["input_values"], "attention_mask": f["attention_mask"]} for f in features]
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt"
        )
        # add labels as tensor (not padded, because labels are scalars)
        batch["labels"] = torch.tensor([f["label"] for f in features], dtype=torch.long)
        return batch

# Then, instantiate and pass this as data_collator:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./wav2vec2-checkpoints",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    num_train_epochs=10,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

def compute_metrics(pred):
    from sklearn.metrics import accuracy_score, f1_score
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(pred.label_ids, preds),
        "f1": f1_score(pred.label_ids, preds, average="weighted")
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


#### Approach 2 for Emodb

**1. Dataset prepartion**

In [None]:
import zipfile
import os

# Unzip file
zip_path = "/content/archive (2).zip"  # change this path
extract_path = "/content/emodb"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [None]:
import pandas as pd
import glob
import os

# Map German emotion codes to English labels (example mapping, adjust if needed)
emotion_map = {
    "W": "anger",     # "Wut"
    "L": "boredom",   # "Langeweile"
    "E": "disgust",   # "Ekel"
    "A": "fear",      # "Angst"
    "F": "happiness", # "Freude"
    "T": "sadness",   # "Traurigkeit"
    "N": "neutral"    # "neutral"
}

# Find all wav files
wav_files = glob.glob(os.path.join(extract_path, "wav", "*.wav"))

data = []
for file_path in wav_files:
    filename = os.path.basename(file_path)
    speaker_id = filename[0:2]
    text_code = filename[2:5]
    emotion_code = filename[5].upper()
    version = filename[6] if len(filename) > 6 else "a"
    emotion_label = emotion_map.get(emotion_code, "unknown")

    if emotion_label == "unknown":
        continue  # skip bad files

    data.append({
        "path": file_path,
        "label": emotion_label,
        "speaker": speaker_id,
        "text_code": text_code,
        "version": version
    })

df = pd.DataFrame(data)
df.head()


Unnamed: 0,path,label,speaker,text_code,version
0,/content/emodb/wav/10b03Wb.wav,anger,10,b03,b
1,/content/emodb/wav/16b02Aa.wav,fear,16,b02,a
2,/content/emodb/wav/09b09Wa.wav,anger,9,b09,a
3,/content/emodb/wav/09a05Wc.wav,anger,9,a05,c
4,/content/emodb/wav/08a05Fe.wav,happiness,8,a05,e


**2. Enocde labels and transform files to audio**

In [None]:
from datasets import Dataset, Audio
from transformers import Wav2Vec2Processor
from datasets import ClassLabel

# Encode label into integer
unique_labels = sorted(df["label"].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

df["label_id"] = df["label"].map(label2id)

# Hugging Face Dataset
ds = Dataset.from_pandas(df[["path", "label_id"]])
ds = ds.rename_column("path", "audio")
ds = ds.rename_column("label_id", "label")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))



processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
processor.feature_extractor.return_attention_mask = True

def preprocess_batch(batch):
    # Extract the list of raw audio arrays from each dict
    audio_arrays = [audio["array"] for audio in batch["audio"]]
    sampling_rate = batch["audio"][0]["sampling_rate"]  # All should be the same

    # Process audio
    inputs = processor(audio_arrays, sampling_rate=sampling_rate, return_tensors="np", padding=True, return_attention_mask=True)

    batch["input_values"] = inputs["input_values"]
    batch["attention_mask"] = inputs["attention_mask"]
    return batch

dataset = ds.map(preprocess_batch, batched=True, batch_size=8, remove_columns=["audio"])
# Then, instantiate and pass this as data_collator:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

**3. Prepare model and train**

In [None]:
from transformers import Wav2Vec2ForSequenceClassification
label2id = {str(k): int(v) for k, v in label2id.items()}
id2label = {int(k): str(v) for k, v in id2label.items()}

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    mask_time_prob=0.05
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.2188,1.214868,0.616822,0.542132
2,0.9642,0.858787,0.813084,0.809547
3,0.5686,0.578729,0.869159,0.866984
4,0.5474,0.56892,0.869159,0.86375
5,0.2349,0.427408,0.906542,0.909813
6,0.1476,0.310134,0.915888,0.912232
7,0.09,0.255241,0.943925,0.943673
8,0.0949,0.238742,0.943925,0.943673
9,0.0672,0.237666,0.943925,0.943673


TrainOutput(global_step=530, training_loss=0.44018169551525477, metrics={'train_runtime': 993.6527, 'train_samples_per_second': 4.307, 'train_steps_per_second': 0.533, 'total_flos': 2.1942160984368294e+17, 'train_loss': 0.44018169551525477, 'epoch': 9.822429906542055})

In [None]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.2552410662174225, 'eval_accuracy': 0.9439252336448598, 'eval_f1': 0.9436733982528375, 'eval_runtime': 14.3502, 'eval_samples_per_second': 7.456, 'eval_steps_per_second': 1.882, 'epoch': 9.822429906542055}


##### Results Analisys

The Wav2Vec 2.0 model was fine-tuned on the EMO-DB dataset across 9 training epochs, and the progression of performance metrics (training loss, validation loss, accuracy, and F1-score) clearly demonstrates strong learning behavior and effective generalization.
The Wav2Vec 2.0 model was fine-tuned on the EMO-DB dataset across 9 training epochs, and the progression of performance metrics (training loss, validation loss, accuracy, and F1-score) clearly demonstrates strong learning behavior and effective generalization. In the final three epochs, the model achieves a stable accuracy of ~94.4% and an F1-score of 0.9437, but the validation loss decreases only marginally in, indicating the model has likely reached its optimal performance on this dataset without signs of overfitting.

#### Approach 2 for Ravdess

**1. Dataset preparation**

In [None]:
import zipfile
import os

# Unzip file
zip_path = "/content/archive (1).zip"  # change this path
extract_path = "/content/ravdess"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [None]:
import os
import glob
import shutil
import pandas as pd
from datasets import Dataset, Audio
from transformers import Wav2Vec2Processor

# ==== STEP 1: SETUP ====
source_dir = "ravdess"  # your main RAVDESS directory
target_dir = "ravdess_audio_only"  # directory where we copy audio-only files
os.makedirs(target_dir, exist_ok=True)

# ==== STEP 2: DEFINE LABEL MAPPING ====
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

# ==== STEP 3: EXTRACT AND FILTER AUDIO FILES ====
data = []

# Find all audio files in nested Actor folders
for actor_folder in glob.glob(os.path.join(source_dir, "Actor*")):
    for file_path in glob.glob(os.path.join(actor_folder, "03-*.wav")):  # Filter by "03-" prefix
        filename = os.path.basename(file_path)
        parts = filename.replace(".wav", "").split("-")

        if len(parts) != 7:
            continue  # Skip badly named files

        modality, channel, emotion_id, intensity, statement, repetition, actor_id = parts

        # Build metadata
        emotion_label = emotion_map.get(emotion_id, "unknown")
        if emotion_label == "unknown":
            continue  # skip unrecognized emotion

        new_path = os.path.join(target_dir, filename)
        shutil.copy(file_path, new_path)

        data.append({
            "path": new_path,
            "label": emotion_label,
            "actor_id": int(actor_id),
            "gender": "male" if int(actor_id) % 2 == 1 else "female"
        })

# ==== STEP 4: CREATE DATAFRAME ====
df = pd.DataFrame(data)

# Encode labels
unique_labels = sorted(df["label"].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
df["label_id"] = df["label"].map(label2id)

# ==== STEP 5: CREATE HF DATASET ====
ds = Dataset.from_pandas(df[["path", "label_id"]])
ds = ds.rename_column("path", "audio")
ds = ds.rename_column("label_id", "label")
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

# ==== STEP 6: LOAD WAV2VEC2 PROCESSOR ====
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
processor.feature_extractor.return_attention_mask = True

def preprocess_batch(batch):
    audio_arrays = [a["array"] for a in batch["audio"]]
    sampling_rate = batch["audio"][0]["sampling_rate"]

    inputs = processor(audio_arrays, sampling_rate=sampling_rate, return_tensors="np",
                       padding=True, return_attention_mask=True)

    batch["input_values"] = inputs["input_values"]
    batch["attention_mask"] = inputs["attention_mask"]
    return batch

# ==== STEP 7: FINAL DATASET ====
dataset = ds.map(preprocess_batch, batched=True, batch_size=8, remove_columns=["audio"])
# Then, instantiate and pass this as data_collator:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

**2. Model initialization and training**

In [None]:
from transformers import Wav2Vec2ForSequenceClassification
label2id = {str(k): int(v) for k, v in label2id.items()}
id2label = {int(k): str(v) for k, v in id2label.items()}

model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    mask_time_prob=0.05
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.7271,1.648248,0.444444,0.369251
2,1.295,1.374118,0.552083,0.506979
3,0.7547,0.815704,0.763889,0.747543
4,0.5432,0.786094,0.767361,0.749375
5,0.4764,0.591048,0.829861,0.829071
6,0.2338,0.574833,0.847222,0.847406
7,0.1082,0.504256,0.881944,0.88111
8,0.0705,0.42817,0.895833,0.895486
9,0.0335,0.496169,0.892361,0.891867
10,0.0485,0.460169,0.892361,0.891861


TrainOutput(global_step=1440, training_loss=0.6219968101216687, metrics={'train_runtime': 2329.9929, 'train_samples_per_second': 4.944, 'train_steps_per_second': 0.618, 'total_flos': 4.73754634500566e+17, 'train_loss': 0.6219968101216687, 'epoch': 10.0})

##### Results analisys
The Wav2Vec 2.0 model was fine-tuned on a Ravdess dataset over 10 epochs, and the metrics reveal a clear learning trajectory and strong performance gains, particularly in the later stages of training.
The model begins with a low accuracy of 44.4% and F1-score of 0.37, indicating limited initial ability to distinguish between classes.
However, by Epoch 3, there is a sharp rise in performance: accuracy reaches 76.4%, and F1-score increases to 0.75.
In the final epochs, the model achieves peak accuracy of 89.6% and F1-score of 0.895 (Epoch 8), followed by consistent results in Epochs 9 and 10.
Interestingly, while training loss continues to drop (down to 0.03), validation loss stabilizes around 0.46–0.50, suggesting slight overfitting may begin but without hurting validation performance significantly.

## Approach 3 - Audio images classification
An increasingly popular method for audio classification involves transforming audio signals into visual representations—specifically, spectrograms—and treating them as images for input into convolutional neural networks (CNNs). This approach leverages the strengths of well-established image classification architectures and transfer learning techniques.

The process begins with converting raw audio waveforms into mel spectrograms, which encode the time-frequency distribution of sound energy using a perceptually meaningful scale. These spectrograms are then visualized and saved as two-dimensional images, typically in grayscale or color, depending on the application.

Once converted to image format, the spectrograms can be processed using standard computer vision pipelines. Pre-trained CNN models such as ResNet, VGG, or EfficientNet can be fine-tuned for the classification task by replacing their final layers with task-specific output units. The network is then trained to distinguish between audio classes based on visual patterns in the spectrograms.

**In my approach**

Raw audio recordings were first converted into 2D mel spectrogram images using the Librosa library. Each audio file was transformed into a 224×224 pixel image, capturing frequency and temporal information. These images were used as inputs to a pre-trained ResNet-18 convolutional neural network, adapted for multi-class classification by modifying the final fully connected layer. The model was trained using cross-entropy loss and optimized with the Adam optimizer over 10 epochs. This image-based method enabled the application of powerful computer vision models to audio classification tasks.

### Approach 3 for Ravdess Dataset

In [4]:
import zipfile
import os

# Unzip file
zip_path = "/content/archive (1).zip"  # change this path
extract_path = "/content/ravdess"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

**1. Dataset preparation**

In [5]:
import os
import shutil
import glob

source_dir = "ravdess"
target_dir = "ravdess_filtered"

os.makedirs(target_dir, exist_ok=True)

for actor_folder in os.listdir(source_dir):
    actor_path = os.path.join(source_dir, actor_folder)
    if not os.path.isdir(actor_path):
        continue

    for file in os.listdir(actor_path):
        if file.startswith("03-"):
            shutil.copy(os.path.join(actor_path, file), os.path.join(target_dir, file))



In [6]:
emotion_map = {
    "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
    "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
}

def get_label_from_filename(filename):
    parts = filename.split("-")
    emotion_code = parts[2]
    return emotion_map[emotion_code]


**2. Convert Audio to Mel-Spectrogram Images**

In [7]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import torch
from torchvision import transforms
from PIL import Image

spectrogram_dir = "spectrograms"
os.makedirs(spectrogram_dir, exist_ok=True)
target_dir = "ravdess_filtered"
def audio_to_melspec_image(file_path, out_path):
    y, sr = librosa.load(file_path, sr=16000)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    S_DB = librosa.power_to_db(S, ref=np.max)

    plt.figure(figsize=(2.24, 2.24), dpi=100)  # 224x224 image
    librosa.display.specshow(S_DB, sr=sr, cmap='viridis')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
    plt.close()

# Convert all audio files
for file in os.listdir(target_dir):
    if file.endswith(".wav"):
        label = get_label_from_filename(file)
        label_folder = os.path.join(spectrogram_dir, label)
        os.makedirs(label_folder, exist_ok=True)
        img_path = os.path.join(label_folder, file.replace(".wav", ".png"))
        audio_to_melspec_image(os.path.join(target_dir, file), img_path)


**3. Create PyTorch Dataset**

In [8]:
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

dataset = ImageFolder(root=spectrogram_dir, transform=transform)
# dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

class_names = dataset.classes
print("Classes:", class_names)

from torch.utils.data import random_split

# 80% train, 20% validation split
val_ratio = 0.2
val_size = int(val_ratio * len(dataset))
train_size = len(dataset) - val_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)



Classes: ['angry', 'calm', 'disgust', 'fearful', 'happy', 'neutral', 'sad', 'surprised']


**4. Train a CNN Model (e.g., ResNet18)**

In [14]:
import torch.nn as nn
import torchvision.models as models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(class_names))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




In [10]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, dataloader):
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return val_loss / len(dataloader), acc, f1


**5. Training Loop**

In [15]:
def train(model, train_loader, val_loader, epochs=10):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        train_acc = correct / total
        val_loss, val_acc, val_f1 = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {running_loss:.4f} | Train Acc: {train_acc:.2f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f} | F1: {val_f1:.2f}")



In [16]:
train(model, train_loader, val_loader, epochs=25)

Epoch 1/25 | Train Loss: 108.2558 | Train Acc: 0.46 | Val Loss: 1.1199 | Val Acc: 0.61 | F1: 0.60
Epoch 2/25 | Train Loss: 41.0849 | Train Acc: 0.85 | Val Loss: 0.9458 | Val Acc: 0.63 | F1: 0.63
Epoch 3/25 | Train Loss: 11.9749 | Train Acc: 0.98 | Val Loss: 0.8107 | Val Acc: 0.71 | F1: 0.71
Epoch 4/25 | Train Loss: 3.7511 | Train Acc: 0.99 | Val Loss: 0.8426 | Val Acc: 0.69 | F1: 0.69
Epoch 5/25 | Train Loss: 2.2506 | Train Acc: 1.00 | Val Loss: 0.8378 | Val Acc: 0.70 | F1: 0.70
Epoch 6/25 | Train Loss: 1.3439 | Train Acc: 1.00 | Val Loss: 0.7711 | Val Acc: 0.73 | F1: 0.73
Epoch 7/25 | Train Loss: 0.8614 | Train Acc: 1.00 | Val Loss: 0.7848 | Val Acc: 0.73 | F1: 0.73
Epoch 8/25 | Train Loss: 0.6087 | Train Acc: 1.00 | Val Loss: 0.7594 | Val Acc: 0.73 | F1: 0.73
Epoch 9/25 | Train Loss: 0.4065 | Train Acc: 1.00 | Val Loss: 0.7484 | Val Acc: 0.74 | F1: 0.74
Epoch 10/25 | Train Loss: 0.4441 | Train Acc: 1.00 | Val Loss: 0.7837 | Val Acc: 0.75 | F1: 0.75
Epoch 11/25 | Train Loss: 0.6212 | 

In [None]:
torch.save(model.state_dict(), "audio_image_model.pth")

#### Results analisys
Early epochs (e.g., Epoch 1–2) started with ~61–63% accuracy and similar F1 scores, but finainal epochs (23–25) reached ~78–79% validation accuracy and F1, showing a strong classification performance. The F1 score, which balances precision and recall, reaching 0.79 is a reliable indicator of effective generalization.

We can also notice a stable Training - after a brief overfitting dip (Epochs 14–16), the model recovered well, showing resilience to noise or overfitting.
The final epochs show consistently low validation loss and high accuracy, suggesting the model has converged. While Train Acc = 1.00 may indicate overfitting, the alignment with high validation metrics (not collapsing or diverging) suggests good model capacity and enough regularization.
To sum up, these results indicate that the classification model based on image representations of Mel spectrograms is capable of learning and generalizing audio emotion features effectively. The final model achieves nearly 80% F1 score, reflecting a well-trained and validated performance level.

### Emodb

In [1]:
import zipfile
import os


# Unzip file
zip_path = "/content/archive (2).zip"  # change this path
extract_path = "/content/emodb"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)


In [2]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Ścieżki
input_dir = "emodb/wav"
output_dir = "emodb_spectrograms"
os.makedirs(output_dir, exist_ok=True)

# Emocje niemieckie
emotion_map_de = {
    'W': 'Wut',
    'L': 'Langeweile',
    'E': 'Ekel',
    'A': 'Angst',
    'F': 'Freude',
    'T': 'Trauer',
    'N': 'Neutral'
}

def get_emotion_label(filename):
    emotion_code = filename[5].upper()
    return emotion_map_de.get(emotion_code, 'Unknown')

def audio_to_melspec_image(file_path, out_path):
    y, sr = librosa.load(file_path, sr=16000)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    S_DB = librosa.power_to_db(S, ref=np.max)

    plt.figure(figsize=(2.24, 2.24), dpi=100)
    librosa.display.specshow(S_DB, sr=sr, cmap='magma')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0)
    plt.close()

# Konwersja wszystkich plików
for fname in os.listdir(input_dir):
    if fname.endswith(".wav"):
        emotion = get_emotion_label(fname)
        if emotion == 'Unknown': continue
        label_dir = os.path.join(output_dir, emotion)
        os.makedirs(label_dir, exist_ok=True)

        out_file = os.path.join(label_dir, fname.replace(".wav", ".png"))
        audio_to_melspec_image(os.path.join(input_dir, fname), out_file)


In [4]:
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

dataset = ImageFolder(root=output_dir, transform=transform)


class_names = dataset.classes
print("Classes:", class_names)

from torch.utils.data import random_split

# 80% train, 20% validation split
val_ratio = 0.2
val_size = int(val_ratio * len(dataset))
train_size = len(dataset) - val_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)




Classes: ['Angst', 'Ekel', 'Freude', 'Langeweile', 'Neutral', 'Trauer', 'Wut']


In [9]:
import torch
import torch.nn as nn
import torchvision.models as models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(class_names))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




In [6]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, dataloader):
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return val_loss / len(dataloader), acc, f1


def train(model, train_loader, val_loader, epochs=10):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        train_acc = correct / total
        val_loss, val_acc, val_f1 = evaluate(model, val_loader)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {running_loss:.4f} | Train Acc: {train_acc:.2f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f} | F1: {val_f1:.2f}")



In [10]:
train(model, train_loader, val_loader, epochs=25)

Epoch 1/25 | Train Loss: 35.8517 | Train Acc: 0.50 | Val Loss: 1.2413 | Val Acc: 0.55 | F1: 0.50
Epoch 2/25 | Train Loss: 9.0222 | Train Acc: 0.95 | Val Loss: 0.8077 | Val Acc: 0.65 | F1: 0.64
Epoch 3/25 | Train Loss: 2.5488 | Train Acc: 1.00 | Val Loss: 0.6920 | Val Acc: 0.70 | F1: 0.69
Epoch 4/25 | Train Loss: 1.1331 | Train Acc: 1.00 | Val Loss: 0.7610 | Val Acc: 0.72 | F1: 0.70
Epoch 5/25 | Train Loss: 0.7208 | Train Acc: 1.00 | Val Loss: 0.6967 | Val Acc: 0.72 | F1: 0.71
Epoch 6/25 | Train Loss: 0.5983 | Train Acc: 1.00 | Val Loss: 0.7230 | Val Acc: 0.74 | F1: 0.73
Epoch 7/25 | Train Loss: 0.4904 | Train Acc: 1.00 | Val Loss: 0.7013 | Val Acc: 0.74 | F1: 0.72
Epoch 8/25 | Train Loss: 0.3594 | Train Acc: 1.00 | Val Loss: 0.7996 | Val Acc: 0.70 | F1: 0.68
Epoch 9/25 | Train Loss: 0.2931 | Train Acc: 1.00 | Val Loss: 0.8518 | Val Acc: 0.71 | F1: 0.69
Epoch 10/25 | Train Loss: 0.4000 | Train Acc: 1.00 | Val Loss: 0.7107 | Val Acc: 0.73 | F1: 0.72
Epoch 11/25 | Train Loss: 0.2121 | Tra

##### results analisys
1. Training and Validation Performance summary
- Training Accuracy: The model reached perfect training accuracy (1.00) by epoch 3 and maintained it throughout the rest of the training.

- Training Loss: The training loss decreased consistently, dropping from 35.85 in epoch 1 to 0.05 in epoch 25, indicating the model fit the training data very well.

- Validation Accuracy: Validation accuracy improved steadily in the early epochs, peaking at 0.76 in epoch 20. However, it plateaued between 0.70 and 0.75 in later epochs, with occasional drops.

- Validation Loss: The validation loss initially decreased, reaching its lowest point around epochs 5–6, but then fluctuated and even increased towards the end of training, with a final value of 0.8642.

- F1 Score: The F1 score followed a similar pattern, increasing quickly in the first few epochs, then stabilizing between 0.69 and 0.75 from epoch 4 onward.

2. Observations
The model learned quickly in the first few epochs, with validation accuracy improving from 0.55 to 0.72 by epoch 4. The sustained perfect training accuracy, alongside fluctuating validation metrics and increasing validation loss after epoch 6, suggests the **model is overfitting** to the training data.
Validation accuracy and F1 score plateaued after epoch 6. Despite continued reductions in training loss, no significant improvement was observed in validation metrics. Several later epochs show increased validation loss and decreased F1 score, indicating inconsistent generalization and sensitivity to validation data characteristics.



#### Introducing techniques to fight with overfitting

As we saw in the previous cells, the model showed sighns of orefitting. To fight with that we will try to introduce some techniques and see if we can avoid that. we will focus on:
- Introduce Regularization: Apply dropout layers and/or L2 regularization to reduce overfitting.

- Early Stopping: Monitor validation loss or F1 score and stop training when no improvement is seen over several epochs.

What more can be done id for exaple Data augmentation, and if we don't perform better this time we will try to use it.

In [11]:
import torch
import torch.nn as nn
import torchvision.models as models
from sklearn.metrics import accuracy_score, f1_score
import copy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Modify ResNet18 with dropout
class ResNetWithDropout(nn.Module):
    def __init__(self, num_classes, dropout_rate=0.3):
        super(ResNetWithDropout, self).__init__()
        self.base_model = models.resnet18(pretrained=True)
        in_features = self.base_model.fc.in_features
        self.base_model.fc = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(in_features, num_classes)
        )

    def forward(self, x):
        return self.base_model(x)

model = ResNetWithDropout(num_classes=len(class_names)).to(device)

# Loss and optimizer with L2 regularization
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)  # L2 regularization

def evaluate(model, dataloader):
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    return val_loss / len(dataloader), acc, f1

# Train function with early stopping
def train(model, train_loader, val_loader, epochs=25, patience=5):
    best_f1 = 0.0
    best_model = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        train_acc = correct / total
        val_loss, val_acc, val_f1 = evaluate(model, val_loader)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {running_loss:.4f} | Train Acc: {train_acc:.2f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f} | F1: {val_f1:.2f}")

        # Early stopping
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}. Best F1: {best_f1:.2f}")
                break

    if best_model:
        model.load_state_dict(best_model)

    return model




In [12]:
train(model, train_loader, val_loader, epochs=25, patience=5)

Epoch 1/25 | Train Loss: 39.7505 | Train Acc: 0.48 | Val Loss: 1.3665 | Val Acc: 0.50 | F1: 0.43
Epoch 2/25 | Train Loss: 12.2917 | Train Acc: 0.90 | Val Loss: 0.8333 | Val Acc: 0.71 | F1: 0.68
Epoch 3/25 | Train Loss: 4.4527 | Train Acc: 0.99 | Val Loss: 0.7526 | Val Acc: 0.73 | F1: 0.71
Epoch 4/25 | Train Loss: 2.1896 | Train Acc: 1.00 | Val Loss: 0.7026 | Val Acc: 0.79 | F1: 0.79
Epoch 5/25 | Train Loss: 1.0214 | Train Acc: 1.00 | Val Loss: 0.7126 | Val Acc: 0.73 | F1: 0.71
Epoch 6/25 | Train Loss: 0.8396 | Train Acc: 1.00 | Val Loss: 0.7082 | Val Acc: 0.76 | F1: 0.74
Epoch 7/25 | Train Loss: 0.5130 | Train Acc: 1.00 | Val Loss: 0.7111 | Val Acc: 0.74 | F1: 0.73
Epoch 8/25 | Train Loss: 0.4066 | Train Acc: 1.00 | Val Loss: 0.7194 | Val Acc: 0.73 | F1: 0.71
Epoch 9/25 | Train Loss: 0.6503 | Train Acc: 1.00 | Val Loss: 0.7835 | Val Acc: 0.75 | F1: 0.73
Early stopping at epoch 9. Best F1: 0.79


ResNetWithDropout(
  (base_model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=Tr

#### results analisys
Dropout regularization and early stopping effectively prevented overfitting, as seen by the stabilization of the validation F1 score after epoch 4 despite continued improvement in training metrics.

Generalization: The peak validation F1-score of 0.79 indicates good generalization to unseen data, especially considering the small gap between training and validation performance in later epochs.

Architecture: The ResNet backbone (up to 4 layers with increasing depth and residual connections) combined with batch normalization, ReLU activations, and a final dropout-regularized fully connected layer supports both stability during training and improved test-time robustness.