# MFCC Feature Table for RAVDESS

This notebook builds a compact MFCC-based feature table for the RAVDESS dataset so it can feed PyCaret experiments and future MLP baselines.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import librosa
import librosa.display  # Optional helper for quick visual checks


def resolve_project_root(marker: str = "data/ravdess_data") -> Path:
    """Locate the nearest directory, at or above the CWD, that contains ``marker``.

    Args:
        marker: Relative path whose existence identifies the project root.

    Returns:
        The first directory, searching from the resolved working directory
        upwards through its parents, for which ``directory / marker`` exists.

    Raises:
        FileNotFoundError: If no directory on that upward path contains ``marker``.
    """
    start = Path.cwd().resolve()
    # Closest match wins: the working directory itself is checked before any parent.
    search_path = (start, *start.parents)
    root = next((folder for folder in search_path if (folder / marker).exists()), None)
    if root is None:
        raise FileNotFoundError(f"Could not find '{marker}' relative to {start}")
    return root


# Anchor every path to the directory that contains the dataset folder, so the
# notebook can be run from the project root or from any directory below it.
PROJECT_ROOT = resolve_project_root()
AUDIO_ROOT = PROJECT_ROOT / "data/ravdess_data"  # searched recursively for .wav clips
OUTPUT_CSV = PROJECT_ROOT / "data/processed/ravdess_mfcc_features.csv"  # feature table written/read by the extraction cell

SAMPLE_RATE = 22050  # passed as `sr` to librosa.load for every clip
N_MFCC = 20  # passed as `n_mfcc` to librosa.feature.mfcc; the column-count check uses 2 + 4 * N_MFCC


## Utility functions

The helpers below parse labels from filenames, compute MFCC-based statistics, and assemble a single feature row per file.

In [2]:
# Two-digit emotion code (as it appears in the filename) -> readable label.
# Labels are listed in code order, so position 1 maps to '01', position 2 to '02', ...
EMOTION_MAP = {
    f"{position:02d}": label
    for position, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}


def parse_emotion_from_filename(path: Path) -> str:
    '''Translate the emotion code embedded in a filename into its label.

    The stem is split on '-' and the third segment is looked up in EMOTION_MAP.

    Args:
        path: Audio file path; only its name is inspected.

    Returns:
        The emotion label registered for the code.

    Raises:
        ValueError: If the stem has fewer than three '-'-separated segments,
            or if the third segment is not a key of EMOTION_MAP.
    '''
    segments = path.stem.split('-')
    if len(segments) >= 3:
        emotion_code = segments[2]
        if emotion_code in EMOTION_MAP:
            return EMOTION_MAP[emotion_code]
        raise ValueError(f"Unknown emotion code '{emotion_code}' in {path.name}")
    raise ValueError(f"Unexpected filename pattern: {path.name}")


def extract_mfcc_features(path: Path) -> dict:
    '''Summarise one audio file as per-coefficient mean/std of its MFCCs and delta MFCCs.

    Args:
        path: Audio file to load (mono, at SAMPLE_RATE).

    Returns:
        A dict of floats keyed "mfcc{k}_mean" / "mfcc{k}_std" followed by
        "delta_mfcc{k}_mean" / "delta_mfcc{k}_std", with k starting at 1.
    '''
    signal, rate = librosa.load(path, sr=SAMPLE_RATE, mono=True)
    mfcc = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=N_MFCC)
    delta = librosa.feature.delta(mfcc)

    stats: dict[str, float] = {}
    # All MFCC columns come first, then all delta columns, so the table keeps a stable layout.
    for prefix, matrix in (("mfcc", mfcc), ("delta_mfcc", delta)):
        # Each row of the matrix is one coefficient tracked across frames.
        for number, track in enumerate(matrix, start=1):
            stats[f"{prefix}{number}_mean"] = float(np.mean(track))
            stats[f"{prefix}{number}_std"] = float(np.std(track))
    return stats


def build_feature_row(path: Path) -> dict:
    '''Assemble the table row for one audio file: path, emotion label, then MFCC statistics.

    The label is parsed before any audio is loaded, so a file whose name cannot
    be parsed raises without paying the feature-extraction cost.
    '''
    emotion = parse_emotion_from_filename(path)
    return {
        'file_path': str(path),
        'emotion': emotion,
        **extract_mfcc_features(path),
    }


## Discover audio files

Find every `.wav` file under `AUDIO_ROOT` so the extraction loop knows which clips to process.

In [3]:
# Recursive search; sorting gives the extraction loop a stable file order.
file_paths = list(AUDIO_ROOT.rglob('*.wav'))
file_paths.sort()
print(f"Found {len(file_paths)} audio files under {AUDIO_ROOT}")


Found 1440 audio files under /home/nico/ds_workspace/projects/RAVDESS/data/ravdess_data


## Extract MFCC features

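This cell loops over all files, builds the MFCC feature rows, and saves the resulting table as a CSV. If the CSV already exists it is loaded instead of being rebuilt; set `OVERWRITE = True` to force re-extraction. Files that fail to process are skipped, and their paths are written to `ravdess_mfcc_failed_files.txt` next to the CSV.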

In [4]:
# Set to True to ignore an existing CSV and recompute every feature row.
OVERWRITE = False

if OUTPUT_CSV.exists() and not OVERWRITE:
    # Cache hit: reuse the saved table instead of re-reading every audio clip.
    print(f"Existing feature table found at {OUTPUT_CSV}. Set OVERWRITE=True to rebuild.")
    df = pd.read_csv(OUTPUT_CSV)
else:
    rows: list[dict] = []
    failed_files: list[Path] = []

    for path in tqdm(file_paths, desc='Extracting MFCC features'):
        try:
            rows.append(build_feature_row(path))
        except Exception as exc:
            # Deliberate best effort: one bad clip must not abort the whole run.
            # The file is recorded and reported below instead.
            failed_files.append(path)
            print(f"Failed on {path}: {exc}")

    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)

    # Write the failure log first so it exists even when the run is aborted below.
    if failed_files:
        failed_path = OUTPUT_CSV.with_name('ravdess_mfcc_failed_files.txt')
        failed_path.write_text('\n'.join(str(p) for p in failed_files))
        print(f"Logged {len(failed_files)} failures to {failed_path}")

    # Never persist an empty table: the cache branch above would pick it up on
    # every later run and pd.read_csv would fail on the column-less file.
    # Raising here also leaves any previously saved CSV untouched.
    if not rows:
        raise RuntimeError(
            f"No feature rows were extracted ({len(file_paths)} files discovered, "
            f"{len(failed_files)} failed); nothing was written to {OUTPUT_CSV}."
        )

    df = pd.DataFrame(rows)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved {len(df)} rows to {OUTPUT_CSV}")


Existing feature table found at /home/nico/ds_workspace/projects/RAVDESS/data/processed/ravdess_mfcc_features.csv. Set OVERWRITE=True to rebuild.


## Inspect the feature table

Quick sanity checks to ensure the dataset dimensions, label distribution, and column counts look correct.

In [9]:
# If the extraction cell has not run in this kernel, `df` is undefined:
# fall back to the table saved on disk.
try:
    df
except NameError:
    df = pd.read_csv(OUTPUT_CSV)

print(f"Shape: {df.shape}")
df.head(10)


Shape: (1440, 82)


Unnamed: 0,file_path,emotion,mfcc1_mean,mfcc1_std,mfcc2_mean,mfcc2_std,mfcc3_mean,mfcc3_std,mfcc4_mean,mfcc4_std,...,delta_mfcc16_mean,delta_mfcc16_std,delta_mfcc17_mean,delta_mfcc17_std,delta_mfcc18_mean,delta_mfcc18_std,delta_mfcc19_mean,delta_mfcc19_std,delta_mfcc20_mean,delta_mfcc20_std
0,/home/nico/ds_workspace/projects/RAVDESS/data/...,neutral,-697.792603,183.030441,54.890038,72.16848,0.663467,19.195799,12.435785,20.930756,...,-0.002792203,0.586559,-0.002689828,1.398391,-0.002582618,0.924114,-0.00247099,1.030372,-0.002355282,0.532815
1,/home/nico/ds_workspace/projects/RAVDESS/data/...,neutral,-692.855774,185.050293,55.363899,66.308495,-1.548319,19.290407,16.038307,19.345299,...,0.002107918,0.810777,0.001906806,1.409166,0.001724786,0.780301,0.001558079,1.106086,0.001401772,0.73461
2,/home/nico/ds_workspace/projects/RAVDESS/data/...,neutral,-691.587891,190.336121,58.024662,72.18483,0.159465,21.651524,13.624649,19.525526,...,0.002610261,0.961857,0.003389649,1.230899,0.003744929,0.934672,0.003710179,1.016712,0.003396559,0.742823
3,/home/nico/ds_workspace/projects/RAVDESS/data/...,neutral,-685.105469,184.565063,55.879421,66.488159,2.783262,20.100769,13.252023,20.778818,...,-0.00895638,0.902226,0.01126685,1.463178,-0.005738164,0.781945,-0.01721028,1.092176,0.006391661,1.017392
4,/home/nico/ds_workspace/projects/RAVDESS/data/...,calm,-727.10437,182.821884,62.355034,68.404228,3.121181,22.141096,15.064669,20.880312,...,-0.01178795,0.991112,-0.009131163,1.817581,-0.00751572,0.953957,-0.006219722,0.920864,-0.004666029,1.083396
5,/home/nico/ds_workspace/projects/RAVDESS/data/...,calm,-707.358215,169.380035,66.736458,73.044151,2.25349,22.743551,11.169915,19.891697,...,0.001006612,1.213249,0.0005815579,2.037919,0.0001586599,0.932036,-0.0002575347,0.921631,-0.0006621923,1.046869
6,/home/nico/ds_workspace/projects/RAVDESS/data/...,calm,-697.166138,195.661469,65.1082,77.897346,0.930369,25.643011,13.633629,20.614908,...,-0.00438172,0.901376,-0.004200001,1.541156,-0.004012907,0.90986,-0.003821335,0.895472,-0.003626274,1.177229
7,/home/nico/ds_workspace/projects/RAVDESS/data/...,calm,-698.637695,196.158691,68.698586,75.416084,1.100368,22.913017,13.685941,19.570847,...,-0.007004171,0.947991,-0.007487094,1.751932,-0.008335752,0.964059,-0.009026422,0.984156,-0.009231192,1.126482
8,/home/nico/ds_workspace/projects/RAVDESS/data/...,calm,-734.12561,190.54306,70.532913,71.599319,4.22518,20.327414,13.866501,21.240456,...,0.01669757,0.987914,0.01544976,1.578539,0.01330673,0.937897,0.01081991,0.968158,0.008215373,1.055801
9,/home/nico/ds_workspace/projects/RAVDESS/data/...,calm,-697.822327,173.384033,67.339592,72.69619,-0.449553,21.242195,11.884349,18.541895,...,1.102514e-08,1.020174,-1.102514e-08,1.500251,-6.89071e-09,0.99762,5.512568e-09,1.156806,8.268852e-09,1.008103


In [6]:
print(df['emotion'].value_counts())

# Two metadata columns, plus a mean and a std for each MFCC and each delta-MFCC coefficient.
expected_cols = 2 + 4 * N_MFCC
actual_cols = len(df.columns)
print(
    f"Expected columns (file_path/emotion + MFCC/delta stats): {expected_cols}",
    f"Actual columns: {actual_cols}",
    sep='\n',
)

assert expected_cols == actual_cols, 'Column count mismatch—verify MFCC settings.'
assert len(EMOTION_MAP) == df['emotion'].nunique(), 'Not all emotions present.'
assert len(df) in range(1000, 1501), 'Unexpected number of rows—double-check audio discovery.'

emotion
calm         192
happy        192
sad          192
angry        192
disgust      192
fearful      192
surprised    192
neutral       96
Name: count, dtype: int64
Expected columns (file_path/emotion + MFCC/delta stats): 82
Actual columns: 82


## Lightweight EDA

Minimal exploratory checks for missing values and basic MFCC statistics per emotion.

In [7]:
# Null count per column, largest first.
missing = df.isna().sum().sort_values(ascending=False)
print('Top missing columns:', missing.head(), sep='\n')

# Summarise only the first five columns whose names start with 'mfcc';
# the delta columns start with 'delta_' and are therefore not selected.
mfcc_cols = list(filter(lambda name: name.startswith('mfcc'), df.columns))[:5]
if mfcc_cols:
    grouped_means = df.groupby('emotion')[mfcc_cols].mean()
    print('\nMean of first few MFCC features by emotion:')
    display(grouped_means)


Top missing columns:
file_path     0
emotion       0
mfcc1_mean    0
mfcc1_std     0
mfcc2_mean    0
dtype: int64

Mean of first few MFCC features by emotion:


Unnamed: 0_level_0,mfcc1_mean,mfcc1_std,mfcc2_mean,mfcc2_std,mfcc3_mean
emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
angry,-478.45291,179.299329,38.497447,53.342108,-12.378559
calm,-685.864579,168.966638,59.167877,57.588739,4.718826
disgust,-594.867439,174.150066,51.204054,61.02853,-2.668143
fearful,-533.206672,172.558512,38.693487,47.009614,-11.208904
happy,-554.136678,176.416609,46.012288,53.416784,-12.198635
neutral,-679.331834,177.868021,58.064829,61.390083,2.769017
sad,-639.44208,167.171585,54.128719,57.612042,-1.898446
surprised,-585.307789,187.440449,46.076378,57.007892,-4.487377


## Notes for future mel spectrogram work

We will reuse the same audio loading configuration (`SAMPLE_RATE`, `mono=True`) when building mel spectrogram datasets for CNN models. The helper below is a placeholder for that future workflow.

In [8]:
def compute_mel_spectrogram(y: np.ndarray, sr: int, n_mels: int = 64) -> np.ndarray:
    '''Return the mel spectrogram of a signal converted to decibels.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y``.
        n_mels: Number of mel bands to compute.

    Returns:
        The mel spectrogram passed through ``librosa.power_to_db`` with the
        spectrogram's own maximum as the reference value.
    '''
    power_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(power_spectrogram, ref=np.max)

# Example usage (do not run over all files yet):
# y, sr = librosa.load(file_paths[0], sr=SAMPLE_RATE, mono=True)
# mel_spec = compute_mel_spectrogram(y, sr)
