In [1]:
import os
import librosa
import numpy as np
from tqdm import tqdm

# Root directory that the dataset folder names are joined onto when scanning.
DATA_DIR = "../data"
# Sampling rate handed to librosa.load as `sr` (Hz).
SAMPLE_RATE = 22050
# Maximum clip length handed to librosa.load as `duration` (seconds).
DURATION = 3
# Number of audio samples in one full-length clip (sample rate x duration).
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION
# Fixed MFCC frame count; same value as extract_mfcc's `max_pad_len` default.
# NOTE(review): presumably ceil(SAMPLES_PER_TRACK / 512) for librosa's default
# hop length of 512 -- confirm before changing SAMPLE_RATE or DURATION.
MAX_PAD_LEN = 130

def extract_mfcc(y, sr, max_pad_len=130):
    """Return a 40-coefficient MFCC matrix with exactly ``max_pad_len`` columns.

    Args:
        y: Audio signal forwarded to ``librosa.feature.mfcc``.
        sr: Sampling rate of ``y``.
        max_pad_len: Target size of axis 1 of the result.

    Returns:
        The MFCC array, cut down to ``max_pad_len`` columns when it is longer
        and right-padded with zeros along axis 1 when it is shorter.
    """
    features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

    # Slicing leaves an already-short matrix unchanged, so truncation can be
    # applied unconditionally before checking whether padding is needed.
    features = features[:, :max_pad_len]

    missing = max_pad_len - features.shape[1]
    if missing > 0:
        features = np.pad(
            features, pad_width=((0, 0), (0, missing)), mode='constant'
        )
    return features

def get_label_from_filename(filename):
    """Map a dash-separated audio filename to its emotion label.

    The third dash-separated field is read as an integer emotion code
    (1-8) and translated to a label string. The layout appears to follow
    the RAVDESS naming convention, e.g. ``03-01-05-01-02-01-12.wav``.

    Args:
        filename: Base name of the audio file (no directory part).

    Returns:
        The emotion label, or ``None`` when the code is unknown or the
        filename does not have a numeric third field. Returning ``None``
        instead of raising lets callers skip stray files with a plain
        falsy check.
    """
    emotion_map = {
        1: "neutral", 2: "calm", 3: "happy", 4: "sad",
        5: "angry", 6: "fearful", 7: "disgust", 8: "surprised"
    }
    parts = filename.split("-")
    try:
        emotion = int(parts[2])
    except (IndexError, ValueError):
        # Too few fields, or the third field is not a number.
        return None
    return emotion_map.get(emotion)

# Build the augmented MFCC dataset: for every labelled .wav file, store the
# features of the original clip plus noisy, time-stretched and pitch-shifted
# variants, then save everything to a single .npz archive.
data = []
labels = []

for folder in ["Audio_Speech_Actors_01_24", "Audio_Songs_Actors_01_24"]:
    full_path = os.path.join(DATA_DIR, folder)
    # Sorted so the order of saved samples does not depend on the filesystem.
    for actor_folder in sorted(os.listdir(full_path)):
        actor_path = os.path.join(full_path, actor_folder)
        if not os.path.isdir(actor_path):
            continue
        for file in tqdm(sorted(os.listdir(actor_path))):
            if not file.endswith(".wav"):
                continue
            file_path = os.path.join(actor_path, file)

            # A .wav whose name does not follow the expected pattern must not
            # abort the whole run; treat it like an unknown label and skip it.
            try:
                label = get_label_from_filename(file)
            except (IndexError, ValueError):
                label = None
            if not label:
                continue

            try:
                y, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=DURATION)

                # Original
                mfcc_orig = extract_mfcc(y, sr, max_pad_len=MAX_PAD_LEN)
                data.append(mfcc_orig)
                labels.append(label)

                # Add Noise
                y_noise = y + 0.005 * np.random.randn(len(y))
                mfcc_noise = extract_mfcc(y_noise, sr, max_pad_len=MAX_PAD_LEN)
                data.append(mfcc_noise)
                labels.append(label)

                # Time Stretch (best effort: a failure here skips only this
                # variant, but is reported instead of silently swallowed)
                try:
                    y_stretch = librosa.effects.time_stretch(y, rate=0.9)
                    mfcc_stretch = extract_mfcc(y_stretch, sr, max_pad_len=MAX_PAD_LEN)
                    data.append(mfcc_stretch)
                    labels.append(label)
                except Exception as e:
                    print(f"Time stretch failed for {file_path}: {e}")

                # Pitch Shift
                y_pitch = librosa.effects.pitch_shift(y, sr=sr, n_steps=2)
                mfcc_pitch = extract_mfcc(y_pitch, sr, max_pad_len=MAX_PAD_LEN)
                data.append(mfcc_pitch)
                labels.append(label)

            except Exception as e:
                print(f"Failed processing {file_path}: {e}")


output_path = "../models/cnn_features_augmented.npz"
# Create the target directory up front so a missing folder cannot discard the
# results of the whole extraction run at save time.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savez(output_path, data=np.array(data), labels=np.array(labels))
print("Saved augmented features to cnn_features_augmented.npz")


0it [00:00, ?it/s]
  "class": algorithms.Blowfish,
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:13<00:00,  4.59it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  8.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  8.86it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  8.91it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  8.81it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  8.58it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00,  9.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:06<00:00, 

Saved augmented features to cnn_features_augmented.npz
