# In [2]:
import numpy as np
import pandas as pd
import librosa
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from IPython.display import FileLink

# Directory the script lists and loads .wav files from.
# NOTE(review): the path suggests the CREMA-D dataset mounted as a Kaggle input — confirm.
ds_path = "/kaggle/input/cremad/AudioWAV/"

def feature_extraction(filename, sampling_rate=16000):
    """Build one feature row for a single audio file.

    The file is read from ``ds_path`` with ``librosa.load`` at
    ``sampling_rate``. The returned list holds, in order: the mean spectral
    centroid, mean spectral bandwidth and mean spectral roll-off, one mean per
    MFCC coefficient (no ``n_mfcc`` is passed, so librosa's default count
    applies), and finally the emotion label derived from the file name.

    Args:
        filename: Name of the audio file, relative to ``ds_path``.
        sampling_rate: Sampling rate handed to ``librosa.load`` and to every
            feature extractor.

    Returns:
        list: Numeric features followed by the emotion label string, which is
        ``'unknown'`` when none of the known codes occurs in the file name.
    """
    signal, _ = librosa.load(os.path.join(ds_path, filename), sr=sampling_rate)

    # Frame-level spectral descriptors, each collapsed to a single mean value.
    spectral_extractors = (
        librosa.feature.spectral_centroid,
        librosa.feature.spectral_bandwidth,
        librosa.feature.spectral_rolloff,
    )
    row = [
        np.mean(extract(y=signal, sr=sampling_rate))
        for extract in spectral_extractors
    ]

    # Iterating the MFCC array walks its first axis, giving one mean per slice.
    coefficients = librosa.feature.mfcc(y=signal, sr=sampling_rate)
    row.extend(np.mean(track) for track in coefficients)

    # Case-insensitive substring match; the first code found (in this order)
    # decides the label.
    emotion_codes = (
        ('ang', 'angry'),
        ('dis', 'disgust'),
        ('fea', 'fear'),
        ('hap', 'happy'),
        ('neu', 'neutral'),
        ('sad', 'sad'),
    )
    lowered = filename.lower()
    label = next(
        (name for code, name in emotion_codes if code in lowered),
        'unknown',
    )
    row.append(label)

    return row

# Extract one feature row per .wav file found in the dataset directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the row order (and therefore the exported CSV) reproducible across runs.
all_features = [
    feature_extraction(filename)
    for filename in sorted(os.listdir(ds_path))
    if filename.endswith(".wav")
]

# Fail early with a clear message rather than an opaque error further down.
if not all_features:
    raise FileNotFoundError(f"No .wav files found in {ds_path!r}")

# Column names in the order feature_extraction emits its values: three
# spectral means, twenty MFCC means, then the emotion label.
desired_feature_names = [
    "spectral_centroid", "spectral_bandwidth", "spectral_rolloff",
    *[f"mfcc{i}" for i in range(1, 21)],
    "emotion",
]

# Build the frame directly with its final column names; pandas raises a
# ValueError here if the rows do not have exactly len(desired_feature_names)
# entries.
df_features = pd.DataFrame(all_features, columns=desired_feature_names)
def scale_features(data):
    """Standardize every column of ``data`` except the last one.

    Args:
        data: DataFrame whose last column is excluded from scaling; the
            remaining columns must be convertible to float.

    Returns:
        tuple: ``(scaled_values, scaler)`` where ``scaled_values`` is the
        output of ``StandardScaler.fit_transform`` and ``scaler`` is the
        fitted ``StandardScaler`` instance.
    """
    # Drop the trailing column and force a float array before fitting.
    feature_columns = data.iloc[:, :-1]
    feature_matrix = np.array(feature_columns, dtype=float)

    standardizer = StandardScaler()
    return standardizer.fit_transform(feature_matrix), standardizer

# Scale all columns but the last (label) one; keep the fitted scaler for reuse.
x, scaler = scale_features(df_features)
def get_labels(data):
    """Integer-encode the last column of ``data``.

    Args:
        data: DataFrame whose last column holds the class labels.

    Returns:
        tuple: ``(encoded, encoder)`` where ``encoded`` is the output of
        ``LabelEncoder.fit_transform`` on the last column and ``encoder`` is
        the fitted ``LabelEncoder`` instance.
    """
    label_encoder = LabelEncoder()
    encoded = label_encoder.fit_transform(data.iloc[:, -1])
    return encoded, label_encoder

# Encode the label column as integers; keep the fitted encoder alongside.
y, encoder = get_labels(df_features)

# Keep the n_features best-scoring columns according to f_classif.
n_features = 22
f_selector = SelectKBest(score_func=f_classif, k=n_features)
X_new = f_selector.fit_transform(x, y)

# Map the retained column positions back to names (label column excluded).
selected_feature_indices = f_selector.get_support(indices=True)
selected_feature_names = df_features.columns[:-1][selected_feature_indices]

# Reassemble the selected features with the original string labels.
df_preprocessed = pd.DataFrame(X_new, columns=selected_feature_names).assign(
    emotion=df_features['emotion']
)

# Export without the index column.
csv_file_name = 'preprocessed_features.csv'
df_preprocessed.to_csv(csv_file_name, index=False)
# Kept as the final expression so a notebook displays the resulting FileLink.
FileLink(csv_file_name)