In [1]:
import os
import glob
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import sounddevice as sd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import load_model

##### Map each genre code (string) to its 0-based integer label

In [2]:
# Display names of the genres; a genre's position in this list is its
# integer class label.
genre_labels = [
    "Blues",
    "Classical",
    "Country",
    "Disco",
    "Hip-Hop",
    "Jazz",
    "Metal",
    "Pop",
    "Reggae",
    "Rock",
]

# Two-digit, 1-based genre code ("01" .. "10") -> 0-based integer label,
# i.e. code "01" is Blues (0) and code "10" is Rock (9).
genre_map = {f"{label + 1:02d}": label for label in range(len(genre_labels))}

In [3]:
def extract_statistical_features(file_path, n_mfcc=13):
    """Load an audio file and summarise its MFCCs as one fixed-length vector.

    The MFCC matrix and its first- and second-order deltas are each reduced
    along axis 1 to a per-coefficient mean and standard deviation; the six
    resulting vectors are concatenated in the order
    ``[mfcc mean, mfcc std, delta mean, delta std, delta2 mean, delta2 std]``.

    Args:
        file_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        A tuple ``(features, y, sr, mfcc)``:
        ``features`` is the concatenated 1-D statistics vector
        (``6 * n_mfcc`` values), ``y`` and ``sr`` are the signal and sampling
        rate returned by ``librosa.load``, and ``mfcc`` is the raw MFCC
        matrix. If anything fails while loading or processing the file, the
        error is printed and ``(None, None, None, None)`` is returned.
    """
    try:
        # sr=None asks librosa to keep the file's own sampling rate.
        y, sr = librosa.load(file_path, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        # Mean and standard deviation of each coefficient, for every matrix.
        statistics = []
        for matrix in (mfcc, mfcc_delta, mfcc_delta2):
            statistics.append(np.mean(matrix, axis=1))
            statistics.append(np.std(matrix, axis=1))
        features = np.concatenate(statistics)
        return features, y, sr, mfcc
    except Exception as e:
        # Best-effort: one unreadable file must not abort a whole batch.
        # Some decoder errors carry an empty message, so the exception type
        # is printed too; otherwise the log line ends right after the colon.
        print(f"Error processing {file_path}: {type(e).__name__}: {e}")
        return None, None, None, None

In [12]:
# Root folder of the dataset; it is searched recursively for .wav files.
path = "Genres"

# glob returns matches in arbitrary, platform-dependent order. Sorting makes
# the sample order (and anything derived from it later) reproducible.
song_files = sorted(glob.glob(os.path.join(path, '**', '*.wav'), recursive=True))
print("Number of audio files found:", len(song_files))

Number of audio files found: 1000


In [13]:
# Parallel per-file accumulators: entry i of every list belongs to the same
# audio file. Six independent empty lists are created, one per name.
features_list, labels_list, waveforms, sample_rates, mfccs, file_names = (
    [] for _ in range(6)
)

In [14]:
def _normalise_genre_name(name):
    """Lower-case a genre name and drop non-alphanumerics ("Hip-Hop" -> "hiphop")."""
    return "".join(ch for ch in name.lower() if ch.isalnum())


# Normalised genre name -> integer label. Built from genre_labels, whose list
# positions are the class indices.
_genre_name_to_label = {
    _normalise_genre_name(name): index for index, name in enumerate(genre_labels)
}


def _label_from_path(file_path):
    """Return the integer genre label for an audio file path, or -1 if unknown.

    The genre is looked up from the parent folder name first, then from the
    part of the file name before the first dot (e.g. ``jazz/jazz.00054.wav``).
    If neither matches, the dash-separated naming scheme is tried, where the
    third field of the file name is a code found in ``genre_map``.
    """
    base_name = os.path.basename(file_path)
    folder_name = os.path.basename(os.path.dirname(file_path))

    for candidate in (folder_name, base_name.split('.')[0]):
        key = _normalise_genre_name(candidate)
        if key in _genre_name_to_label:
            return _genre_name_to_label[key]

    # Fallback: "<a>-<b>-<genre code>-..." style file names.
    parts = base_name.split('-')
    if len(parts) >= 3:
        return genre_map.get(parts[2], -1)
    return -1


unlabelled_files = []

for file_path in song_files:
    # Extracting features from the audio file
    features, y, sr, mfcc = extract_statistical_features(file_path, n_mfcc=13)
    if features is None:
        # Unreadable file: the extractor already reported it, so skip it and
        # keep all the per-file lists aligned.
        continue

    features_list.append(features)
    waveforms.append(y)
    sample_rates.append(sr)
    mfccs.append(mfcc)
    file_names.append(file_path)

    label = _label_from_path(file_path)
    if label == -1:
        unlabelled_files.append(file_path)
    labels_list.append(label)

# A -1 label would otherwise pass silently into training as an extra class.
if unlabelled_files:
    print(
        f"Warning: no genre label found for {len(unlabelled_files)} file(s), "
        f"e.g. {unlabelled_files[0]}"
    )

  y, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Error processing Genres\jazz\jazz.00054.wav: 


In [15]:
# Convert the accumulated Python lists to NumPy arrays: one row of statistics
# per successfully processed file, and one integer label per file.
features_array, labels_array = np.asarray(features_list), np.asarray(labels_list)

# Sanity check: both arrays should have the same first dimension.
print("Features shape:", features_array.shape)
print("Labels shape:", labels_array.shape)

Features shape: (999, 78)
Labels shape: (999,)
