In [None]:
import os
import sys
from glob import glob
import asyncio
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import librosa
import librosa.display

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from keras import layers


### Step 1. Set the path to the audio files and check that they can be accessed correctly

In [None]:
# Root folder of the dataset: one sub-folder per genre, each holding .wav clips.
path = "genres"

# Sorted so the processing order (and therefore the row order of the extracted
# features) is the same on every run; glob() itself returns entries in
# arbitrary, filesystem-dependent order.
genres = sorted(glob(os.path.join(path, "*")))
audio_files = sorted(glob(os.path.join(path, "**", "*.wav"), recursive=True))

#Checking that audiofiles and genres paths have been identified correctly
# Entries whose path contains ".mf" are skipped in both listings.
genres_list = [os.path.basename(genre_path) for genre_path in genres if ".mf" not in genre_path]
# The filter must test each individual path; testing the whole `audio_files`
# list (as before) is a membership check that never excludes anything.
audio_files_list = [os.path.basename(file) for file in audio_files if ".mf" not in file]
print(genres_list)
print(audio_files_list)


### Step 2. Create a function to extract features from audio files

In [None]:
async def extract_features(y, sr, file_path=None):
    """Compute summary statistics of several librosa features for one waveform.

    Parameters
    ----------
    y : np.ndarray
        Audio samples (e.g. as returned by ``librosa.load``), either a full
        clip or a slice of one.
    sr : int
        Sampling rate of ``y``.
    file_path : str, optional
        Path of the audio file ``y`` came from. It is used only to derive the
        ``"name"`` and ``"genre"`` fields. When omitted, the notebook-level
        loop variable ``file`` is used, which is what earlier versions of this
        function read implicitly.

    Returns
    -------
    dict
        Flat mapping of column name to value: ``name``, ``genre``, ``length``,
        ``tempo`` and the mean/variance of each extracted feature.

    Notes
    -----
    The function is declared ``async`` only so existing ``await`` call sites
    keep working; it does not await anything itself.
    """
    if file_path is None:
        # Backward compatibility: fall back to the caller's global loop variable.
        file_path = file
    base_name = os.path.basename(file_path)

    n_mfcc = 13

    # Extracting features from audio
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    rms = librosa.feature.rms(y=y)
    mel = librosa.feature.melspectrogram(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
    harmony, perceptr = librosa.effects.hpss(y)
    # librosa.beat.tempo returns an array; store a plain float so the value
    # survives the CSV round trip as a number instead of the string "[123.4]".
    # NOTE(review): newer librosa releases deprecate librosa.beat.tempo in
    # favour of librosa.feature.rhythm.tempo - confirm against the pinned version.
    tempo = float(np.atleast_1d(librosa.beat.tempo(y=y, sr=sr))[0])
    length = librosa.get_duration(y=y, sr=sr)

    # Aggregate features
    features = {
        # File name without the ".wav" suffix, remaining dots turned into underscores.
        "name": base_name.split(".wav")[0].replace(".", "_"),
        # Genre is taken from the part of the file name before the first dot.
        "genre": base_name.split(".")[0],
        "length": length,
        "tempo": tempo,
        **{f"mfcc{i+1}_mean": np.mean(mfcc[i]) for i in range(n_mfcc)},
        **{f"mfcc{i+1}_var": np.var(mfcc[i]) for i in range(n_mfcc)},
        "chroma_stft_mean": np.mean(chroma_stft),
        "chroma_stft_var": np.var(chroma_stft),
        "spectral_centroid_mean": np.mean(spectral_centroid),
        "spectral_centroid_var": np.var(spectral_centroid),
        "spectral_bandwidth_mean": np.mean(spectral_bandwidth),
        "spectral_bandwidth_var": np.var(spectral_bandwidth),
        "rolloff_mean": np.mean(rolloff),
        "rolloff_var": np.var(rolloff),
        "zero_crossing_rate_mean": np.mean(zero_crossing_rate),
        "zero_crossing_rate_var": np.var(zero_crossing_rate),
        "rms_mean": np.mean(rms),
        "rms_var": np.var(rms),
        "mel_mean": np.mean(mel),
        "mel_var": np.var(mel),
        "contrast_mean": np.mean(contrast),
        "contrast_var": np.var(contrast),
        "tonnetz_mean": np.mean(tonnetz),
        "tonnetz_var": np.var(tonnetz),
        "harmony_mean": np.mean(harmony),
        "harmony_var": np.var(harmony),
        "perceptr_mean": np.mean(perceptr),
        "perceptr_var": np.var(perceptr)
    }
    return features

### Step 3. Iterate over files and folders and extract the features

In [None]:
#TO REVIEWER: Features are already extracted in csv files. You can skip this step, as it's going to take a while to regenerate the dataset.

num_segments = 10 #Segments - how many parts are we splitting the audio into

feature_list_3 = [] #features spilt in 10 segments 
feature_list_30 = [] #features of full audiofile
 
for file in audio_files:
    print("processing", file)
    y, sr = librosa.load(file, sr=None) 
    samples_per_segment = len(y) // num_segments  #Calculate samples per segment
    feature_list_30.append(await extract_features(y, sr))
    for segment in range(num_segments):
        start_sample = segment * samples_per_segment  #Start sample index of each segment
        end_sample = (segment + 1) * samples_per_segment  #End sample index of each segment
        segment_features = await extract_features(y[start_sample:end_sample], sr)
        feature_list_3.append(segment_features)


In [None]:
# Only the last bare expression of a cell is rendered, so listing the two
# variables on separate lines displayed just the second one. Show how many
# records each list holds plus one sample record from each, instead of
# dumping every dict.
print(f"{len(feature_list_3)} segment records, {len(feature_list_30)} full-clip records")
feature_list_3[:1], feature_list_30[:1]

### Step 4. Save the extracted features to a respective CSV file

In [None]:
# Turn each list of feature records into a table ...
df_features_3, df_features_30 = pd.DataFrame(feature_list_3), pd.DataFrame(feature_list_30)

# ... and write each table to its own CSV, without the row index.
for features_table, csv_name in (
    (df_features_3, "extracted_features_3sec.csv"),
    (df_features_30, "extracted_features_30sec.csv"),
):
    features_table.to_csv(csv_name, index=False)

### Step 5. Create, run and evaluate the model

In [None]:
# Load both feature tables back from the CSV files written in Step 4.
df_3sec, df_30sec = (
    pd.read_csv(csv_name)
    for csv_name in ("extracted_features_3sec.csv", "extracted_features_30sec.csv")
)

In [None]:
# Replace the "genre" column of both tables with integer class ids.
# The encoder is fitted on the 3-second table and the same fitted mapping is
# then applied to the 30-second table.
label_encoder = LabelEncoder()

genre_ids_3sec = label_encoder.fit_transform(df_3sec["genre"])
df_3sec["genre"] = genre_ids_3sec
print(df_3sec["genre"])

genre_ids_30sec = label_encoder.transform(df_30sec["genre"])
df_30sec["genre"] = genre_ids_30sec
print(df_30sec["genre"])

In [None]:

SEED = 42
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(SEED)

# Stack the per-segment and per-clip feature tables into one dataset.
df_combined = pd.concat([df_3sec, df_30sec], ignore_index=True)

# NOTE(review): the segment rows and the full-clip row of one recording share
# the same "name", and the random split below can place them on opposite
# sides of the train/test boundary. That likely inflates the reported test
# accuracy - consider a group-aware split keyed on "name".

#Split features and labels
X = df_combined.drop(columns=['name', 'genre', 'length'])
y = df_combined['genre']

# CSVs written while "tempo" was still stored as a one-element array hold it
# as text such as "[123.04]"; convert it back to a number so scaling works.
if "tempo" in X.columns and not pd.api.types.is_numeric_dtype(X["tempo"]):
    X = X.assign(tempo=X["tempo"].astype(str).str.strip("[] ").astype(float))

# Stratify so every genre keeps the same share in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y
)

#Standardize the features
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

num_classes = len(label_encoder.classes_)

# Fully connected classifier: four ReLU layers of decreasing width, each
# followed by 20% dropout, and a softmax output with one unit per genre.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation="relu", input_shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation="softmax")
])

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_scaled, y_train, epochs=120, batch_size=128, validation_split=0.3)

test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test Accuracy: {test_accuracy}')

#Generate report
y_pred_prob = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_prob, axis=1)
# Passing `labels` keeps the rows aligned with `target_names` even if some
# class is missing from both y_test and y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(num_classes),
    target_names=label_encoder.classes_,
))
