In [3]:
import os
import librosa
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
import xgboost as xgb

In [4]:
# Read csv file 
# Data was downloaded from https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification
#There are audio files and csv file. For this part csv with 30 sec music features was used.
path = "Resources\\GTZAN\\features_3_sec.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [5]:
df.columns

Index(['filename', 'length', 'chroma_stft_mean', 'chroma_stft_var', 'rms_mean',
       'rms_var', 'spectral_centroid_mean', 'spectral_centroid_var',
       'spectral_bandwidth_mean', 'spectral_bandwidth_var', 'rolloff_mean',
       'rolloff_var', 'zero_crossing_rate_mean', 'zero_crossing_rate_var',
       'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 'tempo',
       'mfcc1_mean', 'mfcc1_var', 'mfcc2_mean', 'mfcc2_var', 'mfcc3_mean',
       'mfcc3_var', 'mfcc4_mean', 'mfcc4_var', 'mfcc5_mean', 'mfcc5_var',
       'mfcc6_mean', 'mfcc6_var', 'mfcc7_mean', 'mfcc7_var', 'mfcc8_mean',
       'mfcc8_var', 'mfcc9_mean', 'mfcc9_var', 'mfcc10_mean', 'mfcc10_var',
       'mfcc11_mean', 'mfcc11_var', 'mfcc12_mean', 'mfcc12_var', 'mfcc13_mean',
       'mfcc13_var', 'mfcc14_mean', 'mfcc14_var', 'mfcc15_mean', 'mfcc15_var',
       'mfcc16_mean', 'mfcc16_var', 'mfcc17_mean', 'mfcc17_var', 'mfcc18_mean',
       'mfcc18_var', 'mfcc19_mean', 'mfcc19_var', 'mfcc20_mean', 'mfcc20_var',
  

In [6]:
df.shape[1]

60

## Preprocessing for machine learning

In [7]:
# Define features set. Dropping filename and label
X = df.copy()
X.drop(["filename", "label"], axis=1, inplace=True)
#Converting X to numpy array as a part of preprocessing
X.to_numpy()

array([[ 6.61490000e+04,  3.35406363e-01,  9.10482928e-02, ...,
         3.36180725e+01, -2.43026793e-01,  4.37717667e+01],
       [ 6.61490000e+04,  3.43065351e-01,  8.61465260e-02, ...,
         9.70308304e+01,  5.78406334e+00,  5.99430809e+01],
       [ 6.61490000e+04,  3.46814752e-01,  9.22428891e-02, ...,
         5.30508347e+01,  2.51737475e+00,  3.31051216e+01],
       ...,
       [ 6.61490000e+04,  3.47480893e-01,  8.90194401e-02, ...,
         4.85823784e+01, -2.99545288e-01,  4.15869904e+01],
       [ 6.61490000e+04,  3.87527317e-01,  8.48154277e-02, ...,
         2.48436127e+01,  6.75824106e-01,  1.27877502e+01],
       [ 6.61490000e+04,  3.69292945e-01,  8.67586955e-02, ...,
         3.94859009e+01, -3.41253424e+00,  3.17274895e+01]])

In [8]:
# Define target vector "label"
y = df["label"].to_numpy()
y[:5]

array(['blues', 'blues', 'blues', 'blues', 'blues'], dtype=object)

In [9]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [11]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [12]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Implementing Random Forest

In [13]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

In [14]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [15]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [16]:
# Calculate confusion matrix
cm = confusion_matrix(y_test, predictions)

# Use unique class labels for the confusion matrix
class_labels = sorted(set(y_test)) 
cm_df = pd.DataFrame(cm, index=[f"{label}" for label in class_labels], 
                         columns=[f"Predicted {label}" for label in class_labels])

# Confusion matrix
print("Confusion Matrix for Random Forest:")
cm_df.head()

Confusion Matrix for Random Forest:


Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted disco,Predicted hiphop,Predicted jazz,Predicted metal,Predicted pop,Predicted reggae,Predicted rock
blues,226,1,10,5,1,3,6,0,4,1
classical,0,248,0,0,0,7,0,0,0,1
country,16,1,189,3,0,13,2,3,2,3
disco,1,2,3,214,7,2,2,7,7,10
hiphop,3,1,5,2,240,2,7,7,1,2


In [17]:
#Evaluate the model
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       blues       0.89      0.88      0.89       257
   classical       0.91      0.97      0.94       256
     country       0.80      0.81      0.81       232
       disco       0.87      0.84      0.85       255
      hiphop       0.94      0.89      0.91       270
        jazz       0.86      0.91      0.88       244
       metal       0.89      0.96      0.92       261
         pop       0.88      0.95      0.91       224
      reggae       0.88      0.90      0.89       254
        rock       0.89      0.73      0.80       245

    accuracy                           0.88      2498
   macro avg       0.88      0.88      0.88      2498
weighted avg       0.88      0.88      0.88      2498



## Trying XGBoost

In [18]:
#X_train, X_test, y_train, y_test are already preprocessed and are numpy
#Labels of music genres should be converted to integers for this algorithm

#Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [19]:
# Train the XGBoost model
XGB_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
XGB_model.fit(X_train, y_train_encoded)

# Make predictions
predictions = XGB_model.predict(X_test)

# Decode the predictions back to original labels
predict_decoded = label_encoder.inverse_transform(predictions)

# Evaluate the model
print(classification_report(y_test, predict_decoded))

              precision    recall  f1-score   support

       blues       0.91      0.89      0.90       257
   classical       0.95      0.98      0.97       256
     country       0.82      0.89      0.85       232
       disco       0.88      0.89      0.88       255
      hiphop       0.96      0.91      0.93       270
        jazz       0.93      0.92      0.93       244
       metal       0.95      0.97      0.96       261
         pop       0.91      0.96      0.93       224
      reggae       0.95      0.91      0.93       254
        rock       0.90      0.84      0.87       245

    accuracy                           0.92      2498
   macro avg       0.92      0.92      0.92      2498
weighted avg       0.92      0.92      0.92      2498



In [20]:
# Calculate confusion matrix
cm = confusion_matrix(y_test, predict_decoded)

# Use unique class labels for the confusion matrix
class_labels = sorted(set(y_test)) 
cm_df = pd.DataFrame(cm, index=[f"{label}" for label in class_labels], 
                         columns=[f"Predicted {label}" for label in class_labels])

# Confusion matrix
print("Confusion Matrix for XGBoost:")
cm_df.head()

Confusion Matrix for XGBoost:


Unnamed: 0,Predicted blues,Predicted classical,Predicted country,Predicted disco,Predicted hiphop,Predicted jazz,Predicted metal,Predicted pop,Predicted reggae,Predicted rock
blues,229,1,10,7,0,1,3,0,2,4
classical,0,251,0,0,0,4,0,0,0,1
country,9,0,207,3,0,4,0,4,3,2
disco,2,2,2,227,1,1,1,10,2,7
hiphop,3,1,6,4,245,2,2,4,1,2


## Testing of audio files with librosa library was made with Chat GPT assistance

In [30]:
#Test audio file
def extract_features(file_path):
    # Load the audio file
    y, sr = librosa.load(file_path, duration=30)

    # Extract features in the same order as in training
    chroma_stft_mean = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))
    chroma_stft_var = np.var(librosa.feature.chroma_stft(y=y, sr=sr))

    rms_mean = np.mean(librosa.feature.rms(y=y))
    rms_var = np.var(librosa.feature.rms(y=y))

    spectral_centroid_mean = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    spectral_centroid_var = np.var(librosa.feature.spectral_centroid(y=y, sr=sr))

    spectral_bandwidth_mean = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    spectral_bandwidth_var = np.var(librosa.feature.spectral_bandwidth(y=y, sr=sr))

    rolloff_mean = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    rolloff_var = np.var(librosa.feature.spectral_rolloff(y=y, sr=sr))

    zero_crossing_rate_mean = np.mean(librosa.feature.zero_crossing_rate(y=y))
    zero_crossing_rate_var = np.var(librosa.feature.zero_crossing_rate(y=y))

    harmony = librosa.effects.harmonic(y)
    harmony_mean = np.mean(harmony)
    harmony_var = np.var(harmony)

    perceptr = librosa.effects.percussive(y)
    perceptr_mean = np.mean(perceptr)
    perceptr_var = np.var(perceptr)

    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfccs_mean = np.mean(mfccs, axis=1)
    mfccs_var = np.var(mfccs, axis=1)

    # Combine all features into a single feature vector
    features = np.hstack((
        chroma_stft_mean, chroma_stft_var,
        rms_mean, rms_var,
        spectral_centroid_mean, spectral_centroid_var,
        spectral_bandwidth_mean, spectral_bandwidth_var,
        rolloff_mean, rolloff_var,
        zero_crossing_rate_mean, zero_crossing_rate_var,
        harmony_mean, harmony_var,
        perceptr_mean, perceptr_var,
        tempo,
        mfccs_mean, mfccs_var
    ))

    # Print features for debugging
    print("Extracted Features:", features)

    return features

# Test the function
#file_path = "Resources/GTZAN/Music_files/disco.00005.wav"
#file_path = "Resources/GTZAN/Music_files/classical.00005.wav"
file_path = "Resources/GTZAN/Music_files/pop.00005.wav"
test_features = extract_features(file_path)

Extracted Features: [ 3.23256552e-01  8.67510810e-02  1.39538363e-01  5.96979121e-03
  2.29008592e+03  5.50276860e+05  2.43591839e+03  1.98045222e+05
  4.74800462e+03  2.63287287e+06  1.06780533e-01  2.45404097e-03
  4.72876218e-06  1.42366132e-02  3.80023630e-05  4.37254738e-03
  1.72265625e+02 -1.13618103e+02  9.00145645e+01 -5.54134369e+00
  2.55954742e+01  2.60114980e+00  3.19225240e+00  1.13118112e+00
  6.14371872e+00  2.78937984e+00 -1.87616062e+00 -7.11999083e+00
 -3.44179249e+00 -5.91100454e+00 -4.35074949e+00 -4.94646168e+00
 -4.38010263e+00 -3.31226683e+00 -5.43290949e+00 -5.70474982e-01
 -2.18023849e+00  1.99126328e+04  4.55192291e+02  8.32140198e+02
  1.96540314e+02  1.01381714e+02  1.48253586e+02  9.54550552e+01
  1.28462570e+02  1.04811592e+02  1.25091080e+02  8.94460983e+01
  8.24000397e+01  6.14456253e+01  7.01034851e+01  8.71588211e+01
  4.19773331e+01  4.74416466e+01  4.72014313e+01  4.50276184e+01
  4.54078598e+01]


In [31]:
#Testing audio file on Random Forest

# Step 2: Check the number of features
if len(test_features) < rf_model.n_features_in_:
    # Add a placeholder for the missing feature
    missing_features = rf_model.n_features_in_ - len(test_features)
    test_features = np.append(test_features, [0] * missing_features)
elif len(test_features) > rf_model.n_features_in_:
    # Trim extra features
    test_features = test_features[:rf_model.n_features_in_]

# Step 3: Reshape for prediction
test_features = test_features.reshape(1, -1)

# Step 4: Make prediction
prediction = rf_model.predict(test_features)

print(f"Predicted genre: {prediction[0]}")

Predicted genre: pop
