In [34]:
import os
import pickle
import librosa
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# PATH = "../../datasets/gtzan/"
GTZAN_PATH = "D:/Personal Projects/wave-craft/backend/datasets/gtzan"


Load audio files of speech and music using librosa package. Extract STFT (Short-Time Fourier Transform) values for each audio file, and save it in a dataframe along with its label (speech/music).

In [2]:
def gtzan_loader():
    data = []
    for dir in ["music_wav", "speech_wav"]:
        dir_path = GTZAN_PATH + "/" + dir
        for filename in os.listdir(dir_path):
            file_path = os.path.join(dir_path, filename)
            if os.path.isfile(file_path):  # Check if it's a file
                try:
                    # Load the audio file using librosa
                    y, sr = librosa.load(file_path, mono=False, sr=None)
                    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
                    mfcc_mean = mfcc.mean(axis=1)

                    # Append a row with all MFCC values and label
                    label = 0 if dir == "music_wav" else 1
                    data.append([*mfcc_mean, label])
                except Exception as e:
                    print(f"Error reading {filename}: {e}")

    return data

In [3]:
data = gtzan_loader()
    
# Create DataFrame with MFCC columns and a label column
num_mfcc_features = len(data[0]) - 1  # Subtract 1 for the label column
column_names = [f'mfcc_{i+1}' for i in range(num_mfcc_features)] + ['label']
df_gtzan = pd.DataFrame(data, columns=column_names)

In [4]:
df_gtzan.head()

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,...,mfcc_32,mfcc_33,mfcc_34,mfcc_35,mfcc_36,mfcc_37,mfcc_38,mfcc_39,mfcc_40,label
0,-157.48822,47.408287,-35.37471,16.73896,-14.782825,-2.014582,-10.715254,-2.605302,-1.815764,7.286231,...,7.345362,-5.29774,-0.856299,-1.914519,-3.541516,-3.636032,-0.494503,2.444749,-6.595859,0
1,-372.427887,183.306808,-13.082294,3.932508,16.875496,-5.816745,-4.877747,-6.733414,-11.405272,-1.3534,...,2.679184,-0.239537,-0.117979,-1.733592,-2.223251,-0.322705,2.184809,7.429195,3.633396,0
2,-174.057175,136.046494,-60.375648,4.883183,-27.085163,-8.613901,-19.284859,-12.682382,-16.254562,-7.103938,...,4.688622,6.033112,6.344836,2.081139,4.519475,2.929529,2.606121,0.962403,-1.431882,0
3,-168.446762,92.986084,-2.337321,27.885298,-3.521363,12.739482,3.071577,5.617084,-0.950944,3.745061,...,0.14112,-1.729236,-2.075783,-2.79675,-0.043275,-0.266358,0.986992,0.085185,0.591165,0
4,-269.695496,110.00724,17.367323,59.710495,12.691465,25.967535,9.185562,18.754194,2.988607,10.531519,...,-1.885853,-3.400853,-3.541979,-2.89719,-2.295101,-2.276045,-2.179547,0.348086,3.12271,0


In [13]:
df_gtzan.shape

(128, 41)

### Training

Split into train and test sets.

In [14]:
df_gtzan = df_gtzan.sample(frac=1)      # Shuffle rows

X, y = df_gtzan.loc[:, df_gtzan.columns != 'label'], df_gtzan['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [15]:
y_test.value_counts()

label
1    8
0    5
Name: count, dtype: int64

##### Model 1: SVM

Train

In [16]:
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)

Test

In [17]:
y_pred = clf.predict(X_test)

In [18]:
accuracy_score(y_test, y_pred)

1.0

In [19]:
precision_recall_fscore_support(y_test, y_pred)

(array([1., 1.]), array([1., 1.]), array([1., 1.]), array([5, 8], dtype=int64))

Save the model for inference.

In [35]:
with open('sm_model.pkl','wb') as f:
    pickle.dump(clf,f)