In [19]:
import os
import librosa
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# PATH = "../../datasets/gtzan/"
GTZAN_PATH = "D:/Personal Projects/wave-craft/backend/datasets/gtzan"


Load audio files of speech and music using librosa package. Extract STFT (Short-Time Fourier Transform) values for each audio file, and save it in a dataframe along with its label (speech/music).

In [11]:
def gtzan_loader():
    data = []
    for dir in ["music_wav", "speech_wav"]:
        dir_path = GTZAN_PATH + "/" + dir
        for filename in os.listdir(dir_path):
            file_path = os.path.join(dir_path, filename)
            if os.path.isfile(file_path):  # Check if it's a file
                try:
                    # Load the audio file using librosa
                    y, sr = librosa.load(file_path, mono=False, sr=None)
                    mfcc = librosa.feature.chroma_stft(y=y, sr=sr).flatten()

                    # Append a row with all MFCC values and label
                    label = 0 if dir == "music_wav" else 1
                    data.append([*mfcc, label])
                except Exception as e:
                    print(f"Error reading {filename}: {e}")

    return data

In [12]:
data = gtzan_loader()
    
# Create DataFrame with MFCC columns and a label column
num_mfcc_features = len(data[0]) - 1  # Subtract 1 for the label column
column_names = [f'mfcc_{i+1}' for i in range(num_mfcc_features)] + ['label']
df_gtzan = pd.DataFrame(data, columns=column_names)

In [13]:
df_gtzan.shape

(128, 15505)

### Training

Split into train and test sets.

In [14]:
df_gtzan = df_gtzan.sample(frac=1)      # Shuffle rows

X, y = df_gtzan.loc[:, df_gtzan.columns != 'label'], df_gtzan['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [21]:
y_test.value_counts()

label
1    8
0    5
Name: count, dtype: int64

##### Model 1: SVM

Train

In [15]:
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)

Test

In [16]:
y_pred = clf.predict(X_test)

In [17]:
accuracy_score(y_test, y_pred)

1.0

In [20]:
precision_recall_fscore_support(y_test, y_pred)

(array([1., 1.]), array([1., 1.]), array([1., 1.]), array([5, 8], dtype=int64))

### Inference

In [25]:
TEST_PATH = "D:\Songs\Killer.mp3"

y, sr = librosa.load(TEST_PATH, mono=False, sr=None)
mfcc = librosa.feature.mfcc(y=y, sr=sr).flatten()
print(mfcc.shape)

(577000,)


In [None]:
clf.predict(*mfcc)