In [1]:
from python_speech_features import mfcc
import scipy.io.wavfile as wav
import numpy as np
import os
from tqdm.notebook import tqdm
import pickle
from sklearn.model_selection import train_test_split

In [2]:
DATASET_PATH = './genres'
FEATURES_PATH = f'{DATASET_PATH}/features_svm.pkl'

In [3]:
genres = [path for path in os.listdir(DATASET_PATH) if '.' not in path]
genres

['blues',
 'classical',
 'country',
 'disco',
 'hiphop',
 'jazz',
 'metal',
 'pop',
 'reggae',
 'rock']

In [4]:
genre_to_num = {genre: i for i, genre in enumerate(genres)}
genre_to_num

{'blues': 0,
 'classical': 1,
 'country': 2,
 'disco': 3,
 'hiphop': 4,
 'jazz': 5,
 'metal': 6,
 'pop': 7,
 'reggae': 8,
 'rock': 9}

In [5]:
num_to_genre = dict(enumerate(genres))
num_to_genre

{0: 'blues',
 1: 'classical',
 2: 'country',
 3: 'disco',
 4: 'hiphop',
 5: 'jazz',
 6: 'metal',
 7: 'pop',
 8: 'reggae',
 9: 'rock'}

In [6]:
import librosa
from scipy.stats import skew

In [7]:
def get_features(wav_path, rate=22050):
    data, _ = librosa.core.load(wav_path, sr = rate)
    ft1 = librosa.feature.mfcc(data, sr = rate, n_mfcc=13)
    ft2 = librosa.feature.zero_crossing_rate(data)[0]
    ft3 = librosa.feature.spectral_rolloff(data)[0]
    ft4 = librosa.feature.spectral_centroid(data)[0]
    ft5 = librosa.feature.spectral_contrast(data)[0]
    ft6 = librosa.feature.spectral_bandwidth(data)[0]
    ft1_trunc = np.hstack((np.mean(ft1, axis=1), np.std(ft1, axis=1), skew(ft1, axis = 1), np.max(ft1, axis = 1), np.median(ft1, axis = 1), np.min(ft1, axis = 1)))
    ft2_trunc = np.hstack((np.mean(ft2), np.std(ft2), skew(ft2), np.max(ft2), np.median(ft2), np.min(ft2)))
    ft3_trunc = np.hstack((np.mean(ft3), np.std(ft3), skew(ft3), np.max(ft3), np.median(ft3), np.min(ft3)))
    ft4_trunc = np.hstack((np.mean(ft4), np.std(ft4), skew(ft4), np.max(ft4), np.median(ft4), np.min(ft4)))
    ft5_trunc = np.hstack((np.mean(ft5), np.std(ft5), skew(ft5), np.max(ft5), np.median(ft5), np.min(ft5)))
    ft6_trunc = np.hstack((np.mean(ft6), np.std(ft6), skew(ft6), np.max(ft6), np.median(ft6), np.max(ft6)))
    return np.hstack((ft1_trunc, ft2_trunc, ft3_trunc, ft4_trunc, ft5_trunc, ft6_trunc))

In [51]:
genre_folders = [path for path in os.listdir(DATASET_PATH) if '.' not in path]
for genre_name in tqdm(genre_folders[4:5]):
    wavs_list = os.listdir(f'{DATASET_PATH}/{genre_name}')
    for w in wavs_list[:1]:
        wav_path = f'{DATASET_PATH}/{genre_name}/{w}'
        feat = get_features(wav_path)
feat

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




array([-8.08921967e+01,  8.47820816e+01, -3.15480289e+01,  3.39053116e+01,
       -1.98489189e+01,  2.35588837e+01, -2.21487579e+01,  2.00262260e+01,
       -1.67810364e+01,  1.92165146e+01, -1.31140451e+01,  1.14986544e+01,
       -1.07295685e+01,  6.98513412e+01,  2.73957920e+01,  2.34281330e+01,
        2.01807690e+01,  1.26004086e+01,  1.16385260e+01,  9.68514729e+00,
        9.83020878e+00,  9.67762852e+00,  7.47023535e+00,  8.66638279e+00,
        7.24333811e+00,  8.60036278e+00, -9.35900688e-01,  4.07580078e-01,
        4.72338736e-01, -3.52820665e-01, -4.60751653e-02, -9.01150778e-02,
        2.21833717e-02, -4.44833338e-02,  3.16404812e-02, -1.02824897e-01,
        3.98396999e-01, -8.12162831e-03,  3.29952389e-02,  5.48079453e+01,
        2.09831055e+02,  5.70010986e+01,  9.12448730e+01,  2.18620644e+01,
        6.12939796e+01,  8.14974213e+00,  4.72032051e+01,  1.99516659e+01,
        4.08543167e+01,  1.67455673e+01,  3.27091675e+01,  1.48384886e+01,
       -7.35319366e+01,  

In [52]:
def create_save_features(dataset_path: str, out_dir: str):
    data = {}
    data['mfcc'] = []
    data['label'] = []
    """ gets list of folders cotaining wavs and returns created mfcc features and label's csv"""
    genre_folders = [path for path in os.listdir(dataset_path) if '.' not in path]
    for genre_name in tqdm(genre_folders):
        wavs_list = os.listdir(f'{dataset_path}/{genre_name}')
        for wav in tqdm(wavs_list):
            wav_path = f'{dataset_path}/{genre_name}/{wav}'
            data['mfcc'].append(get_features(wav_path).T)
            data['label'].append(genre_to_num[genre_name])
            
    with open(out_dir, 'wb') as f:
        pickle.dump(data, f)

In [54]:
create_save_features(DATASET_PATH, FEATURES_PATH)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))





In [29]:
#  with open(FEATURES_PATH, "rb") as f:
#     data = pickle.load(f)
# set([it.shape for it in data['mfcc']])

### loading data

In [8]:
def load_data(data_path):
    with open(data_path, "rb") as f:
        data = pickle.load(f)

    X = np.stack(data["mfcc"], axis=0)
    y = np.array(data["label"])
    return X, y

In [9]:
def ttf_split(data, test_size, val_size, seed=1234):
    X_train, X_test, y_train, y_test = train_test_split(*data, test_size=test_size, random_state=seed)
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=val_size, random_state=seed)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

In [10]:
X, y = load_data(FEATURES_PATH)
X.shape, y.shape

((1000, 108), (1000,))

In [166]:
test_size = 0.25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=4321)

In [176]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [168]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [169]:
from sklearn.decomposition import PCA

In [170]:
pca = PCA(n_components=30).fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(sum(pca.explained_variance_ratio_)) 

0.9208609755863255


In [171]:
from thundersvm import SVC

In [172]:
clf = SVC(kernel = 'rbf', C=1, probability=True)
clf.fit(X_train_pca, y_train)

SVC(C=1, class_weight={}, probability=True)

In [173]:
print(accuracy_score(clf.predict(X_train_pca), y_train))

0.984


In [177]:
print(classification_report(clf.predict(X_test_pca), y_test))

              precision    recall  f1-score   support

         0.0       0.64      0.78      0.70        18
         1.0       0.90      0.90      0.90        30
         2.0       0.58      0.61      0.60        23
         3.0       0.76      0.63      0.69        35
         4.0       0.65      0.74      0.69        23
         5.0       0.81      0.57      0.67        30
         6.0       0.83      0.77      0.80        26
         7.0       0.73      0.76      0.75        25
         8.0       0.62      0.62      0.62        21
         9.0       0.48      0.68      0.57        19

    accuracy                           0.70       250
   macro avg       0.70      0.71      0.70       250
weighted avg       0.72      0.70      0.71       250

