In [16]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm

from feature_extractor import FeatureExtractor
from server.database_wrapper import PostgresqlWrapper
from server.utils import Util

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Грузим датасет

In [201]:
# Accumulators filled in place by the loading loop: loaded clips and,
# in parallel, the genre name of each clip.
data, genre_list = [], []

In [12]:
# Genre names in loading order; the same order is repeated later when the
# integer labels are mapped back to names.
genres = [
    'classical', 'metal', 'blues', 'hiphop', 'disco',
    'pop', 'rock', 'country', 'reggae', 'jazz',
]

In [202]:
def getData(where_to, genre_list, genre, range_tuple, root="../../"):
    """Load the first ``range_tuple`` clips of one genre, appending in place.

    Clip ``i`` is read from ``<root><genre>/<genre>.<i as five digits>.au``.

    Parameters
    ----------
    where_to : list
        Receives the first element of each ``librosa.load`` result.
    genre_list : list
        Receives ``genre`` once per loaded clip, parallel to ``where_to``.
    genre : str
        Genre name; used as the directory name and the file-name prefix.
    range_tuple : int
        Number of clips to load (indices ``0 .. range_tuple - 1``). Despite
        its name this is an integer count, not a tuple.
    root : str, optional
        Prefix of the dataset directory. Defaults to ``"../../"``.

    Returns
    -------
    None
        Both lists are extended in place.
    """
    for i in tqdm(range(range_tuple)):
        # Zero-pad to five digits; unlike the manual "0000"/"000" prefixes
        # this also yields a correct name for indices of 100 and above.
        path = "{}{}/{}.{:05d}.au".format(root, genre, genre, i)

        # librosa.load returns a tuple; only its first element is kept.
        where_to.append(librosa.load(path)[0])
        genre_list.append(genre)

In [203]:
# Load 100 clips per genre; `data` and `genre_list` grow in place, so
# re-running this cell without resetting them appends duplicates.
for genre in genres:
    getData(data, genre_list, genre, 100)

100%|██████████| 100/100 [00:53<00:00,  1.79it/s]
100%|██████████| 100/100 [00:59<00:00,  1.85it/s]
100%|██████████| 100/100 [00:54<00:00,  2.03it/s]
100%|██████████| 100/100 [00:50<00:00,  1.79it/s]
100%|██████████| 100/100 [00:55<00:00,  1.61it/s]
100%|██████████| 100/100 [01:04<00:00,  1.58it/s]
100%|██████████| 100/100 [00:56<00:00,  1.91it/s]
100%|██████████| 100/100 [00:58<00:00,  2.07it/s]
100%|██████████| 100/100 [00:59<00:00,  1.28it/s]
100%|██████████| 100/100 [00:58<00:00,  1.96it/s]


In [4]:
# Load a single hip-hop clip; [0] keeps the first element of the returned tuple.
song = librosa.load("../../hiphop/hiphop.00001.au")[0]

In [6]:
# Constant-Q chromagram of the clip.
qtr = librosa.feature.chroma_cqt(y=song)

In [10]:
# Mean along axis 1; the recorded output below has 12 values.
qtr.mean(axis = 1)

array([0.27858519, 0.59648524, 0.97147959, 0.62372022, 0.2951616 ,
       0.25554028, 0.2407301 , 0.2467557 , 0.240299  , 0.24933925,
       0.2641769 , 0.26098882])

# Извлечём фичи из Бургера

In [685]:
# extractor = FeatureExtractor(data)

In [686]:
# Build the extractor over a one-element list holding the burger.mp3 signal.
extractor = FeatureExtractor([librosa.load("../../burger.mp3")[0]])

In [687]:
# Q-transform features; presumably the qtransf_mean_* columns — confirm in FeatureExtractor.
qtransform = extractor.generateQtransform()

Got qtr data for 0 songs


In [688]:
# Energy feature (the helper logs it as "rmse data"); presumably the "energy" column — confirm.
low_energy = extractor.generate_energy()

Got rmse data for 0 songs


In [689]:
# Flux features; presumably the flux_mean / flux_std columns — confirm.
flux = extractor.generate_flux()

Got flux data for 0 songs


In [690]:
# Rolloff features; presumably the rolloff_mean / rolloff_std columns — confirm.
rolloff = extractor.generate_rolloff()

Got rolloff data for 0 songs


In [691]:
# Rhythm features; presumably the tempo_* and num_tempo_changes columns — confirm.
rhythm = extractor.generate_rhythm()

Got rhythm data for 0 songs


In [692]:
# MFCC features, requested with 20 coefficients and a 22050 Hz sample rate.
mfcc_means = extractor.generate_mfcc(n_mfcc=20, sr=22050)

Got mfcc for 0 songs


In [693]:
# Zero-crossing-rate features; both positional arguments are passed as None
# (their meaning is defined in FeatureExtractor — not visible here).
zcrs = extractor.generate_zero_crossing_rate(None, None)

Got zero_cross_rate for 0 songs


In [694]:
# Spectral-centroid features (the method name is spelled "centoid" in the helper).
cent = extractor.generate_centoid_meanstd()

Got centroid data for 0 songs


In [17]:
# Feature column names in table order, followed by the label column.
list_names = (
    ["mfcc_mean_{}".format(k) for k in range(1, 21)]
    + [
        "std_mfcc",
        "zero_cros_mean",
        "zero_cros_std",
        "cent_mean",
        "cent_std",
        "tempo_static",
        "tempo_mean",
        "tempo_std",
        "num_tempo_changes",
        "rolloff_mean",
        "rolloff_std",
        "flux_mean",
        "flux_std",
        "energy",
    ]
    + ["qtransf_mean_{}".format(k) for k in range(1, 13)]
    + ["genre"]
)

# Обучаем модель

In [16]:
# X = np.hstack((mfcc_means, zcrs, cent, rhythm, rolloff, flux, low_energy, qtransform))

In [224]:
# Save to a CSV file for convenient reuse later
# pd.DataFrame(np.hstack((X,y.reshape(-1,1))), columns=list_names).to_csv("extracted_data.csv", encoding="utf-8")

In [697]:
# Load the cached feature table. DataFrame.from_csv is deprecated and was
# removed in pandas 1.0; read_csv with index_col=0 reads the same table, and
# .values drops the index either way.
data = pd.read_csv("extracted_data.csv", index_col=0, encoding="utf-8").values

In [698]:
# Features: every column except the last one, cast to float.
X = np.array(data[:, :-1], dtype = float)
# Integer labels 0..9, 100 consecutive rows each — assumes the CSV rows are
# ordered by genre with 100 rows per genre; TODO confirm against the file.
y = np.arange(10).repeat(100)

In [699]:
# Exclude the poorly performing genres.
# NOTE: genre_list is only filled by the audio-loading loop, so that loop
# must have run before this cell.
mask = ~np.isin(np.array(genre_list), ['country', 'blues', 'reggae'])

In [700]:
# Keep only the rows of the retained genres. X and y are overwritten in
# place, so this cell must not be re-run without first rebuilding X and y.
X = X[mask]
y = y[mask]

In [701]:
# Names of the genres whose integer labels are still present in y, in label
# order. This rebinds `genres` from the full list to the filtered array.
genres = np.array([
    'classical', 'metal', 'blues', 'hiphop', 'disco',
    'pop', 'rock', 'country', 'reggae', 'jazz',
])[np.unique(y)]

In [702]:
# Relabel the surviving classes as consecutive integers 0..k-1. The inverse
# index from np.unique maps each row to the rank of its label, so this no
# longer assumes exactly 100 sorted rows per class.
y = np.unique(y, return_inverse=True)[1]

In [703]:
# Stratified 70/30 train/test split. A fixed random_state makes the split,
# and every score computed from it below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0)

In [704]:
# Standardise the features; the scaler is fitted on the training split only
# and then reused unchanged for the test split.
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [705]:
# Wrap the scaled splits and their labels in xgboost's DMatrix container.
dtrain = xgb.DMatrix(X_scaled_train, label=y_train)
dtest = xgb.DMatrix(X_scaled_test, label=y_test)

In [706]:
# Xgboost training
# Multi-class softmax objective; num_class is the number of distinct training labels.
param = {'objective': "multi:softmax", "num_class": np.unique(y_train).size}
# Both sets are evaluated after every boosting round (see the merror log below).
evallist = [(dtrain, 'train'), (dtest, 'eval')]
# No num_boost_round is passed; the recorded log shows rounds 0-9.
bst = xgb.train(param, dtrain, evals=evallist)

[0]	train-merror:0.071429	eval-merror:0.385714
[1]	train-merror:0.02449	eval-merror:0.409524
[2]	train-merror:0.010204	eval-merror:0.404762
[3]	train-merror:0.010204	eval-merror:0.385714
[4]	train-merror:0.006122	eval-merror:0.390476
[5]	train-merror:0	eval-merror:0.385714
[6]	train-merror:0	eval-merror:0.390476
[7]	train-merror:0	eval-merror:0.37619
[8]	train-merror:0	eval-merror:0.385714
[9]	train-merror:0	eval-merror:0.37619


In [707]:
# Per-genre precision / recall / F1 of the xgboost model on the held-out split.
print(classification_report(y_test, bst.predict(dtest), target_names=genres))

             precision    recall  f1-score   support

  classical       0.90      0.93      0.92        30
      metal       0.72      0.70      0.71        30
     hiphop       0.54      0.47      0.50        30
      disco       0.49      0.57      0.52        30
        pop       0.65      0.73      0.69        30
       rock       0.31      0.27      0.29        30
       jazz       0.72      0.70      0.71        30

avg / total       0.62      0.62      0.62       210



In [711]:
# Build a hard-voting ensemble of two grid searches: an SVM and a random forest.
# The base estimators and the forest grid are defined here (with the same
# values the later cells use) so that this cell does not depend on variables
# that are only created further down the notebook.
svm = SVC(kernel='rbf', C=50)
clf = RandomForestClassifier(n_estimators=500, max_depth=10, max_features='log2')
param_grid = {
    'n_estimators': [70, 150, 370],
    'max_features': ['log2'],
    'max_depth': [10, 15, 20],
    'criterion': ['gini'],
}

randomForest_grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=True)
grid = GridSearchCV(svm, {"C": np.logspace(0, 2, num=20), "kernel": ["poly", "rbf", "sigmoid"]}, n_jobs=-1, verbose=True)
eclf = VotingClassifier(estimators=[('svc', grid), ('rndf', randomForest_grid)], voting='hard')

In [652]:
# Sanity check for a model: split the data with several different random
# states and return the classification report averaged over those splits.

def check_model(model, X, y, n):
    """Average ``classification_report`` over ``n`` stratified 70/30 splits.

    For ``random_state`` 0 .. n-1 the data is split, a ``StandardScaler`` is
    fitted on the training part and applied to both parts, ``model`` is
    re-fitted, and the parsed test report is accumulated.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Re-fitted on every split, so it is left fitted on the last one.
    X, y : array-like
        Features and labels; ``y`` is also used for stratification.
    n : int
        Number of splits to average over; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as one parsed report, holding the mean values.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 (there would be nothing to average).

    Notes
    -----
    Reads the notebook-level ``genres`` for the report's class names.
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {!r}".format(n))

    total = None
    report = None
    for seed in range(n):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=seed, stratify=y)
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_scaled_train = scaler.transform(X_train)
        X_scaled_test = scaler.transform(X_test)
        model.fit(X_scaled_train, y_train)
        report = parse_class_report(classification_report(
            y_test, model.predict(X_scaled_test), target_names=genres))
        # The accumulator takes its shape from the first report instead of a
        # hard-coded (len(genres) + 1, 4).
        scores = report.values.astype(float)
        total = scores if total is None else total + scores

    # Return a fresh frame rather than overwriting the last report in place.
    return pd.DataFrame(total / n, index=report.index, columns=report.columns)

def _is_number(token):
    """Return True when ``token`` can be parsed as a float."""
    try:
        float(token)
    except ValueError:
        return False
    return True


def parse_class_report(class_rep_str):
    """Parse the text of ``classification_report`` into a numeric DataFrame.

    The first non-blank line supplies the column names. On every following
    non-blank line the trailing numeric tokens (at most one per column) are
    the values and everything before them is the row label, so labels made of
    several words ("avg / total", "macro avg", multi-word class names) stay
    intact. Rows with fewer values than columns are left-padded with NaN.

    Parameters
    ----------
    class_rep_str : str
        Text returned by ``sklearn.metrics.classification_report``.

    Returns
    -------
    pandas.DataFrame
        One row per report line, indexed by label (index name ``'score'``),
        with the report's columns converted to numbers.

    Raises
    ------
    ValueError
        If the text contains no non-blank line.
    """
    rows = [line.split() for line in class_rep_str.split('\n')]
    rows = [tokens for tokens in rows if tokens]
    if not rows:
        raise ValueError("classification report is empty")

    columns = rows[0]
    n_cols = len(columns)

    labels = []
    values = []
    for tokens in rows[1:]:
        # Count numeric tokens from the right, always leaving at least one
        # token for the label.
        n_vals = 0
        limit = min(n_cols, len(tokens) - 1)
        while n_vals < limit and _is_number(tokens[-1 - n_vals]):
            n_vals += 1
        split_at = len(tokens) - n_vals
        labels.append(' '.join(tokens[:split_at]))
        values.append([None] * (n_cols - n_vals) + tokens[split_at:])

    df = pd.DataFrame(values, index=pd.Index(labels, name='score'), columns=columns)
    # pd.to_numeric replaces the removed DataFrame.convert_objects; values
    # that cannot be converted become NaN.
    return df.apply(pd.to_numeric, errors='coerce')

In [709]:
# Average the ensemble's report over 10 stratified splits (random_state 0..9).
average_score = check_model(eclf, X, y, 10)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    4.6s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.7s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.2s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.9s finished


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.0s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.2s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.0s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.4s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    1.3s


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.1s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    6.5s finished


Fitting 3 folds for each of 60 candidates, totalling 180 fits


  if diff:


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    7.4s finished
  if diff:


In [710]:
# Display the averaged report.
average_score

Unnamed: 0_level_0,precision,recall,f1-score,support
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
classical,0.864,0.921,0.891,30.0
metal,0.804,0.895,0.846,30.0
hiphop,0.615,0.774,0.681,30.0
disco,0.657,0.595,0.621,30.0
pop,0.792,0.798,0.791,30.0
rock,0.636,0.465,0.533,30.0
jazz,0.877,0.761,0.81,30.0
avg / total,0.748,0.743,0.74,210.0


In [None]:
# Let's try feeding burger to the model

In [715]:
# Fit the ensemble on the scaled training split.
eclf.fit(X_scaled_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    5.7s finished


VotingClassifier(estimators=[('svc', GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=Fal...pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [716]:
# `burger` is not defined anywhere in this notebook. Rebuild it from the
# features extracted from burger.mp3, stacked in the same order as the
# commented-out construction of X, and scale it with the scaler that was
# fitted on X_train (the one the ensemble's training data went through).
# NOTE(review): assumes every extractor output is a 2-D array with one row
# per track — confirm against FeatureExtractor.
burger = scaler.transform(
    np.hstack((mfcc_means, zcrs, cent, rhythm, rolloff, flux, low_energy, qtransform)))
genres[eclf.predict(burger)[0]]

  if diff:


'hiphop'

## Остальные модели

In [481]:
# Baseline: a single RBF-kernel SVM with C=50, fitted on the scaled training split.
svm = SVC(kernel='rbf', C=50)
svm.fit(X_scaled_train, y_train)

SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [331]:
# Report for the single SVM on the held-out split.
# NOTE: the recorded output below lists blues and reggae, which the mask cell
# excludes, so it appears to come from a run with a different genre subset.
print(classification_report(y_test, svm.predict(X_scaled_test), target_names=genres))

             precision    recall  f1-score   support

  classical       0.91      0.97      0.94        30
      metal       0.89      0.83      0.86        30
      blues       0.73      0.80      0.76        30
     hiphop       0.76      0.63      0.69        30
      disco       0.79      0.77      0.78        30
        pop       0.79      0.87      0.83        30
     reggae       0.77      0.77      0.77        30

avg / total       0.80      0.80      0.80       210



In [348]:
# Baseline: a random forest with 500 trees, depth limit 10 and log2 feature sampling.
# NOTE: the recorded TypeError below mentions a 'probability' argument that
# this cell does not pass, so that output appears to be stale.
clf = RandomForestClassifier(n_estimators=500, max_depth=10, max_features='log2')
clf.fit(X_scaled_train, y_train)
print(classification_report(y_test, clf.predict(X_scaled_test), target_names=genres))

TypeError: __init__() got an unexpected keyword argument 'probability'

In [333]:
# Grid search for the SVM: 20 log-spaced C values in [1, 100] x 3 kernels.
grid = GridSearchCV(svm, {"C": np.logspace(0,2,num=20), "kernel": ["poly", "rbf", "sigmoid"]}, n_jobs = -1, verbose=True)
grid.fit(X_scaled_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.2s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([  1.     ,   1.27427,   1.62378,   2.06914,   2.63665,   3.35982,
         4.28133,   5.45559,   6.95193,   8.85867,  11.28838,  14.3845 ,
        18.32981,  23.35721,  29.76351,  37.9269 ,  48.3293 ,  61.58482,
        78.476  , 100.     ]), 'kernel': ['poly', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [334]:
# Report for the grid-searched SVM on the held-out split.
print(classification_report(y_test, grid.predict(X_scaled_test), target_names=genres))

             precision    recall  f1-score   support

  classical       0.91      0.97      0.94        30
      metal       0.89      0.80      0.84        30
      blues       0.73      0.80      0.76        30
     hiphop       0.73      0.63      0.68        30
      disco       0.79      0.77      0.78        30
        pop       0.79      0.87      0.83        30
     reggae       0.77      0.77      0.77        30

avg / total       0.80      0.80      0.80       210



In [668]:
# Random-forest grid: 3 tree counts x 3 depth limits (9 candidates).
param_grid = { 
    'n_estimators': [ 70, 150, 370],
    'max_features': ['log2'],
    'max_depth' : [10,15, 20],
    'criterion' :['gini']
}

# 5-fold grid search over the forest. The recorded run of this cell was
# interrupted (KeyboardInterrupt in the output below).
CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs = -1, verbose=True)
CV_rfc.fit(X_scaled_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


KeyboardInterrupt: 

In [337]:
# Report for the grid-searched random forest on the held-out split.
print(classification_report(y_test, CV_rfc.predict(X_scaled_test), target_names=genres))

             precision    recall  f1-score   support

  classical       1.00      0.93      0.97        30
      metal       0.80      0.80      0.80        30
      blues       0.74      0.87      0.80        30
     hiphop       0.79      0.50      0.61        30
      disco       0.73      0.73      0.73        30
        pop       0.79      0.90      0.84        30
     reggae       0.71      0.80      0.75        30

avg / total       0.80      0.79      0.79       210



In [338]:

# Best forest hyper-parameters found by the grid search.
# NOTE: the recorded output below (entropy, depth 8, 500 trees) is not in the
# param_grid defined above, so it comes from an earlier version of the grid.
CV_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 500}

In [136]:
from sklearn.ensemble import VotingClassifier

Fitting 3 folds for each of 60 candidates, totalling 180 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   14.6s finished


VotingClassifier(estimators=[('svc', GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=Fal...pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

             precision    recall  f1-score   support

  classical       0.91      1.00      0.95        30
      metal       0.83      0.80      0.81        30
      blues       0.72      0.87      0.79        30
     hiphop       0.71      0.67      0.69        30
      disco       0.75      0.70      0.72        30
        pop       0.87      0.87      0.87        30
     reggae       0.85      0.73      0.79        30

avg / total       0.81      0.80      0.80       210



  if diff:
