In [1]:
%matplotlib inline

import pickle
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import structure

In [2]:
##################################
# if you want to load the object #
##################################

try:
    PATH = '/Users/maxwellclarke/Documents/data/fma_metadata/segmented'
    os.chdir(PATH)
except:
    PATH = r'C:\Users\james\Documents\data\fma_metadata\segmented'
    os.chdir(PATH)
    
with open('data.pickle', 'rb') as f:
    data = pickle.load(f)
    
dfs = data.segmented_dfs # already sorted data

In [3]:
dfs.keys()

dict_keys(['tracks', 'genres', 'features', 'echonest'])

In [4]:
len(dfs['features'].keys())

77

In [5]:
data.tracks.track.groupby('genre_top').count().sort_values('title', ascending=False)['title']

genre_top
Rock                   14182
Experimental           10608
Electronic              9371
Hip-Hop                 3552
Folk                    2803
Pop                     2332
Instrumental            2079
International           1389
Classical               1230
Jazz                     571
Old-Time / Historic      554
Spoken                   423
Country                  194
Soul-RnB                 175
Blues                    110
Easy Listening            24
Name: title, dtype: int64

### Genres to include

- Rock
- Electronic
- Hip-Hop
- Folk
- Classical

In [6]:
dfs['features'].keys()

dict_keys(['spectral_contrast__std', 'spectral_contrast__skew', 'spectral_contrast__mean', 'spectral_contrast__kurtosis', 'spectral_contrast__median', 'spectral_contrast__min', 'spectral_contrast__max', 'tonnetz__std', 'tonnetz__skew', 'tonnetz__mean', 'tonnetz__kurtosis', 'tonnetz__median', 'tonnetz__min', 'tonnetz__max', 'chroma_stft__std', 'chroma_stft__skew', 'chroma_stft__mean', 'chroma_stft__kurtosis', 'chroma_stft__median', 'chroma_stft__min', 'chroma_stft__max', 'mfcc__std', 'mfcc__skew', 'mfcc__mean', 'mfcc__kurtosis', 'mfcc__median', 'mfcc__min', 'mfcc__max', 'spectral_rolloff__std', 'spectral_rolloff__skew', 'spectral_rolloff__mean', 'spectral_rolloff__kurtosis', 'spectral_rolloff__median', 'spectral_rolloff__min', 'spectral_rolloff__max', 'spectral_bandwidth__std', 'spectral_bandwidth__skew', 'spectral_bandwidth__mean', 'spectral_bandwidth__kurtosis', 'spectral_bandwidth__median', 'spectral_bandwidth__min', 'spectral_bandwidth__max', 'spectral_centroid__std', 'spectral_cent

Each of the above contain summary statistical information derived from the audio file. For now, I'll only work with mean and standard deviation for category.

In [7]:
import re

r = re.compile('.*mean')
q = re.compile('.*std')

to_concat = [df for key, df in dfs['features'].items() if (bool(r.match(key)) | bool(q.match(key)))] # means and stds dfs

df = pd.concat(to_concat + [data.tracks.track[['genre_top']]], axis=1, join='inner')

mask = df['genre_top'].isin(['Rock', 'Electronic', 'Hip-Hop', 'Folk', 'Classical'])

print(df.shape)
df = df[mask]
print(df.shape)


(91214, 149)
(25019, 149)


## Let's see how logistic regression performs

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split


X = df.drop('genre_top', axis=1)
y = df['genre_top']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.2, stratify=y)

#### Define functions to evaluate our models

In [44]:
from copy import deepcopy
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def test_roc(X, y, est):
    """Cross validates and evaluates multi-class model."""
    genres = set(y)
    X = X.values
    y = y
    
    skf = StratifiedKFold(n_splits=5, random_state=42)
    
    roc = {
        'test_auc': [],
        'train_auc': [],
        'test_curve': [],
        'train_curve': []
    }
    
    genre_dict = {genre: deepcopy(roc) for genre in genres}
    
    genre_code = {
        'Classical': [1,0,0,0,0],
        'Electronic': [0,1,0,0,0],
        'Folk': [0,0,1,0,0],
        'Hip-Hop': [0,0,0,1,0],
        'Rock': [0,0,0,0,1]
    }

    for genre in genres:
        targets = ((pd.get_dummies(y).values == genre_code[genre]).sum(axis=1) > 3) # hacky but it works 
        
        for tr_ix, te_ix in skf.split(X, targets):

            X_train, X_test = X[tr_ix], X[te_ix]
            y_train, y_test = targets[tr_ix], targets[te_ix]

            est.fit(X_train, y_train)
            y_preds = est.predict(X_test)
            y_preds_proba_test = est.predict_proba(X_test)
            y_preds_proba_train = est.predict_proba(X_train)

            train_score = roc_auc_score(y_train, y_preds_proba_train[:, 1])
            test_score = roc_auc_score(y_test, y_preds_proba_test[:, 1])
            genre_dict[genre]['train_auc'].append(train_score)
            genre_dict[genre]['test_auc'].append(test_score)
    
    return genre_dict

In [46]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pip = Pipeline([
    ('std_scl', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=1000))
])

scores = test_roc(X_train, y_train, pip)

In [47]:
def get_results(scores):
    for genre in set(y):
        if len(genre) > 5:
            print(genre, ':\t', np.round(np.mean(scores[genre]['train_auc']),4))
        else:
            print(genre, ':\t\t', np.round(np.mean(scores[genre]['train_auc']),4))
        
get_results(scores)

Folk :		 0.9204
Classical :	 0.9894
Electronic :	 0.9095
Rock :		 0.9363
Hip-Hop :	 0.9218


### AdaBoost

In [17]:
from sklearn.ensemble import AdaBoostClassifier

pip = Pipeline([
    ('std_scl', StandardScaler()),
    ('abc', AdaBoostClassifier())
])

socres = test_roc(X_train, y_train, pip)

In [19]:
get_results(socres)

Classical :	 0.9958
Folk :		 0.9418
Rock :		 0.9309
Electronic :	 0.9028
Hip-Hop :	 0.9256


In [131]:
mask = (simple_feats['genre_top'].isin(['Rock', 'Electronic']))

In [132]:
simple_feats = simple_feats[mask]

In [133]:
lr = LogisticRegression()
lr.fit(simple_feats.drop('genre_top', axis=1), simple_feats['genre_top'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [134]:
with open('simple_model.pickle', 'wb') as f:
    pickle.dump(lr, f)