In [1]:
%matplotlib inline

import pickle
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import structure


In [2]:
##################################
# if you want to load the object #
##################################

try:
    PATH = '/Users/maxwellclarke/Documents/data/fma_metadata/segmented'
    os.chdir(PATH)
except:
    PATH = r'C:\Users\james\Documents\data\fma_metadata\segmented'
    os.chdir(PATH)
    
with open('data.pickle', 'rb') as f:
    data = pickle.load(f)
    
dfs = data.segmented_dfs # already sorted data

In [3]:
dfs.keys()

dict_keys(['tracks', 'genres', 'features', 'echonest'])

In [4]:
len(dfs['features'].keys())

77

In [5]:
data.tracks.track.groupby('genre_top').count().sort_values('title', ascending=False)['title']

genre_top
Rock                   14182
Experimental           10608
Electronic              9371
Hip-Hop                 3552
Folk                    2803
Pop                     2332
Instrumental            2079
International           1389
Classical               1230
Jazz                     571
Old-Time / Historic      554
Spoken                   423
Country                  194
Soul-RnB                 175
Blues                    110
Easy Listening            24
Name: title, dtype: int64

### Genres to include

- Rock
- Electronic
- Hip-Hop
- Folk
- Classical

In [6]:
dfs['features'].keys()

dict_keys(['spectral_contrast__std', 'spectral_contrast__skew', 'spectral_contrast__mean', 'spectral_contrast__kurtosis', 'spectral_contrast__median', 'spectral_contrast__min', 'spectral_contrast__max', 'tonnetz__std', 'tonnetz__skew', 'tonnetz__mean', 'tonnetz__kurtosis', 'tonnetz__median', 'tonnetz__min', 'tonnetz__max', 'chroma_stft__std', 'chroma_stft__skew', 'chroma_stft__mean', 'chroma_stft__kurtosis', 'chroma_stft__median', 'chroma_stft__min', 'chroma_stft__max', 'mfcc__std', 'mfcc__skew', 'mfcc__mean', 'mfcc__kurtosis', 'mfcc__median', 'mfcc__min', 'mfcc__max', 'spectral_rolloff__std', 'spectral_rolloff__skew', 'spectral_rolloff__mean', 'spectral_rolloff__kurtosis', 'spectral_rolloff__median', 'spectral_rolloff__min', 'spectral_rolloff__max', 'spectral_bandwidth__std', 'spectral_bandwidth__skew', 'spectral_bandwidth__mean', 'spectral_bandwidth__kurtosis', 'spectral_bandwidth__median', 'spectral_bandwidth__min', 'spectral_bandwidth__max', 'spectral_centroid__std', 'spectral_cent

Each of the above contain summary statistical information derived from the audio file. For now, I'll only work with mean and standard deviation for category.

In [7]:
import re

r = re.compile('.*mean')
q = re.compile('.*std')

to_concat = [df for key, df in dfs['features'].items() if (bool(r.match(key)) | bool(q.match(key)))] # means and stds dfs

df = pd.concat(to_concat +[data.tracks.track[['genre_top']]], axis=1, join='inner')

mask = df['genre_top'].isin(['Rock', 'Electronic', 'Hip-Hop', 'Folk', 'Classical'])

print(df.shape)
df = df[mask]
print(df.shape)


(91214, 149)
(25019, 149)


## Let's see how logistic regression performs

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split


X = df.drop('genre_top', axis=1)
y = df['genre_top']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=.2, stratify=y)

#### Define functions to evaluate our models

In [12]:
from copy import deepcopy
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


def test_roc(X, y, est):
    """Cross validates and evaluates multi-class model."""
    genres = set(y)
    X = X.values
    y = y.values
    
    classes = set(y)
    
    skf = StratifiedKFold(n_splits=5, random_state=42)
    
    roc = {
        'test_auc': [],
        'train_auc': [],
        'test_curve': [],
        'train_curve': []
    }
    
    genre_dict = {genre: deepcopy(roc) for genre in genres}

    for genre in genres:
        targets = (y == genre).astype(int)
        
        for tr_ix, te_ix in skf.split(X, targets):
            

            X_train, X_test = X[tr_ix], X[te_ix]
            y_train, y_test = targets[tr_ix], targets[te_ix]

            est.fit(X_train, y_train)
            y_preds = est.predict(X_test)
            y_preds_proba_test = est.predict_proba(X_test)
            y_preds_proba_train = est.predict_proba(X_train)
            
            ####
            # THE IDEA OF AVERAGING OUT ROC CURVES OVER DIFFERENT FOLDS IS ***MOST QUESTIONABLE***
            ####

            train_score = roc_auc_score(y_train, y_preds_proba_train[:, 1])
            test_score = roc_auc_score(y_test, y_preds_proba_test[:, 1])
            genre_dict[genre]['train_auc'].append(train_score)
            genre_dict[genre]['test_auc'].append(test_score)
            
            genre_dict[genre]['test_curve'].append(roc_curve(y_test, y_preds_proba_test[:, 1])[:2])
            genre_dict[genre]['train_curve'].append(roc_curve(y_train, y_preds_proba_train[:, 1])[:2])
            
        # TODO: MAKE ROC PLOT FOR EACH GENRE WITH LEGEND
    
    return genre_dict

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

pip = Pipeline([
    ('std_scl', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs', max_iter=5000))
])

scores = test_roc(X_train, y_train, pip)

In [20]:
def get_results(scores):
    for genre in set(y):
        if len(genre) > 5:
            print(genre, ':\t', np.round(np.mean(scores[genre]['train_auc']),4))
        else:
            print(genre, ':\t\t', np.round(np.mean(scores[genre]['train_auc']),4))
        
get_results(scores)

Classical :	 0.9894
Folk :		 0.9204
Rock :		 0.9363
Electronic :	 0.9095
Hip-Hop :	 0.9218


### AdaBoost

In [17]:
from sklearn.ensemble import AdaBoostClassifier

pip = Pipeline([
    ('std_scl', StandardScaler()),
    ('abc', AdaBoostClassifier())
])

socres = test_roc(X_train, y_train, pip)

In [19]:
get_results(socres)


Classical :	 0.9958
Folk :		 0.9418
Rock :		 0.9309
Electronic :	 0.9028
Hip-Hop :	 0.9256


### Gradient Boost with Logistic Cost Function

In [23]:
from sklearn.ensemble import GradientBoostingClassifier

pip = Pipeline([
    ('std_scl', StandardScaler()),
    ('gbc', GradientBoostingClassifier())
])

gbc_scores = test_roc(X_train, y_train, pip)

KeyboardInterrupt: 

#### multi_class_logisitc

In [56]:
from sklearn.ensemble import BaseEnsemble

In [110]:
from copy import deepcopy
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score



def multi_class_logistic_test(X, y, est):
    """Cross validates and evaluates multi-class model."""
    genres = set(y)
    X = X.values
    y = y.values
    
    classes = set(y)
    
    skf = StratifiedKFold(n_splits=5, random_state=42)
    
    scores = {
        'train': [],
        'test': []
    }
    
    estimators = []
    for tr_ix, te_ix in skf.split(X, y):
        esti = deepcopy(est)

        X_train, X_test = X[tr_ix], X[te_ix]
        y_train, y_test = y[tr_ix], y[te_ix]

        esti.fit(X_train, y_train)
        
        y_train_preds = esti.predict(X_train)
        y_test_preds = esti.predict(X_test)
        y_preds_proba_test = esti.predict_proba(X_test)
        y_preds_proba_train = esti.predict_proba(X_train)

        scores['train'].append(accuracy_score(y_train, y_train_preds))
        scores['test'].append(accuracy_score(y_test, y_test_preds))
            
        estimators.append(esti)
    
    return scores, estimators
    

In [145]:
lr = LogisticRegression()

scores, estimators = multi_class_logistic_test(X_train, y_train, lr)



In [None]:
estimators[0].coef_

In [125]:
targets = dfs['tracks']['track']['genre_top']
simple_feats = pd.concat([dfs['features']['zcr__mean'], dfs['features']['zcr__std'], targets], axis=1, join='inner')
simple_feats.head()

Unnamed: 0_level_0,01,01,genre_top
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.085629,0.061448,Hip-Hop
3,0.084578,0.06933,Hip-Hop
5,0.053114,0.044861,Hip-Hop
10,0.077515,0.0408,Pop
20,0.047225,0.030993,


In [131]:
mask = (simple_feats['genre_top'].isin(['Rock', 'Electronic']))

In [132]:
simple_feats = simple_feats[mask]

In [133]:
lr = LogisticRegression()
lr.fit(simple_feats.drop('genre_top', axis=1), simple_feats['genre_top'])






LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [134]:
with open('simple_model.pickle', 'wb') as f:
    pickle.dump(lr, f)

In [149]:
simple_feats.groupby('genre_top').describe()

Unnamed: 0_level_0,01,01,01,01,01,01,01,01,01,01,01,01,01,01,01,01
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
genre_top,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Electronic,7718.0,0.053568,0.02764,0.002491,0.034787,0.049146,0.067003,0.462462,7718.0,0.049832,0.027971,0.001702,0.030712,0.044816,0.06335,0.264326
Rock,11129.0,0.058187,0.022993,0.002735,0.043274,0.056026,0.06957,0.550677,11129.0,0.032351,0.017574,0.001785,0.021154,0.027751,0.03817,0.178451


In [143]:
lr.predict(simple_feats.drop('genre_top', axis=1).values[0].reshape(1, -1))

array(['Rock'], dtype=object)

In [144]:
lr.coef_

array([[ 17.08494855, -29.92371724]])

In [52]:
#############
# MODIFYING ABOVE TO COMBINE PREDICTION OF ALL
#############

from sklearn.base import clone

def multi_test(X, y, est):
    """Cross validates and evaluates multi-class model."""
    genres = set(y)
    X = X.values
    y = y.values
    
    classes = list(set(y))
    
    skf = StratifiedKFold(n_splits=5, random_state=42)
    
    scores = {
        'train': [],
        'test': []
    }
        
    estimators = {}
        
    for tr_ix, te_ix in skf.split(X, y):
        

        X_train, X_test = X[tr_ix], X[te_ix]
        y_train, y_test = y[tr_ix], y[te_ix]
        
        for genre in genres:
            
            targets = (y_train == genre).astype(int)

            
            estimators[genre] = clone(est)

            estimators[genre].fit(X_train, targets)
            
#             y_preds = estimators[genre].predict(X_test)
#             y_preds_proba_test = estimators[genre].predict_proba(X_test)
#             y_preds_proba_train = estimators[genre].predict_proba(X_train)

            ####
            # THE IDEA OF AVERAGING OUT ROC CURVES OVER DIFFERENT FOLDS IS ***MOST QUESTIONABLE***
            ####
        
        probas = np.array([estimators[genre].predict_proba(X_train) for genre in genres])
        print(X_train.shape)
        print(probas.shape)
        probas = probas[:,:,0].reshape(-1, 5)
        print(probas.shape)
        
        preds = np.array([classes[np.argmax(prob)] for prob in probas])
        print(preds.shape)
        print(y_train.shape)
        
        scores['train'].append(accuracy_score(y_train, preds))
        
        probas = np.array([estimators[genre].predict_proba(X_test) for genre in genres])
        probas = probas[:,:,0].reshape(-1, 5)
        preds = np.array([classes[np.argmax(prob)] for prob in probas])
        
        scores['test'].append(accuracy_score(y_test, preds))
        
    return scores
            

#         train_score = roc_auc_score(y_train, y_preds_proba_train[:, 1])
#         test_score = roc_auc_score(y_test, y_preds_proba_test[:, 1])
#         genre_dict[genre]['train_auc'].append(train_score)
#         genre_dict[genre]['test_auc'].append(test_score)

            
        # TODO: MAKE ROC PLOT FOR EACH GENRE WITH LEGEND
    
    

In [54]:
lr = LogisticRegression()

scores = multi_test(X_train, y_train, lr)



(16010, 148)
(5, 16010, 2)
(16010, 5)
(16010,)
(16010,)




(16011, 148)
(5, 16011, 2)
(16011, 5)
(16011,)
(16011,)




(16011, 148)
(5, 16011, 2)
(16011, 5)
(16011,)
(16011,)




(16013, 148)
(5, 16013, 2)
(16013, 5)
(16013,)
(16013,)




(16015, 148)
(5, 16015, 2)
(16015, 5)
(16015,)
(16015,)




In [55]:
scores # something is rotten in the state of denmark

{'train': [0.1971267957526546,
  0.1985509961901193,
  0.20029979389169947,
  0.19640292262536688,
  0.20068685607243208],
 'test': [0.1990012484394507,
  0.2012987012987013,
  0.18856143856143856,
  0.20514742628685656,
  0.2045]}