In [1]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from sklearn.multiclass import OneVsOneClassifier
# Preprocessing and visualization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Metric functions
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors    import KNeighborsClassifier
from sklearn.dummy        import DummyClassifier
from sklearn.tree         import DecisionTreeClassifier

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Ignore warnings if they happen, we don't care (that much)
import warnings; warnings.simplefilter('ignore')

# Cross-validation takes a minute, so we will save these models
from sklearn.externals import joblib

In [2]:
df     = pd.read_csv("data/lyrical_genius.csv")

# Remove pop songs, they are all over the place and hurt classification
df = df[(df["Genre"] != "pop")]

# Remove some irrelevant columns
df = df.drop(columns="Unnamed: 0")
df = df.drop(columns="Unnamed: 0.1")

# We go ahead and remove ALL duplicates
df = df.drop_duplicates(subset=["Name","Artist"],keep=False)

# Give each genre a new cool color
genres = df["Genre"].unique()
unique_colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080'
]
colors = {}
i = 0
for genre in genres:
    colors[genre] = unique_colors[i]
    i+=1
    
# Upsample the amount of occurances of values that don't appear very often
# df = df.append(df[((df["Genre"] != "country") & (df["Genre"] != "edm_dance"))])
extras    = df.copy()
counts    = df["Genre"].value_counts()
max_count = max(df["Genre"].value_counts())
for genre in genres:
    needed = max_count - counts[genre]
    extras = extras.append(df[df["Genre"]==genre].sample(n=needed,replace=True))
df = extras
counts    = df["Genre"].value_counts()
colors_list = [colors[genre] for genre in genres]

df.head()

Unnamed: 0,Genre,Id,Popularity,Is_Exp,Name,Artist,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time_Signature,Lyrics
0,rnb,3ibKnFDaa3GhpPGlOUj7ff,80,False,Let Me Love You,Mario,0.656,0.578,7,-8.97,0,0.0922,0.235,0.0,0.118,0.556,94.514,4,"Mmmm Mmmmm Yeah Mmmmm Yeah, yeah, yeah Mmmm Y..."
1,rnb,2aIdVb8v9KTpEZnftkz2mD,78,False,Buy U a Drank (Shawty Snappin'),T-Pain,0.451,0.55,1,-8.137,1,0.262,0.0108,0.0,0.0737,0.594,80.001,4,
2,rnb,7DFnq8FYhHMCylykf6ZCxA,69,False,Yo (Excuse Me Miss),Chris Brown,0.536,0.612,4,-5.847,1,0.272,0.119,0.0,0.209,0.57,86.768,4,Let’s get ONE. THING. STRAIGHT! Certain shit ...
3,rnb,39YovPslPCXbFYhlYjsZ2Y,67,False,Don't Mess With My Man,Nivea,0.879,0.73,11,-4.369,0,0.164,0.114,0.0,0.241,0.885,99.925,4,"] Uh, I like it baby, uh Uh, one time for the..."
4,rnb,3LmvfNUQtglbTrydsdIqFU,72,False,We Belong Together,Mariah Carey,0.838,0.469,0,-7.992,1,0.0835,0.0358,0.0,0.0928,0.778,139.975,4,"Ooh, oh oh Sweet love, yeah I didn't mean i..."


In [3]:
# Split data into data frames of the right type
x_cols    = ["Is_Exp","Danceability","Energy","Key","Loudness","Mode","Speechiness","Acousticness","Instrumentalness","Liveness","Valence","Tempo","Time_Signature"]
y_cols    = ["Genre"]
meta_cols = ["Id","Popularity","Name","Artist"]

X,y,meta = df[x_cols],df[y_cols].iloc[:,0],df[meta_cols]

In [4]:
# Scale the data and fit run 2D PCA on it
scaler   = StandardScaler()
scaled_X = scaler.fit_transform(X)
pca = PCA(n_components=2)
prin_comp = pca.fit_transform(scaled_X)
prin_df   = pd.DataFrame(data=prin_comp, columns=["PC1","PC2"])

In [5]:
# Split 
X_train, X_test, y_train, y_test = train_test_split(scaled_X,y, test_size=.2, random_state=1234, stratify=y)

In [6]:
clf = LogisticRegressionCV(cv=5, random_state=1234, multi_class="multinomial")
clf.fit(X_train,y_train)

print(clf)

joblib.dump(clf, "logistic.pkl")
y_pred = clf.predict(X_test)
training = clf.score(X_train, y_train)
testing  = clf.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(genres)
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred)) 

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='multinomial', n_jobs=None, penalty='l2',
           random_state=1234, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)
Training Accuracy: 0.7104693679576151
Testing  Accuracy: 0.706057893166219
['rnb' 'hiphop' 'country' 'classical' 'edm_dance' 'rock']
[[263 115  90   1  51  39]
 [ 85 436   6   0  24   8]
 [ 40   5 395  14  20  84]
 [  2   1  17 529   7   3]
 [ 41  19  42  11 373  72]
 [ 27  10 100   1  50 370]]
              precision    recall  f1-score   support

   classical       0.95      0.95      0.95       559
     country       0.61      0.71      0.65       558
   edm_dance       0.71      0.67      0.69       558
      hiphop       0.74      0.78      0.76       559
         rnb       0.57      0.47      0.52       559
        rock       0.64      0.66      0.65       558

   micro avg       0

In [7]:
for i in range(len(clf.classes_)):
    print(clf.classes_[i])
    todos      = [(clf.coef_[i][j],list(X)[j]) for j in range(len(list(X)))]
    todos.sort(key=lambda x: abs(x[0]))
    for i in range(len(todos)):
        print(todos[i])
    

classical
(0.003946303957833425, 'Liveness')
(-0.012320837939287883, 'Tempo')
(-0.059209190800346395, 'Mode')
(0.1386253682006998, 'Valence')
(-0.15647763936258957, 'Key')
(0.18093054703667974, 'Speechiness')
(-0.20481272917398125, 'Time_Signature')
(0.36922499165517536, 'Energy')
(-0.4348551978606229, 'Is_Exp')
(1.0825861517214295, 'Acousticness')
(1.1158497141655224, 'Instrumentalness')
(-1.452747866697919, 'Danceability')
(-2.399882193829322, 'Loudness')
country
(0.07627944681588716, 'Liveness')
(0.1003827640241546, 'Tempo')
(0.11020855853054035, 'Key')
(-0.16959172164308817, 'Time_Signature')
(0.2121117239536969, 'Acousticness')
(0.36076982294641957, 'Loudness')
(-0.6509743152183395, 'Energy')
(-0.6658538708981445, 'Danceability')
(-0.6714826407924182, 'Instrumentalness')
(0.6964723020336722, 'Mode')
(0.9575799765631113, 'Valence')
(-0.9677900936521278, 'Is_Exp')
(-0.9810181266605161, 'Speechiness')
edm_dance
(0.03402132915186239, 'Time_Signature')
(-0.08386011756773848, 'Speechine

In [8]:
def best_classifier(X, y, t_clf, params) :
    """
    Sweeps different settings for the hyperparameters of a Decision Tree classifier,
    calculating the k-fold CV performance for each setting and metric,
    then selects the hyperparameters that maximize the average performance for each metric.
    """
    clf  = GridSearchCV(t_clf, params, cv=5,scoring= "accuracy")
    
    clf.fit(X,y)
    return clf.best_estimator_

In [9]:
weights = ["uniform","distance"]
pvals   = [1,2]
n_neigh = range(1,40,2)
params  = {
    "p": range(5,22,2),
    "weights": weights,
    "p": pvals,
}
knn = KNeighborsClassifier()
clf = best_classifier(X_train,y_train,knn,params)

print(clf)

joblib.dump(clf, "knn.pkl")

y_pred = clf.predict(X_test)
training = clf.score(X_train, y_train)
testing  = clf.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred)) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=17, p=1,
           weights='distance')
Training Accuracy: 1.0
Testing  Accuracy: 0.9829901521933752
[[559   0   0   0   0   0]
 [  0 559   0   0   0   0]
 [  4   0 546   0   0   8]
 [  1   1  17 530   4   6]
 [  4   1   3   0 542   8]
 [  0   0   0   0   0 558]]
              precision    recall  f1-score   support

   classical       1.00      0.95      0.97       559
     country       0.96      0.98      0.97       558
   edm_dance       0.99      0.97      0.98       558
      hiphop       1.00      1.00      1.00       559
         rnb       0.98      1.00      0.99       559
        rock       0.96      1.00      0.98       558

   micro avg       0.98      0.98      0.98      3351
   macro avg       0.98      0.98      0.98      3351
weighted avg       0.98      0.98      0.98      3351



In [10]:
params = {
    "max_depth": range(5,12),
    "min_samples_leaf": range(1,10),
    "criterion": ["gini","entropy"]
}
t = DecisionTreeClassifier()

DTree = best_classifier(X_train, y_train, t, params)

print(DTree)

joblib.dump(DTree, "dtree.pkl")

# predict genres of test data
accuracy = DTree.score(X_test,y_test)
y_pred = DTree.predict(X_test)
training = DTree.score(X_train, y_train)
testing  = DTree.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Training Accuracy: 0.9055294381016342
Testing  Accuracy: 0.8731721874067443
[[491  36  17   0   2  13]
 [ 48 502   0   0   6   3]
 [ 39   7 462   1  12  37]
 [  5   1   7 533  11   2]
 [ 30  13  27   0 467  21]
 [ 34   2  39   0  12 471]]
              precision    recall  f1-score   support

   classical       1.00      0.95      0.98       559
     country       0.84      0.83      0.83       558
   edm_dance       0.92      0.84      0.87       558
      hiphop       0.89      0.90      0.90       559
         rnb       0.76      0.88      0.81       559
        rock       0.86      0.84      0.85       558

   micro avg       0.87      0.87

In [11]:
importance = DTree.feature_importances_
d_feats      = list(X)
todos      = [(importance[i],d_feats[i]) for i in range(len(d_feats))]
todos.sort(key=lambda x: x[0],reverse=True)

In [12]:
todos

[(0.23236646555203988, 'Loudness'),
 (0.11972763774214497, 'Speechiness'),
 (0.11116756671594244, 'Danceability'),
 (0.10370262721781753, 'Instrumentalness'),
 (0.09066175301567177, 'Is_Exp'),
 (0.08960550874967296, 'Acousticness'),
 (0.0794520600186662, 'Energy'),
 (0.06269239212140733, 'Tempo'),
 (0.04421285160139217, 'Valence'),
 (0.03159769648938847, 'Liveness'),
 (0.0168126251574168, 'Key'),
 (0.014429366236905213, 'Mode'),
 (0.0035714493815342196, 'Time_Signature')]

In [14]:
# compare to stratified dummy classifier
dummy = DummyClassifier(strategy='stratified')



dummy.fit(X_train,y_train)

joblib.dump(dummy, "dummy.pkl")
dummy_accuracy = dummy.score(X_test,y_test)
print( "Dummy classifier accuracy is" )
print(dummy_accuracy)

Dummy classifier accuracy is
0.1683079677708147
