In [18]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from sklearn.multiclass import OneVsOneClassifier
# Preprocessing and visualization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Metric functions
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors    import KNeighborsClassifier
from sklearn.dummy        import DummyClassifier
from sklearn.tree         import DecisionTreeClassifier

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Ignore warnings if they happen, we don't care (that much)
import warnings; warnings.simplefilter('ignore')

# Cross-validation takes a minute, so we will save these models
from sklearn.externals import joblib
from sklearn.base import clone

In [2]:
df     = pd.read_csv("data/lyrical_genius.csv")

# Remove pop songs, they are all over the place and hurt classification
df = df[(df["Genre"] != "pop")]

# Remove some irrelevant columns
df = df.drop(columns="Unnamed: 0")
df = df.drop(columns="Unnamed: 0.1")

# We go ahead and remove ALL duplicates
df = df.drop_duplicates(subset=["Name","Artist"],keep=False)

# Give each genre a new cool color
genres = df["Genre"].unique()
unique_colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080'
]
colors = {}
i = 0
for genre in genres:
    colors[genre] = unique_colors[i]
    i+=1


# Upsample the amount of occurances of values that don't appear very often
# df = df.append(df[((df["Genre"] != "country") & (df["Genre"] != "edm_dance"))])
# extras    = df.copy()
# counts    = df["Genre"].value_counts()
# max_count = max(df["Genre"].value_counts())
# for genre in genres:
#     needed = max_count - counts[genre]
#     extras = extras.append(df[df["Genre"]==genre].sample(n=needed,replace=True))
# df = extras
counts    = df["Genre"].value_counts()
colors_list = [colors[genre] for genre in genres]

In [3]:
# Split data into data frames of the right type
x_cols    = ["Is_Exp","Danceability","Energy","Key","Loudness","Mode","Speechiness","Acousticness","Instrumentalness","Liveness","Valence","Tempo","Time_Signature"]
y_cols    = ["Genre"]
meta_cols = ["Id","Popularity","Name","Artist"]

X,y,meta = df[x_cols],df[y_cols].iloc[:,0],df[meta_cols]

# Split 
X_trainP, X_testP, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=1234, stratify=y)

extras    = X_trainP.copy()
counts    = y_train.value_counts()
max_count = max(counts)
extra_y   = pd.Series()
for genre in genres:
    needed = max_count - counts[genre]
    for i in range(needed):
        extra_y = extra_y.append(pd.Series([genre]))
    extras = extras.append(X_trainP[y_train==genre].sample(n=needed,replace=True))
X_trainP = extras
y_train = y_train.append(extra_y)

In [4]:
X_trainP.shape

(13398, 13)

In [5]:
# Scale the data and fit run 2D PCA on it
scaler   = StandardScaler()
scaler.fit(X_trainP)
X_train = scaler.transform(X_trainP)
X_test  = scaler.transform(X_testP)

In [6]:
X_train.shape

(13398, 13)

In [7]:
clf = LogisticRegressionCV(cv=5, random_state=1234, multi_class="multinomial")
clf.fit(X_train,y_train)

print(clf)

joblib.dump(clf, "logistic.pkl")
y_pred = clf.predict(X_test)
training = clf.score(X_train, y_train)
testing  = clf.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(genres)
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred)) 

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='multinomial', n_jobs=None, penalty='l2',
           random_state=1234, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)
Training Accuracy: 0.7106284520077624
Testing  Accuracy: 0.8060230292294066
['rnb' 'hiphop' 'country' 'classical' 'edm_dance' 'rock']
[[ 31  16   9   0   5   4]
 [ 17  57   3   0   4   0]
 [ 19   1 126   2   2  30]
 [  2   1  13 531   9   3]
 [  9   6  16   1 115  22]
 [  4   1  11   0   9  50]]
              precision    recall  f1-score   support

   classical       0.99      0.95      0.97       559
     country       0.71      0.70      0.70       180
   edm_dance       0.80      0.68      0.73       169
      hiphop       0.70      0.70      0.70        81
         rnb       0.38      0.48      0.42        65
        rock       0.46      0.67      0.54        75

   micro avg       

In [8]:
for i in range(len(clf.classes_)):
    print(clf.classes_[i])
    todos      = [(clf.coef_[i][j],list(X)[j]) for j in range(len(list(X)))]
    todos.sort(key=lambda x: abs(x[0]))
    for i in range(len(todos)):
        print(todos[i])

classical
(-0.008426993946581331, 'Liveness')
(-0.071311778302171, 'Mode')
(-0.09378144174400618, 'Tempo')
(0.16287595586454218, 'Speechiness')
(-0.20374502303073222, 'Time_Signature')
(-0.2206160698912292, 'Key')
(0.22806482020200153, 'Valence')
(0.4919374818803662, 'Energy')
(-0.6237077529107955, 'Is_Exp')
(1.116571110938684, 'Acousticness')
(1.3751385456595588, 'Instrumentalness')
(-1.48711793120067, 'Danceability')
(-2.5556691433679104, 'Loudness')
country
(0.02136484181347592, 'Liveness')
(-0.10115744919432847, 'Time_Signature')
(0.11877701156431579, 'Tempo')
(0.18022176830652686, 'Key')
(0.24615050783484105, 'Acousticness')
(0.44466554363828537, 'Loudness')
(-0.5428124415337515, 'Instrumentalness')
(-0.6654690386358262, 'Danceability')
(-0.6777020641823934, 'Energy')
(0.7312489277590659, 'Mode')
(-0.8709611160786486, 'Speechiness')
(0.9458740622801458, 'Valence')
(-1.0123276879380672, 'Is_Exp')
edm_dance
(-0.0017789150842387677, 'Speechiness')
(0.005142525859032556, 'Time_Signatu

In [9]:
def best_classifier(X, y, t_clf, params,scores=["accuracy"]) :
    """
    Sweeps different settings for the hyperparameters of a Decision Tree classifier,
    calculating the k-fold CV performance for each setting and metric,
    then selects the hyperparameters that maximize the average performance for each metric.
    """
    best_parms = []
    
    for score in scores:
        base_clf = clone(t_clf)
        print("Scoring for {}".format(score))
        clf   = GridSearchCV(base_clf, params, cv=5,scoring=score)
        
        clf.fit(X,y)
        print("Done fitting")
        
        mts   = clf.cv_results_["mean_test_score"]
        parms = clf.cv_results_["params"]
        
        for mt, parm in zip(mts,parms):
            print("Score: {:.4f}; Parameters {}".format(mt, parm))
        
        best_parms.append(clf.best_estimator_)
    return best_parms

In [30]:
weights = ["uniform"]
params  = {
    "n_neighbors": range(5,22,2)
}
knn = KNeighborsClassifier()
# clf = best_classifier(X_train,y_train,knn,params)[0]

print(clf)
clf = KNeighborsClassifier(n_neighbors=11)
clf.fit(X_train, y_train)

joblib.dump(clf, "knn.pkl")

y_pred = clf.predict(X_test)
training = clf.score(X_train, y_train)
testing  = clf.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred)) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=11, p=2,
           weights='uniform')
Training Accuracy: 0.8984176742797433
Testing  Accuracy: 0.7537643932683791
[[ 27  19   8   0   6   5]
 [ 23  48   4   0   3   3]
 [ 17   4 114   3  12  30]
 [  3   1  13 522  12   8]
 [ 20   6  20   1 106  16]
 [ 11   4  18   0   8  34]]
              precision    recall  f1-score   support

   classical       0.99      0.93      0.96       559
     country       0.64      0.63      0.64       180
   edm_dance       0.72      0.63      0.67       169
      hiphop       0.59      0.59      0.59        81
         rnb       0.27      0.42      0.33        65
        rock       0.35      0.45      0.40        75

   micro avg       0.75      0.75      0.75      1129
   macro avg       0.59      0.61      0.60      1129
weighted avg       0.78      0.75      0.77      1129



In [31]:
params = {
    "max_depth": range(5,12),
    "min_samples_leaf": range(1,10),
    "criterion": ["entropy"]
}
t = DecisionTreeClassifier()

# DTree = best_classifier(X_train, y_train, t, params, scores=["accuracy"])[0]

print(DTree)
DTree = DecisionTreeClassifier(max_depth=9, min_samples_leaf=4, criterion="entropy")
DTree.fit(X_train, y_train)

joblib.dump(DTree, "dtree.pkl")

# predict genres of test data
accuracy = DTree.score(X_test,y_test)
y_pred = DTree.predict(X_test)
training = DTree.score(X_train, y_train)
testing  = DTree.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred))

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=9,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Training Accuracy: 0.809896999552172
Testing  Accuracy: 0.7936226749335695
[[ 35  20   4   0   2   4]
 [ 17  58   1   0   4   1]
 [ 29   6 123   4   5  13]
 [  7   2  11 528  10   1]
 [ 18   5  20   1 108  17]
 [ 11   4  10   0   6  44]]
              precision    recall  f1-score   support

   classical       0.99      0.94      0.97       559
     country       0.73      0.68      0.70       180
   edm_dance       0.80      0.64      0.71       169
      hiphop       0.61      0.72      0.66        81
         rnb       0.30      0.54      0.38        65
        rock       0.55      0.59      0.57        75

   micro avg       0.79      0.7

In [32]:
importance = DTree.feature_importances_
d_feats      = list(X)
todos      = [(importance[i],d_feats[i]) for i in range(len(d_feats))]
todos.sort(key=lambda x: x[0],reverse=True)

In [33]:
todos

[(0.27499970018210484, 'Loudness'),
 (0.1277064376002267, 'Instrumentalness'),
 (0.12529005304112933, 'Is_Exp'),
 (0.12165234809841831, 'Speechiness'),
 (0.1104761043183542, 'Danceability'),
 (0.09010941094087152, 'Acousticness'),
 (0.05133060154311118, 'Energy'),
 (0.03833493054448468, 'Valence'),
 (0.03141099933177038, 'Tempo'),
 (0.013126277581783188, 'Liveness'),
 (0.012657578977465072, 'Mode'),
 (0.0024512882243385805, 'Key'),
 (0.00045426961594193676, 'Time_Signature')]

In [34]:
# compare to stratified dummy classifier
dummy = DummyClassifier(strategy='stratified')

joblib.dump(dummy, "dummy.pkl")

dummy.fit(X_train,y_train)
dummy_accuracy = dummy.score(X_test,y_test)
print( "Dummy classifier accuracy is" )
print(dummy_accuracy)

Dummy classifier accuracy is
0.17803365810451727
