In [27]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from sklearn.multiclass import OneVsOneClassifier
# Preprocessing and visualization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Metric functions
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import accuracy_score

# Models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors    import KNeighborsClassifier
from sklearn.dummy        import DummyClassifier
from sklearn.tree         import DecisionTreeClassifier

# Model selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Ignore warnings if they happen, we don't care (that much)
import warnings; warnings.simplefilter('ignore')

# Cross-validation takes a minute, so we will save these models
from sklearn.externals import joblib
from sklearn.base import clone

In [None]:
df     = pd.read_csv("data/lyrical_genius.csv")

# Remove pop songs, they are all over the place and hurt classification
df = df[(df["Genre"] != "pop")]

# Remove some irrelevant columns
df = df.drop(columns="Unnamed: 0")
df = df.drop(columns="Unnamed: 0.1")

# We go ahead and remove ALL duplicates
df = df.drop_duplicates(subset=["Name","Artist"],keep=False)

# Give each genre a new cool color
genres = df["Genre"].unique()
unique_colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080'
]
colors = {}
i = 0
for genre in genres:
    colors[genre] = unique_colors[i]
    i+=1


# Upsample the amount of occurances of values that don't appear very often
# df = df.append(df[((df["Genre"] != "country") & (df["Genre"] != "edm_dance"))])
# extras    = df.copy()
# counts    = df["Genre"].value_counts()
# max_count = max(df["Genre"].value_counts())
# for genre in genres:
#     needed = max_count - counts[genre]
#     extras = extras.append(df[df["Genre"]==genre].sample(n=needed,replace=True))
# df = extras
counts    = df["Genre"].value_counts()
colors_list = [colors[genre] for genre in genres]

In [3]:
# Split data into data frames of the right type
x_cols    = ["Is_Exp","Danceability","Energy","Key","Loudness","Mode","Speechiness","Acousticness","Instrumentalness","Liveness","Valence","Tempo","Time_Signature"]
y_cols    = ["Genre"]
meta_cols = ["Id","Popularity","Name","Artist"]

X,y,meta = df[x_cols],df[y_cols].iloc[:,0],df[meta_cols]

# Split 
X_trainP, X_testP, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=1234, stratify=y)

extras    = X_trainP.copy()
counts    = y_train.value_counts()
min_count = min(counts)
extra_y   = pd.Series()
for genre in genres:
    for i in range(min_count):
        extra_y = extra_y.append(pd.Series([genre]))
    extras = extras.append(X_trainP[y_train==genre].sample(n=needed,replace=False))
X_trainP = extras
y_train = y_train.append(extra_y)

In [4]:
X_trainP.shape

(13398, 13)

In [5]:
# Scale the data and fit run 2D PCA on it
scaler   = StandardScaler()
scaler.fit(X_trainP)
X_train = scaler.transform(X_trainP)
X_test  = scaler.transform(X_testP)

In [21]:
joblib.dump(scaler,"scaler.pkl")

['scaler.pkl']

In [7]:
clf = LogisticRegressionCV(cv=5, random_state=1234, multi_class="multinomial")
clf.fit(X_train,y_train)

print(clf)

joblib.dump(clf, "logistic.pkl")
y_pred = clf.predict(X_test)
training = clf.score(X_train, y_train)
testing  = clf.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(genres)
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred)) 

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='multinomial', n_jobs=None, penalty='l2',
           random_state=1234, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)
Training Accuracy: 0.7151067323481116
Testing  Accuracy: 0.8060230292294066
['rnb' 'hiphop' 'country' 'classical' 'edm_dance' 'rock']
[[ 33  16   8   0   4   4]
 [ 18  57   3   0   3   0]
 [ 18   1 128   2   2  29]
 [  2   1  16 528   9   3]
 [ 11   6  16   1 114  21]
 [  3   1  11   0  10  50]]
              precision    recall  f1-score   support

   classical       0.99      0.94      0.97       559
     country       0.70      0.71      0.71       180
   edm_dance       0.80      0.67      0.73       169
      hiphop       0.70      0.70      0.70        81
         rnb       0.39      0.51      0.44        65
        rock       0.47      0.67      0.55        75

   micro avg       

In [8]:
missed = X_test[(y_test != y_pred)]
ymiss  = y_test[(y_test != y_pred)]
missP  = clf.predict_proba(missed)

In [9]:
maxclass = missP.argmax(axis=1)
maxprob  = missP.max(axis=1)
biggest  = maxprob.argmax()

In [10]:
print(ymiss.iloc[biggest]) # Actual
print(max(max(clf.predict_proba(missed[biggest].reshape(1,-1)))))
print(clf.predict(missed[biggest].reshape(1,-1)))

edm_dance
0.9811733143375763
['classical']


In [11]:
scaler.inverse_transform(missed[biggest])

array([ 0.00000e+00,  4.02000e-01,  2.23000e-01,  0.00000e+00,
       -1.75950e+01,  0.00000e+00,  3.56000e-02,  9.89000e-01,
        9.63000e-01,  1.20000e-01,  3.85000e-02,  1.27059e+02,
        3.00000e+00])

In [12]:
for i in range(len(clf.classes_)):
    print(clf.classes_[i])
    todos      = [(clf.coef_[i][j],list(X)[j]) for j in range(len(list(X)))]
    todos.sort(key=lambda x: abs(x[0]))
    for i in range(len(todos)):
        print(todos[i])

classical
(-0.04055103473933157, 'Liveness')
(-0.05562804199168221, 'Tempo')
(-0.09802944762858233, 'Mode')
(0.19993213862397077, 'Speechiness')
(-0.20023035545441323, 'Key')
(-0.26327756993277496, 'Time_Signature')
(0.28587503489884003, 'Valence')
(0.3953124608225859, 'Energy')
(-0.5538082699210936, 'Is_Exp')
(1.0933808491101127, 'Acousticness')
(1.4590262442458395, 'Instrumentalness')
(-1.5247400605918369, 'Danceability')
(-2.5265677186344293, 'Loudness')
country
(-0.002337469473319377, 'Liveness')
(-0.09048753748236983, 'Time_Signature')
(0.12304922400643863, 'Tempo')
(0.1324299540450898, 'Key')
(0.23162379173869044, 'Acousticness')
(0.38532165177046096, 'Loudness')
(-0.41925794665656974, 'Instrumentalness')
(-0.6417256793059505, 'Danceability')
(-0.6439301224052372, 'Energy')
(0.6793522575646934, 'Mode')
(0.893840685133813, 'Valence')
(-1.0374038284316522, 'Speechiness')
(-1.1932318050404125, 'Is_Exp')
edm_dance
(-0.0256052301072368, 'Speechiness')
(-0.04590077993697691, 'Key')
(0.

In [13]:
def best_classifier(X, y, t_clf, params,scores=["accuracy"]) :
    """
    Sweeps different settings for the hyperparameters of a Decision Tree classifier,
    calculating the k-fold CV performance for each setting and metric,
    then selects the hyperparameters that maximize the average performance for each metric.
    """
    best_parms = []
    
    for score in scores:
        base_clf = clone(t_clf)
        print("Scoring for {}".format(score))
        clf   = GridSearchCV(base_clf, params, cv=5,scoring=score)
        
        clf.fit(X,y)
        print("Done fitting")
        
        mts   = clf.cv_results_["mean_test_score"]
        parms = clf.cv_results_["params"]
        
        for mt, parm in zip(mts,parms):
            print("Score: {:.4f}; Parameters {}".format(mt, parm))
        
        best_parms.append(clf.best_estimator_)
    return best_parms

In [14]:
weights = ["uniform"]
params  = {
    "n_neighbors": range(5,22,2)
}
knn = KNeighborsClassifier()
# clf = best_classifier(X_train,y_train,knn,params)[0]

print(clf)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)

joblib.dump(clf, "knn.pkl")

y_pred = clf.predict(X_test)
training = clf.score(X_train, y_train)
testing  = clf.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred)) 

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='multinomial', n_jobs=None, penalty='l2',
           random_state=1234, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)
Training Accuracy: 1.0
Testing  Accuracy: 0.7723649247121346
[[ 18  18  15   1   9   4]
 [ 17  49   1   0  12   2]
 [ 12   1 130  11  12  14]
 [  3   1  10 533   9   3]
 [  8   5  22   4 119  11]
 [  7   3  25   3  14  23]]
              precision    recall  f1-score   support

   classical       0.97      0.95      0.96       559
     country       0.64      0.72      0.68       180
   edm_dance       0.68      0.70      0.69       169
      hiphop       0.64      0.60      0.62        81
         rnb       0.28      0.28      0.28        65
        rock       0.40      0.31      0.35        75

   micro avg       0.77      0.77      0.77      1129
   macro avg       0.60      0.59     

In [15]:
params = {
    "max_depth": range(5,12),
    "min_samples_leaf": range(1,10),
    "criterion": ["entropy","gini"]
}
t = DecisionTreeClassifier()

# DTree = best_classifier(X_train, y_train, t, params, scores=["accuracy"])[0]


DTree = DecisionTreeClassifier(max_depth=9, min_samples_leaf=4, criterion="gini")
DTree.fit(X_train, y_train)
print(DTree)
joblib.dump(DTree, "dtree.pkl")

# predict genres of test data
accuracy = DTree.score(X_test,y_test)
y_pred = DTree.predict(X_test)
training = DTree.score(X_train, y_train)
testing  = DTree.score(X_test,  y_test)
print("Training Accuracy: {}".format(training))
print("Testing  Accuracy: {}".format(testing))
print(confusion_matrix(y_test,y_pred,labels=genres))
print(classification_report(y_test, y_pred))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Training Accuracy: 0.822585460516495
Testing  Accuracy: 0.7821080602302923
[[ 35  14   9   0   4   3]
 [ 25  49   4   0   3   0]
 [ 28   2 118   5   3  24]
 [  7   3   6 532   4   7]
 [ 16   5  13   1 106  28]
 [  8   2  11   1  10  43]]
              precision    recall  f1-score   support

   classical       0.99      0.95      0.97       559
     country       0.73      0.66      0.69       180
   edm_dance       0.82      0.63      0.71       169
      hiphop       0.65      0.60      0.63        81
         rnb       0.29      0.54      0.38        65
        rock       0.41      0.57      0.48        75

   micro avg       0.78      0.78  

In [16]:
importance = DTree.feature_importances_
d_feats      = list(X)
todos      = [(importance[i],d_feats[i]) for i in range(len(d_feats))]
todos.sort(key=lambda x: x[0],reverse=True)

In [17]:
todos

[(0.2555225603233197, 'Loudness'),
 (0.12869858397543027, 'Speechiness'),
 (0.11931341838824691, 'Instrumentalness'),
 (0.11616843759700468, 'Is_Exp'),
 (0.10990970360365304, 'Danceability'),
 (0.09665024570718239, 'Acousticness'),
 (0.047585882456287656, 'Energy'),
 (0.04054549736718052, 'Tempo'),
 (0.033004374252821024, 'Valence'),
 (0.02234237504738629, 'Liveness'),
 (0.020973164048562525, 'Mode'),
 (0.009285757232924822, 'Key'),
 (0.0, 'Time_Signature')]

In [18]:
total = 0
for x in todos:
    total += x[0]
total

0.9999999999999999

In [19]:
len(todos)

13

In [22]:
# compare to stratified dummy classifier
dummy = DummyClassifier(strategy='stratified')



dummy.fit(X_train,y_train)

joblib.dump(dummy, "dummy.pkl")
dummy_accuracy = dummy.score(X_test,y_test)
print( "Dummy classifier accuracy is" )
print(dummy_accuracy)

Dummy classifier accuracy is
0.1629760850310009


In [23]:
def cv_performance(clf, X, y, kf, metrics=["accuracy"]) :
    """
    Splits the data, X and y, into k-folds and runs k-fold cross-validation.
    Trains classifier on k-1 folds and tests on the remaining fold.
    Calculates the k-fold cross-validation accuracy for classifier
    by averaging the performance across folds.
    
    Adapted for HW6
    """
    y=np.array(y.tolist())
    k = kf.get_n_splits(X, y)
    m = len(metrics)
    scores = np.empty((m, k))
    m=0
    for k, (train, test) in enumerate(kf.split(X, y)) :
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        clf.fit(X_train, y_train)
        # use Decision_tree_classifier.predict to make predictions
        y_pred = clf.predict(X_test)
        score = clf.score(X_test, y_test)
        scores[m,k] = score
            
    return scores.mean(axis=1) # average across columns

In [24]:
def select_params(X, y, kf, metrics=["accuracy"]) :
    """
    Sweeps different settings for the hyperparameters of a Decision Tree classifier,
    calculating the k-fold CV performance for each setting and metric,
    then selects the hyperparameters that maximize the average performance for each metric.
    
    Adapted from HW6
    """

    # part 4b: for each metric, select optimal hyperparameters using cross-validation
    
    # create grid of hyperparameters
    # hint: use a small 2x2 grid of hyperparameters for debugging
    depth_range = range(5,21)
    min_samples_range = range(1,15)
    scores = np.empty((len(metrics), len(depth_range), len(min_samples_range)))

    # compute CV scores using cv_performance(...)
    for depth_ind, max_depth in enumerate(depth_range):
        for samples_ind, min_samples in enumerate(min_samples_range):
            clf = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, min_samples_leaf=min_samples) 
            # compute CV scores using cv_performance(...)
            score = cv_performance(clf, X, y, kf, metrics)
            scores[:,depth_ind,samples_ind] = score
    
    # get best hyperparameters
    best_params = []
    for met_ind, metric in enumerate(scores):
        print (metrics[met_ind])
        print ("maximum score is", metric.max())
        depth_ind, samples_ind = np.unravel_index(metric.argmax(), metric.shape)
        params = (depth_range[depth_ind], min_samples_range[samples_ind])
        print( "max_depth, min_samples=", params)
        best_params.append(params)
    
    return best_params

In [26]:
from sklearn.model_selection import StratifiedKFold

# optimize parameters with cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
opt_max_depth, opt_min_samples = select_params(X_train, y_train, skf)[0]

# train classifier
DTree = DecisionTreeClassifier(criterion="entropy", max_depth=opt_max_depth, min_samples_leaf=opt_min_samples)
DTree.fit(X_train,y_train)

# predict genres of test data
accuracy = DTree.score(X_test,y_test)

print("Test accuracy of the DTree is")
print(accuracy)
print("=============================")

NameError: name 'tree' is not defined