In [17]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import pylab
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# Quickly plot ROC Curve and calculate AUC score for several algorithms to determine the best model

df = pd.read_csv('Spotify Data/decaded-combined.csv')

data2 = df.copy()
data2.loudness = (data2.loudness +50)/max(data2.loudness +50)
data2.duration_ms = (data2.duration_ms / 1000)/max(data2.duration_ms / 1000)
data2.key = (data2.key)/max(data2.key)
data2.tempo = (data2.tempo)/max(data2.tempo)
data2.time_signature = (data2.time_signature)/max(data2.time_signature)
data2.chorus_hit = (data2.chorus_hit)/max(data2.chorus_hit)
data2.sections = (data2.sections)/max(data2.sections)

# Establishing X and y
y = data2['target']
X = data2.drop(columns = ['uri','track', 'artist', 'target'])


# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

X2_train, X2_test,  y2_train,y2_test = train_test_split(X_train,y_train, test_size = 0.25, random_state = 42)


y.shape
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41106 entries, 0 to 41105
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      41106 non-null  float64
 1   energy            41106 non-null  float64
 2   key               41106 non-null  float64
 3   loudness          41106 non-null  float64
 4   mode              41106 non-null  int64  
 5   speechiness       41106 non-null  float64
 6   acousticness      41106 non-null  float64
 7   instrumentalness  41106 non-null  float64
 8   liveness          41106 non-null  float64
 9   valence           41106 non-null  float64
 10  tempo             41106 non-null  float64
 11  duration_ms       41106 non-null  float64
 12  time_signature    41106 non-null  float64
 13  chorus_hit        41106 non-null  float64
 14  sections          41106 non-null  float64
dtypes: float64(14), int64(1)
memory usage: 4.7 MB


In [18]:

# KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X2_train, y2_train)
knn_ypred = knn.predict(X2_test)
knn_proba = knn.predict_proba(X2_test)[:,1]
fpr_knn, tpr_knn, _ = roc_curve(y2_test, knn_proba)
knn_auc = auc(fpr_knn, tpr_knn)
knn_f1 = f1_score(y2_test, knn_ypred)
knn_prec = precision_score(y2_test, knn_ypred)
knn_recall = recall_score(y2_test, knn_ypred)
knn_accuracy = accuracy_score(y2_test, knn_ypred)

#LR
lr = LogisticRegression(C = 0.5 )
lr.fit(X2_train,y2_train)
lr_ypred = lr.predict(X2_test)
lr_proba = lr.predict_proba(X2_test)[:,1]
fpr_lr, tpr_lr, _ = roc_curve(y2_test, lr_proba)
lr_auc = auc(fpr_lr, tpr_lr)
lr_f1 = f1_score(y2_test, lr_ypred)
lr_prec = precision_score(y2_test, lr_ypred)
lr_recall = recall_score(y2_test, lr_ypred)
lr_accuracy = accuracy_score(y2_test, lr_ypred)

# Decision Tree
tree = DecisionTreeClassifier()
tree.fit(X2_train, y2_train)
tree_ypred = tree.predict(X2_test)
tree_proba = tree.predict_proba(X2_test)[:,1]
fpr_tree, tpr_tree, _ = roc_curve(y2_test, tree_proba)
tree_auc = auc(fpr_tree, tpr_tree)
tree_f1 = f1_score(y2_test, tree_ypred)
tree_prec = precision_score(y2_test, tree_ypred)
tree_recall = recall_score(y2_test, tree_ypred)
tree_accuracy = accuracy_score(y2_test, tree_ypred)


# Random Forest
forest = RandomForestClassifier(n_estimators=100, max_features=10)
forest.fit(X2_train, y2_train)
forest_ypred = forest.predict(X2_test)
forest_proba = forest.predict_proba(X2_test)[:,1]
fpr_rf, tpr_rf, _ = roc_curve(y2_test, forest_proba)
forest_auc = auc(fpr_rf, tpr_rf)
forest_f1 = f1_score(y2_test, forest_ypred)
forest_prec = precision_score(y2_test, forest_ypred)
forest_recall = recall_score(y2_test, forest_ypred)
forest_accuracy = accuracy_score(y2_test, forest_ypred)

# Gradient Boosting
grad = GradientBoostingClassifier()
grad.fit(X2_train, y2_train)
grad_ypred = grad.predict(X2_test)
grad_proba = grad.predict_proba(X2_test)[:,1]
fpr_gb, tpr_gb, _ = roc_curve(y2_test, grad_proba)
grad_auc = auc(fpr_gb, tpr_gb)
grad_f1 = f1_score(y2_test, grad_ypred)
grad_prec = precision_score(y2_test, grad_ypred)
grad_recall = recall_score(y2_test, grad_ypred)
grad_accuracy = accuracy_score(y2_test, grad_ypred)


In [19]:


print('Precision SCORES',
'\n',
'KNN:',knn_prec,
'\n',
'LR:', lr_prec,
'\n',
'TREE:',tree_prec,
'\n',
'FOREST:',forest_prec,
'\n',
'GRAD:',grad_prec)

print('Recall SCORES',
'\n',
'KNN:',knn_recall,
'\n',
'LR:', lr_recall,
'\n',
'TREE:',tree_recall,
'\n',
'FOREST:',forest_recall,
'\n',
'GRAD:',grad_recall)


print('F1 SCORES',
'\n',
'KNN:',knn_f1,
'\n',
'LR:', lr_f1,
'\n',
'TREE:',tree_f1,
'\n',
'FOREST:',forest_f1,
'\n',
'GRAD:',grad_f1)

print('ACCURACY SCORES',
'\n',
'KNN:',knn_accuracy,
'\n',
'LR:', lr_accuracy,
'\n',
'TREE:',tree_accuracy,
'\n',
'FOREST:',forest_accuracy,
'\n',
'GRAD:',grad_accuracy)

print('AUC SCORES',
'\n',
'KNN:',knn_auc,
'\n',
'LR:',lr_auc,
'\n',
'TREE:',tree_auc,
'\n',
'FOREST:',forest_auc,
'\n',
'GRAD:',grad_auc)


# Gradient Boosting seems to work significantly better, so we'll tune the model using Gradient Boosting going forward

pylab.figure(figsize=(10,10))
pylab.plot(fpr_knn, tpr_knn, label='knn')
pylab.plot(fpr_lr, tpr_lr, label='lr')
pylab.plot(fpr_tree, tpr_tree, label='decision tree')
pylab.plot(fpr_rf, tpr_rf, label='random forest')
pylab.plot(fpr_gb, tpr_gb, label='gradient boosting')
pylab.plot([0,1],[0,1], linestyle='dashed')
pylab.xlabel('FPR', labelpad=10)
pylab.ylabel('TPR',rotation=0, labelpad=15)
pylab.legend(loc='upper left')
pylab.title('ROC Curves')
pylab.savefig('all_aucroc.png')




Precision SCORES 
 KNN: 0.7000635458589282 
 LR: 0.7007268951194184 
 TREE: 0.7053483807654564 
 FOREST: 0.7658623771224308 
 GRAD: 0.7484049340706083
Recall SCORES 
 KNN: 0.798502053636144 
 LR: 0.8151727470403479 
 TREE: 0.6946122251751631 
 FOREST: 0.8282193766610292 
 GRAD: 0.8502053636143996
F1 SCORES 
 KNN: 0.7460496613995485 
 LR: 0.7536296627205717 
 TREE: 0.699939135727328 
 FOREST: 0.795821242019733 
 GRAD: 0.7960637936884968
ACCURACY SCORES 
 KNN: 0.7263106678019705 
 LR: 0.731662814742732 
 TREE: 0.7001581316141589 
 FOREST: 0.7860357620727405 
 GRAD: 0.780683615131979
AUC SCORES 
 KNN: 0.780401325852164 
 LR: 0.8056421044357759 
 TREE: 0.7004152254951319 
 FOREST: 0.8614102195165808 
 GRAD: 0.8576360556880637
