# Spotify Hit Songs Classification

## Business Problem

## Data Understanding

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the tracks CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/dataset-of-10s.csv')
data.head(n=5)

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target
0,Wild Things,Alessia Cara,spotify:track:2ZyuwVvV6Z3XJaXIFbspeE,0.741,0.626,1,-4.826,0,0.0886,0.02,0.0,0.0828,0.706,108.029,188493,4,41.18681,10,1
1,Surfboard,Esquivel!,spotify:track:61APOtq25SCMuK0V5w2Kgp,0.447,0.247,5,-14.661,0,0.0346,0.871,0.814,0.0946,0.25,155.489,176880,3,33.18083,9,0
2,Love Someone,Lukas Graham,spotify:track:2JqnpexlO9dmvjUMCaLCLJ,0.55,0.415,9,-6.557,0,0.052,0.161,0.0,0.108,0.274,172.065,205463,4,44.89147,9,1
3,Music To My Ears (feat. Tory Lanez),Keys N Krates,spotify:track:0cjfLhk8WJ3etPTCseKXtk,0.502,0.648,0,-5.698,0,0.0527,0.00513,0.0,0.204,0.291,91.837,193043,4,29.52521,7,0
4,Juju On That Beat (TZ Anthem),Zay Hilfigerrr & Zayion McCall,spotify:track:1lItf5ZXJc1by9SbPeljFd,0.807,0.887,1,-3.892,1,0.275,0.00381,0.0,0.391,0.78,160.517,144244,4,24.99199,8,1


In [3]:
# Count missing values per column (summing the boolean mask down the rows).
data.isna().sum(axis=0)

track               0
artist              0
uri                 0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
chorus_hit          0
sections            0
target              0
dtype: int64

In [4]:
# Absolute number of rows per class label, to check how balanced the target is.
data['target'].value_counts(normalize=False)

1    3199
0    3199
Name: target, dtype: int64

## Data Preparation

In [5]:
# Label vector: the 'target' column.
y = data['target']
# Feature matrix: everything except the label and the identifier columns
# (uri, artist, track).
X = data.drop(columns=['target', 'uri', 'artist', 'track'])
X.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections
0,0.741,0.626,1,-4.826,0,0.0886,0.02,0.0,0.0828,0.706,108.029,188493,4,41.18681,10
1,0.447,0.247,5,-14.661,0,0.0346,0.871,0.814,0.0946,0.25,155.489,176880,3,33.18083,9
2,0.55,0.415,9,-6.557,0,0.052,0.161,0.0,0.108,0.274,172.065,205463,4,44.89147,9
3,0.502,0.648,0,-5.698,0,0.0527,0.00513,0.0,0.204,0.291,91.837,193043,4,29.52521,7
4,0.807,0.887,1,-3.892,1,0.275,0.00381,0.0,0.391,0.78,160.517,144244,4,24.99199,8


- Creating a baseline model (logistic regression)

In [6]:
# Min-max scale every feature column of X into the [0, 1] range.
# NOTE(review): the minima/maxima are computed over all rows of X before the
# train/test split is made, so the held-out rows influence the scaling. Consider
# splitting first and deriving the statistics from the training rows only.
col_min = X.min()
col_range = X.max() - col_min
# A constant column has zero range; dividing by it would yield NaN (0/0) for
# every row. Use a divisor of 1 instead so such a column maps to all zeros.
col_range[col_range == 0] = 1
# DataFrame/Series arithmetic aligns on column labels, so this scales each
# column by its own minimum and range without a per-column apply.
X = (X - col_min) / col_range
X.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections
0,0.73879,0.626533,0.090909,0.899432,0.0,0.070809,0.02008,0.0,0.068476,0.723361,0.400098,0.09308,0.8,0.193225,0.093023
1,0.418807,0.247058,0.454545,0.687954,0.0,0.012962,0.874498,0.81809,0.0807,0.256148,0.676658,0.086266,0.6,0.155665,0.081395
2,0.53091,0.415269,0.818182,0.862211,0.0,0.031601,0.161647,0.0,0.094582,0.280738,0.773251,0.103036,0.8,0.210605,0.081395
3,0.478668,0.64856,0.0,0.880682,0.0,0.032351,0.005151,0.0,0.194033,0.298156,0.305743,0.095749,0.8,0.138515,0.05814
4,0.810623,0.88786,0.090909,0.919516,1.0,0.270487,0.003825,0.0,0.387755,0.79918,0.705958,0.067117,0.8,0.117248,0.069767


In [11]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [12]:
# Baseline classifier: logistic regression using the liblinear solver.
# C is scikit-learn's inverse regularization strength, so C=1e12 leaves the
# model essentially unregularized.
# NOTE(review): fit_intercept=False omits the bias term; confirm this is
# intended for features that were min-max scaled into [0, 1].
logreg = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
)

# Fit on the training split only; the fitted estimator is the cell's output.
logreg.fit(X_train, y_train)

LogisticRegression(C=1000000000000.0, fit_intercept=False, solver='liblinear')

In [13]:
# Predict class labels for the training rows and for the held-out test rows.
y_hat_train, y_hat_test = (
    logreg.predict(X_train),
    logreg.predict(X_test),
)

In [15]:
# Training-set correctness tally. Labels and predictions are both 0/1, so the
# absolute difference is 0 for a correct prediction and 1 for a wrong one.
train_residuals = np.abs(y_train - y_hat_train)
# Print the raw counts, a blank line, then the same tally as proportions.
print(
    pd.Series(train_residuals, name="Residuals (counts)").value_counts(),
    pd.Series(train_residuals, name="Residuals (proportions)").value_counts(normalize=True),
    sep="\n\n",
)

0    3787
1    1011
Name: Residuals (counts), dtype: int64

0    0.789287
1    0.210713
Name: Residuals (proportions), dtype: float64


In [16]:
# Test-set correctness tally. Labels and predictions are both 0/1, so the
# absolute difference is 0 for a correct prediction and 1 for a wrong one.
test_residuals = np.abs(y_test - y_hat_test)
# Raw counts first, followed by a blank separator line ...
print(pd.Series(test_residuals, name="Residuals (counts)").value_counts(), end="\n\n")
# ... then the same tally expressed as proportions of the test set.
print(pd.Series(test_residuals, name="Residuals (proportions)").value_counts(normalize=True))

0    1250
1     350
Name: Residuals (counts), dtype: int64

0    0.78125
1    0.21875
Name: Residuals (proportions), dtype: float64


In [17]:
# NOTE(review): this interpretation is written as a bare string expression, so
# the notebook echoes its repr as the cell's output. A markdown cell would be
# the usual place for narrative text like this.
""" On the training set, the model had an accuracy of about 79% and on the test set,
the model had an accuracy of about 78%. These numbers are similar enough that there likely isn't overfitting."""

" On the training set, the model had an accuracy of about 79% and on the test set,\nthe model had an accuracy of about 78%. These numbers are similar enough that there likely isn't overfitting."

## Modeling

In [18]:
# Next step: try to improve on the baseline model with a random forest

## Evaluation

## Deployment