In [45]:
%matplotlib inline
 
import numpy as np
import pandas as pd
import sklearn as sk
 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, make_scorer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn import metrics 
from sklearn import neighbors
from sklearn import tree

We begin with reading in our data and doing some initial cleaning.

In [26]:
# Read in data
data = pd.read_csv('data.csv')

# Filter out new data
data_current = data[data.year >= 2010]

# Remove duplicate values based on id
data_current = data_current.drop_duplicates(subset=['id'],keep='first',inplace=False)
df.shape[0] # shape matches what we have in R; we're good here.

# Creating a new popularity dummy variable
def groups(series):
    if 75 <= series <= 100:
        return 3
    elif 50 <= series < 75:
        return 2
    elif 25 <= series < 50:
        return 1
    elif series < 25:
        return 0
data_current['popDummy'] = data_current['popularity'].apply(groups)

# Converting duration_ms to seconds
data_current['duration_s'] = data_current['duration_ms']*.001

# Recoding year variable
def groups(series):
    if series == 2010:
        return 0
    elif series == 2011:
        return 1
    elif series == 2012:
        return 2
    elif series == 2013:
        return 3
    elif series == 2014:
        return 4
    elif series == 2015:
        return 5
    elif series == 2016:
        return 6
    elif series == 2017:
        return 7
    elif series == 2018:
        return 8
    elif series == 2019:
        return 9
    elif series == 2020:
        return 10
    elif series == 2021:
        return 11
data_current['year_factor'] = data_current['year'].apply(groups)

data_current.head()

# Subsetting our data to use for model--let's call this 'spotify'
spotify = data_current[['popDummy','instrumentalness','explicit','loudness','duration_s','liveness','year_factor']]

# We're going to only use data from 2010-2021, and predict 2021.
spotify_new = spotify[spotify.year_factor == 11]
spotify = spotify[spotify.year_factor < 11]

# Just a quick look-through to make sure the data is good
print(spotify.head())
print(list(spotify.columns))
print(spotify.info())

      popDummy  instrumentalness  explicit  loudness  duration_s  liveness  \
9087         1          0.000000         0   -17.796     187.333    0.9040   
9091         1          0.000005         0   -18.168     236.800    0.6830   
9111         0          0.000022         0   -14.118     313.093    0.7980   
9117         0          0.000162         0   -15.533     295.093    0.0986   
9119         0          0.000015         0   -14.087     183.440    0.1470   

      year_factor  
9087            8  
9091            8  
9111           10  
9117           10  
9119           10  
['popDummy', 'instrumentalness', 'explicit', 'loudness', 'duration_s', 'liveness', 'year_factor']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 25318 entries, 9087 to 174188
Data columns (total 7 columns):
popDummy            25318 non-null int64
instrumentalness    25318 non-null float64
explicit            25318 non-null int64
loudness            25318 non-null float64
duration_s          25318 non-nul

Things to consider:
    - change instrumentalness variable to factor?
    - ensure explicit, key, mode, and year are factors

MODEL SELECTION

In [27]:
train, test = train_test_split(spotify, test_size = 0.2, random_state = 123)

# Creating dataframes for the training/test inputs/target
train_X = train.drop(['popDummy'], axis=1)
train_Y = train["popDummy"]
test_X = test.drop(['popDummy'], axis=1)
test_Y = test["popDummy"]

test_Y.size
train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20254 entries, 75235 to 142971
Data columns (total 6 columns):
instrumentalness    20254 non-null float64
explicit            20254 non-null int64
loudness            20254 non-null float64
duration_s          20254 non-null float64
liveness            20254 non-null float64
year_factor         20254 non-null int64
dtypes: float64(4), int64(2)
memory usage: 1.1 MB


1. Multinomial logisitic regression

In [48]:
# Declare model
lr_model = linear_model.LogisticRegression()

# Fitting model on training data
lr_model.fit(train_X,train_Y)

# Printing our our predicted values
Y_hat = lr_model.predict(test_X)
with np.printoptions(threshold=np.inf):
    print(Y_hat)

[0 2 0 2 0 0 0 0 2 0 0 0 2 2 0 0 2 0 0 2 0 2 2 0 0 0 0 0 2 0 2 0 2 0 2 2 0
 0 2 2 0 0 3 0 0 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0
 0 2 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 2 0 0 2 2 0 2 0 0 0 0 0 2 2 0 0
 0 2 0 0 0 2 0 0 0 3 0 0 0 0 2 2 0 2 0 2 0 0 0 0 0 0 2 2 0 2 0 0 0 0 0 0 0
 2 2 2 0 2 0 0 0 0 0 2 0 2 0 0 0 0 2 0 2 0 0 2 2 0 0 2 0 0 2 2 2 0 3 0 2 3
 0 0 2 0 0 0 2 2 0 0 0 2 2 0 0 0 2 2 2 0 0 0 0 0 2 0 0 2 0 0 2 0 2 0 2 0 2
 2 2 0 0 0 0 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 2 2 0 0
 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 2 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 2 0 2 2 0 0 0 0 2 0 2 0 2
 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 0 0 0 0 0 2 2 2 0 0 0 0 2
 2 2 0 0 2 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0 0 0 0 0 0 0 2 0 0 2 0 0 2 0 0 2
 0 0 0 3 2 2 0 2 2 0 3 0 0 2 0 3 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 2 0 2 2
 0 2 0 0 0 0 0 2 0 2 2 0 0 0 0 0 0 0 2 0 2 0 2 0 0 2 2 2 0 0 0 2 2 0 0 2 2
 2 0 2 0 3 3 0 0 0 0 0 2 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [49]:
# confusion matrix
conf_matrix = sk.metrics.confusion_matrix(test_Y,Y_hat)
print(conf_matrix)

# classification report
print(classification_report(Y_hat,test_Y))
# warning; most likely due to all predictions being 0

# accuracy measure
print(metrics.accuracy_score(Y_hat,test_Y))

[[2455   11  330   29]
 [ 104    0   63    3]
 [ 834    1  933   22]
 [ 151    0  108   20]]
              precision    recall  f1-score   support

           0       0.87      0.69      0.77      3544
           1       0.00      0.00      0.00        12
           2       0.52      0.65      0.58      1434
           3       0.07      0.27      0.11        74

    accuracy                           0.67      5064
   macro avg       0.37      0.40      0.37      5064
weighted avg       0.76      0.67      0.71      5064

0.6729857819905213


2. kNN 

In [50]:
# First find the optimal # of neighbors
scores = []
for i in range(1, 30):
    knn_model = neighbors.KNeighborsClassifier(n_neighbors = i)
    knn_model.fit(train_X, train_Y)
    y_model = knn_model.predict(test_X)
    scores.append(metrics.accuracy_score(test_Y, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.6773301737756714 9


In [51]:
k = scores.index(max_val)

# Create model object
knn_model = neighbors.KNeighborsClassifier(k)

# Train model
knn_model.fit(train_X, train_Y)

# Test model
Y_hat = knn_model.predict(test_X)

# Printing our our predicted values
with np.printoptions(threshold=np.inf):
    print(Y_hat)

[0 0 0 2 2 2 0 2 2 0 0 0 0 0 0 2 2 0 0 2 0 2 2 2 0 2 0 2 0 2 2 0 0 0 2 2 2
 0 2 2 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 2 2 0 0 0 0 0 2 0 2 0 2 0 0 2 0 2 0
 3 2 0 0 0 2 0 0 0 0 2 0 2 0 0 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 2 0 0
 0 2 0 0 0 0 0 0 2 0 0 2 0 0 0 2 0 0 2 2 2 0 0 0 0 0 2 2 0 2 0 0 0 0 0 0 0
 2 0 2 2 2 2 0 2 0 0 2 0 3 0 0 0 0 2 2 2 0 0 2 2 0 2 2 0 0 2 0 2 0 0 0 0 0
 2 2 2 0 0 0 2 2 0 2 2 0 2 0 0 0 0 2 2 0 0 0 0 2 2 2 0 0 0 0 2 0 2 2 0 0 2
 2 2 2 0 2 0 0 0 0 2 2 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0 2 0 2 2 0 0 0 2
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 2 0 0 0 0 2 2 2
 0 0 0 0 0 2 0 2 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 0 0 2 2 2 0 0 2 2 0 2 0 0
 0 0 2 0 1 0 2 0 0 0 0 0 2 2 2 2 0 0 2 0 0 2 2 0 2 0 0 2 0 2 0 2 0 2 0 2 2
 0 0 2 2 2 0 2 2 0 0 0 2 0 2 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 2 0 2
 0 2 0 0 0 2 0 0 2 0 0 0 0 3 0 2 0 0 0 0 0 0 3 0 0 2 0 0 0 2 0 0 0 2 0 2 2
 2 0 0 0 0 0 0 0 0 2 2 0 2 0 0 2 0 0 2 0 2 0 2 2 0 2 0 2 0 0 0 2 2 0 0 2 0
 0 2 2 0 0 0 0 0 0 2 0 0 

In [52]:
# confusion matrix
conf_matrix = sk.metrics.confusion_matrix(test_Y, Y_hat)
print(conf_matrix)

# classification report
print(classification_report(test_Y, Y_hat))

# accuracy measure
print("Accuracy:",metrics.accuracy_score(test_Y,Y_hat))

[[2271    4  517   33]
 [  93   15   60    2]
 [ 682    3 1078   27]
 [ 142    0  113   24]]
              precision    recall  f1-score   support

           0       0.71      0.80      0.76      2825
           1       0.68      0.09      0.16       170
           2       0.61      0.60      0.61      1790
           3       0.28      0.09      0.13       279

    accuracy                           0.67      5064
   macro avg       0.57      0.40      0.41      5064
weighted avg       0.65      0.67      0.65      5064

Accuracy: 0.6690363349131122


3. Classification tree

In [53]:
# Create model object
tree_model = tree.DecisionTreeClassifier()

# Train model
tree_model.fit(train_X, train_Y)

# Test model
Y_hat = tree_model.predict(test_X)

# Printing our our predicted values
with np.printoptions(threshold=np.inf):
    print(Y_hat)

[2 2 0 2 0 2 0 0 3 0 2 0 2 2 0 0 0 0 0 2 3 1 2 2 0 0 0 1 2 2 2 0 0 0 2 0 2
 0 2 0 0 0 2 0 3 3 0 0 0 0 0 2 0 0 0 0 2 0 0 2 1 0 0 0 0 2 2 2 0 0 0 0 2 0
 2 2 0 0 0 2 0 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 2 0 2 2 2 3 3 0 0 0 0 2 2 0 0
 0 2 0 0 0 3 0 0 2 2 0 0 0 3 0 2 0 0 3 2 2 3 2 0 0 0 2 2 0 2 0 0 0 2 2 0 0
 2 2 2 2 2 2 0 2 0 0 2 0 0 0 0 0 0 0 2 2 0 0 2 2 0 2 2 0 0 2 0 2 1 3 0 2 0
 2 2 2 3 0 1 2 2 1 2 0 2 0 0 0 3 0 0 2 1 0 0 1 2 2 2 0 2 0 2 2 0 2 2 2 0 2
 2 2 2 0 3 0 0 2 0 2 2 0 2 0 0 2 0 2 0 3 0 0 0 0 0 2 3 0 0 0 0 2 2 2 2 0 0
 0 2 0 2 3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2 2 0 0 2 0 0 0 0 3 0 0 0 0 0 3 0
 0 0 0 0 2 2 0 2 0 0 0 2 0 0 0 0 2 2 0 3 2 2 0 2 2 0 2 2 0 0 0 2 2 0 2 2 0
 2 0 0 0 0 0 2 0 0 2 0 0 2 0 2 0 2 0 2 0 3 3 2 2 2 2 0 2 0 2 0 3 2 1 0 2 2
 0 0 1 2 2 0 2 0 0 0 0 3 0 0 0 2 0 0 0 2 2 0 0 0 0 0 0 2 0 0 2 0 0 2 2 0 2
 2 0 0 3 0 3 0 2 0 0 0 0 0 3 0 0 0 0 0 2 0 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0
 0 2 0 0 0 0 3 0 0 2 2 2 3 2 0 2 2 0 2 0 2 3 2 0 2 2 2 2 0 0 0 2 2 0 0 2 2
 2 2 0 0 2 0 0 0 0 0 0 0 

In [54]:
# confusion matrix
conf_matrix = sk.metrics.confusion_matrix(test_Y, Y_hat)
print(conf_matrix)

# classification report
print(classification_report(test_Y, Y_hat))

# accuracy measure
print("Accuracy:",metrics.accuracy_score(test_Y,Y_hat))

[[2132   86  513   94]
 [  77   31   58    4]
 [ 486   49 1110  145]
 [  75   11  133   60]]
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      2825
           1       0.18      0.18      0.18       170
           2       0.61      0.62      0.62      1790
           3       0.20      0.22      0.21       279

    accuracy                           0.66      5064
   macro avg       0.44      0.44      0.44      5064
weighted avg       0.66      0.66      0.66      5064

Accuracy: 0.658175355450237


Testing our models on 2021 data:

In [56]:
X = spotify_new.drop(['popDummy'], axis=1)
Y = spotify_new['popDummy']

# Logistic regression
# Printing our our predicted values
Y_hat = lr_model.predict(X)
with np.printoptions(threshold=np.inf):
    print(Y_hat)
    
# confusion matrix
conf_matrix = sk.metrics.confusion_matrix(Y, Y_hat)
print(conf_matrix)

# classification report
print(classification_report(Y, Y_hat))

# accuracy measure
print("Accuracy:",metrics.accuracy_score(Y,Y_hat))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 3 3 0 0 3 0 3 0 0 0 2 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 0 2 3 0 2 0 3
 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 2 3 0 3 0 2 0 2 2 0 3 3 0 3 3 2 0 0 0 3 0
 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 0 0 3 3 0 3 0 0 0 3
 3 0 0 3 0 3 0 0 0 3 0 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 3 0 3 0 0 0 0 0 3 0 3 0 0 3 0 0 0 0 0 3 0 3 3 0 0 0 0 0 3 3 0 0
 3 0 0 0 0 0 0 0 0 0 3 3 3 3 0 3 0 3 0 3 0 0 3 0 0 0 0 0 0 3 3 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 3 3 3 0 0 3 3
 0 2 3 3 3 3 0 3 0 0 0 3 3 3 3 3 0 0 3 3 3 0 3 3 0 3 3 3 0 0 0 0 3 3 3 3 0
 3 0 0 0 2 0 0 0 0 0 2 0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
# kNN, k = 9
# Printing our our predicted values
Y_hat = knn_model.predict(X)
with np.printoptions(threshold=np.inf):
    print(Y_hat)
    
# confusion matrix
conf_matrix = sk.metrics.confusion_matrix(Y, Y_hat)
print(conf_matrix)

# classification report
print(classification_report(Y, Y_hat))

# accuracy measure
print("Accuracy:",metrics.accuracy_score(Y,Y_hat))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 1 0 3 0 2 0 3 0 3 2 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 2 0 0 3 2 0 0 3 3 2 3 0 3 2
 3 0 0 0 0 0 0 3 3 3 0 0 0 0 2 3 0 3 0 3 0 0 0 0 0 0 0 3 0 2 2 3 2 0 0 2 3
 0 0 0 0 0 0 0 0 0 2 0 2 3 0 0 3 0 0 3 3 0 3 0 0 0 0 3 0 3 3 3 0 3 0 0 0 3
 0 0 0 0 0 0 2 0 0 0 0 3 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 2 3 0
 0 0 0 0 0 3 2 0 2 0 0 0 0 0 0 3 2 0 0 0 0 0 0 0 0 3 0 0 2 0 3 2 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 3 0
 0 3 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 2 0 0 0 0 0 0 0 

In [58]:
# Classification tree
# Logistic regression
# Printing our our predicted values
Y_hat = tree_model.predict(X)
with np.printoptions(threshold=np.inf):
    print(Y_hat)
    
# confusion matrix
conf_matrix = sk.metrics.confusion_matrix(Y, Y_hat)
print(conf_matrix)

# classification report
print(classification_report(Y, Y_hat))

# accuracy measure
print("Accuracy:",metrics.accuracy_score(Y,Y_hat))

[0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 3 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 2 3 0 0 2 0 3 0 0 0 2 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0
 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 0 0 0 0 0 3 0 0 0 3 3 2 2 0 2 2 2 3 3 0 2
 0 0 0 3 3 0 0 0 0 3 0 2 0 2 3 3 0 2 0 2 0 3 3 3 3 0 3 2 0 2 2 2 2 0 0 2 2
 3 2 0 0 2 2 3 0 2 0 0 2 0 3 3 3 3 0 0 0 0 3 0 2 2 2 3 0 0 2 2 0 3 0 2 0 3
 3 0 0 3 0 0 0 3 0 3 3 0 3 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 0 0 1 2 0 0 3 0 2 3 2 2 1 0 0 3 0 3 0 0 0 0 0 3 3 2 0 0 0 0 3 3 0 0
 3 0 0 0 0 0 0 0 3 0 0 3 0 0 0 2 0 3 0 0 3 0 3 0 0 0 3 0 0 1 3 0 2 0 0 1 2
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 1
 0 3 3 0 0 2 0 3 0 0 0 3 0 1 0 0 3 0 0 3 0 0 3 0 0 2 1 1 0 0 3 0 0 0 3 1 0
 0 0 0 0 2 0 0 0 0 0 0 0 

Looks like kNN performs best! We were able to predict the 2021 songs' groups with around 75% accuracy.