In [1]:
%matplotlib inline
 
import numpy as np
import pandas as pd
 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Read in data
data = pd.read_csv('data.csv')
data.drop(['artists', 'id', 'name', 'release_date'], axis = 1, inplace = True)
data_current = data[data.year >= 2010]
data_current = data_current[~data_current.duplicated() == 1]
data_current

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
9087,0.887000,0.319,187333,0.2010,0,0.000000,7,0.9040,-17.796,1,27,0.0623,117.153,0.2390,2018
9091,0.938000,0.269,236800,0.1290,0,0.000005,7,0.6830,-18.168,0,26,0.0576,82.332,0.1600,2018
9111,0.881000,0.644,313093,0.2120,0,0.000022,11,0.7980,-14.118,1,19,0.0347,117.072,0.4410,2020
9117,0.955000,0.627,295093,0.1840,0,0.000162,1,0.0986,-15.533,1,19,0.0450,115.864,0.2990,2020
9119,0.888000,0.581,183440,0.3310,0,0.000015,6,0.1470,-14.087,1,19,0.2430,88.303,0.6420,2020
9121,0.930000,0.442,147907,0.3990,0,0.000499,6,0.9120,-12.661,1,19,0.0780,121.662,0.5540,2020
9129,0.949000,0.570,64173,0.1760,0,0.000000,6,0.1470,-22.676,0,19,0.2990,135.687,0.3480,2020
9137,0.911000,0.565,232640,0.1530,0,0.000000,10,0.3580,-21.606,0,18,0.3780,103.309,0.4340,2020
9139,0.932000,0.598,233520,0.2120,0,0.000023,6,0.6920,-15.078,0,18,0.0406,107.183,0.1720,2020
9143,0.879000,0.367,213840,0.3070,0,0.000000,11,0.7300,-12.420,0,18,0.0568,172.867,0.2840,2020


In [4]:
X = data_current[["acousticness", "danceability", "duration_ms", "energy", "explicit", "instrumentalness", 
                  "key", "liveness", "loudness", "mode", "speechiness", "tempo", "valence", "year"]]
def groups(series):
    if 75 <= series <= 100:
        return 3
    elif 50 <= series < 75:
        return 2
    elif 25 <= series < 50:
        return 1
    elif series < 25:
        return 0

y = data_current['popularity'].apply(groups)

In [5]:
X = pd.get_dummies(X, columns = ['explicit', 'key', 'mode', 'year'])
X.head(5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,...,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021
9087,0.887,0.319,187333,0.201,0.0,0.904,-17.796,0.0623,117.153,0.239,...,0,0,0,0,0,0,1,0,0,0
9091,0.938,0.269,236800,0.129,5e-06,0.683,-18.168,0.0576,82.332,0.16,...,0,0,0,0,0,0,1,0,0,0
9111,0.881,0.644,313093,0.212,2.2e-05,0.798,-14.118,0.0347,117.072,0.441,...,0,0,0,0,0,0,0,0,1,0
9117,0.955,0.627,295093,0.184,0.000162,0.0986,-15.533,0.045,115.864,0.299,...,0,0,0,0,0,0,0,0,1,0
9119,0.888,0.581,183440,0.331,1.5e-05,0.147,-14.087,0.243,88.303,0.642,...,0,0,0,0,0,0,0,0,1,0


In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 0, test_size = 0.2)

In [7]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.5254237288135594

In [8]:
# K Nearest Neighbors
scores = []
for i in range(1, 10):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.5566255778120185 7


In [9]:
# K-Means
scores = []
for i in range(1, 5):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.5454545454545454 0


In [10]:
# Logistic Regression
model = LogisticRegression(max_iter = 100)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.5454545454545454

# Subset

In [11]:
# Use data_current & get dummy variables
data_temp = pd.get_dummies(data_current, columns = ['explicit', 'key', 'mode', 'year'])
data_temp.corr()['popularity'].sort_values(ascending = False)

popularity          1.000000
explicit_1          0.349933
loudness            0.201367
danceability        0.114800
acousticness        0.064706
speechiness         0.056785
valence             0.052029
year_2019           0.044062
key_1               0.041398
key_8               0.041037
year_2011           0.032978
year_2017           0.032919
mode_1              0.028017
key_5               0.027169
year_2012           0.022367
key_3               0.017014
key_6               0.012666
year_2015           0.010858
year_2016           0.010641
year_2010           0.010215
year_2014           0.008440
key_0               0.008150
key_10             -0.003536
key_2              -0.006624
year_2018          -0.009776
key_11             -0.019617
key_4              -0.020160
mode_0             -0.028017
year_2013          -0.030530
year_2020          -0.033719
key_9              -0.037171
key_7              -0.043878
tempo              -0.055058
energy             -0.097248
year_2021     

In [14]:
X_s = data_temp[['explicit_1', 'loudness', 'danceability', 'liveness', 'duration_ms', 'year_2021', 'explicit_0', 'instrumentalness']]
y_s = data_temp['popularity'].apply(groups)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_s, y_s, random_state = 0, test_size = 0.2)

In [15]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.5171417565485362

In [16]:
# K Nearest Neighbors
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.5679892141756548 30


In [17]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.5454545454545454 0


In [18]:
# Logistic Regression
model = LogisticRegression(max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.5454545454545454