In [344]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

import category_encoders as ce

In [345]:
# Load the Spotify tracks table from the project-relative CSV file.
# NOTE(review): the file appears to carry a saved row-index column
# ('Unnamed: 0'); it is dropped later when the feature matrix is built.
dataset = pd.read_csv('dataset/spotify_tracks.csv')

In [346]:
# Preview the first rows to sanity-check the parsed columns.
dataset.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [347]:
# Keep only the complete rows: any record with at least one missing value
# is discarded, and `dataset` is rebound to the cleaned frame.
dataset = dataset.dropna()

In [348]:
# Count the distinct values in each column to gauge feature cardinality
# (this guides the choice of encoder for the categorical columns).
dataset.nunique(axis=0)

Unnamed: 0          113999
track_id             89740
artists              31437
album_name           46589
track_name           73608
popularity             101
duration_ms          50696
explicit                 2
danceability          1174
energy                2083
key                     12
loudness             19480
mode                     2
speechiness           1489
acousticness          5061
instrumentalness      5346
liveness              1722
valence               1790
tempo                45652
time_signature           5
track_genre            114
dtype: int64

In [349]:
# Feature matrix: everything except the 'Unnamed: 0' column (presumably a
# leftover row index), the per-track identifier, and the target itself.
X = dataset.drop(columns=['Unnamed: 0', 'track_id', 'popularity'])
# Regression target: the integer popularity score of each track.
y = dataset['popularity']

In [350]:
# Inspect the target series.
y

0         73
1         55
2         57
3         71
4         82
          ..
113995    21
113996    22
113997    22
113998    41
113999    22
Name: popularity, Length: 113999, dtype: int64

### Feature encoding

In [351]:
# Encode the boolean `explicit` flag as an integer (False -> 0, True -> 1).
X['explicit'] = X['explicit'].map({False : 0, True : 1})

In [352]:
# One-hot encode the two low-cardinality categorical columns
# (12 distinct keys and 5 distinct time signatures in this dataset).
# NOTE(review): the encoder is fitted on the whole table, i.e. before the
# train/test split performed further down.
ohe = ce.OneHotEncoder(cols=['key','time_signature']) 
X = ohe.fit_transform(X)

In [353]:
# Binary-encode the high-cardinality text columns: each one is replaced by a
# small group of indicator columns (e.g. artists_0 ... artists_14) instead of
# one column per distinct value.
# NOTE(review): like the one-hot encoder, this is fitted on the whole table,
# before the train/test split.
be = ce.BinaryEncoder(cols=['artists', 'album_name', 'track_name', 'track_genre'])
X = be.fit_transform(X)

In [354]:
# List the feature columns produced by the encoders.
X.columns

Index(['artists_0', 'artists_1', 'artists_2', 'artists_3', 'artists_4',
       'artists_5', 'artists_6', 'artists_7', 'artists_8', 'artists_9',
       'artists_10', 'artists_11', 'artists_12', 'artists_13', 'artists_14',
       'album_name_0', 'album_name_1', 'album_name_2', 'album_name_3',
       'album_name_4', 'album_name_5', 'album_name_6', 'album_name_7',
       'album_name_8', 'album_name_9', 'album_name_10', 'album_name_11',
       'album_name_12', 'album_name_13', 'album_name_14', 'album_name_15',
       'track_name_0', 'track_name_1', 'track_name_2', 'track_name_3',
       'track_name_4', 'track_name_5', 'track_name_6', 'track_name_7',
       'track_name_8', 'track_name_9', 'track_name_10', 'track_name_11',
       'track_name_12', 'track_name_13', 'track_name_14', 'track_name_15',
       'track_name_16', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8',
       'key_9', 'key_10', 'key_11', 'key_12

In [355]:
# Dimensions of the encoded feature matrix as (rows, columns).
X.shape

(113999, 84)

In [356]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible. No stratification is applied (a `stratify=y`
# argument had been left commented out here).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [357]:
# Audio descriptors used on their own as a compact, 12-column feature set.
numerical_cols = ['duration_ms', 'explicit', 'danceability', 'energy', 'loudness', 'mode',
                  'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Each feature set gets its own scaler. Re-fitting a single StandardScaler for
# both sets would overwrite the statistics learned on `numerical_cols`, leaving
# no fitted transformer for that subset.
# The column slices are taken here, before X_train / X_test are overwritten
# with their standardized versions below.
numerical_scaler = preprocessing.StandardScaler().set_output(transform="pandas")
X_train_numerical = numerical_scaler.fit_transform(X_train[numerical_cols])
X_test_numerical = numerical_scaler.transform(X_test[numerical_cols])

# Full encoded feature set: the statistics are estimated on the training split
# only and then applied unchanged to the test split.
standard_scaler = preprocessing.StandardScaler().set_output(transform="pandas")
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)

# TODO: decide whether the target should be standardized as well; the models
# below are fitted on the raw popularity scores.

In [358]:
# Shape and preview of the standardized full training matrix.
print(X_train.shape)
X_train

(91199, 84)


Unnamed: 0,artists_0,artists_1,artists_2,artists_3,artists_4,artists_5,artists_6,artists_7,artists_8,artists_9,...,time_signature_3,time_signature_4,time_signature_5,track_genre_0,track_genre_1,track_genre_2,track_genre_3,track_genre_4,track_genre_5,track_genre_6
96253,1.138771,-0.804101,1.216131,-0.986895,1.102499,-0.984452,-0.983955,0.987545,1.014391,-1.019343,...,-0.093954,-0.126795,-0.037636,1.106143,1.111649,-0.899565,-0.981885,-0.982101,-0.998455,1.001679
70417,1.138771,-0.804101,1.216131,-0.986895,-0.907030,-0.984452,1.016306,0.987545,-0.985813,0.981024,...,-0.093954,-0.126795,-0.037636,1.106143,-0.899565,-0.899565,-0.981885,1.018225,1.001547,1.001679
66688,1.138771,-0.804101,-0.822280,1.013279,1.102499,-0.984452,1.016306,-1.012613,-0.985813,0.981024,...,-0.093954,-0.126795,-0.037636,1.106143,-0.899565,-0.899565,-0.981885,-0.982101,1.001547,1.001679
51391,1.138771,-0.804101,-0.822280,-0.986895,-0.907030,1.015794,-0.983955,-1.012613,-0.985813,0.981024,...,-0.093954,-0.126795,-0.037636,-0.904042,1.111649,1.111649,-0.981885,1.018225,-0.998455,-0.998324
95123,1.138771,1.243626,-0.822280,-0.986895,1.102499,1.015794,-0.983955,0.987545,1.014391,-1.019343,...,-0.093954,-0.126795,-0.037636,1.106143,1.111649,-0.899565,-0.981885,-0.982101,-0.998455,-0.998324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76821,1.138771,-0.804101,1.216131,-0.986895,1.102499,1.015794,-0.983955,-1.012613,-0.985813,0.981024,...,-0.093954,-0.126795,-0.037636,1.106143,-0.899565,-0.899565,1.018449,1.018225,-0.998455,1.001679
110269,1.138771,1.243626,1.216131,-0.986895,1.102499,-0.984452,1.016306,0.987545,-0.985813,0.981024,...,-0.093954,-0.126795,-0.037636,1.106143,1.111649,-0.899565,1.018449,1.018225,1.001547,1.001679
103695,1.138771,1.243626,-0.822280,-0.986895,-0.907030,1.015794,-0.983955,-1.012613,1.014391,-1.019343,...,-0.093954,-0.126795,-0.037636,1.106143,1.111649,-0.899565,1.018449,-0.982101,-0.998455,-0.998324
860,-0.878140,-0.804101,-0.822280,-0.986895,-0.907030,-0.984452,-0.983955,-1.012613,1.014391,-1.019343,...,-0.093954,-0.126795,-0.037636,-0.904042,-0.899565,-0.899565,-0.981885,-0.982101,-0.998455,1.001679


In [359]:
# Shape and preview of the standardized numerical-only training matrix.
print(X_train_numerical.shape)
X_train_numerical

(91199, 12)


Unnamed: 0,duration_ms,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
96253,1.209605,-0.306913,0.427460,0.950305,0.370570,-1.326874,-0.230378,-0.494867,-0.503032,2.086197,-0.196163,-0.971364
70417,0.032251,-0.306913,0.583241,-1.112037,-0.285686,-1.326874,-0.467827,1.441588,-0.505483,0.171212,-0.161413,-1.607835
66688,-1.199902,-0.306913,1.264062,-1.652458,-1.635483,0.753651,4.618883,1.092785,-0.505483,-0.064882,0.711186,-0.402106
51391,0.038294,-0.306913,0.819797,0.747647,0.486391,0.753651,1.515962,-0.739028,-0.505483,1.823870,0.695742,0.528133
95123,1.224293,-0.306913,1.264062,0.242990,0.302941,0.753651,-0.370388,0.587623,-0.505483,-0.489851,0.857906,-0.603445
...,...,...,...,...,...,...,...,...,...,...,...,...
76821,0.097276,-0.306913,-0.374524,-1.791537,-0.736874,0.753651,-0.458367,1.817453,2.257574,-0.679775,-1.250232,0.321024
110269,0.759202,-0.306913,-0.045653,1.125147,0.645051,0.753651,-0.164157,-0.917339,2.341303,3.413570,-1.312009,0.763428
103695,-0.848953,-0.306913,0.358224,-0.321274,-0.631762,-1.326874,-0.307005,0.840204,-0.505483,-0.164566,1.568341,-0.107604
860,-0.103603,-0.306913,-0.841867,-2.121352,-1.372703,0.753651,-0.484855,1.901646,-0.505483,-0.657215,-1.373786,-0.597174


In [360]:
# Show the train and test targets together (displayed as a tuple).
y_train, y_test

(96253     41
 70417     52
 66688     11
 51391     61
 95123     37
           ..
 76821     20
 110269    28
 103695     0
 860       44
 15795     55
 Name: popularity, Length: 91199, dtype: int64,
 113186    50
 42819     11
 59311      0
 90417     34
 61000     57
           ..
 83384     59
 102335     0
 78411     29
 86528     42
 96230     42
 Name: popularity, Length: 22800, dtype: int64)

In [361]:
class RidgeRegression:
    """Closed-form ridge (L2-regularized least-squares) regression on DataFrames.

    The intercept is learned through a constant column named
    ``'dummy_feature'`` that is prepended to the inputs; it is excluded from
    the penalty so that only the feature weights are shrunk.

    Parameters
    ----------
    alpha : float, default 1.0
        Strength of the L2 penalty applied to every non-intercept weight.
    """

    def __init__(self, alpha = 1.0):
        self._alpha = alpha
        # Fitted weights as a pd.Series labelled by column name, intercept
        # ('dummy_feature') first. Stays None until fit() has been called.
        self._w = None

    def fit(self, X, y):
        """Estimate the weights by solving the regularized normal equations.

        Solves ``(X'X + alpha * I0) w = X'y``, where ``X`` is the input with
        the constant column prepended and ``I0`` is the identity matrix with
        its top-left entry zeroed.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features, one row per sample. Not modified. Must not
            already contain a column named ``'dummy_feature'``.
        y : pandas.Series or 1-D array-like
            Training targets. A Series is matched to ``X`` through its index.

        Returns
        -------
        RidgeRegression
            The fitted estimator itself, so calls can be chained.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the regularized system is singular.
        """
        # Work on a copy so the caller's frame never gains the helper column.
        _X = X.copy()
        _X.insert(0, 'dummy_feature', 1)

        penalty = self._alpha * np.identity(_X.shape[1])
        # The first coefficient is the intercept. Penalizing it would pull the
        # predictions towards zero instead of towards the mean of the target,
        # so it is left out of the regularization term.
        penalty[0, 0] = 0

        gram = (_X.T @ _X).to_numpy(dtype=float) + penalty
        moment = np.asarray(_X.T @ y, dtype=float)

        # Solve the linear system directly rather than forming an explicit
        # inverse: it is cheaper and numerically better behaved.
        self._w = pd.Series(np.linalg.solve(gram, moment), index=_X.columns)

        return self

    def predict(self, X):
        """Predict targets for ``X`` using the fitted weights.

        Parameters
        ----------
        X : pandas.DataFrame
            Features with the same column labels that were used in ``fit``.
            Not modified.

        Returns
        -------
        pandas.Series
            One prediction per row, indexed like ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self._w is None:
            raise RuntimeError('Model is still to fit')

        _X = X.copy()
        _X.insert(0, 'dummy_feature', 1)

        # DataFrame @ Series matches the columns to the weight labels.
        return _X @ self._w

In [362]:
# Fit the hand-written ridge model on the standardized numerical features
# and predict the held-out split.
rr = RidgeRegression(1.0)
rr.fit(X_train_numerical, y_train)
y_prediction = rr.predict(X_test_numerical)

# Summarize the absolute errors by their mean and population standard
# deviation (ddof=0).
absolute_errors = (y_prediction - y_test).abs()
mean_absolute_errors = absolute_errors.mean()
std_absolute_errors = absolute_errors.std(ddof=0)
print(mean_absolute_errors, std_absolute_errors)

18.40826106162412 12.112820878311286


In [363]:
# Cross-check the hand-written solver against scikit-learn's Ridge on the
# numerical feature set, using the same alpha and the same data.
# (`Ridge` is imported with the other dependencies at the top of the notebook.)
ridge = Ridge(alpha=1.0)

ridge.fit(X_train_numerical, y_train)
sklearn_predictions = ridge.predict(X_test_numerical)

# Fail loudly if the two implementations disagree beyond round-off, instead
# of relying on a visual scan of the differences.
np.testing.assert_allclose(y_prediction, sklearn_predictions, rtol=0, atol=1e-6)

# Per-sample differences; they should be at floating-point noise level.
y_prediction - sklearn_predictions


113186   -1.421085e-14
42819     3.552714e-15
59311    -2.131628e-14
90417    -2.842171e-14
61000    -6.394885e-14
              ...     
83384    -7.105427e-15
102335   -7.105427e-15
78411    -4.973799e-14
86528    -2.486900e-14
96230    -2.842171e-14
Length: 22800, dtype: float64

In [364]:
# Repeat the experiment with the full standardized feature matrix
# (encoded categorical columns included).
rr = RidgeRegression(1.0)
rr.fit(X_train, y_train)
y_prediction = rr.predict(X_test)

# Summarize the absolute errors by their mean and population standard
# deviation (ddof=0).
absolute_errors = (y_prediction - y_test).abs()
mean_absolute_errors = absolute_errors.mean()
std_absolute_errors = absolute_errors.std(ddof=0)
print(mean_absolute_errors, std_absolute_errors)

16.599695146671984 11.36857479204648


In [367]:
# Cross-check the hand-written solver against scikit-learn's Ridge on the
# full feature matrix, using the same alpha and the same data.
ridge = Ridge(alpha=1.0)

ridge.fit(X_train, y_train)
sklearn_predictions = ridge.predict(X_test)

# Fail loudly if the two implementations disagree beyond round-off, instead
# of relying on a visual scan of the differences.
np.testing.assert_allclose(y_prediction, sklearn_predictions, rtol=0, atol=1e-6)

# Per-sample differences; they should be at floating-point noise level.
y_prediction - sklearn_predictions

113186    3.230198e-10
42819     3.213074e-10
59311    -3.250022e-11
90417    -2.026468e-11
61000    -1.405454e-11
              ...     
83384    -1.350031e-11
102335   -3.278122e-11
78411    -1.052385e-10
86528     2.349054e-11
96230    -3.602452e-11
Length: 22800, dtype: float64