In [133]:
import os
import numpy as np
import pandas as pd

MUSIC_PATH = "datasets/spotify/data.csv"

def load_music_data(music_path = MUSIC_PATH):
    csv_path = os.path.join(music_path)
    return pd.read_csv(csv_path)


In [134]:
music = load_music_data()

In [135]:
# Create an attribute representing the mean popularity of an artists songs

#import seaborn as sns
#from tqdm.autonotebook import tqdm

class Artist: 
    def __init__(self, name, popularity): 
        self.name = name
        self.popularity = popularity
        
        
class Track: 
    def __init__(self, name, artists, popularity): 
        self.name = name
        self.artists = artists
        self.popularity = popularity   
        
        

tracks = []

names = music.name.values
artists_names = music.artists.values
popularity = music.popularity.values

for index in range(len(names)): 
    track = Track(names[index], artists_names[index], popularity[index])
    tracks.append(track)
    
    
artists = []
artists_names_done = []
artists_popularities = []

for artists_str in artists_names: 
    artists_sub_list = artists_str[1:-1].split(', ')
    
    track_pop = 0
    for artist in artists_sub_list: 
        
        if artist in artists_names_done: 
            a = [x for x in artists if x.name == artist][0]
            artist_pop = a.popularity
            
        else: 
            songs_pop = [x.popularity for x in tracks if artist in x.artists]
            artist_pop = sum(songs_pop) / len(songs_pop)
            artists_names_done.append(artist)
            a = Artist(artist, artist_pop)
            artists.append(a)
        
        track_pop += artist_pop
        
    track_pop /= len(artists_sub_list)
    artists_popularities.append(track_pop)
    
artists_popularities = np.asarray(artists_popularities)

print(artists_popularities.max())

93.0


In [136]:
# Add the new attribute to the data

music["artist_popularity"] = artists_popularities

In [137]:
# Creating the test set and train set

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(music, test_size=0.2, random_state=42)

In [97]:
# Create a copy of the training set so we can play around with it

music_set = train_set.copy()

In [138]:
# Revert to a clean training set and separate predictors and labels

music_labels = train_set["popularity"].copy()
music_set = train_set.drop("popularity", axis=1)

In [141]:
# Also need to drop "release_date", "id" and "name"

music_set = music_set.drop("release_date", axis=1)
music_set = music_set.drop("id", axis=1)
music_set = music_set.drop("name", axis=1)
music_set = music_set.drop("artists", axis=1)

In [143]:
# Transformation Pipelines and Feature Scaling

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

music_prepared = num_pipeline.fit_transform(music_set)

In [48]:
# 1.0
# Train a linear regression model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(music_prepared, music_labels)

# Testing the model
some_data = music_set.iloc[32750:32760]
some_labels = music_labels.iloc[32750:32760]
some_data_prepared = num_pipeline.fit_transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

Predictions: [40.46178696 11.05446562  9.72915905 50.06409835 26.73194329 59.11926896
  7.39164937 54.5759881   7.07014527 48.1894325 ]
Labels: [43, 21, 0, 37, 38, 61, 0, 56, 0, 43]


In [44]:
# Measuring regression models RMSE on train set
# This gives the typical prediction error

from sklearn.metrics import mean_squared_error

music_predictions = lin_reg.predict(music_prepared)
lin_mse = mean_squared_error(music_labels, music_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

8.691937425526403

In [45]:
# 2.0
# Train a Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(music_prepared, music_labels)

# Testing the model
music_predictions_tree = tree_reg.predict(music_prepared)
tree_mse = mean_squared_error(music_labels, music_predictions_tree)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

1.0740164477845788

In [49]:
# Further testing of the Decision Tree Regressor. Seems like the Linear Regression shows better results.

some_data_tree = music_set.iloc[32750:32760]
some_labels_tree = music_labels.iloc[32750:32760]
some_data_prepared_tree = num_pipeline.fit_transform(some_data_tree)
print("Predictions:", tree_reg.predict(some_data_prepared_tree))
print("Labels:", list(some_labels_tree))

Predictions: [46. 22. 15. 43. 26. 58.  2. 60. 27. 43.]
Labels: [43, 21, 0, 37, 38, 61, 0, 56, 0, 43]


In [50]:
# Use k-fold coss-validation 
# NOT VERY GOOD

from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, music_prepared, music_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

# Checking the results
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [11.62744143 11.52022812 11.30961413 11.47575804 11.57556273 11.55662582
 11.54788394 11.56971941 11.48116147 11.64718588]
Mean: 11.531118097522894
Standard deviation: 0.09049311538321149


In [51]:
# Computing the scores for the linear regression model to compare

lin_scores = cross_val_score(lin_reg, music_prepared, music_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [8.74119199 8.72399056 8.59029482 8.58872684 8.81579687 8.59524735
 8.63022889 8.76304279 8.82357694 8.65115668]
Mean: 8.692325372365275
Standard deviation: 0.0877391787747594


In [144]:
# 3.0
# Building a Random Forest Regressor Model

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(music_prepared, music_labels)

# Checking RMSE
music_predictions = forest_reg.predict(music_prepared)
forest_mse = mean_squared_error(music_labels, music_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

3.1648283827312147

In [103]:
# Random Forest Regressor seems to be the most accurate

from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(forest_reg, music_prepared, music_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [8.1608569  8.09987402 8.02784611 8.04598308 8.22075479 8.10091806
 8.1587252  8.15181274 8.22549239 8.11276108]
Mean: 8.130502436862113
Standard deviation: 0.06257040900770887


In [54]:
# Further testing of the Random Forest Regressor. 

some_data_forest = music_set.iloc[32750:32760]
some_labels_forest = music_labels.iloc[32750:32760]
some_data_prepared_forest = num_pipeline.fit_transform(some_data_forest)
print("Predictions:", forest_reg.predict(some_data_prepared_forest))
print("Labels:", list(some_labels_forest))

Predictions: [46.37       21.78        9.32       49.875      32.66       55.31816667
  5.34       55.17        9.78       48.08      ]
Labels: [43, 21, 0, 37, 38, 61, 0, 56, 0, 43]


In [64]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg2 = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg2, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(music_prepared, music_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [66]:
# Analyzing the RFR model and inspecting the importance of each attribute

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.01113765, 0.3509954 , 0.05419227, 0.01173508, 0.01322787,
       0.0109864 , 0.00152569, 0.00928329, 0.00586281, 0.01216029,
       0.02233068, 0.00125848, 0.01223952, 0.01173282, 0.47133175])

In [73]:
# We can see from this that "instrumentalness", "key", "explicit", "mode" and not important to the ml predictor

num_attribs = list(music_set)
attributes = num_attribs + ["artist_popularity"]
sorted(zip(feature_importances, attributes), reverse=True)

[(0.4713317479905617, 'artist_popularity'),
 (0.35099539983397676, 'year'),
 (0.05419226882005122, 'acousticness'),
 (0.022330679271190684, 'loudness'),
 (0.013227873410290981, 'duration_ms'),
 (0.012239517669072664, 'speechiness'),
 (0.012160288327212757, 'liveness'),
 (0.01173507904859588, 'danceability'),
 (0.011732823602197745, 'tempo'),
 (0.01113765174740698, 'valence'),
 (0.010986401623572343, 'energy'),
 (0.009283290202754674, 'instrumentalness'),
 (0.0058628125129225545, 'key'),
 (0.0015256864086499557, 'explicit'),
 (0.0012584795315431795, 'mode')]

In [145]:
test_set.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,artist_popularity
116368,0.91,1977,0.5,['Idris Muhammad'],0.894,273067,0.521,0,0KdzyvsBqX1HV3uLxGm2JV,0.772,4,0.068,-7.048,1,Camby Bolongo,34,1977,0.0847,126.805,34.0
161935,0.583,1976,0.93,['Stan Getz'],0.585,182200,0.272,0,1f4eO5waC6glPLXUhYKLm2,0.115,11,0.128,-16.693,0,Just One of Those Things,26,1976,0.0802,90.104,23.723301
135703,0.647,1994,0.157,"['Warren G', 'The Twinz']",0.878,179933,0.52,1,1X07ZfF5KQt8dyJ5kAkVAQ,0.0,7,0.173,-12.609,1,Recognize,38,1994-06-07,0.403,90.744,43.642857
112288,0.196,1957,0.937,['Red Garland Quintet'],0.463,925947,0.287,0,5wmMDeM4qvLfwqLa2cvoks,0.837,0,0.112,-11.758,0,Soul Junction,13,1957,0.0363,186.764,11.166667
22,0.422,1921,0.995,['Ignacio Corsini'],0.648,154240,0.0995,0,0SK1upzAP6NvIgF0uGh6z2,0.846,11,0.112,-22.429,1,Flor Marchita - Remasterizado,0,1921-03-20,0.105,71.978,0.031847


In [146]:
# Preparing the test set

test_set = test_set.drop("release_date", axis=1)
test_set = test_set.drop("id", axis=1)
test_set = test_set.drop("name", axis=1)
test_set = test_set.drop("artists", axis=1)

In [147]:
y_test = test_set["popularity"].copy()
x_test = test_set.drop("popularity", axis=1)

In [149]:
x_test.describe()

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,artist_popularity
count,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0,34131.0
mean,0.52812,1976.770824,0.503404,0.538106,230259.0,0.481169,0.086197,0.167104,5.215435,0.205441,-11.467542,0.706601,0.100444,117.008194,31.454913
std,0.263012,25.985302,0.375972,0.176159,127006.9,0.267402,0.280659,0.314114,3.508321,0.1749,5.717673,0.455326,0.167347,30.797508,19.807963
min,0.0,1921.0,0.0,0.0,5108.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0
25%,0.319,1956.0,0.101,0.415,169773.0,0.255,0.0,0.0,2.0,0.0988,-14.5995,0.0,0.0349,93.4925,14.792869
50%,0.538,1977.0,0.525,0.551,207474.0,0.47,0.0,0.000211,5.0,0.136,-10.579,1.0,0.0451,114.906,34.591837
75%,0.747,1999.0,0.895,0.669,261467.0,0.702,0.0,0.098,8.0,0.26,-7.19,1.0,0.07575,135.909,46.216899
max,0.994,2020.0,0.996,0.975,3650800.0,1.0,1.0,1.0,11.0,0.995,1.342,1.0,0.969,222.605,85.0


In [150]:
# Evaluating the model on the test set

final_model = forest_reg

x_test_prepared = num_pipeline.transform(x_test)

final_predictions = final_model.predict(x_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [151]:
final_rmse

7.992779786410645

In [107]:
# Computing a 95% confidence interval

from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                        loc=squared_errors.mean(),
                        scale=stats.sem(squared_errors)))

array([7.89165958, 8.09263656])

In [125]:
# Save the model

import joblib

filename = "forest_reg_popularity_model.pkl"
joblib.dump(forest_reg, filename)

['forest_reg_popularity_model.pkl']

In [153]:
#preds = final_model.predict(x_test)

accuracy = final_model.score(x_test, y_test)
print("Test Accuracy: {:.4f}".format(accuracy*100))

average_error = (abs(y_test - final_predictions)).mean()
print("{:.4f} average error".format(average_error))

Test Accuracy: -451.7697
5.4850 average error


In [162]:
# Further testing of the Random Forest Regressor. 

test_data_forest = music_set.iloc[:10]
test_labels_forest = music_labels.iloc[:10]
test_data_prepared_forest = num_pipeline.fit_transform(test_data_forest)
print("Predictions:", forest_reg.predict(test_data_prepared_forest))
print("Labels:", list(test_labels_forest))

Predictions: [21.375      33.92       50.79       24.57       57.81333333  0.
 34.76       33.57       58.29333333  6.005     ]
Labels: [19, 26, 67, 26, 45, 0, 38, 39, 55, 0]


In [164]:
for index in range(len(final_predictions[:100])): 

    pred = final_predictions[index]
    actual = y_test.iloc[index]
    
    print("Actual / Predicted: {:.4f} / {:.4f}".format(actual, pred))

Actual / Predicted: 34.0000 / 34.6575
Actual / Predicted: 26.0000 / 32.4025
Actual / Predicted: 38.0000 / 41.7600
Actual / Predicted: 13.0000 / 18.2300
Actual / Predicted: 0.0000 / 0.0300
Actual / Predicted: 62.0000 / 64.5800
Actual / Predicted: 23.0000 / 24.4120
Actual / Predicted: 0.0000 / 0.6600
Actual / Predicted: 25.0000 / 25.8400
Actual / Predicted: 25.0000 / 33.0800
Actual / Predicted: 48.0000 / 52.5150
Actual / Predicted: 42.0000 / 45.7000
Actual / Predicted: 35.0000 / 38.0200
Actual / Predicted: 22.0000 / 28.2800
Actual / Predicted: 34.0000 / 36.4200
Actual / Predicted: 49.0000 / 38.2500
Actual / Predicted: 0.0000 / 0.0700
Actual / Predicted: 46.0000 / 45.4800
Actual / Predicted: 50.0000 / 50.8100
Actual / Predicted: 33.0000 / 22.3333
Actual / Predicted: 45.0000 / 43.1600
Actual / Predicted: 8.0000 / 16.3200
Actual / Predicted: 22.0000 / 18.6600
Actual / Predicted: 62.0000 / 66.7300
Actual / Predicted: 20.0000 / 29.9500
Actual / Predicted: 29.0000 / 32.9300
Actual / Predicted:

In [167]:
# Save the model

import joblib

filename = "forest_reg_popularity_model.pkl"
joblib.dump(final_model, filename)

['forest_reg_popularity_model.pkl']