In [8]:
# Necessary packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [2]:
spotifyDF = pd.read_csv("spotify_df.csv").set_index("track_id")

In [49]:
# Selecting only top 15 highly correlated features with popularity
corr = spotifyDF.corr().abs()
popularity_corr = corr[["popularity"]]
popularity_corr = popularity_corr.sort_values(by=["popularity"], ascending=False)
top15Features = popularity_corr.iloc[1:16].index.values.tolist()

In [50]:
X = spotifyDF.loc[:, top15Features]
Y = spotifyDF[["popularity"]]

In [51]:
# Linear Regression Model

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)
Y_pred_test = model.predict(X_test)
print("MAE: %.4f"
      % mean_absolute_error(Y_test, Y_pred_test))
print("MSE: %.4f"
      % mean_squared_error(Y_test, Y_pred_test))
print("R^2: %.4f"
      % r2_score(Y_test, Y_pred_test))

MAE: 7.2395
MSE: 84.5565
R^2: 0.6345


In [52]:
# Linear Regression Model w/ K-Fold

cv = KFold(n_splits=10)
scoring = ["neg_mean_absolute_error", "neg_mean_squared_error", "r2"]
scores = cross_validate(model, X, Y, scoring=scoring, cv=cv)
print("MAE: %.4f" %-np.mean(scores["test_neg_mean_absolute_error"]))
print("MSE: %.4f" %-np.mean(scores["test_neg_mean_squared_error"]))
print("R^2: %.4f" %np.mean(scores["test_r2"]))

MAE: 7.3019
MSE: 87.3670
R^2: 0.5933


In [None]:
# DecisionTree Regression Model