In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor

In [2]:
df = pd.read_csv("./TikTok_songs_2022.csv")

In [3]:
df.loc[df["track_pop"] == 0,'track_pop'] = df['track_pop'].mean() 

In [4]:
from sklearn.model_selection import train_test_split

X = df[['liveness', 'speechiness', 'key', 'loudness', 'artist_pop']].values
y = df['track_pop'].values

SEED = 15

### Split training and testing set

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED, test_size=0.2, 
                                                    shuffle = True)

standard for SVR

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as MSE

ss = StandardScaler()

ss.fit(X)

X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

### 1. Linear Regression

In [31]:
#Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

lr = LinearRegression()
%time lr.fit(X_train, y_train)

predictions = lr.predict(X_test)
mae = np.mean(abs(predictions - y_test))#compare with true res
rmse = np.sqrt(np.mean((predictions - y_test) ** 2))
print('Using Linear Regression, MAE is %0.2f' %  mae)  
print('Using Linear Regression, RMSE is %0.2f' %  rmse)

CPU times: user 3.91 ms, sys: 1.28 ms, total: 5.19 ms
Wall time: 2.96 ms
Using Linear Regression, MAE is 9.14
Using Linear Regression, RMSE is 14.09


### 2. SVR model

In [14]:
#SVR

from sklearn.svm import SVR

#linear
svr_lin = SVR(kernel='linear', C=100, gamma='auto')
%time svr_lin.fit(X_train, y_train)

#rbf
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
%time svr_rbf.fit(X_train,y_train)

y_lin = svr_lin.predict(X_test)
mae_svr1 = np.mean(abs(y_lin - y_test))#compare with true res
rmse_svr1 = np.sqrt(np.mean((y_lin - y_test) ** 2))
print('Using SVR_linear, MAE is %0.2f' %  mae_svr1)  
print('Using SVR_linear, RMSE is %0.2f' %  rmse_svr1)

y_rbf = svr_rbf.predict(X_test)
mae_svr2 = np.mean(abs(y_rbf - y_test))
rmse_svr2 = np.sqrt(np.mean((y_lin - y_test) ** 2))
print('Using SVR_rbf, MAE is %0.2f' %  mae_svr2)  
print('Using SVR_rbf, RMSE is %0.2f' %  rmse_svr2)

CPU times: user 542 ms, sys: 5.43 ms, total: 548 ms
Wall time: 553 ms
CPU times: user 9.37 ms, sys: 176 µs, total: 9.55 ms
Wall time: 9.18 ms
Using SVR_linear, MAE is 9.46
Using SVR_linear, RMSE is 14.82
Using SVR_rbf, MAE is 12.00
Using SVR_rbf, RMSE is 14.82


### 3. SVR with GridSearch(using standard dataset)

In [15]:
#SVR with GridSearchCV

from sklearn.model_selection import GridSearchCV

param_grid = {
            'C': [100, 10], 
            'gamma': [0.1, 0.01],
            'kernel': ['rbf', 'poly', 'sigmoid']
            }

svr = SVR()

svr_gs = GridSearchCV(
    svr, param_grid,
    cv=4, n_jobs=-1, verbose=1, scoring="neg_mean_absolute_error"
)

svr_gs.fit(X_train_scaled, y_train)

best_svr_est = svr_gs.best_estimator_
%time best_svr_est.fit(X_train_scaled, y_train)

y_pred_train = best_svr_est.predict(X_train_scaled)
y_pred_test = best_svr_est.predict(X_test_scaled)

mae_svrg = np.mean(abs(y_pred_test - y_test))
rmse_svrg = np.sqrt(np.mean((y_pred_test - y_test) ** 2))
print('Using SVR_rGrid, MAE is %0.2f' % mae_svrg)  
print('Using SVR_Grid, RMSE is %0.2f'% rmse_svrg)


Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


CPU times: user 2.57 ms, sys: 46 µs, total: 2.62 ms
Wall time: 2.62 ms
Using SVR_rGrid, MAE is 9.41
Using SVR_Grid, RMSE is 14.78


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    1.5s finished


### 4. Ensemble learnign

In [16]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

#random forest

forest_reg = RandomForestRegressor()

random_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
               'min_samples_leaf': [2, 4, 8],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [60, 120, 240]}


rf_random = RandomizedSearchCV(estimator = forest_reg, 
                               param_distributions = random_grid, 
                               scoring="neg_mean_absolute_error",
                               n_iter = 50, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42)

In [19]:
%time rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] n_estimators=240, min_samples_split=5, min_samples_leaf=4, max_depth=None, bootstrap=False 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=240, min_samples_split=5, min_samples_leaf=4, max_depth=None, bootstrap=False, total=   0.2s
[CV] n_estimators=240, min_samples_split=5, min_samples_leaf=4, max_depth=None, bootstrap=False 
[CV]  n_estimators=240, min_samples_split=5, min_samples_leaf=4, max_depth=None, bootstrap=False, total=   0.2s
[CV] n_estimators=240, min_samples_split=5, min_samples_leaf=4, max_depth=None, bootstrap=False 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  n_estimators=240, min_samples_split=5, min_samples_leaf=4, max_depth=None, bootstrap=False, total=   0.2s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=90, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=90, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=90, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=90, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=90, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=90, bootstrap=True, total=   0.1s
[CV] n_estimators=240, min_samples_split=10, min_samples_leaf=8, max_depth=100, bootstrap=False 
[CV]  n_estimators=240, min_samples_split=10, min_samples_leaf=8, max_depth=100, bootstrap=False, total=   0.2s
[CV] n_estimators=240, min_samples_split=10, min_samples_leaf=8, max_d

[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=60, bootstrap=False, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=100, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=100, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=100, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=100, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=100, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=100, bootstrap=True, total=   0.1s
[CV] n_estimators=240, min_samples_split=2, min_samples_leaf=4, max_depth=20, bootstrap=False 
[CV]  n_estimators=240, min_samples_split=2, min_samples_leaf=4, max_depth=20, bootstrap=False, total=   0.2s
[CV] n_estimators=240, min_samples_split=2, min_samples_leaf=4, max_de

[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=20, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=20, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=20, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=20, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=2, max_depth=20, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=2, min_samples_leaf=2, max_depth=None, bootstrap=False 
[CV]  n_estimators=120, min_samples_split=2, min_samples_leaf=2, max_depth=None, bootstrap=False, total=   0.1s
[CV] n_estimators=120, min_samples_split=2, min_samples_leaf=2, max_depth=None, bootstrap=False 
[CV]  n_estimators=120, min_samples_split=2, min_samples_leaf=2, max_depth=None, bootstrap=False, total=   0.1s
[CV] n_estimators=120, min_samples_split=2, min_samples_leaf=2, max

[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=8, max_depth=30, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=8, max_depth=30, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=8, max_depth=30, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=5, min_samples_leaf=8, max_depth=30, bootstrap=True 
[CV]  n_estimators=120, min_samples_split=5, min_samples_leaf=8, max_depth=30, bootstrap=True, total=   0.1s
[CV] n_estimators=120, min_samples_split=10, min_samples_leaf=8, max_depth=100, bootstrap=False 
[CV]  n_estimators=120, min_samples_split=10, min_samples_leaf=8, max_depth=100, bootstrap=False, total=   0.1s
[CV] n_estimators=120, min_samples_split=10, min_samples_leaf=8, max_depth=100, bootstrap=False 
[CV]  n_estimators=120, min_samples_split=10, min_samples_leaf=8, max_depth=100, bootstrap=False, total=   0.1s
[CV] n_estimators=120, min_samples_split=10, min_samples_leaf=8, ma

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   19.4s finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=50,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'min_samples_leaf': [2, 4, 8],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [60, 120, 240]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)

In [20]:
rf_random.best_params_

{'n_estimators': 120,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_depth': 70,
 'bootstrap': True}

In [21]:
best_estimator = rf_random.best_estimator_

%time best_estimator.fit(X_train, y_train)

y_pred_train = best_estimator.predict(X_train)
y_pred_test = best_estimator.predict(X_test)


mae_rf = np.mean(abs(y_pred_test - y_test))
rmse_rf = np.sqrt(np.mean((y_pred_test - y_test) ** 2))
print('Using RF_RS, MAE is %0.2f' % mae_svrg)  
print('Using RF_RS, RMSE is %0.2f'% rmse_svrg)



CPU times: user 156 ms, sys: 5.75 ms, total: 162 ms
Wall time: 163 ms
Using RF_RS, MAE is 9.41
Using RF_RS, RMSE is 14.78


In [22]:
#gradient boosting

gb = GradientBoostingRegressor(random_state=SEED, max_depth=1)

params_grid = {
    "learning_rate":np.arange(0.1, 0.5, 0.05),
    "n_estimators":np.arange(40, 201, 20),
    "subsample":np.arange(0.4, 1.01, 0.1),
    "min_samples_split":np.arange(2, 10, 1),
    "min_samples_leaf":np.arange(2, 10, 1)
}



In [23]:
gb_rs = RandomizedSearchCV(
    estimator=gb,
    param_distributions=params_grid,
    n_iter=50,
    cv=3,
    scoring="neg_mean_absolute_error",
    verbose=1, 
    random_state=SEED
)

In [24]:
gb_rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    5.3s finished


RandomizedSearchCV(cv=3,
                   estimator=GradientBoostingRegressor(max_depth=1,
                                                       random_state=15),
                   n_iter=50,
                   param_distributions={'learning_rate': array([0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45]),
                                        'min_samples_leaf': array([2, 3, 4, 5, 6, 7, 8, 9]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]),
                                        'n_estimators': array([ 40,  60,  80, 100, 120, 140, 160, 180, 200]),
                                        'subsample': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
                   random_state=15, scoring='neg_mean_absolute_error',
                   verbose=1)

In [25]:
gb_rs.best_params_

{'subsample': 0.7,
 'n_estimators': 40,
 'min_samples_split': 5,
 'min_samples_leaf': 6,
 'learning_rate': 0.15000000000000002}

In [26]:
best_estimator = gb_rs.best_estimator_

%time best_estimator.fit(X_train, y_train)

y_pred_train = best_estimator.predict(X_train)
y_pred_test = best_estimator.predict(X_test)


mae_gb = np.mean(abs(y_pred_test - y_test))
rmse_gb = np.sqrt(np.mean((y_pred_test - y_test) ** 2))
print('Using GB_RS, MAE is %0.2f' % mae_gb)  
print('Using GB_RS, RMSE is %0.2f'% rmse_gb)

CPU times: user 20.1 ms, sys: 2.77 ms, total: 22.9 ms
Wall time: 20.8 ms
Using GB_RS, MAE is 9.42
Using GB_RS, RMSE is 14.11
