In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.stats import zscore
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('Spotify_Youtube.csv')

# Handling missing values (example: fill with mean or remove)
df.fillna(df.mean(numeric_only=True), inplace=True)  # or df.dropna(inplace=True)

# Encoding categorical variables if necessary
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('Views', axis=1))

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(scaled_features, df['Views'], test_size=0.2, random_state=42)

# Initialize the regressors
lgbm = LGBMRegressor()
xgb = XGBRegressor()
catboost = CatBoostRegressor(verbose=0)  # 'verbose=0' to prevent a lot of output

# Fit the models
lgbm.fit(X_train, y_train)
xgb.fit(X_train, y_train)
catboost.fit(X_train, y_train)

# Make predictions
predictions_lgbm = lgbm.predict(X_test)
predictions_xgb = xgb.predict(X_test)
predictions_catboost = catboost.predict(X_test)

# Evaluate the models
mae_lgbm = mean_absolute_error(y_test, predictions_lgbm)
mse_lgbm = mean_squared_error(y_test, predictions_lgbm)
r2_lgbm = r2_score(y_test, predictions_lgbm)

mae_xgb = mean_absolute_error(y_test, predictions_xgb)
mse_xgb = mean_squared_error(y_test, predictions_xgb)
r2_xgb = r2_score(y_test, predictions_xgb)

mae_catboost = mean_absolute_error(y_test, predictions_catboost)
mse_catboost = mean_squared_error(y_test, predictions_catboost)
r2_catboost = r2_score(y_test, predictions_catboost)

print('\n\n')
print('LightGBM:')
print(f'MAE: {mae_lgbm}')
print(f'MSE: {mse_lgbm}')
print(f'R-squared: {r2_lgbm}')
print('------')
print('XGBoost:')
print(f'MAE: {mae_xgb}')
print(f'MSE: {mse_xgb}')
print(f'R-squared: {r2_xgb}')
print('------')
print('CatBoost:')
print(f'MAE: {mae_catboost}')
print(f'MSE: {mse_catboost}')
print(f'R-squared: {r2_catboost}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5635
[LightGBM] [Info] Number of data points in the train set: 16574, number of used features: 26
[LightGBM] [Info] Start training from score 94293782.244298



LightGBM:
MAE: 31058726.478229642
MSE: 1.1835968473116102e+16
R-squared: 0.8444874583419679
------
XGBoost:
MAE: 29374092.933929443
MSE: 9342679836589874.0
R-squared: 0.877246725471986
------
CatBoost:
MAE: 28376951.281372547
MSE: 8044664020216955.0
R-squared: 0.8943013280737903
