In [35]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os
import pickle

In [36]:
df = pd.read_csv('cleaned_dataset.csv')
df.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,8.8e-05,3.5e-05,3.5e-05,73,0.024289,0,0.628394,-0.721328,1,0.2988,0,0.551483,-0.84923,-0.503538,0.758094,0.928984,-1.142994,4,0.008807
1,0.000114,1.8e-05,2.6e-05,55,-0.737425,0,-0.847891,-1.896382,1,-1.794228,1,-0.079182,1.837684,-0.503524,-0.591264,-0.799395,-1.490909,4,0.008807
2,9e-06,9e-06,9e-06,57,-0.162155,0,-0.744089,-1.127618,0,-0.29744,1,-0.27396,-0.313535,-0.503542,-0.507257,-1.36652,-1.52951,4,0.008807
3,0.000132,9e-06,5.3e-05,71,-0.245726,0,-1.735968,-2.320198,0,-2.049645,1,-0.457392,1.780439,-0.503313,-0.428501,-1.277786,1.987275,3,0.008807
4,9.7e-05,5.3e-05,0.000176,82,-0.27467,0,0.293923,-0.793026,2,-0.286864,1,-0.303272,0.466809,-0.503542,-0.686296,-1.185194,-0.074292,4,0.008807


In [37]:
RANDOM_STATE = 2024

In [38]:
df = df.head(1000)

In [39]:
# Define features (X) and target (y)
X = df.drop('popularity', axis=1)
y = df['popularity']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# Check the shape of the splits
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((800, 18), (800,), (200, 18), (200,))

In [40]:
# Define models and their parameter grids for GridSearchCV
models_param_grid = {
    'polynomial_regression': {
        'model': Pipeline([('poly', PolynomialFeatures()), ('linear', LinearRegression())]),
        'params': {
            'poly__degree': [2, 3, 4],
            'linear__fit_intercept': [True, False]
        }
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [5, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        }
    },
    'random_forest': {
        'model': RandomForestRegressor(),
        'params': {
            'max_depth': [5, 10, 20, 30],
            'n_estimators': [100, 200, 300],
            'min_samples_split': [5, 10]
        }
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5]
        }
    },
    'xgb_regressor': {
        'model': XGBRegressor(),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5]
        }
    },
    'xgbrf_regressor': {
        'model': XGBRFRegressor(),
        'params': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5]
        }
    }
}

# Function to run GridSearchCV for each model
def run_grid_search(models_param_grid, X_train, y_train):
    best_estimators = {}
    for model_name, model_info in models_param_grid.items():
        if os.path.exists(f'pickled_models/{model_name}.pkl'):
            with open(f'pickled_models/{model_name}.pkl', 'rb') as f:
                grid_search = pickle.load(f)
        else:   
            grid_search = GridSearchCV(model_info['model'], model_info['params'], cv=3, scoring='neg_mean_squared_error')
            grid_search.fit(X_train, y_train)
            
        best_estimators[model_name] = grid_search.best_estimator_
        
        print(f"{model_name}:")
        print(f"        Best parameters: {grid_search.best_params_}")
        print(f"        Best score: {grid_search.best_score_}")
        y_pred = grid_search.best_estimator_.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"        MSE: {mse}")
        print(f"        R2: {r2}")
        print()
        # pickle the best_estimator in pickled_models folder
        with open(f'pickled_models/{model_name}.pkl', 'wb') as f:
            pickle.dump(grid_search, f)
    return best_estimators

# Run GridSearchCV for each model
best_estimators = run_grid_search(models_param_grid, X_train, y_train)


polynomial_regression:
        Best parameters: {'linear__fit_intercept': False, 'poly__degree': 2}
        Best score: -629.7533560628953
        MSE: 718.6837939412774
        R2: -1.5105840040762621

decision_tree:
        Best parameters: {'max_depth': 5, 'min_samples_split': 10}
        Best score: -187.96431507833668
        MSE: 199.0357070016624
        R2: 0.3047069289011787

random_forest:
        Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
        Best score: -151.9455026988118
        MSE: 174.77922431983134
        R2: 0.3894422992122195

gradient_boosting:
        Best parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300}
        Best score: -146.74341745144193
        MSE: 167.7745392230559
        R2: 0.4139118232307236

xgb_regressor:
        Best parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300}
        Best score: -153.89118193680815
        MSE: 159.79076218108142
        R2: 0.44180161718

In [41]:
results = {}
for model_name, model in best_estimators.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[model_name] = {'MSE': mse, 'R2': r2}
    
results = pd.DataFrame(results)
results

Unnamed: 0,polynomial_regression,decision_tree,random_forest,gradient_boosting,xgb_regressor,xgbrf_regressor
MSE,718.683794,199.035707,174.779224,167.774539,159.790762,243.493348
R2,-1.510584,0.304707,0.389442,0.413912,0.441802,0.149403


In [42]:
raise "Stop Here"

TypeError: exceptions must derive from BaseException

In [None]:
# Initialize models with default parameters
models = {
    "Polynomial Regression": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    "XGBRFRegressor": XGBRFRegressor(random_state=RANDOM_STATE)
}

# Function to train and evaluate a model
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return mse, r2

# Train each model and collect their performance metrics
results = {}

for name, model in models.items():
    mse, r2 = train_and_evaluate(model, X_train, X_test, y_train, y_test)
    results[name] = {"MSE": mse, "R2": r2}

# Convert results to a DataFrame
results = pd.DataFrame(results)
results