A second notebook for experimenting with other models while the first ML notebook is busy with long-running computations.

Let's try RandomForest, GradientBoosting, XGBoost, Ridge, Lasso, and SVR **without** hyperparameter tuning (default parameters only).

In [None]:
import time

import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

def hyperparameter_tuning(X_train, y_train, X_test, y_test, models=None, scoring='R2', random_state=None):
    """Fit each candidate model as-is and return the best one on the test split.

    Despite the name, no hyperparameter search happens here: every estimator
    is wrapped in a ``RobustScaler`` -> model pipeline, fitted once on the
    training data and scored on the test data with ``evaluate_model``.

    Parameters
    ----------
    X_train, y_train : training features and target, passed to ``Pipeline.fit``.
    X_test, y_test : held-out features and target used for scoring.
    models : dict mapping a display name to an unfitted estimator, optional.
        When ``None``, a default set of six regressors is used
        (RandomForest, GradientBoosting, XGBoost, Ridge, Lasso, SVR).
    scoring : str, default ``'R2'``
        Key of the ``evaluate_model`` result used to rank the models.
        ``'RMSE'``, ``'MSE'`` and ``'MAE'`` are treated as errors (lower is
        better); any other metric is treated as higher-is-better.
    random_state : int or None, default ``None``
        Seed forwarded to the default RandomForest, GradientBoosting and
        XGBoost estimators. Ignored when ``models`` is supplied.

    Returns
    -------
    (best_model, all_scores)
        ``best_model`` is the fitted pipeline with the best ``scoring`` value
        (``None`` if ``models`` is empty); ``all_scores`` maps each model name
        to its metric dict.

    Raises
    ------
    KeyError
        If ``scoring`` is not a metric produced by ``evaluate_model``. This is
        raised before any model is fitted.
    """
    if models is None:
        models = {
            'RandomForest': RandomForestRegressor(n_jobs=-1, random_state=random_state),
            'GradientBoosting': GradientBoostingRegressor(random_state=random_state),
            'XGBoost': XGBRegressor(objective='reg:squarederror', eval_metric='rmse',
                                    n_jobs=-1, random_state=random_state),
            'Ridge': Ridge(),
            'Lasso': Lasso(),
            'SVR': SVR()
        }

    # Probe the metric names on a trivial input so that a mistyped `scoring`
    # fails immediately instead of after the first (possibly very slow) fit.
    known_metrics = evaluate_model([0.0, 1.0], [0.0, 1.0])
    if scoring not in known_metrics:
        raise KeyError(
            f'Unknown scoring {scoring!r}; expected one of {sorted(known_metrics)}'
        )

    # Error metrics improve as they shrink, so flip their sign and let a single
    # "greater is better" comparison serve every metric.
    error_metrics = {'RMSE', 'MSE', 'MAE'}
    direction = -1.0 if scoring in error_metrics else 1.0

    all_scores = {}
    best_model = None
    best_score = float('-inf')

    for model_name, model in models.items():
        pipeline = Pipeline([
            ('scaler', RobustScaler()),  # Handles outliers
            ('model', model)
        ])

        print(f'Tuning **{model_name}** with default parameters...')

        # perf_counter is monotonic, so the interval is immune to clock changes
        start_time = time.perf_counter()

        # We'll just fit the model directly
        pipeline.fit(X_train, y_train)

        elapsed_time = time.perf_counter() - start_time
        print(f'**{model_name}** completed in {elapsed_time:.2f} seconds.')

        y_pred = pipeline.predict(X_test)
        model_scores = evaluate_model(y_test, y_pred)

        all_scores[model_name] = model_scores

        # Print the scores for this model
        print(f'**{model_name} Performance**:')
        for metric, value in model_scores.items():
            print(f'{metric}: {value:.4f}')

        # Update the best model (comparison is direction-aware, see above)
        model_score = direction * model_scores[scoring]
        if model_score > best_score:
            best_score = model_score
            best_model = pipeline

    # Print model performance for all models

    for model_name, scores in all_scores.items():
        print(f'\n{model_name}:')
        for metric, value in scores.items():
            print(f'{metric}: {value:.4f}')

    return best_model, all_scores

def evaluate_model(y_true, y_pred):
    """Compute the regression metrics used to compare models.

    Returns a dict with the keys ``'RMSE'``, ``'MSE'``, ``'MAE'`` and ``'R2'``
    (in that order), where RMSE is the square root of the MSE.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    root_squared_error = squared_error ** 0.5
    absolute_error = mean_absolute_error(y_true, y_pred)
    determination = r2_score(y_true, y_pred)

    return {
        'RMSE': root_squared_error,
        'MSE': squared_error,
        'MAE': absolute_error,
        'R2': determination,
    }

In [2]:
import pandas as pd

# Load the pre-split features and targets from the shared data directory.
X_test, X_train, y_test, y_train = (
    pd.read_parquet('../shared_data/X_test.parquet'),
    pd.read_parquet('../shared_data/X_train.parquet'),
    pd.read_parquet('../shared_data/y_test.parquet'),
    pd.read_parquet('../shared_data/y_train.parquet'),
)

In [3]:
# Test without text: drop the raw 'text' column from the features and flatten
# the targets to 1-D arrays for the estimators.
#
# The names are reused (later cells read them), so the cell is written to be
# safely re-runnable: errors='ignore' tolerates an already-dropped column and
# np.ravel accepts both the loaded DataFrame and an already-flattened array.
X_test = X_test.drop(columns='text', errors='ignore')
X_train = X_train.drop(columns='text', errors='ignore')
y_test = np.ravel(y_test)
y_train = np.ravel(y_train)

In [4]:
# Fit every default model once and keep the best pipeline (ranked by R2, the
# function's default scoring) together with the metrics of all models.
best_model, all_scores = hyperparameter_tuning(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Tuning **RandomForest** with default parameters...
**RandomForest** completed in 154.99 seconds.
**RandomForest Performance**:
RMSE: 4.6717
MSE: 21.8250
MAE: 2.4674
R2: 0.9233
Tuning **GradientBoosting** with default parameters...
**GradientBoosting** completed in 238.82 seconds.
**GradientBoosting Performance**:
RMSE: 6.5913
MSE: 43.4459
MAE: 4.5210
R2: 0.8473
Tuning **XGBoost** with default parameters...
**XGBoost** completed in 3.29 seconds.
**XGBoost Performance**:
RMSE: 5.4509
MSE: 29.7118
MAE: 3.4387
R2: 0.8956
Tuning **Ridge** with default parameters...
**Ridge** completed in 0.79 seconds.
**Ridge Performance**:
RMSE: 12.8368
MSE: 164.7835
MAE: 9.2768
R2: 0.4209
Tuning **Lasso** with default parameters...
**Lasso** completed in 15.67 seconds.
**Lasso Performance**:
RMSE: 13.1216
MSE: 172.1764
MAE: 9.4923
R2: 0.3949
Tuning **SVR** with default parameters...
**SVR** completed in 2691.89 seconds.
**SVR Performance**:
RMSE: 12.2794
MSE: 150.7827
MAE: 7.6143
R2: 0.4701

RandomForest: