In [10]:
# Load parquets

import pandas as pd

X_test = pd.read_parquet('../shared_data/X_test.parquet')
X_train = pd.read_parquet('../shared_data/X_train.parquet')
y_test = pd.read_parquet('../shared_data/y_test.parquet')
y_train = pd.read_parquet('../shared_data/y_train.parquet')

In [None]:
# Evaluations

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_test, y_test):

    y_pred = model.predict(X_test)

    metrics = {
        'MSE': (mse := mean_squared_error(y_test, y_pred)),
        'RMSE': mse ** 0.5,
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }

    for metric, value in metrics.items():
        print(f'{metric}: {value:.4f}')

    return metrics

In [12]:
# Robust scaler

from sklearn.preprocessing import RobustScaler

def robust_scale_dataframe(df, fitted_scaler=None):

    df = df.copy()  # Avoid modifying the original DataFrame
    numerical_cols = df.select_dtypes(include=['number']).columns

    if len(numerical_cols) == 0:
        raise ValueError('No numerical columns found in the DataFrame.')

    if fitted_scaler is None:
        scaler = RobustScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    else:
        df[numerical_cols] = fitted_scaler.transform(df[numerical_cols])
        scaler = fitted_scaler

    return df, scaler

X_train_scaled, scaler = robust_scale_dataframe(X_train)
X_test_scaled, _ = robust_scale_dataframe(X_test, scaler)

In [13]:
# Remove text column, flatten ys

X_train_textless, X_test_textless = X_train.drop(columns='text'), X_test.drop(columns='text')
y_train, y_test = y_train.values.ravel(), y_test.values.ravel()

In [None]:
# Linear regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def train_linear_regression(X_train, y_train, param_grid=None, search_type='grid', cv=5, n_iter=10):

    model = LinearRegression()

    if param_grid:
        if search_type == 'grid':
            search = GridSearchCV(model, param_grid, cv=cv, scoring='neg_mean_squared_error')
        elif search_type == 'random':
            search = RandomizedSearchCV(model, param_grid, n_iter=n_iter, cv=cv, scoring='neg_mean_squared_error', random_state=42)
        else:
            raise ValueError('Invalid search_type. Choose "grid" or "random".')

        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        print(f'Best Parameters: {search.best_params_}')
    else:
        best_model = model.fit(X_train, y_train)

    return best_model

lr_model = train_linear_regression(X_train_scaled.drop(columns='text'), y_train)
evaluate_model(lr_model, X_test_scaled.drop(columns='text'), y_test)

MSE: 164.7835
RMSE: 12.8368
MAE: 9.2768
R2: 0.4209


{'MSE': 164.78348245159992,
 'RMSE': 12.836801877866618,
 'MAE': 9.276756727230671,
 'R2': 0.4208518809199885}

This will take too long. One big function for several different models:

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from xgboost import XGBRegressor
import time

def hyperparameter_tuning(X_train,
                          y_train,
                          X_test,
                          y_test,
                          models=None,
                          search_type='halving',
                          n_iter=20,
                          cv=5,
                          scoring='r2',
                          verbose=1):
    '''
    Args:
        X_train, y_train: Training data
        X_test, y_test: Test data
        models: Dictionary of models with hyperparameter grids (default = common regressors + XGBoost)
        search_type: 'grid', 'random', or 'halving' (default = 'halving' [experimental])
        n_iter: Number of iterations (only for RandomizedSearchCV)
        cv: Number of cross-validations
        scoring: Metric to optimize (default = 'r2')
        verbose: Verbosity

    Returns:
        best_model: The best model found
        best_params: Best hyperparameters
        all_scores: A dictionary of performance metrics for all models
    '''

    if models is None:
        models = {
            'RandomForest': {
                'model': RandomForestRegressor(n_jobs=-1),
                'params': {
                    'n_estimators': [100, 300, 500],
                    'max_depth': [None, 10, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'bootstrap': [True, False]
                }
            },
            'GradientBoosting': {
                'model': GradientBoostingRegressor(),
                'params': {
                    'n_estimators': [100, 300, 500],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 10]
                }
            },
            'XGBoost': {  # Optimized for speed
                'model': XGBRegressor(objective='reg:squarederror', eval_metric='rmse', n_jobs=-1),
                'params': {
                    'n_estimators': [100, 300, 500],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 10],
                    'subsample': [0.6, 0.8, 1.0],
                    'colsample_bytree': [0.6, 0.8, 1.0]
                }
            },
            'Ridge': {
                'model': Ridge(),
                'params': {'alpha': [0.1, 1, 10, 100]}
            },
            'Lasso': {
                'model': Lasso(),
                'params': {'alpha': [0.001, 0.01, 0.1, 1, 10]}
            },
            'SVR': {
                'model': SVR(),
                'params': {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly']}
            }
        }

    all_scores = {}
    best_model = None
    best_score = float('-inf')
    best_params = None

    for model_name, config in models.items():
        model = config['model']
        param_grid = config['params']

        pipeline = Pipeline([
            ('scaler', RobustScaler()),  # Handles outliers
            ('model', model)
        ])

        print(f'Tuning **{model_name}** using **{search_type.upper()} Search**...')

        search = None
        start_time = time.time()

        if search_type == 'grid':
            search = GridSearchCV(pipeline, {'model__' + k: v for k, v in param_grid.items()},
                                  cv=cv, scoring=scoring, n_jobs=-1, verbose=verbose, refit=False)
        elif search_type == 'random':
            search = RandomizedSearchCV(pipeline, {'model__' + k: v for k, v in param_grid.items()},
                                        n_iter=n_iter, cv=cv, scoring=scoring, n_jobs=-1,
                                        random_state=42, verbose=verbose, refit=False)
        elif search_type == 'halving':
            search = HalvingGridSearchCV(pipeline, {'model__' + k: v for k, v in param_grid.items()},
                                         factor=2, cv=cv, scoring=scoring, n_jobs=-1, verbose=verbose, refit=False)
        else:
            raise ValueError('Invalid search_type. Choose from "grid", "random", or "halving".')

        search.fit(X_train, y_train)

        elapsed_time = time.time() - start_time
        print(f'**{model_name}** tuning completed in {elapsed_time:.2f} seconds.')

        # Evaluate and store the performance metrics
        y_pred = search.predict(X_test)
        model_scores = evaluate_model(y_test, y_pred)

        # Store the scores for the current model
        all_scores[model_name] = model_scores

        # Print the scores for this model
        print(f'**{model_name} Performance**:')
        for metric, value in model_scores.items():
            print(f'{metric}: {value:.4f}')

        # Update the best model
        if search.best_score_ > best_score:
            best_score = search.best_score_
            best_model = search.best_estimator_
            best_params = search.best_params_

        print(f'**Best Params for {model_name}**: {search.best_params_}\n')

    print('\n**Best Model Selected:**', best_model)
    print(f'Best Params: {best_params}')

    # Print model performance for all models
    for model_name, scores in all_scores.items():
        print(f'\n{model_name}:')
        for metric, value in scores.items():
            print(f'{metric}: {value:.4f}')

    return best_model, best_params, all_scores

def evaluate_model(y_true, y_pred):
    scores = {
        'RMSE': mean_squared_error(y_true, y_pred) ** 0.5,
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred)
    }
    return scores

In [16]:
# Try halving for speed

best_model, best_params, scores = hyperparameter_tuning(X_train_textless, y_train, X_test_textless, y_test, search_type='halving', n_iter=10)

Tuning **RandomForest** using **HALVING Search**...
n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 1277
max_resources_: 163516
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 162
n_resources: 1277
Fitting 5 folds for each of 162 candidates, totalling 810 fits
----------
iter: 1
n_candidates: 81
n_resources: 2554
Fitting 5 folds for each of 81 candidates, totalling 405 fits
----------
iter: 2
n_candidates: 41
n_resources: 5108
Fitting 5 folds for each of 41 candidates, totalling 205 fits
----------
iter: 3
n_candidates: 21
n_resources: 10216
Fitting 5 folds for each of 21 candidates, totalling 105 fits
----------
iter: 4
n_candidates: 11
n_resources: 20432
Fitting 5 folds for each of 11 candidates, totalling 55 fits




----------
iter: 5
n_candidates: 6
n_resources: 40864
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 6
n_candidates: 3
n_resources: 81728
Fitting 5 folds for each of 3 candidates, totalling 15 fits
----------
iter: 7
n_candidates: 2
n_resources: 163456
Fitting 5 folds for each of 2 candidates, totalling 10 fits
**RandomForest** tuning completed in 3028.01 seconds.


AttributeError: This 'HalvingGridSearchCV' has no attribute 'predict'