In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
import json
from sklearn.model_selection import ParameterSampler
from sklearn.model_selection import KFold
import numpy as np

In [2]:
df = pd.read_csv('../conso_data.csv')

In [3]:
X = df.drop(columns="pitd")
y = df['pitd']
num_variables = X.columns[X.dtypes == 'float64'].tolist()
cat_variables = X.columns[X.dtypes == 'category'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_variables),
        ('cat', 'passthrough', cat_variables)
    ])

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
columns = num_variables + cat_variables

X_train_processed = pd.DataFrame(X_train_transformed, columns=columns)
X_test_processed = pd.DataFrame(X_test_transformed, columns=columns)

In [7]:
import optuna
import numpy as np
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from optuna.integration import CatBoostPruningCallback

# Define the Optuna objective function with cross-validation
def objective(trial):
    # Suggest hyperparameters
    iterations = trial.suggest_int("iterations", 500, 3000, step=500)
    learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
    depth = trial.suggest_int("depth", 3, 10)
    l2_leaf_reg = trial.suggest_loguniform("l2_leaf_reg", 1.0, 10.0)
    border_count = trial.suggest_int("border_count", 32, 255)

    # Cross-validation settings
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mse_scores = []

    for train_index, val_index in kf.split(X_train_processed):
        X_train_fold = X_train_processed.iloc[train_index]
        X_val_fold = X_train_processed.iloc[val_index]
        y_train_fold = y_train.iloc[train_index]
        y_val_fold = y_train.iloc[val_index]

        # Define the model
        model = CatBoostRegressor(
            iterations=iterations,
            learning_rate=learning_rate,
            depth=depth,
            l2_leaf_reg=l2_leaf_reg,
            border_count=border_count,
            cat_features=cat_variables,
            verbose=0
        )

        # Add pruning callback
        pruning_callback = CatBoostPruningCallback(trial, "RMSE")

        # Train the model with early stopping
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=(X_val_fold, y_val_fold),
            early_stopping_rounds=50,
            verbose=0,
            callbacks=[pruning_callback]
        )

        # Predict and evaluate
        y_pred = model.predict(X_val_fold)
        mse = mean_squared_error(y_val_fold, y_pred)
        mse_scores.append(mse)

    return np.mean(mse_scores)  # Return the average MSE over all folds

# Set up Optuna study with database storage to save progress
study = optuna.create_study(
    study_name="catboost_tuning",
    storage="sqlite:///optuna.db",  # Save results to a database file
    load_if_exists=True,             # Resume previous tuning if available
    direction="minimize"             # Minimize MSE
)

# Run optimization sequentially (n_jobs=1)
study.optimize(objective, n_trials=100, n_jobs=1)

# Print best hyperparameters
print("Best Hyperparameters:", study.best_params)

# Train final model using the best hyperparameters
best_params = study.best_params
best_model = CatBoostRegressor(
    **best_params, cat_features=cat_variables, verbose=100
)
best_model.fit(X_train_processed, y_train, eval_set=(X_test_processed, y_test), early_stopping_rounds=50)

# Save best model
best_model.save_model("best_catboost_model.cbm")

[I 2025-03-07 17:19:14,246] A new study created in RDB with name: catboost_tuning
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
  l2_leaf_reg = trial.suggest_loguniform("l2_leaf_reg", 1.0, 10.0)
  pruning_callback = CatBoostPruningCallback(trial, "RMSE")
  pruning_callback = CatBoostPruningCallback(trial, "RMSE")
  pruning_callback = CatBoostPruningCallback(trial, "RMSE")
  pruning_callback = CatBoostPruningCallback(trial, "RMSE")
  pruning_callback = CatBoostPruningCallback(trial, "RMSE")
[I 2025-03-07 17:24:43,455] Trial 0 finished with value: 188487595.2678563 and parameters: {'iterations': 1000, 'learning_rate': 0.2381502411439463, 'depth': 5, 'l2_leaf_reg': 1.9636248613225071, 'border_count': 129}. Best is trial 0 with value: 188487595.2678563.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
  l2_leaf_reg = trial.suggest_loguniform("l2_leaf_reg", 1.0, 10.0)
  pruning_callback = CatBoostPruningCallback(trial, "RMSE")
  pruning_callbac

Best Hyperparameters: {'iterations': 1500, 'learning_rate': 0.28783571318083007, 'depth': 3, 'l2_leaf_reg': 1.5944034968564198, 'border_count': 121}
0:	learn: 175956.5042510	test: 172562.1453777	best: 172562.1453777 (0)	total: 1.15ms	remaining: 1.72s
100:	learn: 12352.4864825	test: 17454.5430721	best: 17454.5430721 (100)	total: 137ms	remaining: 1.9s
200:	learn: 7527.9339801	test: 14665.7152746	best: 14665.7152746 (200)	total: 311ms	remaining: 2.01s
300:	learn: 5450.4932236	test: 13176.3645146	best: 13176.3645146 (300)	total: 479ms	remaining: 1.91s
400:	learn: 4249.6213483	test: 12614.4653281	best: 12612.8935189 (392)	total: 698ms	remaining: 1.91s
500:	learn: 3466.3217780	test: 12251.7067062	best: 12251.7067062 (500)	total: 812ms	remaining: 1.62s
600:	learn: 2860.3458057	test: 12033.8679099	best: 12032.0511906 (599)	total: 975ms	remaining: 1.46s
700:	learn: 2419.2405948	test: 11883.8672995	best: 11860.5306575 (674)	total: 1.18s	remaining: 1.34s
Stopped by overfitting detector  (50 itera

In [None]:
study = optuna.load_study(storage="/home/onyxia/work/statapp/notebooks/optuna.db", study_name="my_study")

# Get the best trial
best_trial = study.best_trial

print(f"Best Hyperparameters: {best_trial.params}")
print(f"Best Value (Objective Function): {best_trial.value}")


ArgumentError: Could not parse SQLAlchemy URL from string '/home/onyxia/work/statapp/notebooks/optuna.db'