In [31]:
import optuna
import optunahub
import pandas as pd
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error, mean_absolute_error

In [32]:
df = pd.read_parquet('../data/data_with_meteo.parquet')
sf = df.dropna(inplace=True)

In [33]:
df['duedate'] = pd.to_datetime(df['duedate'])
df['hour'] = df['duedate'].dt.hour
df['dayofweek'] = df['duedate'].dt.dayofweek  # 0=lundi
df['is_weekend'] = df['dayofweek'] >= 5


In [34]:
y_columns = ['numdocksavailable', 'numbikesavailable']
X_columns = df.columns.difference(y_columns + ['coordonnees_geo', 'duedate'])
X = df[X_columns]
y = df['numdocksavailable']

split_date = "2025-12-20"

# Features
X_train = df.loc[df["duedate"] < split_date, X_columns]
X_test  = df.loc[df["duedate"] >= split_date, X_columns]

# Target
y_train = df.loc[df["duedate"] < split_date, y_columns]
y_test  = df.loc[df["duedate"] >= split_date, y_columns]

In [35]:
s = (X_train.dtypes == 'object')
cat_cols = list(s[s].index)
cat_cols

['code_insee_commune',
 'is_installed',
 'is_renting',
 'is_returning',
 'name',
 'nom_arrondissement_communes',
 'stationcode']

In [43]:
def objective(trial):

    params = {
        "loss_function": "MultiRMSE",
        "iterations": trial.suggest_int("iterations", 50, 1000, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 50.0, log=True),

        # Catégorielles
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),
        "max_ctr_complexity": trial.suggest_int("max_ctr_complexity", 2, 4),

        # Stabilité / généralisation
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 2.0),

        # Contrôle
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 10, 100, log=True),
        "task_type": "CPU",
        "random_seed": 42,
        "verbose": False,
        "cat_features": cat_cols,
    }

    model = CatBoostRegressor(**params)

    model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        use_best_model=True
    )

    # Récupérer la meilleure métrique
    score = model.get_best_score()["validation"]["MultiRMSE"]

    return score

In [44]:
module = optunahub.load_module(package="samplers/auto_sampler")
study = optuna.create_study(sampler=module.AutoSampler())
study.optimize(objective, n_trials=5)

[I 2025-12-25 11:08:21,021] A new study created in memory with name: no-name-a1574c36-9e69-4fbe-83c7-485b0e369246
[I 2025-12-25 11:09:03,291] Trial 0 finished with value: 2.0795927917628796 and parameters: {'iterations': 318, 'learning_rate': 0.012254639085016013, 'depth': 8, 'l2_leaf_reg': 6.649586756408236, 'one_hot_max_size': 7, 'max_ctr_complexity': 2, 'bagging_temperature': 0.258459456482101, 'random_strength': 0.8985987221421012, 'early_stopping_rounds': 36}. Best is trial 0 with value: 2.0795927917628796.
  return GPSampler(seed=seed, constraints_func=self._constraints_func)
[I 2025-12-25 11:09:48,376] Trial 1 finished with value: 1.796334691983261 and parameters: {'iterations': 428, 'learning_rate': 0.013565150377299173, 'depth': 7, 'l2_leaf_reg': 2.190246025788642, 'one_hot_max_size': 7, 'max_ctr_complexity': 3, 'bagging_temperature': 0.1608975442994829, 'random_strength': 1.649980028583801, 'early_stopping_rounds': 17}. Best is trial 1 with value: 1.796334691983261.
[I 2025-1

In [45]:
print("Best score:", study.best_value)
print("Best params:")
for k, v in study.best_params.items():
    print(f"{k}: {v}")

Best score: 1.3485655737397138
Best params:
iterations: 512
learning_rate: 0.0462042722670204
depth: 10
l2_leaf_reg: 42.887567317122326
one_hot_max_size: 3
max_ctr_complexity: 3
bagging_temperature: 0.7340014754086452
random_strength: 0.08619479333885005
early_stopping_rounds: 27


In [46]:
best_params = study.best_params

model = CatBoostRegressor(
    **best_params,
    loss_function="MultiRMSE",
    task_type="CPU",
    random_seed=42,
    cat_features=cat_cols,
    verbose=100
)

model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    use_best_model=True
)


0:	learn: 16.1809899	test: 15.2749015	best: 15.2749015 (0)	total: 135ms	remaining: 1m 8s
100:	learn: 1.8478339	test: 1.8713008	best: 1.8713008 (100)	total: 15.7s	remaining: 1m 3s
200:	learn: 1.4784990	test: 1.5416670	best: 1.5416670 (200)	total: 31.2s	remaining: 48.2s
300:	learn: 1.3732896	test: 1.4462177	best: 1.4462177 (300)	total: 47.4s	remaining: 33.2s
400:	learn: 1.3010673	test: 1.3832333	best: 1.3832333 (400)	total: 1m 3s	remaining: 17.5s
500:	learn: 1.2601071	test: 1.3509160	best: 1.3509160 (500)	total: 1m 19s	remaining: 1.74s
511:	learn: 1.2559773	test: 1.3485656	best: 1.3485656 (511)	total: 1m 21s	remaining: 0us

bestTest = 1.348565574
bestIteration = 511



<catboost.core.CatBoostRegressor at 0x797830fc4aa0>

In [47]:
y_pred = model.predict(X_test)

print(f'R2 score: {r2_score(y_test,y_pred)}')
print(f'Mean squared error: {mean_squared_error(y_test,y_pred)}')
print(f'Mean absolute error: {mean_absolute_error(y_test,y_pred)}')

R2 score: 0.9932818080221513
Mean squared error: 0.9093145533379611
Mean absolute error: 0.5373014064316313


In [130]:
y_pred

array([[23.29901027, 10.96546772],
       [12.78285299, 14.30435245],
       [18.51355706,  1.98060573],
       ...,
       [49.46421904, 15.88727677],
       [ 1.17094155, 30.79977521],
       [30.05172159,  3.95798715]], shape=(63572, 2))