In [1]:
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from pathlib import Path
import numpy as np

from feature_engineering import codify_date_2, get_X_y, remove_outliers
from feature_engineering import add_lag_and_rolling_features
import utils


In [2]:
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

In [13]:
mdata = pd.read_parquet(Path("data") / "train.parquet")
mdata = mdata.sort_values("date")
mdata = codify_date_2(mdata)
mdata = remove_outliers(mdata)
mdata = mdata.drop(columns=["counter_id", "site_id", "site_name", 
                              "bike_count", "counter_installation_date", 
                              "coordinates", "counter_technical_id",
                              "latitude", "longitude", "date"])
y = mdata["log_bike_count"].values
X = mdata.drop(["log_bike_count"], axis=1).copy()
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,datetime,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot O-E,2020-09-01 01:00:00,2020,9,1,1,1,False,False
1,Face au 8 avenue de la porte de Charenton SE-NO,2020-09-01 01:00:00,2020,9,1,1,1,False,False
2,Face 104 rue d'Aubervilliers S-N,2020-09-01 01:00:00,2020,9,1,1,1,False,False
3,90 Rue De Sèvres NE-SO,2020-09-01 01:00:00,2020,9,1,1,1,False,False
4,Face au 4 avenue de la porte de Bagnolet E-O,2020-09-01 01:00:00,2020,9,1,1,1,False,False


In [57]:
from sklearn.metrics import root_mean_squared_error


def cross_validation(mdata, X, y, Model1, Model2):
    categorical_columns = ["counter_name", "year", "is_weekend", "IsHoliday", "month", "day", "day_of_week"]

    preprocessor1 = ColumnTransformer(
        transformers=[
            ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
            ("hour_sin", sin_transformer(24), ["hour"]),
            ("hour_cos", cos_transformer(24), ["hour"]),
        ],
    #    remainder=MinMaxScaler(),
    )
    pipeline1 = Pipeline([
        ("preprocessor", preprocessor1),
        ("regressor", Model1)
    ])

    preprocessor2= ColumnTransformer(
        transformers=[
            ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), ["counter_name"]),
            #("hour_sin", sin_transformer(24), ["hour"]),
            #("hour_cos", cos_transformer(24), ["hour"]),   
        ],
    #    remainder=MinMaxScaler(),
    )
    pipeline2 = Pipeline([
        ("preprocessor", preprocessor2),
        ("regressor", Model2)
    ])


    scores = []
    for train_index, test_index in utils.get_cv(X, y):
        test_index = sorted(test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        data_train = mdata.iloc[train_index]
        data_test = mdata.iloc[test_index]

        pipeline1 = pipeline1.fit(X_train.drop(columns="datetime"), y_train)
        static_predictions = pipeline1.predict(X_train.drop(columns="datetime"))

        data_train_static = data_train.copy()
        data_train_static = data_train_static.rename(columns={"log_bike_count": "log_bike_count1"})
        data_train_static.loc[:, "log_bike_count"] = static_predictions
        data_train_static = add_lag_and_rolling_features(data_train_static, ["rolling_mean_168h"])
        pipeline2 = pipeline2.fit(data_train_static.drop(columns=["datetime", "log_bike_count", "log_bike_count1"]), 
                                data_train_static["log_bike_count1"])

        static_predictions = pipeline1.predict(X_test.drop(columns="datetime"))
        data_test = data_test.rename(columns={"log_bike_count": "log_bike_count1"})
        data_test.loc[:, "log_bike_count"] = static_predictions.astype(np.float64)
        df = mdata.copy(deep=True)
        df = df.query(f"datetime >= '{data_test['datetime'].min() - pd.offsets.Week() - pd.offsets.Hour()}' and datetime < '{data_test['datetime'].min()}'")
        df = pd.concat([data_test, df], axis=0)

        data_test_static = add_lag_and_rolling_features(df, ["rolling_mean_168h"])
        final_precictions = pipeline2.predict(data_test_static.drop(columns=["datetime", "log_bike_count", "log_bike_count1"]))

        scores.append(root_mean_squared_error(data_test_static["log_bike_count1"], final_precictions))
    return scores

In [59]:
from sklearn.linear_model import Ridge


scores = np.array(cross_validation(mdata, X, y, XGBRegressor(), XGBRegressor()))


  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')


In [60]:
print(f"The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

The accuracy is: 1.51577 +- 0.149


In [54]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Define the objective function
def objective(trial):
    # Define the hyperparameter search space
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-5, 10, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 10, log=True),
    }

    # Create the model with current hyperparameters
    model = XGBRegressor(
        **param,
        tree_method="auto",  # Choose "gpu_hist" if using a GPU
    )

    scores = np.array(cross_validation(mdata, X, y, XGBRegressor(), model))

    return scores.mean()

# Create a study object and specify the optimization direction
study = optuna.create_study(direction="minimize")

# Run the optimization
study.optimize(objective, n_trials=50)

# Print the best trial
print("Best trial:")
print(f"  Value: {study.best_trial.value}")
print("  Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")


[I 2024-12-10 12:00:37,444] A new study created in memory with name: no-name-ab5e4cad-3c81-4f36-9ac2-9db1c83b2cb9
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
  data.groupby('counter_name')
[I 2024-12-10 12:01:21,238] Trial 0 finished with value: 1.6288235639853847 and parameters: {'n_estimators': 365, 'max_depth': 6, 'learning_rate': 0.0005021769557241886, 'subsample': 0.5945508451272905, 'colsample_bytree': 0.7996966497274098, 'gamma': 4.395763595028016, 'reg_alpha': 0.5428738349080394, 'reg_lambda': 0.0001067354527609093}. Best is trial 0 with value: 1.6288235639853847.


Best trial:
  Value: 1.5153360435078878
  Params: 
    n_estimators: 737
    max_depth: 3
    learning_rate: 0.29307998455151235
    subsample: 0.842816518050073
    colsample_bytree: 0.6701504597110362
    gamma: 9.651416356371687
    reg_alpha: 1.3143890757913108e-05
    reg_lambda: 1.0325636967770128e-05
