In [None]:
# Preprocessing
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor, Pool
from timeit import default_timer as timer
import optuna

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the "bmh" matplotlib style sheet to every figure.
plt.style.use("bmh")

# Data directory: the "mdsb-2023" folder inside the parent of the current
# working directory. The data files read by this notebook are looked up here.
path = Path.cwd().parent / "mdsb-2023"

In [None]:
def train_test_split_temporal(X, y, delta_threshold="60 days"):

    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = X["date"] <= cutoff_date
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid

In [None]:
def add_lags(X, cols_to_lag=["t", "u", "vv", "nnuage4"], lag_list=[2, -24, -2]):
    """Append row-shifted copies of selected columns.

    For every lag ``l`` in ``lag_list`` and every column of ``X`` that is listed
    in ``cols_to_lag``, a new column named ``"<col>_lag<l>"`` is added. It holds
    the source column shifted by ``l`` rows: a positive ``l`` moves values down
    (each row sees an earlier row), a negative ``l`` moves them up.

    The NaNs opened by the shift are filled by linear interpolation, then
    backward fill, then forward fill.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is copied, never modified.
    cols_to_lag : list of str
        Candidate columns; names absent from ``X`` are skipped. Only read.
    lag_list : list of int
        Row offsets passed to ``DataFrame.shift``. Only read.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the lag columns appended.
    """
    X = X.copy()

    feature_columns = [col for col in X.columns if col in cols_to_lag]
    if not feature_columns:
        # Nothing to lag: avoid calling interpolate on a column-less frame.
        return X

    for lag in lag_list:
        lag_columns = [f"{col}_lag{lag}" for col in feature_columns]
        # .bfill()/.ffill() replace interpolate(method="bfill"/"ffill"),
        # which pandas has deprecated.
        X[lag_columns] = (
            X[feature_columns]
            .shift(periods=lag, axis=0)
            .interpolate(method="linear")
            .bfill()
            .ffill()
        )

    return X


def add_moving_average(
    X, cols_to_ma=["t", "u", "vv", "nnuage4"], window_list=[24 * 7, 24], centered=True
):
    """Append rolling-mean versions of selected columns.

    For every window size ``w`` in ``window_list`` and every column of ``X``
    that is listed in ``cols_to_ma``, a new column named ``"<col>_ma<w>"`` is
    added holding the rolling mean over ``w`` rows.

    Rows without a full window come out as NaN; they are filled by linear
    interpolation, then backward fill, then forward fill.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is copied, never modified.
    cols_to_ma : list of str
        Candidate columns; names absent from ``X`` are skipped. Only read.
    window_list : list of int
        Window sizes, in rows. Only read.
    centered : bool
        Passed as ``center`` to ``DataFrame.rolling``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the moving-average columns appended.
    """
    X = X.copy()

    feature_columns = [col for col in X.columns if col in cols_to_ma]
    if not feature_columns:
        # Nothing to average: avoid calling interpolate on a column-less frame.
        return X

    for window in window_list:
        ma_columns = [f"{col}_ma{window}" for col in feature_columns]
        # .bfill()/.ffill() replace interpolate(method="bfill"/"ffill"),
        # which pandas has deprecated.
        X[ma_columns] = (
            X[feature_columns]
            .rolling(window=window, center=centered)
            .mean()
            .interpolate(method="linear")
            .bfill()
            .ffill()
        )

    return X

### Define pipeline functions

In [None]:
def _encode_dates(X, col_name="date"):
    X = X.copy()

    X["month"] = X[col_name].dt.month
    X["weekday"] = X[col_name].dt.weekday
    X["hour"] = X[col_name].dt.hour

    X["month_sin"] = np.sin(2 * np.pi * X["date"].dt.month / 12)
    X["month_cos"] = np.cos(2 * np.pi * X["date"].dt.month / 12)

    X["day_sin"] = np.sin(2 * np.pi * X["date"].dt.day / X["date"].dt.days_in_month)
    X["day_cos"] = np.cos(2 * np.pi * X["date"].dt.day / X["date"].dt.days_in_month)

    X["hour_sin"] = np.sin(2 * np.pi * X["date"].dt.hour / 24)
    X["hour_cos"] = np.cos(2 * np.pi * X["date"].dt.hour / 24)

    X[["month", "weekday", "hour"]] = X[["month", "weekday", "hour"]].astype("category")

    return X.drop(columns=[col_name])


def _encode_covid(X, col_name="date"):
    X = X.copy()

    # Create masks for lockdown dates
    lockdown_1 = (X["date"] >= "2020-10-17") & (X["date"] <= "2020-12-14")

    lockdown_2 = (X["date"] >= "2020-12-15") & (X["date"] <= "2021-02-26")

    lockdown_3 = (X["date"] >= "2021-02-27") & (X["date"] <= "2021-05-02")

    X["Covid"] = 0
    X.loc[lockdown_1 | lockdown_2 | lockdown_3, "Covid"] = 1

    return X


def _merge_external_data(X, include_lags=True, include_ma=True):
    """Attach hourly external covariates from ``external_data.csv`` to ``X``.

    Steps:

    1. Read ``external_data.csv`` from the notebook-level ``path`` and keep the
       columns listed in ``to_keep``; columns with more than 1000 missing
       values are discarded.
    2. Re-index the external data on an hourly grid spanning the date range of
       the notebook-level ``data`` and ``test`` frames, and fill the gaps with
       an order-3 polynomial interpolation followed by backward/forward fill.
    3. Optionally add lagged and moving-average columns.
    4. Drop the columns listed in ``to_drop``.
    5. As-of merge onto ``X`` by ``date`` (pandas' default backward direction),
       then restore the original row order and index of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``date`` column; it is copied, never modified.
    include_lags : bool
        Whether to call ``add_lags`` on the external data.
    include_ma : bool
        Whether to call ``add_moving_average`` on the external data.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the external columns appended, in the same row order and
        with the same index as the input.
    """
    to_keep = [
        "date",
        "hnuage4",
        "t",
        "ctype4",
        "nnuage4",
        "u",
        "etat_sol",
        "perssfrai",
        "tx12",
        "cm",
        "tn12",
        "tend24",
        "vv",
        "rafper",
        "rr24",
        "hnuage2",
        "td",
        "rr3",
        "hnuage3",
        "hnuage1",
    ]

    ext_data = pd.read_csv(path / "external_data.csv", parse_dates=["date"])[to_keep]

    # Discard columns that are missing too often to be interpolated sensibly.
    too_sparse = ext_data.columns[ext_data.isna().sum() > 1000]
    ext_data = ext_data.drop(columns=too_sparse)

    # Hourly grid covering both the training and the test period. A Timedelta
    # is used for the frequency because the "H" string alias is deprecated.
    full_date_range = pd.date_range(
        start=min(data["date"].min(), test["date"].min()),
        end=max(data["date"].max(), test["date"].max()),
        freq=pd.Timedelta(hours=1),
    )

    full_date_range = pd.DataFrame({"date": full_date_range})

    ext_data = full_date_range.merge(ext_data, on="date", how="left")

    columns_to_interpolate = ext_data.drop(columns="date").columns
    # .bfill()/.ffill() replace the deprecated interpolate(method="bfill"/"ffill")
    # and fill whatever the polynomial interpolation leaves at the edges.
    ext_data[columns_to_interpolate] = (
        ext_data[columns_to_interpolate]
        .interpolate(method="polynomial", order=3)
        .bfill()
        .ffill()
    )

    if include_lags:
        ext_data = add_lags(ext_data)

    if include_ma:
        ext_data = add_moving_average(ext_data)

    to_drop = [
        "vv_ma24",
        "rr24",
        "t_lag2",
        "rafper",
        "hnuage1",
        "td",
        "vv",
        "perssfrai",
        "vv_lag2",
        "u_lag-24",
        "vv_lag-2",
        "vv_lag-24",
        "u",
        "u_lag2",
    ]

    # Only drop what actually exists: lag / moving-average columns are absent
    # when include_lags / include_ma is False, and a sparse raw column may
    # already have been removed above.
    ext_data = ext_data.drop(columns=[col for col in to_drop if col in ext_data.columns])

    X = X.copy()
    original_index = X.index

    # merge_asof requires both keys to share the same dtype.
    X["date"] = X["date"].astype("datetime64[ns]")
    ext_data["date"] = ext_data["date"].astype("datetime64[ns]")

    # merge_asof needs sorted keys; remember the original row positions.
    X["orig_index"] = np.arange(X.shape[0])

    X = pd.merge_asof(X.sort_values("date"), ext_data.sort_values("date"), on="date")

    # Sort back to the original order and restore the caller's index, which
    # merge_asof replaced with a fresh RangeIndex.
    X = X.sort_values("orig_index").drop(columns="orig_index")
    X.index = original_index

    return X


def _gas_price_encoder(X):
    X = X.copy()
    X["gas_price"] = 1

    gas_prices = np.array(
        [
            1.22,
            1.21,
            1.22,
            1.27,
            1.31,
            1.36,
            1.4,
            1.39,
            1.4,
            1.43,
            1.45,
            1.45,
            1.46,
            1.56,
        ]
    )

    years = [
        2020,
        2020,
        2020,
        2020,
        2021,
        2021,
        2021,
        2021,
        2021,
        2021,
        2021,
        2021,
        2021,
        2021,
    ]

    months = [9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    for i, price in enumerate(gas_prices):
        X.loc[
            (X.date.dt.month == months[i]) & (X.date.dt.year == years[i]), "gas_price"
        ] = price

    return X


def full_encode(X):
    """Run the whole feature-engineering chain on ``X`` and return the result.

    The steps are applied in this order: external-data merge, gas price,
    Covid flag, date encoding.
    """
    merged = _merge_external_data(X)
    with_gas_price = _gas_price_encoder(merged)
    with_covid_flag = _encode_covid(with_gas_price)
    return _encode_dates(with_covid_flag)

## Import main dataset

In [None]:
# Load the training frame and the final test frame from the data directory.
data = pd.read_parquet(path / "train.parquet")
test = pd.read_parquet(path / "final_test.parquet")

# Names of the target columns; they are excluded from the feature matrix.
targets = ["bike_count", "log_bike_count"]

In [None]:
# Remove the columns that are not used as features. The result is reassigned
# (no inplace=True) and missing columns are ignored, so re-running this cell
# does not raise once the columns are already gone.
data = data.drop(
    columns=[
        "site_name",
        "counter_id",
        "site_id",
        "counter_installation_date",
        "coordinates",
        "counter_technical_id",
    ],
    errors="ignore",
)

## Optuna objective definition

In [None]:
def objective(trial):
    """Optuna objective: fit the CatBoost pipeline and score a temporal hold-out.

    Samples ``learning_rate``, ``max_depth``, ``n_estimators`` and ``subsample``
    from ``trial``, fits the feature pipeline plus a ``CatBoostRegressor`` on
    the training part of the temporal split and evaluates the RMSE on the
    validation part.

    Reads the notebook-level ``X``, ``y`` and ``categorical_cols``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters.

    Returns
    -------
    float
        Negative RMSE on the validation split, so that a larger value is
        better.
    """
    # Define hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 0.1, 0.25)
    max_depth = trial.suggest_int("max_depth", 5, 9)
    n_estimators = trial.suggest_int("n_estimators", 250, 1000)
    subsample = trial.suggest_float("subsample", 0.7, 0.95)

    data_merger = FunctionTransformer(_merge_external_data, validate=False)
    covid_encoder = FunctionTransformer(_encode_covid, validate=False)
    gas_encoder = FunctionTransformer(_gas_price_encoder, validate=False)
    date_encoder = FunctionTransformer(_encode_dates, validate=False)

    regressor = CatBoostRegressor(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        subsample=subsample,
        od_pval=1e-5,
    )

    pipe = Pipeline(
        [
            ("merge external", data_merger),
            ("gas prices encoder", gas_encoder),
            ("covid encoder", covid_encoder),
            ("date encoder", date_encoder),
            ("regressor", regressor),
        ]
    )

    # Perform temporal train-test split
    X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

    # The eval set is handed straight to CatBoost and bypasses the pipeline's
    # transformers, so it has to be encoded by hand with the same steps.
    val_pool = Pool(
        full_encode(X_valid),
        label=y_valid,
        cat_features=categorical_cols,
    )

    # Fit the pipeline on the training data
    # NOTE(review): the validation split drives early stopping and is also the
    # split scored below, so the reported score is optimistic.
    pipe.fit(
        X_train,
        y_train,
        regressor__cat_features=categorical_cols,
        regressor__early_stopping_rounds=130,
        regressor__eval_set=val_pool,
    )

    # Make predictions on the validation set
    predictions = pipe.predict(X_valid)

    # RMSE as the square root of the MSE: the `squared=False` shortcut of
    # mean_squared_error is deprecated and no longer exists in recent
    # scikit-learn releases.
    neg_rmse = -float(np.sqrt(mean_squared_error(y_valid, predictions)))

    return neg_rmse

## Model

In [None]:
# Features: every column except the targets. Target: the `log_bike_count` column.
y = data["log_bike_count"]
X = data.drop(columns=targets)

In [None]:
# Run the date encoder on the date column alone to discover which of the
# features it creates are categorical, then add the counter name to that list.
encoded_dates = _encode_dates(X[["date"]])
date_cols = list(encoded_dates.select_dtypes(include="category").columns)
categorical_cols = ["counter_name", *date_cols]

In [None]:
# Seed the sampler so that a fresh "Restart & Run All" proposes the same
# sequence of trials; TPE is Optuna's default sampler, only the seed is new.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=30)

# Get the best parameters from the study
best_params = study.best_params
print("Best Hyperparameters:", best_params)

In [None]:
# param1 = list(param_grid.keys())[0]
# param2 = list(param_grid.keys())[1]
# p1_name = param1.split('__')[1]
# p2_name = param2.split('__')[1]

# # Extract the relevant information for plotting vs the max_depth parameter
# mean_test_scores_depth = np.sqrt(-grid_search.cv_results_['mean_test_score'].reshape(len(param_grid[param1]), -1))
# n_estimators_values = param_grid[param2]

# # Extract the relevant information for plotting vs the n_estimators parameter
# mean_test_scores_estimators = np.sqrt(-grid_search.cv_results_['mean_test_score'].reshape(len(param_grid[param1]), -1)).T
# max_depth_values = param_grid[param1]

# # Start plot
# fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# # Plot a line for each max_depth value
# for i, max_depth in enumerate(max_depth_values):
#     if max_depth in []:
#         continue
#     ax[0].plot(n_estimators_values, mean_test_scores_depth[i, :], label=f"{p1_name}={max_depth}", marker='o', alpha=0.6)

# ax[0].set_title('Mean Test Score vs. n_estimators')
# ax[0].set_xlabel('n_estimators')
# ax[0].set_ylabel('Mean Test Score (Negative MSE)')
# ax[0].legend(title='Max Depth', prop={'size': 10})
# ax[0].grid(True)

# # Plot a line for each n_estimators value
# for i, n_est in enumerate(n_estimators_values):
#     ax[1].plot(max_depth_values, mean_test_scores_estimators[i, :], label=f'{p2_name}={n_est}', marker='o', alpha=0.6)

# ax[1].set_title('Mean Test Score vs. max_depth')
# ax[1].set_xlabel('max_depth')
# ax[1].set_ylabel('Mean Test Score (Negative MSE)')
# ax[1].legend(title='n_estimators', prop={'size': 10})
# ax[1].grid(True)

# plt.tight_layout()
# plt.show()

In [None]:
# The pipeline built inside `objective` is local to that function, so rebuild
# the same steps here with the hyperparameters selected by the study.
pipe = Pipeline(
    [
        ("merge external", FunctionTransformer(_merge_external_data, validate=False)),
        ("gas prices encoder", FunctionTransformer(_gas_price_encoder, validate=False)),
        ("covid encoder", FunctionTransformer(_encode_covid, validate=False)),
        ("date encoder", FunctionTransformer(_encode_dates, validate=False)),
        ("regressor", CatBoostRegressor(**best_params)),
    ]
)

# Refit on the whole training set. No eval set is passed here, so there is no
# early stopping and the tuned `n_estimators` is used in full.
pipe.fit(X, y, regressor__cat_features=categorical_cols)

# Feed the model exactly the training feature columns, in the same order.
# Assumes the test frame contains every column of X -- TODO confirm.
prediction = pipe.predict(test[X.columns])
prediction[prediction < 0] = 0

In [None]:
# Build the submission table: one row per test observation, keyed by the test
# frame's index, and write it to disk without the DataFrame index.
submission = pd.DataFrame({"Id": test.index, "log_bike_count": prediction})
submission.to_csv("submission.csv", index=False)