In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt
import optuna
from vacances_scolaires_france import SchoolHolidayDates
from sklearn.model_selection import train_test_split
import importlib

from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer

from preprocess_FE import (
    get_zone_c_holidays,
    encode_dates,
    engineer_weather_features, 
    remove_outliers, 
    get_public_holidays
)

from sklearn.model_selection import TimeSeriesSplit

In [2]:
school_holidays = get_zone_c_holidays()
public_holidays = get_public_holidays()

In [3]:
data = pd.read_parquet(Path("data") / "train.parquet")
data = remove_outliers(data)
data = data.sort_values(["date", "counter_name"])

important_columns = ["date", "pres", "ff", "t", "vv", "rr1"]
weather_data = pd.read_csv("./external_data/external_data.csv", usecols=important_columns)

  data.groupby(["counter_name", "date_truncated"])


In [4]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data = weather_data.dropna(axis=1, how="all")
weather_data.set_index("date", inplace=True)
weather_data = weather_data[~weather_data.index.duplicated(keep="first")]
weather_data_interpolated = weather_data.resample("h").interpolate(method="linear")

In [5]:
columns = ['date', 'nouveaux_patients_hospitalises']
covid_data = pd.read_csv('./synthese-fra (1).csv', parse_dates=False, usecols=columns)
covid_data['date_only'] = pd.to_datetime(covid_data['date']).dt.date

In [6]:
merged_data = data.merge(weather_data_interpolated, on="date", how="left")
merged_data['date_only'] = pd.to_datetime(merged_data['date']).dt.date

merged_data = merged_data.merge(covid_data, on="date_only", how="left")

In [7]:
X = merged_data[["counter_name", "date_x", "longitude", "latitude", "ff", "t", "vv", "pres", "rr1", "nouveaux_patients_hospitalises"]]
y = merged_data["log_bike_count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply date encoding

X_train_encoded1 = encode_dates(X_train, school_holidays, public_holidays)
X_test_encoded1 = encode_dates(X_test, school_holidays, public_holidays)

X_train_encoded3 = engineer_weather_features(X_train_encoded1)
X_test_encoded3 = engineer_weather_features(X_test_encoded1)

# Column transformer for preprocessing
categorical_features = ["counter_name", "rain_category", "season"]

numerical_features = list(X_train_encoded3.drop(columns=categorical_features).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)

In [8]:
time_series_split = TimeSeriesSplit(n_splits=5)

In [9]:
def objective_with_cv(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 10.0),
    }

    xgb_model = XGBRegressor(**params, random_state=42)
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", xgb_model)
    ])

    cv_scores = cross_val_score(
        pipeline, X_train_encoded3, y_train,
        cv=time_series_split, scoring=rmse_scorer
    )
    mean_rmse = -cv_scores.mean()
    return mean_rmse

# Define RMSE as a scorer
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)

study = optuna.create_study(direction="minimize")
study.optimize(objective_with_cv, n_trials=100)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

# Train the final model with the best hyperparameters using cross-validation
best_params = study.best_params
best_model = XGBRegressor(**best_params, random_state=42)

xgboost_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", best_model)
])

[I 2024-12-09 18:03:18,307] A new study created in memory with name: no-name-abf5d1ac-3964-4121-b983-7f9c052a693e
[I 2024-12-09 18:03:32,898] Trial 0 finished with value: 0.3817855140797195 and parameters: {'n_estimators': 185, 'max_depth': 9, 'learning_rate': 0.16092397303687297, 'subsample': 0.7273785067518931, 'colsample_bytree': 0.8211387205684981, 'reg_alpha': 6.833127622483164, 'reg_lambda': 4.3171353123547735}. Best is trial 0 with value: 0.3817855140797195.
[I 2024-12-09 18:04:03,832] Trial 1 finished with value: 0.3659460444146424 and parameters: {'n_estimators': 418, 'max_depth': 10, 'learning_rate': 0.15248710367334545, 'subsample': 0.6517662427014275, 'colsample_bytree': 0.8323056035048266, 'reg_alpha': 5.74984712346305, 'reg_lambda': 9.122517999966174}. Best is trial 1 with value: 0.3659460444146424.
[I 2024-12-09 18:04:14,866] Trial 2 finished with value: 0.4148220698939312 and parameters: {'n_estimators': 370, 'max_depth': 5, 'learning_rate': 0.1819888940982254, 'subsamp

Best hyperparameters: {'n_estimators': 493, 'max_depth': 10, 'learning_rate': 0.10806083507132916, 'subsample': 0.8405405259904322, 'colsample_bytree': 0.9116653568849942, 'reg_alpha': 1.0720540923387891, 'reg_lambda': 7.819270698069333}


In [10]:
cv_scores_final = cross_val_score(xgboost_pipeline, X_train_encoded3, y_train, cv=time_series_split, scoring=rmse_scorer)
mean_final_rmse = -cv_scores_final.mean()
print(f"Final Cross-Validated RMSE: {mean_final_rmse:.4f}")

Final Cross-Validated RMSE: 0.3589


In [11]:
xgboost_pipeline.fit(X_train_encoded3, y_train)
y_pred = xgboost_pipeline.predict(X_test_encoded3)
final_rmse = root_mean_squared_error(y_test, y_pred)
print(f"Final XGBoost RMSE: {final_rmse:.4f}")

Final XGBoost RMSE: 0.3388


In [12]:
df_test = pd.read_parquet("./data/final_test.parquet")
df_test_merged = df_test.merge(weather_data_interpolated, on='date', how='left')
df_test_merged['date_only'] = pd.to_datetime(df_test_merged['date']).dt.date

df_test_merged = df_test_merged.merge(covid_data, on='date_only', how='left')

X = df_test_merged[["counter_name", "date_x", "longitude", "latitude", "ff", "t", "vv", "pres", "rr1", "nouveaux_patients_hospitalises"]]

X = encode_dates(X, school_holidays, public_holidays)
X = engineer_weather_features(X)

X_test_final = preprocessor.transform(X)
final_predictions = xgboost_pipeline.named_steps['model'].predict(X_test_final)

results = pd.DataFrame({"Id": np.arange(final_predictions.shape[0]), "log_bike_count": final_predictions})
results.to_csv("submission_xgboost_optuna_ts.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["year"] = X["date_x"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["month"] = X["date_x"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["day"] = X["date_x"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 