In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt
import optuna
from vacances_scolaires_france import SchoolHolidayDates
from sklearn.model_selection import train_test_split
import importlib

from preprocess_FE import (
    get_zone_c_holidays,
    encode_dates,
    engineer_weather_features, 
    remove_outliers
)

In [3]:
holiday_dates = get_zone_c_holidays()

In [4]:
data = pd.read_parquet(Path("data") / "train.parquet")
data = remove_outliers(data)

important_columns = ["date", "pres", "ff", "t", "vv", "rr1"]
weather_data = pd.read_csv("./external_data/external_data.csv", usecols=important_columns)

  data.groupby(["counter_name", "date_truncated"])


In [5]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data = weather_data.dropna(axis=1, how="all")
weather_data.set_index("date", inplace=True)
weather_data = weather_data[~weather_data.index.duplicated(keep="first")]
weather_data_interpolated = weather_data.resample("h").interpolate(method="linear")

In [6]:
columns = ['date', 'nouveaux_patients_hospitalises']
covid_data = pd.read_csv('./synthese-fra (1).csv', parse_dates=False, usecols=columns)
covid_data['date_only'] = pd.to_datetime(covid_data['date']).dt.date

In [7]:
merged_data = data.merge(weather_data_interpolated, on="date", how="left")
merged_data['date_only'] = pd.to_datetime(merged_data['date']).dt.date

merged_data = merged_data.merge(covid_data, on="date_only", how="left")

In [8]:
X = merged_data[["counter_name", "site_name", "date_x", "longitude", "latitude", "ff", "t", "vv", "pres", "rr1", "nouveaux_patients_hospitalises"]]
y = merged_data["log_bike_count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply date encoding

X_train_encoded1 = encode_dates(X_train, holiday_dates)
X_test_encoded1 = encode_dates(X_test, holiday_dates)

X_train_encoded3 = engineer_weather_features(X_train_encoded1)
X_test_encoded3 = engineer_weather_features(X_test_encoded1)

# Column transformer for preprocessing
categorical_features = ["counter_name", "site_name", "rain_category", "season"
                        # "snow_category"
                       ]
numerical_features = list(X_train_encoded3.drop(columns=categorical_features).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)

In [9]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 10.0),
    }
    
    xgb_model = XGBRegressor(**params, random_state=42)
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", xgb_model)
    ])
    pipeline.fit(X_train_encoded3, y_train)
    y_pred = pipeline.predict(X_test_encoded3)
    rmse = root_mean_squared_error(y_test, y_pred)
    return rmse

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

best_params = study.best_params
best_model = XGBRegressor(**best_params, random_state=42)

xgboost_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", best_model)
])

[I 2024-12-09 14:37:03,021] A new study created in memory with name: no-name-272e81d0-234c-4f50-890c-e452bcd259ca
[I 2024-12-09 14:37:09,308] Trial 0 finished with value: 0.5477735587671532 and parameters: {'n_estimators': 209, 'max_depth': 9, 'learning_rate': 0.014410593372494567, 'subsample': 0.5417999723837469, 'colsample_bytree': 0.8554838324979197, 'reg_alpha': 7.131948187514835, 'reg_lambda': 4.285160270500187}. Best is trial 0 with value: 0.5477735587671532.
[I 2024-12-09 14:37:12,733] Trial 1 finished with value: 0.36737277803696994 and parameters: {'n_estimators': 125, 'max_depth': 10, 'learning_rate': 0.2306851891166297, 'subsample': 0.9032586250161665, 'colsample_bytree': 0.607501244235571, 'reg_alpha': 9.566740267491953, 'reg_lambda': 9.109392569640093}. Best is trial 1 with value: 0.36737277803696994.
[I 2024-12-09 14:37:16,243] Trial 2 finished with value: 0.456308821329726 and parameters: {'n_estimators': 366, 'max_depth': 4, 'learning_rate': 0.15134750351607837, 'subsam

Best hyperparameters: {'n_estimators': 474, 'max_depth': 10, 'learning_rate': 0.16284449180519794, 'subsample': 0.9730600625893189, 'colsample_bytree': 0.9883383622283959, 'reg_alpha': 5.813956289783056, 'reg_lambda': 6.494738888148139}


In [10]:
xgboost_pipeline.fit(X_train_encoded3, y_train)
y_pred = xgboost_pipeline.predict(X_test_encoded3)
final_rmse = root_mean_squared_error(y_test, y_pred)
print(f"Final XGBoost RMSE: {final_rmse:.4f}")

Final XGBoost RMSE: 0.3394


In [11]:
df_test = pd.read_parquet("./data/final_test.parquet")
df_test_merged = df_test.merge(weather_data_interpolated, on='date', how='left')
df_test_merged['date_only'] = pd.to_datetime(df_test_merged['date']).dt.date

df_test_merged = df_test_merged.merge(covid_data, on='date_only', how='left')

X = df_test_merged[["counter_name", "site_name", "date_x", "longitude", "latitude", "ff", "t", "vv", "pres", "rr1", "nouveaux_patients_hospitalises"]]

X = encode_dates(X, holiday_dates)
X = engineer_weather_features(X)

X_test_final = preprocessor.transform(X)
final_predictions = xgboost_pipeline.named_steps['model'].predict(X_test_final)

results = pd.DataFrame({"Id": np.arange(final_predictions.shape[0]), "log_bike_count": final_predictions})
results.to_csv("submission_xgboost_optuna3.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["year"] = X["date_x"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["month"] = X["date_x"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["day"] = X["date_x"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 