In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error

from preprocess_FE import (
    get_zone_c_holidays,
    get_public_holidays,
    encode_dates,
    engineer_weather_features, 
    delete_zeros
)

In [2]:
school_holiday = get_zone_c_holidays()
public_holiday = get_public_holidays()

In [3]:
def _merge_weather_data(X):
    file_path = Path("./external_data/external_data.csv")
    important_columns = ["date", "pres", "ff", "t", "vv", "rr1"]
    weather_data = pd.read_csv(file_path, usecols=important_columns, parse_dates=["date"])
    weather_data["date"] = pd.to_datetime(weather_data["date"]).astype("datetime64[ns]")
    
    X = X.copy()
    X["orig_index"] = np.arange(X.shape[0])  # Save the original order
    X = pd.merge_asof(
        X.sort_values("date"),
        weather_data.sort_values("date"),
        on="date"
    )
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X

In [4]:
data = pd.read_parquet(Path("data") / "train.parquet")
data = delete_zeros(data)
data["date"] = pd.to_datetime(data["date"]).astype("datetime64[ns]")

data = _merge_weather_data(data)

In [5]:
covid_data_path = Path("./synthese-fra (1).csv")
columns = ["date", "nouveaux_patients_hospitalises"]
covid_data = pd.read_csv(covid_data_path, usecols=columns, parse_dates=["date"])
covid_data["date"] = pd.to_datetime(covid_data["date"]).astype("datetime64[ns]")
covid_data["date_only"] = covid_data["date"].dt.date

data["date_only"] = data["date"].dt.date
merged_data = data.merge(covid_data, on="date_only", how="left")

data.drop(columns=["date_only"], inplace=True)

In [6]:
X = merged_data[["counter_name", "site_name", "date_x", "longitude", 
                 "latitude", "ff", "t", "vv", "pres", "rr1", "nouveaux_patients_hospitalises"]]
y = merged_data["log_bike_count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_encoded1 = encode_dates(X_train, school_holiday, public_holiday)
X_test_encoded1 = encode_dates(X_test, school_holiday, public_holiday)

X_train_encoded3 = engineer_weather_features(X_train_encoded1)
X_test_encoded3 = engineer_weather_features(X_test_encoded1)


In [7]:
categorical_features = ["counter_name", "site_name", "rain_category", "season", "year"]
numerical_features = list(X_train_encoded3.drop(columns=categorical_features).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)

In [8]:
best_params = {
    "n_estimators": 472,
    "max_depth": 10,
    "learning_rate": 0.12614361865303286,
    "subsample": 0.9523866028433459,
    "colsample_bytree": 0.7603900904362988,
    "reg_alpha": 2.369797865081342,
    "reg_lambda": 1.8538255301093718,
}

best_model = XGBRegressor(**best_params, random_state=42)

xgboost_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", best_model)
])

In [9]:
xgboost_pipeline.fit(X_train_encoded3, y_train)
y_pred = xgboost_pipeline.predict(X_test_encoded3)
final_rmse = root_mean_squared_error(y_test, y_pred)
print(f"Final XGBoost RMSE: {final_rmse:.4f}")

Final XGBoost RMSE: 0.3364


In [10]:
df_test = pd.read_parquet("./data/final_test.parquet")
df_test["date"] = pd.to_datetime(df_test["date"], format='%Y-%m-%d %H:%M:%S').astype("datetime64[ns]")
df_test_merged = _merge_weather_data(df_test)

df_test_merged['date_only'] = pd.to_datetime(df_test_merged['date']).dt.date

df_test_merged = df_test_merged.merge(covid_data, on='date_only', how='left')

X = df_test_merged[["counter_name", "site_name", "date_x", "longitude", "latitude", "ff", "t", "vv", 
                    "pres", "rr1", "nouveaux_patients_hospitalises"]]

X = encode_dates(X, school_holiday, public_holiday)
X = engineer_weather_features(X)

X_test_final = preprocessor.transform(X)
final_predictions = xgboost_pipeline.named_steps['model'].predict(X_test_final)

results = pd.DataFrame({"Id": np.arange(final_predictions.shape[0]), "log_bike_count": final_predictions})
results.to_csv("submission_xgboost_optuna8.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, "year"] = X["date_x"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, "month"] = X["date_x"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, "day"] = X["date_x"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in