In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt
import optuna
from vacances_scolaires_france import SchoolHolidayDates
from sklearn.model_selection import train_test_split
import importlib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error


from preprocess_FE import (
    get_zone_c_holidays,
    get_public_holidays,
    encode_dates,
    engineer_weather_features, 
    remove_outliers
)

In [2]:
school_holiday = get_zone_c_holidays()
public_holiday = get_public_holidays()

In [3]:
def _merge_weather_data(X):
    file_path = Path("./external_data/external_data.csv")
    important_columns = ["date", "pres", "ff", "t", "vv", "rr1"]
    weather_data = pd.read_csv(file_path, usecols=important_columns, parse_dates=["date"])
    weather_data["date"] = pd.to_datetime(weather_data["date"], format='%Y-%m-%d %H:%M:%S').astype("datetime64[ns]")
    
    X = X.copy()
    X["orig_index"] = np.arange(X.shape[0])  # Save the original order
    X = pd.merge_asof(
        X.sort_values("date"),
        weather_data.sort_values("date"),
        on="date"
    )
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X

In [4]:
data = pd.read_parquet(Path("data") / "train.parquet")
data = remove_outliers(data)
data["date"] = pd.to_datetime(data["date"], format='%Y-%m-%d %H:%M:%S').astype("datetime64[ns]")

important_columns = ["date", "pres", "ff", "t", "vv", "rr1"]
weather_data = pd.read_csv("./external_data/external_data.csv", usecols=important_columns)

  data.groupby(["counter_name", "date_truncated"])


In [5]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data = weather_data.dropna(axis=1, how="all")
weather_data.set_index("date", inplace=True)
weather_data = weather_data[~weather_data.index.duplicated(keep="first")]
weather_data_interpolated = weather_data.resample("h").interpolate(method="linear")

In [6]:
data = _merge_weather_data(data)

covid_data_path = Path("./synthese-fra (1).csv")
columns = ["date", "nouveaux_patients_hospitalises"]
covid_data = pd.read_csv(covid_data_path, usecols=columns, parse_dates=["date"])
covid_data["date"] = pd.to_datetime(covid_data["date"]).astype("datetime64[ns]")
covid_data["date_only"] = covid_data["date"].dt.date

data["date_only"] = data["date"].dt.date
merged_data = data.merge(covid_data, on="date_only", how="left")

data.drop(columns=["date_only"], inplace=True)

In [7]:
X = merged_data[["counter_name", "site_name", "date_x", "longitude", "latitude", "ff", "t", "vv", "pres", "rr1", "nouveaux_patients_hospitalises"]]
y = merged_data["log_bike_count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply date encoding

X_train_encoded1 = encode_dates(X_train, school_holiday, public_holiday)
X_test_encoded1 = encode_dates(X_test, school_holiday, public_holiday)

X_train_encoded3 = engineer_weather_features(X_train_encoded1)
X_test_encoded3 = engineer_weather_features(X_test_encoded1)

# Column transformer for preprocessing
categorical_features = ["counter_name", "site_name", "rain_category", "season", "year"
                        # "snow_category"
                       ]
numerical_features = list(X_train_encoded3.drop(columns=categorical_features).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)

In [8]:
def time_series_objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 10.0),
    }
    
    xgb_model = XGBRegressor(**params, random_state=42)
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", xgb_model)
    ])
    
    # Initialize time series split
    tscv = TimeSeriesSplit(n_splits=5)
    rmse_scores = []
    
    for train_idx, val_idx in tscv.split(X_train_encoded3):
        X_train_fold, X_val_fold = X_train_encoded3.iloc[train_idx], X_train_encoded3.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        pipeline.fit(X_train_fold, y_train_fold)
        y_val_pred = pipeline.predict(X_val_fold)
        
        # Calculate RMSE for each fold
        rmse = np.sqrt(mean_squared_error(y_val_fold, y_val_pred))
        rmse_scores.append(rmse)
    
    # Return the average RMSE across all folds
    return np.mean(rmse_scores)

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(time_series_objective, n_trials=100)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

best_params = study.best_params
best_model = XGBRegressor(**best_params, random_state=42)

xgboost_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", best_model)
])

[I 2024-12-11 10:36:19,861] A new study created in memory with name: no-name-1d4e4a36-0497-4f27-9b4f-79abe7ea8b93
[I 2024-12-11 10:36:26,972] Trial 0 finished with value: 0.636211432079651 and parameters: {'n_estimators': 272, 'max_depth': 3, 'learning_rate': 0.042075159372562426, 'subsample': 0.6968633669774987, 'colsample_bytree': 0.7032997604450973, 'reg_alpha': 5.550039700535189, 'reg_lambda': 1.3228315658994987}. Best is trial 0 with value: 0.636211432079651.
[I 2024-12-11 10:36:35,610] Trial 1 finished with value: 0.4847981020274833 and parameters: {'n_estimators': 258, 'max_depth': 4, 'learning_rate': 0.144066959576798, 'subsample': 0.5106623724487274, 'colsample_bytree': 0.7483131520209647, 'reg_alpha': 4.227225528985124, 'reg_lambda': 4.0616016911045305}. Best is trial 1 with value: 0.4847981020274833.
[I 2024-12-11 10:36:46,778] Trial 2 finished with value: 0.4114463354121871 and parameters: {'n_estimators': 334, 'max_depth': 5, 'learning_rate': 0.17842191361126838, 'subsampl

Best hyperparameters: {'n_estimators': 459, 'max_depth': 10, 'learning_rate': 0.10900096993542714, 'subsample': 0.8804187634001926, 'colsample_bytree': 0.6916212425221336, 'reg_alpha': 1.7457705633286669, 'reg_lambda': 5.641531429497041}


In [9]:
xgboost_pipeline.fit(X_train_encoded3, y_train)
y_pred = xgboost_pipeline.predict(X_test_encoded3)
final_rmse = root_mean_squared_error(y_test, y_pred)
print(f"Final XGBoost RMSE: {final_rmse:.4f}")

Final XGBoost RMSE: 0.3365


In [10]:
df_test = pd.read_parquet("./data/final_test.parquet")
df_test_merged = df_test.merge(weather_data_interpolated, on='date', how='left')
df_test_merged['date_only'] = pd.to_datetime(df_test_merged['date']).dt.date

df_test_merged = df_test_merged.merge(covid_data, on='date_only', how='left')

X = df_test_merged[["counter_name", "site_name", "date_x", "longitude", "latitude", "ff", "t", "vv", 
                    "pres", "rr1", "nouveaux_patients_hospitalises"]]

X = encode_dates(X, school_holiday, public_holiday)
X = engineer_weather_features(X)

X_test_final = preprocessor.transform(X)
final_predictions = xgboost_pipeline.named_steps['model'].predict(X_test_final)

results = pd.DataFrame({"Id": np.arange(final_predictions.shape[0]), "log_bike_count": final_predictions})
results.to_csv("submission_xgboost_optuna6.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["year"] = X["date_x"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["month"] = X["date_x"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["day"] = X["date_x"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 