In [20]:
import pandas as pd
import numpy as np

from skrub import TableVectorizer
import xgboost as xgb

from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline

import holidays

from datetime import datetime

from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from sklearn.compose import ColumnTransformer


from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split

import optuna

import datetime

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.preprocessing import StandardScaler, LabelEncoder




In [21]:
# Import the files
df_train = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/train.parquet")
df_test = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/final_test.parquet")

In [22]:
# Add jour ferie data
jour_feries = (
    pd.read_csv(
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/jours_feries_metropole.csv",
        date_format="%Y%m%d%H"  # Ensure date format is handled correctly
    )
    .drop(columns=["annee", "zone"])  # Drop unnecessary columns
)

# Convert 'date' column to datetime
jour_feries['date'] = pd.to_datetime(jour_feries['date'])

# Filter rows based on the date range of df_train and df_test
jour_feries = jour_feries[
    (jour_feries["date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (jour_feries["date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

In [23]:
# Add mouvements sociaux data :
mouvements_sociaux = (
    pd.read_csv(
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/mouvements-sociaux-depuis-2002.csv",
        date_format="%Y%m%d%H",
        sep=";"
    )
    .drop(columns=['date_de_fin', 'Organisations syndicales', 'Métiers ciblés par le préavis',
                   'Population devant travailler ciblee par le préavis', 'Nombre de grévistes du préavis'])  # Drop unnecessary columns
)

mouvements_sociaux['Date'] = pd.to_datetime(mouvements_sociaux['Date'])

mouvements_sociaux = mouvements_sociaux[
    (mouvements_sociaux["Date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (mouvements_sociaux["Date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

mouvements_sociaux = mouvements_sociaux[mouvements_sociaux['Date'] != pd.Timestamp('2021-03-08')]

In [24]:
# Extract the date feature on different time scales :

fr_holidays = holidays.France()

def _encode_dates(X):
    X = X.copy()  # Modify a copy of X

    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Creation of a binary variable depicting if the day is a weekend
    X["is_weekend"] = np.where(X["weekday"] + 1 > 5, 1, 0)

    # Add a feature to indicate if the day is a holiday in France
    X["is_holiday"] = X["date"].apply(lambda d: 1 if d in fr_holidays else 0)

    # Add a feature to indicate if it is a jour férié in France
    X["is_jour_ferie"] = X["date"].dt.date.isin(jour_feries["date"]).astype(int)

    # Add a feature to indicate if it is a jour of "mouvement social" in France
    X["is_jour_mouvement_social"] = X["date"].dt.date.isin(mouvements_sociaux["Date"]).astype(int)

    return X

df_train = _encode_dates(df_train)
df_test = _encode_dates(df_test)


In [25]:
import geopandas as gpd
from shapely.geometry import Point

# To add an "arrondissement" feature based on latitute ande longitude
def arrondissement(X, shapefile_path="/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/arrondissements.shp"):

    arrondissements = gpd.read_file(shapefile_path)

    # Create a GeoDataFrame for the input dataset
    X = X.copy()  # Work on a copy of the dataset
    X["geometry"] = X.apply(lambda row: Point(row["longitude"], row["latitude"]), axis=1)
    gdf = gpd.GeoDataFrame(X, geometry="geometry", crs=arrondissements.crs)

    # Perform a spatial join to match points to arrondissements
    merged = gpd.sjoin(gdf, arrondissements, how="left", predicate="within")

    # Extract the arrondissement code (e.g., "c_ar") and fill missing values with 21
    X["district"] = merged["c_ar"].fillna(21).astype(int)

    # Drop the geometry column (optional, if not needed further)
    X = X.drop(columns=["geometry"])

    return X

df_train = arrondissement(df_train)
df_test = arrondissement(df_test)

In [26]:
df_train = df_train.drop(columns=['date'])
df_test = df_test.drop(columns=['date'])

In [27]:
# Extract features from counter_installation_date
for df in [df_train, df_test]:
    df["installation_year"] = df["counter_installation_date"].dt.year
    df["installation_month"] = df["counter_installation_date"].dt.month

df_train = df_train.drop(columns=["counter_installation_date"])
df_test = df_test.drop(columns=["counter_installation_date"])

In [28]:
# Preprocessing :

# Label encode high-cardinality categorical features
label_encoders = {}


for col in ["counter_id", "site_id", "counter_name", "site_name", "counter_technical_id", "coordinates"]:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.fit_transform(df_test[col])
    label_encoders[col] = le


In [29]:
X_train = df_train.drop(columns=["bike_count", "log_bike_count"])
y_train = df_train["log_bike_count"]

X_test = df_test.copy()

In [30]:
# Split the subset into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [31]:
# Define the Optuna objective function
def objective(trial):
    # Suggest hyperparameters
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-8, 10.0),
    }

    # Initialize and train the model
    model = xgb.XGBRegressor(**param, random_state=42)
    model.fit(X_train_split, y_train_split)

    # Predict on the validation set
    y_pred = model.predict(X_val_split)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
    return rmse

# Create an Optuna study and optimize
study = optuna.create_study(direction="minimize")  # Minimize RMSE
study.optimize(objective, n_trials=50, timeout=1200)  # Adjust n_trials and timeout as needed

# Get the best parameters and score
print("Best Parameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2024-12-11 12:07:36,088] A new study created in memory with name: no-name-8eb002b1-dc6e-4012-a64d-8d8806443b27
[I 2024-12-11 12:07:39,885] Trial 0 finished with value: 0.36084131450792706 and parameters: {'n_estimators': 390, 'learning_rate': 0.10763675131572949, 'max_depth': 8, 'subsample': 0.501419763371572, 'colsample_bytree': 0.9596296260224091, 'reg_alpha': 5.6255065287813775, 'reg_lambda': 6.422166245742959, 'min_child_weight': 0.6919146869536584}. Best is trial 0 with value: 0.36084131450792706.
[I 2024-12-11 12:07:40,787] Trial 1 finished with value: 0.5879202044142422 and parameters: {'n_estimators': 215, 'learning_rate': 0.17896666604431075, 'max_depth': 3, 'subsample': 0.5731698599442838, 'colsample_bytree': 0.9958147075943817, 'reg_alpha': 5.532415964996166, 'reg_lambda': 6.816865933997174, 'min_child_weight': 7.823453191686197}. Best is trial 0 with value: 0.36084131450792706.
[I 2024-12-11 12:07:41,553] Trial 2 finished with value: 0.794086880956681 and parameters: {'n

Best Parameters: {'n_estimators': 267, 'learning_rate': 0.23131912472401384, 'max_depth': 10, 'subsample': 0.9698109227988079, 'colsample_bytree': 0.7669160892672701, 'reg_alpha': 2.7333110713230804, 'reg_lambda': 6.90084097297034, 'min_child_weight': 4.655571633509011}
Best RMSE: 0.3359484333378738


In [32]:
# Train the final model with the best parameters on the full dataset
best_params = study.best_params
best_model = xgb.XGBRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)  # Use the full training set for the final model

# Predict on the test set
y_predictions = best_model.predict(X_test)



In [33]:
print(y_predictions)

[0.5044088 1.7093858 2.1024637 ... 5.474777  4.9322305 3.972835 ]


In [34]:
pd.DataFrame(y_predictions, columns=["log_bike_count"]).reset_index().rename(
    columns={"index": "Id"}
).to_csv("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/predictions_XGBoost_Optuna_sanspipeline.csv", index=False)