In [44]:
import pandas as pd
import numpy as np

from skrub import TableVectorizer
import xgboost as xgb
from sklearn.pipeline import Pipeline

import holidays

from datetime import datetime

from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler, LabelEncoder

import datetime

from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split


In [45]:
# Import the files
df_train = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/train.parquet")
df_test = pd.read_parquet("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/data/final_test.parquet")

In [46]:
# Add jour ferie data
jour_feries = (
    pd.read_csv(
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/jours_feries_metropole.csv",
        date_format="%Y%m%d%H"  # Ensure date format is handled correctly
    )
    .drop(columns=["annee", "zone"])  # Drop unnecessary columns
)

# Convert 'date' column to datetime
jour_feries['date'] = pd.to_datetime(jour_feries['date'])

# Filter rows based on the date range of df_train and df_test
jour_feries = jour_feries[
    (jour_feries["date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (jour_feries["date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

In [47]:
# Add mouvements sociaux data :
mouvements_sociaux = (
    pd.read_csv(
        "/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/mouvements-sociaux-depuis-2002.csv",
        date_format="%Y%m%d%H",
        sep=";"
    )
    .drop(columns=['date_de_fin', 'Organisations syndicales', 'Métiers ciblés par le préavis',
                   'Population devant travailler ciblee par le préavis', 'Nombre de grévistes du préavis'])  # Drop unnecessary columns
)

mouvements_sociaux['Date'] = pd.to_datetime(mouvements_sociaux['Date'])

mouvements_sociaux = mouvements_sociaux[
    (mouvements_sociaux["Date"] >= df_train["date"].min() - datetime.timedelta(hours=1))
    & (mouvements_sociaux["Date"] <= df_test["date"].max() + datetime.timedelta(hours=1))
]

mouvements_sociaux = mouvements_sociaux[mouvements_sociaux['Date'] != pd.Timestamp('2021-03-08')]

In [48]:
# Extract the date feature on different time scales :

fr_holidays = holidays.France()

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # creation of a binary varible depicting if day in weekend
    X["is_weekend"] = np.where(X["weekday"] + 1 > 5, 1, 0)

    # Add a feature to indicate if the day is a holiday in France
    X["is_holiday"] = X["date"].apply(lambda d: 1 if d in fr_holidays else 0)

    # and if it is a jour ferie in France :
    X["is_jour_ferie"] = X["date"].dt.date.isin(jour_feries['date']).astype(int)

    # and it is a jour of "mouvement social" in France :
    X["is_jour_mouvement_social"] = X["date"].dt.date.isin(mouvements_sociaux['Date']).astype(int)

    # Add morning rush and evening rush features
    # X["is_working_day"] = np.where((X["weekday"] + 1 <= 5), 1, 0)
    # X["morning_rush"] = (X["hour"].between(7, 9)) & X["is_working_day"]
    # X["evening_rush"] = (X["hour"].between(17, 19)) & X["is_working_day"]

    # Add the season feature
    # def season_date(date):
      #  if (date > datetime.datetime(2020, 9, 21)) & (date < datetime.datetime(2020, 12, 21)):
       #     return 1
       # if (date > datetime.datetime(2020, 12, 20)) & (date < datetime.datetime(2021, 3, 20)):
       #     return 2
       # if (date > datetime.datetime(2021, 3, 19)) & (date < datetime.datetime(2021, 6, 21)):
       #     return 3
       # if ((date > datetime.datetime(2021, 6, 20)) & (date < datetime.datetime(2021, 9, 22))) | \
       #    ((date > datetime.datetime(2020, 6, 19)) & (date < datetime.datetime(2020, 9, 22))):
       #     return 4
       #  return 0  # fallback if none matches

    # X["season"] = X["date"].apply(season_date)

    return X

df_train = _encode_dates(df_train)
df_test = _encode_dates(df_test)


In [49]:
import geopandas as gpd
from shapely.geometry import Point

# To add an "arrondissement" feature based on latitute ande longitude
def arrondissement(X, shapefile_path="/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/external_data/arrondissements.shp"):

    arrondissements = gpd.read_file(shapefile_path)

    # Create a GeoDataFrame for the input dataset
    X = X.copy()  # Work on a copy of the dataset
    X["geometry"] = X.apply(lambda row: Point(row["longitude"], row["latitude"]), axis=1)
    gdf = gpd.GeoDataFrame(X, geometry="geometry", crs=arrondissements.crs)

    # Perform a spatial join to match points to arrondissements
    merged = gpd.sjoin(gdf, arrondissements, how="left", predicate="within")

    # Extract the arrondissement code (e.g., "c_ar") and fill missing values with 21
    X["district"] = merged["c_ar"].fillna(21).astype(int)

    # Drop the geometry column (optional, if not needed further)
    X = X.drop(columns=["geometry"])

    return X

df_train = arrondissement(df_train)
df_test = arrondissement(df_test)

In [50]:
'''
# To add covid features : one binary feature for lockdown and one binary feature for curfew periods
def covid_features(data):
    # Lockdown periods
    lockdown_periods = [
        ("2020-10-30", "2020-12-15"),
        ("2021-04-03", "2021-05-03"),
    ]

    # Binary column for lockdown
    data["is_lockdown"] = 0
    for start_date, end_date in lockdown_periods:
        data.loc[
            (data["date"] >= start_date) & (data["date"] < end_date),
            "is_lockdown"
        ] = 1

    # Curfew periods with specific time restrictions
    curfew_periods = [
        ("2020-10-17", "2020-10-30", 21, 6),  # Curfew from 9 PM to 6 AM
        ("2020-12-16", "2021-01-15", 20, 6),  # Curfew from 8 PM to 6 AM
        ("2021-01-15", "2021-03-20", 19, 6),  # Curfew from 7 PM to 6 AM
        ("2021-03-20", "2021-04-03", 18, 6),  # Curfew from 6 PM to 6 AM
        ("2021-05-03", "2021-06-09", 19, 6),  # Curfew from 7 PM to 6 AM
        ("2021-06-09", "2021-06-20", 23, 6),  # Curfew from 11 PM to 6 AM
    ]

    # Binary column for curfew
    data["is_curfew"] = 0
    for start_date, end_date, start_hour, end_hour in curfew_periods:
        data.loc[
            (data["date"] >= start_date) & (data["date"] < end_date)
            & ((data["hour"] >= start_hour) | (data["hour"] < end_hour)),
            "is_curfew"
        ] = 1

    return data

# Apply the function to your datasets
df_train = covid_features(df_train)
df_test = covid_features(df_test)
'''

# remove the date column
df_train = df_train.drop(columns=['date'])
df_test = df_test.drop(columns=['date'])

In [51]:
# Preprocessing :

# Extract features from counter_installation_date
for df in [df_train, df_test]:
    df["installation_year"] = df["counter_installation_date"].dt.year
    df["installation_month"] = df["counter_installation_date"].dt.month

df_train = df_train.drop(columns=["counter_installation_date"])
df_test = df_test.drop(columns=["counter_installation_date"])

# Label encode high-cardinality categorical features
label_encoders = {}
for col in ["counter_id", "site_id", "counter_name", "site_name", "counter_technical_id", "coordinates"]:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    df_test[col] = le.fit_transform(df_test[col])
    label_encoders[col] = le

In [52]:
X_train = df_train.drop(columns=["bike_count", "log_bike_count"])
y_train = df_train["log_bike_count"]

X_test = df_test.copy()

In [53]:
# Step 1: Split data into train and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# and sample a subset for hyperparameter tuning :
subset_size = int(0.4 * len(X_train_split))
subset_indices = np.random.choice(X_train_split.index, subset_size, replace=False)

X_subset = X_train_split.loc[subset_indices]
y_subset = y_train_split.loc[subset_indices]

In [54]:
# Step 2: Define parameter grid for RandomizedSearchCV
param_grid = {
    'iterations': [500, 1000, 2000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 10],
    'bagging_temperature': [0.8, 1],
    'rsm': [0.8, 1.0]
}

In [55]:
# Step 3: Initialize CatBoost model
catboost_model = CatBoostRegressor(
    loss_function='RMSE',
    random_seed=42,
    verbose=100
)

In [56]:
# Step 4: Use RandomizedSearchCV to find the best parameters
random_search = RandomizedSearchCV(
    estimator=catboost_model,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    n_iter=50,
    cv=5,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# Perform the search
random_search.fit(X_subset, y_subset)

# Print best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", -random_search.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
0:	learn: 1.5489309	total: 94.5ms	remaining: 3m 8s
0:	learn: 1.6063998	total: 98.5ms	remaining: 49.2s
0:	learn: 1.5478054	total: 131ms	remaining: 4m 22s
0:	learn: 1.5481287	total: 139ms	remaining: 4m 37s
0:	learn: 1.5503147	total: 96.2ms	remaining: 3m 12s
0:	learn: 1.6073151	total: 86.5ms	remaining: 43.2s
0:	learn: 1.6081019	total: 77.1ms	remaining: 38.5s
0:	learn: 1.5497505	total: 107ms	remaining: 3m 34s
100:	learn: 0.6539090	total: 2.62s	remaining: 10.3s
100:	learn: 0.6512441	total: 2.49s	remaining: 9.86s
100:	learn: 0.6513960	total: 2.76s	remaining: 10.9s
100:	learn: 0.5177510	total: 3.81s	remaining: 1m 11s
100:	learn: 0.5150731	total: 3.92s	remaining: 1m 13s
100:	learn: 0.5150107	total: 4.01s	remaining: 1m 15s
100:	learn: 0.5126028	total: 3.94s	remaining: 1m 14s
100:	learn: 0.5152577	total: 4.02s	remaining: 1m 15s
200:	learn: 0.5639981	total: 5.26s	remaining: 7.82s
200:	learn: 0.5606477	total: 5.11s	remaining: 7.6s
200:	

In [57]:
# Step 5: Train the final model on the full training set with eval_set
best_params = random_search.best_params_

final_model = CatBoostRegressor(
    **best_params,
    loss_function='RMSE',
    random_seed=42,
    verbose=100,
    use_best_model=True  # Ensure best iteration is selected
)

final_model.fit(
    X_train_split,
    y_train_split,
    eval_set=(X_val_split, y_val_split),
    early_stopping_rounds=50  # Stops if no improvement for 50 iterations
)


0:	learn: 1.5520818	test: 1.5506976	best: 1.5506976 (0)	total: 22.8ms	remaining: 45.5s
100:	learn: 0.5164090	test: 0.5157534	best: 0.5157534 (100)	total: 2.22s	remaining: 41.8s
200:	learn: 0.4424089	test: 0.4437167	best: 0.4437167 (200)	total: 4.37s	remaining: 39.1s
300:	learn: 0.4088735	test: 0.4117168	best: 0.4117168 (300)	total: 6.54s	remaining: 36.9s
400:	learn: 0.3886461	test: 0.3934111	best: 0.3934111 (400)	total: 8.71s	remaining: 34.7s
500:	learn: 0.3745411	test: 0.3810397	best: 0.3810397 (500)	total: 11s	remaining: 32.8s
600:	learn: 0.3639318	test: 0.3721866	best: 0.3721866 (600)	total: 13.2s	remaining: 30.7s
700:	learn: 0.3557594	test: 0.3656754	best: 0.3656754 (700)	total: 15.5s	remaining: 28.7s
800:	learn: 0.3492878	test: 0.3608026	best: 0.3608026 (800)	total: 17.7s	remaining: 26.4s
900:	learn: 0.3436169	test: 0.3565898	best: 0.3565898 (900)	total: 19.8s	remaining: 24.2s
1000:	learn: 0.3386148	test: 0.3531363	best: 0.3531363 (1000)	total: 22.2s	remaining: 22.2s
1100:	learn: 

<catboost.core.CatBoostRegressor at 0x280767860>

In [58]:
# Step 6: Make predictions on the test set
y_predictions = final_model.predict(X_test)


In [59]:
print(y_predictions)

[0.41039275 1.63160619 2.02659416 ... 5.23482533 4.676473   3.64370671]


In [60]:
pd.DataFrame(y_predictions, columns=["log_bike_count"]).reset_index().rename(
    columns={"index": "Id"}
).to_csv("/Users/louisleibovici/Documents/VS_Code/Bike_counters DSB Project/bike_counters/predictions_CatBoost_reg.csv", index=False)