In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from jours_feries_france import JoursFeries

In [2]:
# Load the training set from data/train.parquet and preview it.
data = pd.read_parquet(Path("data") / "train.parquet")
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line.
data.info()

                counter_id              counter_name    site_id  \
48321  100007049-102007049  28 boulevard Diderot E-O  100007049   
48324  100007049-102007049  28 boulevard Diderot E-O  100007049   
48327  100007049-102007049  28 boulevard Diderot E-O  100007049   
48330  100007049-102007049  28 boulevard Diderot E-O  100007049   
48333  100007049-102007049  28 boulevard Diderot E-O  100007049   

                  site_name  bike_count                date  \
48321  28 boulevard Diderot         0.0 2020-09-01 02:00:00   
48324  28 boulevard Diderot         1.0 2020-09-01 03:00:00   
48327  28 boulevard Diderot         0.0 2020-09-01 04:00:00   
48330  28 boulevard Diderot         4.0 2020-09-01 15:00:00   
48333  28 boulevard Diderot         9.0 2020-09-01 18:00:00   

      counter_installation_date         coordinates counter_technical_id  \
48321                2013-01-18  48.846028,2.375429          Y2H15027244   
48324                2013-01-18  48.846028,2.375429          Y2H15

In [3]:
# Gather the holiday dates returned by JoursFeries for 2020 and 2021 (in that
# order) and convert them to pandas datetimes in a single pass.
holidays_2020_2021 = pd.to_datetime(
    [
        holiday_date
        for holiday_year in (2020, 2021)
        for holiday_date in JoursFeries.for_year(holiday_year).values()
    ]
)

In [4]:
# Date encoding function
def _encode_dates(X):
    """
    Encode date information from the 'date' column.
    Adds year, month, day, weekday, hour, holiday, and weekend indicators.
    """
    lockdown_periods = [
        ("2020-03-17", "2020-05-11"),
        ("2020-10-30", "2020-12-14"),
        ("2021-04-03", "2021-06-30"),
    ]
    lockdown_ranges = [
        (pd.to_datetime(start), pd.to_datetime(end)) for start, end in lockdown_periods
    ]
    X = X.copy()
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    X['holiday'] = X['date'].isin(holidays_2020_2021).astype(int)
    X['weekend'] = (X['date'].dt.dayofweek > 4).astype(int)
    X["lockdown"] = X["date"].apply(
        lambda d: any(start <= d <= end for start, end in lockdown_ranges)
    ).astype(int)
    
    return X.drop(columns=["date"])

In [5]:
# Attach every column produced by _encode_dates to the training frame.
data = data.assign(
    **{name: values for name, values in _encode_dates(data[["date"]]).items()}
)

In [6]:
# Read only the weather columns of interest, parse the timestamps, drop columns
# that are entirely empty and index the frame by date.
important_columns = [
    "date", "pres", "ff", "t", "u", "vv", "n", "hbas", "ht_neige", "rr1",
]
weather_data = (
    pd.read_csv('./external_data/external_data.csv', usecols=important_columns)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna(axis=1, how='all')
    .set_index('date')
)
# Keep the first record of any repeated timestamp before resampling.
weather_data = weather_data.loc[~weather_data.index.duplicated(keep='first')]
# Resample to an hourly grid and fill the new rows by linear interpolation.
weather_data_interpolated = weather_data.resample('h').interpolate(method='linear')

In [7]:
# Left-join the hourly weather onto the bike-count rows, matching on 'date'.
merged_data = pd.merge(
    data,
    weather_data_interpolated,
    how='left',
    on='date',
)

In [8]:
# Feature matrix: counter identity, calendar features, coordinates and weather.
X = merged_data.loc[
    :,
    [
        "counter_name", "site_name",
        "year", "month", "day", "weekday", "hour",
        "holiday", "weekend", "lockdown",
        "longitude", "latitude",
        "ff", "t", "u", "vv", "n", "hbas", "pres", "ht_neige", "rr1",
    ],
]
# Target column.
y = merged_data.loc[:, 'log_bike_count']

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Columns handed to CatBoost as categorical features.
categorical_features = [
    "counter_name",
    "site_name",
    "month",
    "weekday",
    "holiday",
    "weekend",
    "lockdown",
]

In [9]:
# Base estimator: CatBoost regressor optimising RMSE, with the categorical
# columns declared up front.
model = CatBoostRegressor(
    cat_features=categorical_features,
    loss_function='RMSE',
    random_state=42,
    verbose=0,  # keep CatBoost silent during the many grid-search fits
)

# Values explored for each hyper-parameter (3 x 3 x 3 = 27 combinations).
param_grid = dict(
    iterations=[100, 300, 500],
    learning_rate=[0.01, 0.1, 0.2],
    depth=[4, 6, 8],
)

# 3-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=3,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The scorer is the negated RMSE, so the sign is flipped for reporting.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {-grid_search.best_score_}")

# Score the best estimator found by the search on the local test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse}")

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'depth': 8, 'iterations': 500, 'learning_rate': 0.2}
Best Score: 0.386950514960059
Test RMSE: 0.382036896546149


In [10]:
# Read the final test set, join the interpolated weather on 'date', then add
# the date-derived features.
df_test = pd.read_parquet("./data/final_test.parquet")
df_test = pd.merge(df_test, weather_data_interpolated, how='left', on='date')
df_test = df_test.assign(**_encode_dates(df_test[["date"]]))

# Select the model inputs, in the column order used for training.
X_test_final = df_test.loc[
    :,
    [
        "counter_name", "site_name",
        "year", "month", "day", "weekday", "hour",
        "holiday", "weekend", "lockdown",
        "longitude", "latitude",
        "ff", "t", "u", "vv", "n", "hbas", "pres", "ht_neige", "rr1",
    ],
]

# Predict with the best estimator from the grid search.
y_test_pred = best_model.predict(X_test_final)

# Write the submission file: one sequential Id per prediction.
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_test_pred)),
        "log_bike_count": y_test_pred,
    }
)
results.to_csv("submission.csv", index=False)