In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from jours_feries_france import JoursFeries

import seaborn as sns
import time

In [2]:
# Load the training data and print a quick overview of its contents.
data = pd.read_parquet(Path("data") / "train.parquet")
print(data.head())
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly: wrapping it in print() appends a stray "None" line.
data.info()

# Collect the holiday dates returned by JoursFeries for 2020 and 2021 and
# convert them to pandas timestamps; _encode_dates uses them to build the
# 'holiday' indicator.
holidays_2020_2021 = (
    list(JoursFeries.for_year(2020).values()) +
    list(JoursFeries.for_year(2021).values())
)
holidays_2020_2021 = pd.to_datetime(holidays_2020_2021)
print(holidays_2020_2021[:5])

                counter_id              counter_name    site_id  \
48321  100007049-102007049  28 boulevard Diderot E-O  100007049   
48324  100007049-102007049  28 boulevard Diderot E-O  100007049   
48327  100007049-102007049  28 boulevard Diderot E-O  100007049   
48330  100007049-102007049  28 boulevard Diderot E-O  100007049   
48333  100007049-102007049  28 boulevard Diderot E-O  100007049   

                  site_name  bike_count                date  \
48321  28 boulevard Diderot         0.0 2020-09-01 02:00:00   
48324  28 boulevard Diderot         1.0 2020-09-01 03:00:00   
48327  28 boulevard Diderot         0.0 2020-09-01 04:00:00   
48330  28 boulevard Diderot         4.0 2020-09-01 15:00:00   
48333  28 boulevard Diderot         9.0 2020-09-01 18:00:00   

      counter_installation_date         coordinates counter_technical_id  \
48321                2013-01-18  48.846028,2.375429          Y2H15027244   
48324                2013-01-18  48.846028,2.375429          Y2H15

In [3]:
def _encode_dates(X):
    """
    Encode date information from the 'date' column.
    Adds year, month, day, weekday, hour, holiday, and weekend indicators.
    """
    X = X.copy()
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    X['holiday'] = X['date'].isin(holidays_2020_2021).astype(int)
    X['weekend'] = (X['date'].dt.dayofweek > 4).astype(int)
    return X.drop(columns=["date"])

# Peek at the raw timestamps before they are encoded
date_preview = data["date"].head()
print(date_preview)

# Run the encoder on the date column alone and show the first encoded rows
encoded_dates = _encode_dates(data.loc[:, ["date"]])
print(encoded_dates.head())

48321   2020-09-01 02:00:00
48324   2020-09-01 03:00:00
48327   2020-09-01 04:00:00
48330   2020-09-01 15:00:00
48333   2020-09-01 18:00:00
Name: date, dtype: datetime64[us]
       year  month  day  weekday  hour  holiday  weekend
48321  2020      9    1        1     2        0        0
48324  2020      9    1        1     3        0        0
48327  2020      9    1        1     4        0        0
48330  2020      9    1        1    15        0        0
48333  2020      9    1        1    18        0        0


In [4]:
# Wrap the date encoder so it can run as the first step of a pipeline.
date_encoder = FunctionTransformer(_encode_dates, validate=False)
# Only the names of the generated columns are needed here, so encode a single
# row instead of the whole training frame.
date_cols = _encode_dates(data[["date"]].head(1)).columns.tolist()
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "site_name"]

# One-hot encode the calendar and categorical columns.
# remainder="passthrough" forwards every other column handed to the pipeline
# unchanged; with the default remainder="drop" those columns would be
# silently discarded before reaching the model. Passed-through columns are
# not transformed, so they must already be numeric.
preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ],
    remainder="passthrough",
)

In [5]:
# Columns read from the external weather file: the timestamp plus the
# weather variables used later as features.
important_columns = ["date", "pres", "ff", "t", "u", "vv", "n", "hbas", "ht_neige", "rr1"]

# Load weather data
weather_data = pd.read_csv('./external_data/external_data.csv', usecols=important_columns)
print(weather_data.head())
# DataFrame.info() prints its summary itself and returns None; calling it
# directly avoids the stray "None" that print(...) would add.
weather_data.info()

# Process weather data: parse the timestamps, drop columns that are entirely
# empty and index the frame by time (no in-place mutation, so each step
# yields a new frame).
weather_data = (
    weather_data
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .dropna(axis=1, how='all')
    .set_index('date')
)
# Keep the first record of any repeated timestamp, then order chronologically
# so the resampling below does not depend on the row order of the file.
weather_data = weather_data[~weather_data.index.duplicated(keep='first')].sort_index()
# Upsample to an hourly grid and fill the new rows by linear interpolation.
weather_data_interpolated = weather_data.resample('h').interpolate(method='linear')

# Merge with main dataset: a left join keeps every row of `data`, with
# missing weather values where no timestamp matches.
merged_data = data.merge(weather_data_interpolated, on='date', how='left')
print(merged_data.head())

                  date   ff       t   u    vv     n    hbas    pres  ht_neige  \
0  2021-01-01 00:00:00  1.8  272.75  96   990  10.0   800.0   99680      0.00   
1  2021-01-01 03:00:00  1.7  271.25  98   210  25.0  1750.0   99790      0.00   
2  2021-01-01 06:00:00  2.6  271.95  98  3660  90.0   450.0   99820      0.00   
3  2021-01-01 09:00:00  1.7  272.45  97  3500  50.0  1750.0   99970      0.01   
4  2021-01-01 12:00:00  1.0  276.95  82  8000  90.0   450.0  100000     -0.01   

   rr1  
0  0.0  
1  0.0  
2  0.0  
3  0.0  
4  0.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      3322 non-null   object 
 1   ff        3322 non-null   float64
 2   t         3322 non-null   float64
 3   u         3322 non-null   int64  
 4   vv        3322 non-null   int64  
 5   n         3166 non-null   float64
 6   hbas      2869 non-null   float64
 

In [None]:
# Columns handed to the pipeline and the target to predict
feature_columns = [
    "counter_name", "site_name", "date", "longitude", "latitude",
    "ff", "t", "u", "vv", "n", "hbas", "pres", "ht_neige", "rr1",
]
X = merged_data[feature_columns]
y = merged_data['log_bike_count']

# Hold out 20% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Full pipeline: date encoding -> column preprocessing -> gradient boosting
model = XGBRegressor(objective='reg:squarederror', random_state=42)
pipe = make_pipeline(date_encoder, preprocessor, model)

# Hyper-parameter values explored by the grid search (3 x 3 x 3 candidates)
param_grid = {
    'xgbregressor__n_estimators': [100, 200, 300],
    'xgbregressor__max_depth': [3, 5, 7],
    'xgbregressor__learning_rate': [0.01, 0.1, 0.2]
}

# Exhaustive search with 5-fold cross-validation, scored by (negated) RMSE
grid_search = GridSearchCV(
    pipe,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Score the best pipeline found on the held-out rows
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test RMSE: {rmse}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
# Read the final test set and attach the hourly weather values to each row
df_test = pd.read_parquet("./data/final_test.parquet")
df_test_merged = df_test.merge(weather_data_interpolated, how='left', on='date')

# Select the same columns the model was trained on
submission_features = [
    "counter_name", "site_name", "date", "longitude", "latitude",
    "ff", "t", "u", "vv", "n", "hbas", "pres", "ht_neige", "rr1",
]
X_test_final = df_test_merged[submission_features]

# Predict with the tuned pipeline
y_pred = best_model.predict(X_test_final)

# Write one row per prediction, identified by its position
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_pred)),
        "log_bike_count": y_pred,
    }
)
results.to_csv("submission.csv", index=False)