In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from jours_feries_france import JoursFeries

import seaborn as sns
import time

In [2]:
# Load training data
data = pd.read_parquet(Path("data") / "train.parquet")
print(data.head())
print(data.info())

data.to_csv('data.csv')
    
# Define holidays for 2020 and 2021
holidays_2020_2021 = (
    list(JoursFeries.for_year(2020).values()) +
    list(JoursFeries.for_year(2021).values())
)
holidays_2020_2021 = pd.to_datetime(holidays_2020_2021)
print(holidays_2020_2021[:5])

                counter_id              counter_name    site_id  \
48321  100007049-102007049  28 boulevard Diderot E-O  100007049   
48324  100007049-102007049  28 boulevard Diderot E-O  100007049   
48327  100007049-102007049  28 boulevard Diderot E-O  100007049   
48330  100007049-102007049  28 boulevard Diderot E-O  100007049   
48333  100007049-102007049  28 boulevard Diderot E-O  100007049   

                  site_name  bike_count                date  \
48321  28 boulevard Diderot         0.0 2020-09-01 02:00:00   
48324  28 boulevard Diderot         1.0 2020-09-01 03:00:00   
48327  28 boulevard Diderot         0.0 2020-09-01 04:00:00   
48330  28 boulevard Diderot         4.0 2020-09-01 15:00:00   
48333  28 boulevard Diderot         9.0 2020-09-01 18:00:00   

      counter_installation_date         coordinates counter_technical_id  \
48321                2013-01-18  48.846028,2.375429          Y2H15027244   
48324                2013-01-18  48.846028,2.375429          Y2H15

In [3]:
# def _encode_dates(X):
#    """
#    Encode date information from the 'date' column.
#    Adds year, month, day, weekday, hour, holiday, and weekend indicators.
#    """

#    lockdown_periods = [
#        ("2020-03-17", "2020-05-11"),
#        ("2020-10-30", "2020-12-14"),
#        ("2021-04-03", "2021-06-30"),
#    ]
    
#    lockdown_ranges = [
#        (pd.to_datetime(start), pd.to_datetime(end)) for start, end in lockdown_periods
#    ]
    
#    X = X.copy()
#    X["year"] = X["date"].dt.year
#    X["month"] = X["date"].dt.month
#    X["day"] = X["date"].dt.day
#    X["weekday"] = X["date"].dt.weekday
#    X["hour"] = X["date"].dt.hour
#    X['holiday'] = X['date'].isin(holidays_2020_2021).astype(int)
#    X['weekend'] = (X['date'].dt.dayofweek > 4).astype(int)
#    X["lockdown"] = X["date"].apply(
#        lambda d: any(start <= d <= end for start, end in lockdown_ranges)
#    ).astype(int)
    
#    return X.drop(columns=["date"])

# Check the date column
# print(data["date"].head())

# Apply encoding function
# encoded_dates = _encode_dates(data[["date"]])
# print(encoded_dates.head())

In [4]:
def _encode_dates(X):
    """
    Encode date information from the 'date' column.
    Adds year, month, day, weekday, hour, holiday, weekend indicators, lockdown, is_sun, and cyclical hour features.
    """
    # Define lockdown periods
    lockdown_periods = [
        ("2020-03-17", "2020-05-11"),
        ("2020-10-30", "2020-12-14"),
        ("2021-04-03", "2021-06-30"),
    ]
    lockdown_ranges = [
        (pd.to_datetime(start), pd.to_datetime(end)) for start, end in lockdown_periods
    ]

    # Define daylight data
    daylight_data = {
        'month': [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December'
        ],
        'sunrise': [
            '08:37', '07:57', '07:01', '06:56', '06:06', '05:44',
            '06:02', '06:42', '07:26', '08:10', '07:58', '08:35'
        ],
        'sunset': [
            '17:22', '18:11', '18:57', '20:44', '21:27', '21:58',
            '21:51', '21:07', '20:04', '19:02', '17:11', '16:56'
        ]
    }
    daylight_df = pd.DataFrame(daylight_data)

    # Convert sunrise and sunset times to timedelta
    daylight_df['sunrise'] = pd.to_timedelta(daylight_df['sunrise'] + ':00')
    daylight_df['sunset'] = pd.to_timedelta(daylight_df['sunset'] + ':00')

    # Copy input DataFrame
    X = X.copy()

    # Add basic date components
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month_name()
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    X["holiday"] = X["date"].isin(holidays_2020_2021).astype(int)
    X["weekend"] = (X["date"].dt.dayofweek > 4).astype(int)

    # Add lockdown information
    X["lockdown"] = X["date"].apply(
        lambda d: any(start <= d <= end for start, end in lockdown_ranges)
    ).astype(int)

    # Map sunrise and sunset times to the DataFrame
    X = X.merge(daylight_df, on='month', how='left')

    # Calculate time of day as timedelta since midnight
    X['time_of_day'] = (
        X['date'].dt.hour * 3600 +
        X['date'].dt.minute * 60 +
        X['date'].dt.second
    ).apply(pd.to_timedelta, unit='s')

    # Add is_sun column
    X['is_sun'] = (
        (X['time_of_day'] >= X['sunrise']) &
        (X['time_of_day'] <= X['sunset'])
    ).astype(int)

    # Add cyclical hour features
    X['sin_hour'] = np.sin(2 * np.pi * X['hour'] / 24)
    X['cos_hour'] = np.cos(2 * np.pi * X['hour'] / 24)

    # Drop unnecessary columns
    X = X.drop(columns=["date", "sunrise", "sunset", "time_of_day", "month", "hour"])

    return X

In [5]:
# Define FunctionTransformer and preprocessor
date_encoder = FunctionTransformer(_encode_dates, validate=False)
date_cols = _encode_dates(data[["date"]]).columns.tolist()
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

In [6]:
important_columns = ["date", "pres", "ff", "t", "u", "vv", "n", "ht_neige", "rr1"]

# Load weather data
weather_data = pd.read_csv('./external_data/external_data.csv', usecols=important_columns)
print(weather_data.head())
print(weather_data.info())

# Process weather data
weather_data['date'] = pd.to_datetime(weather_data['date'])
weather_data = weather_data.dropna(axis=1, how='all')
weather_data.set_index('date', inplace=True)
weather_data = weather_data[~weather_data.index.duplicated(keep='first')]
weather_data_interpolated = weather_data.resample('h').interpolate(method='linear')

# Merge with main dataset
merged_data = data.merge(weather_data_interpolated, on='date', how='left')
print(merged_data.head())

                  date   ff       t   u    vv     n    pres  ht_neige  rr1
0  2021-01-01 00:00:00  1.8  272.75  96   990  10.0   99680      0.00  0.0
1  2021-01-01 03:00:00  1.7  271.25  98   210  25.0   99790      0.00  0.0
2  2021-01-01 06:00:00  2.6  271.95  98  3660  90.0   99820      0.00  0.0
3  2021-01-01 09:00:00  1.7  272.45  97  3500  50.0   99970      0.01  0.0
4  2021-01-01 12:00:00  1.0  276.95  82  8000  90.0  100000     -0.01  0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      3322 non-null   object 
 1   ff        3322 non-null   float64
 2   t         3322 non-null   float64
 3   u         3322 non-null   int64  
 4   vv        3322 non-null   int64  
 5   n         3166 non-null   float64
 6   pres      3322 non-null   int64  
 7   ht_neige  3273 non-null   float64
 8   rr1       3313 non-null   float64
dtypes: float64

In [7]:
# Define features and target
X = merged_data[["counter_name", "date", "longitude", "latitude", "ff", "t", "u", "vv", "n", "pres", "ht_neige", "rr1"]]
y = merged_data['log_bike_count']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define model and pipeline
model = XGBRegressor(objective='reg:squarederror', random_state=42)
pipe = make_pipeline(date_encoder, preprocessor, model)

# Define parameter grid for GridSearchCV
param_grid = {
    'xgbregressor__n_estimators': [100, 200, 300],
    'xgbregressor__max_depth': [3, 5, 7],
    'xgbregressor__learning_rate': [0.01, 0.1, 0.2]
}

# Perform Grid Search
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=4
)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Test RMSE: {rmse}")

Best Parameters: {'xgbregressor__learning_rate': 0.2, 'xgbregressor__max_depth': 7, 'xgbregressor__n_estimators': 300}
Test RMSE: 0.534080834088994


In [8]:
# Load and preprocess test set
df_test = pd.read_parquet("./data/final_test.parquet")
df_test_merged = df_test.merge(weather_data_interpolated, on='date', how='left')
df_test_merged = df_test_merged.assign(**_encode_dates(df_test_merged[["date"]]))

# Prepare features for prediction
X_test_final = df_test_merged[["counter_name", "date", "longitude", "latitude", "ff", "t", "u", "vv", "n", "pres", "ht_neige", "rr1"]]

# Make predictions
y_pred = best_model.predict(X_test_final)

# Save results
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission.csv", index=False)