In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
import matplotlib.pyplot as plt
import optuna
from vacances_scolaires_france import SchoolHolidayDates
from catboost import CatBoostRegressor

In [2]:
holiday_dates = SchoolHolidayDates()

# Fetch holidays for Zone C for specific years
zone_c_holidays_2020 = holiday_dates.holidays_for_year_and_zone(2020, 'C')
zone_c_holidays_2021 = holiday_dates.holidays_for_year_and_zone(2021, 'C')

# Combine holidays for both years
all_zone_c_holidays = list(zone_c_holidays_2020.keys()) + list(zone_c_holidays_2021.keys())

# Convert to pandas datetime
all_zone_c_holidays = pd.to_datetime(all_zone_c_holidays)

In [3]:
def encode_dates(X):
    """
    Encode date information from the 'date' column.
    Adds year, month, day, weekday, hour, holiday, and weekend indicators.
    """
    lockdown_periods = [
        ("2020-03-17", "2020-05-11"),
        ("2020-10-30", "2020-12-14"),
        ("2021-04-03", "2021-06-30"),
    ]
    
    lockdown_ranges = [
        (pd.to_datetime(start), pd.to_datetime(end)) for start, end in lockdown_periods
    ]
    
    X = X.copy()
    X["year"] = X["date_x"].dt.year
    X["month"] = X["date_x"].dt.month
    X["day"] = X["date_x"].dt.day
    X["weekday"] = X["date_x"].dt.weekday
    X["hour"] = X["date_x"].dt.hour
    X["holiday"] = X["date_x"].isin(all_zone_c_holidays).astype(int)
    X["weekend"] = (X["date_x"].dt.dayofweek > 4).astype(int)
    X["lockdown"] = X["date_x"].apply(
        lambda d: any(start <= d <= end for start, end in lockdown_ranges)
    ).astype(int)
    X['is_peak'] = X['hour'].apply(lambda x: 1 if (6 <= x < 9 or 16 <= x < 19) else 0)

    X['sin_hour'] = np.sin(2 * np.pi * X['hour'] / 24)
    X['cos_hour'] = np.cos(2 * np.pi * X['hour'] / 24)
    
    return X.drop(columns=['date_x', 'hour'])

In [4]:
def engineer_weather_features(data):
    # 1. Categorical Buckets
    data['rain_category'] = pd.cut(
        data['rr1'], bins=[-1, 0, 2, 10, float('inf')],
        labels=['No Rain', 'Light Rain', 'Moderate Rain', 'Heavy Rain']
    )
    
    data['snow_category'] = pd.cut(
        data['ht_neige'], bins=[-1, 0, 0.01, 0.05, float('inf')],
        labels=['No Snow', 'Light Snow', 'Moderate Snow', 'Heavy Snow']
    )
    
    data['is_hot_day'] = (data['t'] > 300).astype(int)  # Assuming temperature in Kelvin
    data['is_cold_day'] = (data['t'] < 283).astype(int)
    data['high_wind'] = (data['ff'] > 5).astype(int)
    
    # 3. Interaction Features
    data['rain_with_wind'] = data['rr1'] * data['ff']
    data['rolling_rain'] = data['rr1'].rolling(window=3, min_periods=1).sum()
    
    return data

In [5]:
data = pd.read_parquet(Path("data") / "train.parquet")

important_columns = ["date", "pres", "ff", "t", "u", "vv", "n", "ht_neige", "rr1"]
weather_data = pd.read_csv("./external_data/external_data.csv", usecols=important_columns)

In [6]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data = weather_data.dropna(axis=1, how="all")
weather_data.set_index("date", inplace=True)
weather_data = weather_data[~weather_data.index.duplicated(keep="first")]
weather_data_interpolated = weather_data.resample("h").interpolate(method="linear")

In [7]:
covid_data = pd.read_csv('./synthese-fra (1).csv', parse_dates=False)
covid_data['date_only'] = pd.to_datetime(covid_data['date']).dt.date

In [8]:
merged_data = data.merge(weather_data_interpolated, on="date", how="left")
merged_data['date_only'] = pd.to_datetime(merged_data['date']).dt.date

merged_data = merged_data.merge(covid_data, on="date_only", how="left")

missing_values = merged_data.isnull().sum()

# Display missing values
print("Missing Values per Column:")
print(missing_values)

Missing Values per Column:
counter_id                             0
counter_name                           0
site_id                                0
site_name                              0
bike_count                             0
date_x                                 0
counter_installation_date              0
coordinates                            0
counter_technical_id                   0
latitude                               0
longitude                              0
log_bike_count                         0
ff                                     0
t                                      0
u                                      0
vv                                     0
n                                      0
pres                                   0
ht_neige                               0
rr1                                    0
date_only                              0
date_y                                 0
total_cas_confirmes               270570
total_deces_hopital           

In [9]:
merged_data

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date_x,counter_installation_date,coordinates,counter_technical_id,latitude,...,total_cas_confirmes,total_deces_hopital,total_deces_ehpad,total_cas_confirmes_ehpad,total_cas_possibles_ehpad,patients_reanimation,patients_hospitalises,total_patients_gueris,nouveaux_patients_hospitalises,nouveaux_patients_reanimation
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,286007.0,20147,10514.0,40300.0,,424.0,4604.0,86712.0,300.0,54.0
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,286007.0,20147,10514.0,40300.0,,424.0,4604.0,86712.0,300.0,54.0
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,286007.0,20147,10514.0,40300.0,,424.0,4604.0,86712.0,300.0,54.0
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,286007.0,20147,10514.0,40300.0,,424.0,4604.0,86712.0,300.0,54.0
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,286007.0,20147,10514.0,40300.0,,424.0,4604.0,86712.0,300.0,54.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496822,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,445.0,2021-09-09 06:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,,88621,26742.0,,,2195.0,10323.0,413273.0,554.0,139.0
496823,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,145.0,2021-09-09 10:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,,88621,26742.0,,,2195.0,10323.0,413273.0,554.0,139.0
496824,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,218.0,2021-09-09 15:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,,88621,26742.0,,,2195.0,10323.0,413273.0,554.0,139.0
496825,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,21.0,2021-09-09 22:00:00,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,...,,88621,26742.0,,,2195.0,10323.0,413273.0,554.0,139.0


In [10]:
X = merged_data[["counter_name", "site_name", "date_x", "longitude", "latitude", "ff", "t", "u", "vv", "n", "pres", "ht_neige", "rr1", "nouveaux_patients_hospitalises"]]
y = merged_data["log_bike_count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply date encoding

X_train_encoded2 = encode_dates(X_train)
X_test_encoded2 = encode_dates(X_test)

X_train_encoded3 = engineer_weather_features(X_train_encoded2)
X_test_encoded3 = engineer_weather_features(X_test_encoded2)

# Column transformer for preprocessing
categorical_features = ["counter_name", "site_name", "rain_category", "snow_category"]
numerical_features = list(X_train_encoded3.drop(columns=categorical_features).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_features)
    ]
)

In [11]:
catboost_model = CatBoostRegressor(
    iterations=707,  # Number of boosting iterations
    learning_rate=0.29318426953870014,  # Learning rate
    depth=10,  # Depth of the trees
    loss_function='RMSE',  # Loss function for regression
    l2_leaf_reg=8.370078515917884,
    bagging_temperature=0.5183044632651785,
    border_count=227,
    random_strength=8.322949500202851,
    random_seed=42,
    verbose=200
)

catboost_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", catboost_model)
])

In [12]:
catboost_pipeline.fit(X_train_encoded3, y_train)

# Predict on test data
y_pred = catboost_pipeline.predict(X_test_encoded3)

# Evaluate RMSE
final_rmse = root_mean_squared_error(y_test, y_pred)
print(f"Final CatBoost RMSE: {final_rmse:.4f}")

0:	learn: 1.3816288	total: 402ms	remaining: 4m 43s
200:	learn: 0.4938446	total: 8.39s	remaining: 21.1s
400:	learn: 0.4721609	total: 16.1s	remaining: 12.3s
600:	learn: 0.4578562	total: 23.4s	remaining: 4.13s
706:	learn: 0.4519940	total: 27.3s	remaining: 0us
Final CatBoost RMSE: 0.5179


In [13]:
df_test = pd.read_parquet("./data/final_test.parquet")
df_test_merged = df_test.merge(weather_data_interpolated, on='date', how='left')
df_test_merged['date_only'] = pd.to_datetime(df_test_merged['date']).dt.date

df_test_merged = df_test_merged.merge(covid_data, on='date_only', how='left')

df_test_merged = df_test_merged.assign(**encode_dates(df_test_merged[["date_x"]]))
df_test_merged = df_test_merged.assign(**engineer_weather_features(df_test_merged))
X_test_final = df_test_merged[[
    "counter_name", "site_name", "longitude", "latitude", "ff", "t", "u", "vv", "n", "pres", "ht_neige", "rr1",
    "rain_category", "snow_category", "is_hot_day", "is_cold_day", 
    "high_wind", "rain_with_wind", "rolling_rain", "year", "month", "day", "weekday", "sin_hour", "cos_hour", 
    "is_peak", "holiday", "weekend", "lockdown", "nouveaux_patients_hospitalises"
]]

X_test_final = preprocessor.transform(X_test_final)
final_predictions = catboost_pipeline.named_steps['model'].predict(X_test_final)

# Save predictions
results = pd.DataFrame({"Id": np.arange(final_predictions.shape[0]), "log_bike_count": final_predictions})
results.to_csv("submission_catboost_baseline2.csv", index=False)