In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
# from jours_feries_france import JoursFeries
import matplotlib.pyplot as plt
from vacances_scolaires_france import SchoolHolidayDates

In [2]:
#holidays_2020_2021 = (
#    list(JoursFeries.for_year(2020).values()) +
#    list(JoursFeries.for_year(2021).values())
#)

# holidays_2020_2021 = pd.to_datetime(holidays_2020_2021)

In [3]:
holiday_dates = SchoolHolidayDates()

# Fetch holidays for Zone C for specific years
zone_c_holidays_2020 = holiday_dates.holidays_for_year_and_zone(2020, 'C')
zone_c_holidays_2021 = holiday_dates.holidays_for_year_and_zone(2021, 'C')

# Combine holidays for both years
all_zone_c_holidays = list(zone_c_holidays_2020.keys()) + list(zone_c_holidays_2021.keys())

# Convert to pandas datetime
all_zone_c_holidays = pd.to_datetime(all_zone_c_holidays)

In [4]:
def encode_dates(X):
    """
    Encode date information from the 'date' column.
    Adds year, month, day, weekday, hour, holiday, and weekend indicators.
    """
    lockdown_periods = [
        ("2020-03-17", "2020-05-11"),
        ("2020-10-30", "2020-12-14"),
        ("2021-04-03", "2021-06-30"),
    ]
    
    lockdown_ranges = [
        (pd.to_datetime(start), pd.to_datetime(end)) for start, end in lockdown_periods
    ]
    
    X = X.copy()
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    X["holiday"] = X["date"].isin(all_zone_c_holidays).astype(int)
    X["weekend"] = (X["date"].dt.dayofweek > 4).astype(int)
    X["lockdown"] = X["date"].apply(
        lambda d: any(start <= d <= end for start, end in lockdown_ranges)
    ).astype(int)
    X['is_peak'] = X['hour'].apply(lambda x: 1 if (6 <= x < 9 or 16 <= x < 19) else 0)

    X['sin_hour'] = np.sin(2 * np.pi * X['hour'] / 24)
    X['cos_hour'] = np.cos(2 * np.pi * X['hour'] / 24)
    
    return X.drop(columns=['date', 'hour'])

In [5]:
def engineer_weather_features(data):
    # 1. Categorical Buckets
    data['rain_category'] = pd.cut(
        data['rr1'], bins=[-1, 0, 2, 10, float('inf')],
        labels=['No Rain', 'Light Rain', 'Moderate Rain', 'Heavy Rain']
    )
    
    data['snow_category'] = pd.cut(
        data['ht_neige'], bins=[-1, 0, 0.01, 0.05, float('inf')],
        labels=['No Snow', 'Light Snow', 'Moderate Snow', 'Heavy Snow']
    )
    
    data['is_hot_day'] = (data['t'] > 300).astype(int)  # Assuming temperature in Kelvin
    data['is_cold_day'] = (data['t'] < 283).astype(int)
    data['high_wind'] = (data['ff'] > 5).astype(int)
    
    # 3. Interaction Features
    data['rain_with_wind'] = data['rr1'] * data['ff']
    
    data['rolling_rain'] = data['rr1'].rolling(window=3, min_periods=1).sum()
    
    return data

In [6]:
data = pd.read_parquet(Path("data") / "train.parquet")

important_columns = ["date", "pres", "ff", "t", "u", "vv", "n", "ht_neige", "rr1"]
weather_data = pd.read_csv("./external_data/external_data.csv", usecols=important_columns)

In [7]:
weather_data["date"] = pd.to_datetime(weather_data["date"])
weather_data = weather_data.dropna(axis=1, how="all")
weather_data.set_index("date", inplace=True)
weather_data = weather_data[~weather_data.index.duplicated(keep="first")]
weather_data_interpolated = weather_data.resample("h").interpolate(method="linear")

In [8]:
merged_data = data.merge(weather_data_interpolated, on="date", how="left")

X = merged_data[["counter_name", "site_name", "date", "longitude", "latitude", "ff", "t", "u", "vv", "n", "pres", "ht_neige", "rr1"]]
y = merged_data["log_bike_count"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply date encoding

X_train_encoded2 = encode_dates(X_train)
X_test_encoded2 = encode_dates(X_test)

X_train_encoded3 = engineer_weather_features(X_train_encoded2)
X_test_encoded3 = engineer_weather_features(X_test_encoded2)

# Column transformer for preprocessing
categorical_features = ["counter_name", "site_name", "rain_category", "snow_category"]
numerical_features = list(X_train_encoded3.drop(columns=categorical_features).columns)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)

In [9]:
xgboost_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", XGBRegressor(random_state=42))
])

In [10]:
print("\nXGBoost Model Performance:")
xgboost_pipeline.fit(X_train_encoded3, y_train)
y_pred = xgboost_pipeline.predict(X_test_encoded3)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"XGBoost RMSE: {rmse:.4f}")


XGBoost Model Performance:
XGBoost RMSE: 0.4288


In [11]:
# Load the final test dataset
df_test = pd.read_parquet("./data/final_test.parquet")

# Merge with weather data
df_test_merged = df_test.merge(weather_data_interpolated, on='date', how='left')

# Apply date and weather feature engineering
df_test_merged = df_test_merged.assign(**encode_dates(df_test_merged[["date"]]))
df_test_merged = df_test_merged.assign(**engineer_weather_features(df_test_merged))

# Prepare features for prediction
X_test_final = df_test_merged[[
    "counter_name", "site_name", "longitude", "latitude", "ff", "t", "u", "vv", "n", "pres", "ht_neige", "rr1",
    "rain_category", "snow_category", "is_hot_day", "is_cold_day", "high_wind", "rain_with_wind", "rolling_rain",
    "year", "month", "day", "weekday", "sin_hour", "cos_hour", "is_peak", "holiday", "weekend", "lockdown"
]]

# Ensure consistency in column transformations
X_test_final = preprocessor.transform(X_test_final)

# Make predictions using the trained model
y_pred = xgboost_pipeline.named_steps['model'].predict(X_test_final)

# Save results to a CSV file
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission.csv", index=False)