In [2]:
import Feature_engineering

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import root_mean_squared_error
import xgboost as xgb
from xgboost import XGBRegressor

import Feature_engineering

# Challenge metadata: display title and the name of the regression target.
problem_title, _target_column_name = "Bike count prediction", "log_bike_count"

# Input files, resolved relative to the notebook's working directory.
TRAIN_PATH = "data/train.parquet"
TEST_PATH = "data/final_test.parquet"

# The project helper returns the training features together with the target;
# the test features are read straight from the parquet file.
X_train, y_train = Feature_engineering.get_train_data(path=TRAIN_PATH)
X_test = pd.read_parquet(TEST_PATH)


# Row-level feature builders: each Feature_engineering helper is wrapped in a
# FunctionTransformer so that it can be chained as a pipeline step.
columns_encoder = FunctionTransformer(Feature_engineering._encode_columns)
date_encoder = FunctionTransformer(Feature_engineering._encode_dates)
meteo_encoder = FunctionTransformer(Feature_engineering._merge_external_data)
covid_encoder = FunctionTransformer(Feature_engineering._add_covid)
holidays_encoder = FunctionTransformer(Feature_engineering._add_holiday)

# Column groups consumed by the ColumnTransformer below.
date_cols = ["year", "month", "day", "weekday", "hour"]
meteo_cols = ["t", "n", "u"]
oneHot_cols = ["counter_id", "longitude", "latitude", "counter_count"]
ord_cols = ["counter_installation_date"]

# NOTE(review): these two groups are declared but never handed to the
# ColumnTransformer below; with its default remainder="drop" any column not
# listed there is discarded before the regressor -- confirm this is intended.
covid_cols = ["is_lockdown"]
holidays_cols = ["is_holidays", "is_bank_holiday"]

# Column-level encoders.
oneHot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# NOTE(review): 22 is the code assigned to categories unseen during fit;
# presumably it matches the number of distinct training values -- confirm,
# because the value must not collide with a code used for a seen category.
ord_encoder = OrdinalEncoder(unknown_value=22, handle_unknown="use_encoded_value")
# with_mean=False: features are only divided by their standard deviation.
scaler = StandardScaler(with_mean=False)

# One-hot the calendar parts and categorical columns, scale the weather
# columns, and ordinal-encode then scale the installation date.
preprocessor = ColumnTransformer(
    transformers=[
        ("date", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), date_cols),
        ("meteo", scaler, meteo_cols),
        ("cat onehot", oneHot_encoder, oneHot_cols),
        ("cat ordinal", make_pipeline(ord_encoder, scaler), ord_cols),
    ],
)

# Hyper-parameters of the gradient-boosted tree regressor, kept in one mapping
# so they can be inspected or logged separately from the estimator.
regressor_params = {
    "objective": "reg:squarederror",
    # Tree shape and boosting schedule.
    "max_depth": 9,
    "n_estimators": 486,
    "learning_rate": 0.025902257423954078,
    # Row / column sub-sampling applied when growing each tree.
    "subsample": 0.911218854047308,
    "colsample_bytree": 0.8798851225524656,
    # Split constraints and regularisation.
    "min_child_weight": 2,
    "gamma": 0.06729662326925455,
    "reg_alpha": 1.0490840880173807e-05,
    "reg_lambda": 0.0010921919306983888,
}

regressor = XGBRegressor(**regressor_params)

# End-to-end model: the row-level feature builders run first, then the
# column-wise preprocessor, and finally the regressor.
pipeline_steps = (
    columns_encoder,
    meteo_encoder,
    covid_encoder,
    holidays_encoder,
    date_encoder,
    preprocessor,
    regressor,
)
pipe = make_pipeline(*pipeline_steps)
pipe.fit(X_train, y_train)

# Predict the target for every row of the test set.
y_pred = pipe.predict(X_test)

# Submission table: a running row id next to the predicted target.
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_pred)),
        "log_bike_count": y_pred,
    }
)
# Writing the submission file is disabled in this cell:
# results.to_csv("submission.csv", index=False)

KeyboardInterrupt: 

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
import Feature_engineering

# Challenge metadata: display title and the name of the regression target.
problem_title, _target_column_name = "Bike count prediction", "log_bike_count"

# Input files, resolved relative to the notebook's working directory.
TRAIN_PATH = "data/train.parquet"
TEST_PATH = "data/final_test.parquet"

# Alternative input location, currently disabled:
#   train: "../input/msdb-2024/train.parquet"
#   test:  "../input/msdb-2024/final_test.parquet"

# The project helper returns the training features together with the target;
# the test features are read straight from the parquet file.
X_train, y_train = Feature_engineering.get_train_data(path=TRAIN_PATH)
X_test = pd.read_parquet(TEST_PATH)

# Every Feature_engineering helper is wrapped in a FunctionTransformer so that
# it can be used as a pipeline step; the short alias keeps the list readable.
as_step = FunctionTransformer

# Column clean-up and calendar features.
columns_encoder = as_step(Feature_engineering._encode_columns)
date_encoder = as_step(Feature_engineering._encode_dates)
time_encoder = as_step(Feature_engineering.get_time_of_day)
season_encoder = as_step(Feature_engineering.get_season)

# Features that bring in context beyond the raw counter rows.
meteo_encoder = as_step(Feature_engineering._merge_external_data)
covid_encoder = as_step(Feature_engineering._add_covid)
holidays_encoder = as_step(Feature_engineering._add_holiday)
district_encoder = as_step(Feature_engineering._add_arrondissement_with_geopandas)

# NOTE(review): erase_date is wrapped here but is not one of the steps of the
# pipeline assembled later in this cell -- confirm whether it is still needed.
erase_date = as_step(Feature_engineering.erase_date)

# Column groups consumed by the ColumnTransformer below.
ordinal_cols = ["counter_installation_date"]
onehot_cols = ["counter_name"]
# Columns that are standardised (zero mean, unit variance) before modelling.
scale_cols = [
    # Counter position.
    "latitude",
    "longitude",
    # Calendar features.
    "year",
    "month",
    "week_number",
    "day",
    "weekday",
    "hour",
    "dayofyear",
    "time_of_day",
    "season",
    # Columns presumably contributed by the external weather data -- confirm
    # against Feature_engineering._merge_external_data.
    "pres",
    "u",
    "tend",
    "ww",
    "rr6",
    "rr12",
    "rr24",
    "etat_sol",
    "ht_neige",
    "n",
    "t",
    "td",
    "tend24",
    # Presumably added by the district step -- confirm.
    "district",
]

scaler = StandardScaler()
# Both categorical encoders default to handle_unknown="error", which makes
# pipe.predict() raise ValueError as soon as the test set contains a category
# that was absent from the training data. Tolerate unseen values instead:
#   - one-hot: an unseen category becomes an all-zero row;
#   - ordinal: an unseen category is mapped to -1, a code that can never
#     collide with the non-negative codes assigned to seen categories.
# When no unseen category occurs the encoded output is unchanged.
onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Scale the numeric columns, one-hot the counter name and ordinal-encode the
# installation date. Columns not listed in any group are dropped
# (ColumnTransformer's default remainder="drop").
preprocessor = ColumnTransformer(
    [
        ("num", scaler, scale_cols),
        ("onehot", onehot, onehot_cols),
        ("ordinal", ordinal, ordinal_cols),
    ]
)

# Gradient-boosted tree regressor with the library's default settings.
regressor = XGBRegressor()

# End-to-end model: the row-level feature builders run first, then the
# column-wise preprocessor, and finally the regressor.
pipeline_steps = (
    columns_encoder,
    date_encoder,
    time_encoder,
    season_encoder,
    meteo_encoder,
    covid_encoder,
    holidays_encoder,
    district_encoder,
    preprocessor,
    regressor,
)
pipe = make_pipeline(*pipeline_steps)
pipe.fit(X_train, y_train)

# Predict the target for every row of the test set.
y_pred = pipe.predict(X_test)

# Submission table: a running row id next to the predicted target, written
# to disk without the DataFrame index.
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_pred)),
        "log_bike_count": y_pred,
    }
)
results.to_csv("submission.csv", index=False)
