In [9]:
from pathlib import Path
import numpy as np
import pandas as pd
from feature_engineering import codify_date_2, remove_outliers, get_X_y, covid_19_2
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor


In [10]:
from sklearn.preprocessing import FunctionTransformer

def sin_transformer(period):
    """Build a transformer producing the sine component of a cyclic feature.

    Parameters
    ----------
    period : number
        Length of one full cycle of the input feature (24 is used for the
        hour of day in this notebook).

    Returns
    -------
    FunctionTransformer
        Wraps the element-wise function ``sin(x / period * 2 * pi)``.
    """

    def _sine(values):
        # Scale the raw value to an angle in radians, then take its sine.
        angle = values / period * 2 * np.pi
        return np.sin(angle)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Build a transformer producing the cosine component of a cyclic feature.

    Parameters
    ----------
    period : number
        Length of one full cycle of the input feature (24 is used for the
        hour of day in this notebook).

    Returns
    -------
    FunctionTransformer
        Wraps the element-wise function ``cos(x / period * 2 * pi)``.
    """

    def _cosine(values):
        # Scale the raw value to an angle in radians, then take its cosine.
        angle = values / period * 2 * np.pi
        return np.cos(angle)

    return FunctionTransformer(_cosine)

In [11]:
# Load the raw training data, then run the feature-engineering helpers in
# this order: codify_date_2 -> remove_outliers -> covid_19_2.
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = data.pipe(codify_date_2).pipe(remove_outliers).pipe(covid_19_2)

# Split into the model inputs and the target that are later given to
# pipeline.fit, and preview the first rows of the inputs.
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Unnamed: 0,StringencyIndex_Average,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,46.76,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,46.76,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,46.76,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,46.76,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,46.76,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [12]:
# Load the final test set and apply the date and Covid-19 feature helpers
# (remove_outliers is not applied to the test rows).
X_test = (
    pd.read_parquet(Path("data") / "final_test.parquet")
    .pipe(codify_date_2)
    .pipe(covid_19_2)
)

  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


In [13]:
# Re-read the untouched test set and tag every row with its original index
# ("index1") so processed rows can be traced back to the source order.
X_original = pd.read_parquet(Path("data") / "final_test.parquet")
X_original = X_original.assign(index1=X_original.index)
X_original = codify_date_2(X_original)

# Tag the processed rows with their current index ("index2") on a temporary
# copy only. Writing the helper column into X_test itself would leave a
# bookkeeping column in the frame that is later handed to the model.
merged_df = X_test.assign(index2=X_test.index).merge(
    X_original, on=["datetime", "counter_name"], how="left"
)

# Order the joined rows by their position in the original test file.
merged_df = merged_df.sort_values("index1")
# NOTE(review): merged_df is not used by the later cells shown in this
# notebook; predictions are made on X_test in its own row order. Confirm
# that X_test is still in the original file order, or use merged_df to
# realign the predictions before writing the submission.

In [14]:
# Remove the columns listed below from the test frame; they are not among the
# columns the preprocessing step refers to by name.
X_test = X_test.drop(
    [
        "counter_id",
        "site_id",
        "site_name",
        "counter_installation_date",
        "coordinates",
        "counter_technical_id",
        "latitude",
        "longitude",
        "datetime",
        "date",
    ],
    axis="columns",
)

In [15]:
# Columns that are one-hot encoded.
categorical_columns = [
    "counter_name",
    "year",
    "month",
    "day",
    "day_of_week",
    "is_weekend",
    "IsHoliday",
]

# Feature preprocessing:
#   * "cat"      - dense one-hot encoding; categories unseen during fit are
#                  ignored at transform time instead of raising.
#   * "hour_sin" / "hour_cos" - sine and cosine of "hour" with a period of 24.
#   * every column not named above is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_columns,
        ),
        ("hour_sin", sin_transformer(24), ["hour"]),
        ("hour_cos", cos_transformer(24), ["hour"]),
    ],
    remainder="passthrough",
)

# Hyperparameters for the gradient-boosted regressor. random_state is fixed
# so repeated runs use the same seed.
# NOTE(review): the non-round values look like the output of an automated
# hyperparameter search - confirm and record where they came from.
xgb_params = {
    "colsample_bytree": 0.8494252738248523,
    "gamma": 0.8835608079221302,
    "learning_rate": 0.12825147053070918,
    "max_depth": 8,
    "n_estimators": 428,
    "reg_alpha": 5.479087800903766,
    "reg_lambda": 6.995216197905481,
    "subsample": 0.6983244655616523,
    "random_state": 1,
}

# Full model: feature preprocessing followed by the XGBoost regressor.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(**xgb_params)),
    ]
)

# Fit the pipeline on the training data, then predict for the test rows.
pipeline.fit(X, y)
y_pred = pipeline.predict(X_test)

# Build the submission table: one row per prediction, with Id numbered
# 0..n-1 in the order the rows appear in X_test.
# NOTE(review): Id is positional, so this relies on X_test still being in the
# row order expected for final_test.parquet - confirm.
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_pred)),
        "log_bike_count": y_pred,
    }
)
results.to_csv("submission.csv", index=False)