In [None]:
!pip install -q scikit-learn==1.3.1

In [None]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder

In [None]:
# Constants

# Input directory string (presumably the raw competition files on Kaggle).
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Presumably the spacing between consecutive timestamps, as a pandas frequency string.
DATA_RESOLUTION = "1h"

# First / last timestamps of the train and test ranges.
MIN_TRAIN_TIMESTAMP, MAX_TRAIN_TIMESTAMP = (
    pd.Timestamp("2016-01-01 00:00:00"),
    pd.Timestamp("2016-12-31 23:00:00"),
)
MIN_TEST_TIMESTAMP, MAX_TEST_TIMESTAMP = (
    pd.Timestamp("2017-01-01 00:00:00"),
    pd.Timestamp("2018-12-31 23:00:00"),
)

# Hold-out windows as (start, end) pairs. The split helpers further down treat
# each window as the half-open interval [start, end) and train on every row
# whose timestamp is earlier than `start`.
VALIDATION_PERIODS = [
    (pd.Timestamp("2016-10-01 00:00:00"), pd.Timestamp("2016-11-01 00:00:00")),
    # Further monthly windows, currently disabled:
    # (pd.Timestamp("2016-11-01 00:00:00"), pd.Timestamp("2016-12-01 00:00:00")),
    # (pd.Timestamp("2016-12-01 00:00:00"), pd.Timestamp("2017-01-01 00:00:00")),
]

# Data Loading

In [None]:
# Columns converted to pandas "category" dtype when present.
CATEGORY_COLS = ["building_id", "meter_id", "site_id", "primary_use", "primary_use_id"]
# Columns downcast to np.uint8 when present and when every value fits.
UINT8_COLS = ["hour", "day_of_week", "month"]


def cast_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise column dtypes of ``df`` in place and return the same frame.

    - ``timestamp`` is parsed with ``pd.to_datetime``.
    - Every column listed in ``CATEGORY_COLS`` becomes ``category``.
    - Every column listed in ``UINT8_COLS`` becomes ``np.uint8``, but only if
      it has no missing values and all values lie inside the uint8 range;
      otherwise the column is left untouched.

    A column that is absent from ``df`` is skipped with a printed notice.

    Args:
        df: Frame to convert. It is modified in place (no copy is made).

    Returns:
        The same ``df`` object, for call chaining.
    """
    uint8_bounds = np.iinfo(np.uint8)

    # Timestamps
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    else:
        print("Col 'timestamp' missing from df. Skipping ...")

    # Categories
    for col in CATEGORY_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue
        df[col] = df[col].astype("category")

    # UINT8
    for col in UINT8_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue

        values = df[col]
        # Missing values cannot be represented in uint8; the cast would raise.
        if values.isna().any():
            print(f"Col '{col}' contains missing values. Skipping ...")
            continue
        if values.max() > uint8_bounds.max:
            print(f"Col max for '{col}' exceeds np.uint8 max. Skipping ...")
            continue
        # Negative values would silently wrap around (e.g. -1 -> 255) on cast.
        if values.min() < uint8_bounds.min:
            print(f"Col min for '{col}' is below np.uint8 min. Skipping ...")
            continue
        df[col] = values.astype(np.uint8)

    return df

In [None]:
# Read the training parquet file and normalise its column dtypes in one step.
train_dataset = cast_dtypes(
    pd.read_parquet("/kaggle/input/ashrae-iii/train_df.parquet")
)

# Target Encoding

In [None]:
# Preview the first five rows of the loaded frame.
train_dataset.head(n=5)

In [None]:
# NOTE(review): this cell was an orphaned list body (no name, no opening
# bracket), which is a syntax error and stops "Run All". It is restored here as
# a named list so the cell executes; the name is a placeholder and nothing
# else in the visible notebook reads it — confirm the intended use or delete.
CANDIDATE_FEATURES = [
    "building_id",
    "meter_id",
    # "site_id",
    # "square_feet",
    # "floor_count",
    # "air_temperature",
    # "cloud_coverage",
    # "dew_temperature",
    # "precip_depth_1_hr",
    # "sea_level_pressure",
    # "wind_direction_sin",
    # "wind_direction_cos",
    # "wind_speed",
    # "air_temperature_lag_1",
    # "air_temperature_lag_2",
    # "dew_temperature_lag_1",
    # "dew_temperature_lag_2",
    # "sea_level_pressure_lag_1",
    # "sea_level_pressure_lag_2",
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
    "is_weekend",
    # "building_age_years",
    # "building_area_square_feet",
    # "primary_use_id",
    # "relative_humidity",
    # "cold_chill",
    # "apparent_temperature",
    # "heat_index",
]

In [None]:
# Columns handed to the model unchanged ("passthrough" in the column transformer).
# "meter_id" and "is_weekend" are listed here so they bypass the encoder; the
# regressor is told by name to treat those two as categorical.
NUMERICAL_FEATURES = [
    "meter_id",
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
    "is_weekend",
    "square_feet",
    "floor_count",
    "building_age_years",
    "building_area_square_feet",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction_sin",
    "wind_direction_cos",
    "wind_speed",
    "air_temperature_lag_1",
    "air_temperature_lag_2",
    "dew_temperature_lag_1",
    "dew_temperature_lag_2",
    "sea_level_pressure_lag_1",
    "sea_level_pressure_lag_2",
    "relative_humidity",
    "cold_chill",
    "apparent_temperature",
    "heat_index",
]
HIGH_CATEGORICAL_FEATURES = ["building_id"]
LOW_CATEGORICAL_FEATURES = ["primary_use"]
# Every categorical column that goes through the encoder. The cells below
# reference this name, which was previously never defined (NameError on a
# fresh kernel).
# NOTE(review): assumed to be the union of the two lists above — confirm.
CATEGORICAL_FEATURES = HIGH_CATEGORICAL_FEATURES + LOW_CATEGORICAL_FEATURES
# Name of the regression target column.
LABEL_COLUMN = "log_meter_reading"

In [None]:
# Encoders. Only `target_encoder` is wired into the transformer below;
# `ordinal_encoder` is constructed but not used in this cell.
ordinal_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")
target_encoder = TargetEncoder(smooth="auto", target_type="continuous", categories="auto")

# Regressor; two of the passthrough columns are flagged as categorical by name.
model = HistGradientBoostingRegressor(
    categorical_features=["meter_id", "is_weekend"],
    random_state=0,
)

# Numerical columns pass through untouched, categorical ones are target-encoded
# and every other column is dropped. set_output() returns the transformer
# itself, so it is chained; pandas output keeps the column names, which the
# regressor needs in order to resolve `categorical_features` by name.
col_transformer = ColumnTransformer(
    [
        ("numerical", "passthrough", NUMERICAL_FEATURES),
        ("categorical", target_encoder, CATEGORICAL_FEATURES),
    ],
    verbose_feature_names_out=False,
    remainder="drop",
).set_output(transform="pandas")

pipeline = Pipeline([("transformer", col_transformer), ("estimator", model)])

In [None]:
def _iter_split_masks(train_df, validation_periods):
    """Yield one (train_mask, valid_mask) pair of boolean Series per validation window."""
    periods = VALIDATION_PERIODS if validation_periods is None else validation_periods
    timestamps = train_df["timestamp"]
    for val_start, val_end in periods:
        train_mask = timestamps < val_start
        valid_mask = (timestamps >= val_start) & (timestamps < val_end)
        yield train_mask, valid_mask


def train_valid_split(train_df: pd.DataFrame, validation_periods=None):
    """Yield (train_frame, validation_frame) pairs, one per validation window.

    Args:
        train_df: Frame with a ``timestamp`` column.
        validation_periods: Iterable of ``(start, end)`` timestamps. Each window
            is the half-open interval ``[start, end)``; the matching training
            part is every row strictly before ``start``. Defaults to the
            module-level ``VALIDATION_PERIODS``.

    Yields:
        Tuple of two DataFrames selected from ``train_df``.
    """
    for train_mask, valid_mask in _iter_split_masks(train_df, validation_periods):
        yield train_df.loc[train_mask], train_df.loc[valid_mask]


def _train_valid_split(train_df: pd.DataFrame, validation_periods=None):
    """Yield (train_idx, validation_idx) integer arrays, one pair per window.

    The arrays hold *positional* row numbers (usable with ``.iloc`` and as a
    scikit-learn ``cv`` iterable), not index labels, so the result is correct
    even when ``train_df`` does not carry a default 0..n-1 index. For a default
    index the values are the same as the labels.

    Args:
        train_df: Frame with a ``timestamp`` column.
        validation_periods: Same meaning as in ``train_valid_split``; defaults
            to the module-level ``VALIDATION_PERIODS``.

    Yields:
        Tuple of two 1-D NumPy integer arrays.
    """
    for train_mask, valid_mask in _iter_split_masks(train_df, validation_periods):
        yield np.flatnonzero(train_mask.to_numpy()), np.flatnonzero(valid_mask.to_numpy())

In [None]:
# Take the (train, validation) index arrays of the first split.
# next() raises StopIteration when there is no split at all, instead of
# silently leaving train_idx / val_idx unset (or stale from an earlier run)
# the way a for/break loop would.
train_idx, val_idx = next(_train_valid_split(train_dataset))

In [None]:
# Take the first (train, validation) frame pair together with its split number.
# next() raises StopIteration when there is no split at all, instead of
# silently leaving split / train_df / val_df unset (or stale from an earlier
# run) the way a for/break loop would.
split, (train_df, val_df) = next(enumerate(train_valid_split(train_dataset)))

In [None]:
# Feature matrix (numerical + categorical columns) and target for the training fold.
X_train, y_train = (
    train_df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES],
    train_df[LABEL_COLUMN],
)

# Same column selection for the validation fold.
X_val, y_val = (
    val_df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES],
    val_df[LABEL_COLUMN],
)

In [None]:
# Fit on the training fold prepared above (the previous `X` / `y` names were
# never defined in this notebook and only worked with leftover kernel state).
pipeline.fit(X_train, y_train)

In [None]:
# Predictions of the fitted pipeline for the validation fold.
y_hat = pipeline.predict(X=X_val)

In [None]:
# Score the pipeline on the time-based splits. The extra "timestamp" column in
# X is not listed in the column transformer, so it is dropped before the model.
scores = cross_validate(
    estimator=pipeline,
    X=train_dataset[NUMERICAL_FEATURES + CATEGORICAL_FEATURES + ["timestamp"]],
    y=train_dataset[LABEL_COLUMN],
    cv=_train_valid_split(train_dataset),
    scoring=["neg_mean_squared_error"],
    return_indices=True,  # also return the train/test indices of every split
    verbose=1,
)

In [None]:
# Display the dict returned by cross_validate.
scores

In [None]:
# Train/test indices of each split; cross_validate stores them under the
# "indices" key when called with return_indices=True ("inde" raised KeyError).
scores["indices"]