In [None]:
!pip install -q scikit-learn==1.3.1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder

In [None]:
# Constants

# Input directory on Kaggle (presumably the ASHRAE competition dataset -- confirm).
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# First / last timestamp of the training period.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")

# First / last timestamp of the test period.
MIN_TEST_TIMESTAMP = pd.Timestamp("2017-01-01 00:00:00")
MAX_TEST_TIMESTAMP = pd.Timestamp("2018-12-31 23:00:00")

# Spacing between consecutive readings (looks like a pandas frequency alias -- confirm).
DATA_RESOLUTION = "1h"

# Data Loading

In [None]:
# Columns stored as pandas categoricals.
CATEGORY_COLS = ["building_id", "meter_id", "site_id", "primary_use", "primary_use_id"]
# Columns down-cast to np.uint8 when every value fits.
UINT8_COLS = ["hour", "day_of_week", "month"]


def cast_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the known columns of ``df`` to compact dtypes.

    * ``timestamp`` is parsed with ``pd.to_datetime``.
    * Every column listed in ``CATEGORY_COLS`` becomes a ``category``.
    * Every column listed in ``UINT8_COLS`` becomes ``np.uint8``, but only if it
      has no missing values and all values lie inside the uint8 range.

    A column that is absent, or that cannot be cast safely, is left untouched
    and a message is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, for convenient re-assignment.
    """

    # Timestamps
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    else:
        print("Col 'timestamp' missing from df. Skipping ...")

    # Categories
    for col in CATEGORY_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue
        df[col] = df[col].astype("category")

    # UINT8
    uint8_bounds = np.iinfo(np.uint8)
    for col in UINT8_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue

        values = df[col]
        # Missing values cannot be represented in an unsigned integer column.
        if values.isna().any():
            print(f"Col '{col}' contains missing values. Skipping ...")
            continue
        if values.max() > uint8_bounds.max:
            print(f"Col max for '{col}' exceeds np.uint8 max. Skipping ...")
            continue
        # Negative values would silently wrap around (e.g. -1 -> 255) when cast.
        if values.min() < uint8_bounds.min:
            print(f"Col min for '{col}' is below np.uint8 min. Skipping ...")
            continue
        df[col] = values.astype(np.uint8)

    return df

In [None]:
# Read the training parquet file and immediately run it through cast_dtypes.
train_dataset = cast_dtypes(
    pd.read_parquet("/kaggle/input/ashrae-iii/train_df.parquet")
)

# Target Encoding

In [None]:
# Candidate hold-out windows as (start, end) pairs; the start is inclusive and
# the end exclusive in the masks built below.
VALIDATION_PERIODS = [
    (pd.Timestamp("2016-10-01 00:00:00"), pd.Timestamp("2016-11-01 00:00:00")),
    (pd.Timestamp("2016-11-01 00:00:00"), pd.Timestamp("2016-12-01 00:00:00")),
    (pd.Timestamp("2016-12-01 00:00:00"), pd.Timestamp("2017-01-01 00:00:00")),
]

# Only the first window is used here.
val_start, val_end = VALIDATION_PERIODS[0]

# Rows strictly before the window are used for fitting ...
train_mask = train_dataset["timestamp"] < val_start
# ... and rows inside [val_start, val_end) for validation.
val_mask = train_dataset["timestamp"].between(val_start, val_end, inclusive="left")

train_df = train_dataset[train_mask]
val_df = train_dataset[val_mask]

In [None]:
# Keep only the columns needed to target-encode building_id separately for each meter.
cols = ["building_id", "meter_id", "log_meter_reading"]
train_df = train_df[cols]
val_df = val_df[cols]

# Seed for the shuffled cross-fitting folds inside TargetEncoder.fit_transform,
# so the training encodings are identical on every run.
ENCODER_RANDOM_STATE = 42

train_encodings, val_encodings = [], []
for meter_id in train_df["meter_id"].unique():

    train_meter_df = train_df[train_df["meter_id"] == meter_id]
    X_train, y_train = train_meter_df[["building_id"]], train_meter_df["log_meter_reading"]

    val_meter_df = val_df[val_df["meter_id"] == meter_id]
    X_val, y_val = val_meter_df[["building_id"]], val_meter_df["log_meter_reading"]

    encoder = TargetEncoder(
        categories="auto",
        target_type="continuous",
        smooth="auto",
        random_state=ENCODER_RANDOM_STATE,
    )

    # fit_transform cross-fits: each training row is encoded from folds that do not
    # contain it. fit(...).transform(...) on the same rows would instead encode every
    # row with a mean that includes its own target, leaking the target into the feature.
    X_train_trans = encoder.fit_transform(X_train, y_train)
    X_train_trans = pd.DataFrame(X_train_trans, columns=["building_id_transform"])
    # Both frames get a fresh 0..n-1 index so the column-wise concat lines up row by row.
    X_train_trans = pd.concat([train_meter_df.reset_index(drop=True), X_train_trans], axis=1)
    train_encodings.append(X_train_trans)

    # Held-out rows are encoded with the statistics learned on the full training partition.
    X_val_trans = encoder.transform(X_val)
    X_val_trans = pd.DataFrame(X_val_trans, columns=["building_id_transform"])
    X_val_trans = pd.concat([val_meter_df.reset_index(drop=True), X_val_trans], axis=1)
    val_encodings.append(X_val_trans)

# Stack the per-meter results into one frame each.
train_encodings = pd.concat(train_encodings, ignore_index=True)
val_encodings = pd.concat(val_encodings, ignore_index=True)

In [None]:
# Training rows: encoded building_id (x) against log_meter_reading (y), one panel per meter.
N_PLOT_POINTS = 750    # upper bound on the points drawn in each panel
PLOT_SAMPLE_SEED = 0   # fixed seed so the figure is the same on every run

fig, ax = plt.subplots(1, 4, figsize=(15, 3.5), sharey=True)
for meter_id in train_df["meter_id"].unique():
    meter_df = train_encodings[train_encodings["meter_id"] == meter_id]
    # Never ask for more rows than exist: sample() raises when n exceeds the frame length.
    meter_df = meter_df.sample(
        n=min(N_PLOT_POINTS, len(meter_df)),
        random_state=PLOT_SAMPLE_SEED,
    )
    # NOTE: the meter id doubles as the subplot index, so ids must be integers in 0..3.
    ax[meter_id].scatter(
        x=meter_df["building_id_transform"].values,
        y=meter_df["log_meter_reading"].values,
        alpha=0.2
    )
    ax[meter_id].set(
        title=f"Meter id: {meter_id}",
        xlabel="building_id_encoding"
    )
ax[0].set(ylabel="log_meter_reading")

fig.tight_layout()

In [None]:
# Validation rows: encoded building_id (x) against log_meter_reading (y), one panel per meter.
N_PLOT_POINTS = 750    # upper bound on the points drawn in each panel
PLOT_SAMPLE_SEED = 0   # fixed seed so the figure is the same on every run

fig, ax = plt.subplots(1, 4, figsize=(15, 3.5), sharey=True)
for meter_id in val_df["meter_id"].unique():
    meter_df = val_encodings[val_encodings["meter_id"] == meter_id]
    # Never ask for more rows than exist: sample() raises when n exceeds the frame length
    # (the selection can even be empty if this meter has no rows in val_encodings).
    meter_df = meter_df.sample(
        n=min(N_PLOT_POINTS, len(meter_df)),
        random_state=PLOT_SAMPLE_SEED,
    )
    # NOTE: the meter id doubles as the subplot index, so ids must be integers in 0..3.
    ax[meter_id].scatter(
        x=meter_df["building_id_transform"].values,
        y=meter_df["log_meter_reading"].values,
        alpha=0.2
    )
    ax[meter_id].set(
        title=f"Meter id: {meter_id}",
        xlabel="building_id_encoding"
    )
ax[0].set(ylabel="log_meter_reading")

fig.tight_layout()