In [None]:
import gc

import numpy as np
import pandas as pd
from matplotlib import cm
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm

from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

from tqdm import tqdm

In [None]:
# Constants
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# First / last timestamp of the training period.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")

# First / last timestamp of the test period.
MIN_TEST_TIMESTAMP = pd.Timestamp("2017-01-01 00:00:00")
MAX_TEST_TIMESTAMP = pd.Timestamp("2018-12-31 23:00:00")

# Time step of the data, written as a frequency string.
DATA_RESOLUTION = "1h"

## Data Loading

In [None]:
# Columns that `cast_dtypes` converts to the pandas "category" dtype.
CATEGORY_COLS = [
    "building_id",
    "meter_id",
    "site_id",
    "primary_use",
]
# Columns that `cast_dtypes` casts to np.uint8 when their values fit.
UINT8_COLS = [
    "hour",
    "day_of_week",
    "month",
]


def drop_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the ``hour``, ``day_of_week`` and ``month`` columns.

    The input frame is left untouched. ``DataFrame.drop`` raises ``KeyError``
    when any of the three columns is absent.
    """
    return df.drop(columns=["hour", "day_of_week", "month"])


def cast_dtypes(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Cast known columns of ``df`` to more specific dtypes, modifying ``df`` in place.

    * ``timestamp`` is parsed with ``pd.to_datetime``.
    * Every column listed in ``CATEGORY_COLS`` becomes ``category``.
    * Every column listed in ``UINT8_COLS`` becomes ``np.uint8``, but only when
      all of its values lie inside the uint8 range; otherwise it is left as is.

    Columns that are not present in ``df`` are skipped.

    Args:
        df: Frame to convert. It is mutated and also returned.
        verbose: When True, print one message per skipped column.

    Returns:
        The same ``df`` object, with the converted columns.
    """

    def _report(message: str) -> None:
        # Single place that honours `verbose`, so every skip message behaves the same.
        if verbose:
            print(message)

    # Timestamps
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    else:
        _report("Col 'timestamp' missing from df. Skipping ...")

    # Categories
    for col in CATEGORY_COLS:
        if col not in df.columns:
            _report(f"Col '{col}' missing from df. Skipping ...")
            continue
        df[col] = df[col].astype("category")

    # UINT8
    uint8_info = np.iinfo(np.uint8)
    for col in UINT8_COLS:
        if col not in df.columns:
            _report(f"Col '{col}' missing from df. Skipping ...")
            continue
        if df[col].max() > uint8_info.max:
            _report(f"Col max for '{col}' exceeds np.uint8 max. Skipping ...")
            continue
        if df[col].min() < uint8_info.min:
            # A negative value would wrap around on the cast (e.g. -1 -> 255).
            _report(f"Col min for '{col}' is below np.uint8 min. Skipping ...")
            continue
        df[col] = df[col].astype(np.uint8)

    return df

In [None]:
# Read the training frame, drop the raw calendar columns, then cast the remaining columns.
train_dataset = (
    pd.read_parquet("/kaggle/input/ashrae-iii/train_df.parquet")
    .pipe(drop_cols)
    .pipe(cast_dtypes)
)

## Model Training

In [None]:
# Model inputs that are categorical.
CATEGORICAL_FEATURES = ["building_id", "meter_id", "site_id", "primary_use"]

# Each of these variables has five lag columns and two rolling-mean columns.
_LAGGED_VARIABLES = ["air_temperature", "dew_temperature", "sea_level_pressure"]
_LAGGED_SUFFIXES = [f"lag_{step}" for step in range(1, 6)] + [
    "rolling_mean_12",
    "rolling_mean_24",
]

# Model inputs that are numerical. The order is significant: it is the column order of FEATURES.
NUMERICAL_FEATURES = [
    "square_feet",
    "floor_count",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction_sin",
    "wind_direction_cos",
    "wind_speed",
    # <variable>_lag_1..5, <variable>_rolling_mean_12, <variable>_rolling_mean_24
    *(
        f"{variable}_{suffix}"
        for variable in _LAGGED_VARIABLES
        for suffix in _LAGGED_SUFFIXES
    ),
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
    "is_weekend",
    "building_age_years",
    "building_area_square_feet",
    "relative_humidity",
    "cold_chill",
    "apparent_temperature",
    "heat_index",
]
FEATURES = NUMERICAL_FEATURES + CATEGORICAL_FEATURES

# Name of the target column.
LABEL = "meter_reading"

In [None]:
# Number of boosting rounds handed to `lgbm.train`.
N_ITERATIONS = 10

# Parameter dict handed to `lgbm.train`.
TRAIN_PARAMETERS = dict(
    objective="mean_squared_error",
    learning_rate=0.01,
    seed=1,
    max_bin=255,
    num_leaves=(1 << 6) - 1,  # 63
    min_data_in_leaf=50,
    metric=["rmse"],
)
# NOTE(review): none of the cells visible here reads this dict; confirm whether it
# was meant to be unpacked into the `lgbm.Dataset(...)` calls.
DATASET_PARAMETERS = dict(categorical_feature=CATEGORICAL_FEATURES)

In [None]:
def get_column_transformer() -> ColumnTransformer:
    """Build the (unfitted) feature transformer.

    * Columns in ``NUMERICAL_FEATURES`` are passed through unchanged.
    * ``primary_use`` is ordinal-encoded as ``int32``; categories not seen
      during ``fit`` are encoded as ``-1``.
    * All remaining columns are passed through as well.

    The transformer is configured to return pandas DataFrames, with output
    column names that carry no transformer prefix.
    """
    primary_use_encoder = OrdinalEncoder(
        categories="auto",
        handle_unknown="use_encoded_value",
        unknown_value=-1,
        dtype=np.int32,
    )
    steps = [
        ("numerical_features", "passthrough", NUMERICAL_FEATURES),
        ("ordinal_encoder", primary_use_encoder, ["primary_use"]),
    ]
    return ColumnTransformer(
        transformers=steps,
        remainder="passthrough",
        verbose_feature_names_out=False,
    ).set_output(transform="pandas")


def target_transform(y: pd.Series) -> pd.Series:
    """Apply ``log(1 + y)`` element-wise to the target."""
    log_target = np.log1p(y)
    return log_target

In [None]:
# Validation window [val_start, val_end): the last month of the training period.
val_start, val_end = pd.Timestamp("2016-12-01 00:00:00"), pd.Timestamp("2017-01-01 00:00:00")

# Train / val split: everything before the window trains, the window itself validates.
train_df = train_dataset[train_dataset["timestamp"] < val_start]
valid_df = train_dataset[train_dataset["timestamp"].between(val_start, val_end, inclusive="left")]

X_train, y_train = train_df[FEATURES], train_df[LABEL]
X_valid, y_valid = valid_df[FEATURES], valid_df[LABEL]

# Feature / target transforms
y_train = target_transform(y_train)
y_valid = target_transform(y_valid)

# Fit on the training rows only, then apply the same fitted transformer to both splits.
col_transformer = get_column_transformer().fit(X_train, y_train)
X_train = col_transformer.transform(X_train).astype({"primary_use": "category"})
X_valid = col_transformer.transform(X_valid).astype({"primary_use": "category"})

train_ds = lgbm.Dataset(data=X_train, label=y_train)
valid_ds = lgbm.Dataset(data=X_valid, label=y_valid)

# Train Lightgbm
eval_results = {}  # filled by the record_evaluation callback
model = lgbm.train(
    TRAIN_PARAMETERS,
    num_boost_round=N_ITERATIONS,
    train_set=train_ds,
    valid_sets=[train_ds, valid_ds],
    valid_names=["train", "valid"],
    callbacks=[
        lgbm.log_evaluation(period=10),
        lgbm.record_evaluation(eval_results),
    ]
)

In [None]:
# Predict on the validation features. The model was trained on the transformed
# target, so `y_hat` is on the same (log1p) scale as `y_valid`.
y_hat = model.predict(X_valid)

# Attach truth and prediction to a fresh copy of the validation rows.
valid_df = valid_df.assign(y_true=y_valid, y_hat=y_hat)

## Evaluate

In [None]:
# Per-row squared error between prediction and (transformed) target, then index the
# rows by (meter, site, building) so the plotting cells can slice with `.loc`.
# Re-running this cell alone fails: the key columns have already moved into the index.
valid_df = (
    valid_df
    .assign(squared_error=(y_hat - y_valid) ** 2)
    .set_index(["meter_id", "site_id", "building_id"])
)

### Predictions by site, meter, and building

In [None]:
# Plot predictions for individual buildings
meter_id = 2
site_id = 14
# The slice's index repeats each building id once per row, so de-duplicate it
# before sampling; otherwise buildings are drawn in proportion to their row count.
building_ids = valid_df.loc[(meter_id, site_id)].index.unique().to_numpy()

n_plots = 5
# Never ask for more distinct buildings than the slice contains.
n_buildings_per_plot = min(3, len(building_ids))

plot_seed = 1  # fixed seed: the same buildings are drawn on every run
rng = np.random.default_rng(plot_seed)

# squeeze=False keeps `ax` indexable even if n_plots is set to 1.
fig, ax = plt.subplots(1, n_plots, figsize=(20, 3.5), squeeze=False)
ax = ax.ravel()
colors = cm.copper(np.linspace(0, 1, n_buildings_per_plot))
for plot_idx in range(n_plots):
    # replace=False: a panel never shows the same building twice.
    building_ids_to_plot = rng.choice(building_ids, size=n_buildings_per_plot, replace=False)
    for b_idx, building_id in enumerate(building_ids_to_plot):
        msb_df = valid_df.loc[(meter_id, site_id, building_id)].sort_values("timestamp")
        # Solid line: true (log1p-transformed) reading.
        ax[plot_idx].plot(
            msb_df["timestamp"].values,
            msb_df["y_true"].values,
            label=f"B.ID: {building_id}",
            color=colors[b_idx],
        )
        # Dashed line, same colour: model prediction for the same building.
        ax[plot_idx].plot(
            msb_df["timestamp"].values,
            msb_df["y_hat"].values,
            color=colors[b_idx],
            ls="--",
        )

    ax[plot_idx].legend(fontsize="x-small")
    for tick in ax[plot_idx].get_xticklabels():
        tick.set_rotation(45)

ax[0].set_ylabel("log1p(meter_reading)")
fig.suptitle(f"Meter {meter_id}, site {site_id}: actual (solid) vs predicted (dashed)")
fig.tight_layout();

In [None]:
# Site ids to plot, keyed by meter id.
site_ids_by_meter_type = {
    0: [2, 3, 14],
}

In [None]:
# Plot heatmaps of the squared error: one panel per site, one row per building,
# one column per timestamp.
meter_id = 0
sites_to_plot = site_ids_by_meter_type[meter_id]
# One x label every 72 timestamp columns (3 days if every hourly timestamp is present).
xtick_step = 72

# squeeze=False keeps `ax` indexable even when only one site is plotted.
fig, ax = plt.subplots(1, len(sites_to_plot), figsize=(5 * len(sites_to_plot), 4), squeeze=False)
ax = ax.ravel()

for i, site_id in enumerate(sites_to_plot):
    site_meter_df = (
        valid_df.loc[(meter_id, site_id)]
        .reset_index()
        .sort_values(["building_id", "timestamp"])
        .pivot(index="building_id", columns="timestamp", values="squared_error")
    )

    ax[i] = sns.heatmap(site_meter_df, ax=ax[i])
    ax[i].set_title(f"Squared error: meter {meter_id}, site {site_id}")

    # Format x-axis labels
    timestamps = site_meter_df.columns
    xtick_locs = range(0, len(timestamps), xtick_step)
    xtick_labels = [pd.to_datetime(timestamps[loc]).strftime("%Y-%m-%d") for loc in xtick_locs]

    ax[i].set_xticks(xtick_locs)
    ax[i].set_xticklabels(xtick_labels, rotation=45, ha="center")
    ax[i].set_xlabel("")

fig.tight_layout();

In [None]:
# Heatmaps of the squared error bucketed into three levels per site:
#   1: below the site's 75th percentile
#   2: from the 75th up to (not including) the 90th percentile
#   3: at or above the 90th percentile
meter_id = 0
sites_to_plot = site_ids_by_meter_type[meter_id]
# One x label every 72 timestamp columns (3 days if every hourly timestamp is present).
xtick_step = 72

# Discrete colour scale for the three levels. It is defined in this cell so the
# cell runs on a fresh kernel without relying on a later cell.
error_levels = [1, 2, 3]
error_level_labels = ["< q75", "q75 - q90", ">= q90"]
error_cmap = mcolors.ListedColormap(["#1f78b4", "#33a02c", "#e31a1c"])  # Blue, Green, Red
# Bin edges [1, 2, 3, 4]: each level falls into its own half-open bin [level, level + 1).
error_norm = mcolors.BoundaryNorm(
    boundaries=error_levels + [error_levels[-1] + 1],
    ncolors=error_cmap.N,
)

# squeeze=False keeps `ax` indexable even when only one site is plotted.
fig, ax = plt.subplots(1, len(sites_to_plot), figsize=(5 * len(sites_to_plot), 4), squeeze=False)
ax = ax.ravel()

for i, site_id in enumerate(sites_to_plot):
    site_meter_df = (
        valid_df.loc[(meter_id, site_id)]
        .reset_index()
        .sort_values(["building_id", "timestamp"])
    )

    # Get error thresholds and scales
    error_threshold_q75, error_threshold_q90 = np.percentile(
        site_meter_df["squared_error"], q=[75, 90]
    )
    # np.digitize returns 0 / 1 / 2 for errors below q75 / in [q75, q90) / at or
    # above q90; the +1 shifts that onto the 1..3 levels.
    site_meter_df["error_scale"] = (
        np.digitize(site_meter_df["squared_error"], [error_threshold_q75, error_threshold_q90]) + 1
    )

    site_meter_df = site_meter_df.pivot(index="building_id", columns="timestamp", values="error_scale")

    ax[i] = sns.heatmap(
        site_meter_df, ax=ax[i], cmap=error_cmap, norm=error_norm, cbar=True, linewidths=0.5
    )
    ax[i].set_title(f"Error level: meter {meter_id}, site {site_id}")
    cbar = ax[i].collections[0].colorbar
    # Put each tick in the middle of its colour bin.
    cbar.set_ticks([level + 0.5 for level in error_levels])
    cbar.set_ticklabels(error_level_labels)

    # Format x-axis labels
    timestamps = site_meter_df.columns
    xtick_locs = range(0, len(timestamps), xtick_step)
    xtick_labels = [pd.to_datetime(timestamps[loc]).strftime("%Y-%m-%d") for loc in xtick_locs]

    ax[i].set_xticks(xtick_locs)
    ax[i].set_xticklabels(xtick_labels, rotation=45, ha="center")
    ax[i].set_xlabel("")

fig.tight_layout();

In [None]:
# Discrete levels to colour. These match the 1 / 2 / 3 `error_scale` values plotted
# in the thresholded heatmap; with [0, 1, 2] the top level fell outside the last
# bin and shared its colour with level 2.
# Any cell that reads `cmap`, `norm` or `unique_values` must run after this one.
unique_values = [1, 2, 3]
# Define a custom discrete colormap with three colors
colors = ["#1f78b4", "#33a02c", "#e31a1c"]  # Blue, Green, Red
cmap = mcolors.ListedColormap(colors)

# Create a normalization that maps unique values to colors:
# bin edges [1, 2, 3, 4] give every level its own half-open bin [level, level + 1).
bounds = unique_values + [unique_values[-1] + 1]  # Add upper bound
norm = mcolors.BoundaryNorm(boundaries=bounds, ncolors=len(colors))