In [None]:
!pip install scikit-learn==1.3.1 -q

In [None]:
import gc

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm

from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder, OrdinalEncoder

In [None]:
# Run-mode switch for the whole notebook.
#   True  -> train one model per meter on the full training set, predict the
#            test set in batches and write submission.csv.
#   False -> run the time-based cross validation and plot the loss curves.
# drop_cols() also removes the "timestamp" column only when this is True.
SUBMISSION_RUN = True

In [None]:
# Constants
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Timestamp bounds of the train and test periods.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")
MIN_TEST_TIMESTAMP = pd.Timestamp("2017-01-01 00:00:00")
MAX_TEST_TIMESTAMP = pd.Timestamp("2018-12-31 23:00:00")

DATA_RESOLUTION = "1h"

# Validation folds: five consecutive one-month windows (Aug-Dec 2016), each a
# (start, end) tuple. train_valid_split() treats start as inclusive and end as
# exclusive, and trains on everything before start.
# An alternative scheme tried earlier used four overlapping two-month windows
# starting on the first of Aug, Sep, Oct and Nov 2016.
_VALIDATION_MONTH_STARTS = [
    pd.Timestamp(f"2016-{month:02d}-01 00:00:00") for month in range(8, 13)
] + [pd.Timestamp("2017-01-01 00:00:00")]
VALIDATION_PERIODS = list(
    zip(_VALIDATION_MONTH_STARTS[:-1], _VALIDATION_MONTH_STARTS[1:])
)

# One model is trained per meter id.
METER_IDS = [0, 1, 2, 3]

## Data loading

In [None]:
# Columns that cast_dtypes() converts to pandas "category" dtype when present.
CATEGORY_COLS = ["building_id", "meter_id", "site_id", "primary_use"]
# Columns that cast_dtypes() downcasts to np.uint8 when present and within range.
UINT8_COLS = ["hour", "day_of_week", "month"]


def drop_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the columns the models do not consume.

    The raw calendar columns ``hour``, ``day_of_week`` and ``month`` are always
    removed. ``timestamp`` is removed as well when ``SUBMISSION_RUN`` is true;
    otherwise it is kept because the cross-validation split filters on it.

    Args:
        df: Frame containing every column that is to be dropped.

    Returns:
        A new DataFrame without those columns.

    Raises:
        KeyError: If any of the columns to drop is absent (``DataFrame.drop``
            is called with its default error handling).
    """
    unwanted = ["hour", "day_of_week", "month"]
    if SUBMISSION_RUN:
        unwanted.append("timestamp")
    return df.drop(columns=unwanted)


def cast_dtypes(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Cast known columns of ``df`` to their working dtypes, in place.

    * ``timestamp`` is parsed with ``pd.to_datetime``.
    * every column in ``CATEGORY_COLS`` becomes pandas ``category``.
    * every column in ``UINT8_COLS`` becomes ``np.uint8``, unless its maximum
      does not fit into that type.

    Columns that are absent from ``df`` are skipped.

    Args:
        df: Frame to convert. It is modified in place.
        verbose: When true, print a message for every skipped column.

    Returns:
        The same ``df`` object, for call chaining.
    """

    def _report_missing(col: str) -> None:
        if verbose:
            print(f"Col '{col}' missing from df. Skipping ...")

    present = df.columns

    # Timestamps
    if "timestamp" in present:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    else:
        _report_missing("timestamp")

    # Categories
    for col in CATEGORY_COLS:
        if col not in present:
            _report_missing(col)
            continue
        df[col] = df[col].astype("category")

    # UINT8
    uint8_limit = np.iinfo(np.uint8).max
    for col in UINT8_COLS:
        if col not in present:
            _report_missing(col)
            continue
        if df[col].max() > uint8_limit:
            # Casting would wrap around, so leave the column untouched.
            if verbose:
                print(f"Col max for '{col}' exceeds np.uint8 max. Skipping ...")
            continue
        df[col] = df[col].astype(np.uint8)

    return df

In [None]:
# Load the preprocessed training frame, drop the columns the models do not use
# and cast the remaining ones to their working dtypes.
train_dataset = (
    pd.read_parquet("/kaggle/input/ashrae-iii/train_df.parquet")
    .pipe(drop_cols)
    .pipe(cast_dtypes, verbose=True)
)

## Cross Validation

In [None]:
CATEGORICAL_FEATURES = ["site_id", "primary_use"]

# Building metadata and raw weather readings.
_BUILDING_AND_WEATHER_FEATURES = [
    "square_feet",
    "floor_count",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction_sin",
    "wind_direction_cos",
    "wind_speed",
]

# Smoothed weather variables: lags 1..5 plus 12- and 24-step rolling means.
_SMOOTHED_SUFFIXES = [f"lag_{lag}" for lag in range(1, 6)] + [
    "rolling_mean_12",
    "rolling_mean_24",
]
_SMOOTHED_FEATURES = [
    f"{variable}_smoothed_{suffix}"
    for variable in ("air_temperature", "dew_temperature", "sea_level_pressure")
    for suffix in _SMOOTHED_SUFFIXES
]

# Cyclical calendar encodings and the weekend flag.
_CALENDAR_FEATURES = [
    f"{unit}_{func}"
    for unit in ("hour", "day_of_week", "month")
    for func in ("sin", "cos")
] + ["is_weekend"]

_DERIVED_FEATURES = [
    "building_age_years",
    "building_area_square_feet",
    "relative_humidity",
    "cold_chill",
    "apparent_temperature",
    "heat_index",
]

# The concatenation order fixes the column order of the feature matrix.
NUMERICAL_FEATURES = (
    _BUILDING_AND_WEATHER_FEATURES
    + _SMOOTHED_FEATURES
    + _CALENDAR_FEATURES
    + _DERIVED_FEATURES
)

# building_id is encoded differently for different meter ids
FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES + ["building_id"]
LABEL = "meter_reading"

In [None]:
# Number of boosting rounds passed to lgbm.train as num_boost_round.
N_ITERATIONS = 2_000

# LightGBM parameters shared by every per-meter model.
TRAIN_PARAMETERS = dict(
    objective="mean_squared_error",
    learning_rate=0.01,
    seed=1,
    max_bin=255,
    num_leaves=(1 << 6) - 1,  # 63
    min_data_in_leaf=1000,
    bagging_fraction=0.75,
    bagging_freq=1,
    feature_fraction=0.8,
    metric=["rmse"],
)

In [None]:
def get_column_transformer(meter_id: int, random_state: int = 1) -> ColumnTransformer:
    """Build the unfitted feature transformer for one meter type.

    ``primary_use`` is always ordinal-encoded to int32 codes, with categories
    unseen during fitting mapped to -1. For meter ids 1 and 2 ``building_id``
    is additionally target-encoded. Every other column is passed through
    unchanged.

    Args:
        meter_id: Meter type the transformer is built for.
        random_state: Seed handed to ``TargetEncoder``. It controls the fold
            shuffling of the cross-fitting done in ``fit_transform`` so that
            the encodings are reproducible between runs.

    Returns:
        A ``ColumnTransformer`` configured to output pandas DataFrames whose
        columns keep their original names.
    """
    ordinal_encoder = OrdinalEncoder(
        categories="auto",
        handle_unknown="use_encoded_value",
        unknown_value=-1,
        dtype=np.int32,
    )
    transformers = [("ordinal_encoder", ordinal_encoder, ["primary_use"])]

    if meter_id in (1, 2):
        target_encoder = TargetEncoder(
            categories="auto",
            target_type="continuous",
            smooth="auto",
            random_state=random_state,
        )
        transformers.append(("target_encoder", target_encoder, ["building_id"]))

    transformer = ColumnTransformer(
        transformers=transformers,
        remainder="passthrough",
        # Keep plain column names (no "<transformer>__" prefix) so FEATURES
        # and the dtype mapping can address the output columns directly.
        verbose_feature_names_out=False,
    )
    transformer.set_output(transform="pandas")
    return transformer


def get_feature_dtypes(meter_id: int):
    """Return the dtype mapping applied to the transformed feature frame.

    ``building_id`` is numeric (``np.float32``) for meter ids 1 and 2, where it
    has been target-encoded, and ``"category"`` for every other meter id.
    ``primary_use`` is always ``"category"``.

    Args:
        meter_id: Meter type whose features are being cast.

    Returns:
        Dict mapping column name to dtype, suitable for ``DataFrame.astype``.
    """
    if meter_id in (1, 2):
        building_id_dtype = np.float32
    else:
        building_id_dtype = "category"
    return {"building_id": building_id_dtype, "primary_use": "category"}


def target_transform(y: pd.Series) -> pd.Series:
    """Map the target to log scale as ``log(1 + y)``.

    Predictions made on this scale are mapped back with ``np.expm1``.

    Args:
        y: Target values.

    Returns:
        The element-wise ``log1p`` of ``y``, with the index of ``y``.
    """
    log_scaled = np.log1p(y)
    return log_scaled

In [None]:
def train_valid_split(train_df: pd.DataFrame, validation_periods=None):
    """Yield expanding-window (train, valid) frames, one pair per period.

    For every ``(val_start, val_end)`` period the validation frame holds the
    rows with ``val_start <= timestamp < val_end`` and the training frame holds
    every row strictly before ``val_start``.

    Args:
        train_df: Frame with a ``timestamp`` column.
        validation_periods: Iterable of ``(val_start, val_end)`` pairs.
            Defaults to the module-level ``VALIDATION_PERIODS``.

    Yields:
        Tuples ``(train_part, valid_part)`` of DataFrames.

    Raises:
        KeyError: If ``train_df`` has no ``timestamp`` column.
    """
    if validation_periods is None:
        validation_periods = VALIDATION_PERIODS

    # Look the column up once instead of three times per period.
    timestamps = train_df["timestamp"]
    for val_start, val_end in validation_periods:
        train_mask = timestamps < val_start
        valid_mask = (timestamps >= val_start) & (timestamps < val_end)
        yield train_df.loc[train_mask], train_df.loc[valid_mask]


def train_predict_score(train_df: pd.DataFrame, valid_df: pd.DataFrame):
    """Train and evaluate one LightGBM model per meter type on a single split.

    For each id in ``METER_IDS`` the rows of that meter are selected from both
    frames, the target is log1p-transformed, the features are run through the
    meter-specific column transformer and a booster is trained for
    ``N_ITERATIONS`` rounds while the train/valid RMSE is recorded.

    Args:
        train_df: Training rows; must contain ``meter_id``, ``LABEL`` and every
            column in ``FEATURES``.
        valid_df: Validation rows with the same columns.

    Returns:
        Dict keyed by meter id. Each value is a dict with ``y_true`` (the
        log1p-transformed validation target), ``y_pred`` (predictions on the
        same log scale) and ``train_valid_loss`` (the history filled in by
        ``lgbm.record_evaluation``).
    """
    results_by_meter_id = {}
    for meter_id in METER_IDS:
        print(f"Running train/predict/score for meter id {meter_id}")

        train_meter_df = train_df[train_df["meter_id"] == meter_id]
        valid_meter_df = valid_df[valid_df["meter_id"] == meter_id]

        X_train, y_train = train_meter_df[FEATURES], train_meter_df[LABEL]
        X_valid, y_valid = valid_meter_df[FEATURES], valid_meter_df[LABEL]

        print(f"Train shape: {X_train.shape, y_train.shape}")
        print(f"Valid shape: {X_valid.shape, y_valid.shape}\n")

        # Target transform
        y_train = target_transform(y_train)
        y_valid = target_transform(y_valid)

        # Feature transforms.
        # The training rows go through fit_transform rather than fit followed
        # by transform: TargetEncoder cross-fits inside fit_transform, so the
        # building_id encoding of a training row is not computed from that
        # row's own target. The validation rows are then encoded with the
        # statistics learned from the whole training part.
        # The cross-fitting folds are shuffled, so the training encodings are
        # only reproducible if the encoder was given a fixed random_state.
        col_transformer = get_column_transformer(meter_id=meter_id)
        X_train = col_transformer.fit_transform(X_train, y_train)
        X_valid = col_transformer.transform(X_valid)

        dtypes = get_feature_dtypes(meter_id)
        X_train = X_train.astype(dtypes)
        X_valid = X_valid.astype(dtypes)

        # Train LightGBM. building_id is a native categorical only for the
        # meters where it has not been target-encoded.
        cat_features = CATEGORICAL_FEATURES + (["building_id"] if meter_id in (0, 3) else [])
        ds_params = {"categorical_feature": cat_features}
        train_ds = lgbm.Dataset(data=X_train, label=y_train, **ds_params)
        valid_ds = lgbm.Dataset(data=X_valid, label=y_valid, **ds_params)

        train_valid_loss = {}
        model = lgbm.train(
            TRAIN_PARAMETERS,
            num_boost_round=N_ITERATIONS,
            train_set=train_ds,
            valid_sets=[train_ds, valid_ds],
            valid_names=["train", "valid"],
            callbacks=[
                lgbm.log_evaluation(period=10),
                lgbm.record_evaluation(train_valid_loss),
            ]
        )

        # Both y_valid and y_hat stay on the log1p scale.
        y_hat = model.predict(X_valid)
        results = {"y_true": y_valid, "y_pred": y_hat, "train_valid_loss": train_valid_loss}
        results_by_meter_id[meter_id] = results

        print("====================================")

    return results_by_meter_id

In [None]:
if not SUBMISSION_RUN:
    # Cross validation: one train/predict/score pass per validation period.
    cv_results = []
    for i, (train_df, valid_df) in enumerate(train_valid_split(train_dataset)):
        print(f"Running cross validation on split {i}\n")
        results = train_predict_score(train_df, valid_df)
        cv_results.append(results)
        print("====================================")

else:
    # Submission run: fit one transformer and one model per meter id on the
    # full training set and keep both for inference.
    print("Training models for submission run.")
    COL_TRANSFORMERS, MODELS = {}, {}
    for meter_id in METER_IDS:
        print(f"Training models for meter id {meter_id}")

        train_meter_df = train_dataset[train_dataset["meter_id"] == meter_id]
        X_train, y_train = train_meter_df[FEATURES], train_meter_df[LABEL]
        print(f"Train shape: {X_train.shape, y_train.shape}")

        # Target and feature transform
        y_train = target_transform(y_train)

        # fit_transform rather than fit followed by transform: TargetEncoder
        # cross-fits inside fit_transform, so the building_id encoding of a
        # training row is not computed from that row's own target. The fitted
        # transformer still holds the full-data encodings that transform()
        # applies to the test set later.
        # The cross-fitting folds are shuffled, so the training encodings are
        # only reproducible if the encoder was given a fixed random_state.
        col_transformer = get_column_transformer(meter_id=meter_id)
        X_train = col_transformer.fit_transform(X_train, y_train)
        dtypes = get_feature_dtypes(meter_id)
        X_train = X_train.astype(dtypes)

        # Train LightGBM. building_id is a native categorical only for the
        # meters where it has not been target-encoded.
        cat_features = CATEGORICAL_FEATURES + (["building_id"] if meter_id in (0, 3) else [])
        ds_params = {"categorical_feature": cat_features}
        train_ds = lgbm.Dataset(data=X_train, label=y_train, **ds_params)
        model = lgbm.train(
            TRAIN_PARAMETERS,
            num_boost_round=N_ITERATIONS,
            train_set=train_ds,
            valid_sets=[train_ds],
            valid_names=["train"],
            callbacks=[lgbm.log_evaluation(period=10)]
        )

        # Save transformer and model for inference
        COL_TRANSFORMERS[meter_id] = col_transformer
        MODELS[meter_id] = model

        print("====================================")

    # Free the training frame before the test set is loaded. Note that this
    # makes the cell non-re-runnable without re-running the data loading cell.
    del train_dataset
    gc.collect()

In [None]:
# Plot training loss
if not SUBMISSION_RUN:
    # One row per meter id; left column = train RMSE, right column = valid RMSE.
    # Every CV split contributes one line to each panel.
    fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(10, 12), sharex=True)

    for split_idx, split_scores in enumerate(cv_results):
        split_label = f"split {split_idx}"
        for meter_id, meter_scores in split_scores.items():
            loss_history = meter_scores["train_valid_loss"]
            for col_idx, dataset_name in enumerate(("train", "valid")):
                ax[meter_id, col_idx].plot(
                    loss_history[dataset_name]["rmse"],
                    label=split_label,
                )

    for meter_id in METER_IDS:
        train_ax, valid_ax = ax[meter_id, 0], ax[meter_id, 1]
        train_ax.legend()
        train_ax.set(ylabel=f"loss, meter_id {meter_id}")
        valid_ax.legend()

    for col_idx, dataset_name in enumerate(("train", "valid")):
        ax[0, col_idx].set(title=dataset_name)

    fig.tight_layout()
    plt.savefig("loss_curves.png", dpi=300)

## Submission

In [None]:
# Make predictions in batches
def load_test_df_in_batches(pq_file: str, batch_size: int = 200_000):
    """Lazily yield the contents of a parquet file as pandas DataFrames.

    Args:
        pq_file: Path of the parquet file to read.
        batch_size: Forwarded to ``ParquetFile.iter_batches``.

    Yields:
        One DataFrame per record batch, with a tqdm progress bar wrapped
        around the iteration.
    """
    record_batches = pq.ParquetFile(pq_file).iter_batches(batch_size)
    for record_batch in tqdm(record_batches):
        yield record_batch.to_pandas()

In [None]:
def kwh_to_kbtu(df: pd.DataFrame) -> pd.DataFrame:
    """Multiply the readings of building 0 / meter 0 by 3.4118, in place.

    Args:
        df: Frame with ``building_id``, ``meter_id`` and ``meter_reading``
            columns. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # NOTE(review): only rows with building_id == 0 are rescaled. Presumably
    # this mirrors a kBTU -> kWh conversion applied when the training parquet
    # was built - confirm that both conversions select the same rows.
    needs_conversion = (df["meter_id"] == 0) & (df["building_id"] == 0)
    converted = df.loc[needs_conversion, "meter_reading"] * 3.4118
    df.loc[needs_conversion, "meter_reading"] = converted
    return df

In [None]:
if SUBMISSION_RUN:
    # Stream the test set batch by batch, predict each meter type with its own
    # transformer/model pair and collect the per-batch predictions.
    test_pq_file = "/kaggle/input/ashrae-iii/test_df.parquet"
    prediction_dfs = []

    for test_df in load_test_df_in_batches(test_pq_file):
        test_df = cast_dtypes(drop_cols(test_df))

        for meter_id in METER_IDS:
            test_meter_df = test_df[test_df["meter_id"] == meter_id]
            if test_meter_df.empty:
                # This batch holds no rows for the current meter type.
                continue

            transformer = COL_TRANSFORMERS[meter_id]
            dtypes = get_feature_dtypes(meter_id)
            X_test = transformer.transform(test_meter_df[FEATURES]).astype(dtypes)

            # The models were trained on log1p(target): invert with expm1 and
            # floor the result at zero.
            model = MODELS[meter_id]
            y_hat = np.clip(
                np.expm1(model.predict(X_test)), a_min=0.0, a_max=None
            ).astype(np.float32)

            prediction_df = test_meter_df[["row_id", "building_id", "meter_id"]].copy()
            prediction_df["meter_reading"] = y_hat
            prediction_dfs.append(prediction_df)

    # Submit
    submission_df = kwh_to_kbtu(pd.concat(prediction_dfs, axis=0, ignore_index=True))
    submission_df = (
        submission_df[["row_id", "meter_reading"]]
        .sort_values("row_id")
        .reset_index(drop=True)
    )
    submission_df.to_csv("submission.csv", index=False)