In [None]:
import gc

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

import lightgbm as lgbm

from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

from tqdm import tqdm

In [None]:
# Run-mode switch for the whole notebook.
#   False -> run the cross-validation branch and plot the per-split loss curves.
#   True  -> train on the full training frame, predict the test file in batches
#            and write submission.csv ("timestamp" is dropped on load in this mode).
SUBMISSION_RUN = False

In [None]:
# Constants
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# First and last timestamps of the train and test ranges.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00")
MIN_TEST_TIMESTAMP = pd.Timestamp("2017-01-01")
MAX_TEST_TIMESTAMP = pd.Timestamp("2018-12-31 23:00")

DATA_RESOLUTION = "1h"

# Validation windows are (start, end) pairs built from consecutive month
# boundaries: October, November and December 2016.
_VALIDATION_BOUNDARIES = [
    pd.Timestamp(boundary)
    for boundary in ("2016-10-01", "2016-11-01", "2016-12-01", "2017-01-01")
]
VALIDATION_PERIODS = list(
    zip(_VALIDATION_BOUNDARIES[:-1], _VALIDATION_BOUNDARIES[1:])
)

## Data loading

In [None]:
# Columns that cast_dtypes converts to the pandas "category" dtype.
CATEGORY_COLS = ["building_id", "meter_id", "site_id", "primary_use"]
# Columns that cast_dtypes downcasts to np.uint8 when present and in range.
UINT8_COLS = ["hour", "day_of_week", "month"]


def drop_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the raw calendar columns.

    ``"hour"``, ``"day_of_week"`` and ``"month"`` are always removed. When the
    module-level ``SUBMISSION_RUN`` flag is set, ``"timestamp"`` is removed too.
    """
    unwanted = ["hour", "day_of_week", "month"]
    if SUBMISSION_RUN:
        unwanted.append("timestamp")
    return df.drop(columns=unwanted)


def cast_dtypes(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Cast known columns of ``df`` to compact dtypes, skipping absent columns.

    * ``"timestamp"`` is parsed with ``pd.to_datetime``.
    * Every column in ``CATEGORY_COLS`` becomes ``category``.
    * Every column in ``UINT8_COLS`` becomes ``np.uint8``, but only when all of
      its values lie inside the uint8 range; otherwise it is left untouched.

    The frame is modified in place and also returned.

    Args:
        df: Frame to convert.
        verbose: When True, report each expected column that is absent.
            Out-of-range warnings for uint8 candidates are printed regardless.

    Returns:
        The same ``df`` object, with the converted columns.
    """
    # Timestamps
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    elif verbose:
        print("Col 'timestamp' missing from df. Skipping ...")

    # Categories
    for col in CATEGORY_COLS:
        if col in df.columns:
            df[col] = df[col].astype("category")
        elif verbose:
            print(f"Col '{col}' missing from df. Skipping ...")

    # UINT8
    uint8_limits = np.iinfo(np.uint8)
    for col in UINT8_COLS:
        if col not in df.columns:
            if verbose:
                print(f"Col '{col}' missing from df. Skipping ...")
            continue
        if df[col].max() > uint8_limits.max:
            print(f"Col max for '{col}' exceeds np.uint8 max. Skipping ...")
            continue
        # An integer -> uint8 cast wraps negative values around instead of
        # raising, so the lower bound has to be guarded as well.
        if df[col].min() < uint8_limits.min:
            print(f"Col min for '{col}' is below np.uint8 min. Skipping ...")
            continue
        df[col] = df[col].astype(np.uint8)

    return df

In [None]:
# Read the training frame, remove the columns handled by drop_cols, then
# compact the dtypes with cast_dtypes.
train_dataset = (
    pd.read_parquet("/kaggle/input/ashrae-iii/train_df.parquet")
    .pipe(drop_cols)
    .pipe(cast_dtypes)
)

## Cross-validation and model training

In [None]:
CATEGORICAL_FEATURES = ["building_id", "meter_id", "site_id", "primary_use"]

# Weather variables that also appear as lag and rolling-mean features.
_LAGGED_WEATHER_VARS = ("air_temperature", "dew_temperature", "sea_level_pressure")
_LAG_STEPS = range(1, 6)
_ROLLING_WINDOWS = (12, 24)

# Building metadata and raw weather readings.
NUMERICAL_FEATURES = [
    "square_feet",
    "floor_count",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction_sin",
    "wind_direction_cos",
    "wind_speed",
]
# Per weather variable: lag_1 .. lag_5 followed by rolling_mean_12 / _24.
for _var in _LAGGED_WEATHER_VARS:
    NUMERICAL_FEATURES += [f"{_var}_lag_{_step}" for _step in _LAG_STEPS]
    NUMERICAL_FEATURES += [
        f"{_var}_rolling_mean_{_window}" for _window in _ROLLING_WINDOWS
    ]
# Cyclical calendar encodings and derived building / comfort features.
NUMERICAL_FEATURES += [
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
    "is_weekend",
    "building_age_years",
    "building_area_square_feet",
    "relative_humidity",
    "cold_chill",
    "apparent_temperature",
    "heat_index",
]

# Column order fed to the model: numerical features first, then categorical.
FEATURES = NUMERICAL_FEATURES + CATEGORICAL_FEATURES

LABEL = "meter_reading"

In [None]:
# Number of boosting rounds passed to lgbm.train as num_boost_round.
N_ITERATIONS = 2000

# Parameter dict passed to lgbm.train.
TRAIN_PARAMETERS = dict(
    objective="mean_squared_error",
    learning_rate=0.01,
    seed=1,
    max_bin=255,
    num_leaves=63,  # 2 ** 6 - 1
    min_data_in_leaf=50,
    metric=["rmse"],
)

# NOTE(review): not referenced by any cell visible here; the lgbm.Dataset
# objects are built without it. Confirm whether it is still needed.
DATASET_PARAMETERS = dict(categorical_feature=CATEGORICAL_FEATURES)

In [None]:
def train_valid_split(train_df: pd.DataFrame, validation_periods=None):
    """Yield one (train, validation) pair of frames per validation period.

    For each ``(start, end)`` pair the validation frame holds the rows with
    ``start <= timestamp < end`` and the training frame holds every row with
    ``timestamp < start``, so later periods train on more history.

    Args:
        train_df: Frame with a ``"timestamp"`` column comparable to the
            period bounds.
        validation_periods: Iterable of ``(start, end)`` pairs. Defaults to
            the module-level ``VALIDATION_PERIODS``.

    Yields:
        ``(train_part, valid_part)`` tuples of row subsets of ``train_df``.
    """
    if validation_periods is None:
        validation_periods = VALIDATION_PERIODS

    timestamps = train_df["timestamp"]
    for val_start, val_end in validation_periods:
        train_mask = timestamps < val_start
        valid_mask = (timestamps >= val_start) & (timestamps < val_end)
        yield train_df.loc[train_mask], train_df.loc[valid_mask]


def get_column_transformer() -> ColumnTransformer:
    """Build an unfitted ``ColumnTransformer`` for the model's feature frame.

    The columns in ``NUMERICAL_FEATURES`` pass through unchanged,
    ``"primary_use"`` is ordinal-encoded to ``int32`` (categories unseen at fit
    time map to ``-1``), and every remaining column is passed through as well.
    The transformer is configured to return pandas DataFrames with the
    original, unprefixed column names.
    """
    primary_use_encoder = OrdinalEncoder(
        categories="auto",
        handle_unknown="use_encoded_value",
        unknown_value=-1,
        dtype=np.int32,
    )
    steps = [
        ("numerical_features", "passthrough", NUMERICAL_FEATURES),
        ("ordinal_encoder", primary_use_encoder, ["primary_use"]),
    ]
    # set_output returns the estimator itself, so the call can be chained.
    return ColumnTransformer(
        transformers=steps,
        remainder="passthrough",
        verbose_feature_names_out=False,
    ).set_output(transform="pandas")


def target_transform(y: pd.Series) -> pd.Series:
    """Return ``log(1 + y)`` element-wise for the given target values."""
    log_target = np.log1p(y)
    return log_target

In [None]:
if SUBMISSION_RUN:
    print("Training model for submission ...\n")

    # Submission mode: fit the encoder and the model on the full training frame.
    X_train = train_dataset[FEATURES]
    y_train = target_transform(train_dataset[LABEL])

    transformer = get_column_transformer()
    transformer.fit(X_train, y_train)
    X_train = transformer.transform(X_train).astype({"primary_use": "category"})

    train_ds = lgbm.Dataset(data=X_train, label=y_train)

    # Only the training metric is tracked, since no rows are held out here.
    eval_results = {}
    model = lgbm.train(
        TRAIN_PARAMETERS,
        train_set=train_ds,
        num_boost_round=N_ITERATIONS,
        valid_sets=[train_ds],
        valid_names=["train"],
        callbacks=[
            lgbm.log_evaluation(period=10),
            lgbm.record_evaluation(eval_results),
        ],
    )

    # Drop the references to the large training frames before prediction.
    del train_dataset
    del X_train, y_train
    gc.collect()

else:
    print("Running cross validation ...\n")

    # One entry per split: the metric history captured by record_evaluation.
    cv_results = []
    for split_no, (train_df, valid_df) in enumerate(train_valid_split(train_dataset)):

        print(f"Split {split_no}")

        # The model is trained and evaluated on the log1p-transformed target.
        X_train = train_df[FEATURES]
        X_valid = valid_df[FEATURES]
        y_train = target_transform(train_df[LABEL])
        y_valid = target_transform(valid_df[LABEL])

        # The encoder is fitted on the training part of the split only.
        transformer = get_column_transformer()
        transformer.fit(X_train, y_train)
        X_train = transformer.transform(X_train).astype({"primary_use": "category"})
        X_valid = transformer.transform(X_valid).astype({"primary_use": "category"})

        train_ds = lgbm.Dataset(data=X_train, label=y_train)
        valid_ds = lgbm.Dataset(data=X_valid, label=y_valid)

        # Train Lightgbm
        eval_results = {}
        model = lgbm.train(
            TRAIN_PARAMETERS,
            train_set=train_ds,
            num_boost_round=N_ITERATIONS,
            valid_sets=[train_ds, valid_ds],
            valid_names=["train", "valid"],
            callbacks=[
                lgbm.log_evaluation(period=10),
                lgbm.record_evaluation(eval_results),
            ],
        )
        cv_results.append(eval_results)

        print("=================================")

In [None]:
# Plot the per-split training and validation loss curves recorded during
# cross validation (left panel: train, right panel: validation).
if not SUBMISSION_RUN:
    fig, ax = plt.subplots(1, 2, figsize=(10, 3.5), sharex=True, sharey=True)
    train_ax, valid_ax = ax

    for split_idx, split_scores in enumerate(cv_results):
        split_label = f"split {split_idx}"
        train_ax.plot(split_scores["train"]["rmse"], label=split_label)
        valid_ax.plot(split_scores["valid"]["rmse"], label=split_label)

    # Titles, labels and legends are set once, after all curves are drawn.
    for panel, panel_title in ((train_ax, "Train"), (valid_ax, "Validation")):
        panel.set(title=panel_title, xlabel="Boosting iteration")
        if cv_results:
            panel.legend()
    # The y axis is shared; the metric is RMSE on the log1p-transformed target.
    train_ax.set_ylabel("RMSE (log1p target)")

    fig.tight_layout()
    fig.savefig("loss_curves.png", dpi=300)

## Submission

In [None]:
def inverse_target_transform(y: np.ndarray) -> np.ndarray:
    """Apply ``expm1`` element-wise and floor the result at zero.

    ``expm1`` is the inverse of ``log1p``; values that come out negative are
    clipped to ``0.0``. No upper bound is applied.
    """
    readings = np.expm1(y)
    return np.clip(readings, a_min=0.0, a_max=None)


def kwh_to_kbtu(df: pd.DataFrame) -> pd.DataFrame:
    """Multiply ``meter_reading`` by 3.4118 for rows of building 0, meter 0.

    Only rows where both ``building_id == 0`` and ``meter_id == 0`` are
    rescaled; every other row is left unchanged. ``df`` is modified in place
    and returned.
    """
    # NOTE(review): the mask selects building_id == 0 only; confirm it matches
    # the unit conversion applied when the training data was prepared.
    kwh_to_kbtu_factor = 3.4118
    needs_conversion = (df["building_id"] == 0) & (df["meter_id"] == 0)
    df.loc[needs_conversion, "meter_reading"] *= kwh_to_kbtu_factor
    return df

In [None]:
if SUBMISSION_RUN:

    # Make predictions in batches so the full test file is never held in
    # memory at once.
    parquet_file = "/kaggle/input/ashrae-iii/test_df.parquet"
    test_pq_file = pq.ParquetFile(parquet_file)
    batch_size = 100000

    prediction_dfs = []
    for batch in tqdm(test_pq_file.iter_batches(batch_size)):
        test_data = batch.to_pandas()
        test_data = drop_cols(test_data)
        test_data = cast_dtypes(test_data, verbose=False)

        # Predict with the encoder and model fitted in the training cell.
        X_test = transformer.transform(test_data[FEATURES])
        X_test = X_test.astype({"primary_use": "category"})
        y_hat = inverse_target_transform(model.predict(X_test))
        # pd.concat(axis=1) aligns on the index, so the predictions must carry
        # the batch's own index; a fresh 0..n-1 index would mis-pair rows (or
        # introduce NaNs) whenever the batch index is anything else.
        y_hat = pd.DataFrame(y_hat, columns=["meter_reading"], index=test_data.index)

        # Only the id columns needed downstream are carried along.
        prediction_df = pd.concat(
            [test_data[["row_id", "building_id", "meter_id"]], y_hat], axis=1
        )
        prediction_df = kwh_to_kbtu(prediction_df)
        prediction_dfs.append(prediction_df[["row_id", "meter_reading"]])

    submission_df = pd.concat(prediction_dfs, axis=0)
    submission_df = submission_df.sort_values("row_id").reset_index(drop=True)
    submission_df.to_csv("submission.csv", index=False)