In [None]:
import gc

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

In [None]:
# Run-mode switch for the notebook:
#   True  -> fit a single model on the full training set (the one used to predict the test set).
#   False -> run the time-based cross validation and plot the loss curves instead.
SUBMISSION_RUN = True

In [None]:
# Constants
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Timestamp bounds of the train and test periods.
MIN_TRAIN_TIMESTAMP, MAX_TRAIN_TIMESTAMP = (
    pd.Timestamp("2016-01-01 00:00:00"),
    pd.Timestamp("2016-12-31 23:00:00"),
)
MIN_TEST_TIMESTAMP, MAX_TEST_TIMESTAMP = (
    pd.Timestamp("2017-01-01 00:00:00"),
    pd.Timestamp("2018-12-31 23:00:00"),
)

DATA_RESOLUTION = "1h"

# (start, end) pairs of the validation windows; `train_valid_split` treats
# `start` as inclusive and `end` as exclusive.
VALIDATION_PERIODS = [
    (pd.Timestamp(period_start), pd.Timestamp(period_end))
    for period_start, period_end in (
        ("2016-10-01 00:00:00", "2016-11-01 00:00:00"),
        ("2016-11-01 00:00:00", "2016-12-01 00:00:00"),
        ("2016-12-01 00:00:00", "2017-01-01 00:00:00"),
    )
]

## Data loading

In [None]:
# Columns converted to pandas' "category" dtype by `cast_dtypes`.
CATEGORY_COLS = ["building_id", "meter_id", "site_id", "primary_use", "primary_use_id"]
# Columns downcast to np.uint8 by `cast_dtypes` when their values fit.
UINT8_COLS = ["hour", "day_of_week", "month"]


def cast_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the known columns of ``df`` to their intended dtypes.

    * ``timestamp`` is parsed with ``pd.to_datetime``.
    * Every column in ``CATEGORY_COLS`` becomes ``category``.
    * Every column in ``UINT8_COLS`` becomes ``np.uint8``, but only when all of
      its values are present and lie within the uint8 range.

    Columns that are absent, or that cannot be downcast safely, are reported
    with a printed message and left untouched.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, to allow ``df = cast_dtypes(df)``.
    """
    # Timestamps
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    else:
        print("Col 'timestamp' missing from df. Skipping ...")

    # Categories
    for col in CATEGORY_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue
        df[col] = df[col].astype("category")

    # UINT8
    uint8_max = np.iinfo(np.uint8).max
    for col in UINT8_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue

        values = df[col]
        if values.max() > uint8_max:
            print(f"Col max for '{col}' exceeds np.uint8 max. Skipping ...")
            continue
        # Checking only the maximum is not enough: a negative integer would be
        # wrapped to a large positive value by the cast, and a missing value
        # makes the integer cast fail outright.
        if values.isna().any() or values.min() < 0:
            print(f"Col '{col}' has missing or negative values. Skipping ...")
            continue
        df[col] = values.astype(np.uint8)

    return df

In [None]:
# train_pq_file = pq.ParquetFile("/kaggle/input/ashrae-iii/train_df.parquet")

# train_dataset = pd.DataFrame()
# for batch in tqdm(train_pq_file.iter_batches(batch_size=131_072)):
#     train_batch_df = batch.to_pandas()
#     train_batch_df = cast_dtypes(train_batch_df)
#     train_dataset = pd.concat([train_dataset, train_batch_df], ignore_index=True)
    
#     del train_batch_df
#     _ = gc.collect()

# train_dataset = cast_dtypes(train_dataset)

In [None]:
# Read the whole training parquet in a single call, then normalise the column dtypes.
train_dataset = pd.read_parquet("/kaggle/input/ashrae-iii/train_df.parquet").pipe(cast_dtypes)

## Cross Validation

In [None]:
# Columns selected from the train / test frames and passed to the model.
# Entries that are commented out are not selected; they are presumably kept in
# the list as candidate features that can be re-enabled.
FEATURES = [
    "building_id",
    "meter_id",
    # "site_id",
    # "square_feet",
    # "floor_count",
    # "air_temperature",
    # "cloud_coverage",
    # "dew_temperature",
    # "precip_depth_1_hr",
    # "sea_level_pressure",
    # "wind_direction_sin",
    # "wind_direction_cos",
    # "wind_speed",
    # "air_temperature_lag_1",
    # "air_temperature_lag_2",
    # "dew_temperature_lag_1",
    # "dew_temperature_lag_2",
    # "sea_level_pressure_lag_1",
    # "sea_level_pressure_lag_2",
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
    "is_weekend",
    # "building_age_years",
    # "building_area_square_feet",
    # "primary_use_id",
    # "relative_humidity",
    # "cold_chill",
    # "apparent_temperature",
    # "heat_index",
]

# Target column the model is trained on.
# NOTE(review): presumably log1p(meter_reading), since predictions are mapped
# back with np.expm1 before submission -- confirm against the preprocessing.
LABEL = "log_meter_reading"

In [None]:
# LightGBM parameters passed to every `lgbm.train` call in this notebook.
TRAIN_PARAMETERS = dict(
    objective="mean_squared_error",
    learning_rate=0.01,
    seed=1,
    max_bin=255,
    num_leaves=(1 << 6) - 1,  # 63 leaves
)

In [None]:
def train_valid_split(train_df: pd.DataFrame, validation_periods=None):
    """Yield one ``(train, validation)`` pair of frames per validation period.

    For each ``(val_start, val_end)`` period the validation frame holds the
    rows with ``val_start <= timestamp < val_end`` and the training frame holds
    every row with ``timestamp < val_start``, so no training row is later than
    the start of its validation window.

    Args:
        train_df: Frame with a ``timestamp`` column comparable to the period
            bounds.
        validation_periods: Iterable of ``(val_start, val_end)`` pairs. When
            omitted, the module-level ``VALIDATION_PERIODS`` is used.

    Yields:
        Tuples ``(train_rows, validation_rows)`` selected from ``train_df``
        with ``.loc`` and boolean masks.
    """
    if validation_periods is None:
        validation_periods = VALIDATION_PERIODS

    timestamps = train_df["timestamp"]
    for val_start, val_end in validation_periods:
        train_mask = timestamps < val_start
        valid_mask = (timestamps >= val_start) & (timestamps < val_end)
        yield train_df.loc[train_mask], train_df.loc[valid_mask]



def train_predict_score(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
):
    """Train a LightGBM model on one split and score it on the held-out part.

    The model is trained for 1000 boosting rounds with ``TRAIN_PARAMETERS``.
    Both the training data and the held-out data are evaluated during
    training, and the per-iteration metrics are recorded.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features, used both as the evaluation set during
            training and for the final prediction.
        y_test: Held-out labels.

    Returns:
        A dict with the keys ``"mse"`` and ``"rmse"`` (computed on the held-out
        predictions) and ``"eval_results"`` (the dict filled by
        ``lgbm.record_evaluation``, keyed by the names ``"train"`` and
        ``"valid"``).
    """
    train_ds = lgbm.Dataset(data=X_train, label=y_train)
    # Built from this function's own arguments so that the result never
    # depends on variables that happen to exist in the notebook's namespace.
    valid_ds = lgbm.Dataset(data=X_test, label=y_test)

    # Train and log evals.
    eval_results = {}
    model = lgbm.train(
        TRAIN_PARAMETERS,
        num_boost_round=1000,
        train_set=train_ds,
        valid_sets=[train_ds, valid_ds],
        valid_names=["train", "valid"],
        callbacks=[
            # lgbm.early_stopping(stopping_rounds=5),
            lgbm.log_evaluation(period=10),
            lgbm.record_evaluation(eval_results),
        ],
    )

    # Predict and score; the MSE is computed once and reused for the RMSE.
    y_hat = model.predict(X_test)
    mse = mean_squared_error(np.asarray(y_test), np.asarray(y_hat))
    metrics = {
        "mse": mse,
        "rmse": np.sqrt(mse),
        "eval_results": eval_results,
    }

    return metrics

In [None]:
if SUBMISSION_RUN:
    # Submission mode: fit a single model on every training row. The training
    # set doubles as the only evaluation set, so the log shows training loss.
    X_train = train_dataset[FEATURES]
    y_train = train_dataset[LABEL]
    train_ds = lgbm.Dataset(data=X_train, label=y_train)
    model = lgbm.train(
        TRAIN_PARAMETERS,
        train_set=train_ds,
        num_boost_round=1000,
        valid_sets=[train_ds],
        valid_names=["train"],
        callbacks=[lgbm.log_evaluation(period=10)],
    )

else:
    # Validation mode: train and score one model per time-based split and
    # collect the metrics of every split in `cv_results`.
    cv_results = []
    for i, (train_df, valid_df) in enumerate(train_valid_split(train_dataset)):
        print(f"Running cross validation on split {i}")
        X_train = train_df[FEATURES]
        y_train = train_df[LABEL]
        X_valid = valid_df[FEATURES]
        y_valid = valid_df[LABEL]
        scores = train_predict_score(X_train, y_train, X_valid, y_valid)
        cv_results.append(scores)
        print("====================================")

In [None]:
# Plot the per-iteration loss of every CV split: training loss on the left
# panel, validation loss on the right one.
if not SUBMISSION_RUN:
    fig, ax = plt.subplots(1, 2, figsize=(10, 3.5), sharex=True, sharey=True)
    train_ax, valid_ax = ax

    for split_idx, split_scores in enumerate(cv_results):
        loss_by_iteration = split_scores["eval_results"]
        split_label = f"split {split_idx}"

        train_ax.plot(loss_by_iteration["train"]["l2"], label=split_label)
        valid_ax.plot(loss_by_iteration["valid"]["l2"], label=split_label)

    # Titles and axis labels make the two panels distinguishable on their own.
    # The y axis is shared, so it is labelled on the left panel only.
    train_ax.set(title="Train", xlabel="Boosting iteration", ylabel="l2")
    valid_ax.set(title="Validation", xlabel="Boosting iteration")

    # Build each legend once, after all lines exist; skip it when nothing was
    # plotted so matplotlib does not warn about an empty legend.
    if cv_results:
        train_ax.legend()
        valid_ax.legend()
        

## Submission

In [None]:
def kwh_to_kbtu(df: pd.DataFrame) -> pd.DataFrame:
    """Multiply ``meter_reading`` by 3.4118 for the rows of building 0, meter 0.

    Only rows where ``building_id == 0`` and ``meter_id == 0`` are rescaled;
    every other row is left as is. The frame is modified in place and the same
    object is returned.

    NOTE(review): the mask selects a single building. If the unit correction
    is meant for a whole site, it should probably test ``site_id`` instead --
    confirm against the preprocessing that produced the input data.
    """
    kwh_to_kbtu_factor = 3.4118
    is_building_zero = df["building_id"] == 0
    is_meter_zero = df["meter_id"] == 0
    df.loc[is_building_zero & is_meter_zero, "meter_reading"] *= kwh_to_kbtu_factor
    return df

In [None]:
# Build the submission file. This only runs in submission mode because `model`
# is fitted on the full training set only when SUBMISSION_RUN is True.
if SUBMISSION_RUN:
    # Load data
    test_data = pd.read_parquet("/kaggle/input/ashrae-iii/test_df.parquet")
    test_data = cast_dtypes(test_data)

    # Predict. The model is trained on LABEL, so expm1 maps the predictions
    # back (presumably undoing a log1p applied during preprocessing).
    X_test = test_data[FEATURES]
    y_hat = np.expm1(model.predict(X_test))
    # Reuse test_data's index: the column-wise concat below aligns on index, so
    # a fresh 0..n-1 index would mispair rows whenever test_data's index is
    # not the default one.
    y_hat = pd.DataFrame(y_hat, columns=["meter_reading"], index=test_data.index)

    # Submit
    submission_df = pd.concat([test_data, y_hat], axis=1)
    submission_df = kwh_to_kbtu(submission_df)
    submission_df = submission_df[["row_id", "meter_reading"]].sort_values("row_id")
    submission_df.to_csv("submission.csv", index=False)