In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
import matplotlib.pyplot as plt

In [None]:
# Constants

# Directory holding the competition input files.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# First and last timestamps of the train and test ranges.
MIN_TRAIN_TIMESTAMP, MAX_TRAIN_TIMESTAMP = (
    pd.Timestamp("2016-01-01 00:00:00"),
    pd.Timestamp("2016-12-31 23:00:00"),
)
MIN_TEST_TIMESTAMP, MAX_TEST_TIMESTAMP = (
    pd.Timestamp("2017-01-01 00:00:00"),
    pd.Timestamp("2018-12-31 23:00:00"),
)

# Sampling step of the time series, as a pandas frequency string.
DATA_RESOLUTION = "1h"

# (start, end) timestamp pairs of the validation windows: the last three
# calendar months of 2016, one window per month.
VALIDATION_PERIODS = [
    (pd.Timestamp(period_start), pd.Timestamp(period_end))
    for period_start, period_end in (
        ("2016-10-01 00:00:00", "2016-11-01 00:00:00"),
        ("2016-11-01 00:00:00", "2016-12-01 00:00:00"),
        ("2016-12-01 00:00:00", "2017-01-01 00:00:00"),
    )
]

In [None]:
# Columns converted to the pandas "category" dtype by cast_dtypes.
CATEGORY_COLS = ["building_id", "meter_id", "site_id", "primary_use", "primary_use_id"]
# Columns downcast to np.uint8 by cast_dtypes when their values fit.
UINT8_COLS = ["hour", "day_of_week", "month"]


def cast_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Cast well-known columns of ``df`` to compact dtypes, in place.

    * ``timestamp`` is parsed with ``pd.to_datetime``.
    * every column listed in ``CATEGORY_COLS`` becomes ``category``.
    * every column listed in ``UINT8_COLS`` becomes ``np.uint8``, provided it
      has no missing values and all values fit in the uint8 range.

    Columns that are absent, or that cannot be cast safely, are left untouched
    and a message is printed instead of raising.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, to allow ``df = cast_dtypes(df)``.
    """
    # Timestamps
    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    else:
        print("Col 'timestamp' missing from df. Skipping ...")

    # Categories
    for col in CATEGORY_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue
        df[col] = df[col].astype("category")

    # UINT8
    uint8_limits = np.iinfo(np.uint8)
    for col in UINT8_COLS:
        if col not in df.columns:
            print(f"Col '{col}' missing from df. Skipping ...")
            continue

        values = df[col]
        # NaN cannot be represented in an integer dtype; astype would raise.
        if values.isna().any():
            print(f"Col '{col}' contains missing values. Skipping ...")
            continue
        if values.max() > uint8_limits.max:
            print(f"Col max for '{col}' exceeds np.uint8 max. Skipping ...")
            continue
        # Negative values would silently wrap around (e.g. -1 -> 255).
        if values.min() < uint8_limits.min:
            print(f"Col min for '{col}' is below np.uint8 min. Skipping ...")
            continue
        df[col] = values.astype(np.uint8)

    return df

In [None]:
def train_valid_split(train_df: pd.DataFrame, validation_periods=None):
    """Yield one (train, validation) pair of frames per validation window.

    For each ``(val_start, val_end)`` window the validation frame holds the
    rows with ``val_start <= timestamp < val_end`` and the training frame holds
    every row strictly before ``val_start``, so the training part grows from
    one fold to the next and never overlaps its validation window.

    Args:
        train_df: Frame with a ``timestamp`` column comparable to the window
            bounds.
        validation_periods: Iterable of ``(start, end)`` pairs. Defaults to the
            module-level ``VALIDATION_PERIODS``.

    Yields:
        Tuples ``(train_part, valid_part)`` selected from ``train_df`` with
        boolean masks via ``.loc``.
    """
    if validation_periods is None:
        validation_periods = VALIDATION_PERIODS

    timestamps = train_df["timestamp"]
    for val_start, val_end in validation_periods:
        train_mask = timestamps < val_start
        valid_mask = (timestamps >= val_start) & (timestamps < val_end)
        yield train_df.loc[train_mask], train_df.loc[valid_mask]

In [None]:
# Model inputs, grouped by theme. Uncomment an entry to feed it to the model.
_IDENTIFIER_FEATURES = [
    "building_id",
    "meter_id",
    # "site_id",
    # "primary_use_id",
]

_BUILDING_FEATURES = [
    # "square_feet",
    # "floor_count",
    # "building_age_years",
    # "building_area_square_feet",
]

_WEATHER_FEATURES = [
    # "air_temperature",
    # "cloud_coverage",
    # "dew_temperature",
    # "precip_depth_1_hr",
    # "sea_level_pressure",
    # "wind_direction_sin",
    # "wind_direction_cos",
    # "wind_speed",
    # "air_temperature_lag_1",
    # "air_temperature_lag_2",
    # "dew_temperature_lag_1",
    # "dew_temperature_lag_2",
    # "sea_level_pressure_lag_1",
    # "sea_level_pressure_lag_2",
    # "relative_humidity",
    # "cold_chill",
    # "apparent_temperature",
    # "heat_index",
]

_CALENDAR_FEATURES = [
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
    "is_weekend",
]

# Column list actually passed to the model.
FEATURES = [
    *_IDENTIFIER_FEATURES,
    *_BUILDING_FEATURES,
    *_WEATHER_FEATURES,
    *_CALENDAR_FEATURES,
]

# Name of the target column.
LABEL = "log_meter_reading"

## Cross-Validation

In [None]:
# Read the training frame and normalise its column dtypes in one pass.
train_data = pd.read_parquet("/kaggle/input/ashrae-iii/train_df.parquet").pipe(cast_dtypes)

In [None]:
# Only the first cross-validation fold is used below. `next` states that
# directly and fails right here (StopIteration) if the splitter yields no
# fold, instead of leaving train_df / valid_df undefined for later cells.
i, (train_df, valid_df) = next(enumerate(train_valid_split(train_data)))

In [None]:
# Feature matrix and target for the training part of the fold ...
X_train = train_df[FEATURES]
y_train = train_df[LABEL]

# ... and for its validation part.
X_valid = valid_df[FEATURES]
y_valid = valid_df[LABEL]

In [None]:
# Wrap the fold's features and labels in LightGBM datasets.
train_dataset = lgbm.Dataset(data=X_train, label=y_train)
# A validation Dataset must reference the training Dataset so that it is
# binned with the training set's bin boundaries, even if it gets constructed
# before being handed to lgbm.train.
valid_dataset = lgbm.Dataset(data=X_valid, label=y_valid, reference=train_dataset)

In [None]:
# LightGBM training parameters, passed to lgbm.train.
TRAIN_PARAMETERS = dict(
    objective="mean_squared_error",  # squared-error regression
    learning_rate=0.01,
    seed=1,  # fixed seed for repeatable runs
    max_bin=255,
)

In [None]:
# Filled in place by the record_evaluation callback: one entry per dataset in
# valid_sets, holding the metric history.
eval_results = {}

training_callbacks = [
    # lgbm.early_stopping(stopping_rounds=5),
    lgbm.log_evaluation(period=10),  # print metrics every 10 rounds
    lgbm.record_evaluation(eval_results),
]

# The training set is also passed as an evaluation set so that its metric
# history is recorded alongside the validation one.
model = lgbm.train(
    params=TRAIN_PARAMETERS,
    train_set=train_dataset,
    num_boost_round=1000,
    valid_sets=[train_dataset, valid_dataset],
    callbacks=training_callbacks,
)

In [None]:
# Learning curves: recorded metric per boosting round for both evaluation sets.
fig, ax = plt.subplots()
metric = "l2"
ax.plot(eval_results["training"][metric], label="train")
ax.plot(eval_results["valid_1"][metric], label="valid")
# Label the figure so it can be read on its own.
ax.set_xlabel("boosting round")
ax.set_ylabel(metric)
ax.set_title("Training vs. validation loss")
ax.legend();  # trailing ';' suppresses the Legend repr in the cell output

## Submission

In [None]:
def kwh_to_kbtu(df: pd.DataFrame) -> pd.DataFrame:
    """Scale selected ``meter_reading`` values by 3.4118 (kWh -> kBTU), in place.

    Only rows with ``building_id == 0`` and ``meter_id == 0`` are rescaled;
    all other rows are left as they are.

    NOTE(review): the mask matches the single building with id 0. Confirm
    whether the conversion is meant for that one building or for a whole site.

    Args:
        df: Frame with ``building_id``, ``meter_id`` and ``meter_reading``
            columns. It is modified in place.

    Returns:
        The same ``df`` object.
    """
    is_target_building = df["building_id"] == 0
    is_target_meter = df["meter_id"] == 0
    selected = is_target_building & is_target_meter

    converted = df.loc[selected, "meter_reading"] * 3.4118
    df.loc[selected, "meter_reading"] = converted
    return df

In [None]:
# Read the test frame and normalise its column dtypes in one pass.
test_data = pd.read_parquet("/kaggle/input/ashrae-iii/test_df.parquet").pipe(cast_dtypes)

In [None]:
# Keep the test rows' identifiers as a one-column DataFrame (double brackets
# select a frame rather than a Series).
submission_rows_ids = test_data[["row_id"]]

In [None]:
# Predict
# Score the test set in batches and collect one frame per batch in
# `y_hat_dfs`, which the submission cell concatenates. No error metric is
# computed here: `y_test` is not defined anywhere in the visible cells.
PREDICT_BATCH_SIZE = 1_000_000

y_hat_dfs = []
for batch_start in range(0, len(test_data), PREDICT_BATCH_SIZE):
    test_batch = test_data.iloc[batch_start:batch_start + PREDICT_BATCH_SIZE]
    X_test = test_batch[FEATURES]

    # The model was fit on LABEL, so its output is on the label's (log) scale.
    y_hat = model.predict(X_test)
    y_hat = np.clip(y_hat, a_min=0, a_max=np.inf)

    # NOTE(review): assumes LABEL was built as log1p(meter_reading), hence
    # expm1 to get back to meter readings. Confirm against the
    # feature-engineering step that produced the parquet files.
    y_hat_dfs.append(
        test_batch[["row_id", "building_id", "meter_id"]].assign(
            meter_reading=np.expm1(y_hat)
        )
    )

In [None]:
# Stack the prediction frames, apply the unit conversion, keep the two
# submission columns and order the rows by row_id.
submission_df = (
    pd.concat(y_hat_dfs)
    .pipe(kwh_to_kbtu)[["row_id", "meter_reading"]]
    .sort_values("row_id")
)

# Write the file without the index column.
submission_df.to_csv("submission.csv", index=False)