# ASHRAE BASELINE

This notebook develops a simple Ridge regression model to predict building energy consumption. It uses minimal feature engineering and no hyperparameter tuning, and serves as a baseline predictor: any more complex approach should outperform the simple model developed here.

In [None]:
from datetime import datetime, date

import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Constants
# Directory the train/test CSV files are read from.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Presumably the first/last timestamp and the sampling step of the training
# data, going by the names.
# NOTE(review): none of these three are referenced in the visible cells —
# confirm whether they are still needed.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")
TRAIN_DATA_RESOLUTION = "1h"

## Load train data

In [None]:
# Read the training readings. `header=0` together with `names=` discards the
# header row of the file and uses the column names given here instead.
# NOTE(review): this assumes the file has exactly these four columns in this
# order — confirm against the data.
train_df = pd.read_csv(
    f"{INPUT_DATA_PATH}/train.csv",
    header=0,
    names=["building_id", "meter_id", "timestamp", "meter_reading"],
    parse_dates=["timestamp"],
    dtype={"meter_reading": float}
)

## Feature engineering

In [None]:
def kbtu_to_kwh(df: pd.DataFrame) -> pd.DataFrame:
    """Scale the meter-0 readings of building 0 by 0.2931.

    Going by the function name, the factor converts kBTU to kWh. Only rows
    with ``building_id == 0`` and ``meter_id == 0`` are touched.

    Args:
        df: Frame with ``building_id``, ``meter_id`` and ``meter_reading``
            columns. It is modified in place.

    Returns:
        The same frame object that was passed in.
    """
    is_building_zero = df["building_id"] == 0
    is_meter_zero = df["meter_id"] == 0
    rows = is_building_zero & is_meter_zero

    scaled = df.loc[rows, "meter_reading"] * 0.2931
    df.loc[rows, "meter_reading"] = scaled
    return df


def cyclic_features(df: pd.DataFrame, feature: str, period: int) -> pd.DataFrame:
    """Add sine and cosine encodings of a periodic column.

    The value of ``feature`` is mapped to an angle ``2 * pi * value / period``
    and the sine and cosine of that angle are stored in the new columns
    ``<feature>_sin`` and ``<feature>_cos``.

    Args:
        df: Frame containing the column ``feature``. It is modified in place.
        feature: Name of the column to encode.
        period: Length of one full cycle, in the units of ``feature``.

    Returns:
        The same frame object that was passed in.
    """
    angle = 2 * np.pi * df[feature] / period
    df[f"{feature}_sin"] = np.sin(angle)
    df[f"{feature}_cos"] = np.cos(angle)
    return df


def temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive calendar features from the ``timestamp`` column.

    For hour (period 24), day of week (period 7) and month (period 12) the
    raw integer column is added, followed by its sine/cosine encoding from
    ``cyclic_features``. A float ``is_weekend`` flag (1.0 when the weekday
    number is 5 or 6) is added last.

    Args:
        df: Frame with a datetime ``timestamp`` column. It is modified in
            place.

    Returns:
        The frame with the additional feature columns.
    """
    stamps = df["timestamp"].dt

    # (column name, raw values, cycle length) — processed in this order so
    # the resulting column order stays hour, day_of_week, month.
    calendar_parts = (
        ("hour", stamps.hour, 24),
        ("day_of_week", stamps.weekday, 7),
        ("month", stamps.month, 12),
    )
    for name, values, period in calendar_parts:
        df[name] = values
        df = cyclic_features(df, name, period)

    df["is_weekend"] = (stamps.weekday >= 5).astype(float)

    return df


def target_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``log_reading`` column holding ``log1p`` of ``meter_reading``.

    Args:
        df: Frame with a ``meter_reading`` column. It is modified in place.

    Returns:
        The same frame object that was passed in.
    """
    readings = np.asarray(df["meter_reading"])
    df["log_reading"] = np.log1p(readings)
    return df

In [None]:
# Apply the unit rescaling first, then add the calendar features.
train_df = train_df.pipe(kbtu_to_kwh).pipe(temporal_features)

# Preview the engineered training frame.
train_df.head()

## Baseline

In [None]:
# Target column. The raw reading is used here; the log1p/expm1 target
# transform is applied inside the regressor built by get_target_regressor().
LABEL = "meter_reading"
# Model inputs. "meter_id" is one-hot encoded by get_column_transformer();
# the remaining columns are passed through unchanged. All of the calendar
# columns are produced by temporal_features().
FEATURES = [
    "meter_id",
    "hour_sin",
    "hour_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "month_sin",
    "month_cos",
    "is_weekend"
]

In [None]:
# Construct pipeline

def get_column_transformer():
    """Build the feature transformer used in front of the regressor.

    ``meter_id`` is one-hot encoded; every other input column is passed
    through unchanged.

    The encoder is created with ``handle_unknown="ignore"``: a separate
    pipeline is fitted per building, so a meter type that a building never
    showed during training would otherwise make ``predict`` raise and abort
    the whole submission loop. With "ignore" such a row is encoded as all
    zeros and the prediction falls back on the intercept and time features.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    meter_encoder = OneHotEncoder(handle_unknown="ignore")
    return ColumnTransformer(
        [("oh_encoder", meter_encoder, ["meter_id"])],
        remainder="passthrough",
        verbose_feature_names_out=False,
    )


def get_target_regressor():
    """Build a default Ridge regressor that is fitted on a log1p target.

    The wrapper applies ``np.log1p`` to the target before fitting and
    ``np.expm1`` to the predictions, so callers pass and receive values on
    the original scale.

    Returns:
        An unfitted ``TransformedTargetRegressor``.
    """
    return TransformedTargetRegressor(
        regressor=Ridge(),
        func=np.log1p,
        inverse_func=np.expm1,
    )


def make_pipeline():
    """Assemble the full model: column transformer followed by regressor.

    Returns:
        An unfitted ``Pipeline`` with the steps "transformer" and
        "regressor".
    """
    steps = [
        ("transformer", get_column_transformer()),
        ("regressor", get_target_regressor()),
    ]
    return Pipeline(steps=steps)

In [None]:
# Train one pipeline per building.
#
# The frame is split with a single groupby pass instead of building a
# full-length boolean mask for every building, which scanned the whole
# training set once per building. sort=False keeps the buildings in order of
# first appearance, i.e. the same order as building_ids.

building_ids = list(train_df["building_id"].unique())
models_by_building = {}
train_groups = train_df.groupby("building_id", sort=False)
for building_id, building_df in tqdm(train_groups, total=len(building_ids)):
    X, y = building_df[FEATURES], building_df[LABEL]

    model = make_pipeline()
    model = model.fit(X, y)

    models_by_building[building_id] = model

## Submission

In [None]:
def kwh_to_kbtu(df: pd.DataFrame) -> pd.DataFrame:
    """Scale the meter-0 readings of building 0 by 3.4118.

    Going by the function name, the factor converts kWh to kBTU. Only rows
    with ``building_id == 0`` and ``meter_id == 0`` are touched.

    Args:
        df: Frame with ``building_id``, ``meter_id`` and ``meter_reading``
            columns. It is modified in place.

    Returns:
        The same frame object that was passed in.
    """
    is_building_zero = df["building_id"] == 0
    is_meter_zero = df["meter_id"] == 0
    rows = is_building_zero & is_meter_zero

    scaled = df.loc[rows, "meter_reading"] * 3.4118
    df.loc[rows, "meter_reading"] = scaled
    return df


In [None]:
# Read the rows to predict. `header=0` together with `names=` discards the
# header row of the file and uses the column names given here instead.
# No dtype override is passed: the "meter_reading" entry that used to be here
# named a column that is not among the four columns read from this file.
test_df = pd.read_csv(
    f"{INPUT_DATA_PATH}/test.csv",
    header=0,
    names=["row_id", "building_id", "meter_id", "timestamp"],
    parse_dates=["timestamp"],
)

In [None]:
# Add the same calendar features that the models were trained on.
test_df = test_df.pipe(temporal_features)

# Preview the engineered test frame.
test_df.head()

In [None]:
# Predict building by building with the model fitted for that building.
#
# The frame is split with a single groupby pass instead of building a
# full-length boolean mask for every building. sort=False keeps the buildings
# in order of first appearance, i.e. the same order as test_building_ids.
test_building_ids = list(test_df["building_id"].unique())

y_hat_dfs = []
test_groups = test_df.groupby("building_id", sort=False)
for test_building_id, building_df in tqdm(test_groups, total=len(test_building_ids)):
    # Raises KeyError for a building that had no training rows.
    model = models_by_building[test_building_id]

    y_hat = model.predict(building_df[FEATURES])
    # Floor the predictions at zero.
    y_hat = np.clip(y_hat, a_min=0, a_max=None)

    # Keep only the identifying columns and attach the prediction; assign()
    # returns a new frame, so test_df itself is left untouched.
    y_hat_df = building_df[["row_id", "building_id", "meter_id", "timestamp"]].assign(
        meter_reading=y_hat
    )
    y_hat_dfs.append(y_hat_df)
    

In [None]:
# Stack the per-building predictions and undo the unit rescaling that was
# applied to the training readings of building 0 / meter 0.
submission_df = kwh_to_kbtu(pd.concat(y_hat_dfs))

# Keep only the two submission columns, ordered by row_id, and write the file.
submission_columns = submission_df[["row_id", "meter_reading"]]
submission_df = submission_columns.sort_values("row_id")
submission_df.to_csv("submission.csv", index=False)