In [None]:
from datetime import datetime, date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

In [None]:
# Constants
# Directory holding the competition CSV files (a Kaggle input mount, judging by
# the path); every `read_csv` call in this notebook is rooted here.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Inclusive bounds and step of the training time grid. They are passed to
# `pd.date_range` when the weather data is reindexed per site.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")
TRAIN_DATA_RESOLUTION = "1h"

## Train data

### Load raw data

In [None]:
# Meter readings
# `header=0` makes pandas discard the header row of the file and `names`
# supplies the column names that the rest of the notebook relies on.
readings_df = pd.read_csv(
    f"{INPUT_DATA_PATH}/train.csv",
    header=0,
    names=["building_id", "meter_id", "timestamp", "meter_reading"],
)
# Parse timestamps once here so later merges compare datetime values.
readings_df["timestamp"] = pd.to_datetime(readings_df["timestamp"])

# Buildings
# Read with the file's own column names; later cells use `building_id`,
# `site_id`, `year_built`, `square_feet` and `floor_count` from this frame.
buildings_df = pd.read_csv(f"{INPUT_DATA_PATH}/building_metadata.csv")

# Weather
# Later cells expect a `site_id` column, a `timestamp` column and the columns
# listed in WEATHER_FEATURE_COLUMNS.
weather_df = pd.read_csv(f"{INPUT_DATA_PATH}/weather_train.csv")
weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])

### Feature engineering

#### Weather data

In [None]:
# Raw weather measurements that get gap-filled by `fill_missing_weather_data`.
WEATHER_FEATURE_COLUMNS = [
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
]

In [None]:
def reindex_weather_data(
    weather_df: pd.DataFrame,
    start: "pd.Timestamp | None" = None,
    end: "pd.Timestamp | None" = None,
    freq: "str | None" = None,
) -> pd.DataFrame:
    """Give every site one row per timestamp of a regular time grid.

    Each site's rows are reindexed onto ``pd.date_range(start, end, freq)``
    (both ends inclusive). Timestamps a site has no measurement for become
    rows whose measurement columns are NaN; ``site_id`` is filled in for them.

    Parameters
    ----------
    weather_df:
        Frame with at least ``site_id`` and ``timestamp`` columns. It is not
        modified; the conversion of ``timestamp`` happens on a new frame.
    start, end, freq:
        Bounds and step of the grid. When left as ``None`` they fall back to
        ``MIN_TRAIN_TIMESTAMP``, ``MAX_TRAIN_TIMESTAMP`` and
        ``TRAIN_DATA_RESOLUTION`` respectively.

    Returns
    -------
    pd.DataFrame
        The per-site frames concatenated in group order with a fresh index.
    """
    # Resolve the notebook-level defaults at call time so the function can
    # also be pointed at another period (e.g. a test set) without editing it.
    start = MIN_TRAIN_TIMESTAMP if start is None else start
    end = MAX_TRAIN_TIMESTAMP if end is None else end
    freq = TRAIN_DATA_RESOLUTION if freq is None else freq

    # `assign` returns a new frame: the caller's "timestamp" column is left
    # exactly as it was passed in.
    weather_df = weather_df.assign(timestamp=pd.to_datetime(weather_df["timestamp"]))

    # The index is named "timestamp" so that `reset_index` below recreates the
    # column under the same name.
    timestamps = pd.date_range(start, end, freq=freq, inclusive="both", name="timestamp")

    site_dfs = []
    for site_id, site_df in weather_df.groupby("site_id", observed=True):
        site_df = site_df.set_index("timestamp").reindex(timestamps).reset_index()
        # Rows created by the reindex have no site yet.
        site_df["site_id"] = site_df["site_id"].fillna(value=site_id)
        site_dfs.append(site_df)

    return pd.concat(site_dfs, ignore_index=True)


def interpolate(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill short gaps in ``column`` by interpolation, site by site.

    For every site the column is linearly interpolated (at most 12
    consecutive values), then forward-filled and back-filled by at most 2
    values each. Filling is done per ``site_id`` so that the last values of
    one site never leak into the first rows of the next site in the
    concatenated frame. Frames without a ``site_id`` column are treated as a
    single series.

    The column is overwritten on ``weather_df`` itself, which is also returned.
    """

    def _fill_short_gaps(series: pd.Series) -> pd.Series:
        filled = series.interpolate("linear", limit=12)
        return filled.ffill(limit=2).bfill(limit=2)

    if "site_id" in weather_df.columns:
        # `transform` returns a result aligned with the original row index.
        weather_df[column] = (
            weather_df.groupby("site_id", sort=False)[column].transform(_fill_short_gaps)
        )
    else:
        weather_df[column] = _fill_short_gaps(weather_df[column])
    return weather_df


def _mean_weather_by_date_and_site(weather_df, column) -> pd.DataFrame:
    mean_values = (
        weather_df
        .groupby(["date", "site_id"])[[column]]
        .mean()
        .reset_index()
    )
    return mean_values


def _merge_onto_weather_df(weather_df, right, right_suffix) -> pd.DataFrame:
    weather_df = weather_df.merge(
        right=right,
        how="left",
        on=["date", "site_id"],
        suffixes=("", right_suffix)
    )
    return weather_df


def impute_with_same_day_mean(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill missing values of ``column`` with the same site's mean for that date.

    Expects ``date`` and ``site_id`` columns. Values that are still missing
    afterwards belong to (date, site) pairs without any measurement.
    Returns a new frame with the same columns as the input.
    """
    mean_column = f"{column}_mean"

    # Attach the per-day, per-site mean next to every row.
    daily_means = _mean_weather_by_date_and_site(weather_df, column)
    merged = _merge_onto_weather_df(weather_df, daily_means, "_mean")

    # Only the gaps are replaced; observed values are kept as they are.
    merged[column] = merged[column].fillna(merged[mean_column])
    return merged.drop(columns=[mean_column])


def ffill_mean_by_date(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill remaining gaps of ``column`` from neighbouring days' means.

    The per-(date, site) means are computed first. Within each site, days
    without a mean take the mean of the closest earlier day, or of the closest
    later day when there is no earlier one. The completed daily means are then
    used to fill whatever is still missing in ``column``.

    Expects ``date`` and ``site_id`` columns; returns a new frame with the
    same columns as the input.
    """
    mean_column = f"{column}_mean"
    daily_means = _mean_weather_by_date_and_site(weather_df, column)

    # Complete the daily means site by site, in chronological order.
    completed = []
    for _, per_site in daily_means.groupby("site_id"):
        ordered = per_site.sort_values("date")
        ordered[column] = ordered[column].ffill().bfill()
        completed.append(ordered)
    daily_means = pd.concat(completed, ignore_index=True)

    # Merge the completed means back and use them only where values are missing.
    merged = _merge_onto_weather_df(weather_df, daily_means, "_mean")
    merged[column] = merged[column].fillna(merged[mean_column])
    return merged.drop(columns=[mean_column])


def fill_missing_weather_data(weather_df: pd.DataFrame) -> pd.DataFrame:
    """Fill the gaps of every column in ``WEATHER_FEATURE_COLUMNS``.

    Each column goes through three increasingly coarse steps:
    ``interpolate`` (short gaps), ``impute_with_same_day_mean`` (same site,
    same day) and ``ffill_mean_by_date`` (same site, neighbouring days).

    The input frame is not modified. A temporary ``date`` column derived from
    ``timestamp`` is used for the daily grouping and removed from the result.
    """
    # `assign` builds a new frame, so neither the helper "date" column nor the
    # column writes done by `interpolate` end up on the caller's DataFrame.
    weather_df = weather_df.assign(date=weather_df["timestamp"].dt.date)

    for column in WEATHER_FEATURE_COLUMNS:
        weather_df = interpolate(weather_df, column)
        weather_df = impute_with_same_day_mean(weather_df, column)
        weather_df = ffill_mean_by_date(weather_df, column)

    return weather_df.drop(columns=["date"])

In [None]:
# Put every site on the full hourly grid first, then fill the gaps this creates
# together with the gaps already present in the raw data.
weather_df = weather_df.pipe(reindex_weather_data).pipe(fill_missing_weather_data)

## Weather feature engineering

In [None]:
def _add_per_site_feature(df: pd.DataFrame, feature: str, new_column: str, transform) -> pd.DataFrame:
    """Derive ``new_column`` from ``feature`` per site, in time order, and merge it back.

    ``transform`` receives one site's ``feature`` values sorted by
    ``timestamp`` (after exact duplicate rows were dropped) and must return
    one value per input value. The result is left-joined onto ``df`` on
    (site_id, timestamp); ``df`` itself is not modified.
    """
    # Drop an earlier copy of the output column. Without this, re-running the
    # cell would make the merge below produce "<name>_x" / "<name>_y" columns.
    df = df.drop(columns=[new_column], errors="ignore")

    site_dfs = []
    for _, site_df in df.groupby("site_id"):
        site_df = (
            site_df[["site_id", "timestamp", feature]]
            .sort_values("timestamp")
            .drop_duplicates(keep="first")
        )
        site_df = site_df.assign(**{new_column: transform(site_df[feature])})
        site_dfs.append(site_df.drop(columns=[feature]))

    if not site_dfs:
        # Empty input: there is nothing to compute, but keep the schema stable.
        return df.assign(**{new_column: np.nan})

    derived = pd.concat(site_dfs, ignore_index=True)
    return df.merge(right=derived, on=["site_id", "timestamp"], how="left")


def add_smoothed_weather_feature(
    df: pd.DataFrame,
    feature: str,
    window_length: int = 12,
    polyorder: int = 2,
) -> pd.DataFrame:
    """Add ``<feature>_smoothed``: a Savitzky-Golay filtered copy of ``feature`` per site.

    ``window_length`` and ``polyorder`` are passed straight to
    ``scipy.signal.savgol_filter``; the defaults are the values this notebook
    has always used. Returns a new frame; calling it again replaces the column.
    """

    def _smooth(values: pd.Series) -> np.ndarray:
        return savgol_filter(
            np.array(values),
            window_length=window_length,
            polyorder=polyorder,
        )

    return _add_per_site_feature(df, feature, f"{feature}_smoothed", _smooth)


def add_lagged_weather_feature(df: pd.DataFrame, feature: str, lag: int) -> pd.DataFrame:
    """Add ``<feature>_lag_<lag>``: the value of ``feature`` ``lag`` rows earlier at the same site.

    The first ``lag`` rows of every site have no predecessor and are NaN.
    Returns a new frame; calling it again replaces the column.
    """
    return _add_per_site_feature(
        df, feature, f"{feature}_lag_{lag}", lambda values: values.shift(lag)
    )


def add_rolling_mean_weather_feature(df: pd.DataFrame, feature: str, window: int) -> pd.DataFrame:
    """Add ``<feature>_rolling_<window>``: trailing mean of ``feature`` over ``window`` rows per site.

    Rows with fewer than ``window`` observations behind them are NaN.
    Returns a new frame; calling it again replaces the column.
    """
    return _add_per_site_feature(
        df, feature, f"{feature}_rolling_{window}", lambda values: values.rolling(window).mean()
    )

In [None]:
WEATHER_LAGS = [1, 2]
WEATHER_ROLLING_WINDOWS = [6, 12, 24]

# Every feature below gets the same derived columns, in this order:
# a smoothed copy, one lag column per WEATHER_LAGS entry and one rolling mean
# per WEATHER_ROLLING_WINDOWS entry.
for feature_name in ("air_temperature", "dew_temperature", "sea_level_pressure"):
    weather_df = add_smoothed_weather_feature(weather_df, feature_name)
    for lag in WEATHER_LAGS:
        weather_df = add_lagged_weather_feature(weather_df, feature_name, lag)
    for window in WEATHER_ROLLING_WINDOWS:
        weather_df = add_rolling_mean_weather_feature(weather_df, feature_name, window)

## Merge

In [None]:
def merge_dfs(readings_df: pd.DataFrame, buildings_df: pd.DataFrame, weather_df: pd.DataFrame) -> pd.DataFrame:
    """Join meter readings with building metadata and weather.

    Readings are left-joined to buildings on ``building_id`` and the result is
    left-joined to weather on (``site_id``, ``timestamp``), so every reading
    row is kept.

    Note: the key columns of all three input frames are cast in place
    (ids to ``int``, timestamps to datetime) before merging.
    """

    def _as_int(values: pd.Series) -> pd.Series:
        return values.astype(int)

    # Bring the join keys to a common dtype, one column at a time.
    key_casts = (
        (readings_df, "building_id", _as_int),
        (readings_df, "timestamp", pd.to_datetime),
        (buildings_df, "building_id", _as_int),
        (buildings_df, "site_id", _as_int),
        (weather_df, "site_id", _as_int),
        (weather_df, "timestamp", pd.to_datetime),
    )
    for frame, key, cast in key_casts:
        frame[key] = cast(frame[key])

    readings_with_buildings = readings_df.merge(buildings_df, how="left", on="building_id")
    return readings_with_buildings.merge(weather_df, how="left", on=["site_id", "timestamp"])

In [None]:
# NOTE(review): the relative-humidity, cold-chill, apparent-temperature and
# heat-index columns are added to `weather_df` in a later cell, i.e. after this
# merge, so they are not part of `train_df` unless the merge is run again.
train_df = merge_dfs(
    readings_df=readings_df,
    buildings_df=buildings_df,
    weather_df=weather_df,
)

## Feature engineering

In [None]:
def add_periodic_features(df: pd.DataFrame, feature: str, period: int) -> pd.DataFrame:
    """Encode a cyclic ``feature`` as sine and cosine columns.

    Adds ``<feature>_sin`` and ``<feature>_cos`` to ``df`` in place, where the
    angle is ``2 * pi * feature / period``, and returns ``df``.
    """
    angle = 2 * np.pi * df[feature] / period
    df[f"{feature}_sin"] = np.sin(angle)
    df[f"{feature}_cos"] = np.cos(angle)
    return df


In [None]:
# Meter reading features

def kbtu_to_kwh(df: pd.DataFrame) -> pd.DataFrame:
    """Convert site-0 electricity readings from kBTU to kWh (factor 0.2931).

    The rows converted are those with ``meter_id == 0`` that belong to site 0.
    Previously only ``building_id == 0`` was converted, which covers a single
    building rather than the whole site.
    NOTE(review): the unit issue is reported by the competition hosts for all
    site-0 electricity meters - confirm against the competition data notes.

    When ``df`` has no ``site_id`` column (readings not yet merged with the
    building metadata) the old ``building_id == 0`` selection is used, so
    existing callers keep working.

    ``meter_reading`` is rescaled on ``df`` itself, which is also returned.
    Calling the function twice on the same frame rescales twice.
    """
    if "site_id" in df.columns:
        in_scope = df["site_id"] == 0
    else:
        # Legacy selection; under-converts if site 0 has more than one building.
        in_scope = df["building_id"] == 0

    mask = in_scope & (df["meter_id"] == 0)
    df.loc[mask, "meter_reading"] = df.loc[mask, "meter_reading"] * 0.2931
    return df


def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar features derived from ``timestamp``.

    For hour (period 24), day of week (period 7) and month (period 12) the raw
    value plus its sine/cosine encoding are added, followed by ``is_weekend``
    (1.0 when the weekday number is 5 or 6, else 0.0).
    """
    stamps = df["timestamp"].dt
    cyclic_parts = (
        ("hour", stamps.hour, 24),
        ("day_of_week", stamps.weekday, 7),
        ("month", stamps.month, 12),
    )
    for name, values, period in cyclic_parts:
        df[name] = values
        df = add_periodic_features(df, name, period)

    df["is_weekend"] = (stamps.weekday >= 5).astype(float)
    return df

In [None]:
# Unit conversion first, then the calendar features.
# Caution: `kbtu_to_kwh` rescales `meter_reading` in place, so running this
# cell a second time on the same `train_df` applies the factor again.
train_df = train_df.pipe(kbtu_to_kwh).pipe(add_temporal_features)

In [None]:
# Building features

def add_building_age_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``building_age_years``: year of the reading minus ``year_built``.

    The column is written to ``df`` in place; rows without ``year_built``
    stay NaN.
    """
    reading_year = df["timestamp"].dt.year
    df["building_age_years"] = reading_year - df["year_built"]
    return df


def add_building_area_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``building_area_square_feet`` = ``square_feet`` * ``floor_count``, in place.

    NOTE(review): this assumes ``square_feet`` is a per-floor figure; if it is
    already the total floor area the product overstates the size - confirm
    against the data description. Rows without ``floor_count`` become NaN.
    """
    per_floor = df["square_feet"]
    floors = df["floor_count"]
    df["building_area_square_feet"] = per_floor * floors
    return df

In [None]:
# Both helpers only add a column, so their order does not matter.
train_df = train_df.pipe(add_building_age_feature).pipe(add_building_area_feature)

In [None]:
# Weather features

# https://www.kaggle.com/code/selfishgene/filtering-and-auto-correlation-tutorial


def saturation_vapour_pressure(temperature: pd.Series) -> pd.Series:
    """Evaluate ``6.1094 * exp(17.625 * T / (T + 243.04))`` element-wise.

    This has the shape of a Magnus-type approximation; presumably ``T`` is in
    degrees Celsius and the result in hPa - confirm against the data source.
    """
    exponent = 17.625 * temperature / (temperature + 243.04)
    return 6.1094 * np.exp(exponent)


def add_relative_humidity_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``relative_humidity`` (in percent) to ``df`` in place.

    It is the saturation vapour pressure at the dew temperature divided by the
    one at the air temperature, times 100.
    """
    saturation = {
        name: saturation_vapour_pressure(df[name])
        for name in ("air_temperature", "dew_temperature")
    }
    df["relative_humidity"] = (
        100 * saturation["dew_temperature"] / saturation["air_temperature"]
    )
    return df


def add_cold_chill_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``cold_chill`` column to ``df`` in place.

    Only rows with ``air_temperature <= 10.0`` (degrees C) and
    ``wind_speed >= 1.3`` (m/s) are written; other rows are not touched.
    """
    cold_and_windy = (df["air_temperature"] <= 10.0) & (df["wind_speed"] >= 1.3)
    temp = df.loc[cold_and_windy, "air_temperature"]

    # The wind speed is scaled by 3.6 (m/s to km/h) and raised to 0.16; the
    # formula uses this term twice.
    wind_term = (3.6 * df.loc[cold_and_windy, "wind_speed"]) ** 0.16

    df.loc[cold_and_windy, "cold_chill"] = (
        13.12
        + 0.6215 * temp
        - 11.37 * wind_term
        + 0.3965 * temp * wind_term
    )
    return df



def add_apparent_temperature_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``apparent_temperature`` column to ``df`` in place.

    Only rows with ``10 <= air_temperature <= 27`` are written. The value
    combines air temperature, a vapour-pressure term derived from
    ``relative_humidity`` (read as a percentage) and ``wind_speed``.
    """
    air = df["air_temperature"]
    mild = (air >= 10) & (air <= 27)

    temp = df.loc[mild, "air_temperature"]
    wind = df.loc[mild, "wind_speed"]
    humidity_fraction = df.loc[mild, "relative_humidity"] / 100

    vapour_pressure = humidity_fraction * 6.105 * np.exp((17.27 * temp) / (temp + 237.7))
    df.loc[mild, "apparent_temperature"] = temp + 0.33 * vapour_pressure - 0.7 * wind - 4
    return df


def add_heat_index_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``heat_index`` column to ``df`` in place.

    Only rows with ``air_temperature >= 27`` are written. The value is a
    second-order polynomial in air temperature and ``relative_humidity``
    (used as a percentage, without rescaling).
    """
    hot = df["air_temperature"] >= 27
    temp = df.loc[hot, "air_temperature"]
    rh = df.loc[hot, "relative_humidity"]

    # Squares are shared by several terms of the polynomial.
    temp_sq = temp ** 2
    rh_sq = rh ** 2

    df.loc[hot, "heat_index"] = (
        - 8.7847
        + 1.6114 * temp
        + 2.3385 * rh
        - 0.1461 * temp * rh
        - 0.0123 * temp_sq
        - 0.0164 * rh_sq
        + 2.212e-03 * temp_sq * rh
        + 7.255e-04 * temp * rh_sq
        - 3.582e-06 * temp_sq * rh_sq
    )
    return df


def cooling_degree_days(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for a cooling-degree-days feature.

    TODO: not implemented yet - the function ignores ``df`` and returns
    ``None`` despite the annotated return type.
    Background: https://www.investopedia.com/terms/c/colddegreeday.asp
    """
    return None


def heating_degree_days():
    """Placeholder for a heating-degree-days feature.

    TODO: not implemented yet - takes no arguments and returns ``None``.
    """
    return None

In [None]:
# Relative humidity has to come first: the apparent-temperature and heat-index
# helpers read the `relative_humidity` column.
# NOTE(review): these columns are added to `weather_df` after `train_df` was
# built from it, so they do not reach `train_df` unless the merge is run again
# (or the helpers are applied to `train_df`) - confirm which is intended.
weather_df = (
    weather_df
    .pipe(add_relative_humidity_feature)
    .pipe(add_cold_chill_feature)
    .pipe(add_apparent_temperature_feature)
    .pipe(add_heat_index_feature)
)

In [None]:
def target_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``log_meter_reading`` = ``log(1 + meter_reading)`` to ``df`` in place."""
    readings = np.array(df["meter_reading"])
    df["log_meter_reading"] = np.log1p(readings)
    return df