In [None]:
from datetime import datetime, date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter

In [None]:
# Constants

# Directory holding the competition CSV files.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# First / last timestamps of the training and test periods.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")
MIN_TEST_TIMESTAMP = pd.Timestamp("2017-01-01 00:00:00")
MAX_TEST_TIMESTAMP = pd.Timestamp("2018-12-31 23:00:00")
DATA_RESOLUTION = "1h"

# Numeric weather columns that are cast to float32 and used as model features.
WEATHER_FEATURE_COLUMNS = [
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
]

In [None]:
def cast_readings_data(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the meter-readings columns to compact dtypes, in place.

    ``building_id`` and ``meter_id`` become categoricals, ``timestamp`` is
    parsed to datetimes and ``meter_reading`` (when the column exists) is
    cast to float32. The same, mutated frame is returned.
    """
    for id_column in ("building_id", "meter_id"):
        df[id_column] = df[id_column].astype("category")

    df["timestamp"] = pd.to_datetime(df["timestamp"])

    # The target column is optional so the same helper serves frames without it.
    if "meter_reading" in df.columns:
        df["meter_reading"] = df["meter_reading"].astype(np.float32)

    return df


def cast_weather_data(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the weather columns to compact dtypes, in place.

    ``site_id`` becomes a categorical, ``timestamp`` is parsed to datetimes and
    every column listed in ``WEATHER_FEATURE_COLUMNS`` is cast to float32.
    The same, mutated frame is returned.
    """
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["site_id"] = df["site_id"].astype("category")
    for feature_column in WEATHER_FEATURE_COLUMNS:
        df[feature_column] = df[feature_column].astype(np.float32)
    return df


def cast_buildings_data(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the building-metadata columns to compact dtypes, in place.

    Identifier and label columns become categoricals; the numeric building
    attributes become float32. The same, mutated frame is returned.
    """
    categorical_columns = ("site_id", "building_id", "primary_use")
    numeric_columns = ("square_feet", "year_built", "floor_count")

    for name in categorical_columns:
        df[name] = df[name].astype("category")
    for name in numeric_columns:
        df[name] = df[name].astype(np.float32)

    return df

## Train data

### Load raw data

In [None]:
# Meter readings
# header=0 together with names= makes pandas discard the file's own header row
# and use the column names listed here instead.
readings_df_train = pd.read_csv(
    f"{INPUT_DATA_PATH}/train.csv",
    header=0,
    names=["building_id", "meter_id", "timestamp", "meter_reading"],
)
readings_df_train = cast_readings_data(readings_df_train)

# Weather
# Read with the file's own column names; cast_weather_data expects site_id,
# timestamp and every column in WEATHER_FEATURE_COLUMNS to be present.
weather_df_train = pd.read_csv(f"{INPUT_DATA_PATH}/weather_train.csv")
weather_df_train = cast_weather_data(weather_df_train)

# Buildings
# Shared by the train and test pipelines (merged onto both sets of readings).
buildings_df = pd.read_csv(f"{INPUT_DATA_PATH}/building_metadata.csv")
buildings_df = cast_buildings_data(buildings_df)

### Weather data

In [None]:
# Reindex weather data such that every site has a measurement
# for each training timestamp


def reindex_weather_data(
    weather_df: pd.DataFrame,
    start_timestamp: pd.Timestamp,
    end_timestamp: pd.Timestamp,
    freq: str = "1h"
) -> pd.DataFrame:
    """Give every site one row per timestamp in ``[start, end]``.

    Each site's rows are reindexed onto a regular grid (both endpoints
    included). Timestamps that were missing for a site appear as rows whose
    weather columns are NaN; rows outside the grid are dropped. The parsed
    ``timestamp`` column is also written back to the input frame.

    Returns the per-site frames stacked in group order with a fresh index.
    """
    weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])
    full_grid = pd.date_range(
        start_timestamp,
        end_timestamp,
        freq=freq,
        inclusive="both",
        name="timestamp",
    )

    def _reindex_site(site_id, site_rows: pd.DataFrame) -> pd.DataFrame:
        # Rows created by the reindex have no site_id yet; restore it.
        expanded = site_rows.set_index("timestamp").reindex(full_grid).reset_index()
        expanded["site_id"] = expanded["site_id"].fillna(value=site_id)
        return expanded

    per_site_frames = [
        _reindex_site(site_id, site_rows)
        for site_id, site_rows in weather_df.groupby("site_id", observed=True)
    ]
    return pd.concat(per_site_frames, ignore_index=True)

In [None]:
# Expand the training weather data onto the full hourly grid of the training
# period; timestamps a site did not report become rows of NaN features.
weather_df_train = reindex_weather_data(
    weather_df=weather_df_train,
    start_timestamp=MIN_TRAIN_TIMESTAMP,
    end_timestamp=MAX_TRAIN_TIMESTAMP,
)

In [None]:
# Functionality for imputing missing weather data

def interpolate(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill short gaps in ``column`` without leaking values across sites.

    Each site's series is linearly interpolated (at most 12 consecutive
    values), then forward- and backward-filled by at most 2 values. The fill
    runs per ``site_id`` because the frame stacks all sites on top of each
    other: filling the raw column would carry the last value of one site into
    the first rows of the next.

    Rows are assumed to be in chronological order within each site. If the
    frame has no ``site_id`` column the whole column is treated as one series.
    The column is updated in place and the same frame is returned.
    """

    def _fill_gaps(values: pd.Series) -> pd.Series:
        filled = values.interpolate("linear", limit=12)
        return filled.ffill(limit=2).bfill(limit=2)

    if "site_id" in weather_df.columns:
        filled = (
            weather_df
            .groupby("site_id", observed=True, sort=False)[column]
            .transform(_fill_gaps)
        )
        # Rows whose site_id is missing belong to no group and come back as
        # NaN from the transform; keep whatever value they had.
        weather_df[column] = filled.fillna(weather_df[column])
    else:
        weather_df[column] = _fill_gaps(weather_df[column])
    return weather_df


def _mean_weather_by_date_and_site(weather_df, column) -> pd.DataFrame:
    mean_values = (
        weather_df
        .groupby(["date", "site_id"])[[column]]
        .mean()
        .reset_index()
    )
    return mean_values


def _merge_onto_weather_df(weather_df, right, right_suffix) -> pd.DataFrame:
    weather_df = weather_df.merge(
        right=right,
        how="left",
        on=["date", "site_id"],
        suffixes=("", right_suffix)
    )
    return weather_df


def impute_with_same_day_mean(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill missing values of ``column`` with that site's mean for the same date.

    Expects a ``date`` column on ``weather_df``. Returns a new frame (the
    result of the merge) without the temporary mean column.
    """
    mean_column = f"{column}_mean"

    # Per (date, site) means, attached to every row as "<column>_mean".
    daily_means = _mean_weather_by_date_and_site(weather_df, column)
    merged = _merge_onto_weather_df(weather_df, daily_means, "_mean")

    merged[column] = merged[column].fillna(merged[mean_column])
    return merged.drop(columns=[mean_column])


def ffill_mean_by_date(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill missing values of ``column`` from the nearest day that has data.

    The daily per-site means are forward-filled and then backward-filled along
    the date axis within each site, so a day with no observations borrows the
    mean of the closest earlier (else later) day. The remaining gaps in
    ``column`` are then filled from those daily values.

    Expects a ``date`` column on ``weather_df``. Returns a new frame (the
    result of the merge) without the temporary mean column.
    """
    mean_column = f"{column}_mean"

    # Daily mean per site; days without any observation have a NaN mean.
    daily_means = _mean_weather_by_date_and_site(weather_df, column)

    # Propagate the daily means across empty days, site by site. The grouped
    # transform copes with an empty frame, and observed=True keeps unused
    # site categories from producing empty groups.
    daily_means = daily_means.sort_values(["site_id", "date"], ignore_index=True)
    daily_means[column] = (
        daily_means
        .groupby("site_id", observed=True)[column]
        .transform(lambda site_means: site_means.ffill().bfill())
    )

    # Merge back onto the hourly rows and fill what is still missing.
    weather_df = _merge_onto_weather_df(weather_df, daily_means, "_mean")
    weather_df[column] = weather_df[column].fillna(weather_df[mean_column])
    return weather_df.drop(columns=[mean_column])


def fill_missing_weather_data(weather_df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values in every column of ``WEATHER_FEATURE_COLUMNS``.

    Each column goes through three increasingly coarse fills: short-gap
    interpolation, the same-day mean of the site, and finally the mean of the
    nearest day that has data for the site.

    Returns a new frame. The input is left untouched: the helper ``date``
    column is added to a copy, so the caller's frame no longer ends up with a
    stray ``date`` column and a partially imputed first feature.
    """
    # assign() returns a copy, so the in-place interpolation below cannot
    # reach the caller's frame.
    weather_df = weather_df.assign(date=weather_df["timestamp"].dt.date)

    for column in WEATHER_FEATURE_COLUMNS:
        weather_df = interpolate(weather_df, column)
        weather_df = impute_with_same_day_mean(weather_df, column)
        weather_df = ffill_mean_by_date(weather_df, column)

    return weather_df.drop(columns=["date"])

In [None]:
# Weather data feature engineering
# Lagged / rolling features. Compute these before merging to make merges
# less memory intensive.

def add_smoothed_weather_feature(
    df: pd.DataFrame,
    feature: str,
    window_length: int = 12,
    polyorder: int = 2,
) -> pd.DataFrame:
    """Add ``<feature>_smoothed``: a per-site Savitzky-Golay smoothing of ``feature``.

    Parameters
    ----------
    df : frame with ``site_id``, ``timestamp`` and ``feature`` columns.
    feature : name of the column to smooth.
    window_length : number of consecutive samples in the filter window
        (default 12, the value previously hard-coded). Each site needs at
        least this many rows; some SciPy releases also require an odd value
        -- confirm against the installed version.
    polyorder : order of the fitted polynomial; must be smaller than
        ``window_length`` (default 2).

    Returns
    -------
    ``df`` left-joined with the new float32 column on (site_id, timestamp).

    Notes
    -----
    The filter is applied to the values as they are; NaNs in ``feature`` are
    not handled here and are expected to spread to neighbouring smoothed
    values -- impute first if that matters.
    """
    smoothed_column = f"{feature}_smoothed"
    site_dfs = []
    for site_id, site_df in df.groupby("site_id", observed=True):
        site_df = site_df[["site_id", "timestamp", feature]]
        # The filter works on row position, so rows must be in time order.
        site_df = site_df.sort_values("timestamp").drop_duplicates(keep="first")
        site_df[smoothed_column] = savgol_filter(
            np.array(site_df[feature]),
            window_length=window_length,
            polyorder=polyorder,
        )
        site_df[smoothed_column] = site_df[smoothed_column].astype(np.float32)
        site_dfs.append(site_df)
    site_dfs = pd.concat(site_dfs, ignore_index=True).drop(columns=[feature])
    df = df.merge(right=site_dfs, on=["site_id", "timestamp"], how="left")
    return df


def add_lagged_weather_feature(df: pd.DataFrame, feature: str, lag: int) -> pd.DataFrame:
    """Add ``<feature>_lag_<lag>``: the value of ``feature`` ``lag`` rows earlier.

    The shift is done per site on rows sorted by timestamp, so it is a shift by
    row position; it equals a time lag only when each site's timestamps are
    evenly spaced. The new float32 column is left-joined back onto ``df`` by
    (site_id, timestamp).
    """
    lag_column = f"{feature}_lag_{lag}"

    def _lag_one_site(site_rows: pd.DataFrame) -> pd.DataFrame:
        ordered = (
            site_rows.loc[:, ["site_id", "timestamp", feature]]
            .sort_values("timestamp")
            .drop_duplicates(keep="first")
        )
        ordered[lag_column] = ordered[feature].shift(lag).astype(np.float32)
        return ordered

    lagged = pd.concat(
        [_lag_one_site(site_rows) for _, site_rows in df.groupby("site_id", observed=True)],
        ignore_index=True,
    ).drop(columns=[feature])
    return df.merge(right=lagged, on=["site_id", "timestamp"], how="left")
    

def add_rolling_mean_weather_feature(df: pd.DataFrame, feature: str, window: int) -> pd.DataFrame:
    """Add ``<feature>_rolling_<window>``: a trailing mean over ``window`` rows.

    The mean is computed per site on rows sorted by timestamp; the first
    ``window - 1`` rows of each site are NaN. The new float32 column is
    left-joined back onto ``df`` by (site_id, timestamp).
    """
    rolling_column = f"{feature}_rolling_{window}"
    key_columns = ["site_id", "timestamp"]

    per_site = []
    for _, site_rows in df.groupby("site_id", observed=True):
        ordered = site_rows[key_columns + [feature]]
        ordered = ordered.sort_values("timestamp").drop_duplicates(keep="first")
        ordered[rolling_column] = (
            ordered[feature].rolling(window).mean().astype(np.float32)
        )
        # Only the keys and the new column are needed for the join.
        per_site.append(ordered.drop(columns=[feature]))

    rolling_values = pd.concat(per_site, ignore_index=True)
    return df.merge(right=rolling_values, on=key_columns, how="left")

In [None]:
WEATHER_LAGS = [1, 2]

# Air temperature, dew temperature and sea level pressure each get one lagged
# column per entry in WEATHER_LAGS.
for feature_name in ("air_temperature", "dew_temperature", "sea_level_pressure"):
    for lag in WEATHER_LAGS:
        weather_df_train = add_lagged_weather_feature(weather_df_train, feature_name, lag)

### Merge

In [None]:
def merge_dfs(readings_df: pd.DataFrame, buildings_df: pd.DataFrame, weather_df: pd.DataFrame) -> pd.DataFrame:
    """Attach building metadata and weather to every meter reading.

    Both joins are left joins, so every reading row is kept: buildings are
    matched on ``building_id`` and weather on the resulting
    (``site_id``, ``timestamp``) pair.
    """
    return (
        readings_df
        .merge(right=buildings_df, how="left", on="building_id")
        .merge(right=weather_df, how="left", on=["site_id", "timestamp"])
    )

In [None]:
# One row per training reading, with building metadata and that site's weather
# (including the lagged columns computed above) attached.
train_df = merge_dfs(readings_df_train, buildings_df, weather_df_train)

### Feature engineering

In [None]:
def kbtu_to_kwh(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the site 0 electricity readings from kBTU to kWh, in place.

    The competition's data description notes that the electric meter
    (meter 0) readings of *site 0* are reported in kBTU while everything else
    is in kWh. The correction therefore applies to every building of site 0,
    not only to the building whose id happens to be 0.

    Requires ``site_id``, ``meter_id`` and ``meter_reading`` columns (i.e. the
    readings already merged with the building metadata). Returns the same,
    mutated frame.
    """
    kbtu_to_kwh_factor = 0.2931  # 1 kBTU is about 0.2931 kWh
    mask = (df["site_id"] == 0) & (df["meter_id"] == 0)
    df.loc[mask, "meter_reading"] = df.loc[mask, "meter_reading"] * kbtu_to_kwh_factor
    return df


def add_target_transform(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``log_meter_reading`` = log(1 + meter_reading), in place."""
    readings = np.array(df["meter_reading"])
    df["log_meter_reading"] = np.log1p(readings)
    return df


def add_periodic_features(df: pd.DataFrame, feature: str, period: int) -> pd.DataFrame:
    """Encode a cyclic ``feature`` as ``<feature>_sin`` / ``<feature>_cos``.

    The value is mapped onto the unit circle using ``period`` as the length of
    one full cycle, so the two ends of the cycle are adjacent. Both columns
    are float32 and are added in place.
    """
    angle = 2 * np.pi * df[feature] / period
    df[f"{feature}_sin"] = np.sin(angle).astype(np.float32)
    df[f"{feature}_cos"] = np.cos(angle).astype(np.float32)
    return df


def add_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar features derived from ``timestamp``.

    Adds ``hour``, ``day_of_week`` and ``month`` (uint8), a sin/cos encoding of
    each, and an ``is_weekend`` flag (1 when the weekday number is 5 or 6).
    """
    calendar_parts = (
        ("hour", 24),
        ("day_of_week", 7),
        ("month", 12),
    )
    # Name of the .dt attribute that yields each calendar part.
    accessor_names = {"hour": "hour", "day_of_week": "weekday", "month": "month"}

    for name, period in calendar_parts:
        values = getattr(df["timestamp"].dt, accessor_names[name])
        df[name] = values.astype(np.uint8)
        df = add_periodic_features(df, name, period)

    df["is_weekend"] = (df["timestamp"].dt.weekday >= 5).astype(np.uint8)
    return df


def add_building_age_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``building_age_years``: year of the reading minus ``year_built``.

    Rows with a missing ``year_built`` get NaN. The column is added in place.
    """
    reading_year = df["timestamp"].dt.year
    df["building_age_years"] = reading_year - df["year_built"]
    return df


def add_building_area_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``building_area_square_feet`` = ``square_feet`` * ``floor_count``.

    Rows with a missing ``floor_count`` get NaN. The column is added in place.
    """
    footprint = df["square_feet"]
    floors = df["floor_count"]
    df["building_area_square_feet"] = footprint * floors
    return df


def add_building_purpose_id_feature(df: pd.DataFrame, purpose_id_map: dict[str, int]) -> pd.DataFrame:
    """Add ``primary_use_id`` by looking up ``primary_use`` in ``purpose_id_map``.

    The column is added in place and the same frame is returned.
    """
    purpose_ids = df["primary_use"].map(purpose_id_map)
    df["primary_use_id"] = purpose_ids
    return df


def saturation_vapour_pressure(temperature: pd.Series) -> pd.Series:
    """Evaluate ``6.1094 * exp(17.625 * T / (T + 243.04))`` element-wise.

    This looks like a Magnus-type approximation with ``T`` in degrees Celsius
    and the result in hPa -- confirm the units against the weather data.
    """
    exponent = 17.625 * temperature / (temperature + 243.04)
    return 6.1094 * np.exp(exponent)


def add_relative_humidity_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``relative_humidity`` (percent, float32), in place.

    Computed as 100 times the ratio of the saturation vapour pressure at the
    dew temperature to the one at the air temperature.
    """
    pressure_at_air_temp = saturation_vapour_pressure(df["air_temperature"])
    pressure_at_dew_temp = saturation_vapour_pressure(df["dew_temperature"])
    humidity_percent = 100 * pressure_at_dew_temp / pressure_at_air_temp
    df["relative_humidity"] = humidity_percent.astype(np.float32)
    return df


def add_cold_chill_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``cold_chill`` (float32) for cold and windy rows, in place.

    Only rows with an air temperature of at most 10.0 and a wind speed of at
    least 1.3 receive a value; every other row is NaN. The wind speed is
    multiplied by 3.6 before entering the formula.
    """
    # Cold chill only defined for temps below 10C and wind speeds above 1.3 m/s
    is_cold = df["air_temperature"] <= 10.0
    is_windy = df["wind_speed"] >= 1.3
    mask = is_cold & is_windy

    air_temp = df.loc[mask, "air_temperature"]
    wind_speed = df.loc[mask, "wind_speed"]
    wind_term = (3.6 * wind_speed) ** 0.16

    cold_chill = (
        13.12
        + 0.6215 * air_temp
        - 11.37 * wind_term
        + 0.3965 * air_temp * wind_term
    )
    df.loc[mask, "cold_chill"] = cold_chill.astype(np.float32)
    return df


def add_apparent_temperature_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``apparent_temperature`` (float32) for mild rows, in place.

    Only rows whose air temperature lies in [10, 27] (both ends included)
    receive a value; every other row is NaN. Requires the
    ``relative_humidity`` column (in percent) to exist already.
    """
    mask = df["air_temperature"].between(10, 27, inclusive="both")
    mild_rows = df.loc[mask]

    air_temp = mild_rows["air_temperature"]
    wind_speed = mild_rows["wind_speed"]
    humidity = mild_rows["relative_humidity"] / 100  # percent -> fraction

    # Vapour-pressure term of the formula, scaled by the humidity fraction.
    pressure = humidity * 6.105 * np.exp((17.27 * air_temp) / (air_temp + 237.7))
    apparent_temp = air_temp + 0.33 * pressure - 0.7 * wind_speed - 4

    df.loc[mask, "apparent_temperature"] = apparent_temp.astype(np.float32)
    return df


def add_heat_index_feature(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``heat_index`` (float32) for hot rows, in place.

    Only rows with an air temperature of at least 27 receive a value; every
    other row is NaN. The value is a second-order polynomial in air
    temperature and ``relative_humidity`` (in percent), which must already
    exist as a column.
    """
    mask = df["air_temperature"] >= 27
    air_temp = df.loc[mask, "air_temperature"]
    humidity = df.loc[mask, "relative_humidity"]

    air_temp_sq = air_temp ** 2
    humidity_sq = humidity ** 2

    heat_index = (
        -8.7847
        + 1.6114 * air_temp
        + 2.3385 * humidity
        - 0.1461 * air_temp * humidity
        - 0.0123 * air_temp_sq
        - 0.0164 * humidity_sq
        + 2.212e-03 * air_temp_sq * humidity
        + 7.255e-04 * air_temp * humidity_sq
        - 3.582e-06 * air_temp_sq * humidity_sq
    )
    df.loc[mask, "heat_index"] = heat_index.astype(np.float32)
    return df


def cooling_degree_days(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for a cooling-degree-days feature; not implemented yet.

    TODO: the body is only ``...``, so a call currently returns ``None``
    rather than the annotated DataFrame.
    """
    # https://www.investopedia.com/terms/c/colddegreeday.asp
    ...


def heating_degree_days():
    """Placeholder for a heating-degree-days feature; not implemented yet.

    TODO: takes no arguments and returns ``None`` for now.
    """
    ...

In [None]:
# Timestamp, meter-reading and building features. The unit correction runs
# before the log transform so the transformed target uses corrected readings.
train_df = (
    train_df
    .pipe(add_temporal_features)
    .pipe(kbtu_to_kwh)
    .pipe(add_target_transform)
    .pipe(add_building_age_feature)
    .pipe(add_building_area_feature)
)

# Integer id per primary-use label, numbered in order of first appearance in
# the training data.
primary_use_id_map = {
    use: use_id
    for use_id, use in enumerate(train_df["primary_use"].unique().tolist())
}
train_df = train_df.pipe(add_building_purpose_id_feature, primary_use_id_map)

# Weather features. Relative humidity comes first because the apparent
# temperature and heat index features read that column.
train_df = (
    train_df
    .pipe(add_relative_humidity_feature)
    .pipe(add_cold_chill_feature)
    .pipe(add_apparent_temperature_feature)
    .pipe(add_heat_index_feature)
    .pipe(add_periodic_features, "wind_direction", 360)
)

In [None]:
# Persist the engineered training frame; the path is relative, so the file is
# written to the notebook's working directory.
train_df.to_parquet("train_df.parquet")

## Test data

In [None]:
# Meter readings
# header=0 together with names= makes pandas discard the file's own header row
# and use the column names listed here instead. There is no meter_reading
# column here; cast_readings_data skips that cast when the column is absent.
readings_df_test = pd.read_csv(
    f"{INPUT_DATA_PATH}/test.csv",
    header=0,
    names=["row_id", "building_id", "meter_id", "timestamp"],
)
readings_df_test = cast_readings_data(readings_df_test)
readings_df_test["row_id"] = readings_df_test["row_id"].astype(np.uint32)

# Weather
weather_df_test = pd.read_csv(f"{INPUT_DATA_PATH}/weather_test.csv")
weather_df_test = cast_weather_data(weather_df_test)

In [None]:
# Reindex weather data onto the full hourly grid of the test period
weather_df_test = reindex_weather_data(
    weather_df=weather_df_test,
    start_timestamp=MIN_TEST_TIMESTAMP,
    end_timestamp=MAX_TEST_TIMESTAMP,
)

# Append the tail of the training weather so the first test timestamps have
# earlier rows to take their lagged values from. Only the raw weather columns
# are kept; the lag columns already on the training frame are recomputed.
train_timestamp_cutoff = MIN_TEST_TIMESTAMP - pd.Timedelta("1d")  # More than enough
timestamp_mask = weather_df_train["timestamp"] >= train_timestamp_cutoff
weather_cols = ["timestamp", "site_id", *WEATHER_FEATURE_COLUMNS]
train_weather_tail = weather_df_train.loc[timestamp_mask, weather_cols]
weather_df_test = pd.concat(
    [weather_df_test, train_weather_tail],
    axis=0,
    ignore_index=True,
)

In [None]:
# Compute lagged / rolling weather features
WEATHER_LAGS = [1, 2]

# Air temperature, dew temperature and sea level pressure each get one lagged
# column per entry in WEATHER_LAGS.
for feature_name in ("air_temperature", "dew_temperature", "sea_level_pressure"):
    for lag in WEATHER_LAGS:
        weather_df_test = add_lagged_weather_feature(weather_df_test, feature_name, lag)

# The training tail was appended after the test rows; restore site/time order.
weather_df_test = (
    weather_df_test
    .sort_values(["site_id", "timestamp"])
    .reset_index(drop=True)
)

In [None]:
# Merge all dfs: one row per test reading with building metadata and weather.
# The joins are left joins from the readings, so the training-tail weather rows
# appended above only matter as lag sources and add no rows here.
test_df = merge_dfs(readings_df_test, buildings_df, weather_df_test)

### Feature engineering

In [None]:
# Timestamp and building features. The primary-use ids reuse the mapping
# built from the training data so the ids agree between both sets.
test_df = (
    test_df
    .pipe(add_temporal_features)
    .pipe(add_building_age_feature)
    .pipe(add_building_area_feature)
    .pipe(add_building_purpose_id_feature, primary_use_id_map)
)

# Weather features. Relative humidity comes first because the apparent
# temperature and heat index features read that column.
test_df = (
    test_df
    .pipe(add_relative_humidity_feature)
    .pipe(add_cold_chill_feature)
    .pipe(add_apparent_temperature_feature)
    .pipe(add_heat_index_feature)
    .pipe(add_periodic_features, "wind_direction", 360)
)

In [None]:
# Persist the engineered test frame; the path is relative, so the file is
# written to the notebook's working directory.
test_df.to_parquet("test_df.parquet")