In [None]:
from datetime import datetime, date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Directory holding the competition CSV files.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# First and last timestamp of the hourly grid used when reindexing, and the
# spacing between consecutive grid points.
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")
TRAIN_DATA_RESOLUTION = "1h"

# Weather measurement columns that the imputation steps iterate over.
WEATHER_FEATURE_COLUMNS = [
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
]

In [None]:
# Load raw data.
# Only the weather table is read; the meter-reading and building-metadata
# loads below are kept disabled.
# train_df = pd.read_csv(f"{INPUT_DATA_PATH}/train.csv")
# building_df = pd.read_csv(f"{INPUT_DATA_PATH}/building_metadata.csv")
weather_df = pd.read_csv(
    filepath_or_buffer=f"{INPUT_DATA_PATH}/weather_train.csv",
)

In [None]:
# train_df["building_id"] = train_df["building_id"].astype("category")
# train_df["meter"] = train_df["meter"].astype("category")
# train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
# train_df["meter_reading"] = train_df["meter_reading"].astype(float)

In [None]:
# building_df["site_id"] = building_df["site_id"].astype("category")
# building_df["building_id"] = building_df["building_id"].astype("category")

In [None]:
# weather_df["site_id"] = weather_df["site_id"].astype("category")
# weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])

In [None]:
def reindex_weather_df(
    weather_df: pd.DataFrame,
    start=None,
    end=None,
    freq=None,
) -> pd.DataFrame:
    """Give every site exactly one row per timestamp on a regular time grid.

    Each site's rows are reindexed onto ``pd.date_range(start, end, freq)``
    (both endpoints included). Timestamps absent from the input appear as rows
    whose measurement columns are NaN; input rows whose timestamp is not on
    the grid are dropped by the reindex.

    Parameters
    ----------
    weather_df:
        Frame with at least ``site_id`` and ``timestamp`` columns.
        ``timestamp`` may hold strings or datetimes. The frame is not modified.
    start, end:
        First and last grid timestamp. Default to ``MIN_TRAIN_TIMESTAMP`` and
        ``MAX_TRAIN_TIMESTAMP``.
    freq:
        Grid spacing. Defaults to ``TRAIN_DATA_RESOLUTION``.

    Returns
    -------
    pd.DataFrame
        Site blocks stacked in group order with a fresh integer index.
        ``timestamp`` is the first column and ``site_id`` keeps the dtype it
        had in the input.

    NOTE(review): a site with duplicate timestamps is expected to make
    ``reindex`` fail -- confirm the input is unique per (site_id, timestamp).
    """
    start = MIN_TRAIN_TIMESTAMP if start is None else start
    end = MAX_TRAIN_TIMESTAMP if end is None else end
    freq = TRAIN_DATA_RESOLUTION if freq is None else freq

    # assign() builds a new frame, so the caller's timestamp column is left
    # untouched instead of being overwritten in place.
    weather_df = weather_df.assign(timestamp=pd.to_datetime(weather_df["timestamp"]))
    site_id_dtype = weather_df["site_id"].dtype

    timestamps = pd.date_range(
        start,
        end,
        freq=freq,
        inclusive="both",
        name="timestamp",
    )

    site_dfs = []
    for site_id, site_df in weather_df.groupby("site_id", observed=True):
        site_df = site_df.set_index("timestamp").reindex(timestamps).reset_index()
        # Rows created by the reindex have no site_id, and the NaNs upcast an
        # integer column to float: fill them in and restore the input dtype.
        site_df["site_id"] = site_df["site_id"].fillna(site_id).astype(site_id_dtype)
        site_dfs.append(site_df)

    if not site_dfs:
        # pd.concat() raises on an empty list; return an empty frame with the
        # same column layout the non-empty path produces.
        return weather_df.set_index("timestamp").reset_index()

    return pd.concat(site_dfs, ignore_index=True)

In [None]:
def timestamp_features(weather_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``weather_df`` with a ``date`` column added.

    ``date`` holds the calendar date (``datetime.date``) of each row's
    ``timestamp``. The timestamp column may hold datetimes or parseable
    strings; it is converted only for the purpose of extracting the date and
    is returned unchanged. The input frame is not modified.
    """
    # Coerce first so the .dt accessor also works on string timestamps.
    timestamps = pd.to_datetime(weather_df["timestamp"])
    return weather_df.assign(date=timestamps.dt.date)

In [None]:
def missing_weather_stats(weather_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column.

    Returns a frame indexed by the column names of ``weather_df`` with two
    columns: ``n_missing`` (count of NaN entries) and ``pct_missing`` (that
    count as a percentage of the number of rows).
    """
    row_total = len(weather_df)
    missing_per_column = weather_df.isna().sum()
    summary = missing_per_column.to_frame(name="n_missing")
    return summary.assign(
        pct_missing=lambda frame: frame["n_missing"] / row_total * 100
    )

In [None]:
# Expand every site onto the full time grid, then derive the date column.
weather_df = (
    weather_df
    .pipe(reindex_weather_df)
    .pipe(timestamp_features)
)

In [None]:
# Missing-value report for the reindexed frame, before any imputation step runs.
missing_weather_stats(weather_df)

In [None]:
def _fill_short_gaps(values: pd.Series) -> pd.Series:
    """Linearly interpolate up to 12 consecutive NaNs, then ffill/bfill up to 2."""
    values = values.interpolate("linear", limit=12)
    values = values.ffill(limit=2)
    return values.bfill(limit=2)


def interpolate(
    weather_df: pd.DataFrame,
    column: str,
    group_column: str = "site_id",
) -> pd.DataFrame:
    """Fill short gaps in ``column`` separately for each group (site).

    Within each group the values are linearly interpolated (at most 12
    consecutive NaNs), then forward-filled and backward-filled (at most 2
    rows each). Filling is done per ``group_column`` so that values from one
    site are never used to fill gaps at the edge of a neighbouring site's
    block of rows.

    Rows are processed in their existing order; the function does not sort,
    so each group is assumed to already be in chronological order.

    Parameters
    ----------
    weather_df:
        Input frame; it is not modified.
    column:
        Name of the column to fill.
    group_column:
        Column identifying independent series. If it is not present in
        ``weather_df`` (or is ``None``), the whole column is treated as one
        series.

    Returns
    -------
    pd.DataFrame
        A copy of ``weather_df`` with ``column`` gap-filled.
    """
    if group_column in weather_df.columns:
        filled = (
            weather_df
            .groupby(group_column, observed=True, sort=False)[column]
            .transform(_fill_short_gaps)
        )
        # Rows whose group key is missing belong to no group and come back as
        # NaN from transform; keep whatever value they originally had.
        filled = filled.fillna(weather_df[column])
    else:
        filled = _fill_short_gaps(weather_df[column])

    result = weather_df.copy()
    result[column] = filled
    return result

In [None]:
# Imputation step 1: fill short gaps in every weather feature by
# interpolation, then report what is still missing.
for column in WEATHER_FEATURE_COLUMNS:
    weather_df = weather_df.pipe(interpolate, column)

missing_weather_stats(weather_df)

In [None]:
def impute_with_same_day_mean(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill NaNs in ``column`` with the mean of the same site on the same date.

    The mean is taken over the non-missing values of ``column`` within each
    (``date``, ``site_id``) group. Groups with no observed value leave their
    NaNs in place.

    Parameters
    ----------
    weather_df:
        Frame with ``date`` and ``site_id`` columns plus ``column``. It is
        not modified.
    column:
        Name of the column to impute.

    Returns
    -------
    pd.DataFrame
        A copy of ``weather_df`` (same index, same columns) with ``column``
        imputed.
    """
    # transform() broadcasts each group's mean back onto its rows, so no
    # temporary "<column>_mean" column or merge is needed and an existing
    # column of that name cannot collide.
    same_day_mean = (
        weather_df
        .groupby(["date", "site_id"], observed=True)[column]
        .transform("mean")
    )

    result = weather_df.copy()
    result[column] = result[column].fillna(same_day_mean)
    return result

In [None]:
# Imputation step 2: fill remaining gaps in every weather feature with the
# same-day mean of the same site, then report what is still missing.
for column in WEATHER_FEATURE_COLUMNS:
    weather_df = weather_df.pipe(impute_with_same_day_mean, column)

missing_weather_stats(weather_df)

In [None]:
def impute_with_previous_day_mean(weather_df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Fill NaNs in ``column`` with the same site's mean from the preceding date.

    Daily means are computed per (``site_id``, ``date``) and shifted by one
    row within each site, ordered by date. "Previous day" therefore means the
    previous date present for that site, which is the previous calendar day
    only when the site has a row for every date. The first date of each site
    has no predecessor and is left as is, as are dates whose predecessor has
    no observed value.

    Parameters
    ----------
    weather_df:
        Frame with ``date`` and ``site_id`` columns plus ``column``. It is
        not modified.
    column:
        Name of the column to impute.

    Returns
    -------
    pd.DataFrame
        A copy of ``weather_df`` (same index, same columns) with ``column``
        imputed.
    """
    keys = ["site_id", "date"]

    # One row per (site, date) holding that day's mean of the observed values.
    daily_mean = (
        weather_df
        .groupby(keys, observed=True)[column]
        .mean()
        .reset_index()
        .sort_values(keys)
    )
    # Shift within each site so every date carries the mean of the date before it.
    daily_mean[column] = (
        daily_mean
        .groupby("site_id", observed=True)[column]
        .shift(1)
    )

    # Look the shifted mean up for every row. Only the key columns take part
    # in the merge, so no temporary column can collide with existing ones.
    # The right-hand keys are unique and a left merge keeps the left row
    # order, so the result lines up positionally with weather_df.
    previous_day_mean = weather_df[keys].merge(daily_mean, how="left", on=keys)[column]
    previous_day_mean.index = weather_df.index

    result = weather_df.copy()
    result[column] = result[column].fillna(previous_day_mean)
    return result

In [None]:
# Imputation step 3: fill remaining gaps in every weather feature with the
# previous-day mean of the same site, then report what is still missing.
for column in WEATHER_FEATURE_COLUMNS:
    weather_df = weather_df.pipe(impute_with_previous_day_mean, column)

missing_weather_stats(weather_df)