In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Constants
# Directory the raw CSV files are read from (a Kaggle input dataset path).
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

In [None]:
# Read the three raw tables: meter readings, building metadata and the
# training-period weather observations.
train_df, building_df, weather_df = (
    pd.read_csv(f"{INPUT_DATA_PATH}/{table_name}.csv")
    for table_name in ("train", "building_metadata", "weather_train")
)

In [None]:
# train_df["building_id"] = train_df["building_id"].astype("category")
# train_df["meter"] = train_df["meter"].astype("category")
# train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])
# train_df["meter_reading"] = train_df["meter_reading"].astype(float)

In [None]:
# building_df["site_id"] = building_df["site_id"].astype("category")
# building_df["building_id"] = building_df["building_id"].astype("category")

In [None]:
# NOTE: the weather table is loaded as `weather_df` (there is no `weather_train_df`).
# weather_df["site_id"] = weather_df["site_id"].astype("category")
# weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"])

In [None]:
def reindex_weather_df(
    weather_df: pd.DataFrame,
    start: str = "2016-01-01 00:00:00",
    end: str = "2016-12-31 23:00:00",
    freq: str = "1h",
) -> pd.DataFrame:
    """Put every site's weather records on a complete, regular time grid.

    Each site is reindexed to one row per timestamp between ``start`` and
    ``end`` (both inclusive). Timestamps that were absent for a site become
    rows whose measurement columns are NaN, so gaps show up explicitly.

    Parameters
    ----------
    weather_df:
        Frame with at least a ``site_id`` and a ``timestamp`` column. It is
        not modified; timestamps are parsed on a copy.
    start, end:
        Bounds of the grid; anything ``pd.date_range`` accepts. The defaults
        cover the whole of 2016.
    freq:
        Grid frequency passed to ``pd.date_range``.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-site frames, in ``groupby`` order. The index
        restarts at 0 for every site (it is deliberately not reset), so an
        index label equals the row position within its site.

    Raises
    ------
    ValueError
        Raised by ``reindex`` if a site has duplicate timestamps.
    """
    # Parse timestamps on a copy so the caller's frame keeps its original column.
    weather_df = weather_df.assign(timestamp=pd.to_datetime(weather_df["timestamp"]))
    site_id_dtype = weather_df["site_id"].dtype

    timestamps = pd.date_range(start, end, freq=freq, inclusive="both", name="timestamp")

    site_dfs = []
    for site_id, site_df in weather_df.groupby("site_id", observed=True):
        site_df = site_df.set_index("timestamp").reindex(timestamps).reset_index()
        # Rows inserted by the reindex have a NaN site_id (which also turns an
        # integer column into float); fill it and restore the original dtype.
        site_df["site_id"] = site_df["site_id"].fillna(value=site_id).astype(site_id_dtype)
        site_dfs.append(site_df)

    if not site_dfs:
        # No sites at all: pd.concat([]) would raise, so return the empty frame.
        return weather_df

    return pd.concat(site_dfs)

In [None]:
def missing_weather_stats(weather_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column.

    Returns a frame indexed by the column names of ``weather_df`` with two
    columns: ``n_missing`` (number of NaN entries) and ``pct_missing`` (that
    count as a percentage of the number of rows).
    """
    total_rows = len(weather_df)
    missing_per_column = weather_df.isna().sum()
    return pd.DataFrame(
        {
            "n_missing": missing_per_column,
            "pct_missing": missing_per_column / total_rows * 100,
        }
    )

In [None]:
# Replace the raw weather frame with its reindexed version.
# NOTE: this rebinds `weather_df`, so the raw frame is no longer reachable afterwards.
weather_df = reindex_weather_df(weather_df)

In [None]:
# Per-column count and percentage of missing values in the weather frame.
missing_weather_stats(weather_df)

In [None]:
# Number of missing dew_temperature readings at each site.
(
    weather_df
    .assign(missing_dew=weather_df["dew_temperature"].isna())
    .groupby("site_id")["missing_dew"]
    .sum()
)

In [None]:
# dew_temperature: for one site, plot a window of up to 30 rows either side of
# every missing reading so the surrounding signal can be inspected.
site_id = 0
site_df = weather_df[weather_df["site_id"] == site_id][["timestamp", "site_id", "dew_temperature"]].copy()
# Row positions (not index labels) of the missing readings within site_df.
nan_index = np.argwhere(site_df["dew_temperature"].isna()).flatten()

if len(nan_index) == 0:
    # plt.subplots(0, 1) raises, so skip plotting when there is nothing to show.
    print(f"site {site_id}: no missing dew_temperature values to plot")
else:
    # squeeze=False always returns a 2-D array of axes, even for a single panel.
    fig, axes = plt.subplots(
        len(nan_index), 1, figsize=(10, 2.5 * len(nan_index)), squeeze=False
    )
    for ax, nan_idx in zip(axes[:, 0], nan_index):
        site_df.iloc[max(nan_idx - 30, 0): nan_idx + 30].plot(
            x="timestamp", y="dew_temperature", ax=ax, lw=2.5
        )
        ax.set_title(f"site {site_id}: missing reading at {site_df['timestamp'].iloc[nan_idx]}")

    fig.tight_layout()

In [None]:
# Narrow the weather frame to a single site and the dew_temperature column.
site_id = 7
site_df = weather_df.loc[
    weather_df["site_id"] == site_id,
    ["timestamp", "site_id", "dew_temperature"],
].copy()

In [None]:
# Index labels of the rows with a missing dew_temperature, plus the same
# labels shifted 10 down and 10 up.
missing_idx = site_df.index[site_df["dew_temperature"].isna().to_numpy()]
missing_idx_lower, missing_idx_upper = missing_idx - 10, missing_idx + 10

In [None]:
# Show entries 150-199 of the missing-value index labels.
missing_idx[150:200]

In [None]:
# Plot dew_temperature for row positions 1080-1299 of the selected site.
# NOTE(review): the range is hard-coded; presumably picked from the missing_idx output above — confirm.
site_df.iloc[1080:1300].plot(x="timestamp", y="dew_temperature")

In [None]:
# Imputation plan
# air_temperature: linear interpolation, ffill, bfill
# dew_temperature: linear interpolation, ffill, bfill for sites != <TODO: site id(s) to exclude — the original note stops at "sites !=">

In [None]:
# `start` and `end` only exist as locals of reindex_weather_df, so the 2016
# range is spelled out here instead of relying on leftover kernel state.
# Positions 58*24 .. 60*24-1 of the hourly grid are 2016-02-28 and 2016-02-29.
late_feb_hours = pd.date_range(
    start="2016-01-01 00:00:00", end="2016-12-31 23:00:00", freq="1h"
)[58 * 24:60 * 24]
late_feb_hours

In [None]:
# Build the training table from the loaded frames instead of from leftover
# kernel state: meter readings -> building metadata (adds site_id) -> weather.
# Timestamps are parsed on both sides so the join keys share a dtype.
# indicator=True adds a `_merge` column that marks readings without a weather row.
merged_train_df = (
    train_df
    .assign(timestamp=pd.to_datetime(train_df["timestamp"]))
    .merge(building_df, on="building_id", how="left")
    .merge(
        weather_df.assign(timestamp=pd.to_datetime(weather_df["timestamp"])),
        on=["site_id", "timestamp"],
        how="left",
        indicator=True,
    )
)

In [None]:
# Rows of the merged table that found no matching weather record.
merged_train_df.loc[merged_train_df["_merge"] != "both"]