In [None]:
from datetime import datetime, date
import pandas as pd
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide configuration constants.

# Directory from which the CSV inputs are read.
INPUT_DATA_PATH = "/kaggle/input/ashrae-energy-prediction"

# Time range and sampling step; per the names these describe the training data
# (presumably hourly readings covering 2016 - confirm against the data).
TRAIN_DATA_RESOLUTION = "1h"
MIN_TRAIN_TIMESTAMP = pd.Timestamp("2016-01-01 00:00:00")
MAX_TRAIN_TIMESTAMP = pd.Timestamp("2016-12-31 23:00:00")

# Weather column names treated as features.
WEATHER_FEATURE_COLUMNS = [
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
]

In [None]:
# Load raw data: meter readings and weather observations, both read from INPUT_DATA_PATH.
train_csv_path = f"{INPUT_DATA_PATH}/train.csv"
weather_csv_path = f"{INPUT_DATA_PATH}/weather_train.csv"

readings_df = pd.read_csv(train_csv_path)
weather_df = pd.read_csv(weather_csv_path)

# Meter readings

## Meter counts by building

In [None]:
# Meter id -> meter type: {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}
# Conclusion: not every building has readings for every meter type.

# Number of distinct meter ids that appear for each building.
meter_counts_by_building = readings_df.groupby("building_id")["meter"].nunique()

# Unit-width integer bins starting at 1, so each bar counts the buildings
# that have exactly that many meter types.
bin_edges = np.arange(1, meter_counts_by_building.max() + 2)
freq, edges = np.histogram(meter_counts_by_building.to_numpy(), bins=bin_edges)

fig, ax = plt.subplots()
ax.bar(edges[:-1], freq, width=np.diff(edges), edgecolor="black")
ax.set_ylabel("Frequency")
ax.set_xlabel("n_meters")
ax.set_xticks(edges[:-1]);

## Distribution of meter readings

In [None]:
# Unit correction for one of the meters, see
# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/119261
# Readings of building 0 / meter 0 (electricity) are rescaled by 0.2931
# (presumably a kBTU -> kWh conversion - confirm against the linked discussion).
# NOTE: the rescaling is applied in place, so re-running this cell without
# reloading readings_df applies the factor a second time.
# NOTE(review): verify in the linked discussion whether the correction should be
# limited to building_id 0 or should cover a wider set of buildings.
mask = readings_df["building_id"].eq(0) & readings_df["meter"].eq(0)
readings_df.loc[mask, "meter_reading"] *= 0.2931

In [None]:
# Side-by-side histograms: raw meter readings (left) and log(reading + 1) (right).
readings = np.array(readings_df["meter_reading"])
log_readings = np.log(readings + 1)

fig, ax = plt.subplots(1, 2, figsize=(10, 3.5))

# Both panels share the same recipe; only the plotted values and the x label differ.
panels = ((readings, "meter_reading"), (log_readings, "log_meter_reading"))
for panel_ax, (values, xlabel) in zip(ax, panels):
    freq, edges = np.histogram(values)
    panel_ax.bar(edges[:-1], freq, width=np.diff(edges), edgecolor="black")
    panel_ax.set(ylabel="Frequency", xlabel=xlabel, xticks=edges[:-1])

fig.tight_layout();

## Distribution of meter readings by type

In [None]:
meter_types = {0: "electricity", 1: "chilledwater", 2: "steam", 3: "hotwater"}

# One row of panels per meter type: raw readings on the left, log(reading + 1) on the right.
n_meter_types = len(meter_types)
fig, ax = plt.subplots(n_meter_types, 2, figsize=(10, 3.5 * n_meter_types))

for i, meter_id in enumerate(meter_types):
    meter_type = meter_types[meter_id]
    raw_ax, log_ax = ax[i]

    meter_df = readings_df.loc[readings_df["meter"] == meter_id]

    # Left panel: histogram of the raw readings of this meter type.
    readings = meter_df["meter_reading"].to_numpy()
    freq, edges = np.histogram(readings)
    raw_ax.bar(edges[:-1], freq, width=np.diff(edges), edgecolor="black", label=meter_type)
    raw_ax.set_ylabel("Frequency")
    raw_ax.legend()

    # Right panel: histogram of the log-transformed readings, ticks placed on the bin edges.
    log_readings = np.log(readings + 1)
    freq, edges = np.histogram(log_readings)
    log_ax.bar(edges[:-1], freq, width=np.diff(edges), edgecolor="black", label=f"log {meter_type}")
    log_ax.set_xticks(edges[:-1])
    log_ax.legend()

fig.tight_layout();

## Distribution by meter and building

In [None]:
meter_types = {0: "electricity", 1: "chilledwater", 2: "steam", 3: "hotwater"}
max_buildings = 30  # maximum number of buildings to plot per meter type, chosen at random
sample_seed = 0  # fixed seed so that the same buildings are drawn on every run

# Dedicated generator: reproducible draws without touching numpy's global random state.
rng = np.random.default_rng(sample_seed)

# One panel per meter type; each line is the normalised histogram of
# log(meter_reading + 1) for a single building.
fig, ax = plt.subplots(1, len(meter_types), figsize=(len(meter_types) * 4, 3.5), sharey=True)
for m, meter_id in enumerate(meter_types):

    meter_df = readings_df[(readings_df["meter"] == meter_id)]
    all_building_ids = list(meter_df["building_id"].unique())
    # Sample WITHOUT replacement: the default (with replacement) can pick the
    # same building several times, so fewer distinct buildings would be shown.
    building_ids_to_plot = rng.choice(
        all_building_ids,
        size=min(max_buildings, len(all_building_ids)),
        replace=False,
    )

    colors = cm.viridis(np.linspace(0, 1, len(building_ids_to_plot)))
    for b, building_id in enumerate(building_ids_to_plot):
        readings = meter_df[meter_df["building_id"] == building_id]["meter_reading"]
        log_readings = np.log(np.array(readings) + 1)
        freq, edges = np.histogram(log_readings, bins=50)
        # Divide by the number of readings so that buildings with different
        # amounts of data are drawn on a comparable scale.
        ax[m].stairs(freq / len(readings), edges, label=building_id, color=colors[b])

    ax[m].set_title(f"{meter_types[meter_id]}")
    ax[m].set_xlabel("log(meter_reading + 1)")

# The y axis is shared across panels, so a single label on the first panel suffices.
ax[0].set_ylabel("Fraction of readings")
fig.tight_layout();

In [None]:
# Plot daily averages
readings_df["timestamp"] = pd.to_datetime(readings_df["timestamp"])

# Build the log-transformed reading and the calendar-day key once; both
# aggregations below reuse this frame instead of recomputing them.
# Only the columns needed for grouping are carried along.
daily_log_readings = readings_df[["building_id", "meter"]].assign(
    log_reading=np.log(readings_df["meter_reading"] + 1),
    timestamp=readings_df["timestamp"].dt.date,
)

# Mean log reading per (building, meter, day).
daily_average = (
    daily_log_readings
    .groupby(["building_id", "meter", "timestamp"])
    [["log_reading"]]
    .mean()
    .reset_index()
)

# Mean log reading per (meter, day), pooled over the readings of all buildings.
daily_building_average = (
    daily_log_readings
    .groupby(["meter", "timestamp"])
    [["log_reading"]]
    .mean()
    .reset_index()
)

meter_types = {0: "electricity", 1: "chilledwater", 2: "steam", 3: "hotwater"}
fig, ax = plt.subplots(len(meter_types), 1, figsize=(15, len(meter_types) * 2), sharex=True)
for m, meter_id in enumerate(meter_types):

    meter_df = daily_average[(daily_average["meter"] == meter_id)]

    # One thin gray line per building; groupby yields each building's rows in a
    # single pass instead of re-filtering the frame once per building.
    for building_id, readings in meter_df.groupby("building_id"):
        ax[m].plot(
            readings["timestamp"].values,
            readings["log_reading"].values,
            color="gray",
            lw=0.5,
            alpha=0.3
        )

    # Add building average to plot
    average_df = daily_building_average[daily_building_average["meter"] == meter_id]
    ax[m].plot(
        average_df["timestamp"].values,
        average_df["log_reading"].values,
        color="orange",
        lw=2.5,
    )

    ax[m].set_ylabel(f"{meter_types[meter_id]}")

fig.tight_layout();

# Buildings data

## Meter readings by primary use

In [None]:
# Building metadata; the cells below use its "building_id" and "primary_use" columns.
building_metadata_path = f"{INPUT_DATA_PATH}/building_metadata.csv"
building_df = pd.read_csv(building_metadata_path)

In [None]:
# Horizontal bar chart of the number of buildings per primary use.
building_count_by_use = building_df["primary_use"].value_counts()
ys = np.arange(len(building_count_by_use))

fig, ax = plt.subplots()
ax.barh(ys, building_count_by_use.to_numpy(), edgecolor="black")
ax.set_xlabel("Frequency")
# One tick per bar, labelled with the corresponding primary use.
ax.set_yticks(ys)
ax.set_yticklabels(building_count_by_use.index.tolist())
ax.set_title("Building count by primary use");

In [None]:
# Cast the join key to int on both frames so that both sides of the merge use the same dtype.
readings_df["building_id"] = readings_df["building_id"].astype(int)
building_df["building_id"] = building_df["building_id"].astype(int)

# Attach the building metadata to every reading. how="left" keeps all readings.
# validate="many_to_one" makes pandas raise a MergeError if building_df holds a
# building_id more than once, which would otherwise silently duplicate readings
# and bias every average computed from the merged frame.
readings_and_buildings = readings_df.merge(
    right=building_df,
    on="building_id",
    how="left",
    validate="many_to_one",
)

In [None]:
# Mean log(meter_reading + 1) per (primary use, meter, calendar day).
readings_and_buildings["timestamp"] = pd.to_datetime(readings_and_buildings["timestamp"])

# Calendar-day grouping key; the Series keeps the name "timestamp", which is
# therefore also the name of the day column in the aggregated result.
reading_day = readings_and_buildings["timestamp"].dt.date
readings_with_log = readings_and_buildings.assign(
    log_reading=np.log(readings_and_buildings["meter_reading"] + 1)
)

grouped_by_use = readings_with_log.groupby(["primary_use", "meter", reading_day])
daily_average_by_use = grouped_by_use[["log_reading"]].mean().reset_index()

In [None]:
meter_types = {0: "electricity", 1: "chilledwater", 2: "steam", 3: "hotwater"}

# One colour per primary use, shared by all panels.
building_uses = list(daily_average_by_use["primary_use"].unique())
colors = cm.viridis(np.linspace(0, 1, len(building_uses)))

# One panel per meter type, one line per primary use.
fig, ax = plt.subplots(len(meter_types), 1, figsize=(15, len(meter_types) * 2), sharex=True)

for meter_ax, (meter_id, meter_name) in zip(ax, meter_types.items()):
    meter_df = daily_average_by_use.loc[daily_average_by_use["meter"] == meter_id]

    for use_color, use in zip(colors, building_uses):
        use_df = meter_df.loc[meter_df["primary_use"] == use]
        meter_ax.plot(
            use_df["timestamp"].to_numpy(),
            use_df["log_reading"].to_numpy(),
            color=use_color,
            label=use,
        )

    meter_ax.set_ylabel(meter_name)

# Every panel plots the same uses with the same colours, so the handles of the
# last panel serve as a single figure-level legend.
handles, labels = ax[-1].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(1.15, 0.7), fontsize="small")

fig.tight_layout();

In [None]:
meter_types = {0: "electricity", 1: "chilledwater", 2: "steam", 3: "hotwater"}

# One heatmap per meter type: rows are primary uses, columns are days,
# cell values are the mean log(meter_reading + 1).
n_panels = len(meter_types)
fig, ax = plt.subplots(n_panels, 1, figsize=(16, n_panels * 3), sharex=True)

for m, (meter_id, meter_name) in enumerate(meter_types.items()):
    meter_df = daily_average_by_use.loc[daily_average_by_use["meter"] == meter_id]
    meter_pivot = meter_df.pivot(index="primary_use", columns="timestamp", values="log_reading")
    ax[m] = sns.heatmap(meter_pivot, ax=ax[m])
    heat_ax = ax[m]

    # Ticks are offset by 0.5, presumably to centre each label on its heatmap row.
    ys = np.arange(len(meter_pivot.index)) + 0.5
    heat_ax.set_yticks(ys)
    heat_ax.set_yticklabels(meter_pivot.index.tolist(), fontsize="small")
    heat_ax.set_ylabel(meter_name)
    heat_ax.set_xlabel("")

fig.tight_layout();