# **3 Data Dashboard**

---

## **3.1 Library**

In [10]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark.sql.functions as F
import matplotlib.dates as mdates
import numpy as np

In [11]:
from kafka3 import KafkaConsumer
from collections import deque, defaultdict
from matplotlib.animation import FuncAnimation
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from zoneinfo import ZoneInfo
from bisect import bisect_left

## **3.2 Meter reading**

For now, we treat the meter readings as static data, but plan to implement them as streaming data in the future.

## **3.2.1 Loading**

In [13]:
## ============================================================================================= Implementation
## --------------------------------------------------------------------------------------------- Session Creation
master = "local[4]"
app_name = "FIT5202_A2B_app"
conf = SparkConf().setMaster(master).setAppName(app_name)
# getOrCreate() reuses an already-running session, so re-running this cell is safe.
spark_session = SparkSession.builder.config(conf=conf).getOrCreate()
## --------------------------------------------------------------------------------------------- Schema
# Explicit schema: no inference pass over the CSV, and `value` is read as a fixed-point decimal.
schema_meter = StructType([
    StructField("building_id", IntegerType()),
    StructField("meter_type",  StringType()),
    StructField("ts",          TimestampType()),
    StructField("value",       DecimalType(10,4)),
    StructField("row_id",      IntegerType())
])
## --------------------------------------------------------------------------------------------- Load Data
DF_meter_raw = spark_session.read.csv(
    "../dataset/new_meters.csv",
    schema = schema_meter,
    header = True
)
## --------------------------------------------------------------------------------------------- Pre-processing
# Daily meter total per building, keyed by (building_id, month, day).
# The year is intentionally not part of the key, so the same calendar day of
# different years (if present in the file) is summed together.
# The earlier version also derived `year`, `hour` and a 6-hour `start_hour` bin,
# but the groupBy never used them, so they are no longer computed.
# `dayofmonth` is used instead of `day` because it is available on older PySpark too.
DF_meter_01 = (
    DF_meter_raw
    .select(
        "building_id",
        F.month(F.col("ts")).alias("month"),
        F.dayofmonth(F.col("ts")).alias("day"),
        "value"
    )
    .groupBy("building_id", "month", "day")
    .agg(F.sum(F.col("value")).alias("value"))
    .orderBy(["building_id", "month", "day"])
)

## **3.2.2 Pre-processing**

In [14]:
## ============================================================================================= Implementation
## --------------------------------------------------------------------------------------------- Schema
# Column layout of the building-information CSV, in file order.
_building_fields = [
    ("site_id",     IntegerType()),
    ("building_id", IntegerType()),
    ("primary_use", StringType()),
    ("square_feet", IntegerType()),
    ("floor_count", IntegerType()),
    ("row_id",      IntegerType()),
    ("year_built",  IntegerType()),
    ("latent_y",    DecimalType()),
    ("latent_s",    DecimalType()),
    ("latent_r",    DecimalType()),
]
schema_build = StructType([StructField(name, dtype) for name, dtype in _building_fields])
## --------------------------------------------------------------------------------------------- Read in
# Only the building -> site mapping is needed downstream, de-duplicated.
DF_build_raw = (
    spark_session.read
    .csv("../dataset/new_building_information.csv", schema=schema_build, header=True)
    .select("site_id", "building_id")
    .distinct()
)
## --------------------------------------------------------------------------------------------- Joining
# Left join: meter rows whose building has no match keep a null site_id.
DF_join = DF_meter_01.join(DF_build_raw, on="building_id", how="left")
## --------------------------------------------------------------------------------------------- Aggregation
# Roll building-level daily totals up to site-level daily totals.
DF_meter_final = (
    DF_join
    .groupBy("site_id", "month", "day")
    .agg(F.sum("value").alias("value"))
)

## **3.3 Dashboard**

### **3.3.1 Consumer Definition**

In [None]:
## ============================================================================================= Implementation
## --------------------------------------------------------------------------------------------- Kafka Consumer For 6a
host_ip = "kafka"
# Settings shared by all three dashboard consumers; only the topic and the
# per-poll record cap differ between them.
_consumer_common = dict(
    consumer_timeout_ms = 1000,
    auto_offset_reset   = 'latest',
    bootstrap_servers   = [f'{host_ip}:9092'],
    value_deserializer  = lambda m: json.loads(m.decode("utf-8")),
    enable_auto_commit  = False,
    api_version         = (0, 10)
)
topic = "FIT5202_A2B_prediction"
consumer03 = KafkaConsumer(topic, max_poll_records=200, **_consumer_common)
## --------------------------------------------------------------------------------------------- Kafka Consumer For 6b
topic = "FIT5202_A2B_daily"
consumer01 = KafkaConsumer(topic, max_poll_records=16, **_consumer_common)
## --------------------------------------------------------------------------------------------- Kafka Consumer For 6c
topic = "FIT5202_A2B_weekly"
consumer02 = KafkaConsumer(topic, max_poll_records=40, **_consumer_common)

### **3.3.2 Daily Dashboard**

In [15]:
%matplotlib notebook
## ============================================================================================= Implementation
## --------------------------------------------------------------------------------------------- Drawing Line
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_title("Stream daily energy consumption prediction by site")
ax.set_xlabel("date")
ax.set_ylabel("daily energy consumption prediction")
## --------------------------------------------------------------------------------------------- Formatting Timeline Axis
# Let matplotlib pick 5-7 date ticks and render them in Melbourne local time.
locator   = mdates.AutoDateLocator(
    minticks = 5, # Minimum number of ticks
    maxticks = 7  # Maximum number of ticks
)
formatter = mdates.DateFormatter(
    '%Y-%m-%d %H:%M:%S',
    tz = ZoneInfo('Australia/Melbourne')
)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
## --------------------------------------------------------------------------------------------- Buffer Definition
# Keep at most this many of the most recent points per site.
# NOTE(review): the original note said this corresponds to "two weeks" — confirm
# against the producer's window cadence.
WINDOW_MAX = 48
# The buffer size is bound as a default argument so that a later cell rebinding
# the global WINDOW_MAX cannot change the size of buffers created for new sites.
buffers_x = defaultdict(lambda _n=WINDOW_MAX: deque(maxlen=_n)) # Buffer for x-axis
buffers_y = defaultdict(lambda _n=WINDOW_MAX: deque(maxlen=_n)) # Buffer for y-axis
## --------------------------------------------------------------------------------------------- Function
lines = {}  # site_id -> Line2D drawn on `ax`
# Adding line
def ensure_line(site_id, *, _ax=ax, _lines=lines):
    """
    Create the plot line for `site_id` the first time that site is seen.

    `_ax` and `_lines` are bound to this cell's axes and line registry when the
    function is defined. Later cells in this notebook rebind the global names
    `ax` and `lines`; binding them here keeps this dashboard drawing on its own
    figure. Callers are expected to pass `site_id` only.
    """
    if site_id in _lines:
        return
    (line,) = _ax.plot([], [], label=f"site {site_id}")
    _lines[site_id] = line
    # Rebuild the legend so the new site shows up.
    _ax.legend(loc="upper left", ncol=2)
# Get data from kafka
def poll_batch(
        timeout_ms  = 100, # How long a single poll may block, in milliseconds
        max_records = 16   # Upper bound on records returned by one poll
) -> list:
    """
    Poll `consumer01` once and normalise the records for plotting.

    Returns a list of dicts with keys `site_id`, `ts_mpl` (the window end as a
    matplotlib date number) and `value` (the `daily_value_pred_adjust` field as
    a float). Returns an empty list when the poll yields nothing.

    Records that lack an expected field or carry an unparseable value are
    skipped, so one bad message cannot discard the rest of an already-consumed
    batch.
    """
    out = []
    raw = consumer01.poll(timeout_ms=timeout_ms, max_records=max_records)
    if not raw:
        return out
    for records in raw.values():
        for r in records:
            v = r.value
            try:
                site_id = v["site_id"]
                value   = float(v["daily_value_pred_adjust"])
                t_end   = pd.to_datetime(v["window"]["end"])  # Window end is used as the event time
                ts_mpl  = mdates.date2num(t_end.to_pydatetime())
            except (KeyError, TypeError, ValueError):
                continue  # malformed record: skip it, keep the rest of the batch
            out.append({"site_id": site_id, "ts_mpl": ts_mpl, "value": value})
    return out

# Animation update frequency (milliseconds between frames)
UPDATE_INTERVAL_MS = 7000

def update(frame, *, _poll=poll_batch, _ensure=ensure_line, _lines=lines,
           _xs=buffers_x, _ys=buffers_y, _ax=ax):
    """
    Frame callback for the daily dashboard's FuncAnimation.

    Polls a small batch, appends each point to its site's buffers, refreshes
    every line from the buffers and rescales the axes. Returns the list of
    updated line artists (empty when nothing has been plotted yet).

    The keyword-only parameters capture this cell's helpers and state when the
    function is defined. Later cells in this notebook rebind the global names
    `poll_batch`, `ensure_line`, `lines`, `buffers_x`, `buffers_y` and `ax`;
    without the capture, this still-running animation would start polling the
    other dashboard's consumer and drawing on its axes. The animation passes
    `frame` only.
    """
    batch = _poll(timeout_ms=10, max_records=5)

    # Nothing new and nothing drawn yet: leave the empty axes untouched.
    if not batch and not _lines:
        return []

    # Append new points to the per-site buffers.
    for item in batch:
        site_id = item["site_id"]
        _ensure(site_id)
        _xs[site_id].append(item["ts_mpl"])
        _ys[site_id].append(item["value"])

    # Push the buffered data into each line.
    artists = []
    for site_id, line in _lines.items():
        line.set_data(list(_xs[site_id]), list(_ys[site_id]))
        artists.append(line)

    # Rescale the axes to the current data.
    if artists:
        _ax.relim()
        _ax.autoscale_view()

    return artists
## --------------------------------------------------------------------------------------------- Showing
ani = FuncAnimation(
    fig,
    update,
    interval=UPDATE_INTERVAL_MS,
    blit=False,
    cache_frame_data=False
)
# Later cells reassign `ani`. An animation that is no longer referenced can be
# garbage-collected and stop, so keep a dedicated reference for this dashboard.
ani_daily = ani
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### **3.3.3 Weekly Dashboard**

In [16]:
%matplotlib notebook
## ============================================================================================= Implementation
## --------------------------------------------------------------------------------------------- Subplots Creation
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(20, 15), sharex=True)
fig.suptitle("Streaming weekly energy consumption prediction by start_hour & building")

# One subplot per start_hour slot, top to bottom.
SLOTS = [0, 6, 12, 18]
axis_map = dict(zip(SLOTS, axes.ravel()))

# Time axis formatting: 5-7 date ticks, labelled in Melbourne local time
locator   = mdates.AutoDateLocator(minticks=5, maxticks=7)
formatter = mdates.DateFormatter('%Y-%m-%d %H:%M:%S', tz=ZoneInfo('Australia/Melbourne'))
for slot, ax in axis_map.items():
    ax.set_title(f"start_hour = {slot}")
    ax.set_ylabel("value")
    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(formatter)

## --------------------------------------------------------------------------------------------- Buffer
# Per-series history cap, chosen large enough to act as "keep everything".
WINDOW_MAX = 1_000_000
# buffers_x[slot][bid] -> sorted list of matplotlib date numbers
# buffers_y[slot][bid] -> values aligned with buffers_x[slot][bid]
buffers_x = defaultdict(lambda: defaultdict(list))
buffers_y = defaultdict(lambda: defaultdict(list))

# lines[(slot, bid)] = {"line": Line2D, "pts": PathCollection}
lines = {}

def ensure_line(slot: int, building_id: int):
    """
    Register the line and scatter artists for (slot, building_id) on first sight.

    Does nothing when the series already exists or when `slot` has no subplot
    in `axis_map`.
    """
    series_key = (slot, building_id)
    target_ax = axis_map.get(slot)
    if series_key in lines or target_ax is None:
        return
    (series_line,) = target_ax.plot([], [], linestyle='-', linewidth=1, label=f"bld {building_id}")
    # Separate scatter artist used for points that are not joined to a neighbour.
    series_pts = target_ax.scatter([], [], s=18)
    lines[series_key] = {"line": series_line, "pts": series_pts}
    target_ax.legend(loc="upper left", ncol=2, fontsize=9)

# Handle different window.end field shapes
def _get_window_end(v: dict):
    if isinstance(v.get("window"), dict) and "end" in v["window"]:
        return v["window"]["end"]
    return v.get("window.end") or v.get("window_end") or v.get("end")

# --------------------------------------------------------------------------------------------- Fetch a batch & normalize
def poll_batch(timeout_ms=200, max_records=40):
    """
    Poll `consumer02` once and normalise the records for the weekly plot.

    Returns a list of dicts with keys `slot`, `building_id`, `ts_mpl`
    (window end as a matplotlib date number) and `value`. The value is read
    from `value`, falling back to `total_value_pred_adjust` and then
    `daily_value_pred_adjust`. Records without a window end, or that fail to
    parse for any reason, are skipped (best effort).
    """
    parsed = []
    polled = consumer02.poll(timeout_ms=timeout_ms, max_records=max_records)
    if not polled:
        return parsed

    for partition_records in polled.values():
        for record in partition_records:
            payload = record.value
            try:
                bid        = int(payload["building_id"])
                start_hour = int(payload["start_hour"])
                fallback   = payload.get("total_value_pred_adjust",
                                         payload.get("daily_value_pred_adjust"))
                reading    = float(payload.get("value", fallback))
                end_raw    = _get_window_end(payload)
                if end_raw is None:
                    continue
                end_num = mdates.date2num(pd.to_datetime(end_raw).to_pydatetime())
                parsed.append({"slot": start_hour, "building_id": bid, "ts_mpl": end_num, "value": reading})
            except Exception:
                # Deliberate best effort: a bad record must not stop the dashboard.
                continue
    return parsed
# --------------------------------------------------------------------------------------------- Ordered insert (no trimming)
def ordered_insert(slot, bid, x_new, y_new, maxlen=WINDOW_MAX):
    """
    Insert (x_new, y_new) into the (slot, bid) buffers, keeping x sorted.

    If `x_new` is already present, its y value is overwritten instead of adding
    a duplicate. When the buffer grows beyond `maxlen`, the oldest (smallest-x)
    point is dropped.
    """
    x_buf = buffers_x[slot][bid]
    y_buf = buffers_y[slot][bid]
    pos = bisect_left(x_buf, x_new)
    same_timestamp = pos < len(x_buf) and x_buf[pos] == x_new
    if same_timestamp:
        y_buf[pos] = y_new
        return
    x_buf.insert(pos, x_new)
    y_buf.insert(pos, y_new)
    if len(x_buf) > maxlen:
        del x_buf[0]
        del y_buf[0]
# --------------------------------------------------------------------------------------------- Animation update (dots on breaks, lines when continuous + show view window)
UPDATE_INTERVAL_MS = 1000   # milliseconds between animation frames
GAP_SEC = 8.0               # neighbours further apart than this (seconds) are not joined by a line
SEC_PER_DAY = 86400.0       # matplotlib date numbers are in days; used to convert to/from seconds
VIEW_WINDOW_SEC = 60.0      # width of the visible time window, in seconds
VIEW_WINDOW_DAYS = VIEW_WINDOW_SEC / SEC_PER_DAY  # same width in matplotlib date units

# Shaded-span artist currently shown on each subplot (None until first drawn)
span_map = dict.fromkeys(axis_map)

def update(_frame):
    """
    Frame callback for the weekly dashboard's FuncAnimation.

    Steps per frame:
      1. poll a batch and insert each point into its (slot, building) buffer;
      2. find, per subplot, the newest timestamp among its series;
      3. for every series, draw only the points within VIEW_WINDOW_DAYS of that
         newest timestamp — consecutive points at most GAP_SEC apart are joined
         by a line, points separated on both sides are drawn as dots;
      4. set each subplot's x-limits to that window, rescale y, and redraw the
         shaded span.

    Returns the list of artists that were given data this frame (empty list
    when there is no batch and no series exists yet). `_frame` is unused.
    """
    batch = poll_batch(timeout_ms=300, max_records=40)
    if not batch and not lines:
        return []

    # Fan out and insert into buffers (records for unknown slots are dropped)
    for item in batch:
        slot = int(item["slot"])
        bid  = int(item["building_id"])
        if slot not in axis_map:
            continue
        ensure_line(slot, bid)
        ordered_insert(slot, bid, item["ts_mpl"], item["value"])

    artists = []

    # Find the right edge per subplot: max timestamp among its series.
    # Slots without any buffered data get no entry in right_edge.
    right_edge = {}
    for slot in axis_map.keys():
        rights = []
        for (s, bid) in lines.keys():
            if s != slot:
                continue
            xs_all = buffers_x[s][bid]
            if xs_all:
                rights.append(xs_all[-1])  # buffers are kept sorted, so the last element is the max
        if rights:
            right_edge[slot] = max(rights)

    # Render: only draw data inside the window; dots for breaks, lines when continuous
    for (slot, bid), art in lines.items():
        xs_all = np.array(buffers_x[slot][bid], dtype=float)
        ys_all = np.array(buffers_y[slot][bid], dtype=float)

        # No data for this series (or its subplot): clear both artists.
        if xs_all.size == 0 or slot not in right_edge:
            art["line"].set_data([], [])
            art["pts"].set_offsets(np.empty((0, 2)))
            continue

        right = right_edge[slot]
        left  = right - VIEW_WINDOW_DAYS

        # keep only points in the window
        mask = xs_all >= left
        xs = xs_all[mask]
        ys = ys_all[mask]

        # Series has history, but none of it falls inside the current window.
        if xs.size == 0:
            art["line"].set_data([], [])
            art["pts"].set_offsets(np.empty((0, 2)))
            continue
        # A single visible point cannot form a line: show it as a dot only.
        if xs.size == 1:
            art["line"].set_data([], [])
            art["pts"].set_offsets(np.c_[xs, ys])
            artists.extend([art["pts"]])
            continue

        # Break segments by inserting NaN when the gap exceeds the threshold
        # (x values are in days, so gaps are converted to seconds first)
        gaps_sec = np.diff(xs) * SEC_PER_DAY
        line_x = [xs[0]]; line_y = [ys[0]]
        for i, g in enumerate(gaps_sec, start=1):
            if g > GAP_SEC:
                line_x += [np.nan, xs[i]]
                line_y += [np.nan, ys[i]]
            else:
                line_x.append(xs[i]); line_y.append(ys[i])

        # Isolated points: points that are broken on both sides.
        # The first point counts as broken on its left, the last on its right.
        left_break  = np.r_[True,  gaps_sec > GAP_SEC]
        right_break = np.r_[gaps_sec > GAP_SEC, True]
        isolated_idx = np.where(left_break & right_break)[0]

        art["line"].set_data(line_x, line_y)
        if isolated_idx.size:
            art["pts"].set_offsets(np.c_[xs[isolated_idx], ys[isolated_idx]])
        else:
            art["pts"].set_offsets(np.empty((0, 2)))

        artists.extend([art["line"], art["pts"]])

    # Set xlim per subplot and draw/update the shaded window.
    # NOTE(review): the subplots are created with sharex=True, so these
    # per-subplot set_xlim calls may override one another — confirm the
    # intended behaviour when slots have different right edges.
    for slot, ax in axis_map.items():
        if slot not in right_edge:
            continue
        right = right_edge[slot]
        left  = right - VIEW_WINDOW_DAYS

        ax.set_xlim(left, right)
        # NOTE(review): relim() is documented not to account for collections
        # such as the scatter artists — verify y-scaling for dot-only series.
        ax.relim()
        ax.autoscale_view(scalex=False, scaley=True)

        # Update shaded span: remove the previous one (ignoring failures), then draw anew
        if span_map[slot] is not None:
            try: span_map[slot].remove()
            except Exception: pass
            span_map[slot] = None
        span_map[slot] = ax.axvspan(left, right, alpha=0.10)

    return artists

ani = FuncAnimation(
    fig, update,
    interval=UPDATE_INTERVAL_MS,
    blit=False,
    cache_frame_data=False
)
# A later cell reassigns `ani`. An animation that is no longer referenced can be
# garbage-collected and stop, so keep a dedicated reference for this dashboard.
ani_weekly = ani
fig.autofmt_xdate()
plt.tight_layout()
plt.show()


<IPython.core.display.Javascript object>

---

## **3.4 Forecasting Shortfall Dashboard**

In [18]:
%matplotlib notebook
## ============================================================================================= Implementation
## --------------------------------------------------------------------------------------------- Config (16 sites, 7-day window by (month, day) arrival order)
SITE_IDS    = list(range(16))   # 0..15
WINDOW_LEN  = 7                 # keep last 7 (month, day) per site (by arrival order)
UPDATE_MS   = 4000              # milliseconds between refreshes (4 s)
TZ          = ZoneInfo("Australia/Melbourne")  # just for titles/logs
## --------------------------------------------------------------------------------------------- Meter lookup: {(site_id, month, day) -> meter_value}
pdf_meter = (
    DF_meter_final
    .select("site_id", "month", "day", "value")
    .toPandas()
)
# DF_meter_final comes from a left join, so site_id (and possibly other fields)
# can be null; such rows cannot be keyed and would make int()/float() fail.
_pdf_meter_valid = pdf_meter.dropna(subset=["site_id", "month", "day", "value"])
meter_by_site_md = {
    (int(site), int(month), int(day)): float(value)
    for site, month, day, value in _pdf_meter_valid.itertuples(index=False, name=None)
}
## --------------------------------------------------------------------------------------------- Streaming state
# predicted daily totals: pred_sum[site_id][(m, d)] accumulates streamed values
pred_sum = defaultdict(lambda: defaultdict(float))

# per-site window of the last WINDOW_LEN (m, d) keys in arrival order;
# appending to a full deque drops the oldest key automatically
window_keys = {sid: deque(maxlen=WINDOW_LEN) for sid in SITE_IDS}  # deque([(m,d), ...])
## --------------------------------------------------------------------------------------------- Figure (4x4 grid)
fig, axes = plt.subplots(4, 4, figsize=(18, 12), sharey=True)
fig.suptitle("Daily shortfall/excess per site (pred − meter) — last 7 (month, day) by arrival")
ax_map = {sid: ax for sid, ax in zip(SITE_IDS, axes.ravel())}

# Keep bar containers & cached x-tick labels for fast updates
bars_by_site   = {}   # site_id -> BarContainer
xticks_by_site = {}   # site_id -> list[str] like ["03-19", ...]

def _fmt_md(m, d):
    return f"{int(m):02d}-{int(d):02d}"

def _ensure_key_in_window(site_id: int, md: tuple):
    """
    Add the (month, day) key to the site's window unless it is already there.

    The window is a bounded deque, so appending to a full window silently
    drops its oldest key.
    """
    site_window = window_keys[site_id]
    if md in site_window:
        return
    site_window.append(md)

def _is_in_window(site_id: int, md: tuple) -> bool:
    """Tell whether the (month, day) key is currently held in the site's window."""
    site_window = window_keys[site_id]
    return md in site_window

def _meter_total_for_site_md(site_id: int, m: int, d: int) -> float:
    """Look up the meter total for (site, month, day); 0.0 when no entry exists."""
    lookup_key = (site_id, m, d)
    return meter_by_site_md.get(lookup_key, 0.0)

def _poll_and_update(timeout_ms=100, max_records=10):
    """
    Poll `consumer03` once and fold the records into the streaming state.

    For each record:
      - read (site_id, month, day, value_pred_adjust); malformed records are
        skipped;
      - records for a site without a window/subplot are skipped;
      - if the (month, day) key was accumulated earlier but has since been
        pushed out of the site's window, it is frozen: the late arrival is
        ignored;
      - otherwise the key is added to the window if new (which may push out
        the oldest key) and the value is accumulated into `pred_sum`.

    Entries of `pred_sum` are kept after their key leaves the window; that is
    how a frozen key is recognised.
    """
    raw = consumer03.poll(timeout_ms=timeout_ms, max_records=max_records)
    if not raw:
        return

    for records in raw.values():
        for r in records:
            v = r.value
            try:
                site_id  = int(v["site_id"])
                m        = int(v["month"])
                d        = int(v["day"])
                val_pred = float(v["value_pred_adjust"])
            except (KeyError, TypeError, ValueError):
                continue  # skip malformed messages

            if site_id not in window_keys:
                continue  # no window/subplot is configured for this site

            md = (m, d)
            site_sums = pred_sum[site_id]
            # Seen before but no longer in the window -> frozen, ignore.
            if md in site_sums and not _is_in_window(site_id, md):
                continue

            _ensure_key_in_window(site_id, md)
            site_sums[md] += val_pred

def _redraw_all_sites():
    """
    Redraw the bar chart of every site from the current streaming state.

    Each site shows one bar per key in its window (left to right = oldest to
    newest); bar height is predicted total minus meter total. Bars are rebuilt
    when the set of labels changed or no bars exist yet, otherwise only their
    heights are updated. Returns the list of bar artists that were drawn.
    """
    def _reset_axes(target, site):
        # Clear the subplot and restore its title, labels and zero line.
        target.cla()
        target.set_title(f"site {site}")
        target.set_xlabel("month-day")
        target.set_ylabel("shortfall (kWh)")
        target.axhline(0, linewidth=1)

    artists = []
    for sid, ax in ax_map.items():
        keys = list(window_keys[sid])
        if not keys:
            # Nothing received for this site yet: show an empty, labelled subplot.
            _reset_axes(ax, sid)
            xticks_by_site[sid] = []
            bars_by_site[sid] = None
            continue

        # Shortfall and tick label per key in the window.
        site_pred = pred_sum[sid]
        heights = [
            site_pred.get((m, d), 0.0) - _meter_total_for_site_md(sid, m, d)
            for (m, d) in keys
        ]
        labels = [_fmt_md(m, d) for (m, d) in keys]
        x = np.arange(len(keys))

        window_changed = xticks_by_site.get(sid) != labels
        if window_changed or bars_by_site.get(sid) is None:
            # Window slid (or first draw): rebuild the bars from scratch.
            _reset_axes(ax, sid)
            bars = ax.bar(x, heights)
            ax.set_xticks(x, labels, rotation=0)
            ax.set_xlim(-0.5, len(keys) - 0.5)
            bars_by_site[sid]   = bars
            xticks_by_site[sid] = labels
        else:
            # Same keys as last frame: only the heights can have changed.
            bars = bars_by_site[sid]
            for rect, h in zip(bars, heights):
                rect.set_height(h)
        artists.extend(list(bars))

        # Rescale y only; x-limits are fixed by the bar positions.
        ax.relim()
        ax.autoscale_view(scalex=False, scaley=True)

    return artists

def update(_frame):
    """Frame callback: ingest one batch of predictions, then redraw every site."""
    _poll_and_update(max_records=200, timeout_ms=300)
    drawn = _redraw_all_sites()
    return drawn

# one-time axis cosmetics
for sid, ax in ax_map.items():
    # Title, axis labels and a zero reference line for every subplot.
    ax.set(title=f"site {sid}", xlabel="month-day", ylabel="shortfall (kWh)")
    ax.axhline(0, linewidth=1)

# Keep the animation in a variable so that it stays referenced while it runs.
ani = FuncAnimation(fig, update, interval=UPDATE_MS, blit=False, cache_frame_data=False)
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>