# Hourly Values and Weather Data

## Load daily and hourly data

In [24]:
from src.energy_forecast.config import PROCESSED_DATA_DIR, INTERIM_DATA_DIR, META_DIR, REPORTS_DIR
import polars as pl

# Read the hourly feature table from PROCESSED_DATA_DIR.
dataset_hourly_csv_ = PROCESSED_DATA_DIR / "dataset_hourly_feat.csv"
df_hourly = pl.read_csv(source=dataset_hourly_csv_)
# Cell output: number of rows in the hourly table.
df_hourly.height

688766

In [22]:
# Read the daily feature table from PROCESSED_DATA_DIR.
dataset_daily_csv_ = PROCESSED_DATA_DIR / "dataset_daily_feat.csv"
df_daily = pl.read_csv(source=dataset_daily_csv_)
# Cell output: number of rows in the daily table.
df_daily.height

101646

In [3]:
from src.energy_forecast.config import META_DIR

# Attach a postal code ("plz") to every row by looking the sensor id up in the three
# metadata files. The first non-null match wins, in the order listed below.
#
# NOTE(review): `data_df` is not created in any cell visible above this one, so this cell
# fails on a fresh kernel. Its row count matches `df_hourly` - TODO confirm where it should
# be loaded from and add that load step.
plz_sources = [
    # (metadata file, column holding the postal code)
    ("kinergy_meta.csv", "plz"),
    ("legacy_meta.csv", "plz"),
    ("dh_meta.csv", "postal_code"),
]

data_with_lookups = data_df
candidate_cols = []
for position, (meta_file, plz_column) in enumerate(plz_sources):
    candidate = f"_plz_candidate_{position}"
    # Keep only id -> postal code so the join does not drag every metadata column along.
    lookup = pl.read_csv(META_DIR / meta_file).select("id", pl.col(plz_column).alias(candidate))
    # Join by key instead of aligning separately joined frames by row position.
    # validate="m:1" raises if a metadata file lists an id twice, which would otherwise
    # silently duplicate measurement rows.
    data_with_lookups = data_with_lookups.join(lookup, on="id", how="left", validate="m:1")
    candidate_cols.append(candidate)

data_df = data_with_lookups.with_columns(
    pl.coalesce(candidate_cols).str.strip_chars().alias("plz")
).drop(candidate_cols)

# Cell output: the row count must be unchanged by the enrichment.
len(data_df)

688766

## Data Sources

In [5]:
# Number of rows per value of the `source` column.
data_df.group_by("source").agg(pl.len())

source,len
str,u32
"""kinergy""",163665
"""dh""",525101


In [4]:
# Smallest and largest `datetime` value per postal code.
city_df = data_df.group_by("plz").agg(
    min_date=pl.col("datetime").min(),
    max_date=pl.col("datetime").max(),
)
city_df

plz,min_date,max_date
str,str,str
"""22844""","""2022-08-23T08:00:00.000000""","""2025-04-22T11:00:00.000000"""
"""22419""","""2021-10-04T10:00:00.000000""","""2023-09-21T10:00:00.000000"""
"""10249""","""2022-10-19T13:00:00.000000""","""2023-09-13T10:00:00.000000"""
"""20539""","""2022-01-24T13:00:00.000000""","""2023-09-21T08:00:00.000000"""
"""91054""","""2021-08-13T13:00:00.000000""","""2023-09-18T15:00:00.000000"""
…,…,…
"""12681""","""2023-01-01T01:00:00.000000""","""2023-09-09T14:00:00.000000"""
"""97084""","""2021-07-08T15:00:00.000000""","""2023-07-07T00:00:00.000000"""
"""97074""","""2021-05-06T14:00:00.000000""","""2023-09-13T06:00:00.000000"""
"""95447""","""2021-08-13T13:00:00.000000""","""2023-01-31T09:00:00.000000"""


In [5]:
from src.energy_forecast.config import FEATURES_DIR

# Load the hourly weather table from FEATURES_DIR and display it.
weather_hourly_csv = FEATURES_DIR / "weather_hourly.csv"
weather_df = pl.read_csv(weather_hourly_csv)
weather_df

time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,plz
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""2022-08-23T06:00:00.000000000""",18.1,13.4,74.0,0.0,0.0,130.0,10.4,19.0,1018.0,15.0,4.0,22846
"""2022-08-23T07:00:00.000000000""",20.1,14.0,68.0,0.0,0.0,130.0,9.7,19.0,1018.3,6.0,4.0,22846
"""2022-08-23T08:00:00.000000000""",22.3,13.4,57.0,0.0,0.0,140.0,12.6,29.0,1018.3,60.0,3.0,22846
"""2022-08-23T09:00:00.000000000""",24.1,13.3,51.0,0.0,0.0,140.0,15.8,30.0,1018.6,60.0,1.0,22846
"""2022-08-23T10:00:00.000000000""",24.5,13.4,50.0,0.0,0.0,140.0,17.3,31.0,1018.5,60.0,1.0,22846
…,…,…,…,…,…,…,…,…,…,…,…,…
"""2023-09-17T13:00:00.000000000""",26.4,13.5,45.0,0.0,0.0,130.0,4.7,16.0,1015.9,60.0,4.0,10963
"""2023-09-17T14:00:00.000000000""",26.3,13.8,46.0,0.0,0.0,140.0,6.8,15.0,1015.6,60.0,4.0,10963
"""2023-09-17T15:00:00.000000000""",26.7,13.8,45.0,0.0,0.0,100.0,6.8,15.0,1015.3,60.0,3.0,10963
"""2023-09-17T16:00:00.000000000""",26.2,14.3,48.0,0.0,0.0,100.0,6.5,12.0,1015.2,60.0,1.0,10963


In [8]:
# Reuse the hourly feature table already loaded into `df_hourly` at the top of the notebook
# instead of parsing dataset_hourly_feat.csv a second time; both come from the same file.
df = df_hourly
df

id,datetime,diff,typ,primary_energy,prcp,snow,wdir,wspd,wpgt,pres,tsun,daily_avg,heated_area,heated_area_lod,anzahlwhg,ground_surface,building_height,storeys_above_ground,weekend,holiday,weekday,day_of_month,temp,dwpt,rhum,coco
str,str,f64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64,str,str,str,i64,i64,i64,i64,f64,f64,f64,f64
"""83758fff-e013-11eb-9d61-02b402…","""2022-01-24T13:00:00.000000000""",293.8,2,"""gas""",0.0,0.0,262.0,13.0,27.4,1034.1,0.0,71.207019,8125.67,,0,,,,0,0,1,24,5.2,1.9,79.0,4.0
"""83758fff-e013-11eb-9d61-02b402…","""2022-01-24T14:00:00.000000000""",282.5,2,"""gas""",0.0,0.0,260.0,12.2,25.6,1033.9,0.0,71.207019,8125.67,,0,,,,0,0,1,24,5.0,1.8,80.0,4.0
"""83758fff-e013-11eb-9d61-02b402…","""2022-01-24T15:00:00.000000000""",180.8,2,"""gas""",0.0,0.0,269.0,10.8,24.1,1033.9,0.0,71.207019,8125.67,,0,,,,0,0,1,24,5.0,1.3,77.0,4.0
"""83758fff-e013-11eb-9d61-02b402…","""2022-01-24T16:00:00.000000000""",192.1,2,"""gas""",0.0,0.0,268.0,10.1,20.9,1033.9,0.0,71.207019,8125.67,,0,,,,0,0,1,24,4.7,1.4,79.0,4.0
"""83758fff-e013-11eb-9d61-02b402…","""2022-01-24T17:00:00.000000000""",79.1,2,"""gas""",0.0,0.0,272.0,11.2,22.0,1034.0,0.0,71.207019,8125.67,,0,,,,0,0,1,24,4.5,0.8,77.0,4.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""cae17ef4-cfad-4446-8b09-3cf946…","""2025-04-20T13:00:00.000000000""",75.0,2,"""district heating""",0.0,,210.0,7.9,24.1,1007.6,17.0,47.456401,,"""4709.639785015887""",,"""1714.1421299993767""","""8.586""","""1""",1,0,7,20,9.7,7.0,83.0,4.0
"""cae17ef4-cfad-4446-8b09-3cf946…","""2025-04-20T14:00:00.000000000""",41.0,2,"""district heating""",0.0,,210.0,8.6,25.9,1007.6,56.0,47.456401,,"""4709.639785015887""",,"""1714.1421299993767""","""8.586""","""1""",1,0,7,20,10.6,6.9,78.0,4.0
"""cae17ef4-cfad-4446-8b09-3cf946…","""2025-04-21T02:00:00.000000000""",23.0,2,"""district heating""",0.0,,50.0,2.2,14.8,1008.3,0.0,47.456401,,"""4709.639785015887""",,"""1714.1421299993767""","""8.586""","""1""",0,0,1,21,2.5,2.5,100.0,4.0
"""cae17ef4-cfad-4446-8b09-3cf946…","""2025-04-21T14:00:00.000000000""",42.0,2,"""district heating""",0.0,0.0,200.0,7.9,18.0,1010.7,24.0,47.456401,,"""4709.639785015887""",,"""1714.1421299993767""","""8.586""","""1""",0,0,1,21,15.8,9.0,64.0,4.0


In [11]:
# Number of rows per primary energy carrier in the hourly feature table.
df.group_by("primary_energy").agg(pl.len())

primary_energy,len
str,u32
"""gas""",127381
"""district heating""",561385


In [19]:
# Row counts per primary energy carrier for the interim Kinergy hourly readings.
# The carrier comes from the Kinergy metadata, matched on sensor id.
df_kinergy_hourly = pl.read_csv(INTERIM_DATA_DIR / "kinergy_hourly.csv")
df_kinergy_meta = pl.read_csv(META_DIR / "kinergy_meta.csv")
(
    df_kinergy_hourly
    .join(df_kinergy_meta, on="id", how="left")
    .group_by("primary_energy")
    .agg(pl.len())
)

primary_energy,len
str,u32
"""gas""",128697
"""district heating""",40137


In [21]:
# Row counts per primary energy carrier for the interim Kinergy daily readings.
# Uses `df_kinergy_meta`, which is loaded in the preceding cell.
df_kinergy_daily = pl.read_csv(INTERIM_DATA_DIR / "kinergy_daily.csv")
(
    df_kinergy_daily
    .join(df_kinergy_meta, on="id", how="left")
    .group_by("primary_energy")
    .agg(pl.len())
)

primary_energy,len
str,u32
"""district heating""",1694
"""gas""",5434


## Time span for each sensor

In [42]:
import plotly.graph_objs as go

# One horizontal line per sensor, drawn from its first to its last hourly reading.
# maintain_order=True keeps the sensors in first-appearance order. Without it polars may
# return the groups in a different order on every run, which reshuffles the traces (and
# their colours) and makes the saved figure non-reproducible.
df_min_max_dates = df_hourly.group_by(["id"], maintain_order=True).agg(
    [pl.col("datetime").min().alias("min_date"),
     pl.col("datetime").max().alias("max_date"),
     # Row count per sensor. On the hourly table this counts hourly rows; the column keeps
     # the name "days" so the frame has the same schema as in the daily plot.
     pl.len().alias("days")
     ]
)

fig = go.Figure()
for span in df_min_max_dates.iter_rows(named=True):
    # The sensor id is used as the y value so every sensor gets its own row; the trace
    # name (shown on hover) is the sensor's row count.
    fig.add_trace(go.Scatter(x=[span["min_date"], span["max_date"]],
                             y=[span["id"], span["id"]],
                             mode='lines',
                             name=str(span["days"])))

fig.update_layout(
    title="Data Spans for Hourly Time-series",
    title_x=0.5,
    showlegend=False,
    template="plotly",
)
# Sensor ids are not informative as axis labels, so the y axis is hidden.
fig.update_yaxes(visible=False, showticklabels=False)

fig.show()
fig.write_image("../reports/figures/hourly_time_span.png")

In [43]:
# One horizontal line per sensor, drawn from its first to its last daily reading.
# maintain_order=True keeps the sensors in first-appearance order. Without it polars may
# return the groups in a different order on every run, which reshuffles the traces (and
# their colours) and makes the saved figure non-reproducible.
df_min_max_dates = df_daily.group_by(["id"], maintain_order=True).agg(
    [pl.col("datetime").min().alias("min_date"),
     pl.col("datetime").max().alias("max_date"),
     # Number of daily rows available for the sensor.
     pl.len().alias("days")
     ]
)

fig = go.Figure()
for span in df_min_max_dates.iter_rows(named=True):
    # The sensor id is used as the y value so every sensor gets its own row; the trace
    # name (shown on hover) is the sensor's row count.
    fig.add_trace(go.Scatter(x=[span["min_date"], span["max_date"]],
                             y=[span["id"], span["id"]],
                             mode='lines',
                             name=str(span["days"])))

fig.update_layout(
    title="Data Spans for Daily Time-series",
    title_x=0.5,
    showlegend=False,
    template="plotly",
)
# Sensor ids are not informative as axis labels, so the y axis is hidden.
fig.update_yaxes(visible=False, showticklabels=False)

fig.show()
fig.write_image("../reports/figures/daily_time_span.png")

## Outlier Detection - plot outliers

In [56]:
from src.energy_forecast.dataset import Dataset

# Build the project's Dataset wrapper at daily resolution. The cells below read the
# resulting frame from `ds.df`.
ds = Dataset(res="daily")
# NOTE(review): create() logs the row and sensor counts (see output below); its return
# value is not used here - confirm it populates `ds.df` in place.
ds.create()

2025-06-09 21:29:34.709500: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-09 21:29:34.744156: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749497374.786861 1684895 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749497374.800237 1684895 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749497374.831195 1684895 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

[32m2025-06-09 21:29:38.923[0m | [1mINFO    [0m | [36msrc.energy_forecast.dataset[0m:[36mcreate[0m:[36m46[0m - [1mCreating daily dataset[0m
[32m2025-06-09 21:29:38.953[0m | [1mINFO    [0m | [36msrc.energy_forecast.dataset[0m:[36mcreate[0m:[36m51[0m - [1mNumber of rows: 110005[0m
[32m2025-06-09 21:29:38.974[0m | [1mINFO    [0m | [36msrc.energy_forecast.dataset[0m:[36mcreate[0m:[36m53[0m - [1mNumber of sensors: 142[0m


In [68]:
# Pick a single sensor and keep only the columns needed for the outlier plots.
sensor_id = "a0bb40c4-7d73-4c5d-91bd-d82fe748a75d.74820f614mer"
df_s = ds.df.filter(pl.col("id") == sensor_id).select("id", "datetime", "diff")
df_s

id,datetime,diff
str,str,f64
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2022-08-24T00:00:00.000000""",491.0
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2022-08-25T00:00:00.000000""",489.0
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2022-08-26T00:00:00.000000""",417.0
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2022-08-27T00:00:00.000000""",426.0
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2022-08-28T00:00:00.000000""",524.0
…,…,…
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2025-04-18T00:00:00.000000""",0.0
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2025-04-19T00:00:00.000000""",0.0
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2025-04-20T00:00:00.000000""",0.0
"""a0bb40c4-7d73-4c5d-91bd-d82fe7…","""2025-04-21T00:00:00.000000""",0.0


In [64]:
import plotly.express as px

# Scatter of the selected sensor's `diff` values over time, before any outlier marking.
fig = px.scatter(df_s, x="datetime", y="diff")
fig.layout.template = "plotly"
fig.show()

In [75]:
# Mark upper-tail outliers with the IQR rule: a value is an outlier when it lies above
# Q3 + 1.5 * IQR. Only the upper fence is applied.
column = "diff"
q25 = df_s[column].quantile(0.25)
q75 = df_s[column].quantile(0.75)
iqr = q75 - q25

upper_bound = q75 + 1.5 * iqr

# Rows at or below the upper fence.
filtered_df = df_s.filter(pl.col(column) <= upper_bound)

# Compare each value with the fence directly instead of testing whether the row's
# timestamp survived the filter above; the timestamp round trip mislabels rows as soon
# as a datetime value occurs more than once. Missing values are not "within the fence"
# and are therefore flagged, as they were before.
within_fence = (pl.col(column) <= upper_bound).fill_null(False)
df_new = df_s.with_columns((~within_fence).alias("outlier"))

# Plot the analysed column (y=column) so the figure always matches the fence computed above.
fig = px.scatter(df_new, x="datetime", y=column, color="outlier")
fig.update_layout(template="plotly")
fig.show()

In [77]:
from src.energy_forecast.config import REPORTS_DIR
# Save the current figure under REPORTS_DIR, with the sensor id in the file name.
outlier_png = REPORTS_DIR / f"outlier_{sensor_id}.png"
fig.write_image(outlier_png)