# Features

This notebook creates features for the raw data.

## Weather Features

Add features like temperature, humidity, sun hours, ...

Start with the daily data

In [7]:
import numpy as np
import polars as pl

from src.energy_forecast.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

data_df = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").with_columns(pl.col("datetime").str.to_datetime())
data_df

[32m2025-02-12 15:24:06.006[0m | [1mINFO    [0m | [36msrc.energy_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\User\PycharmProjects\energy-forecast-wahl[0m


id,datetime,diff,source
str,datetime[μs],f64,str
"""9500b2eb-c260-4200-b657-125604…",2022-08-24 00:00:00,1.0,"""dh"""
"""9500b2eb-c260-4200-b657-125604…",2022-08-25 00:00:00,1.0,"""dh"""
"""9500b2eb-c260-4200-b657-125604…",2022-08-26 00:00:00,1.0,"""dh"""
"""9500b2eb-c260-4200-b657-125604…",2022-08-27 00:00:00,1.0,"""dh"""
"""9500b2eb-c260-4200-b657-125604…",2022-08-28 00:00:00,9.0,"""dh"""
…,…,…,…
"""400302GVG""",2021-08-17 00:00:00,29.6,"""legacy"""
"""400302GVG""",2021-08-18 00:00:00,25.2,"""legacy"""
"""400302GVG""",2021-08-19 00:00:00,27.3,"""legacy"""
"""400302GVG""",2021-08-20 00:00:00,28.3,"""legacy"""


Find time intervals for every city

In [8]:
data_df = data_df.with_columns(pl.coalesce(data_df.join(pl.read_csv(RAW_DATA_DIR / "kinergy_meta.csv"), on="id", how="left")["plz"],
    data_df.join(pl.read_csv(RAW_DATA_DIR / "legacy_meta.csv"), on="id", how="left")["plz"],
    data_df.join(pl.read_csv(RAW_DATA_DIR / "dh_meta.csv").rename({"eco_u_id": "id", "postal_code":"plz"}), on="id", how="left")["plz"],
).str.strip_chars())

In [9]:
city_df = data_df.group_by(pl.col("plz")).agg(pl.col("datetime").min().alias("min_date"),
                                                             pl.col("datetime").max().alias("max_date")).filter(~(pl.col("plz") == "2700"))  # wien
city_df

plz,min_date,max_date
str,datetime[μs],datetime[μs]
"""22848""",2021-10-29 00:00:00,2022-04-01 00:00:00
"""21493""",2018-02-24 00:00:00,2022-04-04 00:00:00
"""22523""",2020-08-19 00:00:00,2022-03-16 00:00:00
"""25336""",2018-12-14 00:00:00,2022-04-04 00:00:00
"""24118""",2020-02-15 00:00:00,2022-03-16 00:00:00
…,…,…
"""22081""",2018-07-12 00:00:00,2020-12-29 00:00:00
"""21033""",2018-08-08 00:00:00,2022-03-23 00:00:00
"""10249""",2022-10-19 00:00:00,2023-09-13 00:00:00
"""21149""",2021-01-28 00:00:00,2022-03-18 00:00:00


Add coordinates to every city

In [10]:
import pgeocode

rows = list()
for plz in city_df["plz"].unique():
    data = pgeocode.Nominatim("de").query_postal_code(str(plz))
    rows.append({"plz":plz, "lat": data["latitude"], "lon": data["longitude"], "state": data["state_code"]})

info_df = pl.DataFrame(rows)
city_df = city_df.join(info_df, on="plz", how="left")
city_df

plz,min_date,max_date,lat,lon,state
str,datetime[μs],datetime[μs],f64,f64,str
"""22848""",2021-10-29 00:00:00,2022-04-01 00:00:00,53.6736,9.9833,"""SH"""
"""21493""",2018-02-24 00:00:00,2022-04-04 00:00:00,53.551392,10.510546,"""SH"""
"""22523""",2020-08-19 00:00:00,2022-03-16 00:00:00,53.6079,9.9097,"""HH"""
"""25336""",2018-12-14 00:00:00,2022-04-04 00:00:00,53.73565,9.6567,"""SH"""
"""24118""",2020-02-15 00:00:00,2022-03-16 00:00:00,54.3334,10.1176,"""SH"""
…,…,…,…,…,…
"""22081""",2018-07-12 00:00:00,2020-12-29 00:00:00,53.5758,10.0364,"""HH"""
"""21033""",2018-08-08 00:00:00,2022-03-23 00:00:00,53.501725,10.1756,"""HH"""
"""10249""",2022-10-19 00:00:00,2023-09-13 00:00:00,52.5238,13.4428,"""BE"""
"""21149""",2021-01-28 00:00:00,2022-03-18 00:00:00,53.4667,9.867,"""HH"""


In [33]:
city_df.write_csv(RAW_DATA_DIR / "cities.csv")

In [54]:
from meteostat import Point, Daily

weather_dfs = list()
for row in city_df.iter_rows():
    start = row[1]
    end = row[2]
    loc = Point(row[3], row[4])

    data = Daily(loc, start, end)
    data = data.fetch()
    weather_dfs.append(pl.from_pandas(data.reset_index()).with_columns(pl.lit(row[0]).alias("plz")))
weather_df = pl.concat(weather_dfs)
weather_df

time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,plz
datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2021-03-20 00:00:00,1.5,-5.1,5.1,0.4,0.0,262.0,15.1,44.3,1025.7,0.0,"""22335"""
2021-03-21 00:00:00,5.7,1.9,9.7,0.0,0.0,304.0,22.0,63.7,1016.7,330.0,"""22335"""
2021-03-22 00:00:00,4.4,-1.5,9.4,0.3,0.0,302.0,14.0,38.2,1021.6,222.0,"""22335"""
2021-03-23 00:00:00,5.7,4.1,7.2,0.0,0.0,257.0,11.9,31.3,1023.6,0.0,"""22335"""
2021-03-24 00:00:00,8.6,4.2,13.8,0.0,0.0,234.0,11.5,44.3,1021.3,294.0,"""22335"""
…,…,…,…,…,…,…,…,…,…,…,…
2022-03-12 00:00:00,4.2,-1.4,11.4,0.0,0.0,126.0,17.6,43.9,1025.1,636.0,"""18119"""
2022-03-13 00:00:00,5.8,-0.3,11.8,0.0,0.0,117.0,16.2,47.2,1022.7,636.0,"""18119"""
2022-03-14 00:00:00,6.1,1.1,11.5,0.0,0.0,144.0,12.2,31.7,1024.6,462.0,"""18119"""
2022-03-15 00:00:00,6.7,2.0,10.6,0.0,0.0,270.0,10.1,26.3,1026.8,150.0,"""18119"""


Merge with data

In [5]:
data_df.with_columns(pl.col("date").dt.date()).join(weather_df.with_columns(pl.col("time").dt.date().alias("date")),
                                                    on=["plz", 'date'], how="left")

id,date,diff,primary_energy,adresse,ort,plz,source,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,date,f64,str,str,str,i64,str,datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-01 00:00:00,11.8,9.6,15.9,18.3,0.0,201.0,18.7,58.0,1003.3,156.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-02 00:00:00,12.4,9.3,16.4,11.4,0.0,264.0,15.1,45.7,1013.8,318.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-03 00:00:00,12.7,8.1,16.2,0.1,0.0,279.0,13.7,45.7,1023.0,138.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-04 00:00:00,12.9,8.8,17.8,0.0,0.0,223.0,11.5,35.3,1019.8,354.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-05 00:00:00,15.3,12.3,17.6,0.0,0.0,213.0,22.7,58.0,1013.9,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-11 00:00:00,4.6,-0.3,9.7,0.0,0.0,122.0,30.2,58.0,1023.9,636.0
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-12 00:00:00,5.8,0.2,11.9,0.0,0.0,132.0,23.4,54.4,1021.4,594.0
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-13 00:00:00,6.1,0.0,12.0,0.0,0.0,117.0,19.1,38.9,1019.6,654.0
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-14 00:00:00,6.5,1.3,9.9,1.3,0.0,180.0,11.2,25.9,1023.9,240.0


From the [meteostat](https://dev.meteostat.net/python/daily.html#api) documentation:

Column	Description	Type

station	The Meteostat ID of the weather station (only if query refers to multiple stations)	String

time	The date	Datetime64

tavg	The average air temperature in °C	Float64

tmin	The minimum air temperature in °C	Float64

tmax	The maximum air temperature in °C	Float64

prcp	The daily precipitation total in mm	Float64

snow	The snow depth in mm	Float64

wdir	The average wind direction in degrees (°)	Float64

wspd	The average wind speed in km/h	Float64

wpgt	The peak wind gust in km/h	Float64

pres	The average sea-level air pressure in hPa	Float64

tsun	The daily sunshine total in minutes (m)	Float64


Humidity is missing from Daily-data, we can retrieve hourly data and merge to daily data

In [55]:
from meteostat import Point, Hourly

weather_dfs = list()
for row in city_df.iter_rows():
    start = row[1]
    end = row[2]
    loc = Point(row[3], row[4])

    data = Hourly(loc, start, end)
    data = data.fetch()
    weather_dfs.append(pl.from_pandas(data.reset_index()).group_by_dynamic(
        index_column="time", every="1d"
    ).agg(pl.col("rhum").mean().alias("hum_avg"),
          pl.col("rhum").min().alias("hum_min"),
          pl.col("rhum").max().alias("hum_max")
          ).with_columns(pl.lit(row[0]).alias("plz")))
weather_df_hourly = pl.concat(weather_dfs)
weather_df_hourly



time,hum_avg,hum_min,hum_max,plz
datetime[ns],f64,f64,f64,str
2021-03-20 00:00:00,77.791667,55.0,98.0,"""22335"""
2021-03-21 00:00:00,75.75,50.0,97.0,"""22335"""
2021-03-22 00:00:00,84.25,58.0,99.0,"""22335"""
2021-03-23 00:00:00,84.5,75.0,93.0,"""22335"""
2021-03-24 00:00:00,73.541667,55.0,91.0,"""22335"""
…,…,…,…,…
2022-03-12 00:00:00,42.583333,23.0,57.0,"""18119"""
2022-03-13 00:00:00,40.25,26.0,56.0,"""18119"""
2022-03-14 00:00:00,49.916667,27.0,93.0,"""18119"""
2022-03-15 00:00:00,86.208333,64.0,100.0,"""18119"""


Add to other weather data

In [56]:
weather_df = weather_df_hourly.join(weather_df, on=["plz", 'time'], how="left")
weather_df

time,hum_avg,hum_min,hum_max,plz,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
datetime[ns],f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2021-03-20 00:00:00,77.791667,55.0,98.0,"""22335""",1.5,-5.1,5.1,0.4,0.0,262.0,15.1,44.3,1025.7,0.0
2021-03-21 00:00:00,75.75,50.0,97.0,"""22335""",5.7,1.9,9.7,0.0,0.0,304.0,22.0,63.7,1016.7,330.0
2021-03-22 00:00:00,84.25,58.0,99.0,"""22335""",4.4,-1.5,9.4,0.3,0.0,302.0,14.0,38.2,1021.6,222.0
2021-03-23 00:00:00,84.5,75.0,93.0,"""22335""",5.7,4.1,7.2,0.0,0.0,257.0,11.9,31.3,1023.6,0.0
2021-03-24 00:00:00,73.541667,55.0,91.0,"""22335""",8.6,4.2,13.8,0.0,0.0,234.0,11.5,44.3,1021.3,294.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2022-03-12 00:00:00,42.583333,23.0,57.0,"""18119""",4.2,-1.4,11.4,0.0,0.0,126.0,17.6,43.9,1025.1,636.0
2022-03-13 00:00:00,40.25,26.0,56.0,"""18119""",5.8,-0.3,11.8,0.0,0.0,117.0,16.2,47.2,1022.7,636.0
2022-03-14 00:00:00,49.916667,27.0,93.0,"""18119""",6.1,1.1,11.5,0.0,0.0,144.0,12.2,31.7,1024.6,462.0
2022-03-15 00:00:00,86.208333,64.0,100.0,"""18119""",6.7,2.0,10.6,0.0,0.0,270.0,10.1,26.3,1026.8,150.0


In [57]:
weather_df.write_csv( / "weather_daily.csv")

In [9]:
data_df.with_columns(pl.col("date").dt.date()).join(weather_df.with_columns(pl.col("time").dt.date().alias("date")),
                                                    on=["plz", 'date'], how="left")

id,date,diff,primary_energy,adresse,ort,plz,source,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,date,f64,str,str,str,i64,str,datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-01 00:00:00,85.666667,73.0,96.0,11.8,9.6,15.9,18.3,0.0,201.0,18.7,58.0,1003.3,156.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-02 00:00:00,86.333333,66.0,96.0,12.4,9.3,16.4,11.4,0.0,264.0,15.1,45.7,1013.8,318.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-03 00:00:00,88.333333,74.0,97.0,12.7,8.1,16.2,0.1,0.0,279.0,13.7,45.7,1023.0,138.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-04 00:00:00,83.0,59.0,100.0,12.9,8.8,17.8,0.0,0.0,223.0,11.5,35.3,1019.8,354.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-05 00:00:00,74.833333,68.0,81.0,15.3,12.3,17.6,0.0,0.0,213.0,22.7,58.0,1013.9,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-11 00:00:00,41.666667,26.0,59.0,4.6,-0.3,9.7,0.0,0.0,122.0,30.2,58.0,1023.9,636.0
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-12 00:00:00,34.791667,26.0,44.0,5.8,0.2,11.9,0.0,0.0,132.0,23.4,54.4,1021.4,594.0
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-13 00:00:00,37.458333,27.0,51.0,6.1,0.0,12.0,0.0,0.0,117.0,19.1,38.9,1019.6,654.0
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-14 00:00:00,65.958333,39.0,97.0,6.5,1.3,9.9,1.3,0.0,180.0,11.2,25.9,1023.9,240.0


Get hourly weather data as well

In [58]:
from meteostat import Point, Hourly

weather_dfs = list()
for row in city_df.iter_rows():
    start = row[1]
    end = row[2]
    loc = Point(row[3], row[4])

    data = Hourly(loc, start, end)
    data = data.fetch()
    weather_dfs.append(pl.from_pandas(data.reset_index()).with_columns(pl.lit(row[0]).alias("plz")))
weather_df_hourly = pl.concat(weather_dfs)
weather_df_hourly



time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,plz
datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2021-03-20 00:00:00,-4.6,-7.5,80.0,0.0,0.0,300.0,2.2,4.0,1030.1,0.0,2.0,"""22335"""
2021-03-20 01:00:00,-4.3,-7.2,80.0,0.0,0.0,330.0,1.4,2.0,1030.2,0.0,4.0,"""22335"""
2021-03-20 02:00:00,-4.6,-7.4,81.0,0.0,0.0,300.0,2.2,4.0,1030.1,0.0,4.0,"""22335"""
2021-03-20 03:00:00,-3.7,-5.4,88.0,0.0,0.0,330.0,4.0,8.0,1029.7,0.0,4.0,"""22335"""
2021-03-20 04:00:00,-2.0,-6.0,74.0,0.0,0.0,270.0,2.5,5.0,1029.5,0.0,4.0,"""22335"""
…,…,…,…,…,…,…,…,…,…,…,…,…
2022-03-15 20:00:00,4.4,3.5,94.0,0.0,0.0,10.0,5.4,1.8,1026.0,0.0,5.0,"""18119"""
2022-03-15 21:00:00,3.4,3.1,98.0,0.0,0.0,30.0,6.8,5.0,1025.9,0.0,5.0,"""18119"""
2022-03-15 22:00:00,2.4,2.3,99.0,0.0,0.0,70.0,4.0,9.0,1025.9,0.0,5.0,"""18119"""
2022-03-15 23:00:00,2.4,2.4,100.0,0.0,0.0,50.0,5.8,7.6,1026.3,0.0,5.0,"""18119"""


In [59]:
weather_df_hourly.write_csv(  / "weather_hourly.csv")

## Time Features

School/University Break, Holidays

In [11]:
import holidays

holiday_dict = dict()
ger_holidays = holidays.country_holidays("DE", years=range(2018, 2024))
holiday_dict.update(ger_holidays)
holiday_dict

{datetime.date(2018, 1, 1): "New Year's Day",
 datetime.date(2018, 3, 30): 'Good Friday',
 datetime.date(2018, 4, 2): 'Easter Monday',
 datetime.date(2018, 5, 1): 'Labor Day',
 datetime.date(2018, 5, 10): 'Ascension Day',
 datetime.date(2018, 5, 21): 'Whit Monday',
 datetime.date(2018, 10, 3): 'German Unity Day',
 datetime.date(2018, 12, 25): 'Christmas Day',
 datetime.date(2018, 12, 26): 'Second Day of Christmas',
 datetime.date(2019, 1, 1): "New Year's Day",
 datetime.date(2019, 4, 19): 'Good Friday',
 datetime.date(2019, 4, 22): 'Easter Monday',
 datetime.date(2019, 5, 1): 'Labor Day',
 datetime.date(2019, 5, 30): 'Ascension Day',
 datetime.date(2019, 6, 10): 'Whit Monday',
 datetime.date(2019, 10, 3): 'German Unity Day',
 datetime.date(2019, 12, 25): 'Christmas Day',
 datetime.date(2019, 12, 26): 'Second Day of Christmas',
 datetime.date(2020, 1, 1): "New Year's Day",
 datetime.date(2020, 4, 10): 'Good Friday',
 datetime.date(2020, 4, 13): 'Easter Monday',
 datetime.date(2020, 5, 1

In [12]:
holidays_state_dict = dict()
for state in city_df["state"].unique():
    state_holidays = holidays.country_holidays("DE", subdiv=state, years=range(2018, 2024))
    holidays_state_dict.update({state: state_holidays})

In [28]:
import numpy as np
holiday_list = list()
for state in city_df["state"].unique():
    for date, holiday in holidays_state_dict[state].items():
        holiday_list.append({"state": state, "start": date, "end": "null", "type": holiday})

pl.DataFrame(holiday_list).cast({"end": pl.Date}, strict=False)

state,start,end,type
str,date,date,str
"""BY""",2018-01-01,,"""New Year's Day"""
"""BY""",2018-03-30,,"""Good Friday"""
"""BY""",2018-04-02,,"""Easter Monday"""
"""BY""",2018-05-01,,"""Labor Day"""
"""BY""",2018-05-10,,"""Ascension Day"""
…,…,…,…
"""SH""",2023-05-29,,"""Whit Monday"""
"""SH""",2023-10-03,,"""German Unity Day"""
"""SH""",2023-12-25,,"""Christmas Day"""
"""SH""",2023-12-26,,"""Second Day of Christmas"""


In [30]:
from src.energy_forecast.config import DATA_DIR
df_holidays = pl.read_csv(DATA_DIR / "ferien.csv", separator=";").with_columns(pl.col("start").str.to_date(),
                                                                 pl.col("end").str.to_date())
pl.concat([df_holidays, pl.DataFrame(holiday_list).cast({"end": pl.Date}, strict=False)]).write_csv(RAW_DATA_DIR / "holidays.csv")

In [14]:
data_df.with_columns(pl.col("date").dt.date().alias("date"))

id,date,diff,primary_energy,adresse,ort,plz,source
str,date,f64,str,str,str,i64,str
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""


In [15]:
data_df = data_df.with_columns(pl.col("date").dt.date().alias("date")
                               ).with_columns(
    pl.when(pl.col("date").is_in(set(holiday_dict.keys()))).then(1).otherwise(0).alias("holiday"))
data_df

id,date,diff,primary_energy,adresse,ort,plz,source,holiday
str,date,f64,str,str,str,i64,str,i32
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",1
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
…,…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0


## Building Features

### Legacy Data

In [41]:
leg_data = pl.read_csv(  / "legacy_daily.csv")
leg_data_meta = leg_data.group_by(
    ["id", "ort", "adresse", "plz", "primary_energy", "qmbehfl", "anzlwhg", "co2koeffizient"]).agg().with_columns(pl.lit("Mehrfamilienhaus").alias("typ")).rename({"anzlwhg": "anzahlwhg"})
leg_data_meta = leg_data_meta.rename({"qmbehfl": "heated_area"}).cast(
    {"heated_area": pl.Float64, "plz": pl.String}).with_columns(pl.lit("leg").alias("source"))
leg_data_meta

id,ort,adresse,plz,primary_energy,heated_area,anzahlwhg,co2koeffizient,typ,source
str,str,str,str,str,f64,i64,f64,str,str
"""400308PVG""","""Hamburg""","""Martinistraße 44""","""20251""","""gas""",0.0,0,2.26,"""Mehrfamilienhaus""","""leg"""
"""400690GVG""","""Hamburg""","""Op´n Hainholt 4-18""","""22589""","""gas""",18493.0,290,2.26,"""Mehrfamilienhaus""","""leg"""
"""400356PVG""","""Elmshorn""","""Fehrsstraße 7""","""25336""","""gas""",1215.0,23,2.26,"""Mehrfamilienhaus""","""leg"""
"""400131GVG""","""Lübeck""","""Brandenbaumer Landstraße 177""","""23566""","""gas""",6387.0,120,2.26,"""Mehrfamilienhaus""","""leg"""
"""4008231VG""","""Hamburg""","""Tinnumer Weg 1-9, Morsumer Weg…","""22117""","""gas""",19530.0,262,2.26,"""Mehrfamilienhaus""","""leg"""
…,…,…,…,…,…,…,…,…,…
"""400768GVG""","""Hamburg""","""Schenfelder Holt 135""","""22589""","""gas""",49339.0,697,2.26,"""Mehrfamilienhaus""","""leg"""
"""400287SVG""","""Schwarzenbek""","""Verbrüderungsring 21""","""21493""","""gas""",3393.0,51,2.26,"""Mehrfamilienhaus""","""leg"""
"""400711GVG""","""Hamburg""","""Dahlgrünring 5-9""","""21109""","""gas""",6904.0,96,2.26,"""Mehrfamilienhaus""","""leg"""
"""400067GVG""","""Elmshorn""","""Friedensallee 13""","""25335""","""gas""",6106.0,94,2.26,"""Mehrfamilienhaus""","""leg"""


In [17]:
leg_data_meta.write_csv(  / "legacy_meta.csv")

### Kinergy Data

In [31]:
from src.energy_forecast.config import DATA_DIR
import json

with open(DATA_DIR / "kinergy" / "kinergy_eco_u_list.json", "r", encoding="UTF-8") as f:
    eco_u_data = json.loads(f.read())
item_list = [i for k, i in eco_u_data.items()]
kinergy_meta = pl.from_dicts(item_list).select(
    ["hash", "ort", "name", "plz", "anzahlwhg", "typ", "complexity", "complexity_score", "primary_energy",
     "heated_area", "renewable_energy_used", "has_pwh", "pwh_type", "netz_nummer"])

bem_meta = pl.read_csv(DATA_DIR / "kinergy" / "berlin_fernwärmenetz_info.csv").rename(
    {"Netznummer": "netz_nummer"}).select(["netz_nummer", "min_vorlauf_temp", "max_vorlauf_temp"])
kinergy_meta = kinergy_meta.join(bem_meta, on="netz_nummer", how="left")
kinergy_meta = kinergy_meta.rename({"hash": "id", "name": "adresse"}).with_columns(pl.lit("kin").alias("source"))
kinergy_meta

hash,ort,name,plz,anzahlwhg,typ,complexity,complexity_score,primary_energy,heated_area,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,i64,str,i64,f64,str,f64,bool,bool,str,i64,str,str
"""1 # JHe51""","""Bamberg""","""Hegelstraße 51""","""96052""",134,"""Studentenwohnheim""",0,25.0,"""district heating""",2736.65,false,true,"""central""",,,
"""2 # JMe4""","""Erlangen""","""Mittlere Schulstraße 4""","""91054""",64,"""Studentenwohnheim""",1,34.0,"""gas""",1201.78,false,true,"""central""",,,
"""3 # JOe11""","""Erlangen""","""Otto-Goetze-Straße 11""","""91054""",168,"""Studentenwohnheim""",1,40.5,"""gas""",3141.78,false,true,"""central""",,,
"""4 # JSe21/23""","""Bayreuth""","""Schellingstraße 21/23""","""95447""",60,"""Studentenwohnheim""",0,28.0,"""gas""",2488.92,false,true,"""central""",,,
"""5 # WFe21-25""","""Würzburg""","""Friedrichstraße 21-25""","""97082""",64,"""Mehrfamilienhaus""",1,35.0,"""district heating""",3600.0,true,true,"""decentral""",,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""16 # PRe45""","""Plön""","""Rautenbergstraße 45""","""24306""",0,"""""",0,20.5,"""gas""",,false,true,"""central""",,,
"""17 # SFp36""","""Hamburg""","""Foorthkamp 36""","""22419""",0,"""Schule""",1,35.0,"""gas""",6428.0,false,true,"""central""",,,
"""18 # SGe171""","""Hamburg""","""Gaußstraße 171""","""22765""",0,"""Schule""",1,35.5,"""gas""",6140.0,false,false,,,,
"""19 # SKg63""","""Hamburg""","""Kapellenweg 63""","""21077""",0,"""Schule""",1,42.0,"""gas""",7511.0,false,true,"""central""",,,


In [19]:
kinergy_meta.filter(pl.col("primary_energy") == "district heating")

hash,ort,name,plz,anzahlwhg,typ,complexity,complexity_score,primary_energy,heated_area,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,i64,str,i64,f64,str,f64,bool,bool,str,i64,str,str
"""1 # JHe51""","""Bamberg""","""Hegelstraße 51""","""96052""",134,"""Studentenwohnheim""",0,25.0,"""district heating""",2736.65,False,True,"""central""",,,
"""5 # WFe21-25""","""Würzburg""","""Friedrichstraße 21-25""","""97082""",64,"""Mehrfamilienhaus""",1,35.0,"""district heating""",3600.0,True,True,"""decentral""",,,
"""9 # BMr03""","""Berlin""","""Marzahner Chaussee 231 Sportha…","""12681""",0,"""Schule""",0,27.5,"""district heating""",1141.0,False,True,"""central""",2100.0,""" 80 °C""",""" 135 °C"""
"""10 # BSeH1+3+4""","""Berlin""","""Straßmannstraße 14-16 H1+3+4""","""10249""",24,"""Schule""",0,23.0,"""district heating""",9456.0,False,False,,2600.0,""" 80 °C""",""" 135 °C"""
"""11 # BSeH2""","""Berlin""","""Straßmannstraße 14-16 H2""","""10249""",0,"""Schule""",0,27.0,"""district heating""",2360.0,False,True,"""central""",2600.0,""" 80 °C""",""" 135 °C"""
"""12 # BTr9""","""Berlin""","""Trebbiner Str.9""","""10963""",0,"""Museum""",0,26.0,"""district heating""",6000.0,False,False,,2601.0,""" 80 °C""",""" 135 °C"""


### District Heating Data

In [32]:
dh_meta = pl.read_csv(RAW_DATA_DIR / "district_heating_meta.csv").with_columns(
    pl.lit("Mehrfamilienhaus").alias("typ"),
    pl.lit(75).alias("min_vorlauf_temp"),
    pl.lit(90).alias("max_vorlauf_temp")
).rename({"eco_u_id": "id"}).select(
    ["id", "city", "address", "postal_code", "typ", "min_vorlauf_temp", "max_vorlauf_temp"])
dh_meta = dh_meta.rename({"address": "adresse", "postal_code": "plz", "city": "ort"}).select(
    ["id", "adresse", "ort", "plz", "typ", "min_vorlauf_temp", "max_vorlauf_temp"]).cast(
    {"plz": pl.String, "min_vorlauf_temp": pl.String, "max_vorlauf_temp": pl.String}).with_columns(
    pl.lit("dh").alias("source"),
    pl.lit("district heating").alias("primary_energy")
)
dh_meta

id,adresse,ort,plz,typ,min_vorlauf_temp,max_vorlauf_temp,source,primary_energy
str,str,str,str,str,str,str,str,str
"""8f7b3862-a50d-44eb-8ac9-de0cf4…","""Kielort 20""","""Norderstedt""","""22850""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""42d6efdc-d590-40b7-af9a-90121d…","""Moorbekstraße 19""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""c00c8cba-b6de-4c10-89c0-e92312…","""Moorbekstraße 29""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""7bdbc8ee-00fb-4795-99cb-c1739f…","""Moorbekstraße 31""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""d00d6502-a08d-45df-99e3-7d8cd5…","""Moorbekstraße 17""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
…,…,…,…,…,…,…,…,…
"""82a01deb-7c9d-4e87-a79a-4693bd…","""Waldstraße 81""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""cae17ef4-cfad-4446-8b09-3cf946…","""Hasenstieg 13""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""a9644794-439b-401c-b879-8c0225…","""Kielort 25""","""Norderstedt""","""22850""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""561a9d67-5802-4a54-ae7d-0a7822…","""Segeberger Chaussee 104b""","""Norderstedt""","""22850""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""


### Merge

In [34]:
kinergy_meta.schema

Schema([('id', String),
        ('ort', String),
        ('adresse', String),
        ('plz', String),
        ('anzahlwhg', Int64),
        ('typ', String),
        ('complexity', Int64),
        ('complexity_score', Float64),
        ('primary_energy', String),
        ('heated_area', Float64),
        ('renewable_energy_used', Boolean),
        ('has_pwh', Boolean),
        ('pwh_type', String),
        ('netz_nummer', Int64),
        ('min_vorlauf_temp', String),
        ('max_vorlauf_temp', String),
        ('source', String)])

In [35]:
leg_data_meta.schema

Schema([('id', String),
        ('ort', String),
        ('adresse', String),
        ('plz', String),
        ('primary_energy', String),
        ('heated_area', Float64),
        ('anzlwhg', Int64),
        ('co2koeffizient', Float64),
        ('typ', String),
        ('source', String)])

In [36]:
dh_meta.schema

Schema([('id', String),
        ('adresse', String),
        ('ort', String),
        ('plz', String),
        ('typ', String),
        ('min_vorlauf_temp', String),
        ('max_vorlauf_temp', String),
        ('source', String),
        ('primary_energy', String)])

In [42]:
df_meta = pl.concat([leg_data_meta, kinergy_meta, dh_meta], how="diagonal")
df_meta

id,ort,adresse,plz,primary_energy,heated_area,anzahlwhg,co2koeffizient,typ,source,complexity,complexity_score,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,str,f64,i64,f64,str,str,i64,f64,bool,bool,str,i64,str,str
"""400308PVG""","""Hamburg""","""Martinistraße 44""","""20251""","""gas""",0.0,0,2.26,"""Mehrfamilienhaus""","""leg""",,,,,,,,
"""400690GVG""","""Hamburg""","""Op´n Hainholt 4-18""","""22589""","""gas""",18493.0,290,2.26,"""Mehrfamilienhaus""","""leg""",,,,,,,,
"""400356PVG""","""Elmshorn""","""Fehrsstraße 7""","""25336""","""gas""",1215.0,23,2.26,"""Mehrfamilienhaus""","""leg""",,,,,,,,
"""400131GVG""","""Lübeck""","""Brandenbaumer Landstraße 177""","""23566""","""gas""",6387.0,120,2.26,"""Mehrfamilienhaus""","""leg""",,,,,,,,
"""4008231VG""","""Hamburg""","""Tinnumer Weg 1-9, Morsumer Weg…","""22117""","""gas""",19530.0,262,2.26,"""Mehrfamilienhaus""","""leg""",,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""82a01deb-7c9d-4e87-a79a-4693bd…","""Norderstedt""","""Waldstraße 81""","""22846""","""district heating""",,,,"""Mehrfamilienhaus""","""dh""",,,,,,,"""75""","""90"""
"""cae17ef4-cfad-4446-8b09-3cf946…","""Norderstedt""","""Hasenstieg 13""","""22846""","""district heating""",,,,"""Mehrfamilienhaus""","""dh""",,,,,,,"""75""","""90"""
"""a9644794-439b-401c-b879-8c0225…","""Norderstedt""","""Kielort 25""","""22850""","""district heating""",,,,"""Mehrfamilienhaus""","""dh""",,,,,,,"""75""","""90"""
"""561a9d67-5802-4a54-ae7d-0a7822…","""Norderstedt""","""Segeberger Chaussee 104b""","""22850""","""district heating""",,,,"""Mehrfamilienhaus""","""dh""",,,,,,,"""75""","""90"""


In [43]:
df_meta.describe()

statistic,id,ort,adresse,plz,primary_energy,heated_area,anzahlwhg,co2koeffizient,typ,source,complexity,complexity_score,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,str,str,f64,f64,f64,str,str,f64,f64,f64,f64,str,f64,str,str
"""count""","""139""","""139""","""139""","""139""","""139""",48.0,49.0,29.0,"""139""","""139""",20.0,20.0,20.0,20.0,"""16""",4.0,"""94""","""94"""
"""null_count""","""0""","""0""","""0""","""0""","""0""",91.0,90.0,110.0,"""0""","""0""",119.0,119.0,119.0,119.0,"""123""",135.0,"""45""","""45"""
"""mean""",,,,,,6634.015833,85.979592,2.26,,,0.5,29.3,0.15,0.8,,2475.25,,
"""std""",,,,,,9154.866906,143.185557,4.5195e-16,,,0.606977,10.181769,,,,250.167111,,
"""min""","""0c9ad311-b86f-4371-a695-512ca4…","""Bamberg""","""Alter Sportplatz 1-5""","""10249""","""district heating""",0.0,0.0,2.26,"""""","""dh""",0.0,12.5,0.0,0.0,"""central""",2100.0,""" 80 °C""",""" 135 °C"""
"""25%""",,,,,,1812.0,0.0,2.26,,,0.0,23.0,,,,2600.0,,
"""50%""",,,,,,3141.78,31.0,2.26,,,0.0,28.0,,,,2600.0,,
"""75%""",,,,,,6904.0,96.0,2.26,,,1.0,35.5,,,,2600.0,,
"""max""","""fb684f25-a63d-4d3e-9277-6d759b…","""Würzburg""","""Wilhelmstraße 33-41""","""97084""","""gas""",49339.0,697.0,2.26,"""Studentenwohnheim""","""leg""",2.0,51.5,1.0,1.0,"""decentral""",2601.0,"""75""","""90"""
