# Features

This notebook creates features for the raw data.

## Weather Features

Add features like temperature, humidity, sun hours, ...

Start with the daily data

In [1]:
import holidays.countries
import polars as pl

from src.energy_forecast.config import RAW_DATA_DIR

data_df = pl.read_csv(RAW_DATA_DIR / "daily.csv").with_columns(pl.col("date").str.to_datetime())
data_df

[32m2025-02-03 19:42:22.103[0m | [1mINFO    [0m | [36msrc.energy_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\User\PycharmProjects\energy-forecast-wahl[0m


id,date,diff,primary_energy,adresse,ort,plz,source
str,datetime[μs],f64,str,str,str,i64,str
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01 00:00:00,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02 00:00:00,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03 00:00:00,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04 00:00:00,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05 00:00:00,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11 00:00:00,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-12 00:00:00,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-13 00:00:00,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-14 00:00:00,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""


Find time intervals for every city

In [2]:
city_df = data_df.group_by(pl.col("ort"), pl.col("plz")).agg(pl.col("date").min().alias("min_date"),
                                                             pl.col("date").max().alias("max_date"))
city_df

ort,plz,min_date,max_date
str,i64,datetime[μs],datetime[μs]
"""Hamburg""",22523,2020-06-30 00:00:00,2022-03-16 00:00:00
"""Norderstedt""",22846,2022-08-24 00:00:00,2024-05-14 00:00:00
"""Elmshorn""",25336,2018-12-14 00:00:00,2022-04-04 00:00:00
"""Hamburg""",22399,2020-08-15 00:00:00,2022-03-16 00:00:00
"""Berlin""",10249,2022-10-17 00:00:00,2023-09-13 00:00:00
…,…,…,…
"""Schwarzenbek""",21493,2018-02-24 00:00:00,2022-04-04 00:00:00
"""Hamburg""",21149,2021-01-28 00:00:00,2022-03-18 00:00:00
"""Bargteheide""",22941,2018-02-14 00:00:00,2022-03-16 00:00:00
"""Barmstedt""",25355,2019-08-09 00:00:00,2022-04-04 00:00:00


Add coordinates to every city

In [3]:
import pgeocode

city_df = city_df.with_columns(
    pl.col("plz").map_elements(lambda x: pgeocode.Nominatim("de").query_postal_code(str(x))["latitude"]).alias("lat"),
    pl.col("plz").map_elements(lambda x: pgeocode.Nominatim("de").query_postal_code(str(x))["longitude"]).alias("lon"),
    pl.col("plz").map_elements(lambda x: pgeocode.Nominatim("de").query_postal_code(str(x))["state_code"]).alias(
        "state"),
)
city_df



ort,plz,min_date,max_date,lat,lon,state
str,i64,datetime[μs],datetime[μs],f64,f64,str
"""Hamburg""",22523,2020-06-30 00:00:00,2022-03-16 00:00:00,53.6079,9.9097,"""HH"""
"""Norderstedt""",22846,2022-08-24 00:00:00,2024-05-14 00:00:00,53.7099,9.9946,"""SH"""
"""Elmshorn""",25336,2018-12-14 00:00:00,2022-04-04 00:00:00,53.73565,9.6567,"""SH"""
"""Hamburg""",22399,2020-08-15 00:00:00,2022-03-16 00:00:00,53.663875,10.07025,"""HH"""
"""Berlin""",10249,2022-10-17 00:00:00,2023-09-13 00:00:00,52.5238,13.4428,"""BE"""
…,…,…,…,…,…,…
"""Schwarzenbek""",21493,2018-02-24 00:00:00,2022-04-04 00:00:00,53.551392,10.510546,"""SH"""
"""Hamburg""",21149,2021-01-28 00:00:00,2022-03-18 00:00:00,53.4667,9.867,"""HH"""
"""Bargteheide""",22941,2018-02-14 00:00:00,2022-03-16 00:00:00,53.73095,10.2418,"""SH"""
"""Barmstedt""",25355,2019-08-09 00:00:00,2022-04-04 00:00:00,53.78995,9.765117,"""SH"""


In [4]:
from meteostat import Point, Daily

weather_dfs = list()
for row in city_df.iter_rows():
    start = row[2]
    end = row[3]
    loc = Point(row[4], row[5])

    data = Daily(loc, start, end)
    data = data.fetch()
    weather_dfs.append(pl.from_pandas(data.reset_index()).with_columns(pl.lit(row[1]).alias("plz")))
weather_df = pl.concat(weather_dfs)
weather_df

time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,plz
datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
2020-06-30 00:00:00,15.3,12.3,17.1,1.4,0.0,232.0,23.0,59.8,1006.2,12.0,22523
2020-07-01 00:00:00,16.7,12.4,22.4,9.4,0.0,233.0,14.0,42.5,1005.2,276.0,22523
2020-07-02 00:00:00,15.9,13.3,21.4,13.5,0.0,244.0,9.4,28.4,1009.0,84.0,22523
2020-07-03 00:00:00,16.9,13.0,21.0,0.4,0.0,238.0,14.0,56.9,1014.0,228.0,22523
2020-07-04 00:00:00,16.6,14.4,17.6,5.0,0.0,222.0,21.2,48.6,1011.4,0.0,22523
…,…,…,…,…,…,…,…,…,…,…,…
2023-09-21 00:00:00,20.3,16.8,26.7,1.8,0.0,174.0,14.1,37.4,1000.0,378.0,21077
2023-09-22 00:00:00,15.8,10.4,20.0,2.2,0.0,218.0,12.3,37.1,1000.3,240.0,21077
2023-09-23 00:00:00,13.8,11.1,17.6,0.6,0.0,233.0,13.7,39.2,1010.6,162.0,21077
2023-09-24 00:00:00,13.2,7.7,18.7,0.0,0.0,208.0,8.0,26.6,1024.6,540.0,21077


Merge with data

In [5]:
data_df.with_columns(pl.col("date").dt.date()).join(weather_df.with_columns(pl.col("time").dt.date().alias("date")),
                                                    on=["plz", 'date'], how="left")

id,date,diff,primary_energy,adresse,ort,plz,source,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,date,f64,str,str,str,i64,str,datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-01 00:00:00,11.8,9.6,15.9,18.3,0.0,201.0,18.7,58.0,1003.3,156.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-02 00:00:00,12.4,9.3,16.4,11.4,0.0,264.0,15.1,45.7,1013.8,318.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-03 00:00:00,12.7,8.1,16.2,0.1,0.0,279.0,13.7,45.7,1023.0,138.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-04 00:00:00,12.9,8.8,17.8,0.0,0.0,223.0,11.5,35.3,1019.8,354.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-05 00:00:00,15.3,12.3,17.6,0.0,0.0,213.0,22.7,58.0,1013.9,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-11 00:00:00,4.6,-0.3,9.7,0.0,0.0,122.0,30.2,58.0,1023.9,636.0
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-12 00:00:00,5.8,0.2,11.9,0.0,0.0,132.0,23.4,54.4,1021.4,594.0
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-13 00:00:00,6.1,0.0,12.0,0.0,0.0,117.0,19.1,38.9,1019.6,654.0
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-14 00:00:00,6.5,1.3,9.9,1.3,0.0,180.0,11.2,25.9,1023.9,240.0


From the [meteostat](https://dev.meteostat.net/python/daily.html#api) documentation:

Column	Description	Type

station	The Meteostat ID of the weather station (only if query refers to multiple stations)	String

time	The date	Datetime64

tavg	The average air temperature in °C	Float64

tmin	The minimum air temperature in °C	Float64

tmax	The maximum air temperature in °C	Float64

prcp	The daily precipitation total in mm	Float64

snow	The snow depth in mm	Float64

wdir	The average wind direction in degrees (°)	Float64

wspd	The average wind speed in km/h	Float64

wpgt	The peak wind gust in km/h	Float64

pres	The average sea-level air pressure in hPa	Float64

tsun	The daily sunshine total in minutes (m)	Float64


Humidity is missing from Daily-data, we can retrieve hourly data and merge to daily data

In [6]:
from meteostat import Point, Hourly

weather_dfs = list()
for row in city_df.iter_rows():
    start = row[2]
    end = row[3]
    loc = Point(row[4], row[5])

    data = Hourly(loc, start, end)
    data = data.fetch()
    weather_dfs.append(pl.from_pandas(data.reset_index()).group_by_dynamic(
        index_column="time", every="1d"
    ).agg(pl.col("rhum").mean().alias("hum_avg"),
          pl.col("rhum").min().alias("hum_min"),
          pl.col("rhum").max().alias("hum_max")
          ).with_columns(pl.lit(row[1]).alias("plz")))
weather_df_hourly = pl.concat(weather_dfs)
weather_df_hourly



time,hum_avg,hum_min,hum_max,plz
datetime[ns],f64,f64,f64,i32
2020-06-30 00:00:00,71.166667,56.0,95.0,22523
2020-07-01 00:00:00,82.166667,58.0,98.0,22523
2020-07-02 00:00:00,90.125,67.0,96.0,22523
2020-07-03 00:00:00,77.208333,58.0,96.0,22523
2020-07-04 00:00:00,84.958333,70.0,97.0,22523
…,…,…,…,…
2023-09-21 00:00:00,69.375,44.0,94.0,21077
2023-09-22 00:00:00,79.291667,56.0,95.0,21077
2023-09-23 00:00:00,85.833333,65.0,97.0,21077
2023-09-24 00:00:00,85.5,60.0,99.0,21077


Add to other weather data

In [7]:
weather_df = weather_df_hourly.join(weather_df, on=["plz", 'time'], how="left")
weather_df

time,hum_avg,hum_min,hum_max,plz,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
datetime[ns],f64,f64,f64,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2020-06-30 00:00:00,71.166667,56.0,95.0,22523,15.3,12.3,17.1,1.4,0.0,232.0,23.0,59.8,1006.2,12.0
2020-07-01 00:00:00,82.166667,58.0,98.0,22523,16.7,12.4,22.4,9.4,0.0,233.0,14.0,42.5,1005.2,276.0
2020-07-02 00:00:00,90.125,67.0,96.0,22523,15.9,13.3,21.4,13.5,0.0,244.0,9.4,28.4,1009.0,84.0
2020-07-03 00:00:00,77.208333,58.0,96.0,22523,16.9,13.0,21.0,0.4,0.0,238.0,14.0,56.9,1014.0,228.0
2020-07-04 00:00:00,84.958333,70.0,97.0,22523,16.6,14.4,17.6,5.0,0.0,222.0,21.2,48.6,1011.4,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2023-09-21 00:00:00,69.375,44.0,94.0,21077,20.3,16.8,26.7,1.8,0.0,174.0,14.1,37.4,1000.0,378.0
2023-09-22 00:00:00,79.291667,56.0,95.0,21077,15.8,10.4,20.0,2.2,0.0,218.0,12.3,37.1,1000.3,240.0
2023-09-23 00:00:00,85.833333,65.0,97.0,21077,13.8,11.1,17.6,0.6,0.0,233.0,13.7,39.2,1010.6,162.0
2023-09-24 00:00:00,85.5,60.0,99.0,21077,13.2,7.7,18.7,0.0,0.0,208.0,8.0,26.6,1024.6,540.0


In [8]:
weather_df.write_csv(RAW_DATA_DIR / "weather_daily.csv")

In [9]:
data_df.with_columns(pl.col("date").dt.date()).join(weather_df.with_columns(pl.col("time").dt.date().alias("date")),
                                                    on=["plz", 'date'], how="left")

id,date,diff,primary_energy,adresse,ort,plz,source,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,date,f64,str,str,str,i64,str,datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-01 00:00:00,85.666667,73.0,96.0,11.8,9.6,15.9,18.3,0.0,201.0,18.7,58.0,1003.3,156.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-02 00:00:00,86.333333,66.0,96.0,12.4,9.3,16.4,11.4,0.0,264.0,15.1,45.7,1013.8,318.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-03 00:00:00,88.333333,74.0,97.0,12.7,8.1,16.2,0.1,0.0,279.0,13.7,45.7,1023.0,138.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-04 00:00:00,83.0,59.0,100.0,12.9,8.8,17.8,0.0,0.0,223.0,11.5,35.3,1019.8,354.0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",2022-10-05 00:00:00,74.833333,68.0,81.0,15.3,12.3,17.6,0.0,0.0,213.0,22.7,58.0,1013.9,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-11 00:00:00,41.666667,26.0,59.0,4.6,-0.3,9.7,0.0,0.0,122.0,30.2,58.0,1023.9,636.0
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-12 00:00:00,34.791667,26.0,44.0,5.8,0.2,11.9,0.0,0.0,132.0,23.4,54.4,1021.4,594.0
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-13 00:00:00,37.458333,27.0,51.0,6.1,0.0,12.0,0.0,0.0,117.0,19.1,38.9,1019.6,654.0
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",2022-03-14 00:00:00,65.958333,39.0,97.0,6.5,1.3,9.9,1.3,0.0,180.0,11.2,25.9,1023.9,240.0


Get hourly weather data as well

In [10]:
from meteostat import Point, Hourly

weather_dfs = list()
for row in city_df.iter_rows():
    start = row[2]
    end = row[3]
    loc = Point(row[4], row[5])

    data = Hourly(loc, start, end)
    data = data.fetch()
    weather_dfs.append(pl.from_pandas(data.reset_index()).with_columns(pl.lit(row[1]).alias("plz")))
weather_df_hourly = pl.concat(weather_dfs)
weather_df_hourly



time,temp,dwpt,rhum,prcp,snow,wdir,wspd,wpgt,pres,tsun,coco,plz
datetime[ns],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32
2020-06-30 00:00:00,16.3,7.8,57.0,0.0,0.0,220.0,16.6,28.0,1006.9,0.0,4.0,22523
2020-06-30 01:00:00,16.1,7.6,57.0,0.0,,230.0,18.0,33.0,1006.4,0.0,4.0,22523
2020-06-30 02:00:00,15.6,6.9,56.0,0.0,,230.0,20.5,38.0,1006.1,0.0,4.0,22523
2020-06-30 03:00:00,15.3,6.6,56.0,0.0,,230.0,21.6,36.0,1005.7,0.0,7.0,22523
2020-06-30 04:00:00,14.8,6.9,59.0,0.0,,230.0,23.4,36.0,1005.7,0.0,4.0,22523
…,…,…,…,…,…,…,…,…,…,…,…,…
2023-09-24 20:00:00,11.2,10.4,95.0,0.0,0.0,151.0,9.7,18.4,1026.8,0.0,1.0,21077
2023-09-24 21:00:00,10.3,9.8,97.0,0.0,0.0,153.0,10.1,19.8,1026.5,0.0,1.0,21077
2023-09-24 22:00:00,10.0,9.5,97.0,0.0,0.0,154.0,11.2,22.0,1026.2,0.0,1.0,21077
2023-09-24 23:00:00,12.4,9.9,85.0,0.0,0.0,155.0,11.9,23.8,1025.8,0.0,1.0,21077


In [11]:
weather_df_hourly.write_csv(RAW_DATA_DIR / "weather_hourly.csv")

## Time Features

School/University Break, Holidays

In [12]:
import holidays

holiday_dict = dict()
ger_holidays = holidays.country_holidays("DE", years=range(2018, 2024))
holiday_dict.update(ger_holidays)
holiday_dict

{datetime.date(2018, 1, 1): "New Year's Day",
 datetime.date(2018, 3, 30): 'Good Friday',
 datetime.date(2018, 4, 2): 'Easter Monday',
 datetime.date(2018, 5, 1): 'Labor Day',
 datetime.date(2018, 5, 10): 'Ascension Day',
 datetime.date(2018, 5, 21): 'Whit Monday',
 datetime.date(2018, 10, 3): 'German Unity Day',
 datetime.date(2018, 12, 25): 'Christmas Day',
 datetime.date(2018, 12, 26): 'Second Day of Christmas',
 datetime.date(2019, 1, 1): "New Year's Day",
 datetime.date(2019, 4, 19): 'Good Friday',
 datetime.date(2019, 4, 22): 'Easter Monday',
 datetime.date(2019, 5, 1): 'Labor Day',
 datetime.date(2019, 5, 30): 'Ascension Day',
 datetime.date(2019, 6, 10): 'Whit Monday',
 datetime.date(2019, 10, 3): 'German Unity Day',
 datetime.date(2019, 12, 25): 'Christmas Day',
 datetime.date(2019, 12, 26): 'Second Day of Christmas',
 datetime.date(2020, 1, 1): "New Year's Day",
 datetime.date(2020, 4, 10): 'Good Friday',
 datetime.date(2020, 4, 13): 'Easter Monday',
 datetime.date(2020, 5, 1

In [13]:
for state in city_df["state"].unique():
    state_holidays = holidays.country_holidays("DE", subdiv=state, years=range(2018, 2024))
    holiday_dict.update(state_holidays)
holiday_dict

{datetime.date(2018, 1, 1): "New Year's Day",
 datetime.date(2018, 3, 30): 'Good Friday',
 datetime.date(2018, 4, 2): 'Easter Monday',
 datetime.date(2018, 5, 1): 'Labor Day',
 datetime.date(2018, 5, 10): 'Ascension Day',
 datetime.date(2018, 5, 21): 'Whit Monday',
 datetime.date(2018, 10, 3): 'German Unity Day',
 datetime.date(2018, 12, 25): 'Christmas Day',
 datetime.date(2018, 12, 26): 'Second Day of Christmas',
 datetime.date(2019, 1, 1): "New Year's Day",
 datetime.date(2019, 4, 19): 'Good Friday',
 datetime.date(2019, 4, 22): 'Easter Monday',
 datetime.date(2019, 5, 1): 'Labor Day',
 datetime.date(2019, 5, 30): 'Ascension Day',
 datetime.date(2019, 6, 10): 'Whit Monday',
 datetime.date(2019, 10, 3): 'German Unity Day',
 datetime.date(2019, 12, 25): 'Christmas Day',
 datetime.date(2019, 12, 26): 'Second Day of Christmas',
 datetime.date(2020, 1, 1): "New Year's Day",
 datetime.date(2020, 4, 10): 'Good Friday',
 datetime.date(2020, 4, 13): 'Easter Monday',
 datetime.date(2020, 5, 1

In [14]:
data_df.with_columns(pl.col("date").dt.date().alias("date"))

id,date,diff,primary_energy,adresse,ort,plz,source
str,date,f64,str,str,str,i64,str
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh"""
…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy"""


In [15]:
data_df = data_df.with_columns(pl.col("date").dt.date().alias("date")
                               ).with_columns(
    pl.when(pl.col("date").is_in(set(holiday_dict.keys()))).then(1).otherwise(0).alias("holiday"))
data_df

id,date,diff,primary_energy,adresse,ort,plz,source,holiday
str,date,f64,str,str,str,i64,str,i32
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-01,703.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-02,334.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-03,891.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",1
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-04,661.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
"""0c9ad311-b86f-4371-a695-512ca4…",2022-10-05,499.0,"""district heating""","""Kielortring 14""","""Norderstedt""",22850,"""dh""",0
…,…,…,…,…,…,…,…,…
"""4008231VG""",2022-03-11,1435.1,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0
"""4008231VG""",2022-03-12,1083.3,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0
"""4008231VG""",2022-03-13,1038.4,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0
"""4008231VG""",2022-03-14,996.6,"""gas""","""Tinnumer Weg 1-9, Morsumer Weg…","""Hamburg""",22117,"""legacy""",0


## Building Features

### Legacy Data

In [29]:
leg_data = pl.read_csv(RAW_DATA_DIR / "legacy_daily.csv")
leg_data_meta = leg_data.group_by(
    ["id", "ort", "adresse", "plz", "primary_energy", "qmbehfl", "anzlwhg", "co2koeffizient"]).agg()
leg_data_meta

id,ort,adresse,plz,primary_energy,qmbehfl,anzlwhg,co2koeffizient
str,str,str,i64,str,i64,i64,f64
"""400459GVA""","""Hamburg""","""Sven Hedin Str.11""",22523,"""gas""",15593,240,2.26
"""400711GVG""","""Hamburg""","""Dahlgrünring 5-9""",21109,"""gas""",6904,96,2.26
"""400287SVG""","""Schwarzenbek""","""Verbrüderungsring 21""",21493,"""gas""",3393,51,2.26
"""400259PVG""","""Tangstedt""","""Dorfstraße 122""",22889,"""gas""",1067,18,2.26
"""400328PVG""","""Barmstedt""","""Mühlenweg 46""",25355,"""gas""",2141,36,2.26
…,…,…,…,…,…,…,…
"""400352PVG""","""Hamburg""","""Dammwiesenstraße 1c""",22045,"""gas""",2551,42,2.26
"""400690GVG""","""Hamburg""","""Op´n Hainholt 4-18""",22589,"""gas""",18493,290,2.26
"""400284SVG""","""Schwarzenbek""","""Verbrüderungsring 9""",21493,"""gas""",3684,48,2.26
"""400452GVG""","""Hamburg""","""Heidrehmen 1""",22589,"""gas""",51121,841,2.26


In [25]:
leg_data_meta.write_csv(RAW_DATA_DIR / "legacy_meta.csv")

### Kinergy Data

In [30]:
from src.energy_forecast.config import DATA_DIR
import json

with open(DATA_DIR / "kinergy" / "kinergy_eco_u_list.json", "r", encoding="UTF-8") as f:
    eco_u_data = json.loads(f.read())
item_list = [i for k, i in eco_u_data.items()]
kinergy_meta = pl.from_dicts(item_list).select(
    ["hash", "ort", "name", "plz", "anzahlwhg", "typ", "complexity", "complexity_score", "primary_energy",
     "heated_area", "renewable_energy_used", "has_pwh", "pwh_type", "netz_nummer"])

bem_meta = pl.read_csv(DATA_DIR / "kinergy" / "berlin_fernwärmenetz_info.csv").rename(
    {"Netznummer": "netz_nummer"}).select(["netz_nummer", "min_vorlauf_temp", "max_vorlauf_temp"])
kinergy_meta = kinergy_meta.join(bem_meta, on="netz_nummer", how="left")
kinergy_meta

hash,ort,name,plz,anzahlwhg,typ,complexity,complexity_score,primary_energy,heated_area,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,i64,str,i64,f64,str,f64,bool,bool,str,i64,str,str
"""1 # JHe51""","""Bamberg""","""Hegelstraße 51""","""96052""",134,"""Studentenwohnheim""",0,25.0,"""district heating""",2736.65,false,true,"""central""",,,
"""2 # JMe4""","""Erlangen""","""Mittlere Schulstraße 4""","""91054""",64,"""Studentenwohnheim""",1,34.0,"""gas""",1201.78,false,true,"""central""",,,
"""3 # JOe11""","""Erlangen""","""Otto-Goetze-Straße 11""","""91054""",168,"""Studentenwohnheim""",1,40.5,"""gas""",3141.78,false,true,"""central""",,,
"""4 # JSe21/23""","""Bayreuth""","""Schellingstraße 21/23""","""95447""",60,"""Studentenwohnheim""",0,28.0,"""gas""",2488.92,false,true,"""central""",,,
"""5 # WFe21-25""","""Würzburg""","""Friedrichstraße 21-25""","""97082""",64,"""Mehrfamilienhaus""",1,35.0,"""district heating""",3600.0,true,true,"""decentral""",,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""16 # PRe45""","""Plön""","""Rautenbergstraße 45""","""24306""",0,"""""",0,20.5,"""gas""",,false,true,"""central""",,,
"""17 # SFp36""","""Hamburg""","""Foorthkamp 36""","""22419""",0,"""Schule""",1,35.0,"""gas""",6428.0,false,true,"""central""",,,
"""18 # SGe171""","""Hamburg""","""Gaußstraße 171""","""22765""",0,"""Schule""",1,35.5,"""gas""",6140.0,false,false,,,,
"""19 # SKg63""","""Hamburg""","""Kapellenweg 63""","""21077""",0,"""Schule""",1,42.0,"""gas""",7511.0,false,true,"""central""",,,


In [18]:
kinergy_meta.filter(pl.col("primary_energy") == "district heating")

hash,ort,name,plz,anzahlwhg,typ,complexity,complexity_score,primary_energy,heated_area,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,i64,str,i64,f64,str,f64,bool,bool,str,i64,str,str
"""1 # JHe51""","""Bamberg""","""Hegelstraße 51""","""96052""",134,"""Studentenwohnheim""",0,25.0,"""district heating""",2736.65,False,True,"""central""",,,
"""5 # WFe21-25""","""Würzburg""","""Friedrichstraße 21-25""","""97082""",64,"""Mehrfamilienhaus""",1,35.0,"""district heating""",3600.0,True,True,"""decentral""",,,
"""9 # BMr03""","""Berlin""","""Marzahner Chaussee 231 Sportha…","""12681""",0,"""Schule""",0,27.5,"""district heating""",1141.0,False,True,"""central""",2100.0,""" 80 °C""",""" 135 °C"""
"""10 # BSeH1+3+4""","""Berlin""","""Straßmannstraße 14-16 H1+3+4""","""10249""",24,"""Schule""",0,23.0,"""district heating""",9456.0,False,False,,2600.0,""" 80 °C""",""" 135 °C"""
"""11 # BSeH2""","""Berlin""","""Straßmannstraße 14-16 H2""","""10249""",0,"""Schule""",0,27.0,"""district heating""",2360.0,False,True,"""central""",2600.0,""" 80 °C""",""" 135 °C"""
"""12 # BTr9""","""Berlin""","""Trebbiner Str.9""","""10963""",0,"""Museum""",0,26.0,"""district heating""",6000.0,False,False,,2601.0,""" 80 °C""",""" 135 °C"""


### District Heating Data

In [47]:
dh_meta = pl.read_csv(RAW_DATA_DIR / "district_heating_meta.csv").with_columns(
    pl.lit("Mehrfamilienhaus").alias("typ"),
    pl.lit(75).alias("min_vorlauf_temp"),
    pl.lit(90).alias("max_vorlauf_temp")
).rename({"eco_u_id": "id"}).select(
    ["id", "city", "address", "postal_code", "typ", "min_vorlauf_temp", "max_vorlauf_temp"])
dh_meta = dh_meta.rename({"address": "adresse", "postal_code": "plz", "city": "ort"}).select(
    ["id", "adresse", "ort", "plz", "typ", "min_vorlauf_temp", "max_vorlauf_temp"]).cast(
    {"plz": pl.String, "min_vorlauf_temp": pl.String, "max_vorlauf_temp": pl.String}).with_columns(
    pl.lit("dh").alias("source"),
    pl.lit("district heating").alias("primary_energy")
)
dh_meta

id,adresse,ort,plz,typ,min_vorlauf_temp,max_vorlauf_temp,source,primary_energy
str,str,str,str,str,str,str,str,str
"""0c9ad311-b86f-4371-a695-512ca4…","""Kielortring 14""","""Norderstedt""","""22850""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""10af300b-a270-4e41-928d-e4048b…","""Heidehofweg 120""","""Norderstedt""","""22850""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""1657f5b3-fad0-4685-b56c-d57982…","""Moorbekstraße 15""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""1a94c658-a524-4293-bb95-020c53…","""Röntgengang 14""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""2a5d5a37-a843-4621-b20a-a0c1a5…","""Moorbekstraße 25""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
…,…,…,…,…,…,…,…,…
"""e7ad9b75-bc6c-4891-a8fd-45e393…","""Heidehofweg 122a+b +124""","""Norderstedt""","""22850""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""edcafda6-fe6f-4ca3-bb3d-f0c5fb…","""Kielort 24""","""Norderstedt""","""22850""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""f1c2b8a6-9833-4150-896c-20b054…","""Norderstraße 41""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""
"""fb5cc271-ae15-4f24-b9d5-30782b…","""Moorbekstraße 33""","""Norderstedt""","""22846""","""Mehrfamilienhaus""","""75""","""90""","""dh""","""district heating"""


### Merge

In [32]:
leg_data_meta = leg_data_meta.rename({"qmbehfl": "heated_area"}).cast(
    {"heated_area": pl.Float64, "plz": pl.String}).with_columns(pl.lit("leg").alias("source"))
kinergy_meta = kinergy_meta.rename({"hash": "id", "name": "adresse"}).with_columns(pl.lit("kin").alias("source"))

In [21]:
kinergy_meta.schema

Schema([('id', String),
        ('ort', String),
        ('adresse', String),
        ('plz', String),
        ('anzahlwhg', Int64),
        ('typ', String),
        ('complexity', Int64),
        ('complexity_score', Float64),
        ('primary_energy', String),
        ('heated_area', Float64),
        ('renewable_energy_used', Boolean),
        ('has_pwh', Boolean),
        ('pwh_type', String),
        ('netz_nummer', Int64),
        ('min_vorlauf_temp', String),
        ('max_vorlauf_temp', String),
        ('source', String)])

In [22]:
leg_data_meta.schema

Schema([('id', String),
        ('ort', String),
        ('adresse', String),
        ('plz', String),
        ('primary_energy', String),
        ('heated_area', Float64),
        ('anzlwhg', Int64),
        ('co2koeffizient', Float64),
        ('source', String)])

In [44]:
dh_meta.schema

Schema([('id', String),
        ('adresse', String),
        ('ort', String),
        ('plz', String),
        ('typ', String),
        ('min_vorlauf_temp', String),
        ('max_vorlauf_temp', String),
        ('source', String)])

In [48]:
df_meta = pl.concat([leg_data_meta, kinergy_meta, dh_meta], how="diagonal")
df_meta

id,ort,adresse,plz,primary_energy,heated_area,anzlwhg,co2koeffizient,source,anzahlwhg,typ,complexity,complexity_score,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,str,f64,i64,f64,str,i64,str,i64,f64,bool,bool,str,i64,str,str
"""400459GVA""","""Hamburg""","""Sven Hedin Str.11""","""22523""","""gas""",15593.0,240,2.26,"""leg""",,,,,,,,,,
"""400711GVG""","""Hamburg""","""Dahlgrünring 5-9""","""21109""","""gas""",6904.0,96,2.26,"""leg""",,,,,,,,,,
"""400287SVG""","""Schwarzenbek""","""Verbrüderungsring 21""","""21493""","""gas""",3393.0,51,2.26,"""leg""",,,,,,,,,,
"""400259PVG""","""Tangstedt""","""Dorfstraße 122""","""22889""","""gas""",1067.0,18,2.26,"""leg""",,,,,,,,,,
"""400328PVG""","""Barmstedt""","""Mühlenweg 46""","""25355""","""gas""",2141.0,36,2.26,"""leg""",,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""e7ad9b75-bc6c-4891-a8fd-45e393…","""Norderstedt""","""Heidehofweg 122a+b +124""","""22850""","""district heating""",,,,"""dh""",,"""Mehrfamilienhaus""",,,,,,,"""75""","""90"""
"""edcafda6-fe6f-4ca3-bb3d-f0c5fb…","""Norderstedt""","""Kielort 24""","""22850""","""district heating""",,,,"""dh""",,"""Mehrfamilienhaus""",,,,,,,"""75""","""90"""
"""f1c2b8a6-9833-4150-896c-20b054…","""Norderstedt""","""Norderstraße 41""","""22846""","""district heating""",,,,"""dh""",,"""Mehrfamilienhaus""",,,,,,,"""75""","""90"""
"""fb5cc271-ae15-4f24-b9d5-30782b…","""Norderstedt""","""Moorbekstraße 33""","""22846""","""district heating""",,,,"""dh""",,"""Mehrfamilienhaus""",,,,,,,"""75""","""90"""


In [49]:
df_meta.describe()

statistic,id,ort,adresse,plz,primary_energy,heated_area,anzlwhg,co2koeffizient,source,anzahlwhg,typ,complexity,complexity_score,renewable_energy_used,has_pwh,pwh_type,netz_nummer,min_vorlauf_temp,max_vorlauf_temp
str,str,str,str,str,str,f64,f64,f64,str,f64,str,f64,f64,f64,f64,str,f64,str,str
"""count""","""103""","""103""","""103""","""103""","""103""",55.0,36.0,36.0,"""103""",20.0,"""67""",20.0,20.0,20.0,20.0,"""16""",4.0,"""51""","""51"""
"""null_count""","""0""","""0""","""0""","""0""","""0""",48.0,67.0,67.0,"""0""",83.0,"""36""",83.0,83.0,83.0,83.0,"""87""",99.0,"""52""","""52"""
"""mean""",,,,,,8287.213818,158.666667,2.26,,29.05,,0.5,29.3,0.15,0.8,,2475.25,,
"""std""",,,,,,12004.17402,218.527409,4.5039e-16,,47.841267,,0.606977,10.181769,,,,250.167111,,
"""min""","""0c9ad311-b86f-4371-a695-512ca4…","""Bamberg""","""Alter Sportplatz 1-5""","""10249""","""district heating""",0.0,0.0,2.26,"""dh""",0.0,"""""",0.0,12.5,0.0,0.0,"""central""",2100.0,""" 80 °C""",""" 135 °C"""
"""25%""",,,,,,1812.0,28.0,2.26,,0.0,,0.0,23.0,,,,2600.0,,
"""50%""",,,,,,3393.0,80.0,2.26,,0.0,,0.0,28.0,,,,2600.0,,
"""75%""",,,,,,8125.67,162.0,2.26,,32.0,,1.0,35.5,,,,2600.0,,
"""max""","""fb684f25-a63d-4d3e-9277-6d759b…","""Würzburg""","""Wilhelmstraße 33-41""","""97084""","""gas""",51121.0,841.0,2.26,"""leg""",168.0,"""Studentenwohnheim""",2.0,51.5,1.0,1.0,"""decentral""",2601.0,"""75""","""90"""
