# Correlation Analysis

## Load data

In [25]:
import pandas as pd
import polars as pl
from src.energy_forecast.config import PROCESSED_DATA_DIR, RAW_DATA_DIR

df_weather = pl.read_csv(RAW_DATA_DIR / f"weather_daily.csv").with_columns(
    pl.col("time").str.to_datetime().alias("datetime"))
df_holidays = pl.read_csv(RAW_DATA_DIR / "holidays.csv").with_columns(pl.col("start").str.to_date(),
                                                                      pl.col("end").str.to_date(strict=False))
df_cities = pl.read_csv(RAW_DATA_DIR / "cities.csv")

plz,min_date,max_date,lat,lon,state
i64,str,str,f64,f64,str
22848,"""2021-10-29T00:00:00.000000""","""2022-04-01T00:00:00.000000""",53.6736,9.9833,"""SH"""
21493,"""2018-02-24T00:00:00.000000""","""2022-04-04T00:00:00.000000""",53.551392,10.510546,"""SH"""
22523,"""2020-08-19T00:00:00.000000""","""2022-03-16T00:00:00.000000""",53.6079,9.9097,"""HH"""
25336,"""2018-12-14T00:00:00.000000""","""2022-04-04T00:00:00.000000""",53.73565,9.6567,"""SH"""
24118,"""2020-02-15T00:00:00.000000""","""2022-03-16T00:00:00.000000""",54.3334,10.1176,"""SH"""
…,…,…,…,…,…
22081,"""2018-07-12T00:00:00.000000""","""2020-12-29T00:00:00.000000""",53.5758,10.0364,"""HH"""
21033,"""2018-08-08T00:00:00.000000""","""2022-03-23T00:00:00.000000""",53.501725,10.1756,"""HH"""
10249,"""2022-10-19T00:00:00.000000""","""2023-09-13T00:00:00.000000""",52.5238,13.4428,"""BE"""
21149,"""2021-01-28T00:00:00.000000""","""2022-03-18T00:00:00.000000""",53.4667,9.867,"""HH"""


In [22]:
holiday_dict = {"BE": [], "HH": [], "MV": [], "BY": [], "SH": []}
for row in df_holidays.iter_rows():
    if row[1] is not None and row[2] is not None:
        span = pd.date_range(row[1], row[2], freq="D")
        holiday_dict[row[0]].extend(span)
    elif row[1] is not None:
        holiday_dict[row[0]].extend([row[1]])

### District Heating data

In [4]:
df_dh = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").filter(pl.col("source") == "dh")
df_dh = df_dh.with_columns(pl.coalesce(pl.col("datetime").str.to_datetime(strict=False),
                                       pl.col("datetime").str.to_datetime(format="%y-%m-%dT%H:%M:%S#z", strict=False)))
df_meta_dh = pl.read_csv(RAW_DATA_DIR / "dh_meta.csv")
df_dh = df_dh.join(df_meta_dh.rename({"eco_u_id": "id"}), on="id", how="left")
df_dh = df_dh.rename({"postal_code": "plz"}).join(df_weather, on=["datetime", "plz"], how="left")

df_dh

id,datetime,diff,source,data_provider_id,address,city,plz,country,primary_energy,unit_code,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,datetime[μs],f64,str,str,str,str,i64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""9500b2eb-c260-4200-b657-125604…",2022-08-24 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-24T00:00:00.000000000""",63.166667,38.0,86.0,22.3,16.2,29.1,0.0,0.0,109.0,10.8,33.1,1020.5,792.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-25 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-25T00:00:00.000000000""",78.625,48.0,98.0,21.0,14.4,28.3,0.0,0.0,86.0,9.4,31.7,1018.9,462.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-26 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-26T00:00:00.000000000""",86.375,64.0,99.0,21.1,17.0,26.6,23.2,0.0,60.0,6.8,29.9,1014.3,234.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-27 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-27T00:00:00.000000000""",81.291667,54.0,95.0,18.2,12.5,24.2,0.0,0.0,311.0,10.8,28.8,1015.0,294.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-28 00:00:00,9.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-28T00:00:00.000000000""",79.041667,59.0,100.0,15.7,8.7,21.5,0.0,0.0,284.0,10.8,37.4,1018.1,288.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2f025f96-af2c-4140-b955-766a79…",2023-07-16 00:00:00,199.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-16T00:00:00.000000000""",66.666667,49.0,94.0,20.1,16.4,23.7,0.0,0.0,229.0,21.2,58.7,1011.7,492.0
"""2f025f96-af2c-4140-b955-766a79…",2023-07-17 00:00:00,113.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-17T00:00:00.000000000""",75.208333,51.0,96.0,17.0,11.9,21.8,4.0,0.0,236.0,17.3,52.6,1016.5,348.0
"""2f025f96-af2c-4140-b955-766a79…",2023-07-18 00:00:00,306.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-18T00:00:00.000000000""",80.666667,54.0,97.0,16.4,11.6,22.4,1.4,0.0,257.0,11.2,33.8,1018.1,444.0
"""2f025f96-af2c-4140-b955-766a79…",2023-07-19 00:00:00,261.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-19T00:00:00.000000000""",81.333333,65.0,96.0,15.9,12.5,20.0,1.7,0.0,5.0,7.6,27.7,1012.6,168.0


In [29]:
df_dh.join(df_cities.select(["plz", "state"]), on="plz", how="left").with_columns(pl.struct(["state", "datetime"]).map_elements(lambda x: 1 if x["datetime"] in holiday_dict[x["state"]] else 0).alias("holiday"))

  df_dh.join(df_cities.select(["plz", "state"]), on="plz", how="left").with_columns(pl.struct(["state", "datetime"]).map_elements(lambda x: 1 if x["datetime"] in holiday_dict[x["state"]] else 0).alias("holiday"))


id,datetime,diff,source,data_provider_id,address,city,plz,country,primary_energy,unit_code,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,state,holiday
str,datetime[μs],f64,str,str,str,str,i64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64
"""9500b2eb-c260-4200-b657-125604…",2022-08-24 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-24T00:00:00.000000000""",63.166667,38.0,86.0,22.3,16.2,29.1,0.0,0.0,109.0,10.8,33.1,1020.5,792.0,"""SH""",0
"""9500b2eb-c260-4200-b657-125604…",2022-08-25 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-25T00:00:00.000000000""",78.625,48.0,98.0,21.0,14.4,28.3,0.0,0.0,86.0,9.4,31.7,1018.9,462.0,"""SH""",0
"""9500b2eb-c260-4200-b657-125604…",2022-08-26 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-26T00:00:00.000000000""",86.375,64.0,99.0,21.1,17.0,26.6,23.2,0.0,60.0,6.8,29.9,1014.3,234.0,"""SH""",0
"""9500b2eb-c260-4200-b657-125604…",2022-08-27 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-27T00:00:00.000000000""",81.291667,54.0,95.0,18.2,12.5,24.2,0.0,0.0,311.0,10.8,28.8,1015.0,294.0,"""SH""",0
"""9500b2eb-c260-4200-b657-125604…",2022-08-28 00:00:00,9.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-28T00:00:00.000000000""",79.041667,59.0,100.0,15.7,8.7,21.5,0.0,0.0,284.0,10.8,37.4,1018.1,288.0,"""SH""",0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2f025f96-af2c-4140-b955-766a79…",2023-07-16 00:00:00,199.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-16T00:00:00.000000000""",66.666667,49.0,94.0,20.1,16.4,23.7,0.0,0.0,229.0,21.2,58.7,1011.7,492.0,"""SH""",0
"""2f025f96-af2c-4140-b955-766a79…",2023-07-17 00:00:00,113.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-17T00:00:00.000000000""",75.208333,51.0,96.0,17.0,11.9,21.8,4.0,0.0,236.0,17.3,52.6,1016.5,348.0,"""SH""",1
"""2f025f96-af2c-4140-b955-766a79…",2023-07-18 00:00:00,306.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-18T00:00:00.000000000""",80.666667,54.0,97.0,16.4,11.6,22.4,1.4,0.0,257.0,11.2,33.8,1018.1,444.0,"""SH""",1
"""2f025f96-af2c-4140-b955-766a79…",2023-07-19 00:00:00,261.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-19T00:00:00.000000000""",81.333333,65.0,96.0,15.9,12.5,20.0,1.7,0.0,5.0,7.6,27.7,1012.6,168.0,"""SH""",1


### Kinergy Data

In [29]:
df_k = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").filter(pl.col("source") == "kinergy").with_columns(
    pl.col("datetime").str.to_datetime())
df_meta_k = pl.read_csv(RAW_DATA_DIR / "kinergy_meta.csv")
df_k = df_k.join(df_meta_k, on="id", how="left")
df_k = df_k.select(
    ["id", "diff", "anzahlwhg", "complexity", "complexity_score", "primary_energy", "heated_area", "datetime", "plz"])
df_k = df_k.join(df_weather, on=["datetime", "plz"], how="left")
df_k

id,diff,anzahlwhg,complexity,complexity_score,primary_energy,heated_area,datetime,plz,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,f64,i64,i64,f64,str,f64,datetime[μs],i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",1173.0,0,0,27.0,"""district heating""",2360.0,2022-10-19 00:00:00,10249,"""2022-10-19T00:00:00.000000000""",77.75,61.0,88.0,10.6,5.1,14.1,0.0,0.0,319.0,11.2,33.5,1027.4,246.0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",691.0,0,0,27.0,"""district heating""",2360.0,2022-10-20 00:00:00,10249,"""2022-10-20T00:00:00.000000000""",77.5,56.0,94.0,8.6,2.8,14.3,1.3,0.0,116.0,13.7,42.1,1022.2,558.0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",710.0,0,0,27.0,"""district heating""",2360.0,2022-10-21 00:00:00,10249,"""2022-10-21T00:00:00.000000000""",86.791667,69.0,98.0,12.1,8.1,15.6,1.8,0.0,154.0,10.4,24.8,1012.0,90.0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",423.0,0,0,27.0,"""district heating""",2360.0,2022-10-22 00:00:00,10249,"""2022-10-22T00:00:00.000000000""",88.083333,71.0,98.0,15.0,12.6,18.9,0.0,0.0,234.0,8.6,25.2,1013.5,258.0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",363.0,0,0,27.0,"""district heating""",2360.0,2022-10-23 00:00:00,10249,"""2022-10-23T00:00:00.000000000""",83.625,67.0,99.0,14.5,9.0,18.9,2.4,0.0,179.0,9.4,29.2,1015.1,390.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""b12ea64c-04bf-11ec-9d61-02b402…",8.0,0,0,26.0,"""district heating""",6000.0,2023-09-17 00:00:00,10963,"""2023-09-17T00:00:00.000000000""",70.583333,45.0,98.0,20.4,12.5,27.4,0.0,0.0,115.0,5.0,16.2,1015.6,684.0
"""b12ea64c-04bf-11ec-9d61-02b402…",0.0,0,0,26.0,"""district heating""",6000.0,2023-09-18 00:00:00,10963,"""2023-09-18T00:00:00.000000000""",71.708333,52.0,87.0,21.0,16.5,26.3,0.5,0.0,153.0,12.2,33.5,1009.0,264.0
"""b12ea64c-04bf-11ec-9d61-02b402…",0.0,0,0,26.0,"""district heating""",6000.0,2023-09-19 00:00:00,10963,"""2023-09-19T00:00:00.000000000""",61.791667,42.0,87.0,18.4,15.8,21.1,0.0,0.0,232.0,16.2,44.6,1009.9,540.0
"""b12ea64c-04bf-11ec-9d61-02b402…",0.0,0,0,26.0,"""district heating""",6000.0,2023-09-20 00:00:00,10963,"""2023-09-20T00:00:00.000000000""",61.5,47.0,73.0,19.3,14.6,25.0,0.0,0.0,200.0,12.6,38.2,1010.6,510.0


### Legacy Data

In [30]:
df_l = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").filter(pl.col("source") == "legacy").with_columns(
    pl.col("datetime").str.to_datetime())
df_meta_l = pl.read_csv(RAW_DATA_DIR / "legacy_meta.csv")
df_l = df_l.join(df_meta_l, on="id", how="left")
df_l = df_l.select(["id", "datetime", "diff", "qmbehfl", "anzlwhg", "co2koeffizient", "plz", "Type"]).with_columns(
    pl.col("plz").str.strip_chars()).cast({"plz": pl.Int64})
df_l = df_l.join(df_weather, on=["datetime", "plz"], how="left")
df_l

id,datetime,diff,qmbehfl,anzlwhg,co2koeffizient,plz,Type,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,datetime[μs],f64,f64,i64,f64,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""400060pVG""",2017-09-21 00:00:00,339.312,27746.0,386,2.26,22547,10,"""2017-09-21T00:00:00.000000000""",85.75,63.0,100.0,13.8,10.5,18.8,0.0,0.0,243.0,6.1,20.5,1020.8,180.0
"""400060pVG""",2017-09-22 00:00:00,297.063,27746.0,386,2.26,22547,10,"""2017-09-22T00:00:00.000000000""",85.583333,64.0,100.0,13.8,9.0,18.3,0.0,0.0,42.0,5.0,18.4,1021.2,120.0
"""400060pVG""",2017-09-23 00:00:00,306.75,27746.0,386,2.26,22547,10,"""2017-09-23T00:00:00.000000000""",84.291667,61.0,98.0,13.7,9.0,19.4,0.0,0.0,62.0,5.8,20.5,1023.7,402.0
"""400060pVG""",2017-09-24 00:00:00,323.187,27746.0,386,2.26,22547,10,"""2017-09-24T00:00:00.000000000""",92.25,78.0,100.0,14.3,11.3,18.4,0.0,0.0,13.0,8.6,27.7,1024.0,108.0
"""400060pVG""",2017-09-25 00:00:00,276.188,27746.0,386,2.26,22547,10,"""2017-09-25T00:00:00.000000000""",86.833333,71.0,99.0,15.4,13.4,19.3,0.1,0.0,66.0,12.6,36.4,1023.8,120.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""400302GVG""",2021-08-17 00:00:00,29.6,1372.0,24,2.26,23562,10,"""2021-08-17T00:00:00.000000000""",82.833333,71.0,89.0,14.9,12.6,18.5,1.5,0.0,258.0,18.4,49.0,1008.3,186.0
"""400302GVG""",2021-08-18 00:00:00,25.2,1372.0,24,2.26,23562,10,"""2021-08-18T00:00:00.000000000""",79.625,65.0,92.0,15.9,12.7,20.2,1.7,0.0,253.0,15.8,52.2,1009.1,102.0
"""400302GVG""",2021-08-19 00:00:00,27.3,1372.0,24,2.26,23562,10,"""2021-08-19T00:00:00.000000000""",77.208333,64.0,92.0,16.7,13.9,20.6,0.5,0.0,253.0,12.6,39.2,1010.0,156.0
"""400302GVG""",2021-08-20 00:00:00,28.3,1372.0,24,2.26,23562,10,"""2021-08-20T00:00:00.000000000""",83.041667,65.0,95.0,16.0,11.3,21.5,0.1,0.0,234.0,6.1,37.4,1014.8,132.0


### Helper function for DH data

In [31]:
from src.energy_forecast.config import RAW_DATA_DIR

id = "8e9b1544-434e-44a7-8049-8f2e4b14a819"
attributes = ["diff", 'hum_avg',
              'hum_min',
              'hum_max',
              'tavg',
              'tmin',
              'tmax',
              'prcp',
              'snow',
              'wdir',
              'wspd',
              'wpgt',
              'pres',
              'tsun']


def get_df_sensor(df, id, attributes):
    df_sens = df.filter(pl.col("id") == id)
    source = df_sens["source"].unique().item()
    df_meta = pl.read_csv(RAW_DATA_DIR / f"{source}_meta.csv")
    sens_meta = df_meta.filter(pl.col("eco_u_id") == id)
    for col in df_meta.columns:
        df_sens = df_sens.with_columns(pl.lit(sens_meta[col].item()).alias(col))
    df_weather = pl.read_csv(RAW_DATA_DIR / f"weather_daily.csv").with_columns(
        pl.col("time").str.to_datetime().alias("datetime")).rename({"plz": "postal_code"})
    df_sens = df_sens.join(df_weather, on=["datetime", "postal_code"], how="left")
    return df_sens.select(attributes)


df_sens = get_df_sensor(df_dh, id, attributes)
df_sens

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,63.166667,38.0,86.0,22.3,16.2,29.1,0.0,0.0,109.0,10.8,33.1,1020.5,792.0
0.0,81.291667,54.0,95.0,18.2,12.5,24.2,0.0,0.0,311.0,10.8,28.8,1015.0,294.0
1.0,79.041667,59.0,100.0,15.7,8.7,21.5,0.0,0.0,284.0,10.8,37.4,1018.1,288.0
1.0,81.541667,64.0,99.0,16.7,10.4,20.8,0.0,0.0,327.0,6.5,22.7,1021.1,204.0
2.0,74.416667,46.0,95.0,15.2,6.0,21.7,0.0,0.0,31.0,7.2,27.7,1023.7,318.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
36.0,80.375,58.0,100.0,14.4,9.3,19.1,0.0,0.0,284.0,8.2,29.2,1024.3,780.0
36.0,71.25,51.0,89.0,14.2,9.5,19.9,0.0,0.0,15.0,8.3,27.7,1024.6,642.0
39.0,67.541667,40.0,95.0,14.8,9.5,20.6,0.0,0.0,82.0,13.3,38.9,1021.8,648.0
20.0,48.666667,30.0,68.0,18.0,10.5,24.6,0.0,0.0,104.0,14.4,36.7,1015.1,870.0


## Multiple Linear Regression with OLS

In [32]:
from statsmodels.regression.linear_model import RegressionResults
import statsmodels.api as sm


def get_p_vals(df: pl.DataFrame, list_cols: list) -> tuple[RegressionResults, list]:
    attr_list = list(set(list_cols) - {"diff"})
    X = df.select(attr_list).to_numpy()
    y = df.select(pl.col("diff")).to_numpy()
    X2 = sm.add_constant(X)
    est = sm.OLS(y, X2)
    est2 = est.fit()
    return est2, est2.summary2().tables[1]["P>|t|"].tolist()

In [33]:
est, p_vals = get_p_vals(df_sens, attributes)
for attr, p in zip(attributes, p_vals):
    print(f"{attr}: {p}")

diff: 0.052821713110866676
hum_avg: 0.06677598262137802
hum_min: 0.4491381463092624
hum_max: 0.11566754562117387
tavg: 0.00020384876790191566
tmin: 4.3072467486904096e-06
tmax: 0.5135447340075869
prcp: 0.0024896570239718923
snow: 0.6720447847765019
wdir: 0.023529609805249836
wspd: 5.643293725454865e-05
wpgt: 0.46147452862383487
pres: 7.609670463716078e-05
tsun: 0.273691269590197


In [34]:
est.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.878
Model:,OLS,Adj. R-squared:,0.875
Method:,Least Squares,F-statistic:,311.3
Date:,"Wed, 12 Feb 2025",Prob (F-statistic):,2.6500000000000002e-247
Time:,14:52:30,Log-Likelihood:,-3175.8
No. Observations:,578,AIC:,6380.0
Df Residuals:,564,BIC:,6441.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,538.0797,277.295,1.940,0.053,-6.577,1082.736
x1,-0.0391,0.021,-1.837,0.067,-0.081,0.003
x2,-0.2017,0.266,-0.757,0.449,-0.725,0.321
x3,-1.5362,0.975,-1.576,0.116,-3.451,0.379
x4,4.7465,1.270,3.739,0.000,2.253,7.240
x5,0.9853,0.212,4.641,0.000,0.568,1.402
x6,0.5411,0.828,0.654,0.514,-1.085,2.167
x7,-0.1018,0.034,-3.038,0.002,-0.168,-0.036
x8,-1.2981,3.065,-0.424,0.672,-7.318,4.721

0,1,2,3
Omnibus:,32.048,Durbin-Watson:,0.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,79.861
Skew:,-0.251,Prob(JB):,4.55e-18
Kurtosis:,4.75,Cond. No.,120000.0


#### P-Values District Heating Data

In [35]:
p_vals_coll = list()
for id in df_dh["id"].unique():
    df_sens = get_df_sensor(df_dh, id, attributes)
    est, p_vals = get_p_vals(df_sens, attributes)
    p_vals_coll.append(p_vals)

df_p = pl.DataFrame(p_vals_coll, orient="row", schema=attributes).drop_nans()
df_p

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.013008,0.00101,0.253398,0.119598,0.000269,2.5413e-7,0.339485,0.000147,0.417741,0.270263,0.00018,0.402174,0.001777,0.234198
0.841917,0.018608,0.360901,0.255361,0.143436,0.068272,0.010051,0.95646,0.254077,0.103513,0.039955,0.668219,0.86968,0.306326
0.958351,0.980839,0.414092,0.830203,0.001167,0.043726,0.000853,0.032195,0.998534,0.022084,0.020426,0.448619,0.162786,0.003373
0.052822,0.066776,0.449138,0.115668,0.000204,0.000004,0.513545,0.00249,0.672045,0.02353,0.000056,0.461475,0.000076,0.273691
0.086597,0.002991,0.533811,0.22881,0.47528,0.026019,0.251805,0.164016,0.403087,0.350796,0.033844,0.758773,0.122237,0.079662
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.003615,0.18215,0.03175,0.290713,0.027183,0.000099,0.3139,0.501211,0.047342,0.942368,0.692687,0.61597,0.046446,0.063021
0.00059,0.001354,0.023232,0.005348,0.000029,4.4145e-9,0.091468,0.306543,0.381816,0.315373,2.4026e-7,0.299092,0.000106,0.726805
0.8984,0.118351,0.326282,0.302548,0.000004,0.096859,0.204225,0.235927,0.975715,0.050121,0.000082,0.501994,0.002804,0.175473
0.188512,0.223485,0.969678,0.01122,0.000002,6.2207e-10,0.041529,0.995986,0.668175,0.751741,2.7928e-9,0.942532,0.000249,0.691079


In [36]:
df_p.mean()

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.24988,0.163577,0.393904,0.263925,0.118848,0.106232,0.335582,0.303739,0.557596,0.414612,0.077382,0.553081,0.154589,0.339716


### P-Values Kinergy Data

In [37]:
attributes_k_ha = attributes + ["anzahlwhg", "complexity_score", "heated_area"]  # error when including complexity
attributes_k = attributes
df_k_dh = df_k.filter(pl.col("primary_energy") == "district heating")
df_k_dh_ha = df_k.filter(
    (pl.col("primary_energy") == "district heating") & (pl.col("heated_area") != 0) & (pl.col("anzahlwhg") != 0))
df_k_g = df_k.filter(pl.col("primary_energy") == "gas")
df_k_g_ha = df_k.filter((pl.col("primary_energy") == "gas") & (pl.col("heated_area") != 0) & (pl.col("anzahlwhg") != 0))

In [38]:
import statsmodels.api as sm

attr_list = list(set(attributes_k) - {"diff"})
df_s = df_k_dh.filter(pl.col("id") == "aecb8acb-5dfc-47c9-8a44-cbae3ff7d2b3")
X = df_s.select(attr_list).to_numpy()
y = df_s.select(pl.col("diff")).to_numpy()
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
est2.summary()
est2.summary2().tables[1]

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,2632.502511,1942.79782,1.355006,0.176384,-1189.951265,6454.956287
x1,-0.139946,0.109185,-1.28173,0.200877,-0.354769,0.074876
x2,-1.381141,1.867245,-0.739668,0.460051,-5.054946,2.292663
x3,-20.126196,6.818225,-2.951823,0.003396,-33.54105,-6.711342
x4,-10.836318,5.924283,-1.829136,0.068322,-22.492341,0.819705
x5,6.844598,6.741514,1.015291,0.310744,-6.419327,20.108522
x6,9.489939,4.082225,2.324698,0.020721,1.458163,17.521714
x7,-0.278935,0.172205,-1.619783,0.106276,-0.617748,0.059879
x8,5.997347,20.454349,0.293206,0.769557,-34.246574,46.241268
x9,-7.044312,4.473533,-1.574664,0.116335,-15.845986,1.757363


#### P-Values Kinergy District Heating Data

In [39]:
def get_p_vals_coll(df, list_cols):
    p_vals_coll = list()
    for id in df["id"].unique():
        df_sens = df.filter(pl.col("id") == id)
        est, p_vals = get_p_vals(df_sens, list_cols)
        p_vals_coll.append(p_vals)
    df_p = pl.DataFrame(p_vals_coll, orient="row", schema=list_cols).with_columns(
        df["id"].unique().alias("id")).drop_nans()
    return df_p


get_p_vals_coll(df_k_dh, attributes_k)

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.080027,0.295162,0.190462,0.711381,0.03731,0.4174,0.924278,0.000277,0.619107,0.56163,0.118177,0.531345,0.744829,0.050474,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
0.014762,0.060034,0.080475,0.038588,0.838885,0.566682,0.148971,0.418858,0.168028,0.998229,0.000907,0.153211,0.062487,0.154883,"""b12ea64c-04bf-11ec-9d61-02b402…"
0.817444,0.037376,0.662017,0.002576,0.686859,0.206005,0.017007,0.080592,0.479059,0.781337,0.021315,0.336581,0.000223,0.163464,"""9c87eddf-04b5-11ec-9d61-02b402…"
0.517871,0.375553,0.229693,0.187484,0.441216,0.522956,0.482655,0.796733,0.102142,0.991137,0.958061,0.552333,0.013705,0.017224,"""1a9266de-dfff-11eb-9d61-02b402…"
0.176384,0.200877,0.460051,0.003396,0.068322,0.310744,0.020721,0.106276,0.769557,0.116335,0.055936,0.053868,2.5e-05,0.123846,"""841ccf85-04b7-11ec-9d61-02b402…"


In [40]:
get_p_vals_coll(df_k_dh, attributes_k).drop(["id"]).mean()

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.321298,0.1938,0.324539,0.188685,0.414518,0.404757,0.318727,0.280547,0.427579,0.689733,0.230879,0.325468,0.164254,0.101978


Make Regression over all values, because heated_area, number of appartements, ... is dependent on the building

In [41]:
len(attributes_k_ha)

17

In [42]:
est, p_vals = get_p_vals(df_k_dh_ha, attributes_k_ha)
for attr, p in zip(attributes_k_ha, p_vals):
    print(f"{attr}: {p}")

diff: 0.8161475012535392
hum_avg: 6.179604887532946e-07
hum_min: 0.04240522094008995
hum_max: 0.7321743451489311
tavg: 0.6775998502809499
tmin: 0.942627135800772
tmax: 0.07728343376255399
prcp: 0.0043816162997805785
snow: 0.1363268587067111
wdir: 0.003421865385950469
wspd: 0.46554384658891257
wpgt: 0.0031011260587621233
pres: 0.039602841645565816
tsun: 0.8161348831335267
anzahlwhg: 0.8161395878671691
complexity_score: 3.435078717761354e-10
heated_area: 0.11300156289381057


#### P-Values Kinergy Gas Data

In [43]:
att_list = list(set(attributes) - {"snow"})  # 299 nans for this df
get_p_vals_coll(df_k_g, att_list)

tsun,pres,hum_avg,wspd,hum_max,wdir,tmax,prcp,diff,tavg,wpgt,hum_min,tmin,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.494673,0.238245,0.719562,4e-06,0.715277,0.038498,0.455401,0.189158,0.847339,4.5e-05,0.111385,7.915e-10,0.30282,"""83758fff-e013-11eb-9d61-02b402…"
0.337395,0.002383,0.920845,2.3501e-10,0.771465,0.002034,0.086161,0.836248,0.214618,0.003291,0.203041,1.7526e-15,0.04333,"""6abb785c-dfff-11eb-9d61-02b402…"
0.000358,0.20852,0.005573,0.00272,0.541445,0.020976,0.00073,0.076392,0.18811,3.9823e-07,0.188699,0.000176,0.886332,"""578e031d-e014-11eb-9d61-02b402…"
0.006702,0.028785,0.168317,0.010363,4e-06,0.064107,0.144025,0.001085,0.403031,2.6838e-15,0.08606,0.000156,0.974043,"""730285cc-ae67-11eb-9b5e-02b402…"
0.837699,0.025777,0.119249,8.9116e-07,0.033292,0.156332,0.244112,0.388959,0.442913,5.9918e-08,0.00018,1.4523e-10,0.599675,"""bf254b46-e009-11eb-9d61-02b402…"
0.142719,0.023032,0.706631,0.069048,0.002616,0.180882,0.020682,0.11214,0.433039,3.2e-05,0.217088,0.006305,0.045863,"""a6cb1351-e013-11eb-9d61-02b402…"
0.707392,0.163964,0.440557,0.561443,0.178618,0.492595,0.656658,0.049908,0.028508,0.224059,0.436972,0.008886,0.043927,"""f6f7e866-e013-11eb-9d61-02b402…"
0.00619,0.23883,0.143328,0.000127,1.2e-05,0.963693,0.782274,0.295732,0.177074,0.022702,0.197167,3e-06,0.935783,"""3bf6985d-e014-11eb-9d61-02b402…"
0.54582,1e-06,0.270024,7e-06,0.093442,0.229109,0.832573,0.170743,0.535778,3e-06,0.500464,1.6388e-09,0.087791,"""cdd9b0a6-e013-11eb-9d61-02b402…"


In [44]:
att_list = list(set(attributes_k_ha) - {"snow"})  # 299 nans for this df
est, p_vals = get_p_vals(df_k_g_ha, att_list)
for attr, p in zip(att_list, p_vals):
    print(f"{attr}: {p}")

hum_avg: 0.2888805129424477
wspd: 2.7254200304724444e-09
wdir: 2.599050296049422e-06
heated_area: 0.2508484700412895
tmax: 0.24288513480618906
diff: 0.21959325126543025
tavg: 0.000690095783812386
wpgt: 1.0425137055890473e-06
tsun: 0.024559575367370474
pres: 0.8194965037627544
hum_max: 0.02536930460753472
prcp: 0.18572769215529805
anzahlwhg: 1.777615172898653e-251
complexity_score: 1.6799137182997674e-79
hum_min: 4.5620570488053775e-14
tmin: 2.1564835768680937e-06


### P-Values Legacy Data

In [54]:
attributes_l = list(set(attributes) - {"snow", "tsun"}) + ["qmbehfl", "anzlwhg", "co2koeffizient", "plz", "Type"]
attributes_leg = list(set(attributes) - {"snow", "tsun"})  # not available for lot of datapoints
get_p_vals_coll(df_l.filter(~(pl.col("wpgt").is_null())), attributes_leg)  # remove zero values from plz 2700

pres,hum_avg,wspd,hum_max,wdir,tmax,prcp,diff,tavg,wpgt,hum_min,tmin,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
3.3221e-9,0.000009,0.509721,0.006,0.657892,0.122561,0.118396,0.716974,0.000267,0.050913,0.100996,0.833845,"""400302GVG"""
1.5164e-10,0.000017,0.050209,0.000148,0.823069,0.000078,0.17421,0.768165,7.7406e-9,0.60329,9.5249e-9,0.901421,"""400917GVG"""
1.5269e-10,0.000031,0.000003,0.000059,0.362469,1.8570e-10,0.881357,0.067144,8.1048e-14,0.050043,3.3164e-10,0.889818,"""400352PVG"""
4.8795e-12,0.000021,0.00127,1.3464e-9,0.517142,9.1754e-10,0.136867,0.87662,3.0895e-18,0.00098,5.7539e-9,0.40511,"""400087GVG"""
5.0219e-20,1.6965e-11,0.193245,0.308333,0.765659,0.002421,0.360217,0.102578,6.2164e-9,0.055206,0.000447,0.004519,"""400303GVG"""
…,…,…,…,…,…,…,…,…,…,…,…,…
0.000002,0.002393,0.348307,0.000004,0.313456,2.5374e-9,0.115925,0.007553,0.000111,0.031791,0.000026,0.00855,"""400394GVG"""
1.3732e-20,2.8527e-12,0.000287,3.8301e-9,0.058753,2.1566e-11,0.371244,0.068691,1.9560e-16,0.008705,5.1143e-11,0.109497,"""400704GVG"""
0.001157,0.053066,0.051738,0.000039,0.088544,0.019055,0.601237,0.951433,0.000025,0.04678,0.000473,0.971798,"""400974GVG"""
0.002659,0.181191,0.558388,2.2272e-9,0.935647,0.124049,0.998379,0.049065,0.000021,0.011359,0.000852,0.001443,"""400961GVG"""


In [53]:
get_p_vals_coll(df_l.filter(~(pl.col("wpgt").is_null())), attributes_leg).mean()

pres,hum_avg,wspd,hum_max,wdir,tmax,prcp,diff,tavg,wpgt,hum_min,tmin,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.080325,0.115123,0.216467,0.132733,0.386908,0.134058,0.435064,0.416276,0.069271,0.214965,0.143438,0.438078,


In [60]:
est, p_vals = get_p_vals(
    df_l.filter(~(pl.col("wpgt").is_null()) & ~(pl.col("anzlwhg") == 0) & ~(pl.col("qmbehfl") == 0)), attributes_l)
for attr, p in zip(attributes_l, p_vals):
    print(f"{attr}: {p}")

pres: 0.04923068399094744
hum_avg: 1.942242576512059e-29
wspd: 6.205095989318545e-07
hum_max: 2.0140296304891986e-11
wdir: 0.80471984250151
tmax: 2.2698974489023917e-15
prcp: 0.7757454075114637
diff: 1.2962136198960052e-19
tavg: 0.03932461547618528
wpgt: 1.0293135406921679e-10
hum_min: 6.836336774127585e-10
tmin: 0.054623880059085736
qmbehfl: 0.08038280264846288
anzlwhg: 0.0
co2koeffizient: 1.0141678604062627e-05
plz: 0.4905322698044722


### P-Values Gas

In [72]:
attributes_g = ["id", "anzahlwhg", "heated_area"] + attributes
df_gas = pl.concat([df_l.filter(~(pl.col("wpgt").is_null()) & ~(pl.col("anzlwhg") == 0) & ~(pl.col("qmbehfl") == 0)).rename({"anzlwhg": "anzahlwhg", "qmbehfl": "heated_area"}).select(attributes_g),
                    df_k.filter(pl.col("primary_energy") == "gas").select(attributes_g)])
df_gas

id,anzahlwhg,heated_area,diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""400060pVG""",386,27746.0,339.312,85.75,63.0,100.0,13.8,10.5,18.8,0.0,0.0,243.0,6.1,20.5,1020.8,180.0
"""400060pVG""",386,27746.0,297.063,85.583333,64.0,100.0,13.8,9.0,18.3,0.0,0.0,42.0,5.0,18.4,1021.2,120.0
"""400060pVG""",386,27746.0,306.75,84.291667,61.0,98.0,13.7,9.0,19.4,0.0,0.0,62.0,5.8,20.5,1023.7,402.0
"""400060pVG""",386,27746.0,323.187,92.25,78.0,100.0,14.3,11.3,18.4,0.0,0.0,13.0,8.6,27.7,1024.0,108.0
"""400060pVG""",386,27746.0,276.188,86.833333,71.0,99.0,15.4,13.4,19.3,0.1,0.0,66.0,12.6,36.4,1023.8,120.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,485.0,76.875,52.0,96.0,16.1,10.7,21.3,0.0,0.0,343.0,6.8,19.8,1021.3,306.0
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,517.0,71.75,44.0,100.0,16.6,8.1,24.7,0.0,0.0,95.0,10.7,29.5,1019.2,678.0
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,516.0,72.708333,38.0,100.0,18.1,10.0,27.4,0.0,0.0,114.0,7.9,19.8,1016.2,714.0
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,525.0,72.0,45.0,97.0,20.8,13.6,29.1,0.0,0.0,139.0,8.0,25.9,1015.5,606.0


In [81]:
attributes_g = list(set(attributes) - {"id", "snow", "tsun"})
get_p_vals_coll(df_gas, attributes_g).mean()

pres,hum_avg,wspd,hum_max,wdir,tmax,prcp,diff,tavg,wpgt,hum_min,tmin,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.086253,0.116049,0.174714,0.154796,0.353286,0.157854,0.437152,0.390643,0.069791,0.241696,0.089502,0.455341,


In [79]:
est, p_vals = get_p_vals(df_gas, attributes_g)
for attr, p in zip(attributes_g, p_vals):
    print(f"{attr}: {p}")

pres: 0.37044844316458647
hum_avg: 0.019908484892239457
wspd: 0.04721098319906918
hum_max: 0.8599517468382283
wdir: 0.03193196388295678
tmax: 2.5274535607434356e-05
prcp: 0.972868746401084
diff: 0.6274540823625796
tavg: 5.01529203086987e-10
wpgt: 0.0026214349730708424
hum_min: 0.015654039907849398
tmin: 0.00624335378326159


### P-Values District Heating

In [89]:
attributes_d = attributes + ["id"]
df_d = pl.concat([df_k.filter(pl.col("primary_energy") == "district heating").select(attributes_d),
                  df_dh.select(attributes_d)])
df_d

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
1173.0,77.75,61.0,88.0,10.6,5.1,14.1,0.0,0.0,319.0,11.2,33.5,1027.4,246.0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
691.0,77.5,56.0,94.0,8.6,2.8,14.3,1.3,0.0,116.0,13.7,42.1,1022.2,558.0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
710.0,86.791667,69.0,98.0,12.1,8.1,15.6,1.8,0.0,154.0,10.4,24.8,1012.0,90.0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
423.0,88.083333,71.0,98.0,15.0,12.6,18.9,0.0,0.0,234.0,8.6,25.2,1013.5,258.0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
363.0,83.625,67.0,99.0,14.5,9.0,18.9,2.4,0.0,179.0,9.4,29.2,1015.1,390.0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
199.0,66.666667,49.0,94.0,20.1,16.4,23.7,0.0,0.0,229.0,21.2,58.7,1011.7,492.0,"""2f025f96-af2c-4140-b955-766a79…"
113.0,75.208333,51.0,96.0,17.0,11.9,21.8,4.0,0.0,236.0,17.3,52.6,1016.5,348.0,"""2f025f96-af2c-4140-b955-766a79…"
306.0,80.666667,54.0,97.0,16.4,11.6,22.4,1.4,0.0,257.0,11.2,33.8,1018.1,444.0,"""2f025f96-af2c-4140-b955-766a79…"
261.0,81.333333,65.0,96.0,15.9,12.5,20.0,1.7,0.0,5.0,7.6,27.7,1012.6,168.0,"""2f025f96-af2c-4140-b955-766a79…"


In [91]:
attributes_d = list(set(attributes) - {"id", "snow", "tsun"})
get_p_vals_coll(df_d, attributes_d).mean()

pres,hum_avg,wspd,hum_max,wdir,tmax,prcp,diff,tavg,wpgt,hum_min,tmin,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.244988,0.377135,0.384557,0.146868,0.404221,0.262495,0.489546,0.429697,0.103768,0.541228,0.131436,0.334398,


In [92]:
est, p_vals = get_p_vals(df_d, attributes_d)
for attr, p in zip(attributes_d, p_vals):
    print(f"{attr}: {p}")

pres: 3.744843382785507e-06
hum_avg: 0.01679702235427206
wspd: 0.0013242771122761122
hum_max: 1.1637396127152714e-05
wdir: 0.6659364058194841
tmax: 0.020552345753899026
prcp: 0.42891858654303616
diff: 0.7273658935390306
tavg: 1.8407123552980782e-07
wpgt: 0.09005869552499494
hum_min: 1.8627848311103813e-05
tmin: 0.8950897600919501


## Clustering with p-Values

## Principal Component Analysis