# Correlation Analysis

## Load data

In [1]:
import pandas as pd
import polars as pl
from src.energy_forecast.config import PROCESSED_DATA_DIR, RAW_DATA_DIR

df_weather = pl.read_csv(RAW_DATA_DIR / f"weather_daily.csv").with_columns(
    pl.col("time").str.to_datetime().alias("datetime"))
df_holidays = pl.read_csv(RAW_DATA_DIR / "holidays.csv").with_columns(pl.col("start").str.to_date(),
                                                                      pl.col("end").str.to_date(strict=False))
df_cities = pl.read_csv(RAW_DATA_DIR / "cities.csv")

[32m2025-02-13 08:32:58.471[0m | [1mINFO    [0m | [36msrc.energy_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\User\PycharmProjects\energy-forecast-wahl[0m


In [64]:
holiday_dict = {"BE": [], "HH": [], "MV": [], "BY": [], "SH": []}
for row in df_holidays.iter_rows():
    if row[1] is not None and row[2] is not None:
        span = pd.date_range(row[1], row[2], freq="D")
        holiday_dict[row[0]].extend(span)
    elif row[1] is not None:
        holiday_dict[row[0]].extend([row[1]])

### District Heating data

In [3]:
df_dh = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").filter(pl.col("source") == "dh")
df_dh = df_dh.with_columns(pl.coalesce(pl.col("datetime").str.to_datetime(strict=False),
                                       pl.col("datetime").str.to_datetime(format="%y-%m-%dT%H:%M:%S#z", strict=False)))
df_meta_dh = pl.read_csv(RAW_DATA_DIR / "dh_meta.csv")
df_dh = df_dh.join(df_meta_dh.rename({"eco_u_id": "id"}), on="id", how="left")
df_dh = df_dh.rename({"postal_code": "plz"}).join(df_weather, on=["datetime", "plz"], how="left")
df_dh = df_dh.join(df_cities.select(["plz", "state"]), on="plz", how="left").with_columns(pl.struct(["state", "datetime"]).map_elements(lambda x: 1 if x["datetime"] in holiday_dict[x["state"]] else 0, return_dtype=pl.Int64).alias("holiday"))

df_dh

id,datetime,diff,source,data_provider_id,address,city,plz,country,primary_energy,unit_code,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
str,datetime[μs],f64,str,str,str,str,i64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""9500b2eb-c260-4200-b657-125604…",2022-08-24 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-24T00:00:00.000000000""",63.166667,38.0,86.0,22.3,16.2,29.1,0.0,0.0,109.0,10.8,33.1,1020.5,792.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-25 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-25T00:00:00.000000000""",78.625,48.0,98.0,21.0,14.4,28.3,0.0,0.0,86.0,9.4,31.7,1018.9,462.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-26 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-26T00:00:00.000000000""",86.375,64.0,99.0,21.1,17.0,26.6,23.2,0.0,60.0,6.8,29.9,1014.3,234.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-27 00:00:00,1.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-27T00:00:00.000000000""",81.291667,54.0,95.0,18.2,12.5,24.2,0.0,0.0,311.0,10.8,28.8,1015.0,294.0
"""9500b2eb-c260-4200-b657-125604…",2022-08-28 00:00:00,9.0,"""dh""","""bca40d02phyw""","""Friedrichsgaber Weg 432 A""","""Norderstedt""",22846,"""DE""","""district heating""","""kwh""","""2022-08-28T00:00:00.000000000""",79.041667,59.0,100.0,15.7,8.7,21.5,0.0,0.0,284.0,10.8,37.4,1018.1,288.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2f025f96-af2c-4140-b955-766a79…",2023-07-16 00:00:00,199.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-16T00:00:00.000000000""",66.666667,49.0,94.0,20.1,16.4,23.7,0.0,0.0,229.0,21.2,58.7,1011.7,492.0
"""2f025f96-af2c-4140-b955-766a79…",2023-07-17 00:00:00,113.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-17T00:00:00.000000000""",75.208333,51.0,96.0,17.0,11.9,21.8,4.0,0.0,236.0,17.3,52.6,1016.5,348.0
"""2f025f96-af2c-4140-b955-766a79…",2023-07-18 00:00:00,306.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-18T00:00:00.000000000""",80.666667,54.0,97.0,16.4,11.6,22.4,1.4,0.0,257.0,11.2,33.8,1018.1,444.0
"""2f025f96-af2c-4140-b955-766a79…",2023-07-19 00:00:00,261.0,"""dh""","""adb18a8bt7wz""","""Kielortring 22""","""Norderstedt""",22850,"""DE""","""district heating""","""kwh""","""2023-07-19T00:00:00.000000000""",81.333333,65.0,96.0,15.9,12.5,20.0,1.7,0.0,5.0,7.6,27.7,1012.6,168.0


### Kinergy Data

In [16]:
df_k = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").filter(pl.col("source") == "kinergy").with_columns(
    pl.col("datetime").str.to_datetime())
df_meta_k = pl.read_csv(RAW_DATA_DIR / "kinergy_meta.csv")
df_k = df_k.join(df_meta_k, on="id", how="left")
df_k = df_k.select(
    ["id", "diff", "anzahlwhg", "complexity", "complexity_score", "primary_energy", "heated_area", "datetime", "plz"])
df_k = df_k.join(df_weather, on=["datetime", "plz"], how="left")
df_k = df_k.join(df_cities.select(["plz", "state"]), on="plz", how="left").with_columns(pl.struct(["state", "datetime"]).map_elements(lambda x: 1 if x["datetime"] in holiday_dict[x["state"]] else 0, return_dtype=pl.Int64).alias("holiday"))
df_k

id,diff,anzahlwhg,complexity,complexity_score,primary_energy,heated_area,datetime,plz,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,state,holiday
str,f64,i64,i64,f64,str,f64,datetime[μs],i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",1173.0,0,0,27.0,"""district heating""",2360.0,2022-10-19 00:00:00,10249,"""2022-10-19T00:00:00.000000000""",77.75,61.0,88.0,10.6,5.1,14.1,0.0,0.0,319.0,11.2,33.5,1027.4,246.0,"""BE""",0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",691.0,0,0,27.0,"""district heating""",2360.0,2022-10-20 00:00:00,10249,"""2022-10-20T00:00:00.000000000""",77.5,56.0,94.0,8.6,2.8,14.3,1.3,0.0,116.0,13.7,42.1,1022.2,558.0,"""BE""",0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",710.0,0,0,27.0,"""district heating""",2360.0,2022-10-21 00:00:00,10249,"""2022-10-21T00:00:00.000000000""",86.791667,69.0,98.0,12.1,8.1,15.6,1.8,0.0,154.0,10.4,24.8,1012.0,90.0,"""BE""",0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",423.0,0,0,27.0,"""district heating""",2360.0,2022-10-22 00:00:00,10249,"""2022-10-22T00:00:00.000000000""",88.083333,71.0,98.0,15.0,12.6,18.9,0.0,0.0,234.0,8.6,25.2,1013.5,258.0,"""BE""",0
"""aecb8acb-5dfc-47c9-8a44-cbae3f…",363.0,0,0,27.0,"""district heating""",2360.0,2022-10-23 00:00:00,10249,"""2022-10-23T00:00:00.000000000""",83.625,67.0,99.0,14.5,9.0,18.9,2.4,0.0,179.0,9.4,29.2,1015.1,390.0,"""BE""",0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""b12ea64c-04bf-11ec-9d61-02b402…",8.0,0,0,26.0,"""district heating""",6000.0,2023-09-17 00:00:00,10963,"""2023-09-17T00:00:00.000000000""",70.583333,45.0,98.0,20.4,12.5,27.4,0.0,0.0,115.0,5.0,16.2,1015.6,684.0,"""BE""",1
"""b12ea64c-04bf-11ec-9d61-02b402…",0.0,0,0,26.0,"""district heating""",6000.0,2023-09-18 00:00:00,10963,"""2023-09-18T00:00:00.000000000""",71.708333,52.0,87.0,21.0,16.5,26.3,0.5,0.0,153.0,12.2,33.5,1009.0,264.0,"""BE""",1
"""b12ea64c-04bf-11ec-9d61-02b402…",0.0,0,0,26.0,"""district heating""",6000.0,2023-09-19 00:00:00,10963,"""2023-09-19T00:00:00.000000000""",61.791667,42.0,87.0,18.4,15.8,21.1,0.0,0.0,232.0,16.2,44.6,1009.9,540.0,"""BE""",1
"""b12ea64c-04bf-11ec-9d61-02b402…",0.0,0,0,26.0,"""district heating""",6000.0,2023-09-20 00:00:00,10963,"""2023-09-20T00:00:00.000000000""",61.5,47.0,73.0,19.3,14.6,25.0,0.0,0.0,200.0,12.6,38.2,1010.6,510.0,"""BE""",1


### Legacy Data

In [70]:
df_l = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").filter(pl.col("source") == "legacy").with_columns(
    pl.col("datetime").str.to_datetime())
df_meta_l = pl.read_csv(RAW_DATA_DIR / "legacy_meta.csv")
df_l = df_l.join(df_meta_l, on="id", how="left")
df_l = df_l.select(["id", "datetime", "diff", "qmbehfl", "anzlwhg", "co2koeffizient", "plz", "Type"]).with_columns(
    pl.col("plz").str.strip_chars()).cast({"plz": pl.Int64})
df_l = df_l.join(df_weather, on=["datetime", "plz"], how="left")
df_l = df_l.join(df_cities.select(["plz", "state"]), on="plz", how="left").filter(~(pl.col("plz") == 2700)).with_columns(pl.struct(["state", "datetime"]).map_elements(lambda x: 1 if x["datetime"] in holiday_dict[x["state"]] else 0, return_dtype=pl.Int64).alias("holiday"))
df_l

id,datetime,diff,qmbehfl,anzlwhg,co2koeffizient,plz,Type,time,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,state,holiday
str,datetime[μs],f64,f64,i64,f64,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64
"""400060pVG""",2017-09-21 00:00:00,339.312,27746.0,386,2.26,22547,10,"""2017-09-21T00:00:00.000000000""",85.75,63.0,100.0,13.8,10.5,18.8,0.0,0.0,243.0,6.1,20.5,1020.8,180.0,"""HH""",0
"""400060pVG""",2017-09-22 00:00:00,297.063,27746.0,386,2.26,22547,10,"""2017-09-22T00:00:00.000000000""",85.583333,64.0,100.0,13.8,9.0,18.3,0.0,0.0,42.0,5.0,18.4,1021.2,120.0,"""HH""",0
"""400060pVG""",2017-09-23 00:00:00,306.75,27746.0,386,2.26,22547,10,"""2017-09-23T00:00:00.000000000""",84.291667,61.0,98.0,13.7,9.0,19.4,0.0,0.0,62.0,5.8,20.5,1023.7,402.0,"""HH""",0
"""400060pVG""",2017-09-24 00:00:00,323.187,27746.0,386,2.26,22547,10,"""2017-09-24T00:00:00.000000000""",92.25,78.0,100.0,14.3,11.3,18.4,0.0,0.0,13.0,8.6,27.7,1024.0,108.0,"""HH""",0
"""400060pVG""",2017-09-25 00:00:00,276.188,27746.0,386,2.26,22547,10,"""2017-09-25T00:00:00.000000000""",86.833333,71.0,99.0,15.4,13.4,19.3,0.1,0.0,66.0,12.6,36.4,1023.8,120.0,"""HH""",0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""400302GVG""",2021-08-17 00:00:00,29.6,1372.0,24,2.26,23562,10,"""2021-08-17T00:00:00.000000000""",82.833333,71.0,89.0,14.9,12.6,18.5,1.5,0.0,258.0,18.4,49.0,1008.3,186.0,"""SH""",0
"""400302GVG""",2021-08-18 00:00:00,25.2,1372.0,24,2.26,23562,10,"""2021-08-18T00:00:00.000000000""",79.625,65.0,92.0,15.9,12.7,20.2,1.7,0.0,253.0,15.8,52.2,1009.1,102.0,"""SH""",0
"""400302GVG""",2021-08-19 00:00:00,27.3,1372.0,24,2.26,23562,10,"""2021-08-19T00:00:00.000000000""",77.208333,64.0,92.0,16.7,13.9,20.6,0.5,0.0,253.0,12.6,39.2,1010.0,156.0,"""SH""",0
"""400302GVG""",2021-08-20 00:00:00,28.3,1372.0,24,2.26,23562,10,"""2021-08-20T00:00:00.000000000""",83.041667,65.0,95.0,16.0,11.3,21.5,0.1,0.0,234.0,6.1,37.4,1014.8,132.0,"""SH""",0


### Helper function for DH data

In [8]:
from src.energy_forecast.config import RAW_DATA_DIR

id = "8e9b1544-434e-44a7-8049-8f2e4b14a819"
attributes = ["diff", 'hum_avg',
              'hum_min',
              'hum_max',
              'tavg',
              'tmin',
              'tmax',
              'prcp',
              'snow',
              'wdir',
              'wspd',
              'wpgt',
              'pres',
              'tsun',
              "holiday"]


def get_df_sensor(df, id, attributes):
    df_sens = df.filter(pl.col("id") == id)
    source = df_sens["source"].unique().item()
    df_meta = pl.read_csv(RAW_DATA_DIR / f"{source}_meta.csv")
    sens_meta = df_meta.filter(pl.col("eco_u_id") == id)
    for col in df_meta.columns:
        df_sens = df_sens.with_columns(pl.lit(sens_meta[col].item()).alias(col))
    df_weather = pl.read_csv(RAW_DATA_DIR / f"weather_daily.csv").with_columns(
        pl.col("time").str.to_datetime().alias("datetime")).rename({"plz": "postal_code"})
    df_sens = df_sens.join(df_weather, on=["datetime", "postal_code"], how="left")
    return df_sens.select(attributes)


df_sens = get_df_sensor(df_dh, id, attributes)
df_sens

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
0.0,63.166667,38.0,86.0,22.3,16.2,29.1,0.0,0.0,109.0,10.8,33.1,1020.5,792.0,0
0.0,81.291667,54.0,95.0,18.2,12.5,24.2,0.0,0.0,311.0,10.8,28.8,1015.0,294.0,0
1.0,79.041667,59.0,100.0,15.7,8.7,21.5,0.0,0.0,284.0,10.8,37.4,1018.1,288.0,0
1.0,81.541667,64.0,99.0,16.7,10.4,20.8,0.0,0.0,327.0,6.5,22.7,1021.1,204.0,0
2.0,74.416667,46.0,95.0,15.2,6.0,21.7,0.0,0.0,31.0,7.2,27.7,1023.7,318.0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
36.0,80.375,58.0,100.0,14.4,9.3,19.1,0.0,0.0,284.0,8.2,29.2,1024.3,780.0,0
36.0,71.25,51.0,89.0,14.2,9.5,19.9,0.0,0.0,15.0,8.3,27.7,1024.6,642.0,0
39.0,67.541667,40.0,95.0,14.8,9.5,20.6,0.0,0.0,82.0,13.3,38.9,1021.8,648.0,0
20.0,48.666667,30.0,68.0,18.0,10.5,24.6,0.0,0.0,104.0,14.4,36.7,1015.1,870.0,0


## Multiple Linear Regression with OLS

In [9]:
from statsmodels.regression.linear_model import RegressionResults
import statsmodels.api as sm


def get_p_vals(df: pl.DataFrame, list_cols: list) -> tuple[RegressionResults, list]:
    attr_list = list(set(list_cols) - {"diff"})
    X = df.select(attr_list).to_numpy()
    y = df.select(pl.col("diff")).to_numpy()
    X2 = sm.add_constant(X)
    est = sm.OLS(y, X2)
    est2 = est.fit()
    return est2, est2.summary2().tables[1]["P>|t|"].tolist()

In [10]:
est, p_vals = get_p_vals(df_sens, attributes)
for attr, p in zip(attributes, p_vals):
    print(f"{attr}: {p}")

diff: 0.02261194297627369
hum_avg: 9.042241369502261e-05
hum_min: 0.23761756663957967
hum_max: 0.3391903034126412
tavg: 0.011799885415282756
tmin: 0.006905191317014539
tmax: 0.04946856508487702
prcp: 4.189244424627099e-06
snow: 2.260100368739058e-10
wdir: 1.6864859990203824e-06
wspd: 0.6471866086472776
wpgt: 0.0017209030698372135
pres: 0.6965508945417904
tsun: 0.9296261264508241
holiday: 0.7410103588470403


In [11]:
est.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.886
Model:,OLS,Adj. R-squared:,0.883
Method:,Least Squares,F-statistic:,312.9
Date:,"Thu, 13 Feb 2025",Prob (F-statistic):,9.11e-255
Time:,08:35:44,Log-Likelihood:,-3155.1
No. Observations:,578,AIC:,6340.0
Df Residuals:,563,BIC:,6406.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,612.8023,268.042,2.286,0.023,86.318,1139.286
x1,4.8352,1.226,3.944,0.000,2.427,7.244
x2,-0.4906,0.415,-1.182,0.238,-1.306,0.324
x3,-0.2461,0.257,-0.957,0.339,-0.751,0.259
x4,1.3917,0.551,2.526,0.012,0.310,2.474
x5,-0.0563,0.021,-2.711,0.007,-0.097,-0.016
x6,-1.2882,0.654,-1.969,0.049,-2.573,-0.003
x7,0.9530,0.205,4.647,0.000,0.550,1.356
x8,-35.5801,5.507,-6.460,0.000,-46.398,-24.763

0,1,2,3
Omnibus:,42.215,Durbin-Watson:,0.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,115.267
Skew:,-0.337,Prob(JB):,9.34e-26
Kurtosis:,5.082,Cond. No.,120000.0


#### P-Values District Heating Data

In [14]:
p_vals_coll = list()
for id in df_dh["id"].unique():
    df_sens = get_df_sensor(df_dh, id, attributes)
    est, p_vals = get_p_vals(df_sens, attributes)
    p_vals_coll.append(p_vals + [id])

df_p = pl.DataFrame(p_vals_coll, orient="row", schema=attributes + ["id"]).drop_nans()
df_p

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.092457,0.000412,0.670175,0.53524,0.000844,0.004765,0.641531,0.000874,0.079045,0.000031,0.020841,0.118554,0.133532,0.355233,0.308661,"""561a9d67-5802-4a54-ae7d-0a7822…"
0.099922,0.049025,0.603452,0.217001,0.491219,0.871047,0.136465,1.4137e-8,0.017459,0.095249,0.318375,0.592757,0.18603,0.159412,0.777366,"""5e2fd59d-603a-488b-a525-513541…"
0.78683,0.158148,0.798381,0.223944,0.391393,0.057808,0.137442,0.001152,0.00007,0.091141,0.579541,0.28334,0.759103,0.774762,0.7546,"""d566a120-d232-489a-aa42-850e5a…"
0.020963,0.004585,0.649284,0.406523,0.036495,0.738109,0.801557,0.918125,0.631008,2.4110e-10,0.136611,0.099981,0.003566,0.039302,0.319913,"""4edd9f9b-22a0-4932-a815-60c9a2…"
0.082369,0.055544,0.532264,0.656088,0.320675,0.004043,0.233237,7.8295e-8,0.048344,0.000014,0.958593,0.091788,0.373821,0.546232,0.685274,"""42d6efdc-d590-40b7-af9a-90121d…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.846425,0.000007,0.50898,0.343191,0.014174,0.048819,0.055209,0.102284,0.003452,0.00003,0.229959,0.243071,0.455999,0.230381,0.761439,"""e7ad9b75-bc6c-4891-a8fd-45e393…"
0.009147,0.000095,0.437919,0.290724,0.011738,0.000033,0.290316,0.000033,0.000001,0.000002,0.2471,0.000041,0.120184,0.35381,0.984271,"""9500b2eb-c260-4200-b657-125604…"
0.789016,0.036394,0.780686,0.54169,0.260273,0.11702,0.558304,0.222143,0.217529,0.00044,0.65171,0.007611,0.541718,0.144191,0.398876,"""edcafda6-fe6f-4ca3-bb3d-f0c5fb…"
0.00001,0.014965,0.046803,0.000023,0.709383,0.165613,0.548286,3.0966e-19,0.006722,0.666315,0.421268,0.593227,0.816549,0.764898,0.824566,"""bc098a2e-0cc7-4f01-b6ad-9d647a…"


In [15]:
df_p.mean()

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.236874,0.131071,0.518564,0.373669,0.178294,0.143993,0.437736,0.109327,0.135338,0.063305,0.386317,0.289511,0.333919,0.390103,0.552212,


### P-Values Kinergy Data

In [23]:
attributes_k_ha = attributes + ["anzahlwhg", "complexity_score", "heated_area"]  # error when including complexity
attributes_k = attributes
df_k_dh = df_k.filter(pl.col("primary_energy") == "district heating")
df_k_dh_ha = df_k.filter(
    (pl.col("primary_energy") == "district heating") & (pl.col("heated_area") != 0) & (pl.col("anzahlwhg") != 0))
df_k_g = df_k.filter(pl.col("primary_energy") == "gas")
df_k_g_ha = df_k.filter((pl.col("primary_energy") == "gas") & (pl.col("heated_area") != 0) & (pl.col("anzahlwhg") != 0))

In [18]:
import statsmodels.api as sm

attr_list = list(set(attributes_k) - {"diff"})
df_s = df_k_dh.filter(pl.col("id") == "aecb8acb-5dfc-47c9-8a44-cbae3ff7d2b3")
X = df_s.select(attr_list).to_numpy()
y = df_s.select(pl.col("diff")).to_numpy()
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
est2.summary()
est2.summary2().tables[1]

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,2178.370051,1975.308923,1.1028,0.270956,-1708.096738,6064.836839
x1,-10.312391,5.934246,-1.737776,0.083228,-21.988159,1.363377
x2,4.247283,2.208188,1.923425,0.055328,-0.097379,8.591945
x3,-0.9352,1.89992,-0.492232,0.622899,-4.673337,2.802936
x4,18.092333,4.135558,4.374822,1.7e-05,9.955525,26.229142
x5,-0.129445,0.10942,-1.183012,0.237696,-0.344731,0.085841
x6,-7.14276,4.470412,-1.597786,0.111093,-15.938401,1.652881
x7,6.728338,6.736401,0.998803,0.318657,-6.52569,19.982365
x8,41.959304,33.800814,1.24137,0.215393,-24.544593,108.463202
x9,-57.439327,31.339054,-1.832835,0.067771,-119.099654,4.220999


#### P-Values Kinergy District Heating Data

In [24]:
def get_p_vals_coll(df, list_cols):
    p_vals_coll = list()
    for id in df["id"].unique():
        df_sens = df.filter(pl.col("id") == id)
        est, p_vals = get_p_vals(df_sens, list_cols)
        p_vals_coll.append(p_vals)
    df_p = pl.DataFrame(p_vals_coll, orient="row", schema=list_cols).with_columns(
        df["id"].unique().alias("id")).drop_nans()
    return df_p


get_p_vals_coll(df_k_dh, attributes_k)

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.767804,0.837987,0.345783,0.332565,7.1e-05,0.06034,0.740164,0.215404,0.00448,0.032633,0.084798,0.054674,0.001724,0.025208,0.541954,"""1a9266de-dfff-11eb-9d61-02b402…"
0.025035,0.541376,0.08165,0.12813,0.05213,0.035542,0.998977,0.53318,0.021248,0.0019,0.165926,0.293394,0.013556,0.093237,0.276387,"""b12ea64c-04bf-11ec-9d61-02b402…"
0.270956,0.083228,0.055328,0.622899,1.7e-05,0.237696,0.111093,0.318657,0.215393,0.067771,0.094587,0.092612,0.003001,0.025435,0.805179,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
0.031821,0.093803,0.630956,0.083701,0.740908,0.390979,0.400917,0.343573,0.073228,0.073185,0.08759,0.000391,0.629461,0.940799,0.51128,"""9c87eddf-04b5-11ec-9d61-02b402…"
0.425704,0.408405,0.550223,0.185332,0.013355,0.391294,0.975018,0.602911,0.543892,0.869952,0.015496,0.798013,0.185778,0.480369,0.086411,"""841ccf85-04b7-11ec-9d61-02b402…"


In [25]:
get_p_vals_coll(df_k_dh, attributes_k).drop(["id"]).mean()

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.304264,0.39296,0.332788,0.270525,0.161296,0.22317,0.645234,0.402745,0.171648,0.209088,0.089679,0.247817,0.166704,0.31301,0.444242


Make Regression over all values, because heated_area, number of appartements, ... is dependent on the building

In [26]:
est, p_vals = get_p_vals(df_k_dh_ha, attributes_k_ha)
for attr, p in zip(attributes_k_ha, p_vals):
    print(f"{attr}: {p}")

diff: 0.7043057435147397
hum_avg: 0.9355848402702175
hum_min: 0.0023616692436663335
hum_max: 0.7043170676475878
tavg: 1.2331547357561974e-05
tmin: 0.09088553300604636
tmax: 0.05105523510928836
prcp: 0.5059658930022979
snow: 0.21956975221751251
wdir: 0.0045651224946168705
wspd: 6.207506133750417e-09
wpgt: 7.424150196262206e-05
pres: 0.052750745010920384
tsun: 0.7106347088232294
holiday: 0.7043128453711209
anzahlwhg: 0.13369168172564966
complexity_score: 0.827720592751114
heated_area: 0.005188588368316938


#### P-Values Kinergy Gas Data

In [60]:
att_list = list(set(attributes) - {"snow"})  # 299 nans for this df
get_p_vals_coll(df_k_g, att_list)

wspd,diff,wpgt,pres,hum_min,tsun,prcp,holiday,tavg,tmin,wdir,hum_avg,hum_max,tmax,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.960051,0.189534,0.000741,0.150398,6e-06,0.063459,0.808465,3.4219e-16,1.2164e-09,0.412954,0.438615,0.000934,0.288205,0.420134,"""a6cb1351-e013-11eb-9d61-02b402…"
0.517011,0.076082,0.738871,0.284689,5.8487e-08,5.78e-07,0.663006,0.002336,5.5348e-07,0.121813,0.797584,4.8e-05,0.282787,0.02411,"""f6f7e866-e013-11eb-9d61-02b402…"
0.008699,0.685779,0.138788,0.207389,0.002338,0.522299,0.573541,3.7044e-12,0.001113,0.35467,0.388802,0.00637,0.773181,0.000254,"""bf254b46-e009-11eb-9d61-02b402…"
0.164145,0.105433,0.323328,0.759642,0.004524,0.022009,0.514128,0.147134,2.6e-05,0.053026,0.016623,0.032446,0.110053,0.005868,"""578e031d-e014-11eb-9d61-02b402…"
0.68605,0.044093,0.440637,0.428669,0.008507,0.210257,0.032677,0.440542,0.231124,0.053084,0.815404,0.437745,0.482509,0.19482,"""3bf6985d-e014-11eb-9d61-02b402…"
0.000454,0.07704,0.177259,0.006438,0.000168,0.215317,0.196114,0.54941,3.8182e-07,0.856483,0.000651,0.00233,0.018243,0.559397,"""6abb785c-dfff-11eb-9d61-02b402…"
0.003402,0.00077,0.091721,0.124575,0.000373,0.021822,0.45828,0.005794,2.8701e-15,0.806373,0.139313,0.025217,0.145083,7e-06,"""cdd9b0a6-e013-11eb-9d61-02b402…"
0.301085,0.688234,0.481667,0.981744,1.2625e-12,0.001236,0.293213,3e-06,0.000469,0.078645,0.062578,1.5704e-08,0.003475,0.205495,"""83758fff-e013-11eb-9d61-02b402…"
0.546038,0.078207,0.297042,0.699011,2.373e-08,0.220761,0.778113,2e-06,8e-06,0.409502,0.333944,6.5e-05,0.036429,0.557401,"""730285cc-ae67-11eb-9b5e-02b402…"


In [61]:
get_p_vals_coll(df_k_g, att_list).mean()

wspd,diff,wpgt,pres,hum_min,tsun,prcp,holiday,tavg,tmin,wdir,hum_avg,hum_max,tmax,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.354104,0.21613,0.298895,0.404728,0.001768,0.141907,0.479726,0.127247,0.02586,0.349617,0.332613,0.056128,0.237774,0.21861,


P-Values for every building with heated area info (alltogether because heated area is the same when calculating for each building individually)

In [59]:
att_list = list(set(attributes_k_ha) - {"snow"})  # 299 nans for this df
est, p_vals = get_p_vals(df_k_g_ha, att_list)
for attr, p in zip(att_list, p_vals):
    print(f"{attr}: {p}")

pres: 0.265335270205267
tavg: 0.769245163804587
anzahlwhg: 1.6978130556953993e-05
hum_avg: 8.86413977422949e-255
wspd: 8.410012036869176e-06
tmax: 0.6815047269036455
heated_area: 0.0018580859580536512
diff: 0.29938893692865476
wpgt: 3.36955800554553e-05
tsun: 0.014424463130344055
hum_min: 6.076082098489003e-09
holiday: 7.215862518853405e-19
prcp: 0.393169192161434
complexity_score: 3.527847975320709e-81
tmin: 5.1469189516936444e-05
wdir: 0.2729703681856988
hum_max: 0.07558158527710514


### P-Values Legacy Data

In [71]:
attributes_l = list(set(attributes) - {"snow", "tsun"}) + ["qmbehfl", "anzlwhg", "co2koeffizient", "plz", "Type"]
attributes_leg = list(set(attributes) - {"snow", "tsun"})  # not available for lot of datapoints
get_p_vals_coll(df_l.filter(~(pl.col("wpgt").is_null())), attributes_leg)  # remove zero values from plz 2700

wspd,diff,wpgt,pres,hum_min,prcp,holiday,tavg,tmin,wdir,hum_avg,hum_max,tmax,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.008261,0.780071,0.019022,0.378314,0.003879,0.000002,0.069345,0.000006,0.002971,0.195337,0.807976,0.805865,3.7279e-9,"""400309GVG"""
0.000087,0.021122,0.656761,0.017722,0.32017,0.661387,0.594564,0.002127,0.485358,0.016859,0.301578,0.162433,0.237703,"""400057GVG"""
4.8418e-8,0.147855,0.001392,0.001461,1.6339e-9,0.815552,0.507927,5.5400e-18,0.901466,0.000001,0.002491,0.318483,1.1434e-10,"""400697GVG"""
1.3091e-20,0.382138,0.070233,5.3425e-12,0.000273,0.014734,0.088867,8.2109e-9,0.003618,0.001441,0.171794,0.734981,0.35746,"""400705GVG"""
3.7431e-9,0.119739,0.052597,0.00001,0.104208,0.982997,0.718568,0.000271,0.834319,0.12519,0.514373,0.661353,0.006116,"""4008231VG"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.000019,0.022484,0.557314,0.003391,0.000211,0.006231,0.380557,1.7233e-8,0.158615,0.01046,0.003847,0.987503,0.749076,"""400917GVG"""
0.000053,0.871447,0.270905,0.000576,0.101867,0.000986,0.269468,0.67875,0.015251,0.751225,0.159453,0.012553,0.891078,"""400303GVG"""
5.5352e-12,0.137223,0.001012,0.000023,7.8691e-9,0.996417,0.876796,3.1937e-18,0.405301,1.0015e-9,0.00143,0.521317,1.3841e-9,"""400067GVG"""
4.2437e-10,0.229094,0.658077,0.000056,2.4214e-7,0.000001,0.835948,1.8779e-9,0.745117,0.000199,0.120335,0.857322,0.000196,"""400098GVG"""


In [72]:
get_p_vals_coll(df_l.filter(~(pl.col("wpgt").is_null())), attributes_leg).mean()

wspd,diff,wpgt,pres,hum_min,prcp,holiday,tavg,tmin,wdir,hum_avg,hum_max,tmax,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.078222,0.424291,0.214804,0.120193,0.138463,0.21878,0.433706,0.061267,0.456024,0.13147,0.238364,0.406437,0.141402,


In [73]:
est, p_vals = get_p_vals(
    df_l.filter(~(pl.col("wpgt").is_null()) & ~(pl.col("anzlwhg") == 0) & ~(pl.col("qmbehfl") == 0)), attributes_l)
for attr, p in zip(attributes_l, p_vals):
    print(f"{attr}: {p}")

wspd: 3.0877964292546164e-10
diff: 2.841131682383253e-09
wpgt: 2.21194771706315e-15
pres: 4.3692622667221094e-20
hum_min: 0.12258693647416466
prcp: 0.6435386123754425
holiday: 8.472378093543686e-07
tavg: 0.8092089577409582
tmin: 0.053889629557053655
wdir: 8.791665116017247e-05
hum_avg: 7.887603762197873e-05
hum_max: 0.0
tmax: 0.09788188186154136
qmbehfl: 2.1894972338286156e-29
anzlwhg: 0.5356580492190399
co2koeffizient: 6.964967228235477e-11
plz: 0.12835145713221635


### P-Values Gas

In [74]:
attributes_g = ["id", "anzahlwhg", "heated_area"] + attributes
df_gas = pl.concat([df_l.filter(~(pl.col("wpgt").is_null()) & ~(pl.col("anzlwhg") == 0) & ~(pl.col("qmbehfl") == 0)).rename({"anzlwhg": "anzahlwhg", "qmbehfl": "heated_area"}).select(attributes_g),
                    df_k.filter(pl.col("primary_energy") == "gas").select(attributes_g)])
df_gas

id,anzahlwhg,heated_area,diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday
str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""400060pVG""",386,27746.0,339.312,85.75,63.0,100.0,13.8,10.5,18.8,0.0,0.0,243.0,6.1,20.5,1020.8,180.0,0
"""400060pVG""",386,27746.0,297.063,85.583333,64.0,100.0,13.8,9.0,18.3,0.0,0.0,42.0,5.0,18.4,1021.2,120.0,0
"""400060pVG""",386,27746.0,306.75,84.291667,61.0,98.0,13.7,9.0,19.4,0.0,0.0,62.0,5.8,20.5,1023.7,402.0,0
"""400060pVG""",386,27746.0,323.187,92.25,78.0,100.0,14.3,11.3,18.4,0.0,0.0,13.0,8.6,27.7,1024.0,108.0,0
"""400060pVG""",386,27746.0,276.188,86.833333,71.0,99.0,15.4,13.4,19.3,0.1,0.0,66.0,12.6,36.4,1023.8,120.0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,485.0,76.875,52.0,96.0,16.1,10.7,21.3,0.0,0.0,343.0,6.8,19.8,1021.3,306.0,1
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,517.0,71.75,44.0,100.0,16.6,8.1,24.7,0.0,0.0,95.0,10.7,29.5,1019.2,678.0,1
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,516.0,72.708333,38.0,100.0,18.1,10.0,27.4,0.0,0.0,114.0,7.9,19.8,1016.2,714.0,1
"""578e031d-e014-11eb-9d61-02b402…",168,3141.78,525.0,72.0,45.0,97.0,20.8,13.6,29.1,0.0,0.0,139.0,8.0,25.9,1015.5,606.0,1


In [83]:
attributes_g = list(set(attributes) - {"id", "snow", "tsun"})
get_p_vals_coll(df_gas, attributes_g)

tmax,diff,wpgt,pres,hum_min,holiday,prcp,tavg,tmin,wdir,hum_avg,hum_max,wspd,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.000002,0.000005,0.03252,0.011245,3.6883e-9,1.0857e-7,0.923586,9.5814e-13,0.640632,0.000001,0.000003,0.070817,0.810511,"""400095GVG"""
1.5594e-11,0.000323,0.28605,0.000004,1.1629e-13,0.003157,0.057265,6.6061e-7,0.018416,9.6450e-13,2.9854e-7,0.007069,0.290834,"""400306GVG"""
5.3668e-7,0.095063,0.535518,0.002054,0.000637,0.017295,0.957427,0.000001,0.955034,0.005525,0.000613,0.341274,0.711481,"""a6cb1351-e013-11eb-9d61-02b402…"
1.0887e-10,0.000052,0.048204,0.000029,1.7284e-9,0.041358,0.076182,3.7204e-14,0.845395,6.7969e-10,0.00001,0.487149,0.794425,"""400356PVG"""
1.3091e-20,0.35746,0.070233,5.3425e-12,0.000273,0.014734,0.088867,8.2109e-9,0.003618,0.001441,0.171794,0.734981,0.382138,"""400352PVG"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.000053,0.891078,0.270905,0.000576,0.101867,0.000986,0.269468,0.67875,0.015251,0.751225,0.159453,0.012553,0.871447,"""400131GVG"""
0.837395,0.387972,0.000984,0.095889,0.000022,1.5106e-16,0.698892,1.1623e-9,0.599484,0.649575,0.000354,0.21175,0.097767,"""400313GVG"""
0.000359,0.55128,0.15884,0.004494,0.000077,0.525729,0.213062,6.8923e-7,0.765036,0.000382,0.003909,0.023877,0.13907,"""400087GVG"""
0.331962,0.896885,0.188608,0.150207,0.502558,0.000046,0.705566,0.146729,0.580602,0.498428,0.970302,0.209171,0.207346,"""400204GVA"""


In [76]:
est, p_vals = get_p_vals(df_gas, attributes_g)
for attr, p in zip(attributes_g, p_vals):
    print(f"{attr}: {p}")

tmax: 0.9735896433588378
diff: 0.8871556555827069
wpgt: 0.01263088947218137
pres: 0.2519837305119538
hum_min: 2.962471085760549e-07
holiday: 7.257667260052444e-107
prcp: 0.3095595939140629
tavg: 3.162467008625379e-08
tmin: 0.02272724296449491
wdir: 3.217456290811079e-07
hum_avg: 5.535983906590841e-06
hum_max: 2.4943068638662565e-05
wspd: 0.35320749846440047


### P-Values District Heating

In [77]:
attributes_d = attributes + ["id"]
df_d = pl.concat([df_k.filter(pl.col("primary_energy") == "district heating").select(attributes_d),
                  df_dh.select(attributes_d)])
df_d

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,str
1173.0,77.75,61.0,88.0,10.6,5.1,14.1,0.0,0.0,319.0,11.2,33.5,1027.4,246.0,0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
691.0,77.5,56.0,94.0,8.6,2.8,14.3,1.3,0.0,116.0,13.7,42.1,1022.2,558.0,0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
710.0,86.791667,69.0,98.0,12.1,8.1,15.6,1.8,0.0,154.0,10.4,24.8,1012.0,90.0,0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
423.0,88.083333,71.0,98.0,15.0,12.6,18.9,0.0,0.0,234.0,8.6,25.2,1013.5,258.0,0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
363.0,83.625,67.0,99.0,14.5,9.0,18.9,2.4,0.0,179.0,9.4,29.2,1015.1,390.0,0,"""aecb8acb-5dfc-47c9-8a44-cbae3f…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
199.0,66.666667,49.0,94.0,20.1,16.4,23.7,0.0,0.0,229.0,21.2,58.7,1011.7,492.0,0,"""2f025f96-af2c-4140-b955-766a79…"
113.0,75.208333,51.0,96.0,17.0,11.9,21.8,4.0,0.0,236.0,17.3,52.6,1016.5,348.0,1,"""2f025f96-af2c-4140-b955-766a79…"
306.0,80.666667,54.0,97.0,16.4,11.6,22.4,1.4,0.0,257.0,11.2,33.8,1018.1,444.0,1,"""2f025f96-af2c-4140-b955-766a79…"
261.0,81.333333,65.0,96.0,15.9,12.5,20.0,1.7,0.0,5.0,7.6,27.7,1012.6,168.0,1,"""2f025f96-af2c-4140-b955-766a79…"


In [78]:
attributes_d = list(set(attributes) - {"id", "snow", "tsun"})
get_p_vals_coll(df_d, attributes_d).mean()

tmax,diff,wpgt,pres,hum_min,holiday,prcp,tavg,tmin,wdir,hum_avg,hum_max,wspd,id
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0.248715,0.154404,0.523706,0.346237,0.136786,0.166875,0.444142,0.093241,0.384682,0.252544,0.465993,0.446601,0.546907,


In [79]:
est, p_vals = get_p_vals(df_d, attributes_d)
for attr, p in zip(attributes_d, p_vals):
    print(f"{attr}: {p}")

tmax: 2.2081007637201614e-06
diff: 1.1619511081418624e-05
wpgt: 0.08116264847966939
pres: 0.012075680223941053
hum_min: 0.00012211895499417806
holiday: 0.04733409989259013
prcp: 0.7718212654995906
tavg: 8.051832631280465e-08
tmin: 0.9445941065577672
wdir: 0.019706159809013506
hum_avg: 0.004785880848020145
hum_max: 0.7631808684494717
wspd: 0.5403601633713155


## Clustering with p-Values

In [86]:
attributes_g = list(set(attributes) - {"id", "snow", "tsun"})
get_p_vals_coll(df_gas, attributes_g)
df_cluster = pl.concat([get_p_vals_coll(df_gas, attributes_g).select(["id"] + attributes_g),
                        get_p_vals_coll(df_d, attributes_g).select(["id"] + attributes_g)])

In [88]:
df_cluster.plot.scatter(x="tmax", y="tmin")  # add coloring with size of buildings, ...

## Principal Component Analysis