# Correlation Analysis

## Load Data

In [20]:
from typing import Any

import polars as pl
import pandas as pd
from src.energy_forecast.config import RAW_DATA_DIR

df_weather = pl.read_csv(RAW_DATA_DIR / f"weather_daily.csv").with_columns(
    pl.col("time").str.to_datetime().alias("datetime"))
df_holidays = pl.read_csv(RAW_DATA_DIR / "holidays.csv").with_columns(pl.col("start").str.to_date(),
                                                                      pl.col("end").str.to_date(strict=False))
df_cities = pl.read_csv(RAW_DATA_DIR / "cities.csv")

# META DATA
df_meta_l = pl.read_csv(RAW_DATA_DIR / "legacy_meta.csv").with_columns(pl.col("plz").str.strip_chars())
df_meta_dh = pl.read_csv(RAW_DATA_DIR / "dh_meta.csv").rename({"eco_u_id": "id"})
df_meta_k = pl.read_csv(RAW_DATA_DIR / "kinergy_meta.csv").with_columns(pl.lit("kinergy").alias("source"))

df_meta = pl.concat([df_meta_l.cast({"plz": pl.Int64}).rename(
    {"qmbehfl": "heated_area", "anzlwhg": "anzahlwhg", "adresse": "address"}).with_columns(
    pl.lit("gas").alias("primary_energy")),
    df_meta_dh.rename({"postal_code": "plz", "city": "ort"}),
    df_meta_k.rename({"name": "address"})],
    how="diagonal")

holiday_dict = {"BE": [], "HH": [], "MV": [], "BY": [], "SH": []}
for row in df_holidays.iter_rows():
    if row[1] is not None and row[2] is not None:
        span = pd.date_range(row[1], row[2], freq="D")
        holiday_dict[row[0]].extend(span)
    elif row[1] is not None:
        holiday_dict[row[0]].extend([row[1]])

In [34]:
df_meta.group_by(["address", "plz", "ort", "heated_area", "anzahlwhg", "source", "typ"]
                 ).agg(
                    ).with_columns(pl.when(pl.col("heated_area") == 0).then(None).otherwise(pl.col("heated_area")).name.keep(),
                                   pl.when(pl.col("anzahlwhg") == 0).then(None).otherwise(pl.col("anzahlwhg")).name.keep()
                                   ).filter(
    ((pl.col("heated_area").is_null().or_(pl.col("anzahlwhg").is_null()))
     .and_((pl.col("source").is_null()))
     .or_((pl.col("typ") == "").and_(pl.col("source") == "kinergy")))
    ).sort(["plz"]).filter(~(pl.col("ort") == "Wiener Neustadt")).drop(["source", "typ"])

address,plz,ort,heated_area,anzahlwhg
str,i64,str,f64,i64
"""Sievekingsallee 110-111""",20535,"""Hamburg""",6139.0,
"""Heimfelder Straße 34-38""",21075,"""Hamburg""",,
"""Homannstraße 3-7a""",21075,"""Hamburg""",,
"""Fallreep 28-42""",21129,"""Hamburg""",,
"""Süderelbering 2""",21149,"""Hamburg""",9205.0,
…,…,…,…,…
"""Oldenburger Str. 15""",24321,"""Lütjenburg""",831.38,
"""Oldenburger Str. 13""",24321,"""Lütjenburg""",831.38,
"""An den Kleingärten 2""",24619,"""Bornhöved""",,
"""Fehrsstraße 3""",25336,"""Elmshorn""",,


In [42]:
from src.energy_forecast.config import PROCESSED_DATA_DIR

df_daily = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").with_columns(pl.col("datetime").str.to_datetime())
df_hourly = pl.read_csv(PROCESSED_DATA_DIR / "dataset_hourly.csv")

In [79]:
attributes = ["diff", 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
              'pres', 'tsun', "holiday"]


def add_holidays(df):
    return df.join(df_cities.select(["plz", "state"]), on="plz", how="left").drop_nulls(["state"]).with_columns(
        pl.struct(["state", "datetime"]).map_elements(lambda x: 1 if x["datetime"] in holiday_dict[x["state"]] else 0,
                                                      return_dtype=pl.Int64).alias("holiday"))


def add_meta(df):
    df = df.join(df_meta, on="id", how="left").join(df_weather, on=["datetime", "plz"], how="left")
    return add_holidays(df)


attributes_ha = attributes + ["heated_area", "anzahlwhg"]
df_daily = add_meta(df_daily).select(["id", "datetime", "primary_energy"] + attributes_ha)

In [80]:
df_daily.write_csv(PROCESSED_DATA_DIR / "dataset_daily_feat.csv")

In [1]:
from src.energy_forecast.config import PROCESSED_DATA_DIR
import polars as pl

df_daily = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily_feat.csv").cast(
    {"heated_area": pl.Float64, "anzahlwhg": pl.Int64}).with_columns(pl.col("datetime").str.to_datetime())
df_daily = df_daily.with_columns(pl.col("diff").shift(1).over("id").alias("diff_t-1")).drop_nulls(subset=["diff_t-1"])
df_daily

[32m2025-02-20 09:43:55.137[0m | [1mINFO    [0m | [36msrc.energy_forecast.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/marja/PycharmProjects/energy-forecast-wahl[0m


id,datetime,primary_energy,diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday,heated_area,anzahlwhg,diff_t-1
str,datetime[μs],str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,f64
"""400103PVG""",2017-08-10 00:00:00,"""gas""",78.8,78.708333,61.0,99.0,16.7,10.1,21.7,0.0,0.0,327.0,7.6,22.3,1019.7,306.0,0,7878.0,107,82.5
"""400103PVG""",2017-08-11 00:00:00,"""gas""",105.5,90.958333,85.0,95.0,15.0,14.1,16.3,6.3,0.0,340.0,10.4,27.0,1015.3,0.0,0,7878.0,107,78.8
"""400103PVG""",2017-08-12 00:00:00,"""gas""",108.8,93.416667,89.0,97.0,15.0,13.6,17.1,4.9,0.0,247.0,14.4,33.5,1012.0,6.0,0,7878.0,107,105.5
"""400103PVG""",2017-08-13 00:00:00,"""gas""",110.1,77.541667,47.0,99.0,15.7,9.3,19.6,0.0,0.0,269.0,12.2,35.3,1019.0,528.0,0,7878.0,107,108.8
"""400103PVG""",2017-08-14 00:00:00,"""gas""",92.5,70.666667,42.0,100.0,16.1,7.0,22.4,0.0,0.0,78.0,8.3,28.4,1022.8,792.0,0,7878.0,107,110.1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""400089GVG""",2022-03-20 00:00:00,"""gas""",101.7,53.083333,29.0,90.0,7.1,0.9,12.9,0.0,0.0,98.0,27.4,66.6,1036.4,672.0,1,1994.0,42,96.7
"""400089GVG""",2022-03-21 00:00:00,"""gas""",97.7,43.75,34.0,56.0,8.2,3.3,13.6,0.0,0.0,128.0,17.6,37.1,1033.7,618.0,1,1994.0,42,101.7
"""400089GVG""",2022-03-22 00:00:00,"""gas""",85.9,45.875,26.0,66.0,9.7,3.0,17.8,0.0,0.0,109.0,6.8,20.5,1033.4,630.0,1,1994.0,42,97.7
"""400089GVG""",2022-03-23 00:00:00,"""gas""",81.7,58.125,30.0,89.0,9.3,-0.3,17.3,0.0,0.0,70.0,7.9,32.0,1032.2,672.0,1,1994.0,42,85.9


## Attributes and Description

In [32]:
attributes = ["diff", "diff_t-1", 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
              'pres', 'tsun', "holiday"]
attributes_ha = attributes + ["heated_area", "anzahlwhg"]

From the [meteostat](https://dev.meteostat.net/python/daily.html#api) documentation:

Column	Description	Type

station	The Meteostat ID of the weather station (only if query refers to multiple stations)	String

time	The date	Datetime64

tavg	The average air temperature in °C	Float64

tmin	The minimum air temperature in °C	Float64

tmax	The maximum air temperature in °C	Float64

prcp	The daily precipitation total in mm	Float64

snow	The snow depth in mm	Float64

wdir	The average wind direction in degrees (°)	Float64

wspd	The average wind speed in km/h	Float64

wpgt	The peak wind gust in km/h	Float64

pres	The average sea-level air pressure in hPa	Float64

tsun	The daily sunshine total in minutes (m)	Float64

Further Attributes apart from weather data:

- holiday: whether or not (0/1) there is a holiday in the state of the record on this date
- heated_area: heated area in square metres
- anzahlwhg: number of appartments in the building
- diff_t-1: the heat consumption on the day before

## Multiple Linear Regression with OLS

### All data (gas + district heating)

In [33]:
import statsmodels.api as sm


def get_p_vals(df: pl.DataFrame, list_cols: list):
    attr_list = list(set(list_cols) - {"diff"})
    X = df.select(attr_list).to_numpy()
    y = df.select(pl.col("diff")).to_numpy()
    X2 = sm.add_constant(X)
    est = sm.OLS(y, X2)
    est2 = est.fit()
    return est2, est2.rsquared, est2.rsquared_adj, est2.summary2().tables[1]["Coef."], est2.summary2().tables[1]["P>|t|"].tolist()


significance_level = 0.05

def get_significant_features(df, attributes):
    est, rsquared, rsquared_adj, _, p_vals = get_p_vals(df, attributes)
    print("r²: ", rsquared)
    print("r² adjusted: ", rsquared_adj)
    return pl.DataFrame({"attributes": attributes, "p-value": p_vals}).sort("p-value").filter(
        pl.col("p-value") < significance_level)


def iterative_feature_removal(df, attributes):
    print(f"Starting with {len(attributes)} features: {attributes}")
    for i in range(5):
        print(f"Computing r² and p-values for {len(attributes)} features: {attributes}")
        df_significant_feat = get_significant_features(df, attributes)
        print(f"{len(df_significant_feat)} significant features: ")
        for row in df_significant_feat.iter_rows():
            print(row)
        print("\n")
        attributes = df_significant_feat["attributes"].to_list()
        if "diff" not in attributes: attributes.append("diff")


iterative_feature_removal(df_daily.drop_nulls(["snow", "tsun", "wpgt"]), attributes)

Starting with 16 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday']
Computing r² and p-values for 16 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday']
r²:  0.9443083941955221
r² adjusted:  0.9442984052670627
12 significant features: 
('wspd', 0.0)
('hum_min', 7.032772037797866e-22)
('pres', 1.0110128437320324e-15)
('tavg', 2.415215154970249e-05)
('hum_avg', 0.00013275142334541725)
('wdir', 0.00022055988983713608)
('prcp', 0.0031203376845176066)
('hum_max', 0.003160911321434748)
('holiday', 0.00630527613107545)
('diff', 0.012888551723115465)
('wpgt', 0.015922480504726676)
('snow', 0.02692957578775672)


Computing r² and p-values for 12 features: ['wspd', 'hum_min', 'pres', 'tavg', 'hum_avg', 'wdir', 'prcp', 'hum_max', 'holiday', 'diff', 'wpgt', 'snow']
r²:  0.08267596993966952
r² a

With the value of yesterdays diff, we achieve an r-squared value of about 0.94. The feature is not deemed as significant though, and therefore removed in the next step. The r-squared value reduces to about 0.08.

In [34]:
est, r_squared, rsquared_adj, coeffs, p_vals = get_p_vals(df_daily.drop_nulls(["snow", "tsun", "wpgt"]), attributes)
for coeff, attribute in zip(coeffs, attributes):
    print(f"{attribute}: {coeff}")

diff: 185.99342608197455
diff_t-1: 0.2363640736590359
hum_avg: -0.307298268787203
hum_min: -2.2135768703349856
hum_max: 4.048182688099381
tavg: -3.141189058638023
tmin: -0.12471328368711548
tmax: -0.012868757335265595
prcp: 0.7655641831248425
snow: -0.43228298477923544
wdir: 0.018728927504575048
wspd: 0.9654551690374695
wpgt: 1.3859148690421188
pres: 0.7604168043814982
tsun: -0.8440011504987392
holiday: -0.4157250495618954


### All data (gas + district heating, only data with known heated area)

In [35]:
iterative_feature_removal(df_daily.drop_nulls(["snow", "tsun", "wpgt", "heated_area", "anzahlwhg"]), attributes_ha)

Starting with 18 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'heated_area', 'anzahlwhg']
Computing r² and p-values for 18 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'heated_area', 'anzahlwhg']
r²:  0.9581004196993863
r² adjusted:  0.9580889941297044
14 significant features: 
('wpgt', 0.0)
('tmax', 6.453579253544663e-39)
('anzahlwhg', 3.1831061735131385e-19)
('hum_min', 1.5154752469231236e-09)
('tsun', 2.842879918561998e-05)
('heated_area', 7.130049556103555e-05)
('pres', 0.0007925769910768565)
('hum_avg', 0.0008494048966723211)
('tavg', 0.001378634630490708)
('hum_max', 0.0019389076333986498)
('snow', 0.002262352094061612)
('wspd', 0.010289787748476461)
('diff', 0.01109864729194631)
('wdir', 0.023618996005212226)


Computing r² and p-values for 14 features: ['wpgt', 

When adding heated area AND the diff value of yesterday, the model is able to predict with an r-squared value of about 0.958.

### Gas Data

In [36]:
iterative_feature_removal(df_daily.filter(pl.col("primary_energy") == "gas").drop_nulls(["snow", "tsun", "wpgt"]),
                         attributes)

Starting with 16 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday']
Computing r² and p-values for 16 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday']
r²:  0.957517958545695
r² adjusted:  0.9575075023715353
11 significant features: 
('wspd', 0.0)
('hum_min', 8.955141558221586e-09)
('pres', 4.9403525017184995e-05)
('hum_max', 0.0002004516739777132)
('wpgt', 0.0002331897333461572)
('tavg', 0.0002447051600240952)
('holiday', 0.0004523749042421024)
('wdir', 0.0007362649232130205)
('hum_avg', 0.001326686357601526)
('prcp', 0.007546361031951895)
('diff', 0.018483760305510626)


Computing r² and p-values for 11 features: ['wspd', 'hum_min', 'pres', 'hum_max', 'wpgt', 'tavg', 'holiday', 'wdir', 'hum_avg', 'prcp', 'diff']
r²:  0.05718415993513226
r² adjusted:  0.05702946809289555
8 signifi

### Gas Data (with heated area available)

In [37]:
iterative_feature_removal(df_daily.filter(pl.col("primary_energy") == "gas").drop_nulls(["snow", "tsun", "wpgt", "heated_area", "anzahlwhg"]),
    attributes_ha)

Starting with 18 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'heated_area', 'anzahlwhg']
Computing r² and p-values for 18 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'heated_area', 'anzahlwhg']
r²:  0.9577301497381467
r² adjusted:  0.9577183582110229
13 significant features: 
('wpgt', 0.0)
('tmax', 2.3516746120701358e-34)
('anzahlwhg', 7.089708414597798e-16)
('hum_min', 5.592279234199274e-10)
('tsun', 1.4702124730065991e-05)
('pres', 0.00037535339196576994)
('hum_max', 0.0006118766056708177)
('heated_area', 0.0008837280618452483)
('hum_avg', 0.0012350835491258217)
('tavg', 0.0016453394260951943)
('wspd', 0.004272986168329824)
('snow', 0.012815521750559241)
('diff', 0.019256635715887687)


Computing r² and p-values for 13 features: ['wpgt', 'tmax', 'anzahlwhg', 'hum_mi

For gas values, 12 significant features ('wpgt', 'tmax', 'anzahlwhg', 'hum_min', 'tsun', 'pres', 'hum_max', 'heated_area', 'hum_avg', 'tavg', 'wspd', 'snow') achieve an r-squared value of about 0.958.

### District Heating Data

In [40]:
iterative_feature_removal(df_daily.filter(pl.col("primary_energy") == "district heating").drop_nulls(["snow", "tsun", "wpgt"]),
    attributes)

Starting with 16 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday']
Computing r² and p-values for 16 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday']
r²:  0.9121375980347465
r² adjusted:  0.9120794649118371
5 significant features: 
('wspd', 0.0)
('pres', 4.173353554316152e-08)
('hum_min', 1.1842735645502458e-06)
('hum_avg', 0.0030985919972274702)
('tmax', 0.041674993290442365)


Computing r² and p-values for 6 features: ['wspd', 'pres', 'hum_min', 'hum_avg', 'tmax', 'diff']
r²:  0.17757439510449768
r² adjusted:  0.17739309233899003
3 significant features: 
('hum_min', 0.0)
('wspd', 2.57234190103296e-05)
('diff', 0.0010297675184094592)


Computing r² and p-values for 3 features: ['hum_min', 'wspd', 'diff']
r²:  0.05917130409813076
r² adjusted:  0.059088353234446944
3 significant f

### District Heating Data (with heated area available)

In [44]:
iterative_feature_removal(df_daily.filter(pl.col("primary_energy") == "district heating").drop_nulls(["snow", "tsun", "wpgt", "heated_area", "anzahlwhg"]),
    attributes_ha)

Starting with 18 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'heated_area', 'anzahlwhg']
Computing r² and p-values for 18 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'heated_area', 'anzahlwhg']
r²:  0.9579894756042056
r² adjusted:  0.9574730772566072
2 significant features: 
('wpgt', 0.0)
('tmax', 4.056020340171389e-08)


Computing r² and p-values for 3 features: ['wpgt', 'tmax', 'diff']
r²:  0.4795374562824801
r² adjusted:  0.47879287467487286
2 significant features: 
('tmax', 4.8589371748550325e-199)
('wpgt', 6.226867035984887e-176)


Computing r² and p-values for 3 features: ['tmax', 'wpgt', 'diff']
r²:  0.4795374562824801
r² adjusted:  0.47879287467487286
2 significant features: 
('wpgt', 4.8589371748550325e-199)
('tmax', 6.226867035984887e-176)


Computing r² and

For district heating data, 2 features ('t_max', 'wpgt') achieve an r-squared value of about 0.48.

In [45]:
from src.energy_forecast.config import RAW_DATA_DIR
df_lod = pl.read_csv(RAW_DATA_DIR / "dh_meta_lod.csv").rename({"adresse": "address"})
dh_meta = pl.read_csv(RAW_DATA_DIR / "dh_meta.csv").rename({"eco_u_id": "id"}).select(["id", "address"])
df_lod = df_lod.join(dh_meta, on=["address"])

In [46]:
df_dh = df_daily.filter(pl.col("primary_energy") == "district heating").join(df_lod, on=["id"], how="left").cast({"Storeys Above Ground": pl.Int64})
df_dh.describe()

statistic,id,datetime,primary_energy,diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,holiday,heated_area,anzahlwhg,diff_t-1,address,Building ID,Country,postal_code,Function,Height (m),Storeys Above Ground,ground_surface
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,f64,str,f64,f64,f64
"""count""","""22687""","""22687""","""22687""",22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,22687.0,1401.0,1401.0,22687.0,"""7483""","""7483""","""7483""",7483.0,"""7483""",7483.0,7483.0,7483.0
"""null_count""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21286.0,21286.0,0.0,"""15204""","""15204""","""15204""",15204.0,"""15204""",15204.0,15204.0,15204.0
"""mean""",,"""2023-06-27 23:15:53.197866""",,674.294265,81.391801,65.24243,94.872173,9.38035,5.276625,13.230216,2.563658,2.002028,187.670648,12.660757,37.876515,1013.321669,245.978798,0.371578,4706.697359,22.703783,675.017499,,,,22987.46305,,13.247523,3.010557,1267.006573
"""std""",,,,851.677308,10.600354,15.989349,6.093034,6.224428,5.518742,7.479177,4.392499,11.247004,82.513626,5.800939,14.50112,10.788211,252.744989,0.483237,2954.335983,26.708346,851.793851,,,,481.158585,,5.764078,1.544414,1736.574153
"""min""","""0c9ad311-b86f-4371-a695-512ca4…","""2022-04-06 00:00:00""","""district heating""",0.0,34.833333,17.0,51.0,-7.3,-11.6,-4.3,0.0,0.0,0.0,2.2,9.0,977.2,0.0,0.0,1141.0,0.0,0.0,"""Am Kielortplatz 1""","""DESHPDHK0000Z6T7""","""Deutschland""",22846.0,"""31001_1000""",4.305,1.0,303.146873
"""25%""",,"""2023-01-27 00:00:00""",,170.0,75.916667,54.0,93.0,5.1,1.7,7.9,0.0,0.0,128.0,8.3,27.7,1005.7,6.0,0.0,2360.0,0.0,171.0,,,,22846.0,,9.845,2.0,421.969325
"""50%""",,"""2023-06-18 00:00:00""",,423.0,83.416667,67.0,97.0,9.0,5.4,12.2,0.4,0.0,201.0,11.9,36.0,1013.7,180.0,0.0,3600.0,24.0,423.0,,,,22846.0,,11.382,3.0,653.00923
"""75%""",,"""2023-11-27 00:00:00""",,822.0,88.958333,77.0,99.0,14.2,9.2,18.8,3.4,0.0,250.0,15.8,46.3,1020.5,426.0,1.0,6000.0,64.0,822.0,,,,22850.0,,18.084,4.0,1149.858409
"""max""","""fb684f25-a63d-4d3e-9277-6d759b…","""2024-05-14 00:00:00""","""district heating""",9160.0,99.25,97.0,100.0,28.6,20.6,38.5,43.1,120.0,359.0,38.9,99.4,1043.1,946.0,1.0,9456.0,64.0,9160.0,"""Waldstraße 63""","""DESHPDHK0007sekW""","""Deutschland""",24635.0,"""31001_3063""",26.345,8.0,7052.559212


In [47]:
iterative_feature_removal(df_dh.drop_nulls(["address"]), attributes + ["Height (m)", "Storeys Above Ground", "ground_surface"])

Starting with 19 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'Height (m)', 'Storeys Above Ground', 'ground_surface']
Computing r² and p-values for 19 features: ['diff', 'diff_t-1', 'hum_avg', 'hum_min', 'hum_max', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'holiday', 'Height (m)', 'Storeys Above Ground', 'ground_surface']
r²:  0.8916570031200092
r² adjusted:  0.8913957257963436
3 significant features: 
('tsun', 0.0)
('wdir', 1.0990724083517135e-32)
('prcp', 0.008284773510365556)


Computing r² and p-values for 4 features: ['tsun', 'wdir', 'prcp', 'diff']
r²:  0.05090038779532391
r² adjusted:  0.05051968197414258
4 significant features: 
('tsun', 1.489748489436252e-240)
('prcp', 1.5739666359740781e-84)
('wdir', 3.9331726003311134e-08)
('diff', 3.9586963697051864e-05)


Computing r² and p-values for 4 features: ['tsun', 'prcp', 'wdir', 'diff'

### Per Building

Calculate p-values and r-squared values per building and average over those

In [48]:
def get_p_vals_id(df, attributes):
    sensor_list = list()
    for id in df["id"].unique():
        df_s = df.filter(pl.col("id") == id)
        _, r_squared, r_squared_adj, coeffs, p_vals = get_p_vals(df_s, attributes)
        sensor_list.append({"id": id, "r_s": r_squared, "r_s_adj": r_squared_adj, "p_vals": p_vals, "p_vals_n": len(p_vals)})
    return pl.DataFrame(sensor_list).drop_nans().filter(pl.col("p_vals_n") == len(attributes))


def get_p_vals_avg(df, attributes):
    df_p_vals = get_p_vals_id(df, attributes)
    print("Average r-squared: ", df_p_vals["r_s"].mean())
    print("Average r-squared adjusted: ", df_p_vals["r_s_adj"].mean())
    avg_p_vals = list(df_p_vals["p_vals"].explode().reshape((len(attributes), -1)).to_numpy().sum(axis=1) / len(df_p_vals))
    for attribute, p_val in zip(attributes, avg_p_vals):
        print(f"{attribute}: {p_val}")


get_p_vals_avg(df_daily.filter(pl.col("primary_energy") == "gas").drop_nulls(["snow", "tsun", "wpgt"]), attributes)

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "


Average r-squared:  0.8503016970972969
Average r-squared adjusted:  0.8394544407574351
diff: 0.3358658165031264
diff_t-1: 0.31799399109276955
hum_avg: 0.37756545272689673
hum_min: 0.30466889065302444
hum_max: 0.3611936101053241
tavg: 0.4186112359207607
tmin: 0.35086224320711185
tmax: 0.2973950833036209
prcp: 0.3674549931649284
snow: 0.3664272748531776
wdir: 0.26502690478492674
wspd: 0.3204535551672345
wpgt: 0.36604086257184415
pres: 0.3613724498803349
tsun: 0.33279723315477827
holiday: 0.31747609926382886


The averaged values show that, on average, the features have an r-squared value of about 0.85. Since no feature is significant on average, this might mean that the model is not able to generalize well, but does find significant attributes for each individiual building.

In [53]:
get_p_vals_avg(df_daily.filter(pl.col("primary_energy") == "district heating"), attributes)

Average r-squared:  0.8387312057973063
Average r-squared adjusted:  0.8320683646546283
diff: 0.4120543229208334
diff_t-1: 0.310382332142736
hum_avg: 0.4380167022958674
hum_min: 0.3479248502838678
hum_max: 0.4337290304788215
tavg: 0.44574469492953384
tmin: 0.3637427614614475
tmax: 0.333802826482722
prcp: 0.43029946798097257
snow: 0.4088406363788839
wdir: 0.33805145801000247
wspd: 0.34462754837156084
wpgt: 0.36074629084377263
pres: 0.36187440530635256
tsun: 0.33726795402006826
holiday: 0.4771710222997876
