# Correlation Analysis

## Load data

In [55]:
import polars as pl
from polars.polars import InvalidOperationError

from src.energy_forecast.config import PROCESSED_DATA_DIR


def map_datetimes(row):
    try:
        row[1] = row[1].strptime('%Y-%m-%dT%H:%M:%S')
    except InvalidOperationError:
        row[1] = row[1].strftime("%Y-%m-%dT%H:%M:%S")
    return row


df = pl.read_csv(PROCESSED_DATA_DIR / "dataset_daily.csv").filter(pl.col("source") == "dh")
df = df.with_columns(pl.coalesce(pl.col("datetime").str.to_datetime(strict=False),
                                 pl.col("datetime").str.to_datetime(format="%y-%m-%dT%H:%M:%S#z", strict=False)))
df

id,datetime,diff,source
str,datetime[μs],f64,str
"""5c8f03f4-9165-43a2-8c42-1e8133…",2022-12-15 00:00:00,3237.0,"""dh"""
"""5c8f03f4-9165-43a2-8c42-1e8133…",2022-12-18 00:00:00,3211.0,"""dh"""
"""5c8f03f4-9165-43a2-8c42-1e8133…",2022-12-19 00:00:00,975.0,"""dh"""
"""5c8f03f4-9165-43a2-8c42-1e8133…",2023-09-21 00:00:00,614.0,"""dh"""
"""5c8f03f4-9165-43a2-8c42-1e8133…",2023-09-22 00:00:00,653.0,"""dh"""
…,…,…,…
"""8e9b1544-434e-44a7-8049-8f2e4b…",2024-05-10 00:00:00,36.0,"""dh"""
"""8e9b1544-434e-44a7-8049-8f2e4b…",2024-05-11 00:00:00,36.0,"""dh"""
"""8e9b1544-434e-44a7-8049-8f2e4b…",2024-05-12 00:00:00,39.0,"""dh"""
"""8e9b1544-434e-44a7-8049-8f2e4b…",2024-05-13 00:00:00,20.0,"""dh"""


In [56]:
df["id"].unique()

id
str
"""8e9b1544-434e-44a7-8049-8f2e4b…"
"""cae17ef4-cfad-4446-8b09-3cf946…"
"""fb5cc271-ae15-4f24-b9d5-30782b…"
"""f1c2b8a6-9833-4150-896c-20b054…"
"""5c8f03f4-9165-43a2-8c42-1e8133…"
…
"""10af300b-a270-4e41-928d-e4048b…"
"""4f36b3bd-337e-4b93-9333-c53a28…"
"""44201958-2d6b-4952-956c-22ea95…"
"""c00c8cba-b6de-4c10-89c0-e92312…"


In [62]:
    from src.energy_forecast.config import RAW_DATA_DIR

id = "8e9b1544-434e-44a7-8049-8f2e4b14a819"
attributes = ["diff", 'hum_avg',
              'hum_min',
              'hum_max',
              'tavg',
              'tmin',
              'tmax',
              'prcp',
              'snow',
              'wdir',
              'wspd',
              'wpgt',
              'pres',
              'tsun']


def get_df_sensor(df, id, attributes):
        df_sens = df.filter(pl.col("id") == id)
        source = df_sens["source"].unique().item()
        df_meta = pl.read_csv(RAW_DATA_DIR / f"{source}_meta.csv")
        sens_meta = df_meta.filter(pl.col("eco_u_id") == id)
        for col in df_meta.columns:
            df_sens = df_sens.with_columns(pl.lit(sens_meta[col].item()).alias(col))
        df_weather = pl.read_csv(RAW_DATA_DIR / f"weather_daily.csv").with_columns(
            pl.col("time").str.to_datetime().alias("datetime")).rename({"plz": "postal_code"})
        df_sens = df_sens.join(df_weather, on=["datetime", "postal_code"], how="left")
        return df_sens.select(attributes)


df_sens = get_df_sensor(df, id, attributes)
df_sens

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,63.166667,38.0,86.0,22.3,16.2,29.1,0.0,0.0,109.0,10.8,33.1,1020.5,792.0
0.0,86.375,64.0,99.0,21.1,17.0,26.6,23.2,0.0,60.0,6.8,29.9,1014.3,234.0
0.0,81.291667,54.0,95.0,18.2,12.5,24.2,0.0,0.0,311.0,10.8,28.8,1015.0,294.0
1.0,79.041667,59.0,100.0,15.7,8.7,21.5,0.0,0.0,284.0,10.8,37.4,1018.1,288.0
1.0,81.541667,64.0,99.0,16.7,10.4,20.8,0.0,0.0,327.0,6.5,22.7,1021.1,204.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
36.0,80.375,58.0,100.0,14.4,9.3,19.1,0.0,0.0,284.0,8.2,29.2,1024.3,780.0
36.0,71.25,51.0,89.0,14.2,9.5,19.9,0.0,0.0,15.0,8.3,27.7,1024.6,642.0
39.0,67.541667,40.0,95.0,14.8,9.5,20.6,0.0,0.0,82.0,13.3,38.9,1021.8,648.0
20.0,48.666667,30.0,68.0,18.0,10.5,24.6,0.0,0.0,104.0,14.4,36.7,1015.1,870.0


In [68]:
import statsmodels.api as sm


def get_p_vals(df_sens, attributes):
    tsun_ = list(set(attributes) - {"diff"})
    X = df_sens.select(tsun_).to_numpy()
    y = df_sens.select(pl.col("diff")).to_numpy()
    X2 = sm.add_constant(X)
    est = sm.OLS(y, X2)
    est2 = est.fit()
    return est2, est2.summary2().tables[1]["P>|t|"].tolist()


est, p_vals = get_p_vals(df_sens, attributes)
for attr, p in zip(attributes, p_vals):
    print(f"{attr}: {p}")

diff: 0.06785199400647018
hum_avg: 0.7964962148241681
hum_min: 5.5534646970089095e-05
hum_max: 0.49527538555746964
tavg: 0.17750183685070636
tmin: 0.0003535856464419058
tmax: 0.03127825015817975
prcp: 0.1372280581930215
snow: 0.11901131306517021
wdir: 0.0005940626110082672
wspd: 1.5814176024107137e-05
wpgt: 0.0797984255793109
pres: 0.15091376592263409
tsun: 0.6596193292743373


In [67]:
est.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.851
Model:,OLS,Adj. R-squared:,0.847
Method:,Least Squares,F-statistic:,250.3
Date:,"Mon, 10 Feb 2025",Prob (F-statistic):,7.51e-226
Time:,17:10:28,Log-Likelihood:,-3276.9
No. Observations:,585,AIC:,6582.0
Df Residuals:,571,BIC:,6643.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,562.8363,307.652,1.829,0.068,-41.431,1167.103
x1,-0.8784,3.405,-0.258,0.796,-7.566,5.809
x2,5.7123,1.406,4.062,0.000,2.950,8.475
x3,-0.6181,0.906,-0.682,0.495,-2.397,1.161
x4,-1.4597,1.081,-1.350,0.178,-3.583,0.664
x5,2.2086,0.614,3.594,0.000,1.002,3.415
x6,-0.0798,0.037,-2.159,0.031,-0.152,-0.007
x7,-0.7072,0.475,-1.488,0.137,-1.641,0.226
x8,-3.6610,2.345,-1.561,0.119,-8.267,0.945

0,1,2,3
Omnibus:,185.778,Durbin-Watson:,1.247
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2170.826
Skew:,1.049,Prob(JB):,0.0
Kurtosis:,12.201,Cond. No.,120000.0


In [78]:
p_vals_coll = list()
for id in df["id"].unique():
    df_sens = get_df_sensor(df, id, attributes)
    est, p_vals = get_p_vals(df_sens, attributes)
    p_vals_coll.append(p_vals)

df_p = pl.DataFrame(p_vals_coll, orient="row", schema=attributes).drop_nans()
df_p

  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  return 1 - self.ssr/self.centered_tss
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  return self.mse_model/self.mse_resid


diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.007902,0.119914,0.042385,0.364257,0.396759,0.064649,0.67168,0.539605,0.184415,0.503334,0.000143,0.347499,0.961375,0.04783
0.415404,0.216589,0.011363,0.041453,0.022807,0.038389,0.106225,0.212034,0.730278,0.001606,0.001881,0.000303,0.645456,0.890516
0.12245,0.193118,0.000134,0.477336,0.057809,0.000329,0.161431,0.357956,0.035785,0.000014,0.001489,0.001866,0.749721,0.706539
0.000023,0.235013,0.113044,0.850539,0.470311,0.830842,0.736393,0.952963,0.911982,0.040142,0.330652,0.029626,0.466477,8.8165e-7
0.000017,0.972232,0.039649,0.654594,0.450581,0.384636,0.868892,0.111922,0.162383,0.980353,8.6795e-19,0.191864,0.521951,0.000038
…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.126568,0.803992,0.016513,0.634132,0.059944,0.019431,0.330286,0.287825,0.48096,0.00031,3.0775e-7,0.080368,0.251691,0.804187
0.899912,0.703909,0.882632,0.204465,0.020231,0.025418,0.096289,0.216231,0.520659,0.237409,0.000292,0.054412,0.404644,0.414614
0.016123,0.093849,0.368661,0.775639,0.333591,0.065835,0.326246,0.620941,0.115799,0.724018,0.189451,0.580327,0.553424,0.21439
0.555334,0.407968,0.134371,0.035354,0.496974,0.962076,0.218023,0.134318,0.070696,0.000177,0.397064,0.163205,0.36106,0.777103


In [79]:
df_p.mean()

diff,hum_avg,hum_min,hum_max,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.220991,0.581622,0.124919,0.355579,0.234466,0.243143,0.339645,0.498153,0.351104,0.121005,0.139124,0.172704,0.480016,0.376959
