In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from src.utils import inspect_std_iqr
from src.preprocessing import normalize, try_drop_shared_nulls, drop_full_zero, to_datetime, drop_full_nulls

### load

In [None]:
# Let pandas open the CSV itself: it decodes as UTF-8 by default (instead of the
# locale-dependent encoding of a bare open()) and closes the file when done.
en = pd.read_csv("data/energy_dataset.csv", sep=",")
# Ad-hoc label stored as a plain attribute on the frame.
# NOTE(review): pandas does not carry custom attributes over to frames returned
# by later operations - confirm whether anything downstream relies on `en.name`.
en.name = "energy"

# create datetime column
# NOTE(review): presumably derived from the "time" column by the project helper - confirm.
en = to_datetime(en, "time")

### clean data

In [None]:
# Cleaning pipeline, applied in order:
#   1. drop_full_nulls        - project helper (presumably removes all-null features; confirm)
#   2. try_drop_shared_nulls  - project helper, called with any_null=True
#   3. drop_full_zero         - drop features with only zero values
en = (
    en.pipe(drop_full_nulls)
    .pipe(try_drop_shared_nulls, any_null=True)
    .pipe(drop_full_zero)
)

# summary statistics of the cleaned frame
desc = en.describe()
display(desc)

### normalization

In [None]:
# Show the std/IQR inspection report produced by the project helper.
std_iqr_report = inspect_std_iqr(en)
display(std_iqr_report)

In [None]:
# Normalize the cleaned features with the project helper.
en_norm = normalize(en)

# Draw on an explicit Axes and finish with plt.show() so the cell renders only the
# figure (no stray Axes repr), consistent with the other plotting cells.
fig, ax = plt.subplots(figsize=(15, 5))
en_norm.boxplot(ax=ax, rot=90)
ax.set_title("Normalized features")
plt.show()

In [None]:
# Inspect the datetime column of the normalized frame.
en_norm["datetime"]

### statistics

In [None]:
# Work on an independent deep copy so columns added in the statistics cells
# do not end up in en_norm.
df = en_norm.copy(deep=True)

In [None]:
# Pairwise correlation of every column except the timestamp.
feature_frame = df.loc[:, df.columns != "datetime"]
corr = feature_frame.corr()

# Same labels on both axes: the correlation matrix is square over the features.
tick_labels = corr.columns.values
heatmap_options = dict(vmin=-1.0, vmax=1.0, cmap="crest", annot=False)
sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels, **heatmap_options)
plt.show()

In [None]:
# Left panel: the series over time; right panel: histogram of its values.
target = "total load actual"
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
series_ax, hist_ax = axs
df.plot(x="datetime", y=[target], ax=series_ax)
df.hist(column=[target], bins=100, ax=hist_ax)
plt.show()

In [None]:
# First difference of the load series (row-to-row change); the first row is NaN.
diff_col = "diff: total load actual"
df[diff_col] = df["total load actual"].diff()

# Left panel: the differenced series over time; right panel: histogram of its values.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
series_ax, hist_ax = axs
df.plot(x="datetime", y=[diff_col], ax=series_ax)
df.hist(column=[diff_col], bins=100, ax=hist_ax)
plt.show()