In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from src.utils import inspect_std_iqr, pop_datetime
from src.preprocessing import normalize, try_drop_shared_nulls, drop_full_zero, to_datetime, drop_full_nulls

# Read the raw energy CSV; the handle is only needed while parsing.
with open("data/energy_dataset.csv") as csv_handle:
    en = pd.read_csv(csv_handle, sep=",")

# Tag the frame with a label (plain attribute set on the DataFrame object).
en.name = "energy"

# create datetime column (project helper; later cells read `en.datetime`)
en = to_datetime(en, "time")

# Quick look at the frame's dimensions and column labels.
display(en.shape, en.columns)

### evenly spaced timestamps

In [None]:
# Step between consecutive timestamps, in seconds (computed once, reused below).
step_seconds = en.datetime.diff().dt.total_seconds()

# How often each step size occurs.
display(step_seconds.value_counts())

# The same steps in row order, converted to hours.
fig, ax = plt.subplots()
ax.plot(step_seconds / 3600, '-*')
ax.set_ylabel("hours")
plt.show()

### removing nulls creates unevenly spaced timestamps

In [None]:
# Remove nulls with the project helpers (their exact semantics live in
# src.preprocessing), then inspect the spacing of the remaining timestamps.
no_nulls = try_drop_shared_nulls(drop_full_nulls(en), any_null=True)

# Step between consecutive remaining timestamps, in seconds.
no_nulls_step_seconds = no_nulls.datetime.diff().dt.total_seconds()
display(no_nulls_step_seconds.value_counts())

# The same steps in row order, converted to hours.
fig, ax = plt.subplots()
ax.plot(no_nulls_step_seconds / 3600, '-*')
ax.set_ylabel("hours")
plt.show()

### let us try to keep the rows and interpolate nulls instead

In [None]:
# Index labels of the rows where "total load actual" is missing.
load_is_null = en["total load actual"].isna()
display(en.loc[load_is_null].index)

In [None]:
# Compare interpolation methods on the "total load actual" series.
# Work on a deep copy so `en` itself is never modified.
df = en.copy(deep=True)
_, df = pop_datetime(df)
df = df["total load actual"]

# x-axis windows to zoom into, one per subplot (left, right).
# NOTE(review): presumably picked around null indexes of this column — confirm.
zoom_windows = [[100, 125], [730, 780]]

for method in ["linear", "quadratic", "cubic"]:
    tmp_df = df.interpolate(method=method)

    fig, axs = plt.subplots(nrows=1, ncols=len(zoom_windows))
    for ax, window in zip(axs, zoom_windows):
        # red stars: original values from `en` (gaps where data is missing);
        # blue line: the interpolated series.
        ax.plot(en["total load actual"], 'r*', label="original")
        ax.plot(tmp_df.values, 'b-', label="interpolated")
        ax.set_xlim(window)
        ax.set_xlabel("index")
    axs[0].set_ylabel("total load actual")
    axs[0].legend()

    # plt.title() titles only the current axes, not the whole figure;
    # suptitle labels the complete two-panel figure with the method name.
    fig.suptitle(method)
    plt.show()


In [None]:
# Re-check the timestamp spacing of `en`.
# NOTE(review): this inspects `en`, not the interpolated series — presumably
# to show that `en` keeps its original spacing; confirm intent.
step_seconds = en.datetime.diff().dt.total_seconds()

# How often each step size occurs.
display(step_seconds.value_counts())

# The same steps in row order, converted to hours.
fig, ax = plt.subplots()
ax.plot(step_seconds / 3600, '-*')
ax.set_ylabel("hours")
plt.show()

### conclusions
* no need for interpolation methods of higher order than linear
* removing nulls and then resampling the removed timestamps with the average is equivalent to linear interpolation
* I'd stick to straightforward linear interpolation and drop only full-null/full-zero columns