In [None]:
# Data Exploration

In [None]:
import polars as pl
from darts import TimeSeries

from src.energy_forecast.config import INTERIM_DATA_DIR, PROCESSED_DATA_DIR, REPORTS_DIR, FEATURES

# Read every CSV this notebook works with (read order: interim, the three
# files from the processed directory, then the missing-dates report).
df_interim = pl.read_csv(INTERIM_DATA_DIR / "daily.csv")

# The three processed files share one directory, so they are read in one pass.
df_processed, df_interpolate, df_feat = (
    pl.read_csv(PROCESSED_DATA_DIR / csv_name)
    for csv_name in (
        "dataset_daily.csv",
        "dataset_interpolate_daily.csv",
        "dataset_daily_feat.csv",
    )
)

df_missing_dates = pl.read_csv(REPORTS_DIR / "missing_dates.csv")

In [None]:
# Display the frame that was read from INTERIM_DATA_DIR / "daily.csv".
df_interim

## Meta Data

In [None]:
from src.energy_forecast.config import META_DIR

# Legacy meta table: strip surrounding whitespace from the "plz" strings right after reading.
df_meta_l = pl.read_csv(META_DIR / "legacy_meta.csv")
df_meta_l = df_meta_l.with_columns(pl.col("plz").str.strip_chars())

# DH meta table, joined with the LOD building table on the shared "address" column.
df_meta_dh = pl.read_csv(META_DIR / "dh_meta.csv").rename({"eco_u_id": "id"})
df_lod = pl.read_csv(META_DIR / "dh_meta_lod.csv").rename({"adresse": "address"})
df_meta_dh = df_meta_dh.join(df_lod, on=["address"]).drop(
    ["id_right", "postal_code_right", "city", "postal_code"]
)

# Kinergy meta table: empty strings are read as nulls.
df_meta_k = pl.read_csv(META_DIR / "kinergy_meta.csv", null_values="")

# Stack the three tables. Column names are harmonised first and every table is
# tagged with its "source"; how="diagonal" fills columns a table lacks with nulls.
df_meta = pl.concat(
    [
        df_meta_l.cast({"plz": pl.Int64})
        .rename({"qmbehfl": "heated_area", "anzlwhg": "anzahlwhg", "adresse": "address"})
        .with_columns(
            pl.lit("gas").alias("primary_energy"),
            pl.lit("legacy").alias("source"),
        ),
        df_meta_k.rename({"name": "address"}).with_columns(
            pl.lit("kinergy").alias("source")
        ),
        df_meta_dh.rename(
            {"Height (m)": "building_height", "Storeys Above Ground": "storeys_above_ground"}
        ).with_columns(pl.lit("dh").alias("source")),
    ],
    how="diagonal",
)

# Rows without a "typ" get the value "Mehrfamilienhaus".
df_meta = df_meta.with_columns(pl.col("typ").fill_null(pl.lit("Mehrfamilienhaus")))
df_meta

## Data Counts

In [None]:
# Meta rows with primary_energy == "gas", counted per source.
(
    df_meta
    .filter(pl.col("primary_energy") == "gas")
    .group_by("source")
    .agg(pl.len())
)

In [None]:
# Row count of the interim table per source.
(
    df_interim
    .group_by("source")
    .agg(pl.len())
)

In [None]:
# Interim rows with primary_energy == "district heating", counted per source.
(
    df_interim
    .filter(pl.col("primary_energy") == "district heating")
    .group_by("source")
    .agg(pl.len())
)

In [None]:
# Row count of the processed table per source.
(
    df_processed
    .group_by("source")
    .agg(pl.len())
)

In [None]:
# Attach primary_energy from the meta table (join on "id"; polars joins are
# inner joins by default), then count the district-heating rows per source.
(
    df_processed
    .join(df_meta.select(["primary_energy", "id"]), on="id")
    .filter(pl.col("primary_energy") == "district heating")
    .group_by("source")
    .agg(pl.len())
)

In [None]:
# Attach primary_energy from the meta table (join on "id"; polars joins are
# inner joins by default), then count the district-heating rows per source.
# NOTE(review): another cell selects df_interpolate rows by ids of the form
# "<id>-<i>"; if every id in df_interpolate carries such a suffix, this join
# on "id" may match nothing -- confirm against the data.
(
    df_interpolate
    .join(df_meta.select(["primary_energy", "id"]), on="id")
    .filter(pl.col("primary_energy") == "district heating")
    .group_by("source")
    .agg(pl.len())
)


## Data Processing

In [None]:
# Interim rows of a single id, turned into a darts series ("date" is the time
# axis, "diff" the value; gaps in the dates are filled in) and plotted.
df_raw_single_id = df_interim.filter(
    pl.col("id") == "10af300b-a270-4e41-928d-e4048b2fdf00"
)
raw_series = TimeSeries.from_dataframe(
    df_raw_single_id,
    time_col="date",
    value_cols="diff",
    fill_missing_dates=True,
)
raw_series.plot()

In [None]:
# Processed rows of the same id, turned into a darts series ("datetime" is the
# time axis, "diff" the value; gaps in the dates are filled in) and plotted.
df_processed_single_id = df_processed.filter(
    pl.col("id") == "10af300b-a270-4e41-928d-e4048b2fdf00"
)
p_series = TimeSeries.from_dataframe(
    df_processed_single_id,
    time_col="datetime",
    value_cols="diff",
    fill_missing_dates=True,
)
p_series.plot()

In [None]:
# Plot the raw and the processed series of the same id in one cell so they can be compared.
raw_series.plot(label="raw")
p_series.plot(label="processed")

In [None]:
# Number of rows per id in the interpolated table, ordered by id.
(
    df_interpolate
    .group_by("id")
    .agg(pl.len())
    .sort("id")
)

In [None]:
raw_series.plot(label="raw")
p_series.plot(label="processed")

# Add the series stored in df_interpolate under the ids "<id>-0" ... "<id>-4".
# NOTE(review): the number of parts (5) is hard-coded -- confirm it matches the data.
cut_series = []
for i in range(5):
    part_rows = df_interpolate.filter(
        pl.col("id") == f"10af300b-a270-4e41-928d-e4048b2fdf00-{i}"
    )
    c_series = TimeSeries.from_dataframe(
        part_rows,
        time_col="datetime",
        value_cols="diff",
        fill_missing_dates=True,
    )
    cut_series.append(c_series)
    c_series.plot(label=f"part {i}")

## Missing Dates

In [None]:
# Missing-dates report, ordered ascending by its "len" column.
df_missing_dates.sort("len")

In [None]:
import plotly.express as px

# Long-format frame for the bar chart: each id appears twice, once with its
# "len" value (type "missing") and once with its "n" value (type "whole").
missing_by_per = df_missing_dates.sort(by="per")
missing_part = missing_by_per.select(["id", "len", "per"]).with_columns(
    pl.lit("missing").alias("type")
)
whole_part = (
    missing_by_per.select(["id", "n", "per"])
    .rename({"n": "len"})
    .with_columns(pl.lit("whole").alias("type"))
)
df_plot_missing_dates = pl.concat([missing_part, whole_part], how="vertical")

fig = px.bar(df_plot_missing_dates, x="id", y="len", color="type", title="Missing Data")
# The ids are too long to be readable as tick labels.
fig.update_xaxes(showticklabels=False)
fig.show()

## Visualize all Sensors

In [None]:
import plotly.graph_objs as go

source = "legacy"

# Rows of the selected source, ordered by id.
df = df_interpolate.filter(pl.col("source") == source).sort("id")

# First/last "datetime" and row count per id.
# maintain_order=True is required here: by default polars' group_by does NOT
# keep the input order, so the preceding sort would be lost and the traces
# would be drawn in an arbitrary order that can change between runs.
df_min_max_dates = df.group_by("id", maintain_order=True).agg(
    pl.col("datetime").min().alias("min_date"),
    pl.col("datetime").max().alias("max_date"),
    pl.len().alias("days"),
)

# One horizontal line per id, spanning from its first to its last date.
# The legend entry of each line is the row count of that id.
fig = go.Figure()
for start, end, sensor_id, days in zip(
    df_min_max_dates["min_date"],
    df_min_max_dates["max_date"],
    df_min_max_dates["id"],
    df_min_max_dates["days"],
):
    fig.add_trace(
        go.Scatter(x=[start, end], y=[sensor_id, sensor_id], mode="lines", name=str(days))
    )

fig.show()
# The file name previously contained a mis-encoded character ("gasz채hler").
fig.write_html(f"../reports/figures/gaszähler_start_end_datum_{source}.html")
# fig.write_image("../reports/figures/gaszähler_start_end_datum.png")

In [None]:
import plotly.graph_objs as go

# `source` is not used in this cell (rows of all sources are plotted); the
# assignment is kept only so the notebook namespace stays as it was.
source = "legacy"

# All rows of the interpolated table, ordered by id.
df = df_interpolate.sort("id")

# First/last "datetime" and row count per id.
# maintain_order=True is required here: by default polars' group_by does NOT
# keep the input order, so the preceding sort would be lost and the traces
# would be drawn in an arbitrary order that can change between runs.
df_min_max_dates = df.group_by("id", maintain_order=True).agg(
    pl.col("datetime").min().alias("min_date"),
    pl.col("datetime").max().alias("max_date"),
    pl.len().alias("days"),
)

# One horizontal line per id, spanning from its first to its last date.
# The legend entry of each line is the row count of that id.
fig = go.Figure()
for start, end, sensor_id, days in zip(
    df_min_max_dates["min_date"],
    df_min_max_dates["max_date"],
    df_min_max_dates["id"],
    df_min_max_dates["days"],
):
    fig.add_trace(
        go.Scatter(x=[start, end], y=[sensor_id, sensor_id], mode="lines", name=str(days))
    )

fig.show()
# The file name previously contained a mis-encoded character ("gasz채hler");
# the needless f-prefix (no placeholders) is dropped as well.
fig.write_html("../reports/figures/gaszähler_start_end_datum.html")
# fig.write_image("../reports/figures/gaszähler_start_end_datum.png")

## Feature

In [None]:
# Column names of the frame read from "dataset_daily_feat.csv".
df_feat.columns

In [None]:
# Total number of rows in the feature table.
df_feat.height

In [None]:
# Number of feature rows in which "anzahlwhg" is present (not null).
df_feat.filter(pl.col("anzahlwhg").is_not_null()).height

In [None]:
# Number of meta rows per "typ" value.
(
    df_meta
    .group_by("typ")
    .agg(pl.len())
)

## Feature Correlation

In [None]:
# Columns of FEATURES that are left out of the correlation analysis.
excluded_features = {
    "diff_t-1",
    "ground_surface",
    "building_height",
    "storeys_above_ground",
    "heated_area",
    "anzahlwhg",
    "daily_avg",
}
# sorted() instead of list(): the iteration order of a set of strings changes
# from one interpreter run to the next (hash randomization), which shuffled the
# columns of every table and heatmap built from `features` on each kernel restart.
features = sorted(set(FEATURES) - excluded_features)
df_feat[features].describe()

In [None]:
# Correlation matrix of the selected features, computed on the rows where
# "snow", "tsun" and "wpgt" are all present.
# `features` still contains "primary_energy" (it is one-hot encoded in a later
# cell). Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises on them instead; numeric_only=True restores the older
# behaviour of ignoring such columns.
fig = px.imshow(
    df_feat[features]
    .drop_nulls(["snow", "tsun", "wpgt"])
    .to_pandas()
    .corr(numeric_only=True)
)
fig.show()

In [None]:
import pandas as pd
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# Selected features as a pandas frame, restricted to rows where "snow", "tsun"
# and "wpgt" are all present.
df = df_feat[features].drop_nulls(["snow", "tsun", "wpgt"]).to_pandas()

# One-hot encode the categorical columns so they can enter the correlation matrix.
cat_features = ["primary_energy"]  # categorical features we want
enc = OneHotEncoder().fit(df[cat_features])
cat_features_names = enc.get_feature_names_out()
X_enc = DataFrame(
    enc.transform(df[cat_features]).toarray(),
    columns=cat_features_names,
)

# Replace the raw categorical columns by their encoded counterparts.
df = pd.concat([df.drop(columns=cat_features), X_enc], axis=1)

corr = df.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale.
f, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr,
    ax=ax,
    square=True,
    annot=True,
    vmin=-1.0,
    vmax=1.0,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
)

In [None]:
# Feature correlation, this time keeping "heated_area" and "anzahlwhg" in the selection.
excluded_features_ha = {
    "diff_t-1",
    "ground_surface",
    "building_height",
    "storeys_above_ground",
    "daily_avg",
}
# sorted() instead of list(): the iteration order of a set of strings changes
# from one interpreter run to the next (hash randomization), which shuffled the
# rows/columns of the heatmap on each kernel restart.
features_ha = sorted(set(FEATURES) - excluded_features_ha)

# Only rows where the listed columns are all present are used.
df = df_feat[features_ha].drop_nulls(
    ["snow", "tsun", "wpgt", "anzahlwhg", "heated_area"]
).to_pandas()

# One-hot encode the categorical columns so they can enter the correlation matrix.
enc = OneHotEncoder()
cat_features = ["primary_energy"]  # categorical features we want
enc = enc.fit(df[cat_features])
cat_features_names = enc.get_feature_names_out()
X_enc = DataFrame(enc.transform(df[cat_features]).toarray(), columns=cat_features_names)

# Replace the raw categorical columns by their encoded counterparts.
df = df.drop(columns=cat_features)
df = pd.concat([df, X_enc], axis=1)

corr = df.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale.
f, ax = plt.subplots(figsize=(17, 12))
sns.heatmap(corr,
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            vmin=-1.0, vmax=1.0,
            square=True, ax=ax,
            annot=True)