In [None]:
from pathlib import Path
import pandas as pd

In [None]:
from tfmforecasting.dataset import AdditionalHousingUnitFields, HousingUnitColumns
from tfmforecasting.preprocessing import (
    add_datetime_to_housing_unit_dataset, lag_consumption_feature, name_lagged_feature
)
from tfmforecasting.utils import find_cluster_files, get_housing_unit_name

## Cluster information

In [None]:
# Parameters selecting which cluster and which month of data to analyse.
n_clusters = 4          # also selects the `cluster_<n_clusters>` data directory below
target_cluster_id = 0   # cluster id handed to find_cluster_files
year = 2024
month = 5

# Derive the directory from n_clusters instead of hard-coding `cluster_4`, so the
# two settings cannot drift apart when n_clusters is changed.
# NOTE(review): assumes the directory suffix encodes n_clusters — confirm against the data layout.
consumption_data_root = Path('../../analisis_consumos/data/viviendas/por_mes_con_cluster')
consumption_data_dir = (consumption_data_root / f'cluster_{n_clusters}').resolve()

Search for the files that match the cluster information.

In [None]:
# Collect the files of the target cluster for the configured year and month,
# ordered so that the listing displayed below is stable.
cluster_files = sorted(
    find_cluster_files(
        consumption_data_dir,
        target_cluster_id,
        year=year,
        month=month,
    )
)
cluster_files

Set of housing units belonging to the cluster.

In [None]:
# Distinct housing-unit names derived from the cluster's files.
{get_housing_unit_name(path) for path in cluster_files}

Load cluster files into DataFrames.

In [None]:
csv_delimiter = ';'

# Read every cluster file, add the datetime field and 24 lagged consumption
# features, and tag each row with the housing unit the file belongs to.
data_frames = []
for cluster_file in cluster_files:
    df = (
        pd.read_csv(cluster_file, delimiter=csv_delimiter)
        .pipe(add_datetime_to_housing_unit_dataset)
        .pipe(lag_consumption_feature, n_lags=24)
    )
    df[AdditionalHousingUnitFields.HousingUnit] = get_housing_unit_name(cluster_file)
    data_frames.append(df)

# Stack all files into one frame ordered by datetime, then by housing unit.
sort_keys = [AdditionalHousingUnitFields.Datetime, AdditionalHousingUnitFields.HousingUnit]
cluster_data = pd.concat(data_frames)
cluster_data = cluster_data.sort_values(by=sort_keys).reset_index(drop=True)

# Preview for housing unit 'ATF': date, time, lags 8 down to 1, and the current
# consumption, keeping only rows where all of those columns are present.
lag_columns = [
    name_lagged_feature(HousingUnitColumns.Consumption, lag)
    for lag in reversed(range(1, 9))
]
preview_columns = (
    [HousingUnitColumns.Date, HousingUnitColumns.Time]
    + lag_columns
    + [HousingUnitColumns.Consumption]
)
is_atf_row = cluster_data[AdditionalHousingUnitFields.HousingUnit] == 'ATF'
cluster_data.loc[is_atf_row, preview_columns].dropna().head(n=24)