In [1]:
from pathlib import Path
import pandas as pd

In [2]:
from tfmforecasting.dataset import AdditionalHousingUnitFields, HousingUnitColumns
from tfmforecasting.preprocessing import (
    add_datetime_to_housing_unit_dataset, lag_consumption_feature, name_lagged_feature
)
from tfmforecasting.utils import find_cluster_files, get_housing_unit_name

## Cluster information

In [3]:
# Clustering run (number of clusters, k) and the cluster to analyse within it.
n_clusters = 4
target_cluster_id = 0
# Period of the consumption files to load.
year = 2024
month = 5
# The data directory is named after the number of clusters, so derive it from
# `n_clusters` instead of hard-coding `cluster_4`; the two settings can no longer
# drift apart when `n_clusters` is changed.
consumption_data_dir = (
    Path('../../analisis_consumos/data/viviendas/por_mes_con_cluster') / f'cluster_{n_clusters}'
).resolve()

Search for the files that match the cluster information.

In [4]:
# Collect the files matching the selected cluster and period, in sorted order,
# then display the resulting list.
cluster_files = sorted(
    find_cluster_files(consumption_data_dir, target_cluster_id, year=year, month=month)
)
cluster_files

[PosixPath('/Users/juanhernandez/Developer/EHU/github/tfm/analisis_consumos/data/viviendas/por_mes_con_cluster/cluster_4/ATF_2024_05_cluster_k4_id_0.csv'),
 PosixPath('/Users/juanhernandez/Developer/EHU/github/tfm/analisis_consumos/data/viviendas/por_mes_con_cluster/cluster_4/JACL_2024_05_cluster_k4_id_0.csv'),
 PosixPath('/Users/juanhernandez/Developer/EHU/github/tfm/analisis_consumos/data/viviendas/por_mes_con_cluster/cluster_4/JLG_2024_05_cluster_k4_id_0.csv'),
 PosixPath('/Users/juanhernandez/Developer/EHU/github/tfm/analisis_consumos/data/viviendas/por_mes_con_cluster/cluster_4/RFM_2024_05_cluster_k4_id_0.csv')]

Set of housing units belonging to the cluster.

In [5]:
# Distinct housing-unit names found among the cluster files.
{get_housing_unit_name(cluster_file) for cluster_file in cluster_files}

{'ATF', 'JACL', 'JLG', 'RFM'}

Load cluster files into DataFrames.

In [6]:
csv_delimiter = ';'
# Number of lagged consumption columns requested from `lag_consumption_feature`.
n_lags = 24
# Settings for the preview table displayed at the end of this cell.
preview_housing_unit = 'ATF'
n_preview_lags = 8
n_preview_rows = 24

# Fail early with an actionable message: with no matching files, `pd.concat`
# would otherwise raise a bare "No objects to concatenate".
if not cluster_files:
    raise ValueError(
        f'No cluster files found in {consumption_data_dir} '
        f'(cluster id={target_cluster_id}, year={year}, month={month}).'
    )

data_frames = []
for cluster_file in cluster_files:
    df = pd.read_csv(cluster_file, delimiter=csv_delimiter)
    df = add_datetime_to_housing_unit_dataset(df)
    # Lagging is applied per file, before concatenation — presumably so that one
    # housing unit's lag columns are never built from another unit's rows.
    df = lag_consumption_feature(df, n_lags=n_lags)
    # Tag every row with the housing unit its file belongs to.
    df[AdditionalHousingUnitFields.HousingUnit] = get_housing_unit_name(cluster_file)
    data_frames.append(df)

# Stack all housing units and order the rows by datetime, then by housing unit.
cluster_data = (
    pd.concat(data_frames)
    .sort_values(by=[AdditionalHousingUnitFields.Datetime, AdditionalHousingUnitFields.HousingUnit])
    .reset_index(drop=True)
)

# Preview: date, time, the most recent lags (oldest first) and the current
# consumption for one housing unit, skipping rows with missing values in
# any of the selected columns.
preview_columns = (
    [HousingUnitColumns.Date, HousingUnitColumns.Time]
    + [name_lagged_feature(HousingUnitColumns.Consumption, lag) for lag in range(n_preview_lags, 0, -1)]
    + [HousingUnitColumns.Consumption]
)
is_preview_unit = cluster_data[AdditionalHousingUnitFields.HousingUnit] == preview_housing_unit
cluster_data.loc[is_preview_unit, preview_columns].dropna().head(n=n_preview_rows)

Unnamed: 0,date,time,consumptionKWh_08,consumptionKWh_07,consumptionKWh_06,consumptionKWh_05,consumptionKWh_04,consumptionKWh_03,consumptionKWh_02,consumptionKWh_01,consumptionKWh
32,2024-05-01,9:00,0.16,0.131,0.128,0.095,0.069,0.069,0.073,0.102,0.129
36,2024-05-01,10:00,0.131,0.128,0.095,0.069,0.069,0.073,0.102,0.129,0.091
40,2024-05-01,11:00,0.128,0.095,0.069,0.069,0.073,0.102,0.129,0.091,0.073
44,2024-05-01,12:00,0.095,0.069,0.069,0.073,0.102,0.129,0.091,0.073,0.07
48,2024-05-01,13:00,0.069,0.069,0.073,0.102,0.129,0.091,0.073,0.07,0.1
52,2024-05-01,14:00,0.069,0.073,0.102,0.129,0.091,0.073,0.07,0.1,0.195
56,2024-05-01,15:00,0.073,0.102,0.129,0.091,0.073,0.07,0.1,0.195,0.477
60,2024-05-01,16:00,0.102,0.129,0.091,0.073,0.07,0.1,0.195,0.477,0.129
64,2024-05-01,17:00,0.129,0.091,0.073,0.07,0.1,0.195,0.477,0.129,0.19
68,2024-05-01,18:00,0.091,0.073,0.07,0.1,0.195,0.477,0.129,0.19,0.14
