In [None]:
from datetime import timezone
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go

In [None]:
from tfmmeteogalicia.dataset import load_wrf_hist_dataset
from tfmmeteogalicia.thredds_wrf import MeteoGaliciaNetCDFSubsetColumns, ThreddsWRFDomain, ThreddsWRFServerRun
from tfmforecasting.dataset import AdditionalHousingUnitFields, HousingUnitColumns
from tfmforecasting.preprocessing import (
    add_datetime_to_housing_unit_dataset, lag_consumption_feature
)
from tfmforecasting.utils import find_cluster_files, get_housing_unit_name

## Configuration

### General

In [None]:
# Calendar month under analysis; both the weather and the consumption data
# below are loaded for this (year, month) pair.
year, month = 2024, 5

### MeteoGalicia

In [None]:
# WRF domain and server run identifiers passed to `load_wrf_hist_dataset`
# in the Extract & Load section.
domain: ThreddsWRFDomain = "d02"
server_run: ThreddsWRFServerRun = "0000"
# Directory holding the WRF historical data. The relative path is resolved
# against the current working directory at the time this cell runs.
weather_data_dir = Path("../data/meteogalicia/thredds/wrf_hist").resolve()

Maximum and minimum temperatures used as min-max normalization bounds for the weather data ([Source](https://metar-taf.com/es/temperatures/provincia-de-zaragoza))

In [None]:
# Bounds used to min-max scale the temperature column in the Transform section.
# The literals are shifted by 273.15 (the Celsius-to-Kelvin offset).
# NOTE(review): presumably the WRF temperature column is in Kelvin — confirm.
kelvin_offset = 273.15
max_temp = 34.2 + kelvin_offset
min_temp = 4 + kelvin_offset

### Housing Units

In [None]:
# Cluster whose housing units are analysed, and the directory with the
# per-month consumption files.
# NOTE(review): `cluster_id` is 0 while the directory name ends in `cluster_4`;
# confirm that the suffix is not meant to track `cluster_id`.
cluster_id = 0
consumption_data_dir = Path(
    '../../analisis_consumos/data/viviendas/por_mes_con_cluster/cluster_4'
).resolve()

## Extract & Load

In [None]:
# First and last calendar day of the selected month, in UTC.
# The last day is derived with a one-month offset from `start_date` rather than
# building a timestamp with `month + 1`, which raises for December (month 13).
start_date = pd.Timestamp(year=year, month=month, day=1, tz=timezone.utc)
end_date = start_date + pd.DateOffset(months=1) - pd.Timedelta(days=1)
dates = pd.date_range(start=start_date, end=end_date)

# Load one dataset per day and keep only the rows whose timestamp is strictly
# earlier than 01:00 of the following day, i.e. up to and including 00:00 of
# the next day.
# NOTE(review): if a day's dataset also contains its own 00:00 row, midnight
# rows will appear twice after concatenation — confirm against the data.
weather_by_days = []
for date in dates:
    day_df = load_wrf_hist_dataset(weather_data_dir, domain, server_run, date)
    cutoff = date + pd.Timedelta(days=1, hours=1)
    within_day = day_df[MeteoGaliciaNetCDFSubsetColumns.DATE] < cutoff
    weather_by_days.append(day_df[within_day].copy().reset_index(drop=True))

# The per-day indices are kept as-is, so index labels repeat across days.
weather_data = pd.concat(weather_by_days)
weather_data

In [None]:
# Locate the consumption files of the selected cluster for the month and
# process them in sorted order.
cluster_files = find_cluster_files(consumption_data_dir, cluster_id, year=year, month=month)
cluster_files.sort()
csv_delimiter = ';'

# Build one frame per file: parse the CSV, add the datetime column, apply the
# 24-step lag helper and tag the rows with the housing unit name.
# NOTE(review): the column selection below keeps only datetime, housing unit
# and consumption, so any lag columns are discarded — confirm whether the lag
# call is still needed (e.g. for a row-dropping side effect).
data_frames = []
for cluster_file in cluster_files:
    unit_df = pd.read_csv(cluster_file, delimiter=csv_delimiter)
    unit_df = add_datetime_to_housing_unit_dataset(unit_df)
    unit_df = lag_consumption_feature(unit_df, n_lags=24)
    unit_df[AdditionalHousingUnitFields.HousingUnit] = get_housing_unit_name(cluster_file)
    data_frames.append(unit_df)

# Stack all units, order by (datetime, housing unit) and keep the three
# columns used downstream.
sort_keys = [AdditionalHousingUnitFields.Datetime, AdditionalHousingUnitFields.HousingUnit]
kept_columns = sort_keys + [HousingUnitColumns.Consumption]
cluster_data = (
    pd.concat(data_frames)
    .sort_values(by=sort_keys)
    .reset_index(drop=True)[kept_columns]
    .copy()
)

# Min-max scale consumption using the global extremes across all housing units.
consumption = cluster_data[HousingUnitColumns.Consumption]
consumption_min = consumption.min()
consumption_max = consumption.max()
cluster_data[HousingUnitColumns.Consumption] = (consumption - consumption_min) / (consumption_max - consumption_min)
cluster_data

Obtain the housing units associated with the cluster.

In [None]:
# Distinct housing unit names, one derived from each cluster file.
housing_units = {get_housing_unit_name(path) for path in cluster_files}
housing_units

## Transform

In [None]:
# Min-max scale the temperature column with the configured bounds.
# The column is overwritten in place, so re-running this cell without
# reloading `weather_data` scales the already-scaled values again.
temp_col = MeteoGaliciaNetCDFSubsetColumns.TEMP
temp_range = max_temp - min_temp
weather_data[temp_col] = (weather_data[temp_col] - min_temp) / temp_range
weather_data

In [None]:
# Overlay the min-max scaled temperature series with the min-max scaled
# consumption of every housing unit in the cluster.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=weather_data[MeteoGaliciaNetCDFSubsetColumns.DATE],
        y=weather_data[MeteoGaliciaNetCDFSubsetColumns.TEMP],
        mode='lines+markers',
        name=MeteoGaliciaNetCDFSubsetColumns.TEMP,
    )
)
# One trace per housing unit, added in sorted order so the legend is stable.
for housing_unit in sorted(housing_units):
    # Filter once per unit instead of once per axis.
    unit_rows = cluster_data[cluster_data[AdditionalHousingUnitFields.HousingUnit] == housing_unit]
    fig.add_trace(
        go.Scatter(
            x=unit_rows[AdditionalHousingUnitFields.Datetime],
            y=unit_rows[HousingUnitColumns.Consumption],
            mode='lines+markers',
            name=housing_unit,
        )
    )
# Label the figure so it can be read on its own; both series are min-max scaled.
fig.update_layout(
    title=dict(text='Normalized temperature and housing unit consumption'),
    xaxis=dict(title=dict(text='Date')),
    yaxis=dict(title=dict(text='Normalized value (min-max)')),
)
fig.show()