In [None]:
from datetime import timezone
from pathlib import Path

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.model_selection import TimeSeriesSplit, validation_curve
from sklearn.svm import SVR

In [None]:
from tfmmeteogalicia.dataset import load_wrf_hist_dataset
from tfmmeteogalicia.thredds_wrf import MeteoGaliciaNetCDFSubsetColumns, ThreddsWRFDomain, ThreddsWRFServerRun
from tfmforecasting.dataset import AdditionalHousingUnitFields, HousingUnitColumns
from tfmforecasting.preprocessing import (
    add_datetime_to_housing_unit_dataset, lag_consumption_feature, name_lagged_feature
)
from tfmforecasting.utils import find_cluster_files, get_housing_unit_name

In [None]:
# Month of data analysed by this notebook.
# The grid of SVR `C` values is defined once, in the cell right before the
# validation curve; an earlier, differently-sized definition here was never
# read before being overwritten, so it has been removed.
year = 2024
month = 5

In [None]:
# Directory handed to load_wrf_hist_dataset, resolved to an absolute path.
weather_data_dir = Path("../data/meteogalicia/thredds/wrf_hist").resolve()
# WRF server run and domain identifiers passed along with the directory
# when each day's weather frame is loaded.
server_run: ThreddsWRFServerRun = "0000"
domain: ThreddsWRFDomain = "d02"

In [None]:
# First and last calendar day (UTC) of the configured month.
start_date = pd.Timestamp(year=year, month=month, day=1, tz=timezone.utc)
# Jump to the first day of the following month with a calendar offset and step
# back one day. Constructing a Timestamp from `month + 1` would raise for
# month == 12; the offset rolls over into the next year correctly.
end_date = start_date + pd.DateOffset(months=1) - pd.Timedelta(days=1)
dates = pd.date_range(start=start_date, end=end_date)

# One frame per day, as returned by load_wrf_hist_dataset for that date.
weather_by_days = [
    load_wrf_hist_dataset(weather_data_dir, domain, server_run, date)
    for date in dates
]
# Keep only the rows dated strictly before 01:00 of the following day.
# NOTE(review): presumably each daily file also contains later forecast hours
# and this trims them so consecutive days do not overlap — confirm against the
# loader's output.
weather_by_days = [
    df[df[MeteoGaliciaNetCDFSubsetColumns.DATE] < date + pd.Timedelta(days=1, hours=1)].copy().reset_index(drop=True)
    for df, date in zip(weather_by_days, dates)
]
# Every per-day frame was re-indexed from 0 above, so build a fresh unique
# index for the concatenated table instead of repeating 0..k for each day.
weather_data = pd.concat(weather_by_days, ignore_index=True)
# Rename the date column so it matches the key used by the consumption data.
weather_data = weather_data.rename(columns={MeteoGaliciaNetCDFSubsetColumns.DATE: AdditionalHousingUnitFields.Datetime})
weather_data.head(n=4)

In [None]:
# Cluster to analyse and the directory its per-month consumption files are searched in.
cluster_id = 0
consumption_data_dir = Path('../../analisis_consumos/data/viviendas/por_mes_con_cluster/cluster_4').resolve()

# Number of lags requested from lag_consumption_feature when the files are loaded.
n_lags = 48
# Names of the lagged consumption columns used as model inputs (lags 24..31),
# ordered in descending name order.
lagged_features = sorted(
    (name_lagged_feature(HousingUnitColumns.Consumption, lag) for lag in range(24, 32)),
    reverse=True,
)

In [None]:
# Files of the selected cluster for the configured year/month, in sorted order.
cluster_files = find_cluster_files(consumption_data_dir, cluster_id, year=year, month=month)
cluster_files.sort()

# Distinct housing-unit names derived from the file names; the count is used
# later to size the cross-validation folds.
housing_units = {get_housing_unit_name(path) for path in cluster_files}
n_housing_units = len(housing_units)
housing_units

In [None]:
csv_delimiter = ';'

# Load each cluster file, pass it through the datetime and lagging helpers,
# and tag its rows with the housing unit the file belongs to. Lagging is done
# per file, before the frames are stacked together.
data_frames = []
for cluster_file in cluster_files:
    unit_df = pd.read_csv(cluster_file, delimiter=csv_delimiter)
    unit_df = add_datetime_to_housing_unit_dataset(unit_df)
    unit_df = lag_consumption_feature(unit_df, n_lags=n_lags)
    unit_df[AdditionalHousingUnitFields.HousingUnit] = get_housing_unit_name(cluster_file)
    data_frames.append(unit_df)

# Columns that identify a row, and the consumption columns kept for modelling.
sort_keys = [AdditionalHousingUnitFields.Datetime, AdditionalHousingUnitFields.HousingUnit]
consumption_columns = lagged_features + [HousingUnitColumns.Consumption]

# Stack all units, order by (datetime, housing unit), keep only the needed
# columns and drop rows with any missing value among them.
cluster_data = pd.concat(data_frames)
cluster_data = cluster_data.sort_values(by=sort_keys).reset_index(drop=True)
cluster_data = cluster_data[sort_keys + consumption_columns].copy().dropna().reset_index(drop=True)

# Preview the first rows of one housing unit.
is_atf = cluster_data[AdditionalHousingUnitFields.HousingUnit] == 'ATF'
cluster_data[is_atf].head(n=4)

In [None]:
# Attach the temperature column to every consumption row by joining on the
# datetime column (default inner join: rows without a match are dropped).
weather_subset = weather_data[[AdditionalHousingUnitFields.Datetime, MeteoGaliciaNetCDFSubsetColumns.TEMP]]
merged_df = cluster_data.merge(weather_subset, on=AdditionalHousingUnitFields.Datetime)
# Restore the (datetime, housing unit) ordering and a clean 0..n-1 index.
merged_df = merged_df.sort_values(
    by=[AdditionalHousingUnitFields.Datetime, AdditionalHousingUnitFields.HousingUnit]
).reset_index(drop=True)
merged_df.head(n=4)

In [None]:
# Model target, and model inputs: the lagged consumption columns followed by temperature.
target = HousingUnitColumns.Consumption
features = [*lagged_features, MeteoGaliciaNetCDFSubsetColumns.TEMP]
features, target

In [None]:
# Fixed bounds used to min-max scale the temperature feature.
# NOTE(review): the + 273.15 suggests Celsius limits converted to Kelvin; the
# origin of the 4 and 34.2 limits is not shown in this notebook — confirm.
kelvin_offset = 273.15
min_temp = 4 + kelvin_offset
max_temp = 34.2 + kelvin_offset
min_temp, max_temp

In [None]:
# Bounds used to min-max scale the lagged consumption features.
# NOTE(review): these are taken over every row of merged_df, which also
# contains the day that is later used for evaluation — confirm this is intended.
consumption_values = merged_df[HousingUnitColumns.Consumption]
min_consumption = consumption_values.min()
max_consumption = consumption_values.max()
min_consumption, max_consumption

In [None]:
window_size = 21 # days
# Day to be predicted (midnight UTC) and the start of the window preceding it.
target_day = pd.Timestamp(year=year, month=month, day=27, tz=timezone.utc)
window_start = target_day - pd.Timedelta(days=window_size)

# Select rows from one day before the window start up to (not including) the
# day after the target day.
one_day = pd.Timedelta(days=1)
timestamps = merged_df[AdditionalHousingUnitFields.Datetime]
in_window = (timestamps >= window_start - one_day) & (timestamps < target_day + one_day)
sliced_df = merged_df[in_window].copy().reset_index(drop=True)

# Min-max scale the lagged consumption columns that are used as features,
# all at once, with the consumption bounds computed earlier.
consumption_span = max_consumption - min_consumption
scaled_lags = [name for name in lagged_features if name in features]
sliced_df[scaled_lags] = (sliced_df[scaled_lags] - min_consumption) / consumption_span

# Min-max scale temperature with the fixed temperature bounds.
temperature = sliced_df[MeteoGaliciaNetCDFSubsetColumns.TEMP]
sliced_df[MeteoGaliciaNetCDFSubsetColumns.TEMP] = (temperature - min_temp) / (max_temp - min_temp)
sliced_df.shape

In [None]:
# Feature matrix and target vector as NumPy arrays; the target is left unscaled.
feature_frame = sliced_df[features]
X = feature_frame.to_numpy()
y = sliced_df[HousingUnitColumns.Consumption].to_numpy()
feature_frame

In [None]:
# Two time-ordered splits. Each test fold has 24 rows per housing unit and each
# training fold is capped at window_size times that many rows.
rows_per_day = n_housing_units * 24
tscv = TimeSeriesSplit(
    n_splits=2,
    test_size=rows_per_day,
    max_train_size=rows_per_day * window_size,
)
# Print the row indices of every fold as a sanity check.
for i, (train_index, test_index) in enumerate(tscv.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

In [None]:
# Candidate values for the SVR regularisation parameter C:
# 16 points spaced evenly on a log scale from 1e-8 to 1e4.
Cs = np.logspace(start=-8, stop=4, num=16)
Cs

In [None]:
# RBF-kernel SVR whose C parameter is swept over the grid in `Cs`.
model = SVR(kernel='rbf', gamma='scale', epsilon=0.01)
# Train/test scores for every C on every split of `tscv`. The scorer is the
# negated MAPE, so values closer to zero are better.
train_scores, test_scores = validation_curve(
    model,
    X,
    y,
    param_name="C",
    param_range=Cs,
    scoring='neg_mean_absolute_percentage_error',
    cv=tscv,
    n_jobs=8,
)

In [None]:
# Use only column 1 of the score arrays (the second cross-validation split)
# and turn the negated MAPE fractions into positive percentages.
results_df = pd.DataFrame({
    "C": Cs,
    "test_scores": test_scores[:, 1] * -100,
    "train_scores": train_scores[:, 1] * -100,
})

# One line per score series, test first and then train, against C.
fig = go.Figure()
for column, label in (("test_scores", "Test scores"), ("train_scores", "Train scores")):
    fig.add_trace(
        go.Scatter(
            x=results_df["C"],
            y=results_df[column],
            mode="lines+markers",
            name=label,
        )
    )
# C spans many orders of magnitude, hence the logarithmic x axis.
fig.update_xaxes(title_text="C", type="log")
fig.update_yaxes(title_text="MAPE")
fig.show()