In [14]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [15]:
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_14(x):
    """Return the rolling mean of ``x`` computed with a window size of 14.

    Jitted wrapper around ``rolling_mean`` that pins ``window_size`` so the
    function can be referenced as a plain one-argument callable.
    """
    smoothed = rolling_mean(x, window_size=14)
    return smoothed
@njit
def rolling_mean_30(x):
    """Return the rolling mean of ``x`` computed with a window size of 30.

    Jitted wrapper around ``rolling_mean`` that pins ``window_size`` so the
    function can be referenced as a plain one-argument callable.
    """
    smoothed = rolling_mean(x, window_size=30)
    return smoothed

In [16]:
# Load the cleaned sensor table; the relative path is resolved against the current working directory.
# NOTE(review): no parse_dates is passed, so the "date" column used by the split helpers
# is read as plain strings — confirm the downstream pipeline expects that.
selected_sensors1_df = pd.read_csv("../data/selected_sensors1_cleaned.csv")

In [17]:
# Cluster index -> list of integer ids assigned to that cluster.
# NOTE(review): the ids are presumably sensor identifiers from the sensor table — confirm.
clusters_0 = {
    0: [1, 4372603],
    1: [2],
    2: [5], 
    3: [6],
    4: [9],
    5: [12],
}

In [18]:
# Train/test date windows keyed by scenario name.
# Every entry holds four ISO "YYYY-MM-DD" strings: train_start, train_end, test_start, test_end.
# Each test window starts the day after its train window ends.
scenarios = {
    # 0: 1, 4372603  (same ids as the clusters_0[0] entry)
    "0_12M_train_7M_test": {"train_start": "2017-03-25", "train_end": "2018-03-25", "test_start": "2018-03-26", "test_end": "2018-10-10"},
    "0_10M_train_9M_test": {"train_start": "2017-03-25", "train_end": "2018-01-25", "test_start": "2018-01-26", "test_end": "2018-10-10"},
    # Fixed: train_end was "2017-10-25", i.e. only 7 months of training. The scenario label
    # and the two sibling splits above (train + test ~= 18.5 months) imply an 8-month window.
    "0_8M_train_11M_test": {"train_start": "2017-03-25", "train_end": "2017-11-25", "test_start": "2017-11-26", "test_end": "2018-10-10"},

    # Non-Heating Periods
    "0_NonHeating_3M_train_3M_test": {"train_start": "2017-04-15", "train_end": "2017-07-15", "test_start": "2017-07-16", "test_end": "2017-10-01"},
    "0_NonHeating_4M_train_2M_test": {"train_start": "2017-04-15", "train_end": "2017-08-15", "test_start": "2017-08-16", "test_end": "2017-10-01"},
    "0_NonHeating_2M_train_4M_test": {"train_start": "2017-04-15", "train_end": "2017-06-15", "test_start": "2017-06-16", "test_end": "2017-10-01"},
    "0_NonHeating_1M_train_5M_test": {"train_start": "2017-04-15", "train_end": "2017-05-15", "test_start": "2017-05-16", "test_end": "2017-10-01"},

    # Heating Periods
    "0_Heating_5M_train_1Y_test": {"train_start": "2017-06-01", "train_end": "2017-11-01", "test_start": "2017-11-02", "test_end": "2018-10-10"},
    "0_Heating_3M_train_1Y_test": {"train_start": "2017-07-01", "train_end": "2017-10-10", "test_start": "2017-10-11", "test_end": "2018-10-10"},
}


In [19]:
from MLForecastPipeline import *

In [None]:
def get_seasonal_data(df, start_date, end_date):
    """Return the rows of ``df`` whose ``date`` falls within ``[start_date, end_date]``.

    Both bounds are inclusive. The ``date`` column is parsed with
    ``pd.to_datetime`` before comparing, so string columns are compared
    chronologically rather than lexicographically. Without this, a value such
    as ``"2018-03-25 00:00:00"`` sorts after the bound ``"2018-03-25"`` and the
    end date is silently dropped.

    Args:
        df: DataFrame with a ``date`` column (date strings or datetimes).
        start_date: Lower bound, e.g. ``"2017-03-25"``.
        end_date: Upper bound, e.g. ``"2018-03-25"``. A date-only bound is
            interpreted as midnight of that day.

    Returns:
        The filtered rows of ``df``; the ``date`` column itself is left
        unmodified (it is not converted in the returned frame).

    Raises:
        KeyError: If ``df`` has no ``date`` column.
    """
    # Parse into a temporary Series only; the caller's frame keeps its dtype.
    dates = pd.to_datetime(df["date"])
    in_window = (dates >= start_date) & (dates <= end_date)
    return df[in_window]

def split_data(df, scenario):
    """Split ``df`` into the train and test frames described by ``scenario``.

    ``scenario`` is a mapping providing ``train_start``/``train_end`` and
    ``test_start``/``test_end``; each pair is forwarded to
    ``get_seasonal_data``. Returns a ``(train_data, test_data)`` tuple.
    """
    train_window = (scenario["train_start"], scenario["train_end"])
    test_window = (scenario["test_start"], scenario["test_end"])
    return get_seasonal_data(df, *train_window), get_seasonal_data(df, *test_window)

# Seed passed to the estimators that accept one, so repeated runs of the notebook
# produce the same fitted models and therefore the same saved scores.
RANDOM_SEED = 42

# Candidate regressors keyed by name; the dict is handed to evaluate_models.
# The estimator classes arrive through the star import of MLForecastPipeline.
# NOTE(review): presumably the xgboost / scikit-learn classes of the same names — confirm
# (both accept a `random_state` keyword). Ridge and Lasso are left at their defaults.
models = {
    "XGBRegressor": XGBRegressor(random_state=RANDOM_SEED),
    "SGDRegressor": SGDRegressor(random_state=RANDOM_SEED),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}

# Define lag transformations
#
# Each option is a dict from an integer key to a list of window functions
# (expanding_mean or one of the fixed-window rolling means defined in this notebook).
# The whole list is passed to evaluate_models, which is defined outside this notebook.
# NOTE(review): presumably key = lag and the dicts follow mlforecast's lag_transforms
# layout, with one option tried per fit — confirm against evaluate_models.

lag_transforms_options = [
    {1: [expanding_mean], 7: [rolling_mean_14], 30: [expanding_mean]},
    {1: [rolling_mean_14], 7: [rolling_mean_30], 30: [expanding_mean]},
    # Disabled alternatives, kept for reference:
    # {1: [rolling_mean_14], 30: [expanding_mean]},
    # {1: [rolling_mean_14]},
    # {},
]

In [None]:
# Loop through scenarios and evaluate models, writing one CSV per scenario.

# The CSVs are written under results/, so create the folder up front;
# this is a no-op when it already exists.
Path("results").mkdir(parents=True, exist_ok=True)

# Keeps every scenario's output. The previous `results = []` was overwritten on
# each iteration, so only the last scenario remained available after the loop.
results_by_scenario = {}
for scenario_name, scenario in scenarios.items():
    # Fixed: `selected_sensors1_df[0]` asked the DataFrame for a column labelled 0, which a
    # CSV loaded with a header row does not have (KeyError), and a single column would lack
    # the "date" column that split_data filters on. The full frame is passed instead.
    # NOTE(review): if only the cluster-0 ids should be used, filter the frame by them here.
    train_df, test_df = split_data(selected_sensors1_df, scenario)

    # The helpers below come from MLForecastPipeline; lags and target transforms
    # are derived from the training window only.
    optimal_lags_list = get_optimal_lags(train_df, "y")
    target_transforms = get_dynamic_transforms(train_df)
    results = evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, optimal_lags_list)
    results_by_scenario[scenario_name] = results

    save_results(results, f"results/{scenario_name}.csv")