In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_14(x):
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    return rolling_mean(x, window_size=30)

In [3]:
def format_df_to_mlforecast(df, date_col, target_col, unique_id='mean'):
    df_ = df.rename({
        date_col: "ds",
        # target_col: 'y',
    }, axis=1)

    df_['ds'] = pd.to_datetime(df_['ds'])

    df_['y'] = df_[target_col].copy()
    # df_.drop(columns=target_col)

    df_['unique_id'] = unique_id
    return df_

In [4]:
selected_sensors_df = pd.read_csv("../data/selected_sensors2_cleaned.csv", index_col=0)

In [5]:
scenarios_sensors = {
    # 0: 1, 4372603
    # "0_12M_train_7M_test": {"train_start": "2017-03-25", "train_end": "2018-03-25", "test_start": "2018-03-26", "test_end": "2018-10-10"},
    '2': {
        "26M_train":  {"train_start": "2017-04-01", "train_end": "2019-06-01"},
        "24M_train":  {"train_start": "2017-04-01", "train_end": "2019-04-01"},
        "22M_train":  {"train_start": "2017-04-01", "train_end": "2019-02-01"},
        "20M_train":  {"train_start": "2017-04-01", "train_end": "2018-12-01"},
        "18M_train":  {"train_start": "2017-04-01", "train_end": "2018-10-01"},
        "12M_train":  {"train_start": "2017-04-01", "train_end": "2018-04-01"},
        # "10M_train":  {"train_start": "2017-04-01", "train_end": "2018-01-25"},
        # "8M_train":   {"train_start": "2017-04-01", "train_end": "2017-10-25"},
        },
}
scenarios_sensors['5'] = scenarios_sensors['2'].copy()
# scenarios_sensors['6'] = scenarios_sensors['2'].copy()

In [6]:
from MLForecastPipeline import *
from lightgbm import LGBMRegressor

In [7]:
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
def split_data(df, scenario, date_col="ds"):
    """Extracts train and test data based on train end date."""
    train_data = df[df[date_col] <= scenario['train_end']]
    test_start = pd.to_datetime(scenario['train_end']) + pd.Timedelta(days=1)
    test_data = df[df[date_col] >= test_start]
    return train_data, test_data

In [None]:
from joblib import Parallel, delayed
import time
from neuralforecast.auto import AutoLSTM
from neuralforecast.auto import AutoMLP
from neuralforecast.tsdataset import TimeSeriesDataset

def make_optuna_config_lstm(max_input_size):
    def optuna_config_lstm(trial):
        return {
            "input_size": trial.suggest_int("input_size", 7, max_input_size),
            "encoder_hidden_size": trial.suggest_categorical("encoder_hidden_size", [16, 32, 64, 128]),
            "encoder_n_layers": trial.suggest_int("encoder_n_layers", 1, 3),
            "decoder_hidden_size": trial.suggest_categorical("decoder_hidden_size", [16, 32, 64, 128]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "max_steps": trial.suggest_categorical("max_steps", [500, 1000]),
            "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64]),
            "loss": None,  # set externally
            "h": None,     # will be set externally too
            "random_seed": trial.suggest_int("random_seed", 1, 19),
            "start_padding_enabled": True
        }
    return optuna_config_lstm

def make_optuna_config_mlp(max_input_size):
    def optuna_config_mlp(trial):
        return {
            "input_size": trial.suggest_int("input_size", 7, max_input_size),
            "step_size": trial.suggest_int("step_size", 1, max_input_size),
            "hidden_size": trial.suggest_categorical("hidden_size", [256, 512, 1024]),
            "num_layers": trial.suggest_int("num_layers", 2, 5),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "scaler_type": trial.suggest_categorical("scaler_type", [None, "robust", "standard"]),
            "max_steps": trial.suggest_categorical("max_steps", [500, 1000]),
            "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128, 256]),
            "windows_batch_size": trial.suggest_categorical("windows_batch_size", [128, 256, 512, 1024]),
            "loss": None,
            "random_seed": trial.suggest_int("random_seed", 1, 19),
            "start_padding_enabled": True
        }
    return optuna_config_mlp


def count_metrics(model_name, params, test_df, y_hat, test_lengths):
    test_df_copy = test_df.copy()
    test_df_copy['forecast'] = y_hat

    error_dict = {}
    for test_length in test_lengths:
        eval_subset = test_df_copy.iloc[:test_length]  # Take subset for evaluation
        error_dict[f"test_{test_length}_days"] = mape_met(eval_subset['y'].values,  eval_subset['forecast'].values)

    monthly_error_dict = defaultdict(dict)
    test_df_copy['year'] = test_df_copy['ds'].dt.year
    test_df_copy['month'] = test_df_copy['ds'].dt.month

    # Group by year and month and calculate MAPE for each group
    grouped = test_df_copy.groupby(['year', 'month'])
    for (year, month), group in grouped:
        if not group.empty:
            monthly_error_dict[year][month] = mape_met(group['y'].values, group['forecast'].values)

    return {
        "Model": model_name,
        **error_dict,  # Expand error dictionary into separate columns
        **monthly_error_dict,
        "preds": test_df_copy['forecast'].values,
        "params": params,
    }

def process_scenario(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=[0.33, 0.66, 1]):
    """ Process each scenario independently and save results. """
    print(f'{sensor_name}_{scenario_name}')

    results = []
    
    formatted_df = format_df_to_mlforecast(selected_sensors_df[['full_date', sensor_name]], 'full_date', sensor_name, unique_id=sensor_name)
    formatted_df = formatted_df[['ds', 'y', 'unique_id']]
    
    train_df, test_df = split_data(formatted_df, scenario)
    train_df['ds'] = pd.to_datetime(train_df['ds']) 
    test_df['ds'] = pd.to_datetime(test_df['ds']) 

    train_dataset, indices, dates, ds_arr = TimeSeriesDataset.from_df(
        df=train_df,
        id_col="unique_id",
        time_col="ds",
        target_col="y"
    )

    horizon = len(test_df)
    test_lengths = list(range(30, 181, 30)) + [240, 300, 360, 480, 600, 720, horizon]  # Days-based segmentation

    # max_input_size = train_df.groupby("unique_id").size().min() - 1
    # max_input_size = max(max_input_size, 7)  # Ensure it's not below lower bound

    # Max possible input size = min_train_len - h - min_required_buffer
    min_train_len = train_df.groupby("unique_id").size().min()
    min_required_buffer = 20  # for sanity

    safe_max_input_size = max(min_train_len - horizon - min_required_buffer, 7)

    if safe_max_input_size < 7:
        print(f"Skipping scenario {sensor_name}_{scenario_name} due to insufficient training length.")
        return []

    optuna_config = make_optuna_config_lstm(safe_max_input_size)
    model_lstm = AutoLSTM(h=horizon, num_samples=20, backend='optuna', config=optuna_config)
    model_lstm.fit(dataset=train_dataset, distributed_config=None)
    y_hat = model_lstm.predict(dataset=train_dataset)
    results.append(count_metrics('lstm', model_lstm.model.hparams, test_df, y_hat, test_lengths))

    optuna_config = make_optuna_config_mlp(safe_max_input_size)
    model_mlp = AutoMLP(h=horizon, num_samples=20, backend='optuna', config=optuna_config)
    model_mlp.fit(dataset=train_dataset, distributed_config=None)
    y_hat = model_mlp.predict(dataset=train_dataset)
    results.append(count_metrics('mlp', model_mlp.model.hparams, test_df, y_hat, test_lengths))

    save_results(results, f"results/run_20/{sensor_name}_{scenario_name}.csv")

    return results

def run_all_scenarios_parallel(scenarios_sensors, selected_sensors_df, models, lag_transforms_options, ratios=[0.33, 0.66, 1]):
    # don't use all cpus (instead all but one)
    results = Parallel(n_jobs=-1, verbose=30)( 
        delayed(process_scenario)(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=ratios)
        for sensor_name, scenarios in scenarios_sensors.items()
        for scenario_name, scenario in scenarios.items()
    )

    return results

2025-04-16 16:33:02,916	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-04-16 16:33:03,200	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [9]:
results = run_all_scenarios_parallel(scenarios_sensors, selected_sensors_df, models=None, lag_transforms_options=None)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


RuntimeError: The size of tensor a (676) must match the size of tensor b (60) at non-singleton dimension 1

In [None]:
import multiprocessing
cpu_count = multiprocessing.cpu_count()
cpu_count

In [None]:
# Loop through scenarios and evaluate models
results = []

for sensor_name, scenarios in scenarios_sensors.items():
    formatted_df = format_df_to_mlforecast(selected_sensors_df[['full_date', sensor_name]], 'full_date', sensor_name, unique_id=sensor_name)
    formatted_df = formatted_df[['ds', 'y', 'unique_id']]

    for scenario_name, scenario in scenarios.items():

        train_df, test_df = split_data(formatted_df, scenario)

        optimal_lags_list = get_optimal_lags(train_df, 'y', 
                                            # ratios=[1]
                                            ratios=[0.33, 0.66, 1]
                                            #  ratios=[0.25, 0.5, 0.75, 1]
        )
        target_transforms = get_dynamic_transforms(train_df)
        results = evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, optimal_lags_list, winter_weights=True)

        save_results(results, f"results/run_13/{sensor_name}_{scenario_name}.csv") 