In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from MLForecastPipeline import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_14(x):
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    return rolling_mean(x, window_size=30)

In [3]:
def format_df_to_mlforecast(df, date_col, target_col, unique_id='mean'):
    df_ = df.rename({
        date_col: "ds",
        # target_col: 'y',
    }, axis=1)

    df_['ds'] = pd.to_datetime(df_['ds'])

    df_['y'] = df_[target_col].copy()
    # df_.drop(columns=target_col)

    df_['unique_id'] = unique_id
    return df_

In [4]:
selected_sensors_df = pd.read_csv("../data/selected_sensors2_cleaned.csv", index_col=0)

In [5]:
scenarios_sensors = {
    # 0: 1, 4372603
    # "0_12M_train_7M_test": {"train_start": "2017-03-25", "train_end": "2018-03-25", "test_start": "2018-03-26", "test_end": "2018-10-10"},
    '2': {
        "26M_train":  {"train_start": "2017-04-01", "train_end": "2019-06-01"},
        "24M_train":  {"train_start": "2017-04-01", "train_end": "2019-04-01"},
        "22M_train":  {"train_start": "2017-04-01", "train_end": "2019-02-01"},
        "20M_train":  {"train_start": "2017-04-01", "train_end": "2018-12-01"},
        "18M_train":  {"train_start": "2017-04-01", "train_end": "2018-10-01"},
        "12M_train":  {"train_start": "2017-04-01", "train_end": "2018-04-01"},
        # "10M_train":  {"train_start": "2017-04-01", "train_end": "2018-01-25"},
        # "8M_train":   {"train_start": "2017-04-01", "train_end": "2017-10-25"},
        },
}
scenarios_sensors['5'] = scenarios_sensors['2'].copy()
# scenarios_sensors['6'] = scenarios_sensors['2'].copy()

In [6]:
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
def split_data(df, scenario, date_col="ds"):
    """Extracts train and test data based on train end date."""
    train_data = df[df[date_col] <= scenario['train_end']]
    test_start = pd.to_datetime(scenario['train_end']) + pd.Timedelta(days=1)
    test_data = df[df[date_col] >= test_start]
    return train_data, test_data

In [7]:
import shutil
import os

import logging
logging.getLogger("lightning").setLevel(logging.ERROR)

from pytorch_lightning import Trainer

# where you used to write your logs
BASE_LOG_DIR = r"C:\Users\PC314\Documents\tair\pm25\code\lightning_logs"
BASE_LOG_DIR = r"C:\Users\77019\Desktop\code\raka\airkaz\code\lightning_logs"

if os.path.exists(BASE_LOG_DIR):
    shutil.rmtree(BASE_LOG_DIR)

def make_trainer():
    pid = os.getpid()
    proc_dir = os.path.join(BASE_LOG_DIR, f"proc_{pid}")
    # start clean
    if os.path.exists(proc_dir):
        shutil.rmtree(proc_dir)
    # disable all Lightning loggers and checkpointing, write into proc_dir
    return Trainer(
        default_root_dir=proc_dir,
        logger=False,                # no TensorBoard/CSV loggers :contentReference[oaicite:0]{index=0}
        enable_checkpointing=False,  # skip ckpts if you want; omit if you do need them
    )

from lightning.pytorch.loggers import TensorBoardLogger
def make_trainer_kwargs():
    pid = os.getpid()
    proc_dir = os.path.join(BASE_LOG_DIR, f"proc_{pid}")
    if os.path.exists(proc_dir):
        shutil.rmtree(proc_dir)
    tb_logger = TensorBoardLogger(
        save_dir=BASE_LOG_DIR,
        name="lightning_logs",
        version=f"proc_{pid}"
    )
    return {
        # all of these will be captured by LSTM(**trainer_kwargs)
        "default_root_dir": proc_dir,
        "logger": tb_logger,
        "enable_checkpointing": False,
    }

In [8]:
def rm_log_folder():
    if os.path.exists(BASE_LOG_DIR):
        shutil.rmtree(BASE_LOG_DIR)

In [9]:
from joblib import Parallel, delayed
import time
from neuralforecast.auto import AutoLSTM
from neuralforecast.auto import AutoMLP
from neuralforecast.tsdataset import TimeSeriesDataset

def make_optuna_config_lstm(max_input_size, debug_mode=False):
    max_steps_range = [500, 1000]
    lr_range = [1e-4, 1e-1]
    if debug_mode:
        max_steps_range = [9,10]
        lr_range = [0.5,0.6]
    def optuna_config_lstm(trial):
        return {
            "input_size": trial.suggest_int("input_size", 7, max_input_size),
            # "h": trial.suggest_int("input_size", 7, max_input_size),     # will be set externally too
            "encoder_hidden_size": trial.suggest_categorical("encoder_hidden_size", [16, 32, 64, 128]),
            "encoder_n_layers": trial.suggest_int("encoder_n_layers", 1, 3),
            "decoder_hidden_size": trial.suggest_categorical("decoder_hidden_size", [16, 32, 64, 128]),
            "learning_rate": trial.suggest_float("learning_rate", lr_range[0], lr_range[1], log=True),
            "max_steps": trial.suggest_categorical("max_steps", max_steps_range),
            "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64]),
            "random_seed": trial.suggest_int("random_seed", 1, 19),
            "start_padding_enabled": True
        }
    return optuna_config_lstm

def make_optuna_config_mlp(max_input_size, debug_mode=False):
    max_steps_range = [500, 1000]
    if debug_mode:
        max_steps_range = [9,10]
    def optuna_config_mlp(trial):
        return {
            "input_size": trial.suggest_int("input_size", 7, max_input_size),
            "step_size": trial.suggest_int("step_size", 1, max_input_size),
            "hidden_size": trial.suggest_categorical("hidden_size", [256, 512, 1024]),
            "num_layers": trial.suggest_int("num_layers", 2, 5),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
            "scaler_type": trial.suggest_categorical("scaler_type", [None, "robust", "standard"]),
            "max_steps": trial.suggest_categorical("max_steps", max_steps_range),
            "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128, 256]),
            "windows_batch_size": trial.suggest_categorical("windows_batch_size", [128, 256, 512, 1024]),
            "random_seed": trial.suggest_int("random_seed", 1, 19),
            "start_padding_enabled": True
        }
    return optuna_config_mlp


def count_metrics(model_name, params, test_df, y_hat, test_lengths):
    test_df_copy = test_df.copy()
    test_df_copy['forecast'] = y_hat

    error_dict = {}
    for test_length in test_lengths:
        eval_subset = test_df_copy.iloc[:test_length]  # Take subset for evaluation
        error_dict[f"test_{test_length}_days"] = mape_met(eval_subset['y'].values,  eval_subset['forecast'].values)

    monthly_error_dict = defaultdict(dict)
    test_df_copy['year'] = test_df_copy['ds'].dt.year
    test_df_copy['month'] = test_df_copy['ds'].dt.month

    # Group by year and month and calculate MAPE for each group
    grouped = test_df_copy.groupby(['year', 'month'])
    for (year, month), group in grouped:
        if not group.empty:
            monthly_error_dict[year][month] = mape_met(group['y'].values, group['forecast'].values)

    return {
        "Model": model_name,
        **error_dict,  # Expand error dictionary into separate columns
        **monthly_error_dict,
        "preds": test_df_copy['forecast'].values,
        "params": params,
    }

import pandas as pd
import numpy as np
from neuralforecast.tsdataset import TimeSeriesDataset


def recursive_forecast(model, train_df, sensor_name, steps, step_size, freq='D'):
    """
    Given a trained AutoLSTM/AutoMLP with h=1, produce `steps` forecasts
    by feeding back each 1-step prediction as input for the next.
    """
    history = train_df.copy()
    preds = []
    remaining = steps
    
    while remaining > 0:
        this_chunk = min(step_size, remaining)

        # build dataset on current history
        dataset_hist, _, _, _ = TimeSeriesDataset.from_df(df=history, id_col='unique_id', time_col='ds', target_col='y')
        # predict this_chunk steps ahead
        yhat = model.predict(dataset=dataset_hist, step_size=this_chunk)
        # yhat includes the last this_chunk points
        next_preds = yhat[-this_chunk:].tolist()
        next_preds = [p[0] for p in next_preds]
        preds.extend(next_preds)

        # append those predictions to history with correct dates
        last_date = history['ds'].max()
        for i, p in enumerate(next_preds, start=1):
            next_date = last_date + pd.Timedelta(days=i)
            history = pd.concat([
                history,
                pd.DataFrame({
                    'ds': [next_date],
                    'unique_id': sensor_name,
                    'y': [float(p)]
                })
            ], ignore_index=True)
        remaining -= this_chunk
        
    return np.array(preds)

def process_scenario(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=[0.33, 0.66, 1], num_samples=20, debug_mode=False):
    from collections import defaultdict
    import numpy as np
    import pandas as pd
    
    print(f'{sensor_name}_{scenario_name}')
    results = []

    formatted_df = format_df_to_mlforecast(selected_sensors_df[['full_date', sensor_name]], 'full_date', sensor_name, unique_id=sensor_name)
    formatted_df = formatted_df[['ds', 'y', 'unique_id']]
    train_df, test_df = split_data(formatted_df, scenario)
    train_df['ds'] = pd.to_datetime(train_df['ds'])
    test_df['ds'] = pd.to_datetime(test_df['ds'])

    horizon_values = [1, 7, 30, 90, 180]
    forecast_horizon = len(test_df)

    for h in horizon_values:
        rm_log_folder()
        # Validate that input_size can support h
        min_train_len = train_df.groupby("unique_id").size().min()
        min_required_buffer = 20
        safe_max_input_size = max(min_train_len - h - min_required_buffer, 7)
        if safe_max_input_size < 7:
            print(f"Skipping h={h} due to insufficient training length.")
            continue

        # Create extended df for forecasting future
        future_dates = pd.date_range(start=train_df['ds'].max() + pd.Timedelta(days=1), periods=forecast_horizon, freq='D')
        future_df = pd.DataFrame({'ds': future_dates, 'unique_id': sensor_name, 'y': np.nan})
        df_for_forecasting = pd.concat([train_df, future_df], ignore_index=True)

        dataset, indices, dates, ds_arr  = TimeSeriesDataset.from_df(df=train_df, id_col="unique_id", time_col="ds", target_col="y")
        dataset_future, indices, dates, ds_arr  = TimeSeriesDataset.from_df(df=df_for_forecasting, id_col="unique_id", time_col="ds", target_col="y")

        test_lengths = list(range(30, 181, 30)) + [240, 300, 360, 480, 600, 720, forecast_horizon]

        assert future_df['ds'].min() > train_df['ds'].max()
        assert pd.api.types.is_datetime64_any_dtype(df_for_forecasting['ds'])

        # LSTM
        rm_log_folder()
        orig_optuna_config = make_optuna_config_lstm(safe_max_input_size, debug_mode)

        def optuna_config_with_trainer(trial):
            base_cfg = orig_optuna_config(trial)
            return {**base_cfg, **make_trainer_kwargs()}

        model_lstm = AutoLSTM(h=h, num_samples=num_samples, backend='optuna', config=optuna_config_with_trainer)
        model_lstm.fit(dataset=dataset, distributed_config=None)
        # y_hat = model_lstm.model.predict(dataset=dataset_future)
        y_hat = recursive_forecast( model_lstm, train_df, sensor_name, steps=forecast_horizon, step_size=h, freq='D')

        # print('future_df:', future_df.shape)
        # print('yhat: ', len(y_hat))
        y_hat = y_hat[-forecast_horizon:]
        # print('yhat: ', len(y_hat), y_hat[-10:])
        results.append(count_metrics(f'lstm_h={h}', model_lstm.model.hparams, test_df, y_hat, test_lengths))
    
        # MLP
        rm_log_folder()
        optuna_config = make_optuna_config_mlp(safe_max_input_size, debug_mode)
        model_mlp = AutoMLP(h=h, num_samples=num_samples, backend='optuna', config=optuna_config)
        model_mlp.fit(dataset=dataset, distributed_config=None)
        y_hat = recursive_forecast( model_mlp, train_df, sensor_name, steps=forecast_horizon, step_size=h, freq='D')
        y_hat = y_hat[-forecast_horizon:]
        results.append(count_metrics(f'mlp_h={h}', model_mlp.model.hparams, test_df, y_hat, test_lengths))

    save_results(results, f"results/run_20/{sensor_name}_{scenario_name}.csv")
    return results

def run_all_scenarios_parallel(scenarios_sensors, selected_sensors_df, models, lag_transforms_options, ratios=[0.33, 0.66, 1], num_samples=20):
    # don't use all cpus (instead all but one)
    results = Parallel(n_jobs=-1, verbose=30)( 
        delayed(process_scenario)(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=ratios, num_samples=num_samples)
        for sensor_name, scenarios in scenarios_sensors.items()
        for scenario_name, scenario in scenarios.items()
    )

    return results

2025-04-22 15:13:41,916	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-04-22 15:13:42,397	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [10]:
# for sensor_name, scenarios in scenarios_sensors.items():
#     for scenario_name, scenario in scenarios.items():
#         res = process_scenario(sensor_name, scenario_name, scenario, selected_sensors_df, models=None, lag_transforms_options=None, num_samples=1, debug_mode=True)
#         raise KeyError('stop')

In [11]:
results = run_all_scenarios_parallel(scenarios_sensors, selected_sensors_df, models=None, lag_transforms_options=None, num_samples=20)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
import multiprocessing
cpu_count = multiprocessing.cpu_count()
cpu_count

In [None]:
# # Loop through scenarios and evaluate models
# results = []

# for sensor_name, scenarios in scenarios_sensors.items():
#     formatted_df = format_df_to_mlforecast(selected_sensors_df[['full_date', sensor_name]], 'full_date', sensor_name, unique_id=sensor_name)
#     formatted_df = formatted_df[['ds', 'y', 'unique_id']]

#     for scenario_name, scenario in scenarios.items():

#         train_df, test_df = split_data(formatted_df, scenario)

#         optimal_lags_list = get_optimal_lags(train_df, 'y', 
#                                             # ratios=[1]
#                                             ratios=[0.33, 0.66, 1]
#                                             #  ratios=[0.25, 0.5, 0.75, 1]
#         )
#         target_transforms = get_dynamic_transforms(train_df)
#         results = evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, optimal_lags_list, winter_weights=True)

#         save_results(results, f"results/run_13/{sensor_name}_{scenario_name}.csv") 