In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_14(x):
    """Rolling mean of ``x`` with ``window_size=14``.

    Thin numba-compiled wrapper around ``window_ops.rolling.rolling_mean`` so
    the window size is fixed and the function can be referenced by name in the
    lag-transform option dictionaries.
    """
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    """Rolling mean of ``x`` with ``window_size=30``.

    Thin numba-compiled wrapper around ``window_ops.rolling.rolling_mean`` so
    the window size is fixed and the function can be referenced by name in the
    lag-transform option dictionaries.
    """
    return rolling_mean(x, window_size=30)

In [3]:
def format_df_to_mlforecast(df, date_col, target_col, unique_id='mean'):
    """Return a copy of ``df`` shaped for MLForecast (``ds`` / ``y`` / ``unique_id``).

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    date_col : str
        Column holding the timestamps. It is renamed to ``ds`` and parsed with
        ``pd.to_datetime``.
    target_col : str
        Column holding the target values. It is copied into a new ``y`` column;
        the original column is kept alongside it.
    unique_id : scalar, default 'mean'
        Value written to every row of the ``unique_id`` column.

    Returns
    -------
    pandas.DataFrame
        The renamed frame with ``ds`` converted to datetime and the ``y`` and
        ``unique_id`` columns added.
    """
    renamed = df.rename(columns={date_col: "ds"})
    # Keyword order matters: ``ds`` is converted first, then ``y`` is copied
    # from the (still present) target column, then the series id is stamped.
    return renamed.assign(
        ds=lambda frame: pd.to_datetime(frame['ds']),
        y=lambda frame: frame[target_col].copy(),
        unique_id=unique_id,
    )

In [4]:
# Load the cleaned sensor table; the first CSV column is used as the index.
# Later cells read a 'full_date' column plus one column per sensor id from it.
selected_sensors_df = pd.read_csv("../data/selected_sensors2_cleaned.csv", index_col=0)

In [5]:
# First date (inclusive) of the held-out test period; used as the default
# test boundary by full_split_data.
TEST_START_DATE = "2019-04-02"

# Date windows per sensor id. Each scenario name maps to the ISO date strings
# bounding its training and validation periods.
scenarios_sensors = {
    # 0: 1, 4372603
    # "0_12M_train_7M_test": {"train_start": "2017-03-25", "train_end": "2018-03-25", "test_start": "2018-03-26", "test_end": "2018-10-10"},
    '2': {
        # "18M_train":  {"train_start": "2017-04-01", "train_end": "2018-10-01"},
        "12M_train":  {"train_start": "2017-04-01", "train_end": "2018-04-01", "val_start": "2017-04-01", "val_end": "2018-04-01"},
        # "12M_train_3M_val":  {"train_start": "2017-04-01", "train_end": "2018-04-01", "val_start": "2018-04-01", "val_end": "2018-07-01"},
        "12M_train_6M_val":  {"train_start": "2017-04-01", "train_end": "2018-04-01", "val_start": "2018-04-01", "val_end": "2018-10-01"},
        "12M_train_9M_val":  {"train_start": "2017-04-01", "train_end": "2018-04-01", "val_start": "2018-04-01", "val_end": "2019-01-01"},
        "12M_train_12M_val":  {"train_start": "2017-04-01", "train_end": "2018-04-01", "val_start": "2018-04-01", "val_end": "2019-04-01"},
        },
}
# Sensors '5' and '6' start from the same windows as sensor '2'. A plain
# dict.copy() would only duplicate the outer mapping and leave every scenario
# dict shared between sensors, so copy the inner dicts as well: adjusting one
# sensor's dates must not silently change the others.
scenarios_sensors['5'] = {name: dict(window) for name, window in scenarios_sensors['2'].items()}
scenarios_sensors['6'] = {name: dict(window) for name, window in scenarios_sensors['2'].items()}

In [6]:
from MLForecastPipeline import *

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def full_split_data(df, scenario, test_start_date=TEST_START_DATE, date_col="ds"):
    """Split ``df`` into train, validation and test frames by date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``date_col`` column.
    scenario : dict
        Must provide ``'train_end'``, ``'val_start'`` and ``'val_end'``.
        ``'train_start'`` is not consulted: training takes every row up to
        ``train_end``.
    test_start_date : str, default TEST_START_DATE
        First date (inclusive) of the test frame.
    date_col : str, default "ds"
        Name of the date column compared against the boundaries.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` where train has ``date <= train_end``, val has
        ``val_start < date <= val_end`` and test has ``date >= test_start_date``.
    """
    dates = df[date_col]
    in_train = dates <= scenario['train_end']
    # Lower bound is exclusive, upper bound inclusive.
    in_val = (dates > scenario['val_start']) & (dates <= scenario['val_end'])
    in_test = dates >= test_start_date
    return df[in_train], df[in_val], df[in_test]

# Baseline SGD regressors to evaluate, keyed by a display label.
# Every estimator is seeded with random_state=42.
models = {}
# L2-penalised variant with alpha=1.
models["SGD_Ridge"] = SGDRegressor(alpha=1, penalty='l2', random_state=42)
# Library defaults apart from the seed.
models["SGDRegressor"] = SGDRegressor(random_state=42)
# Elastic-net penalty with an even L1/L2 mix and alpha=0.001.
models["SGD_ElasticNet"] = SGDRegressor(alpha=0.001, l1_ratio=0.5, penalty='elasticnet', random_state=42)

# Define lag transformations
from mlforecast.lag_transforms import *
# Candidate values for MLForecast's `lag_transforms` argument. Each option is a
# dict whose keys are integers (presumably lags, per the MLForecast convention
# -- TODO confirm) and whose values are lists of transforms.
lag_transforms_options = [
    # {},
    # Options 0-1: plain functions (the njit rolling-mean wrappers and
    # window_ops' expanding_mean).
    {1: [rolling_mean_14], 7: [rolling_mean_30], 30: [expanding_mean]},
    {1: [expanding_mean], 7: [rolling_mean_14], 30: [expanding_mean]},
    # {7: [RollingMean(window_size=7)], 30: [RollingMean(window_size=30)], 60: [RollingMean(window_size=60)], },
    # Options 2-3: transform objects (RollingMean, RollingStd, ExpandingMean,
    # ExpandingStd, ExponentiallyWeightedMean) -- presumably provided by the
    # `from mlforecast.lag_transforms import *` line above.
    {7: [RollingMean(7), RollingStd(7)], 30: [RollingMean(30)], 60: [ExpandingMean()], 14: [ExponentiallyWeightedMean(alpha=0.3)],},
    {7: [RollingMean(7), RollingStd(7), ExpandingStd()], 14: [RollingMean(14), ExpandingStd(), ExponentiallyWeightedMean(alpha=0.3)], 30: [RollingMean(30)], 60: [ExpandingMean()],},
]

In [8]:
# Reshaping to MLForecast format
def format_multi_df_to_mlforecast(df):
    """Reshape a wide frame (one column per series) into MLForecast long format.

    ``df`` must contain a ``full_date`` column; every other column is treated
    as one series. The result has the columns ``ds`` (the former
    ``full_date``), ``unique_id`` (the original column name) and ``y`` (the
    value).
    """
    long_df = df.melt(
        id_vars=['full_date'],
        var_name='unique_id',
        value_name='y',
    )
    long_df = long_df.rename(columns={'full_date': 'ds'})
    return long_df

In [25]:
def optuna_objective(trial, train_df, test_df, transforms, lags, lag_transforms):
    """Optuna objective: fit an elastic-net SGDRegressor and score its forecast.

    Hyper-parameters (``alpha``, ``l1_ratio``, ``max_iter``, ``eta0``) are
    sampled from ``trial``. An MLForecast pipeline with daily frequency is
    fitted on ``train_df`` and asked for ``len(test_df)`` steps, and the result
    is scored with ``mape_met`` against ``test_df['y']``. Forecast and actuals
    are paired by position, not by date.

    Returns
    -------
    float
        The value returned by ``mape_met``, or ``inf`` when building, fitting,
        predicting or scoring raises (the exception is printed, not re-raised).
    """
    # The dict is evaluated top to bottom, so the sampling order is
    # alpha -> l1_ratio -> max_iter -> eta0.
    sampled = {
        'alpha': trial.suggest_float('alpha', 1e-6, 1, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
        'max_iter': trial.suggest_int('max_iter', 300, 1000, step=100),
        'eta0': trial.suggest_float('eta0', 1e-6, 1, log=True),
    }
    regressor = SGDRegressor(penalty='elasticnet', random_state=42, **sampled)

    try:
        forecaster = MLForecast(
            models=[regressor],
            freq='D',
            lags=lags,
            target_transforms=transforms,
            lag_transforms=lag_transforms,
            num_threads=1,
        )
        forecaster.fit(train_df)
        forecast = forecaster.predict(h=len(test_df))
        # The prediction column is looked up under the name 'SGDRegressor'.
        return mape_met(test_df['y'].values, forecast['SGDRegressor'].values)
    except Exception as err:
        # Best effort: a failing configuration is reported and scored as inf
        # so the study keeps running.
        print(err)
        return float('inf')
    
import optuna

def run_optuna_search(train_df, test_df, transforms, lags, lag_transforms, n_trials=30, seed=None):
    """Tune the SGD hyper-parameters with Optuna and return the best set.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames forwarded to ``optuna_objective`` for fitting and scoring.
    transforms, lags, lag_transforms
        MLForecast configuration forwarded unchanged to ``optuna_objective``.
    n_trials : int, default 30
        Number of trials to run.
    seed : int or None, default None
        Seed for the TPE sampler. ``None`` keeps the previous unseeded
        behaviour; pass an integer to make the search repeatable.

    Returns
    -------
    dict
        ``study.best_params`` of the minimising study.
    """
    # TPESampler is the sampler create_study falls back to for a
    # single-objective study, so building it explicitly only adds the seed.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)

    def objective(trial):
        return optuna_objective(trial, train_df, test_df, transforms, lags, lag_transforms)

    study.optimize(objective, n_trials=n_trials)
    return study.best_params


In [10]:
# Single-sensor setup used by the tuning cells below.
sensor_name = '2'
# NOTE(review): in the '12M_train' scenario val_start/val_end equal
# train_start/train_end, so the validation frame returned by full_split_data
# is a subset of the training frame -- confirm this is the intended window
# for hyper-parameter tuning.
scenario = scenarios_sensors['2']['12M_train']
ratios = [1]

# Keep only the three columns MLForecast needs (the original sensor column is dropped).
formatted_df = format_df_to_mlforecast(selected_sensors_df[['full_date', sensor_name]], 'full_date', sensor_name, unique_id=sensor_name)
formatted_df = formatted_df[['ds', 'y', 'unique_id']]

# Lag and target-transform candidates are derived from the training frame only.
train_df, val_data, test_df = full_split_data(formatted_df, scenario)
optimal_lags_list = get_optimal_lags(train_df, 'y', ratios=ratios)
target_transforms = get_dynamic_transforms(train_df)

In [11]:
# Candidate target-transform sets: the empty tuple, every single transform and
# every pair of transforms.
valid_transform_combinations = [()] + list(chain(combinations(target_transforms, 1), combinations(target_transforms, 2)))
# Keep only the combinations for which filter_conflicting_transforms returns a truthy value.
valid_transform_combinations = [tc for tc in valid_transform_combinations if filter_conflicting_transforms(tc)]

In [31]:
# Tune on one fixed configuration: the transform combination at index 1, the
# first entry of optimal_lags_list and the first lag-transform option.
transforms = list(valid_transform_combinations[1])
lags = optimal_lags_list[list(optimal_lags_list.keys())[0]]
lag_transforms = lag_transforms_options[0]

# NOTE(review): val_data is used as the scoring target here; check that its
# date range matches the horizon forecast after the end of train_df.
best_params = run_optuna_search(train_df, val_data, transforms, lags, lag_transforms, n_trials=30)

# best_params only holds the sampled values (alpha, l1_ratio, max_iter, eta0).
# The objective tuned them with penalty='elasticnet', so the penalty has to be
# repeated here; with the default L2 penalty l1_ratio would be ignored and the
# registered model would differ from the one that was tuned.
optuna_model = SGDRegressor(**best_params, penalty='elasticnet', random_state=42)
models['SGD_Optuna'] = optuna_model

[I 2025-04-08 14:14:12,345] A new study created in memory with name: no-name-347e80d5-0a21-4d8f-b60c-d045e9c2d995
  ret = a @ b
  ret = a @ b
[I 2025-04-08 14:14:12,692] Trial 0 finished with value: inf and parameters: {'alpha': 0.0008072307404295863, 'l1_ratio': 0.7703668529292352, 'max_iter': 300, 'eta0': 0.006491475892088861}. Best is trial 0 with value: inf.
  ret = a @ b
  ret = a @ b
[I 2025-04-08 14:14:12,864] Trial 1 finished with value: inf and parameters: {'alpha': 0.002157657136466659, 'l1_ratio': 0.2739360031073027, 'max_iter': 300, 'eta0': 0.001348077090140433}. Best is trial 0 with value: inf.


Input X contains NaN.
SGDRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
Input X contains NaN.
SGDRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pi

  ret = a @ b
  ret = a @ b
[I 2025-04-08 14:14:13,009] Trial 2 finished with value: inf and parameters: {'alpha': 1.0901394705055e-05, 'l1_ratio': 0.9514060115642948, 'max_iter': 900, 'eta0': 0.5720848412238454}. Best is trial 0 with value: inf.
  ret = a @ b
[I 2025-04-08 14:14:13,162] Trial 3 finished with value: inf and parameters: {'alpha': 3.9875043172176754e-06, 'l1_ratio': 0.22134645946030518, 'max_iter': 1000, 'eta0': 0.13740815840559908}. Best is trial 0 with value: inf.


Input X contains NaN.
SGDRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
Input X contains infinity or a value too large for dtype('float64').


[I 2025-04-08 14:14:15,166] Trial 4 finished with value: 61.50301833239493 and parameters: {'alpha': 0.0176407181056202, 'l1_ratio': 0.31609865261911774, 'max_iter': 600, 'eta0': 4.8747256239306424e-05}. Best is trial 4 with value: 61.50301833239493.
[I 2025-04-08 14:14:16,876] Trial 5 finished with value: 61.817042161136726 and parameters: {'alpha': 0.17046359985186602, 'l1_ratio': 0.18235603891093888, 'max_iter': 500, 'eta0': 2.2833045333422668e-05}. Best is trial 4 with value: 61.50301833239493.
  ret = a @ b
[I 2025-04-08 14:14:17,016] Trial 6 finished with value: inf and parameters: {'alpha': 3.646471861245323e-05, 'l1_ratio': 0.8808569558180018, 'max_iter': 1000, 'eta0': 0.001530935285231787}. Best is trial 4 with value: 61.50301833239493.


Input X contains infinity or a value too large for dtype('float64').


  ret = a @ b
[I 2025-04-08 14:14:17,688] Trial 7 finished with value: inf and parameters: {'alpha': 0.015321219204176758, 'l1_ratio': 0.8924273492916988, 'max_iter': 700, 'eta0': 0.00019887728653923242}. Best is trial 4 with value: 61.50301833239493.


Input X contains infinity or a value too large for dtype('float64').


[I 2025-04-08 14:14:20,223] Trial 8 finished with value: 63.308539897731464 and parameters: {'alpha': 5.810854544258895e-05, 'l1_ratio': 0.03449952744303186, 'max_iter': 1000, 'eta0': 1.852635120534001e-05}. Best is trial 4 with value: 61.50301833239493.
  ret = a @ b
[I 2025-04-08 14:14:20,610] Trial 9 finished with value: inf and parameters: {'alpha': 0.0003764347330703556, 'l1_ratio': 0.7851447641042258, 'max_iter': 1000, 'eta0': 0.07171678078101697}. Best is trial 4 with value: 61.50301833239493.


Input X contains infinity or a value too large for dtype('float64').


[I 2025-04-08 14:14:22,121] Trial 10 finished with value: 77.8790246644507 and parameters: {'alpha': 0.41773586790438727, 'l1_ratio': 0.49766817951795994, 'max_iter': 600, 'eta0': 1.1576791727374017e-06}. Best is trial 4 with value: 61.50301833239493.
[I 2025-04-08 14:14:23,735] Trial 11 finished with value: 66.78516135360722 and parameters: {'alpha': 0.37523206791409097, 'l1_ratio': 0.3629930950648338, 'max_iter': 500, 'eta0': 2.4708992352618388e-05}. Best is trial 4 with value: 61.50301833239493.
[I 2025-04-08 14:14:25,664] Trial 12 finished with value: 65.97290504655139 and parameters: {'alpha': 0.025717508790257956, 'l1_ratio': 0.03932947801272646, 'max_iter': 500, 'eta0': 2.5078737926980635e-05}. Best is trial 4 with value: 61.50301833239493.
[I 2025-04-08 14:14:26,999] Trial 13 finished with value: 71.21602062699944 and parameters: {'alpha': 0.039941093677072825, 'l1_ratio': 0.5216491753495696, 'max_iter': 700, 'eta0': 2.1981503245204994e-06}. Best is trial 4 with value: 61.50301

Input X contains infinity or a value too large for dtype('float64').


[I 2025-04-08 14:14:35,289] Trial 18 finished with value: 60.711636472966276 and parameters: {'alpha': 0.00048295695945396223, 'l1_ratio': 0.6218526362989849, 'max_iter': 800, 'eta0': 4.754703303962651e-06}. Best is trial 15 with value: 56.994915520991306.
[I 2025-04-08 14:14:36,947] Trial 19 finished with value: 71.94082796084005 and parameters: {'alpha': 0.00019248909158924775, 'l1_ratio': 0.6448490925589914, 'max_iter': 800, 'eta0': 1.8291983257261274e-06}. Best is trial 15 with value: 56.994915520991306.
[I 2025-04-08 14:14:39,606] Trial 20 finished with value: 57.15958018485598 and parameters: {'alpha': 0.00013667308587985092, 'l1_ratio': 0.6275974022773998, 'max_iter': 900, 'eta0': 5.4466010648628235e-06}. Best is trial 15 with value: 56.994915520991306.
[I 2025-04-08 14:14:40,871] Trial 21 finished with value: 56.98368183104551 and parameters: {'alpha': 9.62096805105516e-05, 'l1_ratio': 0.6218804835598938, 'max_iter': 900, 'eta0': 5.504815309340243e-06}. Best is trial 21 with va

Input X contains infinity or a value too large for dtype('float64').
Input X contains NaN.
SGDRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


[I 2025-04-08 14:14:51,480] Trial 28 finished with value: 60.251390851190564 and parameters: {'alpha': 8.381271616915755e-06, 'l1_ratio': 0.7080252295746956, 'max_iter': 900, 'eta0': 4.482494436410979e-06}. Best is trial 22 with value: 56.60678601035287.
  ret = a @ b
[I 2025-04-08 14:14:51,705] Trial 29 finished with value: inf and parameters: {'alpha': 0.0009132708027707777, 'l1_ratio': 0.7948395281290991, 'max_iter': 700, 'eta0': 0.02735623287979543}. Best is trial 22 with value: 56.60678601035287.


Input X contains infinity or a value too large for dtype('float64').


In [None]:
from joblib import Parallel, delayed
import time

def process_scenario(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=[0.33, 0.66, 1]):
    """Evaluate every model configuration for one sensor/scenario pair.

    The sensor's column is reshaped to ``ds`` / ``y`` / ``unique_id``, split
    with ``split_data`` according to ``scenario``, and passed to
    ``evaluate_models`` together with the lag and target-transform candidates
    derived from the training part. The results are written to
    ``results/run_18/<sensor_name>_<scenario_name>.csv`` via ``save_results``
    and also returned.
    """
    print(f'{sensor_name}_{scenario_name}')

    sensor_frame = selected_sensors_df[['full_date', sensor_name]]
    series_df = format_df_to_mlforecast(
        sensor_frame, 'full_date', sensor_name, unique_id=sensor_name
    )[['ds', 'y', 'unique_id']]

    train_part, test_part = split_data(series_df, scenario)
    lag_candidates = get_optimal_lags(train_part, 'y', ratios=ratios)
    transform_candidates = get_dynamic_transforms(train_part)

    results = evaluate_models(
        train_part,
        test_part,
        models,
        transform_candidates,
        lag_transforms_options,
        lag_candidates,
    )

    # Persist one CSV per sensor/scenario pair.
    save_results(results, f"results/run_18/{sensor_name}_{scenario_name}.csv")
    return results

def run_all_scenarios_parallel(scenarios_sensors, selected_sensors_df, models, lag_transforms_options, ratios=[0.33, 0.66, 1], n_jobs=-2):
    """Run ``process_scenario`` for every sensor/scenario pair in parallel.

    Parameters
    ----------
    scenarios_sensors : dict
        Maps sensor name -> {scenario name -> scenario dict}.
    selected_sensors_df, models, lag_transforms_options, ratios
        Forwarded unchanged to ``process_scenario``.
    n_jobs : int, default -2
        Worker count handed to ``joblib.Parallel``. ``-2`` means "all CPUs but
        one", which is the stated intent; the previous hard-coded ``15`` only
        matched that on a 16-core machine.

    Returns
    -------
    list
        One ``process_scenario`` result per sensor/scenario pair, sensors in
        outer order and scenarios in inner order.
    """
    tasks = (
        delayed(process_scenario)(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=ratios)
        for sensor_name, scenarios in scenarios_sensors.items()
        for scenario_name, scenario in scenarios.items()
    )
    return Parallel(n_jobs=n_jobs)(tasks)
