In [10]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [11]:
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_14(x):
    """Numba-compiled wrapper returning ``rolling_mean(x, window_size=14)``.

    Exists so a fixed 14-sample window can be referenced as a plain
    single-argument function.
    """
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    """Numba-compiled wrapper returning ``rolling_mean(x, window_size=30)``.

    Exists so a fixed 30-sample window can be referenced as a plain
    single-argument function.
    """
    return rolling_mean(x, window_size=30)

In [12]:
def format_df_to_mlforecast(df, date_col, target_col, unique_id='mean'):
    """Return a copy of ``df`` carrying the ``ds`` / ``y`` / ``unique_id`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    date_col : str
        Column to rename to ``ds``; its values are parsed with ``pd.to_datetime``.
    target_col : str
        Column whose values are duplicated into ``y``. The original column is
        kept in the result as well.
    unique_id : scalar, default 'mean'
        Value written to every row of the ``unique_id`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``date_col`` renamed to ``ds`` and the ``y`` and
        ``unique_id`` columns added.
    """
    renamed = df.rename(columns={date_col: "ds"})
    return renamed.assign(
        ds=pd.to_datetime(renamed["ds"]),
        y=renamed[target_col].copy(),
        unique_id=unique_id,
    )

In [13]:
# Load the cleaned sensor table; the first CSV column is used as the DataFrame index.
# NOTE(review): later cells read a 'full_date' column plus one column per sensor
# name ('2', '5', '6') from this frame — confirm the CSV provides them.
selected_sensors_df = pd.read_csv("../data/selected_sensors2_cleaned.csv", index_col=0)

In [14]:
# Training windows per sensor column name. Every window shares the same
# train_start and differs only in train_end.
# NOTE(review): the "10M_train" and "8M_train" end dates (2018-01-25, 2017-10-25)
# do not follow the first-of-month pattern of the other windows, and 2017-10-25
# is under 7 months after the start — confirm the labels match the intended dates.
scenarios_sensors = {
    # 0: 1, 4372603
    # "0_12M_train_7M_test": {"train_start": "2017-03-25", "train_end": "2018-03-25", "test_start": "2018-03-26", "test_end": "2018-10-10"},
    '2': {
        "26M_train":  {"train_start": "2017-04-01", "train_end": "2019-06-01"},
        "24M_train":  {"train_start": "2017-04-01", "train_end": "2019-04-01"},
        "22M_train":  {"train_start": "2017-04-01", "train_end": "2019-02-01"},
        "20M_train":  {"train_start": "2017-04-01", "train_end": "2018-12-01"},
        "18M_train":  {"train_start": "2017-04-01", "train_end": "2018-10-01"},
        "12M_train":  {"train_start": "2017-04-01", "train_end": "2018-04-01"},
        "10M_train":  {"train_start": "2017-04-01", "train_end": "2018-01-25"},
        "8M_train":   {"train_start": "2017-04-01", "train_end": "2017-10-25"},
        },
}
# Sensors '5' and '6' reuse the windows of sensor '2'. A plain dict.copy() is
# shallow and would leave all three sensors sharing the same inner window dicts,
# so each window is copied individually to keep the sensors independent.
scenarios_sensors['5'] = {name: dict(window) for name, window in scenarios_sensors['2'].items()}
scenarios_sensors['6'] = {name: dict(window) for name, window in scenarios_sensors['2'].items()}

In [15]:
from MLForecastPipeline import *

In [16]:
def split_data(df, scenario, date_col="ds"):
    """Split ``df`` chronologically into train and test frames.

    The test period starts exactly one day after ``scenario['train_end']``.
    Rows dated before that boundary go to train and rows on or after it go to
    test, so rows timestamped during the ``train_end`` day (after midnight)
    are kept in train instead of being dropped from both splits. For
    midnight-aligned (daily) data the result is the same as comparing
    against ``train_end`` directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col``.
    scenario : dict
        Must contain ``'train_end'`` (anything ``pd.to_datetime`` accepts).
        NOTE(review): ``'train_start'`` is present in the scenario dicts but is
        not used here — confirm that training from the first available row is
        intended.
    date_col : str, default "ds"
        Name of the date column; values are coerced with ``pd.to_datetime``
        for the comparison only (``df`` itself is not modified).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``. Rows whose date is missing (NaT) appear
        in neither frame.
    """
    dates = pd.to_datetime(df[date_col])
    test_start = pd.to_datetime(scenario['train_end']) + pd.Timedelta(days=1)
    # Both masks use the same boundary so no dated row can fall between them.
    train_data = df[dates < test_start]
    test_data = df[dates >= test_start]
    return train_data, test_data

# Candidate regressors keyed by the label under which each one is evaluated.
# All three are SGDRegressor variants seeded with random_state=42 and differ
# only in their penalty settings.
models = dict(
    SGD_Ridge=SGDRegressor(penalty='l2', alpha=1, random_state=42),
    SGDRegressor=SGDRegressor(random_state=42),
    SGD_ElasticNet=SGDRegressor(penalty='elasticnet', l1_ratio=0.5, alpha=0.001, random_state=42),
)

# Define lag transformations
from mlforecast.lag_transforms import *
# Alternative lag-transform configurations, tried one at a time by the
# evaluation. Each entry maps an integer key to the list of transform objects
# attached to it (presumably mlforecast's lag -> transforms mapping — confirm
# against evaluate_models).
lag_transforms_options = [
    # 0: no lag transforms at all
    {},
    # 1: rolling means only
    {
        7: [RollingMean(window_size=7)],
        30: [RollingMean(window_size=30)],
        60: [RollingMean(window_size=60)],
    },
    # 2: adds rolling std, expanding mean and an exponentially weighted mean
    {
        7: [RollingMean(window_size=7), RollingStd(window_size=7)],
        30: [RollingMean(window_size=30)],
        60: [ExpandingMean()],
        14: [ExponentiallyWeightedMean(alpha=0.3)],
    },
    # 3: richest set, with expanding std on keys 7 and 14
    {
        7: [RollingMean(window_size=7), RollingStd(window_size=7), ExpandingStd()],
        14: [RollingMean(window_size=14), ExpandingStd(), ExponentiallyWeightedMean(alpha=0.3)],
        30: [RollingMean(window_size=30)],
        60: [ExpandingMean()],
    },
]

In [17]:
from joblib import Parallel, delayed
import time

def process_scenario(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=None, results_dir="results/run_16"):
    """Evaluate all model configurations for one sensor/scenario pair and save them.

    Steps: format the sensor column for mlforecast, split it into train/test
    with ``split_data``, derive lags and target transforms from the training
    part, run ``evaluate_models`` and hand the outcome to ``save_results``.

    Parameters
    ----------
    sensor_name : str
        Column of ``selected_sensors_df`` holding the sensor readings; also
        used as the series ``unique_id`` and in the output file name.
    scenario_name : str
        Label of the scenario, used in the progress print and the file name.
    scenario : dict
        Scenario definition passed to ``split_data``.
    selected_sensors_df : pandas.DataFrame
        Frame with a ``'full_date'`` column and the ``sensor_name`` column.
    models, lag_transforms_options
        Forwarded unchanged to ``evaluate_models``.
    ratios : list, optional
        Forwarded to ``get_optimal_lags``. Defaults to ``[0.33, 0.66, 1]``;
        built per call so no mutable default is shared between calls.
    results_dir : str, default "results/run_16"
        Directory receiving ``{sensor_name}_{scenario_name}.csv``; created if
        it does not exist.

    Returns
    -------
    The value returned by ``evaluate_models`` for this pair.
    """
    # Imported locally so the function does not depend on a notebook-level import.
    import os

    if ratios is None:
        ratios = [0.33, 0.66, 1]

    print(f'{sensor_name}_{scenario_name}')
    formatted_df = format_df_to_mlforecast(selected_sensors_df[['full_date', sensor_name]], 'full_date', sensor_name, unique_id=sensor_name)
    formatted_df = formatted_df[['ds', 'y', 'unique_id']]

    train_df, test_df = split_data(formatted_df, scenario)
    # Lags and target transforms are derived from the training part only.
    optimal_lags_list = get_optimal_lags(train_df, 'y', ratios=ratios)
    target_transforms = get_dynamic_transforms(train_df)

    results = evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, optimal_lags_list)

    # Make sure the output directory exists before the (long) run tries to write
    # into it; exist_ok tolerates several workers creating it at the same time.
    os.makedirs(results_dir, exist_ok=True)
    save_results(results, f"{results_dir}/{sensor_name}_{scenario_name}.csv")

    return results

def run_all_scenarios_parallel(scenarios_sensors, selected_sensors_df, models, lag_transforms_options, ratios=None, n_jobs=-2):
    """Run ``process_scenario`` for every sensor/scenario pair in parallel.

    Parameters
    ----------
    scenarios_sensors : dict
        Maps sensor name -> {scenario name -> scenario dict}.
    selected_sensors_df, models, lag_transforms_options
        Forwarded unchanged to every ``process_scenario`` call.
    ratios : list, optional
        Forwarded to ``process_scenario``. Defaults to ``[0.33, 0.66, 1]``;
        resolved here so no mutable default is shared between calls.
    n_jobs : int, default -2
        Passed to ``joblib.Parallel``. ``-2`` means "all CPUs but one" on any
        machine, instead of assuming a 16-CPU machine.

    Returns
    -------
    list
        One ``process_scenario`` result per pair, ordered by sensor and then
        by scenario as they appear in ``scenarios_sensors``.
    """
    if ratios is None:
        ratios = [0.33, 0.66, 1]

    tasks = (
        delayed(process_scenario)(sensor_name, scenario_name, scenario, selected_sensors_df, models, lag_transforms_options, ratios=ratios)
        for sensor_name, scenarios in scenarios_sensors.items()
        for scenario_name, scenario in scenarios.items()
    )
    return Parallel(n_jobs=n_jobs)(tasks)


In [18]:
# Launch the parallel evaluation over every sensor/scenario pair (long-running).
results = run_all_scenarios_parallel(
    scenarios_sensors=scenarios_sensors,
    selected_sensors_df=selected_sensors_df,
    models=models,
    lag_transforms_options=lag_transforms_options,
)

KeyboardInterrupt: 

In [28]:
import multiprocessing
# Number of CPUs reported by multiprocessing; the bare name on the last line
# makes the notebook display the value as the cell output.
cpu_count = multiprocessing.cpu_count()
cpu_count

16

In [None]:
# Sequential (single-process) variant: evaluate every sensor/scenario pair in turn.
# `results` collects one entry per pair in iteration order; assigning the
# evaluate_models output to `results` directly would overwrite the list and
# keep only the last scenario.
results = []

for sensor_name, scenarios in scenarios_sensors.items():
    # Formatting depends only on the sensor, so it is done once per sensor.
    formatted_df = format_df_to_mlforecast(selected_sensors_df[['full_date', sensor_name]], 'full_date', sensor_name, unique_id=sensor_name)
    formatted_df = formatted_df[['ds', 'y', 'unique_id']]

    for scenario_name, scenario in scenarios.items():
        train_df, test_df = split_data(formatted_df, scenario)

        # Other ratio sets tried previously: [1] and [0.25, 0.5, 0.75, 1].
        optimal_lags_list = get_optimal_lags(train_df, 'y', ratios=[0.33, 0.66, 1])
        target_transforms = get_dynamic_transforms(train_df)
        scenario_results = evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, optimal_lags_list)

        save_results(scenario_results, f"results/run_6/{sensor_name}_{scenario_name}.csv")
        results.append(scenario_results)

Total model fits to run: 1152
0/1152 Training XGBRegressor with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255F6108720>], 7: [<function rolling_mean_14 at 0x00000255838B0AE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}...
XGBRegressor MAPE: 41.45% with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255F6108720>], 7: [<function rolling_mean_14 at 0x00000255838B0AE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}
1/1152 Training SGDRegressor_42 with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255

  ret = a @ b
  ret = a @ b


Ridge MAPE: 63.23% with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255F6108720>], 7: [<function rolling_mean_14 at 0x00000255838B0AE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}
3/1152 Training Lasso with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255F6108720>], 7: [<function rolling_mean_14 at 0x00000255838B0AE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}...
Lasso MAPE: 42.30% with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255F6108720>], 7: [<function rolling_mean_14 at 0x0000025583

  ret = a @ b
  ret = a @ b


Ridge MAPE: 49.97% with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function rolling_mean_14 at 0x00000255838B0AE0>], 7: [<function rolling_mean_30 at 0x00000255838B0FE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}
7/1152 Training Lasso with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function rolling_mean_14 at 0x00000255838B0AE0>], 7: [<function rolling_mean_30 at 0x00000255838B0FE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}...
Lasso MAPE: 42.06% with transforms (), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function rolling_mean_14 at 0x00000255838B0AE0>], 7: [<function rolling_mean_30 at 0x0000025

  ret = a @ b


Ridge MAPE: 68.08% with transforms (<mlforecast.target_transforms.AutoDifferences object at 0x00000255F9608830>,), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255F6108720>], 7: [<function rolling_mean_14 at 0x00000255838B0AE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}
11/1152 Training Lasso with transforms (<mlforecast.target_transforms.AutoDifferences object at 0x00000255F9608830>,), lags [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 18, 27, 36, 37, 41, 43, 48, 54, 63, 64, 73, 74, 77, 78, 79, 80, 81, 97, 98], and lag_transforms {1: [<function expanding_mean at 0x00000255F6108720>], 7: [<function rolling_mean_14 at 0x00000255838B0AE0>], 30: [<function expanding_mean at 0x00000255F6108720>]}...
Lasso MAPE: 49.63% with transforms (<mlforecast.target_transforms.AutoDifferences object at 0x00000255F9608830>,), lags [1, 2, 3, 4, 5,

KeyboardInterrupt: 