In [1]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

def determine_max_lags(train_df, min_lags=10, max_fraction=0.5, max_limit=400):
    """
    Pick a single lag horizon from the size of the training set.

    The horizon is `max_fraction` of the number of training rows (truncated to
    an int), capped at `max_limit`, and finally raised to at least `min_lags`.

    NOTE: a function with this same name is defined again further down in this
    notebook with `min_lags=20`; whichever definition was executed last is the
    one in effect.
    """
    size_based = int(len(train_df) * max_fraction)
    # Apply the upper bound first ...
    capped = max_limit if max_limit < size_based else size_based
    # ... then the lower bound, which therefore wins over the cap.
    return min_lags if min_lags > capped else capped

def determine_dynamic_max_lags(train_df, min_lags=20, max_fraction=0.5, max_limit=400):
    """
    Build the list of candidate `max_lags` values to try for a training set.

    The base value is `max_fraction` of the number of training rows, capped at
    `max_limit` and floored at `min_lags` (the same rule `determine_max_lags`
    applies). The candidates are a quarter, a half and the whole of that base,
    each floored at `min_lags`.

    Args:
        train_df: Training data; only `len(train_df)` is used.
        min_lags: Smallest value any candidate may take.
        max_fraction: Fraction of the training length used for the base value.
        max_limit: Upper cap applied to the base value before flooring.

    Returns:
        Sorted list of distinct candidate `max_lags` values. For short series
        the candidates collapse to `[min_lags]`.
    """
    # Floor the base as well: without it a short series produced a "full"
    # candidate below min_lags (or even 0, which cannot be turned into features).
    base_max_lags = max(min(int(len(train_df) * max_fraction), max_limit), min_lags)

    candidates = {
        max(min_lags, base_max_lags // 4),   # small
        max(min_lags, base_max_lags // 2),   # medium
        base_max_lags,                       # full
    }

    # The set removes duplicates; sorting gives a stable, ascending order.
    return sorted(candidates)


def generate_lagged_features(train_df, target_col, max_lags):
    """
    Build a frame of lagged copies of `target_col`.

    Column `lag_k` holds the target shifted down by k rows, for
    k = 1..max_lags. Rows containing any NaN (which includes the leading rows
    created by the shifting) are dropped and the index is reset to 0..n-1, so
    neither the original index nor any other column of `train_df` (such as
    `ds`) is present in the result.
    """
    shifted_columns = []
    for lag in range(1, max_lags + 1):
        shifted = train_df[target_col].shift(lag)
        shifted_columns.append(shifted.rename(f'lag_{lag}'))

    lag_frame = pd.concat(shifted_columns, axis=1)

    # Shifting leaves NaNs at the top of every column; discard those rows.
    lag_frame = lag_frame.dropna()
    return lag_frame.reset_index(drop=True)


def select_important_lags(train_df, target_col, max_lags, model=None, num_of_lags=10):
    """
    Rank lags 1..max_lags by model feature importance and return the best ones.

    Args:
        train_df: DataFrame containing the target column.
        target_col: Name of the target column.
        max_lags: Highest lag to generate as a candidate feature.
        model: Unfitted estimator exposing `fit(X, y)` and, once fitted,
            `feature_importances_`. When None, a fresh RandomForestRegressor
            is created for this call (previously a single instance built at
            definition time was shared by every call).
        num_of_lags: How many lags to keep. Values larger than `max_lags`
            return every lag; zero or less returns an empty list.

    Returns:
        Ascending list of the selected lag numbers as plain Python ints.
    """
    if model is None:
        model = RandomForestRegressor()

    lagged_features = generate_lagged_features(train_df, target_col, max_lags)
    # The first max_lags rows have no complete lag history, so skip them.
    y = train_df[target_col].iloc[max_lags:]

    # NOTE(review): this guard only trims surplus feature rows; if the target
    # itself contains NaNs the rows may no longer line up - confirm inputs are
    # NaN-free.
    if lagged_features.shape[0] != len(y):
        lagged_features = lagged_features.iloc[:len(y)]

    model.fit(lagged_features, y)

    ranking = np.argsort(model.feature_importances_)  # ascending importance
    # Slice from the front: `ranking[-0:]` would return *all* lags for a
    # request of zero.
    keep = min(max(int(num_of_lags), 0), len(ranking))
    top_columns = ranking[len(ranking) - keep:]

    # Column i corresponds to lag i + 1.
    return sorted(int(col) + 1 for col in top_columns)

def select_important_lags_extended(train_df, target_col, max_lags, model=None, num_of_lags_list=(5, 10, 15)):
    """
    Fit one importance model and return the top-k lags for several values of k.

    Args:
        train_df: DataFrame containing the target column.
        target_col: Name of the target column.
        max_lags: Highest lag to generate as a candidate feature.
        model: Unfitted estimator exposing `fit(X, y)` and, once fitted,
            `feature_importances_`. When None, a fresh RandomForestRegressor
            is created for this call instead of sharing one default instance.
        num_of_lags_list: Iterable of k values. A k larger than `max_lags`
            yields every lag; a k of zero or less yields an empty list.

    Returns:
        Dict mapping `"lags_{max_lags}_features_{k}"` to the ascending list of
        selected lag numbers (plain Python ints) for each requested k.
    """
    if model is None:
        model = RandomForestRegressor()

    lagged_features = generate_lagged_features(train_df, target_col, max_lags)
    # The first max_lags rows have no complete lag history, so skip them.
    y = train_df[target_col].iloc[max_lags:]

    # NOTE(review): this guard only trims surplus feature rows; if the target
    # itself contains NaNs the rows may no longer line up - confirm inputs are
    # NaN-free.
    if lagged_features.shape[0] != len(y):
        lagged_features = lagged_features.iloc[:len(y)]

    model.fit(lagged_features, y)

    # Rank once; every k below just takes a different-sized tail of it.
    ranking = np.argsort(model.feature_importances_)  # ascending importance

    important_lags_lists = {}
    for num_of_lags in num_of_lags_list:
        # Slice from the front: `ranking[-0:]` would return *all* lags.
        keep = min(max(int(num_of_lags), 0), len(ranking))
        top_columns = ranking[len(ranking) - keep:]
        name = f"lags_{max_lags}_features_{num_of_lags}"
        # Column i corresponds to lag i + 1.
        important_lags_lists[name] = sorted(int(col) + 1 for col in top_columns)

    return important_lags_lists

def get_optimal_lags(train_df, target_col, model=None, ratios=(0.33, 0.66, 1)):
    """
    Produce named candidate lag sets for a training frame.

    For every `max_lags` candidate returned by `determine_dynamic_max_lags`,
    the most important lags are selected at several sizes, each size being
    `ratio * max_lags` truncated to an int.

    Args:
        train_df: Training data containing `target_col`.
        target_col: Name of the target column.
        model: Estimator used for the importance ranking. When None, one
            RandomForestRegressor is created for this call (rather than
            sharing an instance built at definition time).
        ratios: Fractions of `max_lags` to keep.

    Returns:
        Dict mapping a descriptive name to a list of selected lags, merged
        across all `max_lags` candidates.
    """
    if model is None:
        model = RandomForestRegressor()

    results = {}
    for max_lags in determine_dynamic_max_lags(train_df):
        num_of_lags_list = []
        for ratio in ratios:
            # Keep at least one lag: a count of 0 is not a usable request and
            # small max_lags values would otherwise truncate down to it.
            count = max(1, int(max_lags * ratio))
            # Skip repeated counts so the same selection is not computed twice.
            if count not in num_of_lags_list:
                num_of_lags_list.append(count)

        selected_lags = select_important_lags_extended(
            train_df, target_col, max_lags, model, num_of_lags_list
        )
        results.update(selected_lags)

    return results


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlforecast import MLForecast
from mlforecast.target_transforms import (
    Differences, AutoDifferences, AutoSeasonalDifferences, AutoSeasonalityAndDifferences,
    LocalStandardScaler, LocalMinMaxScaler, LocalBoxCox
)
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from itertools import combinations, chain
import pickle

# Dummy dataset function
def create_dummy_data(seed=None):
    """
    Generate a dummy daily time series covering 2020-01-01 to 2022-12-31.

    The values are a sine wave (over 0..50 radians across the whole range)
    plus Gaussian noise with standard deviation 0.1.

    Args:
        seed: Optional seed for the noise. When given, the noise comes from a
            dedicated `np.random.default_rng(seed)` so the data is
            reproducible. When None, numpy's global random state is used, as
            before.

    Returns:
        DataFrame with columns `ds` (daily timestamps) and `y` (values).
    """
    dates = pd.date_range(start="2020-01-01", end="2022-12-31", freq='D')
    signal = np.sin(np.linspace(0, 50, len(dates)))

    if seed is None:
        noise = np.random.normal(0, 0.1, len(dates))
    else:
        noise = np.random.default_rng(seed).normal(0, 0.1, len(dates))

    return pd.DataFrame({"ds": dates, "y": signal + noise})

def mape_met(y_true, y_pred):
    """
    Mean absolute percentage error, in percent.

    Args:
        y_true: Actual values (any array-like of numbers).
        y_pred: Predicted values, same length as `y_true`.

    Returns:
        `mean(|y_true - y_pred| / max(|y_true|, 1e-9)) * 100` as a float.
        NaNs in either input propagate to the result.

    The denominator is floored by magnitude. Adding a small epsilon to the
    signed value (the previous approach) does not protect negative actuals:
    `y_true = -1e-9` made the denominator exactly zero.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    denominator = np.maximum(np.abs(actual), 1e-9)
    return np.mean(np.abs(actual - predicted) / denominator) * 100

# Function to dynamically determine seasonal and differencing parameters
def get_dynamic_transforms(train_df):
    """
    Build the candidate target transforms, sized from the training length.

    Two limits are derived from `len(train_df)`: a differencing limit
    (half the rows, at most 380) and a season length (a third of the rows, at
    most 365). They parameterise the three differencing transforms; the three
    scalers take no arguments.

    Returns:
        List of six transform instances, differencing transforms first and
        scalers last.

    NOTE(review): the differencing limit is passed as `max_diffs`; a value in
    the hundreds looks large for a count of differences - confirm against the
    mlforecast documentation that this is the intended meaning.
    """
    n_rows = len(train_df)
    diff_cap = min(n_rows // 2, 380)
    season_cap = min(n_rows // 3, 365)

    differencing = [
        AutoDifferences(max_diffs=diff_cap),
        AutoSeasonalDifferences(season_length=season_cap, max_diffs=diff_cap),
        AutoSeasonalityAndDifferences(max_season_length=season_cap, max_diffs=diff_cap),
    ]
    scalers = [LocalStandardScaler(), LocalMinMaxScaler(), LocalBoxCox()]

    return differencing + scalers

# Function to dynamically determine max lags
def determine_max_lags(train_df, min_lags=20, max_fraction=0.5, max_limit=400):
    """
    Choose the maximum number of lags from the size of the training set.

    Starts from `max_fraction` of the training rows (truncated to an int),
    limits that to `max_limit`, then raises it to `min_lags` if it is smaller.

    NOTE: this redefines the `determine_max_lags` declared earlier in the
    notebook (which defaults to `min_lags=10`); once this cell has run, this
    version is the one in effect.
    """
    candidate = int(len(train_df) * max_fraction)
    if candidate > max_limit:
        candidate = max_limit
    if candidate < min_lags:
        candidate = min_lags
    return candidate


# Function to validate transform combinations
def filter_conflicting_transforms(transform_combination):
    """
    Tell whether a combination of target transforms may be used together.

    Transforms fall into two mutually exclusive families: differencing
    transforms and scalers. A combination is rejected (False) as soon as it
    holds more than one member of the same family, matched by exact type;
    otherwise it is accepted (True).
    """
    exclusive_groups = (
        {Differences, AutoDifferences, AutoSeasonalDifferences, AutoSeasonalityAndDifferences},
        {LocalStandardScaler, LocalMinMaxScaler, LocalBoxCox},
    )

    for group in exclusive_groups:
        hits = 0
        for transform in transform_combination:
            if type(transform) in group:
                hits += 1
        if hits > 1:
            return False

    return True

# Model Evaluation Pipeline
def evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, optimal_lags_list, date_features=['dayofweek', 'month']):
    """
    Grid-evaluate models over target transforms, lag sets and lag transforms.

    Every combination of (lag set, valid transform combination, lag-transform
    option, model) is fitted on `train_df` with a daily-frequency MLForecast,
    forecast over `len(test_df)` steps and scored with `mape_met` against
    `test_df['y']`. A combination that raises is reported and skipped, so one
    bad configuration does not abort the sweep.

    Args:
        train_df: Training frame handed to `MLForecast.fit`.
        test_df: Hold-out frame; its `y` column supplies the horizon length
            and the actual values. Predictions are compared positionally.
        models: Dict mapping a display name to an estimator.
        target_transforms: Candidate target transforms; singles and pairs are
            tried, filtered by `filter_conflicting_transforms`.
        lag_transforms_options: List of lag-transform mappings to try.
        optimal_lags_list: Dict mapping a lag-set name to a list of lags.
        date_features: Date features passed to MLForecast (not mutated here).

    Returns:
        Dict keyed by (model name, stringified transforms, lags tuple,
        lag-transform label, lag-set name) with the MAPE as value. Only
        successful fits are included.
    """
    best_model = None
    best_error = float('inf')
    best_transforms = None
    best_lags = None
    best_lag_transforms = None
    results = {}

    # Single transforms and pairs, minus pairs that double up on one family.
    candidate_combinations = chain(combinations(target_transforms, 1), combinations(target_transforms, 2))
    valid_transform_combinations = [tc for tc in candidate_combinations if filter_conflicting_transforms(tc)]

    total_fits = len(models) * len(valid_transform_combinations) * len(optimal_lags_list) * len(lag_transforms_options)
    print(f"Total model fits to run: {total_fits}")

    fit_num = 0
    for lag_name, optimal_lags in optimal_lags_list.items():
        for transform_combination in valid_transform_combinations:
            for lag_transforms in lag_transforms_options:
                for model_name, model in models.items():
                    print(f"{fit_num}/{total_fits} Training {model_name} with transforms {transform_combination}, lags {optimal_lags}, and lag_transforms {lag_transforms}...")

                    try:
                        fcst = MLForecast(
                            # Pass the model under its dict key so the
                            # prediction column is named `model_name`. With a
                            # plain list the column is named after the
                            # estimator class, and any key that differs from
                            # the class name failed the lookup below.
                            models={model_name: model},
                            freq='D',
                            lags=optimal_lags,
                            target_transforms=list(transform_combination),
                            date_features=date_features,
                            num_threads=1,
                            lag_transforms=lag_transforms,
                        )

                        fcst.fit(train_df)
                        predictions = fcst.predict(h=len(test_df['y']))

                        error = mape_met(test_df['y'].values, predictions[model_name])
                        result_key = (
                            model_name,
                            stringify_transform(list(transform_combination)),
                            tuple(optimal_lags),
                            clean_lag_transforms(lag_transforms),
                            lag_name,
                        )
                        results[result_key] = error
                        print(f"{model_name} MAPE: {error:.2f}% with transforms {transform_combination}, lags {optimal_lags}, and lag_transforms {lag_transforms}")

                        if error < best_error:
                            best_error = error
                            best_model = model_name
                            best_transforms = transform_combination
                            best_lags = optimal_lags
                            best_lag_transforms = lag_transforms

                    except Exception as e:
                        # Deliberately best-effort: report and move on.
                        print(f"Skipping combination due to error: {e}")
                    fit_num += 1

    print(f"Best Model: {best_model} with MAPE {best_error:.2f}% using transforms {best_transforms}, lags {best_lags}, and lag_transforms {best_lag_transforms}")
    return results

import json
import re
import pandas as pd

def stringify_transform(transforms):
    """
    Render one transform, or a list of them, as a single descriptive string.

    Each transform becomes `ClassName(...)`:

    - If the transform has a `scaler_` attribute, the parentheses list every
      public, non-callable attribute of that `scaler_` object as `name=value`
      (skipping `tails_` and `diffs_`), or `NoParams` if there are none.
    - Otherwise the parentheses are empty.

    Multiple transforms are joined with " | ".
    """
    items = transforms if isinstance(transforms, list) else [transforms]
    skipped_attrs = ['tails_', 'diffs_']

    def describe(transform):
        # Text for a single transform.
        label = transform.__class__.__name__
        if not hasattr(transform, 'scaler_'):
            return label + '()'

        inner = transform.scaler_
        pairs = []
        for attr in dir(inner):
            if attr.startswith("_"):
                continue
            value = getattr(inner, attr, None)
            if callable(value) or attr in skipped_attrs:
                continue
            pairs.append(f"{attr}={value}")

        body = ", ".join(pairs) if pairs else "NoParams"
        return f"{label}({body})"

    return " | ".join(describe(item) for item in items)


def parse_transform(transform_str):
    """
    Rebuild transform object(s) from the text produced by stringify_transform.

    The input is one or more `ClassName(...)` items joined by " | ". The
    parentheses may be empty, hold the literal `NoParams`, or hold
    `name=value` pairs separated by ", ". Each class name is looked up in this
    module's globals and called with the parsed pairs as keyword arguments.

    Values are parsed with `ast.literal_eval`, so only Python literals are
    converted; anything else is passed through as a string. Unlike `eval`,
    this cannot execute code embedded in a results file.

    NOTE(review): stringify_transform writes attributes read from a fitted
    `scaler_` object; those are not necessarily constructor arguments, so a
    string with parameters may not round-trip - confirm per transform class.

    Args:
        transform_str: String representation of one or more transforms.

    Returns:
        A single transform object, or a list when several were encoded.

    Raises:
        ValueError: For an unknown class name or a malformed parameter list.
    """
    import ast  # local import: only this function needs it

    parsed_transforms = []

    for transform_item in transform_str.split(" | "):
        class_name, has_args, params_str = transform_item.partition("(")
        class_name = class_name.strip()

        raw_pairs = []
        if has_args:
            # Drop only the closing parenthesis of the item itself, so that a
            # value ending in ")" (e.g. a tuple) keeps its own.
            if params_str.endswith(")"):
                params_str = params_str[:-1]
            params_str = params_str.strip()

            # "ClassName()" carries no parameters at all.
            if params_str and params_str != "NoParams":
                for fragment in params_str.split(", "):
                    key, sep, value = fragment.partition("=")
                    if sep:
                        raw_pairs.append([key.strip(), value])
                    elif raw_pairs:
                        # No "=": this is the continuation of a value that
                        # itself contained ", " (such as "(1, 2)").
                        raw_pairs[-1][1] += ", " + fragment
                    else:
                        raise ValueError(
                            f"Malformed parameters for {class_name}: {params_str!r}"
                        )

        params = {}
        for key, value in raw_pairs:
            try:
                params[key] = ast.literal_eval(value)
            except (ValueError, TypeError, SyntaxError):
                params[key] = value  # not a literal: keep the raw text

        factory = globals().get(class_name)
        if factory is None:
            raise ValueError(f"Unknown transform class: {class_name}")
        parsed_transforms.append(factory(**params))

    return parsed_transforms if len(parsed_transforms) > 1 else parsed_transforms[0]



def clean_lag_transforms(lag_transforms):
    """
    Turn a lag-transform mapping into a compact text label.

    Each entry `{lag: [func, ...]}` becomes `Lag{lag}:` followed by the
    functions' `__name__` values joined with "_"; entries are joined with "|".
    An empty or missing mapping yields "No_Lag_Transforms".
    """
    if lag_transforms:
        return "|".join(
            f"Lag{lag}:" + "_".join(fn.__name__ for fn in funcs)
            for lag, funcs in lag_transforms.items()
        )
    return "No_Lag_Transforms"


def save_results(results, filename="forecast_results.json"):
    """
    Write evaluation results to a JSON file.

    JSON object keys must be strings, so each result key - a
    (model, transforms, lags, lag_transforms, lag_name) tuple - is itself
    encoded as a JSON string; the value is the MAPE.

    Lags are converted to plain ints and the MAPE to a plain float before
    encoding, because numpy scalars such as `np.int64` are rejected by the
    json module.

    Args:
        results: Dict mapping the 5-tuple described above to a MAPE value.
        filename: Destination path; overwritten if it exists.
    """
    serializable_results = {}
    for (model, transforms, lags, lag_transforms, lag_name), mape in results.items():
        encoded_key = json.dumps({
            "Model": model,
            "Transforms": transforms,
            "Lags": [int(lag) for lag in lags],
            "Lag Transforms": lag_transforms,
            "Lag Name": lag_name
        })
        serializable_results[encoded_key] = float(mape)

    with open(filename, "w") as f:
        json.dump(serializable_results, f, indent=4)
    print(f"Results saved to {filename}")

def load_results(filename="forecast_results.json"):
    """
    Read a results file written by save_results into a DataFrame.

    Every key of the stored JSON object is itself a JSON string describing one
    configuration; it is decoded and flattened into one row together with its
    MAPE. The `Lags` list is turned back into a tuple.

    Returns:
        DataFrame with columns Model, Transforms, Lags, Lag Transforms,
        Lag Name and MAPE, one row per stored result.
    """
    column_names = ["Model", "Transforms", "Lags", "Lag Transforms", "Lag Name", "MAPE"]

    with open(filename, "r") as f:
        stored = json.load(f)

    rows = []
    for encoded_key, score in stored.items():
        fields = json.loads(encoded_key)
        row = [
            tuple(fields[name]) if name == "Lags" else fields[name]
            for name in column_names[:-1]
        ]
        row.append(score)
        rows.append(row)

    return pd.DataFrame(rows, columns=column_names)


In [3]:
from numba import njit

@njit
def rolling_mean_14(x):
    """
    Rolling mean of `x` with the window size fixed at 14.

    Thin numba-compiled wrapper around `rolling_mean` that bakes the window
    size into a named function, so it can be listed directly in a
    lag-transforms mapping and identified by its `__name__`.

    NOTE(review): presumably `x` is a 1-D numeric array - confirm against the
    window_ops documentation.
    """
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    """
    Rolling mean of `x` with the window size fixed at 30.

    Thin numba-compiled wrapper around `rolling_mean` that bakes the window
    size into a named function, so it can be listed directly in a
    lag-transforms mapping and identified by its `__name__`.

    NOTE(review): presumably `x` is a 1-D numeric array - confirm against the
    window_ops documentation.
    """
    return rolling_mean(x, window_size=30)

In [4]:
# Candidate regressors, keyed by the name used in logs and result keys.
# Only Lasso is enabled; uncomment the others to widen the sweep.
models = {
    # "XGBRegressor": XGBRegressor(),
    # "SGDRegressor": SGDRegressor(),
    # "Ridge": Ridge(),
    "Lasso": Lasso()
}

# Lag-transform configurations to try: each maps a lag to the functions
# applied to it. Further variants are kept below, commented out.
lag_transforms_options = [
    {1: [expanding_mean], 7: [rolling_mean_14], 30: [expanding_mean]},
    # {1: [rolling_mean_14], 7: [rolling_mean_30], 30: [expanding_mean]},
    # {1: [rolling_mean_14], 30: [expanding_mean]},
    # {1: [rolling_mean_14]},
    # {},
]

# Build the dataset: a single series (unique_id 0) with a datetime `ds`.
df = create_dummy_data().assign(unique_id=0)
df = df.assign(ds=pd.to_datetime(df['ds']))

# Train on calendar year 2021, test on calendar year 2022.
train_df = df.loc[df["ds"].between("2021-01-01", "2021-12-31")]
test_df = df.loc[df["ds"].between("2022-01-01", "2022-12-31")]

# Candidate lag sets derived from the training data.
optimal_lags_list = get_optimal_lags(train_df, target_col="y")


In [10]:
# Evaluate every model/transform combination against one fixed lag set.
# The lag set is spelled out here rather than taken from `optimal_lags_list`.
target_transforms = get_dynamic_transforms(train_df)
results = evaluate_models(
    train_df=train_df,
    test_df=test_df,
    models=models,
    target_transforms=target_transforms,
    lag_transforms_options=lag_transforms_options,
    optimal_lags_list={
        'lags_45_features_14': [1, 2, 3, 4, 5, 6, 10, 25, 31, 33, 41, 42, 44, 45],
    },
)

Total model fits to run: 15
0/15 Training Lasso with transforms (<mlforecast.target_transforms.AutoDifferences object at 0x0000022958978A10>,), lags [1, 2, 3, 4, 5, 6, 10, 25, 31, 33, 41, 42, 44, 45], and lag_transforms {1: [CPUDispatcher(<function expanding_mean at 0x00000229589740E0>)], 7: [CPUDispatcher(<function rolling_mean_14 at 0x0000022953CB6E80>)], 30: [CPUDispatcher(<function expanding_mean at 0x00000229589740E0>)]}...
Lasso MAPE: 121.52% with transforms (<mlforecast.target_transforms.AutoDifferences object at 0x0000022958978A10>,), lags [1, 2, 3, 4, 5, 6, 10, 25, 31, 33, 41, 42, 44, 45], and lag_transforms {1: [CPUDispatcher(<function expanding_mean at 0x00000229589740E0>)], 7: [CPUDispatcher(<function rolling_mean_14 at 0x0000022953CB6E80>)], 30: [CPUDispatcher(<function expanding_mean at 0x00000229589740E0>)]}
1/15 Training Lasso with transforms (<mlforecast.target_transforms.AutoSeasonalDifferences object at 0x0000022959A64DA0>,), lags [1, 2, 3, 4, 5, 6, 10, 25, 31, 33, 4

In [12]:
# Persist the evaluation results; with no filename given this writes to the
# function's default, forecast_results.json, in the working directory.
save_results(results)

Results saved to forecast_results.json
