In [2]:
from MLForecastPipeline import *

In [3]:
selected_sensors_df = pd.read_csv("../data/selected_sensors2_cleaned.csv", index_col=0)

scenarios_sensors = {
    # 0: 1, 4372603
    # "0_12M_train_7M_test": {"train_start": "2017-03-25", "train_end": "2018-03-25", "test_start": "2018-03-26", "test_end": "2018-10-10"},
    '2': {
        "26M_train":  {"train_start": "2017-04-01", "train_end": "2019-06-01"},
        "24M_train":  {"train_start": "2017-04-01", "train_end": "2019-04-01"},
        "22M_train":  {"train_start": "2017-04-01", "train_end": "2019-02-01"},
        "20M_train":  {"train_start": "2017-04-01", "train_end": "2018-12-01"},
        "18M_train":  {"train_start": "2017-04-01", "train_end": "2018-10-01"},
        "12M_train":  {"train_start": "2017-04-01", "train_end": "2018-04-01"},
        "10M_train":  {"train_start": "2017-04-01", "train_end": "2018-01-25"},
        "8M_train":   {"train_start": "2017-04-01", "train_end": "2017-10-25"},
        
        # Non-Heating Periods
        "NH_3M_train":  {"train_start": "2017-04-15", "train_end": "2017-07-15"},
        "NH_4M_train":  {"train_start": "2017-04-15", "train_end": "2017-08-15"},
        "NH_2M_train":  {"train_start": "2017-04-15", "train_end": "2017-06-15"},
        "NH_1M_train":  {"train_start": "2017-04-15", "train_end": "2017-05-15"},
        "NH_15D_train": {"train_start": "2017-04-15", "train_end": "2017-04-30"},
        "NH_feb_2M_train": {"train_start": "2017-02-15", "train_end": "2017-04-15"},
        "NH_feb_1M_train": {"train_start": "2017-02-15", "train_end": "2017-04-15"},
        "NH_mar_2M_train": {"train_start": "2017-03-15", "train_end": "2017-05-15"},
        "NH_mar_1M_train": {"train_start": "2017-03-15", "train_end": "2017-04-15"},

        # Heating Periods
        "H_5M_train":     {"train_start": "2017-06-01", "train_end": "2017-11-01"},
        "H_3M_jul_train": {"train_start": "2017-07-01", "train_end": "2017-10-10"},
        "H_3M_sep_train": {"train_start": "2017-09-01", "train_end": "2017-12-10"},
        "H_3M_nov_train": {"train_start": "2017-11-01", "train_end": "2018-02-10"},
        },
}
scenarios_sensors['5'] = scenarios_sensors['2'].copy()
scenarios_sensors['6'] = scenarios_sensors['2'].copy()

def split_data(df, scenario, date_col="ds"):
    """Extracts train and test data based on train end date."""
    train_data = df[df[date_col] <= scenario['train_end']]
    test_start = pd.to_datetime(scenario['train_end']) + pd.Timedelta(days=1)
    test_data = df[df[date_col] >= test_start]
    return train_data, test_data

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

LAG_TRANSFORMS_MAP = {
    "expanding_mean_rolling_14_rolling_30": {1: 'expanding_mean', 7: 'expanding_mean', 30: 'rolling_mean_30'},
    "expanding_mean_rolling_14": {1: 'expanding_mean', 7: 'rolling_mean_14', 30: 'expanding_mean'},
    "rolling_14_rolling_30_expanding": {1: 'rolling_mean_14', 7: 'rolling_mean_30', 30: 'expanding_mean'},
    "rolling_14_expanding": {1: 'rolling_mean_14', 30: 'expanding_mean'},
    "rolling_14_only": {1: 'rolling_mean_14'},
    "no_transform": {},
}

def map_lag_transforms(lag_transform_dict, lag_transforms_map=LAG_TRANSFORMS_MAP):
    for name, transform in lag_transforms_map.items():
        if lag_transform_dict == transform:
            return name
    return "unknown"

def analyze_results(df, lag_transforms_map=LAG_TRANSFORMS_MAP, mape_threshold=40, model_filter=None):
    df = df.copy()
    df['Lag Transform Name'] = df['Lag Transforms'].apply(lambda x: map_lag_transforms(x, lag_transforms_map))
    df['Lag_Set_Name'] = df['Lag Name']
    # Identify MAPE columns dynamically
    mape_columns = [col for col in df.columns if col.startswith("test_") and col.endswith("_days")]
    
    # Compute mean MAPE across all test periods
    df['MAPE'] = df[mape_columns].mean(axis=1)
    
    # Apply filtering
    top_df = df[df["MAPE"] < mape_threshold].copy()
    if model_filter:
        top_df = top_df[top_df['Model'] == model_filter].copy()
    
    # Compute groupings
    top_models = top_df.groupby("Model")["MAPE"].mean().sort_values().reset_index()
    top_transforms = top_df.groupby("Transforms")["MAPE"].mean().sort_values().reset_index()
    top_lag_transforms = top_df.groupby("Lag Transform Name")["MAPE"].mean().sort_values().reset_index()
    top_lags = top_df.groupby("Lag_Set_Name")["MAPE"].mean().sort_values().reset_index()
    
    # Compute MAPE trends over different forecasting horizons
    mape_trends = top_df.groupby("Model")[mape_columns].mean().reset_index()
    
    return top_models, top_transforms, top_lag_transforms, top_lags, mape_trends

def plot_results(top_models, top_transforms, top_lag_transforms, top_lags, mape_trends):
    plt.figure(figsize=(12, 5))
    sns.barplot(x=top_models["Model"], y=top_models["MAPE"], palette="viridis", hue=top_models["Model"])
    plt.xticks(rotation=45)
    plt.title("Average MAPE per Model")
    plt.show()

    plt.figure(figsize=(12, 5))
    sns.barplot(x=top_transforms["Transforms"], y=top_transforms["MAPE"], palette="coolwarm", hue=top_transforms["Transforms"])
    plt.xticks(rotation=90)
    plt.title("Average MAPE per Transform")
    plt.show()

    plt.figure(figsize=(12, 5))
    sns.barplot(x=top_lag_transforms["Lag Transform Name"], y=top_lag_transforms["MAPE"], palette="Blues", hue=top_lag_transforms["Lag Transform Name"])
    plt.xticks(rotation=90)
    plt.title("Average MAPE per Lag Transform")
    plt.show()

    plt.figure(figsize=(12, 5))
    sns.barplot(x=top_lags["Lag_Set_Name"], y=top_lags["MAPE"], palette="Blues", hue=top_lags["Lag_Set_Name"])
    plt.xticks(rotation=90)
    plt.title("MAPE vs Number of Lags")
    plt.show()
    
    # Plot MAPE trends across different forecasting horizons
    plt.figure(figsize=(12, 5))
    for model in mape_trends["Model"]:
        plt.plot(mape_trends.columns[1:], mape_trends[mape_trends["Model"] == model].values[0][1:], label=model)
    plt.xlabel("Forecasting Horizon (Days)")
    plt.ylabel("MAPE")
    plt.title("MAPE Trends Across Forecast Horizons")
    plt.legend()
    plt.show()

# Example usage:
# top_models, top_transforms, top_lag_transforms, top_lags, mape_trends = analyze_results(df, lag_transforms_map, optimal_lags_map)
# plot_results(top_models, top_transforms, top_lag_transforms, top_lags, mape_trends)


In [5]:
# import pandas as pd
# import numpy as np
# import glob
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Load all results
# results = {}
# for file in glob.glob("results/run_3/*.csv"):
#     dataset_name = file.split("/")[-1].replace(".csv", "")
#     results[dataset_name] = pd.read_csv(file)

# # Combine all datasets into a single DataFrame
# df = pd.concat([df.assign(Dataset=name) for name, df in results.items()], ignore_index=True)

# # Define threshold for MAPE
# mape_threshold = 35  

# # Identify best models per test period
# best_models_per_period = {}
# for test_n in [30, 60, 90, 120, 150, 180, 240, 300, 360, 480, 600, 720, 737]:
#     col = f"test_{test_n}_days"
#     best_models_per_period[test_n] = df[df[col] < mape_threshold].nsmallest(1, col)

# # Calculate variance across all test columns
# test_cols = [col for col in df.columns if "test_" in col]
# df["stability"] = df[test_cols].std(axis=1)

# # Select most stable model
# most_stable_model = df.nsmallest(1, "stability")

In [6]:
import glob
results = {}
for file in glob.glob("results/run_3/*.csv"):
    dataset_name = file.split("\\")[-1].replace(".csv", "")
    results[dataset_name] = pd.read_csv(file)

In [7]:
import re

def extract_train_info(dataset_name):
    """
    Extracts sensor ID, training length in months, and a standardized train label.
    
    Example Inputs:
        - "6_NH_15D_train"  → (6, 0.5, "NH_15D_train")
        - "6_H_5M_train"    → (6, 5, "H_5M_train")
        - "2_18M_train"     → (2, 18, "18M_train")
    
    Returns:
        - sensor (int): Sensor ID
        - train_months (float): Training length in months
        - train_label (str): Everything after the sensor ID (used for finding comparable datasets)
    """
    name_parts = dataset_name.split('_')

    # Extract sensor ID
    sensor = int(name_parts[0])  # First part is always the sensor number

    # Reconstruct the label for easy dataset comparison
    train_label = '_'.join(name_parts[1:])

    # Extract training length (2nd to last part contains number + unit)
    train_info = name_parts[-2]  # Example: "15D" or "5M"
    match = re.match(r"(\d+)([MD])", train_info)  # Extract number and unit

    if match:
        train_length = int(match.group(1))
        unit = match.group(2)

        # Convert days to months (approximate)
        train_months = train_length / 30 if unit == "D" else train_length
    else:
        return None, None, None  # Invalid format, return None values

    return sensor, train_months, train_label

# Example Usage
datasets = ["6_NH_15D_train", "6_H_5M_train", "2_18M_train"]
for ds in datasets:
    print(f"{ds} → {extract_train_info(ds)}")


6_NH_15D_train → (6, 0.5, 'NH_15D_train')
6_H_5M_train → (6, 5, 'H_5M_train')
2_18M_train → (2, 18, '18M_train')


In [8]:
import re
import pandas as pd
import numpy as np
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_14(x):
    return rolling_mean(x, window_size=14)

@njit
def rolling_mean_30(x):
    return rolling_mean(x, window_size=30)

# Define function mapping
lag_transforms_mapping = {
    'expanding_mean': expanding_mean,
    'rolling_mean_14': rolling_mean_14,
    'rolling_mean_30': rolling_mean_30,
}

# Function to convert lag_transforms dictionary to a string
def stringify_lag_transforms(lag_transforms):
    """Converts lag_transforms dictionary to a clean string format for storage."""
    return str({key: [func.__name__ for func in funcs] for key, funcs in lag_transforms.items()})

# Function to parse lag_transforms safely from an invalid dictionary-like string
def parse_lag_transforms(lag_transforms_str):
    """Parses a raw function dictionary string and converts it back to a proper dictionary with function references."""
    try:
        # Extract function names using regex pattern: `<function function_name at 0x...>`
        cleaned_str = re.sub(r'<function (\w+) at 0x[0-9A-Fa-f]+>', r'"\1"', lag_transforms_str)

        # Convert cleaned string into a valid Python dictionary
        temp_dict = eval(cleaned_str)  # Evaluates after function names are fixed

        # Map function names back to actual function references
        return {key: [lag_transforms_mapping[func_name] for func_name in funcs] for key, funcs in temp_dict.items()}
    
    except Exception as e:
        print(f"Error parsing lag_transforms {lag_transforms_str}: {e}")
        return {}

# Example input with incorrect function references
initial_lag_transforms_str = "{1: [<function expanding_mean at 0x0000021CAC0F2840>], 7: [<function rolling_mean_14 at 0x0000021CA47C3C40>], 30: [<function expanding_mean at 0x0000021CAC0F2840>]}"

# Convert to proper lag_transforms dictionary
parsed_lag_transforms = parse_lag_transforms(initial_lag_transforms_str)

print("String Input:", initial_lag_transforms_str)
print("Parsed Back:", parsed_lag_transforms)


String Input: {1: [<function expanding_mean at 0x0000021CAC0F2840>], 7: [<function rolling_mean_14 at 0x0000021CA47C3C40>], 30: [<function expanding_mean at 0x0000021CAC0F2840>]}
Parsed Back: {1: [<function expanding_mean at 0x000001C5EEADC040>], 7: [<function rolling_mean_14 at 0x000001C5E90ACB80>], 30: [<function expanding_mean at 0x000001C5EEADC040>]}


In [8]:
import pandas as pd
import numpy as np
import re
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean
from mlforecast import MLForecast
# from my_custom_utils import format_df_to_mlforecast, split_data  # Assuming these are in a module
import ast

# Define parameters
HORIZONS = [30, 60, 90, 120, 150, 180, 240, 300, 360, 480, 600, 720, 737]
MAPE_COLUMNS = [f"test_{h}_days" for h in HORIZONS]

# Thresholds for different filtering criteria
EARLY_HORIZON_THRESHOLD = 30  # MAPE threshold for early horizons
ONE_THIRD_THRESHOLD = 30       # Threshold for MAPE at 1/3 of training length
ONE_THIRD_THRESHOLD_GENERAL = 35       # Threshold for MAPE at 1/3 of training length


EARLY_HORIZONS = [30, 60, 90, 120]  # Horizons we check for early filtering
SENSORS_TO_COMPARE = [2, 5, 6]  # Sensors that share training length criteria

# Define available lag transforms
lag_transforms_options = [
    {1: [expanding_mean], 7: [rolling_mean], 30: [expanding_mean]},
    {1: [rolling_mean], 7: [rolling_mean], 30: [expanding_mean]},
]

model_mapping = {
    "XGBRegressor": XGBRegressor(),
    "SGDRegressor_42": SGDRegressor(random_state=42),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}

transform_mapping = {
    "AutoDifferences" : AutoDifferences, 
    "AutoSeasonalDifferences" : AutoSeasonalDifferences, 
    "AutoSeasonalityAndDifferences" : AutoSeasonalityAndDifferences,
    "LocalStandardScaler" : LocalStandardScaler, 
    "LocalMinMaxScaler" : LocalMinMaxScaler, 
    "LocalBoxCox" : LocalBoxCox
}

@njit
def rolling_mean_14(x):
    return rolling_mean(x, window_size=14)

@njit
def rolling_mean_30(x):
    return rolling_mean(x, window_size=30)

# Function to extract sensor ID and train length
def extract_train_info(dataset_name):
    name_parts = dataset_name.split('_')
    sensor = int(name_parts[0])  
    train_label = '_'.join(name_parts[1:])

    train_info = name_parts[-2]  
    match = re.match(r"(\d+)([MD])", train_info)  

    if match:
        train_length = int(match.group(1))
        unit = match.group(2)
        train_months = train_length / 30 if unit == "D" else train_length
    else:
        return None, None, None  

    return sensor, train_months, train_label

# Function to reverse `stringify_transform`
def parse_transform_string(transform_str):
    transform_str = transform_str.strip()
    if "(" in transform_str:
        class_name, params = transform_str.split("(", 1)
        params = params.rstrip(")")
        param_dict = {}
        if params != "NoParams" and class_name != "LocalBoxCox":
            for param in params.split(", "):
                key, value = param.split("=")
                if key in ("max_diffs", "season_length", "max_season_length"):
                    param_dict[key] = int(value) if value.replace(".", "").isdigit() else value
        return class_name, param_dict
    return transform_str, {}

In [9]:
# Define patterns to match and their correct format
rename_patterns = {
    "H_3M_jul_train": "H_jul_3M_train",
    "H_3M_sep_train": "H_sep_3M_train",
    "H_3M_nov_train": "H_nov_3M_train",
}

# Create a new dictionary with updated keys
updated_results = {}

for dataset_name, df in results.items():
    new_name = dataset_name  # Default: keep the same name

    for old_pattern, new_pattern in rename_patterns.items():
        if old_pattern in dataset_name:
            new_name = dataset_name.replace(old_pattern, new_pattern)
            break  # Stop checking once renamed

    updated_results[new_name] = df  # Preserve the dataset content

results = updated_results

In [10]:
import warnings
import pandas as pd

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [12]:
import hashlib
import joblib
import json
import pandas as pd
import os

RUN_NUM = "run_3"

failed_models_log = f"results_of_results/{RUN_NUM}_failed_models_log.txt"

# Ensure directory exists
save_dir = f"results_of_results/{RUN_NUM}"
os.makedirs(save_dir, exist_ok=True)  # Creates the directory if it doesn't exist

# Function to create a unique hash based on model parameters
def generate_model_hash(model_name, transforms, lags, lag_transforms):
    """Creates a short unique hash for the model based on its features & transforms."""
    hash_input = f"{model_name}_{transforms}_{lags}_{lag_transforms}"
    return hashlib.md5(hash_input.encode()).hexdigest()[:8]  # Take only the first 8 characters

# Dictionary to store information about passed models
passed_models_info = {}

# Process each dataset
for dataset_name, df in results.items():
    print("=" * 80)
    print(f"DATASET: {dataset_name}")
    print("=" * 80)

    sensor_id, train_months, train_label = extract_train_info(dataset_name)
    if sensor_id is None:
        print("Skipping dataset: Cannot determine sensor ID or train length.")
        continue

    mape_columns_local = [c for c in MAPE_COLUMNS if c in df.columns]

    # --- (a) Filtering Criteria ---

    # 1. Remove models exceeding the early horizon threshold
    early_horizon_cols = [f"test_{h}_days" for h in EARLY_HORIZONS if f"test_{h}_days" in df.columns]
    mask_early_horizon = (df[early_horizon_cols] <= EARLY_HORIZON_THRESHOLD).all(axis=1)

    # 2. Ensure MAPE is good at 1/3 of train length
    one_third_horizon = int(train_months * 30 // 3)
    closest_horizon = min(HORIZONS, key=lambda x: abs(x - one_third_horizon))

    if f"test_{closest_horizon}_days" in df.columns:
        mask_one_third = df[f"test_{closest_horizon}_days"] <= ONE_THIRD_THRESHOLD
    else:
        mask_one_third = True

    # Apply filters
    df_filtered = df[mask_early_horizon & mask_one_third].copy()

    # --- (b) Extract best models and reconstruct `MLForecast` ---
    for _, row in df_filtered.iterrows():
        model_name = row["Model"]
        transforms_str = row["Transforms"]
        lags = row["Lags"]
        lag_transforms_str = row["Lag Transforms"]

        # Reverse transforms
        transform_objects = []
        if isinstance(transforms_str, str):
            for transform_str in transforms_str.split(" | "):
                class_name, params = parse_transform_string(transform_str)
                
                # transform_objects.append(eval(class_name)(**params) if params else eval(class_name)())
                if class_name in transform_mapping:
                    transform_objects.append(transform_mapping[class_name](**params) if params else transform_mapping[class_name]())
                else:
                    raise ValueError(f"Unknown transform: {class_name}")

        # Reverse lag transforms
        lag_transforms = parse_lag_transforms(lag_transforms_str)

        # Prepare dataset
        sensor_id = str(sensor_id)
        scenario = scenarios_sensors[sensor_id][train_label]

        if sensor_id not in scenarios_sensors:
            continue

        formatted_df = format_df_to_mlforecast(
            selected_sensors_df[['full_date', sensor_id]], 'full_date', sensor_id, unique_id=sensor_id
        )
        formatted_df = formatted_df[['ds', 'y', 'unique_id']]
        train_df, test_df = split_data(formatted_df, scenario)

        # Initialize MLForecast
        model_instance = model_mapping.get(model_name)
        if model_instance is None:
            raise ValueError(f"Model {model_name} not found in model_mapping")

        fcst = MLForecast(
            models=[model_instance],
            freq='D',
            lags=ast.literal_eval(lags),
            target_transforms=transform_objects,
            date_features=['dayofweek', 'month'],
            num_threads=1,
            lag_transforms=lag_transforms,
        )

        fcst.fit(train_df)
        predictions = fcst.predict(h=test_df.shape[0])
        test_df_copy = test_df.copy()
        test_df_copy['forecast'] = predictions[model_name].values

        # Compute one_third_horizon MAPE **before validation**
        eval_subset = test_df_copy.iloc[:closest_horizon]
        one_third_mape = mape_met(eval_subset['y'].values, eval_subset['forecast'].values)

        # --- (c) Train on Other Sensors and Validate at `one_third_horizon` ---
        valid = True
        for other_sensor in SENSORS_TO_COMPARE:
            if other_sensor == int(sensor_id):
                continue  

            other_sensor_id = str(other_sensor)
            if other_sensor_id in scenarios_sensors and train_label in scenarios_sensors[other_sensor_id]:
                other_scenario = scenarios_sensors[other_sensor_id][train_label]

                formatted_df = format_df_to_mlforecast(
                    selected_sensors_df[['full_date', other_sensor_id]], 'full_date', other_sensor_id, unique_id=other_sensor_id
                )
                formatted_df = formatted_df[['ds', 'y', 'unique_id']]
                train_df, test_df = split_data(formatted_df, other_scenario)

                fcst.fit(train_df)

                try:
                    if test_df.shape[0] > 0:
                        predictions = fcst.predict(h=test_df.shape[0])
                        test_df_copy = test_df.copy()
                        test_df_copy['forecast'] = predictions[model_name].values
                    else:
                        with open(failed_models_log, "a") as f:
                            f.write(f"Skipping prediction: test_df is empty for {model_name} {dataset_name}\n")
                        continue  # Skip this iteration
                except Exception as e:
                    with open(failed_models_log, "a") as f:
                        f.write(f"Prediction failed for {model_name} {dataset_name}: {e}\n")
                    continue  # Skip model saving if prediction fails

                eval_subset = test_df_copy.iloc[:closest_horizon]
                other_mape = mape_met(eval_subset['y'].values, eval_subset['forecast'].values)

                if other_mape > ONE_THIRD_THRESHOLD_GENERAL:
                    valid = False
                    with open(failed_models_log, "a") as f:
                        f.write(f"Model {model_name} {one_third_mape} rejected: {other_mape} on sensor {other_sensor_id} at one_third_horizon\n")
                    break

        if valid:
            model_hash = generate_model_hash(model_name, transforms_str, lags, lag_transforms_str)
            unique_model_id = f"{sensor_id}_{train_label}_{model_name}_{model_hash}"

            print(f"Model {unique_model_id} passed: {other_mape} across sensors")

            # --- Save Passed Model ---
            model_filename = f"results_of_results/{RUN_NUM}/{unique_model_id}.joblib"
            # joblib.dump(fcst, model_filename)

            # Save metadata
            passed_models_info[unique_model_id] = {
                "sensor": sensor_id,
                "train_label": train_label,
                "model_name": model_name,
                "transforms": transforms_str,
                "lags": lags,
                "lag_transforms": lag_transforms_str,
                "one_third_mape": one_third_mape,  
                "file_path": model_filename,
            }

# --- Save Metadata ---
with open(f"results_of_results/{RUN_NUM}_passed_models_metadata.json", "w") as f:
    json.dump(passed_models_info, f, indent=4)

# Also save as CSV
pd.DataFrame.from_dict(passed_models_info, orient="index").to_csv(f"results_of_results/{RUN_NUM}_passed_models_metadata.csv")

print("All passed models and metadata saved successfully.")

DATASET: 2_10M_train
DATASET: 2_12M_train
Model 2_12M_train_Ridge_48ac90c4 passed: 30.10147803513295 across sensors
DATASET: 2_18M_train
DATASET: 2_20M_train
DATASET: 2_22M_train
DATASET: 2_24M_train
Model 2_24M_train_XGBRegressor_7e83269a passed: 34.75393475399367 across sensors
Model 2_24M_train_XGBRegressor_49224000 passed: 34.75393475399367 across sensors
Model 2_24M_train_XGBRegressor_58049aeb passed: 34.75393475399367 across sensors


KeyboardInterrupt: 