In [22]:
import pandas as pd
import time
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [23]:
from numba import njit
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_14(x):
    """Rolling mean of ``x`` with a fixed window of 14 observations.

    Thin numba-compiled wrapper that pins ``window_size=14`` on
    ``window_ops.rolling.rolling_mean``.
    """
    # NOTE(review): presumably exists so the transform can be passed around as
    # a named single-argument callable — confirm against the consumer.
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    """Rolling mean of ``x`` with a fixed window of 30 observations.

    Thin numba-compiled wrapper that pins ``window_size=30`` on
    ``window_ops.rolling.rolling_mean``.
    """
    # NOTE(review): presumably exists so the transform can be passed around as
    # a named single-argument callable — confirm against the consumer.
    return rolling_mean(x, window_size=30)

In [24]:
def format_df_to_mlforecast(df, date_col, target_col, unique_id='mean'):
    """Return a reshaped copy of ``df`` with ``ds``, ``y`` and ``unique_id`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    date_col : str
        Column holding the dates. It is renamed to ``ds`` and parsed with
        ``pd.to_datetime``.
    target_col : str
        Column whose values are copied into a new ``y`` column. The original
        column is kept alongside ``y``.
    unique_id : scalar, default 'mean'
        Value written to every row of the ``unique_id`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the renamed/parsed ``ds`` column plus ``y`` and
        ``unique_id``.
    """
    formatted = df.rename(columns={date_col: "ds"})
    formatted = formatted.assign(ds=pd.to_datetime(formatted["ds"]))

    # The target is duplicated rather than renamed, so both names stay usable.
    formatted["y"] = formatted[target_col].copy()
    formatted["unique_id"] = unique_id
    return formatted

In [25]:
# Cleaned sensor readings; the first CSV column is used as the index.
selected_sensors_df = pd.read_csv("../data/selected_sensors2_cleaned.csv", index_col=0)

import dill

# Forward slashes work on every OS (the previous backslash path only resolved
# on Windows) and match the other relative paths in this cell.
# NOTE: dill.load can execute arbitrary code while unpickling — only load
# files that were produced by a trusted run of this project.
with open("results_of_results/run_7/mlforecast_model.dill", 'rb') as f:
    selected_model_config = dill.load(f)

# Second set of predictions; later cells read its 'ds' and 'yhat' columns.
prophet_roll_df = pd.read_csv('results_of_results/prophet_roll7_2.csv')

In [26]:
# Train windows, keyed first by sensor id and then by scenario name.
scenarios_sensors = {
    '2': {
        "12M_train": dict(train_start="2018-04-02", train_end="2019-04-01"),
    },
}
# To reuse the same windows for other sensors, copy the entry. Note that
# dict.copy() is shallow, so the inner scenario dicts would be shared.
# scenarios_sensors['5'] = scenarios_sensors['2'].copy()
# scenarios_sensors['6'] = scenarios_sensors['2'].copy()

In [27]:
from MLForecastPipeline import *

2025-04-05 17:12:28,203	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2025-04-05 17:12:28,433	INFO util.py:154 -- Outdated packages:
  ipywidgets==7.8.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [28]:
def split_data(df, scenario, date_col="ds"):
    """Extracts train and test data based on train end date."""
    train_data = df[df[date_col] <= scenario['train_end']]
    test_start = pd.to_datetime(scenario['train_end']) + pd.Timedelta(days=1)
    test_data = df[df[date_col] >= test_start]
    return train_data, test_data

In [49]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor


def prepare_ensemble_data(y_true, preds1, preds2, pred_col1='y_pred', pred_col2='y_pred'):
    """Join two prediction frames and the ground truth on ``ds``.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Ground-truth frame with a ``ds`` column; all its columns are carried
        into the result.
    preds1, preds2 : pandas.DataFrame
        Prediction frames, each with a ``ds`` column.
    pred_col1, pred_col2 : str, default 'y_pred'
        Prediction column in ``preds1`` / ``preds2``; renamed to ``model1`` /
        ``model2`` in the result.

    Returns
    -------
    pandas.DataFrame
        Inner join of the three frames on ``ds``.

    Raises
    ------
    KeyError
        If ``pred_col1`` is missing from ``preds1`` or ``pred_col2`` is
        missing from ``preds2``.
    """
    if pred_col1 not in preds1.columns:
        raise KeyError(pred_col1)
    # Same guard for the second frame: without it a wrong column name would
    # silently yield a result with no 'model2' column.
    if pred_col2 not in preds2.columns:
        raise KeyError(pred_col2)

    df = pd.merge(preds1.rename(columns={pred_col1: 'model1'}),
                  preds2.rename(columns={pred_col2: 'model2'}),
                  on='ds')
    df = pd.merge(df, y_true, on='ds')
    return df

def split_by_year(df):
    """Split ``df`` into its earliest-year rows and its latest-year rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``ds`` column parseable by ``pd.to_datetime``. It is not
        modified: the previous version converted ``ds`` in place and left a
        ``year`` column on the caller's frame.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Rows from the minimum year and rows from the maximum year of ``ds``,
        with ``ds`` as datetime and no ``year`` column. Rows from years in
        between are in neither part. If all rows share one year, both parts
        contain the same rows.
    """
    # drop() returns a new frame, so the caller's data is never touched; a
    # pre-existing 'year' column is removed to keep the returned columns the
    # same as before.
    frame = df.drop(columns=['year'], errors='ignore')
    frame['ds'] = pd.to_datetime(frame['ds'])

    years = frame['ds'].dt.year
    df_train = frame[years == years.min()]
    df_test = frame[years == years.max()]
    return df_train, df_test

def train_and_evaluate_ensemblers(df_train, df_test):
    """Fit several meta-models on two base predictions and score them.

    Each candidate regressor is trained on the ``model1`` / ``model2`` columns
    of ``df_train`` against ``y``, then evaluated on the same columns of
    ``df_test``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing ``model1``, ``model2`` and ``y`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with columns ``model``, ``rmse``, ``mae`` and
        ``mape``, in the order the candidates are defined.
    """
    feature_cols = ['model1', 'model2']
    X_train, y_train = df_train[feature_cols], df_train['y']
    X_test, y_test = df_test[feature_cols], df_test['y']

    # Stacked ensemble of three base learners blended by a Ridge regressor.
    stacker = StackingRegressor(
        estimators=[
            ('lr', Ridge()),
            ('rf', RandomForestRegressor(n_estimators=50, random_state=42)),
            ('gb', GradientBoostingRegressor(n_estimators=50, random_state=42)),
        ],
        final_estimator=Ridge(),
    )
    candidates = {
        'LinearRegression': LinearRegression(),
        'Ridge': Ridge(),
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'Stacking': stacker,
    }

    # Metric name -> scoring function, in the column order of the result.
    scorers = {
        'rmse': root_mean_squared_error,
        'mae': mean_absolute_error,
        'mape': mean_absolute_percentage_error,
    }

    rows = []
    for label, estimator in candidates.items():
        estimator.fit(X_train, y_train)
        y_hat = estimator.predict(X_test)
        row = {'model': label}
        for metric, score in scorers.items():
            row[metric] = score(y_test, y_hat)
        rows.append(row)

    return pd.DataFrame(rows)

# Usage
# preds1 = pd.DataFrame({'ds': ..., 'y_pred': ...})
# preds2 = pd.DataFrame({'ds': ..., 'y_pred': ...})
# y_true = pd.DataFrame({'ds': ..., 'y': ...})

# Forecasts for sensor '2' from the loaded run. `.loc[...]` plus `.copy()`
# gives an independent frame, so the datetime conversion below no longer
# raises SettingWithCopyWarning.
preds1 = selected_model_config['test_df']
preds1 = preds1.loc[preds1['unique_id'] == '2', ['ds', 'forecast']].copy()

# Same reason for the copy: 'ds' is reassigned on this selection below.
preds2 = prophet_roll_df[['ds', 'yhat']].copy()
y_true_df = selected_sensors_df.rename(columns={'full_date': 'ds', '2': 'y'})
scenario = scenarios_sensors['2']['12M_train']

# All three frames need a datetime 'ds' so that they merge on equal keys.
preds1['ds'] = pd.to_datetime(preds1['ds'])
preds2['ds'] = pd.to_datetime(preds2['ds'])
y_true_df['ds'] = pd.to_datetime(y_true_df['ds'])

df = prepare_ensemble_data(preds1=preds1, preds2=preds2, y_true=y_true_df, pred_col1='forecast', pred_col2='yhat')
df_train, df_test = split_data(df, scenario)
metrics_df = train_and_evaluate_ensemblers(df_train, df_test)
metrics_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds2['ds'] = pd.to_datetime(preds2['ds'])


Unnamed: 0,model,rmse,mae,mape
0,LinearRegression,17.546106,11.895526,0.341998
1,Ridge,17.546092,11.895502,0.341999
2,RandomForest,21.444745,13.629224,0.406328
3,GradientBoosting,21.632215,13.34683,0.373573
4,Stacking,17.319418,11.621257,0.333767


In [48]:
# MAPE of the first model alone on dates after the training cutoff.
# Truth and forecast are merged on 'ds' so each pair shares a date; filtering
# the two frames separately paired rows by position only, which is wrong (or
# fails) whenever the frames do not cover exactly the same dates in order.
holdout = preds1.merge(y_true_df[['ds', 'y']], on='ds')
holdout = holdout[holdout['ds'] > scenario['train_end']]
mean_absolute_percentage_error(holdout['y'], holdout['forecast'])

0.3074422799534196