In [None]:
import numpy as np
import pandas as pd

# Raw measurements table; the path is relative to the notebook's folder.
RAW_DATA_CSV = "../data/total_data.csv"
df = pd.read_csv(RAW_DATA_CSV)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a stale index written by an
# earlier to_csv -- confirm). Rebinding with errors='ignore' instead of
# inplace=True keeps this cell re-runnable: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Column names the rest of the notebook relies on.
column_aliases = {"full_date": "date_time", "pm": "value"}
df = df.rename(columns=column_aliases)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
from functions import holt_winters_imputation_and_expand, plot_imputation_results

In [None]:
import sktime
from matplotlib import pyplot
# `plt` has to alias the pyplot module, not the top-level matplotlib package:
# later cells call plt.figure(...), plt.title(...), plt.show().
import matplotlib.pyplot as plt
import seaborn
import datetime
import pandas as pd
import statsmodels
import numpy as np

In [None]:
# Index by timestamp and expand to a gap-free daily calendar; days missing
# from the source become all-NaN rows.
df = df.set_index("date_time")
df.index = pd.to_datetime(df.index)

# Every calendar day between the first and the last observation
full_range = pd.date_range(df.index.min(), df.index.max(), freq='D')

# Align to the full calendar and restore the index name
df = df.reindex(full_range).rename_axis('date_time')

In [None]:
# df['value'] = holt_winters_imputation_and_expand(
#     df['value'],
#     seasonal_periods=365,
# )
# df.to_csv("../data/imputed_mean.csv")

In [None]:
# Continue from the cached imputed series (the file the commented-out
# Holt-Winters imputation cell writes); this replaces the frame built above.
IMPUTED_CSV = "../data/imputed_mean.csv"
df = pd.read_csv(IMPUTED_CSV)

In [None]:
# Index by the date column. The values are not parsed here, so the index
# holds whatever type read_csv produced for `date_time`.
df = df.set_index("date_time")

In [None]:
# Chronological hold-out: everything from `test_date` onwards is test data.
test_date = "2020-01-01"
before_cutoff = df.index < test_date
from_cutoff = df.index >= test_date
df_train = df.loc[before_cutoff].copy()
df_test = df.loc[from_cutoff].copy()

In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    A 1e-9 term is added to the denominator so that zero targets do not
    divide by zero.
    """
    relative_error = (y_true - y_pred) / (y_true + 1e-9)
    return np.mean(np.abs(relative_error)) * 100

from sklearn.metrics import mean_squared_error

def evaluate_on_test(y_hat, y_true):
    """Score predictions ``y_hat`` against ``y_true``.

    Returns:
        tuple: ``(MAPE in percent, RMSE)``.
    """
    percentage_error = mape(y_true, y_hat)
    rmse = np.sqrt(mean_squared_error(y_hat, y_true))
    return percentage_error, rmse

def plot_preds(y_hat, y_true):
    """Draw predictions and actual values as two lines on a fresh figure."""
    comparison = pd.DataFrame({"Preds": y_hat, "Actual": y_true})
    axes = pyplot.subplots()[1]
    comparison.plot(ax=axes)

## data prep

In [None]:
from statsforecast.models import SeasonalNaive
from statsforecast.core import StatsForecast
import pandas as pd

# Training series in the long format used by the forecasting libraries:
# `ds` (timestamp), `y` (target), `unique_id` (series identifier).
data = pd.DataFrame({
    'ds': pd.to_datetime(df_train.index),
    'y': df_train['value'].values,
    'unique_id': "mean",
})

In [None]:
# Hold-out series in the same long format as `data` (ds, y, unique_id).
data_test = (
    pd.DataFrame({'ds': df_test.index, 'y': df_test['value'].values})
    .assign(unique_id="mean")
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
)

## ML models basics

In [None]:
def train_test_split(series, test_size=0.2):
    """
    Splits the series into train and test sets, keeping order (last part is test).

    Parameters:
        series (array-like): The transformed series to split.
        test_size (float): Proportion of the series to include in the test set (default: 0.2).

    Returns:
        tuple: (train_series, test_series). If the series cannot be split
        (e.g. it has no length), a warning is issued and (series, None) is returned.
    """
    import warnings  # local import: `warnings` is not imported at file level

    try:
        n = len(series)
        # Clamp to [0, n] so odd proportions cannot produce a wrapped-around slice.
        test_count = min(max(int(n * test_size), 0), n)
        # Split on a positive index: `series[:-0]` would be empty and
        # `series[-0:]` the whole series, i.e. the split would be inverted
        # whenever the test set rounds down to zero elements.
        split_at = n - test_count
        return series[:split_at], series[split_at:]
    except Exception as e:
        warnings.warn(f"Failed to split series into train and test sets: {e}")
        return series, None


In [None]:
# from statsmodels.tsa.seasonal import seasonal_decompose

# data = preprocessor.remove_seasonality(data)
# data = data[~np.isnan(data)]
# result = seasonal_decompose(data, model='additive', period=300)
# result.plot()
# pyplot.show()

In [None]:
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from sklearn.linear_model import LinearRegression


In [None]:
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from numba import njit
from window_ops.rolling import rolling_mean

@njit
def rolling_mean_48(x):
    # Numba-compiled wrapper around window_ops' rolling_mean with the window
    # fixed at 48 observations (see the commented-out lag_transforms example
    # below for the intended use).
    return rolling_mean(x, window_size=48)


# fcst = MLForecast(
#     models=[],
#     freq='D',
#     target_transforms=[Differences([365])],    
#     lag_transforms={
#         1: [ExpandingMean()],
#         24: [RollingMean(window_size=48), rolling_mean_48],
#     },
# )
# prep = fcst.preprocess(data)
# prep


In [None]:
from mlforecast.target_transforms import BaseTargetTransform
import numpy as np
from sklearn.preprocessing import FunctionTransformer

from mlforecast.target_transforms import GlobalSklearnTransformer

class LocalMinMaxScaler(BaseTargetTransform):
    """Scales each series to the [0, 1] interval using that series' own min and max.

    NOTE(review): a later cell imports `LocalMinMaxScaler` from
    `mlforecast.target_transforms`, which rebinds this name.
    """

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Store per-series min/max in ``self.stats_`` and return the scaled frame."""
        self.stats_ = df.groupby(self.id_col)[self.target_col].agg(['min', 'max'])
        df = df.merge(self.stats_, on=self.id_col)
        # A constant series has max == min; dividing by 1 instead of 0 maps it
        # to 0.0 rather than NaN, and inverse_transform still restores `min`.
        span = (df['max'] - df['min']).replace(0, 1)
        df[self.target_col] = (df[self.target_col] - df['min']) / span
        df = df.drop(columns=['min', 'max'])
        return df

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Undo the scaling on every column except the id and time columns."""
        df = df.merge(self.stats_, on=self.id_col)
        for col in df.columns.drop([self.id_col, self.time_col, 'min', 'max']):
            df[col] = df[col] * (df['max'] - df['min']) + df['min']
        df = df.drop(columns=['min', 'max'])
        return df

# log1p / expm1 pair wrapped as a scikit-learn transformer.
sk_log1p = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

In [None]:
import pandas as pd
from mlforecast import MLForecast
from mlforecast.target_transforms import (BaseTargetTransform, Differences, AutoDifferences, AutoSeasonalDifferences, AutoSeasonalityAndDifferences,
           LocalStandardScaler, LocalMinMaxScaler, LocalRobustScaler, LocalBoxCox, GlobalSklearnTransformer)
from window_ops.expanding import expanding_mean

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor 
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor

# Independent working copies so the experiments below never touch `data` / `data_test`.
train_df, test_df = data.copy(), data_test.copy()

In [None]:
# Baseline MLForecast run. Every regressor in the list is fitted, but only
# the CatBoost forecast is scored and plotted below.
regressors = [CatBoostRegressor(verbose=False), RandomForestRegressor(), Ridge(), Lasso(), XGBRegressor()]
lag_steps = [1, 3, 7, 14, 30, 45, 60, 80, 90, 120, 150, 180, 365]

fcst = MLForecast(
    models=regressors,
    freq='D',
    lags=lag_steps,
    target_transforms=[
        Differences([365]),     # lag-365 difference
        LocalStandardScaler(),  # per-series standardisation
    ],
    date_features=['dayofweek', 'month'],
    num_threads=1,
    lag_transforms={},  # none for this baseline
)

# Train, then forecast as many steps as there are test rows
fcst.fit(train_df)
horizon = len(test_df['y'])
predictions = fcst.predict(h=horizon)

# Attach the CatBoost forecast and report (MAPE, plot)
test_df['forecast'] = predictions['CatBoostRegressor']
(
    mape(test_df['y'].values, test_df['forecast'].values),
    plot_preds(test_df['forecast'].values, test_df['y'].values),
)


In [None]:
import numpy as np

# In-sample naive baseline: the last known training value is used as the
# prediction for every observed point.
last_observed_value = data['y'].iloc[-1]  # Last known value

# Work on a derived frame so `data` keeps its original columns (an extra
# `yhat` column would leak into any later `data.copy()`).
naive_eval = data.assign(yhat=last_observed_value).dropna()

# The score gets its own name: assigning it to `mape` would replace the
# `mape` metric function with a float and break every later `mape(...)` call.
naive_mape = np.mean(np.abs((naive_eval['y'] - naive_eval['yhat']) / naive_eval['y'])) * 100

print(f"MAPE for Naïve Predictor: {naive_mape:.2f}%")


In [None]:
import pandas as pd
from mlforecast import MLForecast
from mlforecast.target_transforms import Differences
from statsforecast.models import Naive
from sklearn.linear_model import Lasso

# Naive baseline run through the MLForecast interface.
# NOTE(review): `Naive` comes from statsforecast, while the other models given
# to MLForecast in this notebook are scikit-learn style regressors -- confirm
# this estimator is accepted here.
forecast = MLForecast(
    models={"Naïve": Naive()},
    freq="D",  # Adjust frequency as needed
    lags=[1],  # Naïve method only needs last observation (lag 1)
)

# Fit the model
forecast.fit(train_df)

# Make predictions on the test set
predictions = forecast.predict(h=len(test_df['y']))

# The only model in this run is registered under the key "Naïve"; the
# previous lookup of 'CatBoostRegressor' does not exist in these predictions.
test_df['forecast'] = predictions['Naïve']
mape(test_df['y'].values, test_df['forecast'].values), \
plot_preds(test_df['forecast'].values, test_df['y'].values)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlforecast import MLForecast
from mlforecast.target_transforms import (
    Differences, AutoDifferences, AutoSeasonalDifferences, AutoSeasonalityAndDifferences,
    LocalStandardScaler, LocalMinMaxScaler, LocalRobustScaler, LocalBoxCox
)
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge, Lasso
from itertools import combinations, chain

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent (no guard against zero targets)."""
    ratio = (y_true - y_pred) / y_true
    return np.mean(np.abs(ratio)) * 100

def plot_preds(y_pred, y_true):
    """Plot actual values and forecasts on one 12x6 figure and show it."""
    plt.figure(figsize=(12, 6))
    # (values, legend label, marker, line style), drawn in this order
    line_specs = (
        (y_true, 'Actual', 'o', '-'),
        (y_pred, 'Forecast', 'x', '--'),
    )
    for values, label, marker, linestyle in line_specs:
        plt.plot(values, label=label, marker=marker, linestyle=linestyle)
    plt.legend()
    plt.title('Forecast vs Actual')
    plt.xlabel('Time')
    plt.ylabel('Value')
    plt.show()

def select_important_lags(train_df, target_col, max_lags, model=None, num_of_lags=10):
    """Select the most important lags based on feature importance analysis.

    Builds lag features 1..max_lags of ``train_df[target_col]``, fits ``model``
    to predict the target from them and keeps the lags with the largest
    ``feature_importances_``.

    Parameters:
        train_df (pd.DataFrame): frame holding the target column, in time order.
        target_col (str): name of the target column.
        max_lags (int): largest lag to consider.
        model: estimator exposing ``fit(X, y)`` and ``feature_importances_``.
            Defaults to a new ``RandomForestRegressor(n_estimators=50)``.
        num_of_lags (int): how many lags to keep.

    Returns:
        list[int]: the selected lags in ascending order.

    Raises:
        ValueError: if no row has a complete set of lag values.
    """
    if model is None:
        # Built per call: an estimator instance used as a default argument is
        # created once and then shared (and refitted) by every call.
        model = RandomForestRegressor(n_estimators=50)
    if num_of_lags <= 0:
        # `[-0:]` would select every lag instead of none.
        return []

    target = train_df[target_col]
    lagged_features = pd.concat(
        [target.shift(lag).rename(f'lag_{lag}') for lag in range(1, max_lags + 1)],
        axis=1,
    )

    # Keep only rows where every lag and the target are present. Filtering X
    # and y with one mask keeps them aligned even when the series has NaN
    # gaps after the first `max_lags` rows.
    usable = lagged_features.notna().all(axis=1) & target.notna()
    if not usable.any():
        raise ValueError(f"No complete rows available for max_lags={max_lags}")

    model.fit(lagged_features[usable], target[usable])
    feature_importances = model.feature_importances_
    # Column position i holds lag i + 1.
    top_positions = np.argsort(feature_importances)[-num_of_lags:]
    return sorted(int(position) + 1 for position in top_positions)

def filter_conflicting_transforms(transform_combination):
    """Return True when the combination has at most one differencing
    transform and at most one scaler (matched by exact type)."""
    exclusive_groups = (
        {Differences, AutoDifferences, AutoSeasonalDifferences, AutoSeasonalityAndDifferences},
        {LocalStandardScaler, LocalMinMaxScaler, LocalRobustScaler, LocalBoxCox},
    )
    kinds = [type(transform) for transform in transform_combination]
    return all(
        sum(kind in group for kind in kinds) <= 1
        for group in exclusive_groups
    )

def evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, optimal_lags_list, date_features=None):
    """Grid-search MLForecast configurations and score each by MAPE on ``test_df``.

    Every combination of lag set x target-transform combination (single
    transforms and non-conflicting pairs) x lag-transform mapping x model is
    fitted on ``train_df`` and asked for ``len(test_df)`` forecast steps.

    Parameters:
        train_df (pd.DataFrame): training frame passed to ``MLForecast.fit``.
        test_df (pd.DataFrame): hold-out frame; only its ``y`` column is read
            and the frame is not modified.
        models (dict): name -> estimator. The name is also used to look up
            that model's column in the predictions frame.
        target_transforms (list): candidate target transforms.
        lag_transforms_options (list[dict]): candidate ``lag_transforms`` mappings.
        optimal_lags_list (list[list[int]]): candidate lag sets.
        date_features (list, optional): defaults to ``['dayofweek', 'month']``.

    Returns:
        dict: ``(model_name, transform_combination, tuple(lags),
        frozenset((lag, tuple(transforms)), ...)) -> MAPE`` for every
        configuration that ran without error.
    """
    if date_features is None:
        date_features = ['dayofweek', 'month']

    best_model = None
    best_error = float('inf')
    best_transforms = None
    best_lags = None
    best_lag_transforms = None
    results = {}

    valid_transform_combinations = list(chain(combinations(target_transforms, 1), combinations(target_transforms, 2)))
    valid_transform_combinations = [tc for tc in valid_transform_combinations if filter_conflicting_transforms(tc)]

    total_fits = len(models) * len(valid_transform_combinations) * len(optimal_lags_list) * len(lag_transforms_options)
    print(f"Total model fits to run: {total_fits}")

    # Read the targets once; forecasts are compared position by position so
    # the score does not depend on the two frames sharing an index.
    y_true = test_df['y'].to_numpy()
    horizon = len(y_true)

    fit_num = 0
    for optimal_lags in optimal_lags_list:
        for transform_combination in valid_transform_combinations:
            for lag_transforms in lag_transforms_options:
                for model_name, model in models.items():
                    fit_num += 1  # 1-based so the last line reads total/total
                    print(f"{fit_num}/{total_fits} Training {model_name} with transforms {transform_combination}, lags {optimal_lags}, and lag_transforms {lag_transforms}...")

                    try:
                        fcst = MLForecast(
                            models=[model],
                            freq='D',
                            lags=optimal_lags,
                            target_transforms=list(transform_combination),
                            date_features=date_features,
                            num_threads=1,
                            lag_transforms=lag_transforms,
                        )

                        # Fit the model
                        fcst.fit(train_df)

                        # Predict
                        predictions = fcst.predict(h=horizon)

                        # Score without writing a 'forecast' column into the caller's frame
                        y_pred = predictions[model_name].to_numpy()
                        error = mape(y_true, y_pred)

                        results[(model_name, transform_combination, tuple(optimal_lags), frozenset((k, tuple(v)) for k, v in lag_transforms.items()))] = error
                        print(f"{model_name} MAPE: {error:.2f}% with transforms {transform_combination}, lags {optimal_lags}, and lag_transforms {lag_transforms}")

                        if error < best_error:
                            best_error = error
                            best_model = model_name
                            best_transforms = transform_combination
                            best_lags = optimal_lags
                            best_lag_transforms = lag_transforms

                    except Exception as e:
                        # Deliberate best-effort: one failing configuration
                        # must not abort the whole search.
                        print(f"Skipping combination due to error: {e}")

    print(f"Best Model: {best_model} with MAPE {best_error:.2f}% using transforms {best_transforms}, lags {best_lags}, and lag_transforms {best_lag_transforms}")
    return results

In [None]:
# optimal_lags = select_important_lags(train_df, 'y', max_lags=400, model=RandomForestRegressor(), num_of_lags=20)
# optimal_lags

In [None]:
# Candidate lag sets for the grid search. The same lists appear, with names,
# in `optimal_lags_map` further down; those names are repeated here.
optimal_lags_list = [
    [1, 7, 15, 18, 173, 200, 335, 368, 369, 379],  # rndforest_set_10_year_1
    [1, 6, 11, 15, 173, 200, 335, 378, 379, 384],  # rndforest_set_10_year_2
    [1, 6, 10, 11, 15, 182, 183, 185, 187, 193],  # rndforest_set_10_200
    [1, 4, 7, 10, 14, 22, 173, 179, 182, 183, 184, 185, 187, 188, 193],  # rndforest_set_15_200
    [1, 5, 6, 7, 11, 21, 116, 173, 180, 184, 187, 188, 334, 335, 368, 369, 373, 378, 379, 384],  # rndforest_set_20
    
    [1, 2, 5, 6, 9, 11, 12, 14, 131, 144, 146, 151, 183, 196, 210],  # catboost_set_15_210
    [1, 10, 11, 15, 16, 150, 172, 188, 198, 199, 222, 273, 336, 368, 384],  # catboost_set_15_year

]

In [None]:
# Estimators for the grid search. evaluate_models uses each key to look up
# that model's column in the predictions frame.
models = {
    "CatBoostRegressor": CatBoostRegressor(verbose=False),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100),
    "XGBRegressor": XGBRegressor(),
    # "LGBMRegressor": LGBMRegressor(),
    # "GradientBoostingRegressor": GradientBoostingRegressor(),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}

# Candidate target transforms. evaluate_models tries each one alone and in
# pairs, skipping pairs that filter_conflicting_transforms rejects.
target_transforms = [
    Differences([365]),  # some of these should not be combined with each other
    AutoDifferences(380), 
    AutoSeasonalDifferences(season_length=365, max_diffs=380), 
    AutoSeasonalityAndDifferences(max_season_length=380, max_diffs=380),
    LocalStandardScaler(), 
    LocalMinMaxScaler(), 
    LocalRobustScaler('mad'), 
    LocalRobustScaler('iqr'), 
    LocalBoxCox()
]

# Numba-compiled rolling means with fixed windows (14 and 30 observations).
@njit
def rolling_mean_14(x):
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    return rolling_mean(x, window_size=30)

# Candidate `lag_transforms` mappings for MLForecast: lag -> list of
# functions. The empty dict means no lag transforms.
lag_transforms_options = [
    {1: [expanding_mean], 7: [expanding_mean, rolling_mean_14], 30: [rolling_mean_30]},
    {1: [expanding_mean], 7: [rolling_mean_14], 30: [expanding_mean]},
    {1: [rolling_mean_14], 7: [rolling_mean_30], 30: [expanding_mean]},
    {1: [rolling_mean_14], 30: [expanding_mean]},
    {1: [rolling_mean_14]},
    {},
]

In [None]:
# Full grid search over models x transforms x lag sets x lag transforms.
results = evaluate_models(
    train_df=train_df,
    test_df=test_df,
    models=models,
    target_transforms=target_transforms,
    lag_transforms_options=lag_transforms_options,
    optimal_lags_list=optimal_lags_list,
)
print("Final Results:", results)


In [None]:
import json
def save_results(results, filename="forecast_results.json"):
    """Write ``results`` to ``filename`` as JSON, using ``str(key)`` for each key."""
    serializable_results = {}
    for key, value in results.items():
        # JSON object keys must be strings
        serializable_results[str(key)] = value
    with open(filename, "w") as f:
        json.dump(serializable_results, f, indent=4)
    print(f"Results saved to {filename}")

In [None]:
# Persist the grid-search scores (keys are stored as their str() form).
save_results(results, filename="results_mlforecast.json")

In [None]:
# Quick look at the raw MAPE values collected by the grid search.
results.values()

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load results from JSON. `results_json` is not used further in this cell:
# the table below is built from the in-memory `results` dict, whose keys
# still hold the transform objects.
filename = "results_mlforecast.json"
with open(filename, "r") as f:
    results_json = json.load(f)

# One row per evaluated configuration. Neither `data` nor `mape` is reused as
# a name here: elsewhere in the notebook they are the training frame and the
# metric function, and rebinding them breaks those cells on re-run.
result_rows = []
for key, mape_value in results.items():
    model, transforms, lags, lag_transforms = key
    transforms = tuple(t.__class__.__name__ for t in transforms)
    result_rows.append([model, transforms, lags, lag_transforms, mape_value])

df_results = pd.DataFrame(result_rows, columns=["Model", "Transforms", "Lags", "Lag Transforms", "MAPE"])

# 50 best configurations (lowest MAPE)
df_top_configs = df_results.nsmallest(50, "MAPE")

# Model Performance Summary
df_model_performance = df_results.groupby("Model")["MAPE"].agg(["mean", "min", "max", "count"]).sort_values("mean")

# Transformation Effectiveness (a pair of transforms counts once for each member)
df_transform_performance = df_results.explode("Transforms").groupby("Transforms")["MAPE"].mean().sort_values()

# Lags Effectiveness (each lag in a set is credited with that configuration's MAPE)
df_lag_performance = df_results.explode("Lags").groupby("Lags")["MAPE"].mean().sort_values()

# Visualization: Model Performance
plt.figure(figsize=(10, 5))
sns.boxplot(data=df_results, x="Model", y="MAPE")
plt.xticks(rotation=45)
plt.title("Model Performance Distribution")
plt.show()

# Visualization: Transformation Effectiveness
plt.figure(figsize=(10, 5))
df_transform_performance.plot(kind="bar", title="Average MAPE by Transformation")
plt.ylabel("Average MAPE")
plt.show()

# Visualization: Lag Effectiveness
plt.figure(figsize=(10, 5))
df_lag_performance.plot(kind="bar", title="Average MAPE by Lags")
plt.ylabel("Average MAPE")
plt.show()

# Summaries. Only the last expression of a cell is rendered, so the first
# three lines are no-ops unless moved to their own cells. The stray `/`
# line that used to end this cell has been removed.
"Top 50 Best Configurations", df_top_configs
"Model Performance Summary", df_model_performance
"Transformation Effectiveness", df_transform_performance
"Lag Effectiveness", df_lag_performance

In [None]:
def clean_frozenset(frozenset_str):
    """Parse the text form of a lag-transform frozenset into ``{lag: function_name}``.

    The input is converted with ``str()`` and scanned for entries shaped like
    ``(7, (CPUDispatcher(<function rolling_mean_14 at 0x...>), ...))``. Only
    the first function of each lag is captured.

    Returns:
        dict[int, str]: lag -> name of its first transform function; empty
        when nothing matches.
    """
    import re  # local import: `re` is only imported by a later cell

    frozenset_str = str(frozenset_str)
    # The match stops at the function name, so the trailing comma of a
    # one-element tuple repr, "...>),)", does not prevent it. The previous
    # pattern required ">)))" and never matched that form.
    matches = re.findall(r"\((\d+), \(CPUDispatcher\(<function (\w+)", frozenset_str)
    # Return the parsed mapping; an early `return frozenset_str` used to make
    # this line unreachable.
    return {int(k): v for k, v in matches}

# Inspect the last of the top configurations. `.iloc` selects by position;
# `[49]` looked up the row *label* 49, and df_top_configs keeps the labels of
# df_results, so that label is not guaranteed to exist.
clean_frozenset(df_top_configs['Lag Transforms'].iloc[-1])

In [None]:
import re

def stringify_transform(transform):
    """Text form (``str``) of a target-transform combination."""
    text = str(transform)
    return text

def clean_lag_transforms(lag_transforms):
    """Text form (``str``) of a lag-transforms entry."""
    text = str(lag_transforms)
    return text

def save_results(results, filename="forecast_results.json"):
    """Write ``results`` to ``filename`` as JSON.

    Each key ``(model, transforms, lags, lag_transforms)`` is flattened to
    ``(model, str(transforms), lags, str(lag_transforms))`` and stored as the
    ``str`` of that tuple.
    """
    serializable_results = {}
    for (model, transforms, lags, lag_transforms), score in results.items():
        flat_key = (model, stringify_transform(transforms), lags, clean_lag_transforms(lag_transforms))
        serializable_results[str(flat_key)] = score
    with open(filename, "w") as f:
        json.dump(serializable_results, f, indent=4)
    print(f"Results saved to {filename}")

In [None]:
# Persist the scores again, this time with the flattened string keys.
save_results(results, filename="results_mlforecast_strfied.json")

In [None]:
import json
import pandas as pd

import ast

# Load results
with open("results_mlforecast_strfied.json", "r") as f:
    loaded_results = json.load(f)

# Unpack results into structured format
unpacked_results = []
for key, mape_metr in loaded_results.items():
    # Each key is the str() of a tuple of strings and ints. literal_eval
    # rebuilds it while accepting only Python literals, so file content is
    # never executed (eval would run arbitrary code).
    model, transforms, lags, lag_transforms = ast.literal_eval(key)

    unpacked_results.append([model, transforms, lags, lag_transforms, mape_metr])

# Convert to DataFrame
df_results = pd.DataFrame(unpacked_results, columns=["Model", "Transforms", "Lags", "Lag Transforms", "MAPE"])

In [None]:
def clean_transforms(transform_str):
    """Reduce a stringified transform tuple to ``"ClassA, ClassB"``.

    Returns the input unchanged when it contains no
    ``<mlforecast.target_transforms.X object`` fragment.
    """
    class_names = re.findall(r"<mlforecast\.target_transforms\.(\w+) object", transform_str)
    if not class_names:
        return transform_str
    return ", ".join(class_names)

def clean_lag_transforms(transform_str):
    """Parse a stringified lag-transform frozenset into ``{lag: function_name}``.

    Only the first function listed for each lag is captured.
    """
    pattern = r"\((\d+), \(CPUDispatcher\(<function ([a-zA-Z0-9_]+)"
    parsed = {}
    for lag, function_name in re.findall(pattern, transform_str):
        parsed[int(lag)] = function_name
    return parsed

In [None]:
# Replace the raw repr strings with readable values, column by column.
for column, cleaner in (
    ('Transforms', clean_transforms),
    ('Lag Transforms', clean_lag_transforms),
):
    df_results[column] = df_results[column].apply(cleaner)

In [None]:
import seaborn as sns
df = df_results.copy()

# Readable names for the parsed lag-transform mappings ({lag: function name}).
lag_transforms_map = {
    "expanding_mean_rolling_14_rolling_30": {1: 'expanding_mean', 7: 'expanding_mean', 30: 'rolling_mean_30'},
    "expanding_mean_rolling_14": {1: 'expanding_mean', 7: 'rolling_mean_14', 30: 'expanding_mean'},
    "rolling_14_rolling_30_expanding": {1: 'rolling_mean_14', 7: 'rolling_mean_30', 30: 'expanding_mean'},
    "rolling_14_expanding": {1: 'rolling_mean_14', 30: 'expanding_mean'},
    "rolling_14_only": {1: 'rolling_mean_14'},
    "no_transform": {},
}

def map_lag_transforms(lag_transform_dict):
    """Name of the first entry in `lag_transforms_map` equal to the argument, else "unknown"."""
    matching_names = (
        name
        for name, transform in lag_transforms_map.items()
        if lag_transform_dict == transform
    )
    return next(matching_names, "unknown")

df['Lag Transform Name'] = df['Lag Transforms'].apply(map_lag_transforms)

# Readable names for the lag sets used in the grid search.
optimal_lags_map = {
    "rndforest_set_10_year_1": [1, 7, 15, 18, 173, 200, 335, 368, 369, 379],
    "rndforest_set_10_year_2": [1, 6, 11, 15, 173, 200, 335, 378, 379, 384],
    "rndforest_set_10_200": [1, 6, 10, 11, 15, 182, 183, 185, 187, 193],
    "rndforest_set_15_200": [1, 4, 7, 10, 14, 22, 173, 179, 182, 183, 184, 185, 187, 188, 193],
    "rndforest_set_20": [1, 5, 6, 7, 11, 21, 116, 173, 180, 184, 187, 188, 334, 335, 368, 369, 373, 378, 379, 384],
    "catboost_set_15_210": [1, 2, 5, 6, 9, 11, 12, 14, 131, 144, 146, 151, 183, 196, 210],
    "catboost_set_15_year": [1, 10, 11, 15, 16, 150, 172, 188, 198, 199, 222, 273, 336, 368, 384],
}

def map_lag_sets(lag_list):
    """Name of the first entry in `optimal_lags_map` whose lags, as a tuple, equal the argument; else "unknown"."""
    matching_names = (
        name
        for name, lags in optimal_lags_map.items()
        if lag_list == tuple(lags)
    )
    return next(matching_names, "unknown")

df['Lag_Set_Name'] = df['Lags'].apply(map_lag_sets)

top_df = df[df["MAPE"] < 40].copy()  # 1st analysis
# top_df = df[(df["MAPE"]<40) & (df['Model']=='XGBRegressor')].copy() # 2nd analysis
# top_df = df[(df['Model']=='XGBRegressor')
#             &
#             (df["MAPE"]<50)
#             # (df['Lag Transform Name']=='expanding_mean_rolling_14')
#             ].copy() # 3rd analysis

# Mean MAPE per model / transform / lag transform / lag set, best first.
top_models, top_transforms, top_lag_transforms, top_lags = (
    top_df.groupby(group_column)["MAPE"].mean().sort_values().reset_index()
    for group_column in ("Model", "Transforms", "Lag Transform Name", "Lag_Set_Name")
)

# Visualization
plt.figure(figsize=(12, 5))
sns.barplot(x=top_models["Model"], y=top_models["MAPE"], palette="viridis", hue=top_models["Model"])
plt.xticks(rotation=45)
plt.title("Average MAPE per Model")
plt.show()

plt.figure(figsize=(12, 5))
sns.barplot(x=top_transforms["Transforms"], y=top_transforms["MAPE"], palette="coolwarm", hue=top_transforms["Transforms"])
plt.xticks(rotation=90)
plt.title("Average MAPE per Transform")
plt.show()

plt.figure(figsize=(12, 5))
sns.barplot(x=top_lag_transforms["Lag Transform Name"], y=top_lag_transforms["MAPE"], palette="Blues", hue=top_lag_transforms["Lag Transform Name"])
plt.xticks(rotation=90)
plt.title("Average MAPE per Lag Transform")
plt.show()

plt.figure(figsize=(12, 5))
sns.barplot(x=top_lags["Lag_Set_Name"], y=top_lags["MAPE"], palette="Blues", hue=top_lags["Lag_Set_Name"])
plt.xticks(rotation=90)
plt.title("MAPE vs Number of Lags")
plt.show()

## 2nd run

In [None]:
def add_time_features(df):
    """Return a copy of ``df`` with calendar features derived from its dates.

    If the index is not a DatetimeIndex, the ``ds`` column is parsed and used
    as the index of the result. The input frame is never modified.

    Added columns:
        day_of_week: 0 (Monday) .. 6 (Sunday)
        is_weekend:  1 for Saturday/Sunday, else 0
        month:       1 .. 12
        season:      1 Winter (Dec-Feb), 2 Spring, 3 Summer, 4 Fall
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.set_index(pd.to_datetime(df["ds"]))  # new frame indexed by the parsed dates
    else:
        # Copy so the new columns are not written into the caller's frame.
        df = df.copy()

    df['day_of_week'] = df.index.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['month'] = df.index.month
    df['season'] = ((df['month'] % 12 + 3) // 3)  # 1:Winter, 2:Spring, 3:Summer, 4:Fall
    return df


# Calendar-feature versions of the train and test frames.
exp_train_df, exp_test_df = (
    add_time_features(frame) for frame in (train_df, test_df)
)

In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent (targets must be non-zero)."""
    absolute_pct = np.abs((y_true - y_pred) / y_true)
    return np.mean(absolute_pct) * 100

In [None]:
def evaluate_model_w_params(train_df, test_df, model, model_name, lags, target_transforms, date_features, lag_transforms, static_features=None):
    """Fit one MLForecast configuration and print its MAPE on ``test_df``.

    Parameters:
        train_df (pd.DataFrame): training frame; its time column must be ``ds``.
        test_df (pd.DataFrame): hold-out frame; only ``y`` is read and the
            frame is not modified.
        model: estimator passed to MLForecast as ``models``.
        model_name (str): column of the predictions frame holding this model's forecast.
        lags, target_transforms, date_features, lag_transforms: forwarded to MLForecast.
        static_features (list, optional): forwarded to ``MLForecast.fit``;
            ``None`` keeps the library default.

    Returns:
        None. The score is printed.
    """
    fcst = MLForecast(
        models=model,
        freq='D',
        lags=lags,
        target_transforms=list(target_transforms),
        date_features=date_features,
        num_threads=1,
        lag_transforms=lag_transforms,
    )

    # Fit the model
    fcst.fit(train_df, time_col="ds", static_features=static_features)

    # Predict
    predictions = fcst.predict(h=len(test_df['y']))

    # Compare by position. Assigning the forecast as a column aligns on the
    # index, which gives all-NaN values (and a NaN score) when test_df is
    # indexed by dates and the predictions are not.
    y_pred = predictions[model_name].to_numpy()
    error = mape(test_df['y'].to_numpy(), y_pred)

    print(f"{model_name} MAPE: {error:.2f}%")

In [None]:
def is_weekend(dates):
    """Return 1 for Saturday/Sunday and 0 otherwise, element-wise.

    Accepts a DatetimeIndex, a datetime Series, or values that are already
    day-of-week numbers (0 = Monday .. 6 = Sunday).
    """
    # Compare day-of-week numbers, not the timestamps themselves: a timestamp
    # never equals 5 or 6, so `dates.isin([5, 6])` on dates is always False.
    if hasattr(dates, "dayofweek"):
        days = dates.dayofweek
    elif hasattr(dates, "dt"):
        days = dates.dt.dayofweek
    else:
        days = dates  # already day-of-week numbers
    return days.isin([5, 6]).astype(int)
def get_season(dates):
    """Season number for each date: 1 Winter (Dec-Feb), 2 Spring, 3 Summer, 4 Fall."""
    shifted_month = dates.month % 12 + 3  # Dec -> 3, Jan -> 4, ..., Nov -> 14
    return shifted_month // 3

# Lasso on the frames that carry precomputed calendar columns.
# `static_features` is no longer passed: evaluate_model_w_params as defined
# above does not accept that keyword, so the call raised TypeError.
# NOTE(review): the extra columns (day_of_week, month, is_weekend, season)
# vary over time; confirm MLForecast should treat them as static features
# when none are declared.
evaluate_model_w_params(exp_train_df, exp_test_df, Lasso(), "Lasso", 
                        optimal_lags_map['rndforest_set_10_year_1'], 
                        [AutoDifferences(380)], 
                        # ['dayofweek', 'month', is_weekend, get_season], 
                        [], 
                        lag_transforms_options[3])

In [None]:
# XGBoost with the 20-lag set, differencing + min-max scaling, and calendar
# features computed by MLForecast (built-in names plus two callables).
evaluate_model_w_params(
    train_df=train_df,
    test_df=test_df,
    model=XGBRegressor(),
    model_name="XGBRegressor",
    lags=optimal_lags_map['rndforest_set_20'],
    target_transforms=[AutoDifferences(380), LocalMinMaxScaler()],
    # date_features=['dayofweek', 'month'],
    date_features=['dayofweek', 'month', is_weekend, get_season],
    lag_transforms=lag_transforms_options[2],
)

### lasso run w/ smaller window and fewer lags

In [None]:
def load_results(name):
    """Load a results file written by ``save_results`` into a tidy DataFrame.

    Parameters:
        name (str): path of the JSON file. Each key must be the ``str`` of a
            ``(model, transforms, lags, lag_transforms)`` tuple of literals.

    Returns:
        pd.DataFrame: columns Model, Transforms, Lags, Lag Transforms, MAPE,
        with the two transform columns cleaned by ``clean_transforms`` and
        ``clean_lag_transforms``.
    """
    import ast  # local import: not imported at file level

    with open(name, "r") as f:
        loaded_results = json.load(f)

    # Unpack results into structured format
    unpacked_results = []
    for key, mape_metr in loaded_results.items():
        # literal_eval only accepts Python literals, so file content is never
        # executed (eval would run arbitrary code).
        model, transforms, lags, lag_transforms = ast.literal_eval(key)

        unpacked_results.append([model, transforms, lags, lag_transforms, mape_metr])

    # Convert to DataFrame
    df_results = pd.DataFrame(unpacked_results, columns=["Model", "Transforms", "Lags", "Lag Transforms", "MAPE"])

    df_results['Transforms'] = df_results['Transforms'].apply(clean_transforms)
    df_results['Lag Transforms'] = df_results['Lag Transforms'].apply(clean_lag_transforms)

    return df_results

In [None]:
# Rank lags 1..100 with a random forest and keep the 15 most important.
optimal_lags = select_important_lags(
    train_df,
    target_col='y',
    max_lags=100,
    model=RandomForestRegressor(),
    num_of_lags=15,
)
optimal_lags

In [None]:
# Lasso-only search: fixed transform and lag transforms, many lag sets.
models = {"Lasso": Lasso()}
target_transforms = [AutoDifferences(380)]
lag_transforms_options = [{1: [rolling_mean_14], 30: [expanding_mean]}]

# Lag sets written out by hand, tried first.
manual_lag_sets = [
    [1, 6, 7, 11, 191, 199, 335],
    [1, 4, 14, 23, 136, 165, 177, 187, 188, 198],
    [1, 3, 4, 5, 6, 7, 21, 23, 135, 136],
    [1, 4, 5, 6, 7, 13, 14, 23, 68, 129],
    [1, 3, 4, 5, 6, 7, 14, 16, 21, 23],
    [1, 3, 4, 5, 6, 7, 10, 11, 14, 15, 20, 21, 23, 25, 41],
]

# (max_lags, num_of_lags) settings for random-forest lag selection, in the
# order they are evaluated. No seed is set, so the selected lags can change
# between runs.
lag_search_grid = [
    (40, 10), (40, 15), (40, 20), (40, 25), (40, 30), (40, 35), (40, 40),
    (70, 10), (70, 15), (70, 20),
    # (100, 20), (100, 25),  # disabled
    (100, 30), (100, 40), (100, 50), (100, 60), (100, 70),
    (150, 10), (150, 20), (150, 30), (150, 40), (150, 50), (150, 60), (150, 70),
    (200, 20), (200, 30), (200, 40),
    (250, 30), (250, 90), (250, 60),
]

lags_lasso = manual_lag_sets + [
    select_important_lags(train_df, 'y', max_lags=window, model=RandomForestRegressor(), num_of_lags=count)
    for window, count in lag_search_grid
]

results = evaluate_models(
    train_df, test_df, models, target_transforms, lag_transforms_options,
    optimal_lags_list=lags_lasso,
)
save_results(results, "results_mlforecast_lasso.json")

In [None]:
# Reload the Lasso search results as a tidy table.
df_results_lasso = load_results(name="results_mlforecast_lasso.json")

In [None]:
# Display the reloaded Lasso results table.
df_results_lasso

In [None]:
# Readable name -> parsed lag-transform mapping ({lag: function name}).
# The values have the shape returned by clean_lag_transforms, which keeps
# one function name per lag.
LAG_TRANSFORM_MAPS = {
    "expanding_mean_rolling_14_rolling_30": {1: 'expanding_mean', 7: 'expanding_mean', 30: 'rolling_mean_30'},
    "expanding_mean_rolling_14": {1: 'expanding_mean', 7: 'rolling_mean_14', 30: 'expanding_mean'},
    "rolling_14_rolling_30_expanding": {1: 'rolling_mean_14', 7: 'rolling_mean_30', 30: 'expanding_mean'},
    "rolling_14_expanding": {1: 'rolling_mean_14', 30: 'expanding_mean'},
    "rolling_14_only": {1: 'rolling_mean_14'},
    "no_transform": {},
}

def map_lag_transforms(lag_transform_dict, lag_transforms_map):
    """Return the label of the first entry in ``lag_transforms_map`` equal to ``lag_transform_dict``.

    Entries are checked in the mapping's iteration order using ``==``.
    The string "unknown" is returned when no entry matches.
    """
    matching_labels = (
        label
        for label, candidate in lag_transforms_map.items()
        if lag_transform_dict == candidate
    )
    return next(matching_labels, "unknown")

def map_lag_sets(lag_list, optimal_lags_map):
    """Return the name under which ``lag_list`` is registered in ``optimal_lags_map``.

    Parameters
    ----------
    lag_list : sequence of int
        Lag set to look up. Lists, tuples and arrays are all accepted; both sides
        are normalised to tuples before comparing, so a list (e.g. one read back
        from JSON) matches an equal tuple or list stored in the map.
    optimal_lags_map : dict
        Maps a name to a sequence of lags.

    Returns
    -------
    str
        The first matching name in the mapping's iteration order, or "unknown"
        when nothing matches or ``lag_list`` is not iterable (e.g. None / NaN).
    """
    try:
        wanted = tuple(lag_list)
    except TypeError:
        # Non-iterable cell values cannot match any lag set.
        return "unknown"

    for name, lags in optimal_lags_map.items():
        if wanted == tuple(lags):
            return name
    return "unknown"

def show_results(top_df):
    """Plot the average MAPE of ``top_df`` along four grouping dimensions.

    One bar chart is drawn per grouping column — "Model", "Transforms",
    "Lag Transform Name" and "Lag_Set_Name" — with the groups ordered by
    ascending mean MAPE.

    Parameters
    ----------
    top_df : pandas.DataFrame
        Must contain a "MAPE" column plus the four grouping columns listed above.

    Returns
    -------
    None
        Each figure is displayed with ``plt.show()``.
    """
    # Imported locally so the function does not depend on notebook-level aliases:
    # the notebook's visible import cell has `import matplotlib as plt` (the bare
    # package, which has no callable `figure`) and `import seaborn` without `sns`.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # (grouping column, palette, x tick rotation, figure title)
    chart_specs = [
        ("Model", "viridis", 45, "Average MAPE per Model"),
        ("Transforms", "coolwarm", 90, "Average MAPE per Transform"),
        ("Lag Transform Name", "Blues", 90, "Average MAPE per Lag Transform"),
        ("Lag_Set_Name", "Blues", 90, "MAPE vs Number of Lags"),
    ]

    # Aggregate everything up front so a missing column fails before any figure is drawn.
    averaged_frames = [
        top_df.groupby(column)["MAPE"].mean().sort_values().reset_index()
        for column, _, _, _ in chart_specs
    ]

    for (column, palette, rotation, title), averaged in zip(chart_specs, averaged_frames):
        plt.figure(figsize=(12, 5))
        sns.barplot(x=averaged[column], y=averaged["MAPE"], palette=palette, hue=averaged[column])
        plt.xticks(rotation=rotation)
        plt.title(title)
        plt.show()

In [None]:
# Labels for the lag sets of the lasso run; the pattern appears to encode
# "wndw_<max_lags>_len_<num_of_lags>" — TODO confirm for the entries defined out of view.
# NOTE: the trailing ")" in most labels looks like a typo but is part of the key:
# stripping it would make 'wndw_150_len_10)' collide with 'wndw_150_len_10'.
# NOTE(review): zip() pairs the labels with the first len(optimal_lags_names) rows of
# df_results_lasso['Lags'] — this assumes those rows hold each lag set exactly once,
# in this order; TODO confirm against the row order produced by evaluate_models.
optimal_lags_names = ['wndw_400_len_7', 'wndw_200_len_10','wndw_150_len_10_1','wndw_150_len_10', 'wndw_100_len_10',
                      'wndw_100_len_15','wndw_40_len_10)','wndw_40_len_15)','wndw_40_len_20)', 'wndw_40_len_25)',
                    'wndw_40_len_30)', 'wndw_40_len_35)', 'wndw_40_len_40)', 'wndw_70_len_10)', 'wndw_70_len_15)',
                    'wndw_70_len_20)','wndw_100_len_30)','wndw_100_len_40)','wndw_100_len_50)','wndw_100_len_60)','wndw_100_len_70)',
                    'wndw_150_len_10)', 'wndw_150_len_20)', 'wndw_150_len_30)', 'wndw_150_len_40)', 'wndw_150_len_50)',
                    'wndw_150_len_60)', 'wndw_150_len_70)', 'wndw_200_len_20)', 'wndw_200_len_30)', 'wndw_200_len_40)',
                    'wndw_250_len_30)', 'wndw_250_len_90)', 'wndw_250_len_60)',
]
optimal_lags_map = dict(zip(optimal_lags_names, df_results_lasso['Lags'].values))

In [None]:
# Label every row of the lasso results with readable names, then plot a filtered subset.
df = df_results_lasso.copy()

# Human-readable names for the lag-transform configuration and the lag set of each row.
df['Lag Transform Name'] = df['Lag Transforms'].apply(map_lag_transforms, args=(LAG_TRANSFORM_MAPS,))
df['Lag_Set_Name'] = df['Lags'].apply(map_lag_sets, args=(optimal_lags_map,))

# 1st analysis: every row with MAPE below 40.
top_df = df.loc[df["MAPE"] < 40].copy()
# 2nd analysis:
# top_df = df[(df["MAPE"]<40) & (df['Model']=='XGBRegressor')].copy()
# 3rd analysis:
# top_df = df[(df['Model']=='XGBRegressor')
#             &
#             (df["MAPE"]<50)
#             # (df['Lag Transform Name']=='expanding_mean_rolling_14')
#             ].copy()

show_results(top_df)

In [None]:
df

### xgboost 

In [None]:
def select_important_lags_extended(train_df, target_col, max_lags, model=None, num_of_lags_list=None):
    """Rank lags 1..max_lags of ``target_col`` by feature importance and return the top-k lags for several k.

    A lag matrix is built by shifting the target column, ``model`` is fitted once on
    it, and the fitted ``feature_importances_`` are reused for every requested k.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing ``target_col``.
    target_col : str
        Name of the column to build lags from and to predict.
    max_lags : int
        Largest lag to consider; lags ``1..max_lags`` become the features.
    model : object with ``fit(X, y)`` and ``feature_importances_``, optional
        Defaults to a fresh ``RandomForestRegressor(n_estimators=50)`` per call
        (previously one instance was created at definition time and shared by all calls).
    num_of_lags_list : iterable of int, optional
        How many top lags to return for each output list. Defaults to ``[5, 10, 15]``.
        A value of 0 (or less) yields an empty list.

    Returns
    -------
    list[list[int]]
        One ascending list of lag numbers per entry of ``num_of_lags_list``.
    """
    if model is None:
        model = RandomForestRegressor(n_estimators=50)
    if num_of_lags_list is None:
        num_of_lags_list = [5, 10, 15]

    target = train_df[target_col]
    lagged_features = pd.concat(
        [target.shift(lag).rename(f'lag_{lag}') for lag in range(1, max_lags + 1)],
        axis=1,
    )

    # Keep only rows where every lag AND the target are present. Without NaNs in the
    # target this is exactly the rows after the first `max_lags`; with interior NaNs it
    # keeps X and y the same length instead of slicing y positionally.
    usable = (lagged_features.notna().all(axis=1) & target.notna()).to_numpy()
    model.fit(lagged_features[usable], target[usable])

    # Rank once (most important first); column position i corresponds to lag i + 1.
    ranked = np.argsort(model.feature_importances_)[::-1]

    return [
        sorted(int(position) + 1 for position in ranked[:max(num_of_lags, 0)])
        for num_of_lags in num_of_lags_list
    ]

In [None]:
# Rank lags with the function's default model for four look-back windows
# (100, 200, 300 and 400 lags), requesting several top-k cut-offs per window.
# Each call fits the model again on its own lag matrix.
# NOTE(review): no random_state is passed to the default random forest, so the
# selected lags may differ between runs — confirm whether reproducibility matters here.
rnd_forest_lags_list_100 = select_important_lags_extended(train_df, 'y', 100, num_of_lags_list=[33, 66, 100])
rnd_forest_lags_list_200 = select_important_lags_extended(train_df, 'y', 200, num_of_lags_list=[33, 66, 100, 150])
rnd_forest_lags_list_300 = select_important_lags_extended(train_df, 'y', 300, num_of_lags_list=[50, 100, 150, 200, 250])
rnd_forest_lags_list_400 = select_important_lags_extended(train_df, 'y', 400, num_of_lags_list=[50, 100, 150, 200, 250, 300])

In [None]:
# Concatenate the per-window results into one list of candidate lag sets
# (3 + 4 + 5 + 6 = 18 sets, one per requested cut-off).
rnd_forest_lags_list = rnd_forest_lags_list_100 + rnd_forest_lags_list_200 + rnd_forest_lags_list_300 + rnd_forest_lags_list_400

In [None]:
# models = {"XGBRegressor": XGBRegressor()}
# target_transforms = [AutoDifferences(380), LocalMinMaxScaler()]
# lag_transforms_options = [{1: [rolling_mean_14], 7: [rolling_mean_30], 30: [expanding_mean]}]

# results = evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, 
#                           optimal_lags_list = rnd_forest_lags_list
# )
# save_results(results, "results_mlforecast_xgboost.json")

фыва

Думаю, нет смысла тестировать XGBoost дальше: с разными наборами признаков он не стал сильно лучше.

In [None]:
# Reload the stored XGBoost results from disk.
# NOTE(review): the cell that would write this file (evaluate_models + save_results)
# is commented out above, so a fresh run depends on results_mlforecast_xgboost.json
# already existing.
df_results_xgboost = load_results("results_mlforecast_xgboost.json")

### more linear models

In [None]:
# Fixed lag sets evaluated with the linear models in this section.
# Judging by the names given to the same lists in optimal_lags_map further down, the
# first five come from random-forest selection and the last two from CatBoost.
optimal_lags_list = [
    [1, 7, 15, 18, 173, 200, 335, 368, 369, 379],
    [1, 6, 11, 15, 173, 200, 335, 378, 379, 384],
    [1, 6, 10, 11, 15, 182, 183, 185, 187, 193],
    [1, 4, 7, 10, 14, 22, 173, 179, 182, 183, 184, 185, 187, 188, 193],
    [1, 5, 6, 7, 11, 21, 116, 173, 180, 184, 187, 188, 334, 335, 368, 369, 373, 378, 379, 384],
    
    [1, 2, 5, 6, 9, 11, 12, 14, 131, 144, 146, 151, 183, 196, 210],
    [1, 10, 11, 15, 16, 150, 172, 188, 198, 199, 222, 273, 336, 368, 384],

]

In [None]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet, BayesianRidge, HuberRegressor, SGDRegressor
from sklearn.svm import SVR 

# Regressors compared in this section; the keys are presumably the names that end up
# in the results' 'Model' column — verify against evaluate_models.
# SVR is run with its default kernel; the commented variants show other kernels
# (SVR's constructor arguments are keyword-only, so the kernel must be passed by name).
models = {
    "LinearRegression": LinearRegression(),
    "ElasticNet": ElasticNet(),
    "BayesianRidge": BayesianRidge(),
    "HuberRegressor": HuberRegressor(),
    "SGDRegressor": SGDRegressor(),
    # "SVR": SVR(kernel="poly"),
    "SVR": SVR(),
    # "SVR": SVR(kernel="sigmoid"),
}

# Candidate target transforms handed to evaluate_models.
# NOTE(review): max_diffs=380 is reused for every differencing transform, including
# the seasonal ones — it looks like a season-length-sized value; confirm it is intended.
target_transforms = [
    AutoDifferences(380), 
    AutoSeasonalDifferences(season_length=365, max_diffs=380), 
    AutoSeasonalityAndDifferences(max_season_length=380, max_diffs=380),
    LocalStandardScaler(), 
    LocalMinMaxScaler(), 
    LocalBoxCox()
]

# Fixed-window wrappers around rolling_mean, used as entries of
# lag_transforms_options below.
# NOTE(review): these names are already referenced earlier in the notebook; if they
# are also defined above, this cell is a duplicate definition — confirm.
@njit
def rolling_mean_14(x):
    # rolling_mean of x with window_size=14
    return rolling_mean(x, window_size=14)
@njit
def rolling_mean_30(x):
    # rolling_mean of x with window_size=30
    return rolling_mean(x, window_size=30)

# Lag-transform configurations to evaluate, one dict per option in the form
# {lag: [transform functions for that lag]}; the empty dict means no lag transforms.
lag_transforms_options = [
    # {1: [expanding_mean], 7: [expanding_mean, rolling_mean_14], 30: [rolling_mean_30]},
    {1: [expanding_mean], 7: [rolling_mean_14], 30: [expanding_mean]},
    {1: [rolling_mean_14], 7: [rolling_mean_30], 30: [expanding_mean]},
    {1: [rolling_mean_14], 30: [expanding_mean]},
    {1: [rolling_mean_14]},
    {},
]

# Run evaluate_models for the linear/SVR models over the fixed lag sets.
# NOTE(review): other sections persist with save_results(results, path); here
# process_results is used and its return value is bound to df_results_simple —
# confirm it also writes the JSON file that the next cell reloads with load_results.
results = evaluate_models(train_df, test_df, models, target_transforms, lag_transforms_options, 
                          optimal_lags_list
)
df_results_simple = process_results(results, "results_mlforecast_simple.json")

In [None]:
df_results_simple = load_results("results_mlforecast_simple.json") 

In [None]:
# Names for the fixed lag sets evaluated with the linear models; used to label result rows.
optimal_lags_map = {
    "rndforest_set_10_year_1": [1, 7, 15, 18, 173, 200, 335, 368, 369, 379],
    "rndforest_set_10_year_2": [1, 6, 11, 15, 173, 200, 335, 378, 379, 384],
    "rndforest_set_10_200": [1, 6, 10, 11, 15, 182, 183, 185, 187, 193],
    "rndforest_set_15_200": [1, 4, 7, 10, 14, 22, 173, 179, 182, 183, 184, 185, 187, 188, 193],
    "rndforest_set_20": [1, 5, 6, 7, 11, 21, 116, 173, 180, 184, 187, 188, 334, 335, 368, 369, 373, 378, 379, 384],
    "catboost_set_15_210": [1, 2, 5, 6, 9, 11, 12, 14, 131, 144, 146, 151, 183, 196, 210],
    "catboost_set_15_year": [1, 10, 11, 15, 16, 150, 172, 188, 198, 199, 222, 273, 336, 368, 384],
}

df = df_results_simple.copy()

# Attach readable names for the lag-transform configuration and the lag set of each row.
df['Lag Transform Name'] = df['Lag Transforms'].apply(lambda x: map_lag_transforms(x, LAG_TRANSFORM_MAPS))
df['Lag_Set_Name'] = df['Lags'].apply(lambda x: map_lag_sets(x, optimal_lags_map))

# Exactly one analysis is active at a time; the alternatives stay commented out.
# (The 1st-analysis assignment used to run and was overwritten on the very next line.)
# top_df = df[df["MAPE"]<40].copy() # 1st analysis
top_df = df[(df["MAPE"]<40) & (df['Model']=='SGDRegressor')].copy() # 2nd analysis
# top_df = df[(df['Model']=='XGBRegressor')
#             &
#             (df["MAPE"]<50)
#             # (df['Lag Transform Name']=='expanding_mean_rolling_14')
#             ].copy() # 3rd analysis

show_results(top_df)

In [None]:
df