In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
from typing import Tuple, Union, Literal, List, Dict, Optional, Any, Mapping
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.preprocessing import StandardScaler
from scipy.signal import butter, lfilter, lfilter_zi
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from utils import plot_time_series, compute_metrics, plot_predictions, remove_outliers_zscore, plot_errors
import warnings
%load_ext autoreload
%autoreload 2
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

# Table of Contents
1. [Baseline pipeline recap](#Baseline-pipeline-recap)
2. [Filter-by-difference cleaning](#Filter-by-difference-cleaning)
3. [Denoising: Low-Pass FFT Filter](#Denoising:-Low-Pass-FFT-Filter)
4. [Low-pass-FFT-data-leakage](#Low-pass-FFT-data-leakage)
5. [Denoising: Butterworth filter](#Denoising:-Butterworth-filter)
6. [Denoising: Mean filter](#Denoising:-Mean-filter)
7. [Domain Feature Engineering](#Domain-Feature-Engineering)
8. [Statistical Features](#Statistical-Features)
9. [Lag features](#Lag-features)
10. [Conclusions from deep data cleaning](#Conclusions-from-deep-data-cleaning)
11. [Model Selection: Random Forest](#Model-Selection:-Random-Forest)
12. [Model Selection: CatBoost](#Model-Selection:-CatBoost)
13. [Model Selection: LSTM](#Model-Selection:-LSTM)
14. [Model Selection: MLP](#Model-Selection:-MLP)
15. [Conclusions](#Conclusions)

# Baseline pipeline recap

In [None]:
def add_lag_features(
    df: pd.DataFrame,
    columns: Optional[Union[str, List[str]]] = None,
    lags: Union[int, List[int]] = 1,
    drop_na: bool = True,
) -> pd.DataFrame:
    """
    Add lag features to DataFrame.

    For every requested column and lag ``k`` a new column ``"<col>_lag<k>"``
    holding ``df[col].shift(k)`` is appended. The input frame is not modified.

    Parameters:
    - df: DataFrame
    - columns: column name or list of column names (default: all numeric columns)
    - lags: int or list of lag periods (default: 1)
    - drop_na: bool. If True, rows containing any NaN are dropped; if False,
      NaNs are backward-filled instead (default: True)

    Returns: DataFrame with lag features
    """
    df_result = df.copy()

    # Normalise the arguments so scalars and lists are both accepted
    if columns is None:
        columns = df_result.select_dtypes(include="number").columns.tolist()
    elif isinstance(columns, str):
        columns = [columns]
    if isinstance(lags, (int, np.integer)):
        lags = [lags]

    # Create lag features
    for col in columns:
        for lag in lags:
            df_result[f"{col}_lag{lag}"] = df_result[col].shift(lag)

    if drop_na:
        return df_result.dropna()
    # Backward fill the NaNs introduced at the start of each shifted column
    return df_result.bfill()

In [None]:
def add_rolling_features(df, columns, window_sizes=7, stats=("mean", "median"), drop_na=True):
    """
    Add rolling features to DataFrame.

    For every column, window and statistic a new column
    ``"<col>_roll<window>_<stat>"`` is appended. The input frame is not modified.

    Parameters:
    - df: DataFrame
    - columns: column name or list of column names
    - window_sizes: int or list of window sizes (default: 7)
    - stats: statistic name or list of names out of
      ['mean', 'median', 'std', 'min', 'max', 'skew', 'kurt']
    - drop_na: bool. If True, rows containing any NaN are dropped; if False,
      NaNs are backward-filled instead (default: True)

    Returns: DataFrame with rolling features

    Raises: ValueError if an unsupported statistic name is requested
    """
    supported_stats = ("mean", "median", "std", "min", "max", "skew", "kurt")

    # Convert single values to lists
    if isinstance(window_sizes, (int, np.integer)):
        window_sizes = [window_sizes]
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(stats, str):
        stats = [stats]

    # Fail loudly on typos instead of silently producing no feature
    unknown = [stat for stat in stats if stat not in supported_stats]
    if unknown:
        raise ValueError(
            f"Unsupported rolling stats {unknown}; choose from {list(supported_stats)}"
        )

    df_result = df.copy()

    # Create rolling features; each statistic is a method on the Rolling object
    for col in columns:
        for window in window_sizes:
            rolling = df_result[col].rolling(window)
            for stat in stats:
                df_result[f"{col}_roll{window}_{stat}"] = getattr(rolling, stat)()

    if drop_na:
        return df_result.dropna()
    # Backward fill the NaNs at the start of each rolling column
    return df_result.bfill()

In [None]:
def _build_model(model_name: str, model_params: Dict[str, Any]):
    """Return a fresh, unfitted regressor for ``model_name``."""
    if model_name == "RF":
        return RF(**model_params)
    if model_name == "LinReg":
        # Imported locally: the notebook's import cell does not bring this name in.
        from sklearn.linear_model import LinearRegression

        return LinearRegression(**model_params)
    if model_name == "CatBoost":
        params = dict(model_params)  # copy so the caller's dict is not modified
        params.setdefault("verbose", False)
        params.setdefault("random_seed", SEED)
        return CatBoostRegressor(**params)
    raise ValueError(f"Unknown model_name: {model_name}")


def _fit_and_predict(x_fit, y_fit, x_eval, model_name: str, model_params: Dict[str, Any]):
    """Scale features (scaler fitted on ``x_fit`` only), fit a model, predict on ``x_eval``."""
    x_scaler = StandardScaler()
    x_fit_scaled = x_scaler.fit_transform(x_fit)
    x_eval_scaled = x_scaler.transform(x_eval)

    model = _build_model(model_name, model_params)
    model.fit(x_fit_scaled, y_fit)
    return model, model.predict(x_eval_scaled)


def eval_model(
    x_train: pd.DataFrame,
    y_train: pd.Series,
    x_test: pd.DataFrame,
    y_test: pd.Series,
    n_splits: int = 3,
    model_name: Literal["RF", "LinReg", "CatBoost"] = "RF",
    model_params: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Evaluate a time series model using TimeSeriesSplit cross-validation
    on the training set, then refit on the full train data and evaluate on test.

    In every fit the StandardScaler is fitted on the fitting portion only and
    then applied to the validation/test portion. Relies on the module-level
    ``SEED`` constant and on ``compute_metrics`` returning ``(mae, rmse, mape)``.

    Parameters
    ----------
    x_train : pd.DataFrame
        Training features.
    y_train : pd.Series
        Training target.
    x_test : pd.DataFrame
        Test features.
    y_test : pd.Series
        Test target.
    n_splits : int, default=3
        Number of time-series CV folds.
    model_name : {'RF','LinReg','CatBoost'}, default='RF'
        Model identifier.
    model_params : dict or None
        Keyword arguments for the selected model.

    Returns
    -------
    results : dict
        - 'cv_mae'   : average MAE over CV folds
        - 'cv_rmse'  : average RMSE over CV folds
        - 'cv_mape'  : average MAPE over CV folds
        - 'test_mae' : MAE on the final test set
        - 'test_rmse': RMSE on the final test set
        - 'test_mape': MAPE on the final test set
        - 'y_pred_test': model predictions on test set
        - 'model'      : fitted final model

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported identifiers.
    """
    np.random.seed(SEED)

    if model_params is None:
        model_params = {}

    tscv = TimeSeriesSplit(n_splits=n_splits)
    cv_mae_list: List[float] = []
    cv_rmse_list: List[float] = []
    cv_mape_list: List[float] = []

    # --- Cross-validation ---
    for train_idx, val_idx in tscv.split(x_train):
        y_train_cv = y_train.iloc[train_idx].to_numpy().ravel()
        y_val_cv = y_train.iloc[val_idx].to_numpy().ravel()

        _, y_pred_cv = _fit_and_predict(
            x_train.iloc[train_idx, :],
            y_train_cv,
            x_train.iloc[val_idx, :],
            model_name,
            model_params,
        )

        mae_err, rmse_err, mape_err = compute_metrics(y_val_cv, y_pred_cv)
        cv_mae_list.append(float(mae_err))
        cv_rmse_list.append(float(rmse_err))
        cv_mape_list.append(float(mape_err))

    cv_mae = float(np.mean(cv_mae_list))
    cv_rmse = float(np.mean(cv_rmse_list))
    cv_mape = float(np.mean(cv_mape_list))

    # --- Final model training on full training set ---
    model, y_pred_test = _fit_and_predict(
        x_train,
        y_train.to_numpy().ravel(),
        x_test,
        model_name,
        model_params,
    )
    mae_err_test, rmse_err_test, mape_err_test = compute_metrics(y_test, y_pred_test)

    return {
        "cv_mae": round(cv_mae, 2),
        "cv_rmse": round(cv_rmse, 2),
        "cv_mape": round(cv_mape, 2),
        "test_mae": round(float(mae_err_test), 2),
        "test_rmse": round(float(rmse_err_test), 2),
        "test_mape": round(float(mape_err_test), 2),
        "y_pred_test": y_pred_test,
        "model": model,
    }

In [None]:
def run_baseline(
    df: pd.DataFrame,
    split_index: int,
    target: str,
    z_threshold: float
) -> Tuple[Dict[str, Any], pd.DataFrame, pd.Series]:
    """
    Execute the baseline experiment end to end.

    Steps, in order:
    1. positional train/test split at ``split_index``;
    2. z-score outlier cleaning: statistics are learned on train and re-used
       for the test features (the test target is left untouched);
    3. lag features for GenRPM and GenPh1Temp (lags 1, 2, 3);
    4. rolling median/std/min/max for GenRPM, WindSpeed and GenPh1Temp
       (windows 5, 10, 15);
    5. Random Forest evaluation through ``eval_model``.

    Parameters
    ----------
    df : pd.DataFrame
        Complete dataset containing the features and the target.
    split_index : int
        Row position of the split: ``df[:split_index]`` is train,
        ``df[split_index:]`` is test.
    target : str
        Name of the column to predict.
    z_threshold : float
        Z-score threshold passed to ``remove_outliers_zscore``.

    Returns
    -------
    eval_results : dict
        Output of ``eval_model`` (CV and test metrics, predictions, model).
    x_clean_test : pd.DataFrame
        Engineered test feature matrix that was fed to the model.
    y_test : pd.Series
        Original (uncleaned) test target.
    """
    # --- 1) chronological split ---
    df_train = df[:split_index].copy()
    df_test = df[split_index:].copy()

    # --- 2) z-score cleaning: fit on train, re-apply the same stats to test features ---
    df_clean_train, z_score_stats = remove_outliers_zscore(
        df_train, threshold=z_threshold, nan_treatment="ffill"
    )

    features = [name for name in df.columns if name != target]
    x_clean_test, _ = remove_outliers_zscore(
        df_test[features],
        threshold=z_threshold,
        nan_treatment="ffill",
        stats=z_score_stats,
    )

    # The test target stays raw; only the features are cleaned
    df_clean_test = pd.concat([x_clean_test, df_test[target]], axis=1)

    # --- 3) + 4) identical feature engineering for both splits ---
    lag_settings = dict(
        columns=["GenRPM", "GenPh1Temp"],
        lags=[1, 2, 3],
        drop_na=False,
    )
    rolling_settings = dict(
        window_sizes=[5, 10, 15],
        columns=["GenRPM", "WindSpeed", "GenPh1Temp"],
        stats=["median", "std", "min", "max"],
        drop_na=False,
    )

    df_train_feat = add_rolling_features(
        add_lag_features(df_clean_train, **lag_settings), **rolling_settings
    )
    df_test_feat = add_rolling_features(
        add_lag_features(df_clean_test, **lag_settings), **rolling_settings
    )

    # To evaluate "pure filtering" without feature engineering, use
    # df_clean_train / df_clean_test directly instead of the two frames above.

    # --- 5) model matrices: everything except the target is a feature ---
    feature_cols = [name for name in df_train_feat.columns if name != target]

    x_clean_train = df_train_feat.loc[:, feature_cols].copy()
    y_clean_train = df_train_feat[target].copy()

    x_clean_test = df_test_feat.loc[:, feature_cols].copy()
    y_test = df_test[target].copy()  # evaluation uses the original target

    # --- 6) baseline Random Forest ---
    rf_params = dict(n_estimators=100, random_state=SEED, n_jobs=-1)

    eval_results = eval_model(
        x_clean_train,
        y_clean_train,
        x_clean_test,
        y_test,
        n_splits=3,
        model_name="RF",
        model_params=rf_params,
    )

    return eval_results, x_clean_test, y_test

In [None]:
# Reload the raw data so this section runs independently of earlier cells
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')
df = df.loc[df['Power'] > 20].copy()  # keep only rows with Power above 20
df.index = pd.to_datetime(df.pop('Timestamps'))  # timestamps become the index
SEED = 42

In [None]:
# Baseline: split at row 30 000, predict 'Power', z-score threshold 3
eval_results, x_test, y_test = run_baseline(
    df=df,
    split_index=30_000,
    target='Power',
    z_threshold=3,
)
eval_results

In [None]:
# Rolling MAPE of the baseline predictions over the test period
plot_errors(
    x_test,
    y_test,
    eval_results['y_pred_test'],
    error='mape',
    error_threshold=8.5,
    rolling_window=288,
)

# Filter-by-difference cleaning

### Target cleaning

In [None]:
# Reload the raw data so this section runs independently of earlier cells
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')
df = df.loc[df['Power'] > 20].copy()
df.index = pd.to_datetime(df.pop('Timestamps'))

# Chronological split: first 30 000 rows for training, the rest for testing
df_train = df[:30_000]
df_test = df[30_000:]

In [None]:
# Z-score cleaning of the training split (threshold 3, gaps forward-filled)
df_clean_train, _ = remove_outliers_zscore(
    df_train,
    threshold=3,
    nan_treatment='ffill',
)

In [None]:
# Plot every column of the z-score-cleaned training data
cols_to_plot = list(df_clean_train.columns)
plot_time_series(df_clean_train, cols_to_plot, step=1, rolling_window=None)

Even though this data has already been cleaned with the z-score filter, we still see quite a lot of sudden spikes that can be considered outliers.

A very good way to check whether there are sudden outlying spikes is to compute the differences between neighboring points.

In [None]:
# Histograms of first-order differences; the y-axis is clipped so the rare
# large jumps in the tails stay visible
fig, (ax_power, ax_wind_dir) = plt.subplots(1, 2, figsize=(12, 4))
sns.histplot(df_clean_train['Power'].diff(1), ax=ax_power)
ax_power.set_ylim(0, 20)
sns.histplot(df_clean_train['WindDirAbs'].diff(1), ax=ax_wind_dir)
ax_wind_dir.set_ylim(0, 100)

Instead of specifying the cut-off differences from positive and negative sides, let's compute and plot the absolute differences.

In [None]:
cols = df_train.columns  # or pick a subset
n_features = len(cols)

# Two histograms per row
n_cols = 2
n_rows = math.ceil(n_features / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, cols):
    # absolute jump between neighbouring samples
    abs_jumps = df_clean_train[col].diff(1).abs().dropna()

    sns.histplot(abs_jumps, ax=ax)
    ax.set_title(col)
    ax.set_xlabel("abs(diff(1))")
    ax.set_ylabel("Count")

# drop the axes left over when the grid has more cells than features
for spare_ax in axes[n_features:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

In [None]:
def remove_diff_outliers(df, column, diff_threshold):
    """
    Blank out samples whose jump from the previous sample is too large.

    A sample is flagged when ``abs(x[t] - x[t-1]) > diff_threshold``. Flagged
    values are replaced by the last unflagged value (forward fill); a backward
    fill covers a possible gap at the very start. The input is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    column : str
        Name of the column to clean.
    diff_threshold : float
        Largest absolute first-order difference that is still accepted.

    Returns
    -------
    df_clean : pd.DataFrame
        Copy of ``df`` with the cleaned column.
    outlier_idx : pd.Index
        Index labels of the flagged samples.
    """
    cleaned = df.copy()

    # Size of the jump from the previous sample (NaN for the first row)
    jump_size = cleaned[column].diff(1).abs()

    # Index labels where the jump exceeds the threshold
    outlier_idx = cleaned.index[jump_size > diff_threshold]

    # Blank the flagged samples, then fill the gaps
    cleaned.loc[outlier_idx, column] = np.nan
    cleaned[column] = cleaned[column].ffill().bfill()

    return cleaned, outlier_idx

From the figure or quantiles, we can identify the cut-off differences.

In [None]:
# Largest accepted absolute jump between neighbouring samples, per column.
# Each key appears exactly once: a duplicate key in a dict literal silently
# keeps only the last value (previously NacelTemp 40 -> 30, GenPh1Temp 40 -> 35),
# so the effective values are the ones listed here.
abs_diff_thresholds = {
    'WindSpeed': 12,
    'WindDirAbs': 70,
    'Power': 200,
    'Pitch': 12,
    'GenRPM': 450,
    'WindDirRel': 9,
    'NacelTemp': 30,
    'GenPh1Temp': 35,
    'RotorRPM': 20,
    'EnvirTemp': 17,
    'GearOilTemp': 20,
    'GearBearTemp': 35,
    'GenBearTemp': 8,
}

In [None]:
df_filtered_train = df_train.copy()
removed_dict = {}  # column -> index labels flagged as outliers in the train set

for col, thr in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(df_filtered_train, col, thr)
    removed_dict[col] = removed_idx

Let's plot the raw and filtered features.

In [None]:
# No predictions here: the helper is re-used to overlay the raw Power signal
# and the diff-filtered one
plot_predictions(
    df_train['Power'],
    df_filtered_train['Power'],
)

In [None]:
# Plot every column of the diff-filtered training data
cols_to_plot = list(df_filtered_train.columns)
plot_time_series(df_filtered_train, cols_to_plot, step=1, rolling_window=None)

In [None]:
# Distribution of every numeric column after diff filtering, three per row
numeric_cols = df_filtered_train.select_dtypes(include=np.number).columns
n_cols = 3
n_rows = math.ceil(len(numeric_cols) / n_cols)

plt.figure(figsize=(15, 10))

for position, col in enumerate(numeric_cols, start=1):
    ax = plt.subplot(n_rows, n_cols, position)
    sns.histplot(df_filtered_train[col].dropna(), bins=20, kde=True, ax=ax)
    ax.set_title(col, fontsize=10)
    # axis labels are dropped to keep the grid compact
    ax.set_xlabel('')
    ax.set_ylabel('')

plt.tight_layout()

It seems that this removed the outliers a little bit better.

However, we see that some outliers still remain.

Let's see how the model performs.

In [None]:
# Clean the test set: features only, the target 'Power' stays untouched
filt_cols = [col for col in df_test.columns if col != 'Power']

x_filtered_test = df_test[filt_cols].copy()
removed_dict = {}  # column -> index labels flagged as outliers in the test set

for col, thr in abs_diff_thresholds.items():
    if col != 'Power':
        x_filtered_test, removed_idx = remove_diff_outliers(x_filtered_test, col, thr)
        removed_dict[col] = removed_idx

In [None]:
# Same Random Forest settings as the baseline
params = dict(n_estimators=100, random_state=SEED, n_jobs=-1)

eval_results = eval_model(
    x_train=df_filtered_train[filt_cols],
    y_train=df_filtered_train['Power'],
    x_test=x_filtered_test,
    y_test=df_test['Power'],
    n_splits=3,
    model_name='RF',
    model_params=params,
)
eval_results

In [None]:
# 'cv_mae': 66.37,
#  'cv_rmse': 96.43,
#  'cv_mape': 7.76,
#  'test_mae': 72.35,
#  'test_rmse': 121.48,
#  'test_mape': 9.34,

We see that this cleaning works a bit better than z-score cleaning.

Let's apply z-score cleaning ON TOP of filtering by difference.

In [None]:
# Z-score cleaning (threshold 4) of the already diff-filtered training target
y_train_clean, _ = remove_outliers_zscore(
    pd.DataFrame(df_filtered_train['Power']),
    threshold=4,
    nan_treatment='ffill',
)

In [None]:
# Z-score cleaning on top of the diff filter: the statistics are learned on
# train and re-used for test. Both splits use the SAME threshold so the test
# features are not cut more aggressively than the data the model was trained on.
z_threshold = 4
x_train_clean, z_score_stats = remove_outliers_zscore(
    df_filtered_train[filt_cols], nan_treatment='ffill', threshold=z_threshold
)
x_test_clean, _ = remove_outliers_zscore(
    x_filtered_test, nan_treatment='ffill', threshold=z_threshold, stats=z_score_stats
)

In [None]:
# Same Random Forest settings as the baseline
params = dict(n_estimators=100, random_state=SEED, n_jobs=-1)

eval_results = eval_model(
    x_train=x_train_clean,
    y_train=y_train_clean,
    x_test=x_test_clean,
    y_test=df_test['Power'],
    n_splits=3,
    model_name='RF',
    model_params=params,
)
eval_results

In [None]:
# 'cv_mae': 63.89,
#  'cv_rmse': 91.89,
#  'cv_mape': 7.58,
#  'test_mae': 72.15,
#  'test_rmse': 121.68,
#  'test_mape': 9.31,

We see that we don't get a better performance.

In fact, at z_threshold = 3, it seems we are cutting the values that contain useful information because the score gets worse.

In production, the "Filter by difference" method is easy to maintain because we do NOT need to keep track of z_score_stats.

Good improvement and less production headache.

# Denoising: Low-Pass FFT Filter

In the EDA stage, we have observed that the data has a lot of noise.

Potentially, cutting the noise can give us a good model improvement.

Let's check that.

In [None]:
def fft_lowpass_filter(x, dt, cutoff):
    """
    Apply a low-pass FFT filter.

    All frequency components above ``cutoff`` are set to zero. Every output
    sample depends on the whole input series (past and future samples), so the
    filter is not causal. The input is never modified.

    Parameters
    ----------
    x : array-like
        Raw time-series signal. NaNs are linearly interpolated before filtering.
    dt : float
        Sampling interval in chosen time units (e.g. 10/60 for cycles/hour).
    cutoff : float
        Cutoff frequency in same units as FFT output (e.g. cycles/hour).

    Returns
    -------
    x_filtered : np.array
        Filtered time-series (mean added back).
    freqs : np.array
        Frequency axis.
    fft_filtered : np.array
        Filtered FFT values.
    """

    # Always work on a private copy: np.asarray could hand back the caller's
    # own buffer, and the NaN interpolation below writes into the array.
    x_clean = np.array(x, dtype=float)

    # Fill NaNs by linear interpolation over the sample positions
    nans = np.isnan(x_clean)
    if nans.any():
        x_clean[nans] = np.interp(np.flatnonzero(nans),
                                  np.flatnonzero(~nans),
                                  x_clean[~nans])

    # Store original mean
    mean_val = np.mean(x_clean)

    # Remove the mean before the FFT; it is added back after filtering
    x_detrended = x_clean - mean_val

    N = len(x_detrended)

    # FFT of the real-valued signal
    fft_vals = np.fft.rfft(x_detrended)
    freqs = np.fft.rfftfreq(N, d=dt)

    # Low-pass mask: keep only the bins at or below the cutoff
    mask = freqs <= cutoff
    fft_filtered = fft_vals * mask

    # Inverse FFT + ADD MEAN BACK
    x_filtered = np.fft.irfft(fft_filtered, n=N) + mean_val

    return x_filtered, freqs, fft_filtered

In [None]:
# Read again for reproducibility
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')
df = df[df['Power'] > 20].copy()
df.index = pd.to_datetime(df['Timestamps'])
df.drop(columns=['Timestamps'], inplace=True)
SEED = 42

# Split the data here as well, so the cells below do not depend on
# df_train / df_test left over from an earlier section
df_train = df[:30_000]
df_test = df[30_000:]

In [None]:
# Filter by absolute difference
# Train set: every column listed in the thresholds, target included
df_filtered_train = df_train.copy()
for column, threshold in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(
        df_filtered_train, column, threshold
    )

# Test set: features only, the target 'Power' is never cleaned
filt_cols = [name for name in df_test.columns if name != 'Power']
x_filtered_test = df_test[filt_cols].copy()
for column, threshold in abs_diff_thresholds.items():
    if column == 'Power':
        continue
    x_filtered_test, removed_idx = remove_diff_outliers(
        x_filtered_test, column, threshold
    )

In [None]:
# Low-pass FFT denoising applied to each whole series
cutoff = 1
dt = 10/60  # 10 minutes in hours (cycles/hour)

# Only these columns are filtered
feats = ['GenRPM', 'GenPh1Temp', 'WindSpeed', 'WindDirAbs', 'WindDirRel', 'Pitch', 'RotorRPM']

# Train first, then test; each column is filtered independently
for frame in (df_filtered_train, x_filtered_test):
    for col in feats:
        frame.loc[:, col] = fft_lowpass_filter(frame[col], dt, cutoff)[0]

In [None]:
# First 200 samples of RotorRPM before and after filtering
plt.figure(figsize=(12, 5))
for label, frame in (('Raw signal', df_train), ('Filtered signal', df_filtered_train)):
    plt.plot(frame['RotorRPM'][:200], label=label)
plt.legend(fontsize=16)

This looks promising.

At first, it seems that we smooth too much, but the cutoff is a hyperparameter we can tune based on the model performance.

Let's see what the performance looks like.

In [None]:
# Same Random Forest settings as the baseline
params = dict(n_estimators=100, random_state=SEED, n_jobs=-1)

eval_results = eval_model(
    x_train=df_filtered_train[filt_cols],
    y_train=df_filtered_train['Power'],
    x_test=x_filtered_test,
    y_test=df_test['Power'],
    n_splits=3,
    model_name='RF',
    model_params=params,
)
eval_results

Now, we have Filter-by-difference + FFT filter.

Let's compare with the results we got when we cleaned with only the z-score filter.

In [None]:
# 'cv_mae': 66.37,
#  'cv_rmse': 96.43,
#  'cv_mape': 7.76,
#  'test_mae': 72.35,
#  'test_rmse': 121.48,
#  'test_mape': 9.34,

We clearly see a huge improvement!

And we have not yet engineered any features.

From here, we can conclude that noise reduction is a good idea.

But there is a catch.

# Low-pass-FFT-data-leakage

**Now, we also got a problem....**

We just introduced data leakage.

This is because we applied the filter to the ENTIRE test series at once, so every filtered point also depends on samples that come after it, which is not correct.

In production, we will not have information about the future data points.

So, we need to modify our filter and see how it would work in a real production scenario.

In [None]:
def apply_fft_lowpass(df: pd.DataFrame,
                      cols,
                      dt: float,
                      cutoff: float,
                      window_size: int = 500,
                      online: bool = False) -> pd.DataFrame:
    """
    Run ``fft_lowpass_filter`` over the given columns and return a filtered copy.

    Two modes:
    - ``online=False``: each column is filtered as one whole series.
    - ``online=True``: the value at position ``i`` is the last sample of the
      filtered window covering the ``window_size`` points ending at ``i``, so
      only current and past samples are used. The first ``window_size - 1``
      positions have no full window and are left as NaN.

    ``window_size`` is ignored when ``online`` is False. ``df`` is not modified.
    """
    filtered = df.copy()

    for name in cols:
        signal = df[name].to_numpy(dtype=float)

        if online:
            # Past-only sliding window; positions without a full window stay NaN
            smoothed = np.full_like(signal, np.nan, dtype=float)
            for end in range(window_size, len(signal) + 1):
                chunk = signal[end - window_size:end]
                chunk_filtered = fft_lowpass_filter(chunk, dt, cutoff)[0]
                smoothed[end - 1] = chunk_filtered[-1]
        else:
            # Whole-series filter
            smoothed = fft_lowpass_filter(signal, dt, cutoff)[0]

        filtered[name] = smoothed

    return filtered

In [None]:
# Reload the raw data so this section runs independently of earlier cells
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')
df = df.loc[df['Power'] > 20].copy()
df.index = pd.to_datetime(df.pop('Timestamps'))

# Chronological split: first 30 000 rows for training, the rest for testing
df_train = df[:30_000]
df_test = df[30_000:]

In [None]:
# Filter by absolute difference
# Train set: every column listed in the thresholds, target included
df_filtered_train = df_train.copy()
for column, threshold in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(
        df_filtered_train, column, threshold
    )

# Test set: features only, the target 'Power' is never cleaned
filt_cols = [name for name in df_test.columns if name != 'Power']
x_filtered_test = df_test[filt_cols].copy()
for column, threshold in abs_diff_thresholds.items():
    if column == 'Power':
        continue
    x_filtered_test, removed_idx = remove_diff_outliers(
        x_filtered_test, column, threshold
    )

**First, let's apply the filter on the training set as a whole and sliding filter on the test set.**

In [None]:
cutoff = 1
dt = 10/60  # 10 minutes in hours (cycles/hour)
window_size = 500

feats = ['GenRPM', 'GenPh1Temp', 'WindSpeed',
         'WindDirAbs', 'WindDirRel', 'Pitch', 'RotorRPM']

# Settings shared by both calls
shared_fft_args = dict(cols=feats, dt=dt, cutoff=cutoff)

# ---- TRAIN: whole-series filtering ----
df_filtered_train = apply_fft_lowpass(
    df_filtered_train, online=False, **shared_fft_args
)

# ---- TEST: past-only sliding window (production-like) ----
x_filtered_test = apply_fft_lowpass(
    x_filtered_test, window_size=window_size, online=True, **shared_fft_args
)

In [None]:
# Same Random Forest settings as the baseline
params = dict(n_estimators=100, random_state=SEED, n_jobs=-1)

eval_results = eval_model(
    x_train=df_filtered_train[filt_cols],
    y_train=df_filtered_train['Power'],
    x_test=x_filtered_test,
    y_test=df_test['Power'],
    n_splits=3,
    model_name='RF',
    model_params=params,
)
eval_results

In [None]:
## This is the low-pass filter with data leakage
# 'cv_mae': 50.52,
#  'cv_rmse': 72.21,
#  'cv_mape': 6.08,
#  'test_mae': 63.54,
#  'test_rmse': 114.19,
#  'test_mape': 8.18,

We see that on the test set we get a MUCH worse performance.

Let's look at the error over time.

In [None]:
# Plot against the data eval_results was actually computed on, rather than
# x_test / y_test left over from the baseline section
plot_errors(
    x_filtered_test,
    df_test['Power'],
    eval_results['y_pred_test'],
    error='mape',
    error_threshold=8.5,
    rolling_window=288,
)

**Let's apply the same way on the train and test set**

In [None]:
# Reload the raw data so this section runs independently of earlier cells
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')
df = df.loc[df['Power'] > 20].copy()
df.index = pd.to_datetime(df.pop('Timestamps'))

# Chronological split: first 30 000 rows for training, the rest for testing
df_train = df[:30_000]
df_test = df[30_000:]

In [None]:
# Filter by absolute difference
# Train set: every column listed in the thresholds, target included
df_filtered_train = df_train.copy()
for column, threshold in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(
        df_filtered_train, column, threshold
    )

# Test set: features only, the target 'Power' is never cleaned
filt_cols = [name for name in df_test.columns if name != 'Power']
x_filtered_test = df_test[filt_cols].copy()
for column, threshold in abs_diff_thresholds.items():
    if column == 'Power':
        continue
    x_filtered_test, removed_idx = remove_diff_outliers(
        x_filtered_test, column, threshold
    )

In [None]:
cutoff = 1
dt = 10/60      # 10-minute sampling in hours
window_size = 200

feats = ['GenRPM', 'GenPh1Temp', 'WindSpeed',
         'WindDirAbs', 'WindDirRel', 'Pitch', 'RotorRPM']

# Identical past-only sliding-window settings for both splits
online_fft_args = dict(
    cols=feats,
    dt=dt,
    cutoff=cutoff,
    window_size=window_size,
    online=True,
)

# ---- TRAIN: sliding window (same as production) ----
df_filtered_train = apply_fft_lowpass(df_filtered_train, **online_fft_args)

# ---- TEST: sliding window (production-like) ----
x_filtered_test = apply_fft_lowpass(x_filtered_test, **online_fft_args)

In [None]:
# Same Random Forest settings as the baseline
params = dict(n_estimators=100, random_state=SEED, n_jobs=-1)

eval_results = eval_model(
    x_train=df_filtered_train[filt_cols],
    y_train=df_filtered_train['Power'],
    x_test=x_filtered_test,
    y_test=df_test['Power'],
    n_splits=3,
    model_name='RF',
    model_params=params,
)
eval_results

In [None]:
# Plot against the data eval_results was actually computed on, rather than
# x_test / y_test left over from the baseline section
plot_errors(
    x_filtered_test,
    df_test['Power'],
    eval_results['y_pred_test'],
    error='mape',
    error_threshold=8.5,
    rolling_window=288,
)

We see that the performance is really poor, so we need to do something else.

There are cut-off frequency filter alternatives that do not require information from the future.

One of them is the Butterworth filter. Let's try it.

# Denoising: Butterworth filter

In [None]:
def design_butter_lowpass(dt: float, cutoff: float, order: int = 4):
    """
    Compute the coefficients of a digital Butterworth low-pass filter.

    The cutoff is expressed as a fraction of the Nyquist frequency
    ``0.5 / dt`` before it is handed to ``scipy.signal.butter``.

    Parameters
    ----------
    dt : float
        Sampling interval (e.g. 10/60 hours).
    cutoff : float
        Cutoff frequency in the same units as 1/dt (e.g. cycles/hour).
    order : int
        Filter order.

    Returns
    -------
    b, a : np.ndarray
        Numerator and denominator coefficients of the filter.
    """
    sampling_rate = 1.0 / dt
    nyquist = 0.5 * sampling_rate
    normalized_cutoff = cutoff / nyquist  # fraction of Nyquist
    return butter(order, normalized_cutoff, btype="low", analog=False)

In [None]:
def apply_butter_lowpass_causal(
    df: pd.DataFrame,
    cols,
    b: np.ndarray,
    a: np.ndarray,
) -> pd.DataFrame:
    """
    Filter the selected columns forward in time with ``scipy.signal.lfilter``.

    ``lfilter`` runs in one direction only, so each output sample is computed
    from the current and earlier inputs. The filter state of every column is
    initialised with ``lfilter_zi`` scaled by that column's first sample, which
    avoids a start-up transient from a zero initial state.

    Returns a copy of ``df`` with the filtered columns; ``df`` is not modified.
    """
    filtered = df.copy()

    for name in cols:
        samples = df[name].to_numpy(dtype=float)

        # Start from the steady state that corresponds to the first sample
        initial_state = lfilter_zi(b, a) * samples[0]

        # One forward pass over the whole series; the final state is not needed
        smoothed, _ = lfilter(b, a, samples, zi=initial_state)
        filtered[name] = smoothed

    return filtered

In [None]:
# Reload the raw data so this section runs independently of earlier cells
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')
df = df.loc[df['Power'] > 20].copy()
df.index = pd.to_datetime(df.pop('Timestamps'))

# Chronological split: first 30 000 rows for training, the rest for testing
df_train = df[:30_000]
df_test = df[30_000:]

In [None]:
# Filter by absolute difference
# Train set: every column listed in the thresholds, target included
df_filtered_train = df_train.copy()
for column, threshold in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(
        df_filtered_train, column, threshold
    )

# Test set: features only, the target 'Power' is never cleaned
filt_cols = [name for name in df_test.columns if name != 'Power']
x_filtered_test = df_test[filt_cols].copy()
for column, threshold in abs_diff_thresholds.items():
    if column == 'Power':
        continue
    x_filtered_test, removed_idx = remove_diff_outliers(
        x_filtered_test, column, threshold
    )

In [None]:
cutoff = 1
dt = 10/60
order = 1

feats = ['GenRPM', 'GenPh1Temp', 'WindSpeed',
         'WindDirAbs', 'WindDirRel', 'Pitch', 'RotorRPM']

# 1) The coefficients are computed once and shared by train and test
b, a = design_butter_lowpass(dt, cutoff, order)

# 2) TRAIN: production-like forward filtering
df_filtered_train = apply_butter_lowpass_causal(df_filtered_train, feats, b, a)

# 3) TEST: same coefficients, same forward filtering
x_filtered_test = apply_butter_lowpass_causal(x_filtered_test, feats, b, a)

In [None]:
# RotorRPM on 2007-08-20 .. 2007-08-21 before and after filtering
plt.figure(figsize=(12, 5))
for label, frame in (('Raw signal', df_train), ('Filtered signal', df_filtered_train)):
    plt.plot(frame['RotorRPM'].loc["2007-08-20":"2007-08-21"], label=label)
plt.legend(fontsize=16)

In [None]:
# Same Random Forest settings as the baseline
params = dict(n_estimators=100, random_state=SEED, n_jobs=-1)

eval_results = eval_model(
    x_train=df_filtered_train[filt_cols],
    y_train=df_filtered_train['Power'],
    x_test=x_filtered_test,
    y_test=df_test['Power'],
    n_splits=3,
    model_name='RF',
    model_params=params,
)
eval_results

In [None]:
# Plot against the data eval_results was actually computed on, rather than
# x_test / y_test left over from the baseline section
plot_errors(
    x_filtered_test,
    df_test['Power'],
    eval_results['y_pred_test'],
    error='mape',
    error_threshold=8.5,
    rolling_window=288,
)

We see that the predictions are way better now.

With some hyperparameter tuning, this filter can be applied in production.

However, we went the hard way. The easiest way to reduce noise would be to apply mean or median filter.

# Denoising: Mean filter

If these filters give similar performance, it's better to use them instead because they are MUCH easier to use.

We can POTENTIALLY apply other filters like Gaussian filter, however, in production we would not be able to apply it in real-time.

This is because it requires information from the future relative to the current prediction point (similar to the low-pass FFT filter).

So, let's apply the mean and median sliding filters.

In [None]:
# Reload the raw data so this section runs independently of earlier cells
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')
df = df.loc[df['Power'] > 20].copy()
df.index = pd.to_datetime(df.pop('Timestamps'))

# Chronological split: first 30 000 rows for training, the rest for testing
df_train = df[:30_000]
df_test = df[30_000:]

In [None]:
# Filter by absolute difference on the train set (target included)
df_filtered_train = df_train.copy()

for column, threshold in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(
        df_filtered_train, column, threshold
    )

In [None]:
# Filter by absolute difference on the test features; 'Power' is never cleaned
filt_cols = [name for name in df_test.columns if name != 'Power']
x_filtered_test = df_test[filt_cols].copy()

for column, threshold in abs_diff_thresholds.items():
    if column == 'Power':
        continue
    x_filtered_test, removed_idx = remove_diff_outliers(
        x_filtered_test, column, threshold
    )

In [None]:
def smooth_signal(df, column, window, method="median"):
    """
    Apply a trailing rolling-window filter to a single dataframe column.

    Parameters
    ----------
    df : pd.DataFrame
        Source dataframe. It is copied and never modified in place.
    column : str
        Name of the column to filter.
    window : int
        Number of samples in the rolling window.
    method : str
        "mean"   -> rolling average
        "median" -> rolling median

    Returns
    -------
    pd.DataFrame
        Copy of `df` in which `column` holds the smoothed values.

    Raises
    ------
    ValueError
        If `method` is neither "mean" nor "median".
    """
    result = df.copy()

    if method not in ("mean", "median"):
        raise ValueError("method must be 'mean' or 'median'")

    # Non-centered window (center=False): each output uses the current and
    # preceding samples only. min_periods=1 lets the first rows be computed
    # from however many samples are available.
    roller = result[column].rolling(window=window, min_periods=1, center=False)
    result[column] = roller.mean() if method == "mean" else roller.median()

    return result

In [None]:
# Smooth every feature column; the target 'Power' is left as is
smooth_cols = [col for col in df_filtered_train.columns if col != 'Power']

# Rolling mean with a 3-sample window, applied one column at a time
for col in smooth_cols:
    df_filtered_train = smooth_signal(df_filtered_train, col, window=3, method="mean") # test with 2, 4, 5

In [None]:
# Apply the identical smoothing (same columns, window and method) to the test features
for col in smooth_cols:
    x_filtered_test = smooth_signal(x_filtered_test, col, window=3, method="mean") # test with 2, 4, 5

In [None]:
# Visual check: raw vs. filtered RotorRPM over the 2007-08-20 .. 2007-08-21 slice
plt.figure(figsize=(12, 5))
plt.plot(df_train['RotorRPM']["2007-08-20":"2007-08-21"], label='Raw signal')
plt.plot(df_filtered_train['RotorRPM']["2007-08-20":"2007-08-21"], label='Filtered signal')
plt.legend(fontsize=16)

In [None]:
# Random Forest hyperparameters; SEED is defined earlier in the notebook
params = {
    'n_estimators': 100, 
    'random_state': SEED,
    'n_jobs':-1
}

# Evaluate on the mean-filtered data. The train target comes from the filtered
# frame, the test target from the unfiltered df_test.
# NOTE(review): the positional 3 / 'RF' / params presumably map to
# n_splits / model_name / model_params — confirm against the active eval_model.
eval_results  = eval_model(
    df_filtered_train[filt_cols],
    df_filtered_train['Power'], 
    x_filtered_test, 
    df_test['Power'], 
    3, 
    'RF', 
    params
)
eval_results

In [None]:
# 'cv_mae': 55.36,
#  'cv_rmse': 78.61,
#  'cv_mape': 6.66,
#  'test_mae': 67.9,
#  'test_rmse': 118.16,
#  'test_mape': 8.78,

In [None]:
# Plot against the same test features/target that produced eval_results above,
# rather than x_test / y_test left over from an earlier section.
plot_errors(x_filtered_test, df_test['Power'], eval_results['y_pred_test'], error='mape', error_threshold=8.5, rolling_window=288)

We see that the average filter works worse than the Butterworth filter.

However, it still works better than the z-score filter alone, which means that, in general, noise reduction is required in this case.

**If selecting between the Butterworth and Mean filters, from the production perspective it's better to choose the Mean filter because it's way easier to use AND tune if required.** 

**Yes, we sacrifice some accuracy but gain confidence**

# Domain Feature Engineering

Now, let's try to engineer some features based on basic physical relationships.

Note that before feature engineering, we clean the data not to propagate the noise and outliers into the engineered features.

In [None]:
# Reload the raw dataset so this section starts from a clean state
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')

# Keep only rows where Power exceeds 20
df = df.loc[df['Power'] > 20].copy()

# Index by the parsed timestamps, then drop the now-redundant column
df.index = pd.to_datetime(df['Timestamps'])
df = df.drop(columns=['Timestamps'])

# Positional split: first 30,000 rows for training, the rest for testing
df_train, df_test = df.iloc[:30_000], df.iloc[30_000:]

In [None]:
# Work on a copy so df_train stays untouched
df_filtered_train = df_train.copy()
# NOTE(review): removed_dict is initialised but never filled in this cell
removed_dict = {}

# Difference-based outlier filter, one column at a time
for col, thr in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(df_filtered_train, col, thr)

In [None]:
# Feature columns = every test column except the target
filt_cols = [col for col in df_test.columns if col != 'Power']

x_filtered_test = df_test[filt_cols].copy()
# NOTE(review): removed_dict is initialised but never filled in this cell
removed_dict = {}

# Same filter on the test features; 'Power' is skipped because the target
# column is not part of x_filtered_test
for col, thr in abs_diff_thresholds.items():
    if col != 'Power':
        x_filtered_test, removed_idx = remove_diff_outliers(x_filtered_test, col, thr)

In [None]:
# Smooth every feature column; the target 'Power' is left as is
smooth_cols = [col for col in df_filtered_train.columns if col != 'Power']

# Rolling mean (3-sample window) on the train features ...
for col in smooth_cols:
    df_filtered_train = smooth_signal(df_filtered_train, col, window=3, method="mean") # test with 2, 4, 5

# ... and the identical smoothing on the test features
for col in smooth_cols:
    x_filtered_test = smooth_signal(x_filtered_test, col, window=3, method="mean") # test with 2, 4, 5

In [None]:
def get_domain_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create domain-specific engineered features for wind turbine data.

    The function builds combinations of wind speed, rotor RPM, pitch and
    generator temperatures and appends them as new columns. The input frame
    is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing at least the following columns:
        - WindSpeed, RotorRPM, Pitch, GenPh1Temp, GenBearTemp

    Returns
    -------
    pd.DataFrame
        Copy of the input dataframe with additional engineered columns:
        - WindSpeed3   : WindSpeed ** 3
        - TSR_proxy    : RotorRPM / WindSpeed (NaN where WindSpeed == 0)
        - RPM_WS2      : RotorRPM * WindSpeed ** 2
        - RPM2_WS      : RotorRPM ** 2 * WindSpeed
        - PitchFactor  : exp(-0.1 * Pitch)
        - WS3_pitch    : WindSpeed3 * PitchFactor
        - ThermalLoad  : GenPh1Temp + GenBearTemp
        - ThermalProxy : RPM_WS2 * ThermalLoad
    """
    df_feat = df.copy()

    # Aerodynamic proxies
    df_feat["WindSpeed3"] = df_feat["WindSpeed"] ** 3
    # Guard the denominator, not the ratio: a zero wind speed becomes NaN
    # instead of +/-inf, and a legitimate zero ratio (RotorRPM == 0) is kept
    # as 0 rather than being turned into NaN.
    df_feat["TSR_proxy"] = df_feat["RotorRPM"] / df_feat["WindSpeed"].replace(0, np.nan)
    df_feat["RPM_WS2"] = df_feat["RotorRPM"] * df_feat["WindSpeed"] ** 2
    df_feat["RPM2_WS"] = df_feat["RotorRPM"] ** 2 * df_feat["WindSpeed"]

    # Pitch effect (proxy for aerodynamic efficiency)
    df_feat["PitchFactor"] = np.exp(-0.1 * df_feat["Pitch"])
    df_feat["WS3_pitch"] = df_feat["WindSpeed3"] * df_feat["PitchFactor"]

    # Thermal proxies
    df_feat["ThermalLoad"] = df_feat["GenPh1Temp"] + df_feat["GenBearTemp"]
    df_feat["ThermalProxy"] = df_feat["RPM_WS2"] * df_feat["ThermalLoad"]

    return df_feat

### Engineered Feature Explanations

### WindSpeed3
`WindSpeed^3`  
Power generation is roughly proportional to the cube of wind speed.  
**Why useful:** captures nonlinear aerodynamic effects related to power output.

### TSR_proxy
`RotorRPM / WindSpeed`  
Approximate Tip-Speed Ratio (TSR), a key aerodynamic efficiency measure.  
**Why useful:** indicates whether the turbine is operating in an optimal aerodynamic regime.

### RPM_WS2
`RotorRPM * WindSpeed^2`  
Combined aerodynamic/rotational loading term.  
**Why useful:** highlights high-load operational conditions.

### RPM2_WS
`RotorRPM^2 * WindSpeed`  
Places more emphasis on rotor behavior relative to wind.  
**Why useful:** detects abnormal RPM behavior under specific wind speeds.

### PitchFactor
`exp(-0.1 * Pitch)`  
Approximates how pitching the blades reduces aerodynamic efficiency.  
**Why useful:** captures control effects that reduce effective power capture.

### WS3_pitch
`WindSpeed3 * PitchFactor`  
Couples aerodynamic potential with pitch-induced efficiency loss.  
**Why useful:** represents effective aerodynamic power rather than theoretical.

### ThermalLoad
`GenPh1Temp + GenBearTemp`  
Simple indicator of thermal stress in generator and bearing.  
**Why useful:** overheating is a common early sign of mechanical degradation.

### ThermalProxy
`RPM_WS2 * ThermalLoad`  
Thermal–mechanical load interaction term.  
**Why useful:** sensitive to slow degradation such as increasing friction or bearing wear.

In [None]:
# Add the engineered columns to both splits with the same function
df_filtered_train = get_domain_features(df_filtered_train)
x_filtered_test = get_domain_features(x_filtered_test)
# Iterating a DataFrame yields its column names; everything but the target is a feature
filt_cols = [col for col in df_filtered_train if col != 'Power']

In [None]:
# Correlation of every column with the target; the target's own entry is dropped
corr = df_filtered_train.corr()['Power'].drop('Power')

# Order by absolute correlation (strongest first) while keeping the signed values
correlations_sorted = corr.reindex(corr.abs().sort_values(ascending=False).index)
correlations_sorted

We see that most of the engineered features correlate well with the target, better than many of the raw features.

This is promising to get better model results.

In [None]:
# Random Forest hyperparameters; SEED is defined earlier in the notebook
params = {
    'n_estimators': 100, 
    'random_state': SEED,
    'n_jobs':-1
}

# Evaluate with raw + engineered features. The train target comes from the
# filtered frame, the test target from the unfiltered df_test.
# NOTE(review): the positional 3 / 'RF' / params presumably map to
# n_splits / model_name / model_params — confirm against the active eval_model.
eval_results  = eval_model(
    df_filtered_train[filt_cols],
    df_filtered_train['Power'], 
    x_filtered_test, 
    df_test['Power'], 
    3, 
    'RF', 
    params
)
eval_results

In [None]:
# 'cv_mae': 57.05,
#  'cv_rmse': 80.15,
#  'cv_mape': 6.89,
#  'test_mae': 69.15,
#  'test_rmse': 119.36,
#  'test_mape': 8.94,

We see that it has given us almost no improvement, even though the engineered features look good.

Let's check if the model used the features at all.

In [None]:
# Per-feature importances of the fitted model and the matching column names
importances = eval_results['model'].feature_importances_
cols = df_filtered_train[filt_cols].columns

# Tabulate, then rank from most to least important with a fresh 0..n-1 index
feat_imp = pd.DataFrame({"feature": cols, "importance": importances})
feat_imp = feat_imp.sort_values("importance", ascending=False)
feat_imp = feat_imp.reset_index(drop=True)

feat_imp

We see that these features, despite being well correlated with the target, are not used much by the model.

Overall, it can happen because:
- Random Forest already models nonlinear relations, so engineered features may be redundant.
- High correlation doesn’t mean the feature adds new information beyond existing inputs.
- Strong raw predictors (e.g., GenRPM) dominate splits, reducing importance of derived features.

Let's see if statistical features are able to improve the model. For the baseline model, they did.

# Statistical Features

In [None]:
# Reload the raw dataset so this section starts from a clean state
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')

# Keep only rows where Power exceeds 20
df = df.loc[df['Power'] > 20].copy()

# Index by the parsed timestamps, then drop the now-redundant column
df.index = pd.to_datetime(df['Timestamps'])
df = df.drop(columns=['Timestamps'])

# Positional split: first 30,000 rows for training, the rest for testing
df_train, df_test = df.iloc[:30_000], df.iloc[30_000:]

In [None]:
# Train: difference-based outlier filter on a copy of the training frame
df_filtered_train = df_train.copy()
# NOTE(review): removed_dict is initialised but never filled in this cell
removed_dict = {}

for col, thr in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(df_filtered_train, col, thr)

# Feature columns = every test column except the target
filt_cols = [col for col in df_test.columns if col != 'Power']

# Test: same filter on the features only ('Power' is not part of x_filtered_test)
x_filtered_test = df_test[filt_cols].copy()
removed_dict = {}

for col, thr in abs_diff_thresholds.items():
    if col != 'Power':
        x_filtered_test, removed_idx = remove_diff_outliers(x_filtered_test, col, thr)

In [None]:
# Smooth every feature column; the target 'Power' is left as is
smooth_cols = [col for col in df_filtered_train.columns if col != 'Power']

# Rolling mean (3-sample window) on the train features ...
for col in smooth_cols:
    df_filtered_train = smooth_signal(df_filtered_train, col, window=3, method="mean") # test with 2, 4, 5

# ... and the identical smoothing on the test features
for col in smooth_cols:
    x_filtered_test = smooth_signal(x_filtered_test, col, window=3, method="mean") # test with 2, 4, 5

In [None]:
# Rolling statistics on the train set. drop_na=False asks the helper not to
# drop NaN rows, so the frame should keep its length
# (NOTE(review): confirm in add_rolling_features).
df_filtered_train = add_rolling_features(
    df_filtered_train,
    window_sizes=[3, 6], # alternative tried: window_sizes=[5, 10, 15]
    columns=['GenRPM', 'GenPh1Temp', 'WindSpeed', 'WindDirAbs', 'WindDirRel', 'Pitch', 'RotorRPM'], 
    stats=['std','max', 'skew'], # 'min' left out; 'mean' is already covered by the mean filter
    drop_na=False
)

In [None]:
# Same rolling statistics on the test features; the arguments must mirror the
# train call so both frames end up with the same feature columns
x_filtered_test = add_rolling_features(
    x_filtered_test,
    window_sizes=[3, 6], # alternative tried: window_sizes=[5, 10, 15]
    columns=['GenRPM', 'GenPh1Temp', 'WindSpeed', 'WindDirAbs', 'WindDirRel', 'Pitch', 'RotorRPM'], 
    stats=['std','max', 'skew'], # 'min' left out; 'mean' is already covered by the mean filter
    drop_na=False
)

In [None]:
# Refresh the feature list so it includes the newly added rolling columns
filt_cols = [col for col in df_filtered_train if col != 'Power']

In [None]:
# Random Forest hyperparameters; SEED is defined earlier in the notebook
params = {
    'n_estimators': 100, 
    'random_state': SEED,
    'n_jobs':-1
}

# Evaluate with the rolling statistical features. The train target comes from
# the filtered frame, the test target from the unfiltered df_test.
# NOTE(review): the positional 3 / 'RF' / params presumably map to
# n_splits / model_name / model_params — confirm against the active eval_model.
eval_results  = eval_model(
    df_filtered_train[filt_cols],
    df_filtered_train['Power'], 
    x_filtered_test, 
    df_test['Power'], 
    3, 
    'RF', 
    params
)
eval_results

In [None]:
## These are the metrics with Engineered features
# 'cv_mae': 56.74,
#  'cv_rmse': 79.68,
#  'cv_mape': 6.86,
#  'test_mae': 69.2,
#  'test_rmse': 119.4,
#  'test_mape': 8.95,

In [None]:
# Plot against the same test features/target that produced eval_results above,
# rather than x_test / y_test left over from an earlier section.
plot_errors(x_filtered_test, df_test['Power'], eval_results['y_pred_test'], error='mape', error_threshold=8.5, rolling_window=288)

In [None]:
# Per-feature importances of the fitted model and the matching column names
importances = eval_results['model'].feature_importances_
cols = df_filtered_train[filt_cols].columns

# Tabulate, then rank from most to least important with a fresh 0..n-1 index
feat_imp = pd.DataFrame({"feature": cols, "importance": importances})
feat_imp = feat_imp.sort_values("importance", ascending=False)
feat_imp = feat_imp.reset_index(drop=True)

feat_imp

We see that we have not gained much improvement.

This may be because, by filtering the noise with averaging, we have already extracted the main information.

The rest of the information could be better extracted by cleaning the noise more effectively (as we saw with the low-pass FFT filter, which leaked data).

There is definitely room for improvement here.

From the feature importance we see that some of the max features are near the top of the ranking, but they are far less important than the raw GenRPM feature.

**To keep our pipeline general, we can use some of the statistical features. We will create the transformation functions so that later, in production and after deeper research, more features can be added if required.**

# Lag features

In [None]:
# Reload the raw dataset so this section starts from a clean state
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')

# Keep only rows where Power exceeds 20
df = df.loc[df['Power'] > 20].copy()

# Index by the parsed timestamps, then drop the now-redundant column
df.index = pd.to_datetime(df['Timestamps'])
df = df.drop(columns=['Timestamps'])

# Positional split: first 30,000 rows for training, the rest for testing
df_train, df_test = df.iloc[:30_000], df.iloc[30_000:]

In [None]:
# Filtering by difference
# Train: apply the filter to a copy of the training frame
df_filtered_train = df_train.copy()
# NOTE(review): removed_dict is initialised but never filled in this cell
removed_dict = {}

for col, thr in abs_diff_thresholds.items():
    df_filtered_train, removed_idx = remove_diff_outliers(df_filtered_train, col, thr)

# Feature columns = every test column except the target
filt_cols = [col for col in df_test.columns if col != 'Power']

# Test: same filter on the features only ('Power' is not part of x_filtered_test)
x_filtered_test = df_test[filt_cols].copy()
removed_dict = {}

for col, thr in abs_diff_thresholds.items():
    if col != 'Power':
        x_filtered_test, removed_idx = remove_diff_outliers(x_filtered_test, col, thr)

In [None]:
# Smooth every feature column; the target 'Power' is left as is
smooth_cols = [col for col in df_filtered_train.columns if col != 'Power']

# Rolling mean (3-sample window) on the train features ...
for col in smooth_cols:
    df_filtered_train = smooth_signal(df_filtered_train, col, window=3, method="mean") # test with 2, 4, 5

# ... and the identical smoothing on the test features
for col in smooth_cols:
    x_filtered_test = smooth_signal(x_filtered_test, col, window=3, method="mean") # test with 2, 4, 5

In [None]:
# Keep only the rolling-max features for now (train)
df_filtered_train = add_rolling_features(
    df_filtered_train,
    window_sizes=[3, 6],
    columns=['GenRPM', 'GenPh1Temp', 'WindSpeed'], 
    stats=['max'],
    drop_na=False
)

# Same rolling-max features on the test set, with identical arguments
x_filtered_test = add_rolling_features(
    x_filtered_test,
    window_sizes=[3, 6],
    columns=['GenRPM', 'GenPh1Temp', 'WindSpeed'], 
    stats=['max'],
    drop_na=False
)

In [None]:
# Adding lagged features: lags 1-3 of GenRPM and GenPh1Temp on both splits.
# drop_na=False asks the helper not to drop NaN rows
# (NOTE(review): confirm in add_lag_features).
df_filtered_train = add_lag_features(df_filtered_train, columns=['GenRPM', 'GenPh1Temp'], lags=[1, 2, 3], drop_na=False)
x_filtered_test = add_lag_features(x_filtered_test, columns=['GenRPM', 'GenPh1Temp'], lags=[1, 2, 3], drop_na=False)
# Refresh the feature list so it includes the rolling and lag columns
filt_cols = [col for col in df_filtered_train if col != 'Power']

In [None]:
# Random Forest hyperparameters; SEED is defined earlier in the notebook
params = {
    'n_estimators': 100, 
    'random_state': SEED,
    'n_jobs':-1
}

# Evaluate with rolling-max and lag features. The train target comes from the
# filtered frame, the test target from the unfiltered df_test.
# NOTE(review): the positional 3 / 'RF' / params presumably map to
# n_splits / model_name / model_params — confirm against the active eval_model.
eval_results  = eval_model(
    df_filtered_train[filt_cols],
    df_filtered_train['Power'], 
    x_filtered_test, 
    df_test['Power'], 
    3, 
    'RF', 
    params
)
eval_results

In [None]:
# 'cv_mae': 56.56,
#  'cv_rmse': 79.71,
#  'cv_mape': 6.81,
#  'test_mae': 68.48,
#  'test_rmse': 118.48,
#  'test_mape': 8.85,

In [None]:
# This is baseline
# 'cv_mae': 55.25,
#  'cv_rmse': 80.82,
#  'cv_mape': 6.58,
#  'test_mae': 64.44,
#  'test_rmse': 114.03,
#  'test_mape': 8.3,

We see that we have not got much of improvement.

Moreover, we see pretty much the same performance as in our baseline.

The most likely reason is that the data has a lot of noise and the transformations we applied focus on more or less the same - remove outliers and smooth the noise to extract the true signal.

The biggest potential boost we have achieved was using the low pass FFT filter which however introduced a data leakage.

# Conclusions from deep data cleaning

From all the analysis above, we can conclude that future improvement can be achieved by better data denoising.

To make our pipeline more flexible for future development, we will introduce:
- Filtering by difference to remove the outliers
- Smoothing the noise with a mean filter (can be substituted by the Butterworth filter)
- Adding rolling statistical features - only the max value for now, but we will use a function that can add more statistics
- Adding lagged features (except the target).

# Model Selection: Random Forest

First, let's prepare the function that creates the final cleaning and feature engineering pipeline. 

In [None]:
def get_clean_data(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    abs_diff_thresholds: Dict[str, float],
    smooth_window: int
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """
    Clean and enrich train/test turbine datasets.

    Pipeline (applied identically to train and test features):
    1. filter outliers using absolute-difference thresholds,
    2. denoise every feature column with a rolling-mean filter,
    3. add lag features,
    4. add rolling-max statistical features.

    Assumptions
    ----------
    - `df_train` and `df_test` contain a target column named 'Power'.
    - `remove_diff_outliers` does NOT drop rows; otherwise the returned
      `y_test` (taken straight from `df_test`) would no longer align with
      `X_test`. NOTE(review): confirm against the helper's implementation.
    - Columns required for feature engineering are present:
      ['GenRPM', 'GenPh1Temp', 'WindSpeed', 'WindDirAbs',
       'WindDirRel', 'Pitch', 'RotorRPM'].
    - Helper functions `remove_diff_outliers`, `smooth_signal`,
      `add_lag_features`, and `add_rolling_features` are defined elsewhere.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training dataset with features and target ('Power').
    df_test : pd.DataFrame
        Test dataset with features and target ('Power').
    abs_diff_thresholds : Dict[str, float]
        Mapping from column name to the absolute-difference threshold passed
        to `remove_diff_outliers`. On the train set every key is processed
        (including 'Power' if present); on the test set 'Power' is skipped.
    smooth_window : int
        Window size of the rolling-mean filter applied to every feature
        column (the target is never smoothed).

    Returns
    -------
    X_train : pd.DataFrame
        Cleaned and feature-engineered training features (without 'Power').
    y_train : pd.Series
        Training target ('Power') after outlier filtering.
    X_test : pd.DataFrame
        Cleaned and feature-engineered test features (without 'Power').
    y_test : pd.Series
        Original test target ('Power'), not filtered by outlier logic.
    """
    # Feature-engineering configuration, declared once so train and test
    # cannot drift apart.
    lag_columns = ("GenRPM", "GenPh1Temp", "WindSpeed")
    lag_periods = (1, 2, 3)
    rolling_columns = ("GenRPM", "GenPh1Temp", "WindSpeed",
                       "WindDirAbs", "WindDirRel", "Pitch", "RotorRPM")
    rolling_windows = (3, 6)  # e.g. (5, 10, 15)
    rolling_stats = ("max",)

    df_filtered_train = df_train.copy()

    # --- Filter by difference ---
    # Train set: features + target
    for col, thr in abs_diff_thresholds.items():
        df_filtered_train, _ = remove_diff_outliers(df_filtered_train, col, thr)

    # Test set: features only (the target must stay untouched)
    filt_cols = [col for col in df_test.columns if col != "Power"]
    x_filtered_test = df_test[filt_cols].copy()

    for col, thr in abs_diff_thresholds.items():
        if col != "Power":
            x_filtered_test, _ = remove_diff_outliers(x_filtered_test, col, thr)

    # --- Denoising: rolling mean on every feature column ---
    smooth_cols = [col for col in df_filtered_train.columns if col != "Power"]

    for col in smooth_cols:
        df_filtered_train = smooth_signal(
            df_filtered_train, col, window=smooth_window, method="mean"
        )

    for col in smooth_cols:
        x_filtered_test = smooth_signal(
            x_filtered_test, col, window=smooth_window, method="mean"
        )

    # --- Lag features ---
    # Fresh lists are passed on every call so a helper cannot share state
    # between the train and test invocations.
    df_filtered_train = add_lag_features(
        df_filtered_train,
        columns=list(lag_columns),
        lags=list(lag_periods),
        drop_na=False,
    )
    x_filtered_test = add_lag_features(
        x_filtered_test,
        columns=list(lag_columns),
        lags=list(lag_periods),
        drop_na=False,
    )

    # --- Rolling statistical features ---
    df_filtered_train = add_rolling_features(
        df_filtered_train,
        window_sizes=list(rolling_windows),
        columns=list(rolling_columns),
        stats=list(rolling_stats),
        drop_na=False,
    )
    x_filtered_test = add_rolling_features(
        x_filtered_test,
        window_sizes=list(rolling_windows),
        columns=list(rolling_columns),
        stats=list(rolling_stats),
        drop_na=False,
    )

    X_train = df_filtered_train.drop(columns="Power")
    y_train = df_filtered_train["Power"]
    X_test = x_filtered_test
    y_test = df_test["Power"]

    return X_train, y_train, X_test, y_test

In [None]:
# Reload the raw dataset so this section starts from a clean state
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')

# Keep only rows where Power exceeds 20
df = df.loc[df['Power'] > 20].copy()

# Index by the parsed timestamps, then drop the now-redundant column
df.index = pd.to_datetime(df['Timestamps'])
df = df.drop(columns=['Timestamps'])

# Positional split: first 30,000 rows for training, the rest for testing
df_train, df_test = df.iloc[:30_000], df.iloc[30_000:]

In [None]:
# Run the full cleaning + feature-engineering pipeline with a 3-sample smoothing window
x_train, y_train, x_test, y_test = get_clean_data(df_train, df_test, abs_diff_thresholds, smooth_window=3)

In [None]:
# Random Forest hyperparameters; SEED is defined earlier in the notebook
params = {
    'n_estimators': 100, 
    'random_state': SEED,
    'n_jobs':-1
}

# Evaluate on the output of get_clean_data.
# NOTE(review): the positional 3 / 'RF' / params presumably map to
# n_splits / model_name / model_params — confirm against the active eval_model.
eval_results = eval_model(
    x_train,
    y_train,
    x_test,
    y_test,
    3,
    'RF',
    params
)

eval_results

In [None]:
# 'cv_mae': 55.81,
#  'cv_rmse': 78.73,
#  'cv_mape': 6.72,
#  'test_mae': 67.58,
#  'test_rmse': 117.83,
#  'test_mape': 8.72,

This seems to be the best model so far, even though the improvement is not big compared to the model fitted on just the cleaned data.

In [None]:
# Use the x_test / y_test returned by get_clean_data in this section. The old
# x_filtered_test global belongs to an earlier section and has a different feature set.
plot_errors(x_test, y_test, eval_results['y_pred_test'], error='mape', error_threshold=8.5, rolling_window=288)

# Model Selection: CatBoost

In [None]:
# Reload the raw dataset so this section starts from a clean state
df = pd.read_parquet('../data/01_raw/df_train_test.parquet')

# Keep only rows where Power exceeds 20
df = df.loc[df['Power'] > 20].copy()

# Index by the parsed timestamps, then drop the now-redundant column
df.index = pd.to_datetime(df['Timestamps'])
df = df.drop(columns=['Timestamps'])

# Positional split: first 30,000 rows for training, the rest for testing
df_train, df_test = df.iloc[:30_000], df.iloc[30_000:]

In [None]:
# Run the full cleaning + feature-engineering pipeline with a 3-sample smoothing window
x_train, y_train, x_test, y_test = get_clean_data(df_train, df_test, abs_diff_thresholds, smooth_window=3)

In [None]:
# CatBoost parameters, spelled out explicitly (the author treats these as the
# library defaults — NOTE(review): verify against the installed CatBoost version)
params = {
    "iterations": 1000,       # number of trees
    "learning_rate": 0.03,    # shrinkage
    "depth": 6,               # tree depth
    "l2_leaf_reg": 3.0,       # L2 regularization
    "random_seed": SEED,         # reproducibility
    "loss_function": "RMSE",  # regression loss
    "verbose": False          # silence logs
}

# Evaluate on the output of get_clean_data.
# NOTE(review): the positional 3 / 'CatBoost' / params presumably map to
# n_splits / model_name / model_params — confirm against the active eval_model.
eval_results = eval_model(
    x_train,
    y_train,
    x_test,
    y_test,
    3,
    'CatBoost',
    params
)
eval_results

In [None]:
# Use the x_test / y_test returned by get_clean_data in this section. The old
# x_filtered_test global belongs to an earlier section and has a different feature set.
plot_errors(x_test, y_test, eval_results['y_pred_test'], error='mape', error_threshold=8.5, rolling_window=288)

We see that CatBoost with default parameters gives slightly better performance (but not much improvement).

It can potentially get better if we tune hyperparameters.

# Model Selection: LSTM

In [None]:
class LSTMRegressor(nn.Module):
    """
    Sequence-to-scalar regressor: stacked LSTM followed by a small MLP head.

    Layout
    ------
    - `lstm`: batch-first nn.LSTM with configurable width and depth
    - `head`: Linear(hidden_size -> hidden_size // 2) -> ReLU -> Linear(-> 1)
    - The prediction is computed from the final hidden state of the top
      LSTM layer.
    """

    def __init__(
        self,
        n_features: int,
        hidden_size: int = 128,
        num_layers: int = 2,
        dropout: float = 0.0,
    ) -> None:
        """
        Build the LSTM and the regression head.

        Parameters
        ----------
        n_features : int
            Size of the feature vector at each time step.
        hidden_size : int, default=128
            Width of every LSTM layer.
        num_layers : int, default=2
            How many LSTM layers are stacked.
        dropout : float, default=0.0
            Dropout passed to nn.LSTM; forced to 0.0 when there is a single layer.
        """
        super().__init__()

        # With a single layer the dropout argument is replaced by 0.0
        inter_layer_dropout = 0.0 if not num_layers > 1 else dropout

        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        # Two-layer head with a half-width bottleneck
        bottleneck = hidden_size // 2
        self.head = nn.Sequential(
            nn.Linear(hidden_size, bottleneck),
            nn.ReLU(),
            nn.Linear(bottleneck, 1),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Map a batch of sequences to one prediction per sequence.

        Parameters
        ----------
        x : torch.Tensor
            Batch of sequences, shape (batch_size, seq_len, n_features).

        Returns
        -------
        torch.Tensor
            Predictions of shape (batch_size, 1).
        """
        # The per-step outputs and the cell state are not needed;
        # the final hidden states have shape (num_layers, B, H).
        _, (final_hidden, _) = self.lstm(x)
        top_layer_state = final_hidden[-1]  # (B, H)
        return self.head(top_layer_state)   # (B, 1)

In [None]:
def make_sequences(
    x_np: np.ndarray,
    y_np: np.ndarray,
    seq_len: int,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Create sliding window sequences for time series data.

    Window `i` contains rows `i .. i + seq_len - 1` of `x_np`; its target is
    `y_np[i + seq_len]`, i.e. the value at the time step right after the
    window. The features of the target time step itself are therefore not
    part of the window.

    Parameters
    ----------
    x_np : np.ndarray
        Input features array of shape (n_samples, n_features).
    y_np : np.ndarray
        Target values array of shape (n_samples,).
    seq_len : int
        Length of each input sequence (number of time steps to look back).

    Returns
    -------
    X_seq : np.ndarray
        Sequence inputs of shape (n_samples - seq_len, seq_len, n_features).
    y_seq : np.ndarray
        Sequence targets of shape (n_samples - seq_len,).

    Raises
    ------
    ValueError
        If `seq_len` is negative or `x_np` has no more than `seq_len` rows,
        so that not a single (window, target) pair can be built.
    IndexError
        If `y_np` is too short to provide a target for every window.

    Examples
    --------
    >>> x = np.array([[1], [2], [3], [4], [5]])
    >>> y = np.array([10, 20, 30, 40, 50])
    >>> X_seq, y_seq = make_sequences(x, y, seq_len=2)
    >>> X_seq.shape
    (3, 2, 1)
    >>> y_seq
    array([30, 40, 50])
    """
    x_arr = np.asarray(x_np)
    y_arr = np.asarray(y_np)

    if seq_len < 0:
        raise ValueError(f"seq_len must be non-negative, got {seq_len}.")

    n_sequences = len(x_arr) - seq_len
    if n_sequences <= 0:
        # Fail with an actionable message instead of np.stack's
        # "need at least one array to stack".
        raise ValueError(
            f"Cannot build sequences: need more than seq_len={seq_len} "
            f"samples, got {len(x_arr)}."
        )

    # Row i of window_idx holds the indices i, i+1, ..., i+seq_len-1, so one
    # fancy-indexing gather replaces the Python loop over windows.
    starts = np.arange(n_sequences)
    window_idx = starts[:, None] + np.arange(seq_len)[None, :]

    X_seq = x_arr[window_idx]
    y_seq = y_arr[starts + seq_len]  # out-of-range targets raise IndexError
    return X_seq, y_seq

In [None]:
def fit_lstm(
    x_train_df: pd.DataFrame,
    y_train_ser: pd.Series,
    x_val_df: pd.DataFrame,
    y_val_ser: pd.Series,
    model_params: Optional[Dict[str, Any]] = None,
) -> Tuple[np.ndarray, np.ndarray, nn.Module]:
    """
    Fit a many-to-one LSTM on (x_train, y_train) and predict on x_val.

    Steps
    -----
    1. Standardize X with StandardScaler (fitted on train only)
    2. Standardize y with StandardScaler (fitted on train only)
    3. Create sequences of length seq_len
    4. Build LSTM (hidden layers, dropout) on GPU if available
    5. Train with MSE loss + Adam
    6. Predict on validation data
    7. Invert scaling for predictions

    Because every sample needs `seq_len` preceding rows, the first `seq_len`
    validation rows get no prediction; the returned targets are trimmed to
    match.

    Parameters
    ----------
    x_train_df : pd.DataFrame
        Training features.
    y_train_ser : pd.Series
        Training target.
    x_val_df : pd.DataFrame
        Validation features.
    y_val_ser : pd.Series
        Validation target.
    model_params : dict, optional
        Dictionary containing hyperparameters:
        - seq_len: int, default=48
            Sequence length for LSTM input
        - hidden_size: int, default=128
            Hidden size of LSTM layers
        - num_layers: int, default=2
            Number of LSTM layers
        - dropout: float, default=0.0
            Dropout rate (only applied if num_layers > 1)
        - lr: float, default=1e-3
            Learning rate for Adam optimizer
        - batch_size: int, default=128
            Batch size for training
        - epochs: int, default=40
            Number of training epochs
        - verbose: bool, default=True
            Whether to print training progress

    Returns
    -------
    y_val_aligned : np.ndarray
        Validation target (original scale, aligned with predictions).
    y_pred_val : np.ndarray
        Validation predictions (original scale).
    model : nn.Module
        Trained LSTM model.
    """

    if model_params is None:
        model_params = {}

    # ---- hyperparams (with sensible defaults) ----
    seq_len = model_params.get("seq_len", 48)
    hidden_size = model_params.get("hidden_size", 128)
    num_layers = model_params.get("num_layers", 2)
    dropout = model_params.get("dropout", 0.0)
    lr = model_params.get("lr", 1e-3)
    batch_size = model_params.get("batch_size", 128)
    epochs = model_params.get("epochs", 40)
    verbose = model_params.get("verbose", True)

    # SEED is a notebook-level constant
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- scale X (statistics come from the training split only) ----
    x_scaler = StandardScaler()
    x_train_scaled = x_scaler.fit_transform(x_train_df.values.astype(np.float32))
    x_val_scaled = x_scaler.transform(x_val_df.values.astype(np.float32))

    # ---- scale y ----
    y_train = y_train_ser.values.astype(np.float32).reshape(-1, 1)
    y_val = y_val_ser.values.astype(np.float32).reshape(-1, 1)

    y_scaler = StandardScaler()
    y_train_scaled = y_scaler.fit_transform(y_train).flatten()
    y_val_scaled = y_scaler.transform(y_val).flatten()

    # ---- make sequences (in scaled space) ----
    X_train_seq, y_train_seq = make_sequences(x_train_scaled, y_train_scaled, seq_len)
    X_val_seq, y_val_seq = make_sequences(x_val_scaled, y_val_scaled, seq_len)

    # ORIGINAL-scale validation targets for metrics. make_sequences pairs
    # window i with y[i + seq_len], so the aligned targets are simply
    # y_val[seq_len:len(x_val)]. Slicing avoids rebuilding the whole window
    # tensor a second time just to read off the targets.
    y_val_orig_seq = y_val.flatten()[seq_len:len(x_val_scaled)]

    train_ds = TensorDataset(
        torch.from_numpy(X_train_seq),
        torch.from_numpy(y_train_seq).view(-1, 1)
    )
    val_ds = TensorDataset(
        torch.from_numpy(X_val_seq),
        torch.from_numpy(y_val_seq).view(-1, 1)
    )

    # Training batches are shuffled; validation keeps its order so the
    # predictions line up with y_val_orig_seq.
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # ---- model ----
    n_features = x_train_df.shape[1]
    model = LSTMRegressor(
        n_features=n_features,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # ---- training loop with progress ----
    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size
            # so the epoch loss is a per-sample average
            train_loss += loss.item() * X_batch.size(0)

        train_loss /= len(train_ds)

        # validation (monitoring only; it does not influence training)
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                preds = model(X_batch)
                loss = criterion(preds, y_batch)
                val_loss += loss.item() * X_batch.size(0)

        val_loss /= len(val_ds)

        if verbose:
            print(f"Epoch {epoch:03d}/{epochs} | train MSE (scaled): {train_loss:.4f} | "
                  f"val MSE (scaled): {val_loss:.4f}")

    # ---- predict on validation ----
    model.eval()
    preds_list = []
    with torch.no_grad():
        for X_batch, _ in val_loader:
            X_batch = X_batch.to(device)
            preds = model(X_batch).cpu().numpy().flatten()
            preds_list.append(preds)

    # Back to the original target scale
    y_pred_val_scaled = np.concatenate(preds_list).reshape(-1, 1)
    y_pred_val = y_scaler.inverse_transform(y_pred_val_scaled).flatten()

    # original-scale y for metrics (same length as y_pred_val)
    y_val_aligned = y_val_orig_seq

    return y_val_aligned, y_pred_val, model

Let's add LSTM and MLP to the eval_model function

In [None]:
def eval_model(
    x_train: pd.DataFrame,
    y_train: pd.Series,
    x_test: pd.DataFrame,
    y_test: pd.Series,
    n_splits: int = 3,
    model_name: str = "RF",
    model_params: Union[Mapping[str, Any], None] = None,
) -> Dict[str, Any]:
    """
    Score a regressor with expanding-window time-series CV on the training
    data, then refit it on the whole training set and score it on the test set.

    Parameters
    ----------
    x_train, y_train : pd.DataFrame, pd.Series
        Training features / target; rows are split with ``TimeSeriesSplit``.
    x_test, y_test : pd.DataFrame, pd.Series
        Hold-out features / target, used only for the final evaluation.
    n_splits : int, default=3
        Number of ``TimeSeriesSplit`` folds.
    model_name : str, default="RF"
        One of "RF", "LinReg", "CatBoost", "LSTM", "MLP".
    model_params : mapping or None
        Keyword arguments for the estimator (RF / LinReg / CatBoost), or the
        parameter dict forwarded to ``fit_lstm`` / ``fit_mlp``.

    Returns
    -------
    dict
        Keys "cv_mae", "cv_rmse", "cv_mape" (means over folds) and
        "test_mae", "test_rmse", "test_mape", all rounded to 2 decimals,
        plus "y_pred_test" and the final fitted "model".
        For "LSTM", "y_pred_test" has the length of ``y_test`` with the first
        ``seq_len`` entries set to NaN; test metrics skip those entries.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    np.random.seed(SEED)

    model_params = {} if model_params is None else model_params

    def make_tabular_model():
        # Estimators that are fit on standardized features via .fit/.predict.
        # Kept as an if-chain so only the requested class name is looked up.
        if model_name == "RF":
            return RF(**model_params)
        if model_name == "LinReg":
            return LinearRegression(**model_params)
        if model_name == "CatBoost":
            catboost_kwargs = dict(model_params)
            catboost_kwargs.setdefault("verbose", False)
            catboost_kwargs.setdefault("random_seed", SEED)
            return CatBoostRegressor(**catboost_kwargs)
        raise ValueError(f"Unknown model_name: {model_name}")

    # ------------------ cross-validation ------------------
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_scores = []  # one (mae, rmse, mape) tuple per fold

    for fit_rows, hold_rows in splitter.split(x_train):
        x_fit, x_hold = x_train.iloc[fit_rows, :], x_train.iloc[hold_rows, :]
        y_fit, y_hold = y_train.iloc[fit_rows], y_train.iloc[hold_rows]

        if model_name == "LSTM":
            # fit_lstm returns the target it aligned with its predictions
            y_true_fold, y_hat_fold, _ = fit_lstm(
                x_fit, y_fit, x_hold, y_hold, model_params
            )
        elif model_name == "MLP":
            y_true_fold, y_hat_fold, _ = fit_mlp(
                x_fit, y_fit, x_hold, y_hold, model_params
            )
        else:
            # scaler is fit on the fold's training part only
            fold_scaler = StandardScaler()
            x_fit_scaled = fold_scaler.fit_transform(x_fit)
            x_hold_scaled = fold_scaler.transform(x_hold)

            estimator = make_tabular_model()
            estimator.fit(x_fit_scaled, y_fit)
            y_true_fold = y_hold
            y_hat_fold = estimator.predict(x_hold_scaled)

        fold_mae, fold_rmse, fold_mape = compute_metrics(y_true_fold, y_hat_fold)
        fold_scores.append((fold_mae, fold_rmse, fold_mape))

    cv_mae, cv_rmse, cv_mape = (
        float(np.mean(metric_values)) for metric_values in zip(*fold_scores)
    )

    # ----------- final model on the full training set -------------
    if model_name == "LSTM":
        _, lstm_preds, model = fit_lstm(
            x_train, y_train, x_test, y_test, model_params
        )
        # NOTE(review): fallback of 48 presumably mirrors fit_lstm's own
        # default sequence length — confirm they stay in sync.
        seq_len = model_params.get("seq_len", 48)

        # pad the head with NaN so predictions line up with y_test
        y_pred_test = np.full(len(y_test), np.nan, dtype=float)
        y_pred_test[seq_len:] = lstm_preds

        test_scores = compute_metrics(
            y_test.values[seq_len:], y_pred_test[seq_len:]
        )

    elif model_name == "MLP":
        y_test_aligned, y_pred_test, model = fit_mlp(
            x_train, y_train, x_test, y_test, model_params
        )
        test_scores = compute_metrics(y_test_aligned, y_pred_test)

    else:
        model = make_tabular_model()

        final_scaler = StandardScaler()
        x_train_scaled = final_scaler.fit_transform(x_train)
        x_test_scaled = final_scaler.transform(x_test)

        model.fit(x_train_scaled, y_train)
        y_pred_test = model.predict(x_test_scaled)

        test_scores = compute_metrics(y_test, y_pred_test)

    mae_err_test, rmse_err_test, mape_err_test = test_scores

    return {
        "cv_mae": round(cv_mae, 2),
        "cv_rmse": round(cv_rmse, 2),
        "cv_mape": round(cv_mape, 2),
        "test_mae": round(mae_err_test, 2),
        "test_rmse": round(rmse_err_test, 2),
        "test_mape": round(mape_err_test, 2),
        "y_pred_test": y_pred_test,
        "model": model,
    }

In [None]:
# Notebook-level device handle: CUDA when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# LSTM configuration forwarded to fit_lstm through eval_model.
lstm_params = dict(
    seq_len=48,        # eval_model falls back to 48 as well when this key is absent
    hidden_size=128,
    num_layers=2,
    dropout=0.0,
    lr=1e-3,
    batch_size=128,
    epochs=5,
    verbose=True,      # print per-epoch losses
)

# 3-fold time-series CV followed by a final fit on the full training set.
results_lstm = eval_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    n_splits=3,
    model_name="LSTM",
    model_params=lstm_params,
)

In [None]:
# Show the dictionary returned by eval_model for the LSTM run.
results_lstm

We see that the first attempt with LSTM does not work well.

In fact, it works way worse.

It can of course be the case that by finding a better architecture and hyperparameters, we can achieve better results.

However, based on the current results and given the data quality, the chances for this are not big.

This is because an LSTM works well on less noisy data with clear patterns over time.

In our case, even though we clean the data, it can still be hard for the model to learn the relationship without overfitting to the noise.

Another reason can be that the generated features make it harder (not easier) for the model to predict the target.

Lastly, it would be better to apply a log transformation to the target to bring it closer to a normal distribution.

However, this again adds complexity in production without providing a clear advantage.

In [None]:
# Plot the LSTM test-set errors using the MAPE error type.
plot_errors(
    x_filtered_test,
    df_test["Power"],
    results_lstm["y_pred_test"],
    error="mape",
    error_threshold=8.5,
    rolling_window=288,
)

# Model Selection: MLP

Let's try a simpler neural network.

In [None]:
class MLPRegressor(nn.Module):
    """
    Fully connected regressor.

    Each hidden layer is ``Linear -> ReLU`` (followed by ``Dropout`` when
    ``dropout > 0``); a final ``Linear`` maps to a single output value.
    """

    def __init__(self, n_features, hidden_sizes=None, dropout=0.1):
        """
        Parameters
        ----------
        n_features : int
            Number of input features.
        hidden_sizes : list[int] or None
            Width of each hidden layer, e.g. [256, 128, 64] => 3 hidden
            layers. ``None`` (the default) means [128, 64]. An empty list
            yields a single linear layer.
        dropout : float
            Dropout applied AFTER each hidden layer. Set 0 for no dropout.
        """
        super().__init__()

        # Resolve the default here instead of using a mutable list as the
        # default argument, which would be shared across all calls.
        if hidden_sizes is None:
            hidden_sizes = [128, 64]

        layers = []
        in_dim = n_features

        for h in hidden_sizes:
            layers.append(nn.Linear(in_dim, h))
            layers.append(nn.ReLU())

            if dropout > 0:
                layers.append(nn.Dropout(dropout))

            # the next layer consumes this layer's output width
            in_dim = h

        # final output layer: one regression value per sample
        layers.append(nn.Linear(in_dim, 1))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.net(x)

In [None]:
def fit_mlp(
    x_train_df: pd.DataFrame,
    y_train_ser: pd.Series,
    x_test_df: pd.DataFrame,
    y_test_ser: pd.Series,
    model_params: Dict[str, Any] = None,
) -> Tuple[np.ndarray, np.ndarray, nn.Module]:
    """
    Train an ``MLPRegressor`` on the training data and predict the test data.

    Features and target are standardized with scalers fit on the training
    data only. The network is trained with MSE loss and Adam on the
    standardized target; predictions are mapped back to the original scale
    before being returned. The test loss is computed after every epoch for
    monitoring only — it does not influence training.

    Parameters
    ----------
    x_train_df : pd.DataFrame
        Training features.
    y_train_ser : pd.Series
        Training target.
    x_test_df : pd.DataFrame
        Test features.
    y_test_ser : pd.Series
        Test target.
    model_params : dict, default=None
        Optional keys and their fallbacks: hidden_sizes ([128, 64]),
        dropout (0.1), lr (1e-3), batch_size (128), epochs (40),
        verbose (True).

    Returns
    -------
    y_test_orig : np.ndarray
        Test target as a flat float32 array (original scale).
    y_pred_test : np.ndarray
        Test predictions (original scale).
    model : nn.Module
        Trained model.
    """
    cfg = {} if model_params is None else model_params

    hidden_sizes = cfg.get("hidden_sizes", [128, 64])
    dropout = cfg.get("dropout", 0.1)
    lr = cfg.get("lr", 1e-3)
    batch_size = cfg.get("batch_size", 128)
    epochs = cfg.get("epochs", 40)
    verbose = cfg.get("verbose", True)

    # Seed before the model is built so weight init and shuffling repeat.
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- features: scaler fit on train only ----
    feature_scaler = StandardScaler()
    train_features = feature_scaler.fit_transform(
        x_train_df.values.astype(np.float32)
    )
    test_features = feature_scaler.transform(x_test_df.values.astype(np.float32))

    # ---- target: column vectors for the scaler, flat afterwards ----
    train_target = y_train_ser.values.astype(np.float32).reshape(-1, 1)
    test_target = y_test_ser.values.astype(np.float32).reshape(-1, 1)

    y_scaler = StandardScaler()
    train_target_scaled = y_scaler.fit_transform(train_target).flatten()
    test_target_scaled = y_scaler.transform(test_target).flatten()

    # ---- datasets & loaders ----
    train_ds = TensorDataset(
        torch.from_numpy(train_features),
        torch.from_numpy(train_target_scaled).view(-1, 1),
    )
    test_ds = TensorDataset(
        torch.from_numpy(test_features),
        torch.from_numpy(test_target_scaled).view(-1, 1),
    )

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    # ---- network, loss, optimizer ----
    model = MLPRegressor(
        n_features=x_train_df.shape[1],
        hidden_sizes=hidden_sizes,
        dropout=dropout,
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # ---- epochs: one training pass, then one monitoring pass on test ----
    for epoch in range(1, epochs + 1):
        model.train()
        running_train = 0.0
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()

            # weight by batch size so the epoch value is a per-sample mean
            running_train += batch_loss.item() * features.size(0)

        train_loss = running_train / len(train_ds)

        model.eval()
        running_test = 0.0
        with torch.no_grad():
            for features, targets in test_loader:
                features = features.to(device)
                targets = targets.to(device)

                batch_loss = criterion(model(features), targets)
                running_test += batch_loss.item() * features.size(0)

        test_loss = running_test / len(test_ds)

        if verbose:
            print(
                f"Epoch {epoch:03d}/{epochs} | "
                f"train MSE (scaled): {train_loss:.4f} | "
                f"test MSE (scaled): {test_loss:.4f}"
            )

    # ---- final predictions on test, back on the original scale ----
    model.eval()
    with torch.no_grad():
        pred_chunks = [
            model(features.to(device)).cpu().numpy().flatten()
            for features, _ in test_loader
        ]

    y_pred_scaled = np.concatenate(pred_chunks).reshape(-1, 1)
    y_pred_test = y_scaler.inverse_transform(y_pred_scaled).flatten()

    return test_target.flatten(), y_pred_test, model

In [None]:
# MLP configuration forwarded to fit_mlp through eval_model.
mlp_params = dict(
    hidden_sizes=[128, 128, 128],  # three hidden layers of equal width
    dropout=0.3,
    lr=1e-3,
    batch_size=128,
    epochs=50,                     # 100 was left as a commented-out alternative
    verbose=True,
)

# 3-fold time-series CV followed by a final fit on the full training set.
results_mlp = eval_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    n_splits=3,
    model_name="MLP",
    model_params=mlp_params,
)

In [None]:
# Show the dictionary returned by eval_model for the MLP run.
results_mlp

In [None]:
# Plot the MLP test-set errors using the MAPE error type.
plot_errors(
    x_filtered_test,
    df_test["Power"],
    results_mlp["y_pred_test"],
    error="mape",
    error_threshold=10,
    rolling_window=288,
)

We see that MLP gives a bit worse results in terms of the ML metrics but similar in terms of the business metric.

For now, we can iterate more over CatBoost hyperparameter tuning, however MLP can also be tested.

The problem with MLP, though, is that it has a much wider hyperparameter space.

# Conclusions

### **1. The real issue is noise, not the model**
All meaningful improvements came from cleaning and smoothing the signal.
This system is noise-limited, not model-limited.

### **2. FFT looked great only because it leaked future data**
Using the full sequence gave unrealistic results.  
When we removed future information, we got worse performance.  

### **3. Butterworth and mean filters gave more realistic results**
They use only past values and are stable, fast, low-lag, and production-safe.  

### **4. Difference-based outlier removal works well**
We observed that the Z-score filter misses spikes. Difference-based filtering removes them better.  

### **5. Extra features didn’t help because the model already had the signal**
RPM + WindSpeed already capture almost everything.  
New domain features were redundant for tree models, even though they showed good correlation with the target.

### **6. Proposed production pipeline for cleaning**
1. Diff-based outlier removal  
2. Mean filtering (can be Butterworth)  
3. Small rolling stats (optional)
4. Lagged values for some features  
5. Train RF/CatBoost on the cleaned data
6. MLP can be tested if wanted.