In [None]:
import warnings
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.models.random_forest_utils import fit_rf, eval_rul, plot_rmse, Metrics

from src.data_utils.feature_engineering import create_features, FeatureEngineeringSettings
from src.utils.config import config

# Global NumPy seeding is currently disabled.
# np.random.seed(34)
# Suppress every warning for the rest of the session (no category filter is given).
warnings.filterwarnings("ignore")

## Load and apply FE

In [None]:
# Feature-engineering settings built with no arguments, i.e. the class defaults.
fe_settings = FeatureEngineeringSettings()

In [None]:
# Folder holding the prepared CSV files, taken from the project configuration.
prepared_folder = config.PREPARED_DATA_PATH

# Read the train and test files with identical options.
# index_col=False tells pandas not to use the first column as the index.
train_df, test_df = (
    pd.read_csv(prepared_folder / csv_name, index_col=False)
    for csv_name in ("train-all-prepared.csv", "test-all-prepared.csv")
)

In [None]:
# Apply the same feature engineering (same settings object) to both frames.
# NOTE(review): train_df/test_df are rebound to the transformed frames, so
# re-running this cell alone passes already-transformed data to create_features
# again; re-run the loading cell first.
train_df = create_features(train_df, fe_settings)
test_df = create_features(test_df, fe_settings)

In [None]:
# Quick size check (rows, columns) of the training frame after feature engineering.
train_df.shape

In [None]:
# Working aliases used by the rest of the notebook. train_to_use is replaced by
# the RUL-filtered frame in the Filtering section; test_to_use stays unfiltered.
train_to_use = train_df
test_to_use = test_df

## Filtering

We only keep training rows whose RUL falls within a preselected range for each subset (corresponding to the RUL values found in the `test` subset).

In [None]:
# Inclusive RUL window kept for each subset id.
rul_thresholds = {
    1: {"max": 145, "min": 6},
    2: {"max": 194, "min": 6},
    3: {"max": 145, "min": 6},
    4: {"max": 194, "min": 6},
}

# For every subset, keep only the rows whose RUL lies inside that subset's
# window (both bounds included).
filtered_dfs = [
    train_df[(train_df["subset"] == subset_id) & train_df["RUL"].between(bounds["min"], bounds["max"])]
    for subset_id, bounds in rul_thresholds.items()
]

# Stack the filtered pieces in subset order with a fresh 0..n-1 index.
train_to_use = pd.concat(filtered_dfs, ignore_index=True)

# 1. Train CV

Iterate here to find the best hyperparameters (and feature-engineering settings: rolling-window sizes, etc.) -> script, Optuna, MLflow; also parametrize the RUL range used for training as a hyperparameter.

In [None]:
# Fit on the RUL-filtered training frame with fit_rf's default arguments.
# The disabled variant below passed an explicit param_grid instead.
# best_model, val_rmse = fit_rf(train_to_use, param_grid)
best_model, val_rmse = fit_rf(train_to_use)

In [None]:
# Score the fitted model on the full (unfiltered) test frame.
y_pred, y_test, metrics = eval_rul(best_model, test_to_use)

In [None]:
# Plot actual vs. predicted values together with the RMSE from the evaluation above.
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

## 2.2 Mutual Information

In [None]:
# from sklearn.feature_selection import mutual_info_regression
#
# mi_scores = mutual_info_regression(X, y)
# mi_df = pd.DataFrame({
#     'feature': X.columns,
#     'mutual_info': mi_scores
# }).sort_values('mutual_info', ascending=False)

## 2.3 Recursive Feature Elimination (RFE)

In [None]:
# from sklearn.feature_selection import RFE
#
# rfe = RFE(estimator=RandomForestRegressor(n_estimators=100),
#           n_features_to_select=50)
# rfe.fit(X, y)
# selected_features = X.columns[rfe.support_]

## 2.4 Variance Threshold

In [None]:
# from sklearn.feature_selection import VarianceThreshold
#
# # Remove features with very low variance
# selector = VarianceThreshold(threshold=0.01)
# X_filtered = selector.fit_transform(X)

_____

# 3. Simulate adding categories one after the other (for model lifecycle simulation)

This is not ML research; the goal is to validate that this approach will work in the simulated model-lifecycle project.
Each time, the `train` subset is used for training and the `test` subset for testing.
For instance: train on 001_train, evaluate on 002_test.

In [None]:
# Display the RUL-filtered training frame that the per-subset split below starts from.
train_to_use

In [None]:
def subset_dfs(df: pd.DataFrame, subset_ids: tuple[int, ...] = (1, 2, 3, 4)) -> list[pd.DataFrame]:
    """Split ``df`` into one independent frame per subset id.

    Args:
        df: Frame containing a ``subset`` column.
        subset_ids: Subset ids to extract, in output order. Defaults to
            ``(1, 2, 3, 4)``, matching the previous hard-coded behaviour.

    Returns:
        One frame per id in ``subset_ids`` (same order). An id with no
        matching rows yields an empty frame.
    """
    # .copy() so that later edits to a piece never write back into ``df``.
    return [df[df["subset"] == subset_id].copy() for subset_id in subset_ids]


# Per-subset pieces: index 0 holds subset 1, ..., index 3 holds subset 4.
# Training pieces come from the RUL-filtered frame, test pieces from the unfiltered one.
subset_train_dfs: list[pd.DataFrame] = subset_dfs(train_to_use)
subset_test_dfs: list[pd.DataFrame] = subset_dfs(test_to_use)

In [None]:
# Name each per-subset frame; list position i holds subset id i + 1.
FD001_train, FD002_train, FD003_train, FD004_train = subset_train_dfs
FD001_test, FD002_test, FD003_test, FD004_test = subset_test_dfs

## 3.0 Simulate engine train Cat 001 - Test on cat 001

In [None]:
# Baseline: fit on the subset-1 training rows only and score on the subset-1 test rows.
best_model, val_rmse = fit_rf(FD001_train)
y_pred, y_test, metrics = eval_rul(best_model, FD001_test)
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

## 3.1 Simulate engine train Cat 001 - Test on cat 001 + 002

In [None]:
# Combined subset-1 + subset-2 test frame.
# NOTE(review): FD001_2_test is built but never used below; the evaluation runs on
# FD002_test only, while the section heading says "001 + 002". Confirm which of
# the two is intended.
FD001_2_test = pd.concat([FD001_test, FD002_test], ignore_index=True)

# Score the model from the previous section (fitted on subset 1 only, assuming
# the cells ran in order) on the subset-2 test rows.
y_pred, y_test, metrics = eval_rul(best_model, FD002_test)
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

In [None]:
# Retrain on subsets 1 + 2 combined; this replaces best_model for the cells that follow.
FD001_2_train = pd.concat([FD001_train, FD002_train], ignore_index=True)
best_model, val_rmse = fit_rf(FD001_2_train)

In [None]:
# Same subset-2 test rows as before, now scored with the model retrained on subsets 1 + 2.
y_pred, y_test, metrics = eval_rul(best_model, FD002_test)
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

## 3.2 Simulate engine train Cat 001 + 002 - Test on cat 003

In [None]:
# Score the current model (fitted on subsets 1 + 2, assuming the cells ran in
# order) on the subset-3 test rows, which it has not been trained on.
y_pred, y_test, metrics = eval_rul(best_model, FD003_test)
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

In [None]:
# Retrain on subsets 1 + 2 + 3 combined, then score again on the subset-3 test rows.
FD001_2_3_train = pd.concat([FD001_train, FD002_train, FD003_train], ignore_index=True)

best_model, val_rmse = fit_rf(FD001_2_3_train)

y_pred, y_test, metrics = eval_rul(best_model, FD003_test)
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

## 3.3 Simulate engine train Cat 001 + 002 + 003 - Test on cat 004

In [None]:
# Score the current model (fitted on subsets 1 + 2 + 3, assuming the cells ran
# in order) on the subset-4 test rows, which it has not been trained on.
y_pred, y_test, metrics = eval_rul(best_model, FD004_test)
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

In [None]:
# Retrain on all four subsets combined, then score again on the subset-4 test rows.
FD001_2_3_4_train = pd.concat([FD001_train, FD002_train, FD003_train, FD004_train], ignore_index=True)

best_model, val_rmse = fit_rf(FD001_2_3_4_train)

y_pred, y_test, metrics = eval_rul(best_model, FD004_test)
fig = plot_rmse(y_test, y_pred, metrics.rmse)
fig.show()

_____

# Strategy: all units shuffled - train/test in the same order - evaluate on a sliding window - detect drift -> retrain on the units reached so far in test

In [None]:
# Number of distinct unit_number values in the training and test frames.
print(train_to_use["unit_number"].nunique())
print(test_to_use["unit_number"].nunique())

In [None]:
# Distinct unit ids present in each frame.
train_units = set(train_to_use["unit_number"].values)
test_units = set(test_to_use["unit_number"].values)

# Only units present in both frames can be trained on and evaluated on.
common_units = train_units.intersection(test_units)  # 1001, 1002, 1003, .... 4001, 4002, etc
num_common = len(common_units)

print(f"Number of common unit_numbers: {num_common}")

In [None]:
# Materialise the shared unit ids as a list (in the set's iteration order).
common_units_list = list(common_units)
# Shuffling is currently disabled, so the list keeps that order.
# np.random.shuffle(common_units_list)

# Take the first 10%
num_units_to_take = int(len(common_units_list) * 0.1)
selected_units = common_units_list[:num_units_to_take]

# Training rows (RUL-filtered) belonging to the selected units.
train_subset = train_to_use[train_to_use["unit_number"].isin(selected_units)]


# Test rows belonging to the same selected units.
test_subset = test_to_use[test_to_use["unit_number"].isin(selected_units)]

# Fit on the selected units and score on the test rows of those same units.
best_model, val_rmse = fit_rf(train_subset)
y_pred, y_test, metrics = eval_rul(best_model, test_subset)
# fig = plot_rmse(y_test, y_pred, metrics.rmse)
# fig.show()

In [None]:
# Materialise the shared unit ids as a list; shuffling is currently disabled.
common_units_list = list(common_units)
# np.random.shuffle(common_units_list)

# Parameters
train_ratio = 0.1  # Ratio of common units used for the initial training
eval_portion = 0.25  # Fraction of the pool's test rows randomly sampled for each evaluation
step_size = 0.025  # Increase evaluation pool by 2.5% each iteration
max_ratio = 1.0  # Go up to 100%

# Storage for results
eval_pool_ratios = []
rmse_values = []

In [None]:
# Train the initial model on the first train_ratio share of the common units.
print(f"Training model on {train_ratio * 100:.1f}% of units...")
num_units_to_take = int(len(common_units_list) * train_ratio)
selected_units = common_units_list[:num_units_to_take]

# Training rows (RUL-filtered) belonging to the selected units.
train_subset = train_to_use[train_to_use["unit_number"].isin(selected_units)]

# Train the model
best_model, val_rmse = fit_rf(train_subset)
# The validation RMSE is only reported when fit_rf returned one (it may be None).
if val_rmse is not None:
    print(f"Training complete on {len(selected_units)} units. Validation RMSE: {val_rmse:.4f}")
else:
    print(f"Training complete on {len(selected_units)} units.")

In [None]:
# Drift simulation: grow the evaluation pool step by step, score the current
# model on a random sample of the pool's test rows, and retrain on the pool's
# training rows whenever the RMSE crosses the drift threshold.
# NOTE: best_model is replaced on every retraining, so re-run the initial
# training cell before re-running this one to start from the same model.

DRIFT_RMSE_THRESHOLD = 29.0  # dummy drift detection: retrain when RMSE exceeds this

# Reset the result containers so that re-running this cell does not append a
# second copy of every point to lists created in an earlier cell.
eval_pool_ratios = []
rmse_values = []
retrain_ratios = []  # pool ratios at which the model was retrained

# Calculate evaluation pool ratios, starting at the training ratio itself.
# They are derived from an integer step count rather than by repeatedly adding
# step_size: accumulated float error can otherwise push the last value just
# above max_ratio and silently drop the final evaluation point.
# The 1e-9 guards the division against landing a hair below a whole number.
num_steps = max(int((max_ratio - train_ratio) / step_size + 1e-9), 0)
eval_ratios = [round(train_ratio + step * step_size, 10) for step in range(num_steps + 1)]

print(f"Will evaluate with pool ratios: {[round(r * 100, 1) for r in eval_ratios]}%")

for eval_ratio in eval_ratios:
    # The pool is the first eval_ratio share of the common units.
    num_units_to_take = int(len(common_units_list) * eval_ratio)
    selected_units = common_units_list[:num_units_to_take]

    test_subset = test_to_use[test_to_use["unit_number"].isin(selected_units)]

    # Evaluate on a reproducible random sample of the pool's rows.
    test_subset_sample = test_subset.sample(frac=eval_portion, random_state=42)

    # Check the sample that is actually evaluated, and fail loudly. The previous
    # `assert (False, msg)` asserted a non-empty tuple, which is always true,
    # so it could never fire.
    if test_subset_sample.empty:
        raise ValueError(f"No test samples found for evaluation pool at {eval_ratio * 100:.1f}%")

    # Evaluate the model
    y_pred, y_test, metrics = eval_rul(best_model, test_subset_sample)

    eval_pool_ratios.append(eval_ratio)
    rmse_values.append(metrics.rmse)

    if metrics.rmse > DRIFT_RMSE_THRESHOLD:
        # Drift detected: retrain on every unit in the pool reached so far.
        train_subset = train_to_use[train_to_use["unit_number"].isin(selected_units)]
        best_model, _ = fit_rf(train_subset)
        # Track this retraining point
        retrain_ratios.append(eval_ratio)

In [None]:
# Plot RMSE against evaluation-pool size, mark the training size and every
# retraining point, save the figure, then print a short text summary.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot([r * 100 for r in eval_pool_ratios], rmse_values, "b-o", linewidth=2, markersize=6)
ax.set_xlabel("Evaluation Pool Size (% of Total Units)")
ax.set_ylabel("RMSE")
ax.set_title(
    f"Model Performance vs Evaluation Pool Size\n"
    f"(Trained on {train_ratio * 100:.1f}% units, Each evaluation uses {eval_portion * 100:.1f}% of available pool)"
)
ax.grid(True, alpha=0.3)

# Add vertical line showing training size
ax.axvline(
    x=train_ratio * 100, color="red", linestyle="--", alpha=0.7, label=f"Training size ({train_ratio * 100:.1f}%)"
)

# Add red lines at each actual retraining point; only the first one gets a
# legend entry so the legend is not flooded with duplicates.
for i, retrain_ratio in enumerate(retrain_ratios):
    ax.axvline(
        x=retrain_ratio * 100,
        color="red",
        linestyle=":",
        alpha=0.7,
        label="Model retrained" if i == 0 else "_nolegend_",
    )

ax.legend()

fig.tight_layout()

if rmse_values:
    # Summary box anchored to the top-left corner in axes coordinates.
    ax.text(
        0.02,
        0.98,
        f"Min RMSE: {min(rmse_values):.4f}\nMax RMSE: {max(rmse_values):.4f}\nFinal RMSE: {rmse_values[-1]:.4f}",
        transform=ax.transAxes,
        verticalalignment="top",
        bbox=dict(boxstyle="round", facecolor="lightblue", alpha=0.8),
    )

fig.savefig(os.path.join(config.TEMP_FOLDER, "drift_simulation_notebook.png"), dpi=300, bbox_inches="tight")
plt.show()

# Number of units the initial model was trained on. The summary previously
# printed len(train_units), i.e. every unit in the training frame, and
# selected_units has been overwritten by the evaluation loop, so recompute it
# the same way the initial training cell does.
num_train_units = int(len(common_units_list) * train_ratio)

# Print summary
print("\n=== SUMMARY ===")
print(f"Model trained on: {train_ratio * 100:.1f}% of units ({num_train_units} units)")
print(f"Evaluation portion: {eval_portion * 100:.1f}% of each pool (random selection)")
print(f"Step size: {step_size * 100:.1f}%")
if rmse_values:
    print(f"RMSE range: {min(rmse_values):.4f} - {max(rmse_values):.4f}")
    print(f"Number of evaluation points: {len(rmse_values)}")

_____