In [None]:
# %% [markdown]
# ## 📋 Step 1: Install Dependencies

# %%
!pip install tensorflow==2.15.0
!pip install scikit-learn pandas numpy matplotlib seaborn plotly
!pip install keras-tuner
print("‚úÖ All dependencies installed successfully!")

[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.15.0 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.15.0[0m[31m
‚úÖ All dependencies installed successfully!


In [None]:
# %% [markdown]
# ## 📦 Step 2: Import Libraries

# %%
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pickle
import json
from pathlib import Path

# TensorFlow and Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras.metrics import MeanAbsoluteError, RootMeanSquaredError

# Scikit-learn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Plotly for interactive visualizations
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Reproducibility: one shared seed for NumPy and TensorFlow
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Silence all Python warnings for the rest of the session
warnings.filterwarnings('ignore')

# Device setup: enable on-demand memory growth on every visible GPU
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("‚ÑπÔ∏è Running on CPU")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Report the failure and carry on with whatever configuration is active
        print(e)
    else:
        print(f"‚úÖ GPU Available: {len(gpus)} GPU(s)")

# Record library versions in the cell output
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

‚úÖ GPU Available: 1 GPU(s)
TensorFlow version: 2.19.0
Keras version: 3.10.0


In [None]:
# %% [markdown]
# ## 📁 Step 3: Manual Dataset Upload

# %%
from google.colab import files
import zipfile

def upload_datasets():
    """Upload CMAPSS files through the Colab widget and report usable subsets.

    Uploaded ``.zip`` archives are unpacked into ``./data``; any other file is
    written there as-is. Archive members are written by base name, so an
    archive that wraps its files in a folder still ends up with
    ``train_FDxxx.txt`` etc. directly inside ``./data``, which is where the
    verification below (and ``DataProcessor``) look for them.

    If the upload call itself fails, the function does not abort: it goes on to
    verify whatever is already present in ``./data``.

    Returns:
        list[str]: subset names (``'FD001'`` ... ``'FD004'``) for which the
        train, test and RUL files all exist in ``./data``.
    """
    print("üì§ Upload your CMAPSS dataset files")
    print("=" * 70)
    print("Required files:")
    print("  - train_FD001.txt, test_FD001.txt, RUL_FD001.txt")
    print("  - train_FD002.txt, test_FD002.txt, RUL_FD002.txt")
    print("  - train_FD003.txt, test_FD003.txt, RUL_FD003.txt")
    print("  - train_FD004.txt, test_FD004.txt, RUL_FD004.txt")
    print("=" * 70)
    print("\nüí° You can upload a ZIP file or individual files")
    print("Click 'Choose Files' below:\n")

    # Create data directory
    data_dir = Path('./data')
    data_dir.mkdir(exist_ok=True)

    try:
        uploaded = files.upload()
    except TypeError as exc:
        # The recorded run of this cell died here with
        # "TypeError: 'NoneType' object is not subscriptable". Treat that as
        # "nothing uploaded" so previously stored files can still be used.
        print(f"Upload widget failed ({exc}); checking files already in {data_dir}")
        uploaded = {}

    # Process uploaded files
    for filename, content in uploaded.items():
        if Path(filename).suffix.lower() == '.zip':
            print(f"\nüì¶ Extracting {filename}...")
            with open(filename, 'wb') as f:
                f.write(content)

            try:
                with zipfile.ZipFile(filename, 'r') as zip_ref:
                    for member in zip_ref.infolist():
                        member_name = Path(member.filename).name
                        if member.is_dir() or not member_name:
                            continue
                        # Flatten: drop any folder prefix stored in the archive
                        with zip_ref.open(member) as src, \
                                open(data_dir / member_name, 'wb') as dst:
                            dst.write(src.read())
            finally:
                # Remove the temporary archive even if it turned out to be corrupt
                os.remove(filename)
            print(f"‚úì Extracted to {data_dir}")
        else:
            # Save individual files
            with open(data_dir / filename, 'wb') as f:
                f.write(content)
            print(f"‚úì Saved {filename} to {data_dir}")

    # Verify uploaded files
    print("\nüìã Verifying dataset files...")
    available_datasets = []

    for dataset in ['FD001', 'FD002', 'FD003', 'FD004']:
        required = [f'train_{dataset}.txt', f'test_{dataset}.txt', f'RUL_{dataset}.txt']
        missing = [name for name in required if not (data_dir / name).exists()]

        if missing:
            print(f"‚ö†Ô∏è {dataset}: Missing {', '.join(missing)}")
        else:
            available_datasets.append(dataset)
            print(f"‚úÖ {dataset}: Complete")

    if available_datasets:
        print(f"\n‚úÖ Successfully loaded {len(available_datasets)} dataset(s): {', '.join(available_datasets)}")
    else:
        print("\n‚ùå No complete datasets found. Please upload the required files.")

    return available_datasets

# Execute upload
print("üöÄ Starting dataset upload process...\n")
available_datasets = upload_datasets()

üöÄ Starting dataset upload process...

üì§ Upload your CMAPSS dataset files
Required files:
  - train_FD001.txt, test_FD001.txt, RUL_FD001.txt
  - train_FD002.txt, test_FD002.txt, RUL_FD002.txt
  - train_FD003.txt, test_FD003.txt, RUL_FD003.txt
  - train_FD004.txt, test_FD004.txt, RUL_FD004.txt

üí° You can upload a ZIP file or individual files
Click 'Choose Files' below:



TypeError: 'NoneType' object is not subscriptable

In [None]:
# %% [markdown]
# ## 🔧 Step 4: Configuration and Dataset Handler

# %%
class CMAPSSConfig:
    """Static per-subset settings for the CMAPSS data files.

    ``CONFIG`` maps a subset name to its three file names, the window length
    (``seq_len``), the RUL cap (``rul_clip``), the batch size, and the sensor /
    operating-setting indices that are turned into column names elsewhere.
    """

    # Only the file names and the window length differ between subsets, so the
    # table is generated from (subset, seq_len) pairs. Each entry gets its own
    # list objects.
    CONFIG = {
        subset: {
            'train_path': f'train_{subset}.txt',
            'test_path': f'test_{subset}.txt',
            'rul_path': f'RUL_{subset}.txt',
            'seq_len': window,
            'rul_clip': 125,
            'batch_size': 512,
            'sensor_cols': [2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 17, 20, 21],
            'op_cols': [1, 2, 3],
        }
        for subset, window in (
            ('FD001', 50),  # Optimized sequence length
            ('FD002', 30),
            ('FD003', 50),
            ('FD004', 30),
        )
    }

    @staticmethod
    def get_feature_names():
        """Return the 26 column names used when reading a CMAPSS file.

        Order: ``unit``, ``cycle``, ``op_setting_1..3``, ``sensor_1..21``.
        """
        id_cols = ['unit', 'cycle']
        setting_cols = [f'op_setting_{k}' for k in range(1, 4)]
        sensor_cols = [f'sensor_{k}' for k in range(1, 22)]
        return id_cols + setting_cols + sensor_cols

class DataProcessor:
    """Load one CMAPSS subset and turn it into windowed arrays for training.

    Intended call order: ``load_data`` -> ``add_features`` -> ``compute_rul``
    (training frame only) -> ``select_features`` -> ``normalize_features`` ->
    ``create_sequences``. Every transform returns a new frame and leaves the
    frame it was given untouched, so a cell can be re-run without side effects.
    """

    # Identifier / target columns; every other column is treated as a feature.
    _NON_FEATURE_COLS = ('unit', 'cycle', 'RUL')

    def __init__(self, dataset_name, base_path='./data'):
        """Bind the processor to one subset.

        Args:
            dataset_name: key into ``CMAPSSConfig.CONFIG`` (raises KeyError if unknown).
            base_path: directory that holds the subset's text files.
        """
        self.dataset_name = dataset_name
        self.config = CMAPSSConfig.CONFIG[dataset_name]
        self.base_path = Path(base_path)
        self.feature_names = CMAPSSConfig.get_feature_names()
        self.scaler = StandardScaler()

    def load_data(self):
        """Read the train, test and RUL files of the subset.

        Returns:
            tuple: ``(train_df, test_df, rul_df)``; the first two use the column
            names from ``CMAPSSConfig.get_feature_names()``, the last has a
            single ``RUL`` column.
        """
        train_path = self.base_path / self.config['train_path']
        test_path = self.base_path / self.config['test_path']
        rul_path = self.base_path / self.config['rul_path']

        # Files are whitespace separated and carry no header row
        train_df = pd.read_csv(train_path, sep=r'\s+', header=None, names=self.feature_names)
        test_df = pd.read_csv(test_path, sep=r'\s+', header=None, names=self.feature_names)
        rul_df = pd.read_csv(rul_path, sep=r'\s+', header=None, names=['RUL'])

        print(f"\nüìä Dataset: {self.dataset_name}")
        print(f"   Training samples: {len(train_df):,}")
        print(f"   Test samples: {len(test_df):,}")
        print(f"   Training units: {train_df['unit'].nunique()}")
        print(f"   Test units: {test_df['unit'].nunique()}")

        return train_df, test_df, rul_df

    def add_features(self, df, window=3):
        """Append per-unit rolling mean / std columns for the configured sensors.

        Args:
            df: frame with a ``unit`` column and the configured ``sensor_<i>`` columns.
            window: rolling window length in rows (default 3, the original value).

        Returns:
            A new frame with ``<sensor>_rolling_mean`` and ``<sensor>_rolling_std``
            appended for each sensor, in that order. The std of a single
            observation (NaN) is replaced by 0.
        """
        sensor_cols = [f'sensor_{i}' for i in self.config['sensor_cols']]
        by_unit = df.groupby('unit')

        engineered = {}
        for col in sensor_cols:
            engineered[f'{col}_rolling_mean'] = by_unit[col].transform(
                lambda x: x.rolling(window=window, min_periods=1).mean()
            )
            engineered[f'{col}_rolling_std'] = by_unit[col].transform(
                lambda x: x.rolling(window=window, min_periods=1).std().fillna(0)
            )

        # assign() builds a new frame, so the caller's frame is not modified
        return df.assign(**engineered)

    def compute_rul(self, df):
        """Add the ``RUL`` target: cycles left until the unit's last recorded cycle.

        The value is capped at ``config['rul_clip']``.

        Returns:
            A new frame (fresh 0..n-1 index) with the ``RUL`` column appended.
        """
        last_cycle = df.groupby('unit')['cycle'].transform('max')
        rul = (last_cycle - df['cycle']).clip(upper=self.config['rul_clip'])
        return df.assign(RUL=rul).reset_index(drop=True)

    def select_features(self, df):
        """Keep ids, configured settings/sensors, rolling columns and ``RUL`` if present.

        Returns:
            A new frame with columns ordered as ``unit``, ``cycle``, operating
            settings, sensors, rolling columns, then ``RUL`` when available.
        """
        sensor_cols = [f'sensor_{i}' for i in self.config['sensor_cols']]
        op_cols = [f'op_setting_{i}' for i in self.config['op_cols']]

        # Engineered columns are recognised by the 'rolling' marker in their name
        rolling_cols = [col for col in df.columns if 'rolling' in col]

        keep = ['unit', 'cycle'] + op_cols + sensor_cols + rolling_cols
        if 'RUL' in df.columns:
            keep = keep + ['RUL']
        # Explicit copy so later column assignment never targets a slice of df
        return df[keep].copy()

    def normalize_features(self, train_df, test_df):
        """Standardise feature columns; the scaler is fitted on the training frame only.

        Returns:
            tuple: ``(train_scaled, test_scaled, feature_cols)`` where the first
            two are new frames and ``feature_cols`` lists the scaled columns.
        """
        feature_cols = [col for col in train_df.columns
                        if col not in self._NON_FEATURE_COLS]

        train_scaled = train_df.copy()
        test_scaled = test_df.copy()

        # Fit on training data, then reuse the same statistics for the test data
        train_scaled[feature_cols] = self.scaler.fit_transform(train_df[feature_cols])
        test_scaled[feature_cols] = self.scaler.transform(test_df[feature_cols])

        return train_scaled, test_scaled, feature_cols

    def create_sequences(self, df, is_test=False):
        """Build fixed-length windows per unit.

        Training mode (``is_test=False``): every run of ``seq_len`` consecutive
        rows of a unit becomes one window, labelled with the ``RUL`` of its last
        row. Units with fewer than ``seq_len`` rows contribute no window.

        Test mode (``is_test=True``): one window per unit, made of its last
        ``seq_len`` rows; shorter units are left-padded by repeating their
        first row.

        Units are visited in order of first appearance in ``df``.

        Returns:
            ``sequences`` of shape ``(n_windows, seq_len, n_features)`` (float32)
            in test mode, or ``(sequences, labels)`` in training mode. The
            sequence array keeps its 3-D shape even when there are no windows.
        """
        seq_len = self.config['seq_len']
        # Loop-invariant: the feature columns are the same for every unit
        feature_cols = [col for col in df.columns
                        if col not in self._NON_FEATURE_COLS]
        sequences = []
        labels = []

        # One pass over the groups instead of a full-frame mask per unit
        for _, unit_data in df.groupby('unit', sort=False):
            unit_data = unit_data.sort_values('cycle')
            features = unit_data[feature_cols].values

            if is_test:
                if len(features) < seq_len:
                    # Pad at the front by repeating the first observed row
                    pad = np.repeat(features[0:1], seq_len - len(features), axis=0)
                    features = np.vstack([pad, features])
                sequences.append(features[-seq_len:])
            else:
                rul_values = unit_data['RUL'].values
                for end in range(seq_len, len(features) + 1):
                    sequences.append(features[end - seq_len:end])
                    labels.append(rul_values[end - 1])

        # reshape is a no-op for non-empty input and fixes the shape of an empty result
        sequences = np.array(sequences, dtype=np.float32).reshape(
            -1, seq_len, len(feature_cols)
        )

        if is_test:
            return sequences
        return sequences, np.array(labels, dtype=np.float32)

In [None]:
# %% [markdown]
# ## 🤖 Step 5: Model Architectures

# %%
def create_lstm_model(input_shape, model_config=None):
    """Build and compile a two-layer LSTM regressor with attention pooling.

    The attention block scores every time step with a tanh-activated Dense(1),
    normalises the scores with a softmax over the time axis and returns the
    score-weighted sum of the LSTM outputs. It is built from stock Keras layers
    only (no ``Lambda``), so the saved ``.keras`` files contain no pickled
    Python code.

    Args:
        input_shape: ``(seq_len, n_features)`` of one input window.
        model_config: optional dict overriding any of ``lstm_units`` (two
            entries are used), ``dropout``, ``dense_units``, ``learning_rate``.
            Keys that are left out fall back to the defaults below.

    Returns:
        A compiled Keras model named ``'LSTM_Attention'`` (Huber loss, Adam,
        ``rmse`` and ``mae`` metrics) with a single linear output.
    """
    defaults = {
        'lstm_units': [128, 64],
        'dropout': 0.3,
        'dense_units': [64, 32],
        'learning_rate': 0.001
    }
    # Merge so that a partial override does not raise KeyError further down
    model_config = {**defaults, **(model_config or {})}

    inputs = layers.Input(shape=input_shape)

    # LSTM layers (both return the full sequence; attention pools it afterwards)
    x = layers.LSTM(model_config['lstm_units'][0],
                    return_sequences=True)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(model_config['dropout'])(x)

    x = layers.LSTM(model_config['lstm_units'][1],
                    return_sequences=True)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(model_config['dropout'])(x)

    # Attention mechanism
    scores = layers.Dense(1, activation='tanh')(x)      # (batch, steps, 1)
    weights = layers.Softmax(axis=1)(scores)            # normalise across steps

    # Apply attention: contracting the step axis gives sum_t weight_t * x_t
    context = layers.Dot(axes=1)([weights, x])          # (batch, 1, units)
    x = layers.Flatten()(context)                       # (batch, units)

    # Dense layers
    for units in model_config['dense_units']:
        x = layers.Dense(units, activation='relu')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(model_config['dropout'])(x)

    # Output
    outputs = layers.Dense(1, activation='linear')(x)

    model = models.Model(inputs=inputs, outputs=outputs, name='LSTM_Attention')

    model.compile(
        optimizer=Adam(learning_rate=model_config['learning_rate']),
        loss=Huber(delta=1.0),
        metrics=[RootMeanSquaredError(name='rmse'),
                 MeanAbsoluteError(name='mae')]
    )

    return model

def create_gru_model(input_shape, model_config=None):
    """Build and compile a two-layer GRU regressor.

    Layout: GRU (sequence out) -> GRU (last state) -> dense stack -> linear
    unit, with batch normalisation and dropout after every block.

    Args:
        input_shape: shape of one input window, passed to ``layers.Input``.
        model_config: optional dict with ``gru_units``, ``dropout``,
            ``dense_units`` and ``learning_rate``; defaults are used when None.

    Returns:
        A compiled Keras model named ``'GRU_Model'``.
    """
    cfg = model_config
    if cfg is None:
        cfg = {
            'gru_units': [128, 64],
            'dropout': 0.3,
            'dense_units': [64, 32],
            'learning_rate': 0.001
        }

    def regularise(tensor):
        # BatchNorm followed by Dropout, applied after each block
        normed = layers.BatchNormalization()(tensor)
        return layers.Dropout(cfg['dropout'])(normed)

    inputs = layers.Input(shape=input_shape)

    # Recurrent part: the first GRU keeps the sequence, the second collapses it
    hidden = regularise(
        layers.GRU(cfg['gru_units'][0], return_sequences=True)(inputs)
    )
    hidden = regularise(
        layers.GRU(cfg['gru_units'][1], return_sequences=False)(hidden)
    )

    # Fully connected head
    for width in cfg['dense_units']:
        hidden = regularise(layers.Dense(width, activation='relu')(hidden))

    # Single linear unit for the regression target
    outputs = layers.Dense(1, activation='linear')(hidden)

    model = models.Model(inputs=inputs, outputs=outputs, name='GRU_Model')

    optimizer = Adam(learning_rate=cfg['learning_rate'])
    loss_fn = Huber(delta=1.0)
    tracked = [RootMeanSquaredError(name='rmse'), MeanAbsoluteError(name='mae')]
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked)

    return model

def create_cnn_lstm_model(input_shape, model_config=None):
    """Build and compile a Conv1D + LSTM regressor.

    Layout: two Conv1D blocks (the first followed by max-pooling), two LSTM
    layers, a dense stack and a single linear output unit.

    Args:
        input_shape: shape of one input window, passed to ``layers.Input``.
        model_config: optional dict with ``conv_filters``, ``lstm_units``,
            ``dropout``, ``dense_units`` and ``learning_rate``; defaults are
            used when None.

    Returns:
        A compiled Keras model named ``'CNN_LSTM'``.
    """
    cfg = model_config
    if cfg is None:
        cfg = {
            'conv_filters': [64, 32],
            'lstm_units': [100, 50],
            'dropout': 0.3,
            'dense_units': [50],
            'learning_rate': 0.001
        }

    def chain(tensor, *stack):
        # Feed the tensor through the given layers, left to right
        for layer in stack:
            tensor = layer(tensor)
        return tensor

    inputs = layers.Input(shape=input_shape)

    # Convolutional feature extraction
    net = chain(
        inputs,
        layers.Conv1D(filters=cfg['conv_filters'][0],
                      kernel_size=3, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(cfg['dropout']),
    )
    net = chain(
        net,
        layers.Conv1D(filters=cfg['conv_filters'][1],
                      kernel_size=3, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(cfg['dropout']),
    )

    # Recurrent layers: sequence out first, final state second
    net = chain(
        net,
        layers.LSTM(cfg['lstm_units'][0], return_sequences=True),
        layers.Dropout(cfg['dropout']),
    )
    net = chain(
        net,
        layers.LSTM(cfg['lstm_units'][1], return_sequences=False),
        layers.Dropout(cfg['dropout']),
    )

    # Fully connected head
    for width in cfg['dense_units']:
        net = chain(
            net,
            layers.Dense(width, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(cfg['dropout']),
        )

    # Single linear unit for the regression target
    outputs = layers.Dense(1, activation='linear')(net)

    model = models.Model(inputs=inputs, outputs=outputs, name='CNN_LSTM')

    optimizer = Adam(learning_rate=cfg['learning_rate'])
    loss_fn = Huber(delta=1.0)
    tracked = [RootMeanSquaredError(name='rmse'), MeanAbsoluteError(name='mae')]
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked)

    return model


In [None]:
# %% [markdown]
# ## 📈 Step 6: Training Pipeline

# %%
class ModelTrainer:
    """Train one compiled Keras model and store its artifacts in a run folder.

    Constructing the trainer creates ``./models/<dataset>_<model>_<timestamp>``;
    checkpoints, logs, the final model and the history plot are written there.
    """

    def __init__(self, model, dataset_name, model_type):
        """Store the model and create the timestamped output directory.

        Args:
            model: compiled Keras model to train.
            dataset_name: name used in the folder name, log lines and plot titles.
            model_type: architecture label used in the folder name and log lines.
        """
        self.model = model
        self.dataset_name = dataset_name
        self.model_type = model_type
        self.history = None

        # Create directories
        self.save_dir = Path(f'./models/{dataset_name}_{model_type}_{datetime.now().strftime("%Y%m%d_%H%M%S")}')
        self.save_dir.mkdir(parents=True, exist_ok=True)

        print(f"üìÅ Save directory: {self.save_dir}")

    def get_callbacks(self):
        """Return the callbacks used by ``train``; all monitors watch ``val_loss``."""
        callbacks_list = [
            # Model checkpoint: keep only the best epoch
            keras.callbacks.ModelCheckpoint(
                filepath=str(self.save_dir / 'best_model.keras'),
                monitor='val_loss',
                save_best_only=True,
                mode='min',
                verbose=1
            ),

            # Early stopping
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=20,
                restore_best_weights=True,
                verbose=1
            ),

            # Reduce learning rate (patience is half of the early-stopping patience)
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=10,
                min_lr=1e-7,
                verbose=1
            ),

            # CSV logger
            keras.callbacks.CSVLogger(
                filename=str(self.save_dir / 'training_log.csv')
            ),

            # TensorBoard
            keras.callbacks.TensorBoard(
                log_dir=str(self.save_dir / 'logs'),
                histogram_freq=0
            )
        ]

        return callbacks_list

    def train(self, X_train, y_train, X_val, y_val, epochs=150, batch_size=512):
        """Fit the model, save it as ``final_model.keras`` and return the History.

        Args:
            X_train, y_train: training windows and targets.
            X_val, y_val: validation windows and targets.
            epochs: maximum number of epochs (early stopping may end sooner).
            batch_size: mini-batch size passed to ``fit``.
        """
        print(f"\n{'='*70}")
        print(f"üéØ Training {self.model_type} model for {self.dataset_name}")
        print(f"{'='*70}")
        print(f"Training samples: {len(X_train):,}")
        print(f"Validation samples: {len(X_val):,}")
        print(f"Batch size: {batch_size}")
        print(f"Epochs: {epochs}")
        print(f"{'='*70}\n")

        self.history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=self.get_callbacks(),
            verbose=1
        )

        # Save final model (path passed as str, like the checkpoint path above)
        self.model.save(str(self.save_dir / 'final_model.keras'))
        print(f"\n‚úÖ Training complete! Models saved to {self.save_dir}")

        return self.history

    @staticmethod
    def _learning_rate_series(history_dict):
        """Return the logged per-epoch learning rates, or None if none were logged.

        Keras 3 records the value under ``'learning_rate'``; older tf.keras
        releases used ``'lr'``. Both keys are accepted.
        """
        for key in ('learning_rate', 'lr'):
            if key in history_dict:
                return history_dict[key]
        return None

    def plot_history(self):
        """Plot loss, RMSE, MAE and learning rate per epoch and save the figure.

        Prints a notice and returns when ``train`` has not been run yet. The
        learning-rate panel is hidden when no learning rate was logged.
        """
        if self.history is None:
            print("No training history available")
            return

        logs = self.history.history
        fig, axes = plt.subplots(2, 2, figsize=(16, 10))

        # The three metric panels share one layout: train vs. validation curve
        panels = [
            (axes[0, 0], 'loss', 'Loss'),
            (axes[0, 1], 'rmse', 'RMSE'),
            (axes[1, 0], 'mae', 'MAE'),
        ]
        for ax, key, label in panels:
            ax.plot(logs[key], label=f'Train {label}', linewidth=2)
            ax.plot(logs[f'val_{key}'], label=f'Val {label}', linewidth=2)
            ax.set_title(f'{self.dataset_name} - {label}', fontsize=14, fontweight='bold')
            ax.set_xlabel('Epoch')
            ax.set_ylabel(label)
            ax.legend()
            ax.grid(True, alpha=0.3)

        # Learning rate
        lr_values = self._learning_rate_series(logs)
        if lr_values is not None:
            axes[1, 1].plot(lr_values, color='green', linewidth=2)
            axes[1, 1].set_title('Learning Rate', fontsize=14, fontweight='bold')
            axes[1, 1].set_xlabel('Epoch')
            axes[1, 1].set_ylabel('Learning Rate')
            axes[1, 1].set_yscale('log')
            axes[1, 1].grid(True, alpha=0.3)
        else:
            axes[1, 1].axis('off')

        plt.tight_layout()
        plt.savefig(self.save_dir / 'training_history.png', dpi=300, bbox_inches='tight')
        plt.show()



In [None]:
# %% [markdown]
# ## 📊 Step 7: Evaluation and Visualization

# %%
class ModelEvaluator:
    """Static helpers to score a trained model and visualise its predictions."""

    @staticmethod
    def evaluate_model(model, X_test, y_test):
        """Predict on ``X_test`` and compute regression metrics against ``y_test``.

        Both predictions and targets are flattened to 1-D first, so a column
        vector of targets cannot silently broadcast against the predictions.

        Returns:
            dict with ``rmse``, ``mae``, ``r2``, ``mean_error``, ``std_error``
            (errors are prediction minus actual) plus the raw ``predictions``
            and ``actuals`` arrays.

        Raises:
            ValueError: if the number of predictions and targets differ.
        """
        y_pred = np.asarray(model.predict(X_test, verbose=0)).flatten()
        y_test = np.asarray(y_test).reshape(-1)

        if y_pred.shape != y_test.shape:
            raise ValueError(
                f"Got {y_pred.shape[0]} predictions for {y_test.shape[0]} targets"
            )

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Error distribution
        errors = y_pred - y_test
        mean_error = np.mean(errors)
        std_error = np.std(errors)

        metrics = {
            'rmse': rmse,
            'mae': mae,
            'r2': r2,
            'mean_error': mean_error,
            'std_error': std_error,
            'predictions': y_pred,
            'actuals': y_test
        }

        return metrics

    @staticmethod
    def print_metrics(metrics, dataset_name):
        """Print the scalar metrics from ``evaluate_model`` as a small report."""
        print(f"\n{'='*70}")
        print(f"üìä Evaluation Results for {dataset_name}")
        print(f"{'='*70}")
        print(f"RMSE:        {metrics['rmse']:.4f} cycles")
        print(f"MAE:         {metrics['mae']:.4f} cycles")
        print(f"R¬≤ Score:    {metrics['r2']:.4f}")
        print(f"Mean Error:  {metrics['mean_error']:.4f} cycles")
        print(f"Std Error:   {metrics['std_error']:.4f} cycles")
        print(f"{'='*70}\n")

    @staticmethod
    def _additional_plot_path(save_path):
        """Return ``<stem>_additional.png`` next to ``save_path`` as a string.

        Works for ``str`` and ``Path`` input and never returns ``save_path``
        itself, whatever its extension.
        """
        report_path = Path(save_path)
        return str(report_path.with_name(f'{report_path.stem}_additional.png'))

    @staticmethod
    def plot_predictions(metrics, dataset_name, save_path=None):
        """Show an interactive 2x2 Plotly report and a 1x3 matplotlib figure.

        Args:
            metrics: dict from ``evaluate_model`` (uses ``predictions`` and ``actuals``).
            dataset_name: used in the Plotly figure title.
            save_path: optional path of the HTML report; when given, the
                matplotlib figure is saved beside it as ``<stem>_additional.png``.
        """
        y_pred = metrics['predictions']
        y_test = metrics['actuals']
        errors = y_pred - y_test

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Predictions vs Actual RUL',
                'Error Distribution',
                'Scatter Plot',
                'Cumulative Error'
            )
        )

        # 1. Predictions vs Actual
        indices = np.arange(len(y_test))
        fig.add_trace(
            go.Scatter(x=indices, y=y_test, mode='markers',
                      name='Actual RUL', marker=dict(color='blue', size=6)),
            row=1, col=1
        )
        fig.add_trace(
            go.Scatter(x=indices, y=y_pred, mode='markers',
                      name='Predicted RUL', marker=dict(color='red', size=6)),
            row=1, col=1
        )

        # 2. Error Distribution
        fig.add_trace(
            go.Histogram(x=errors, nbinsx=50, name='Error Distribution',
                        marker_color='purple', showlegend=False),
            row=1, col=2
        )

        # 3. Scatter Plot with the y = x reference line
        min_val = min(y_test.min(), y_pred.min())
        max_val = max(y_test.max(), y_pred.max())
        fig.add_trace(
            go.Scatter(x=y_test, y=y_pred, mode='markers',
                      name='Predictions', marker=dict(color='green', size=6)),
            row=2, col=1
        )
        fig.add_trace(
            go.Scatter(x=[min_val, max_val], y=[min_val, max_val],
                      mode='lines', name='Perfect Fit',
                      line=dict(color='red', dash='dash')),
            row=2, col=1
        )

        # 4. Cumulative Error: share of samples (in %) at or below each absolute error
        sorted_errors = np.sort(np.abs(errors))
        cumulative = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors) * 100
        fig.add_trace(
            go.Scatter(x=sorted_errors, y=cumulative, mode='lines',
                      name='Cumulative Error', line=dict(color='orange', width=2)),
            row=2, col=2
        )

        fig.update_layout(
            height=800,
            title_text=f"{dataset_name} - Model Performance Analysis",
            showlegend=True
        )

        if save_path:
            fig.write_html(save_path)

        fig.show()

        # Additional plots
        fig2, axes = plt.subplots(1, 3, figsize=(18, 5))

        # Error by prediction magnitude
        axes[0].scatter(y_pred, errors, alpha=0.5, s=30)
        axes[0].axhline(y=0, color='r', linestyle='--', linewidth=2)
        axes[0].set_xlabel('Predicted RUL', fontsize=12)
        axes[0].set_ylabel('Prediction Error', fontsize=12)
        axes[0].set_title('Error vs Predicted RUL', fontsize=14, fontweight='bold')
        axes[0].grid(True, alpha=0.3)

        # Absolute error distribution
        abs_errors = np.abs(errors)
        axes[1].hist(abs_errors, bins=50, color='coral', edgecolor='black', alpha=0.7)
        axes[1].axvline(np.mean(abs_errors), color='red', linestyle='--',
                       linewidth=2, label=f'Mean: {np.mean(abs_errors):.2f}')
        axes[1].set_xlabel('Absolute Error', fontsize=12)
        axes[1].set_ylabel('Frequency', fontsize=12)
        axes[1].set_title('Absolute Error Distribution', fontsize=14, fontweight='bold')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

        # QQ plot
        from scipy import stats
        stats.probplot(errors, dist="norm", plot=axes[2])
        axes[2].set_title('Q-Q Plot', fontsize=14, fontweight='bold')
        axes[2].grid(True, alpha=0.3)

        plt.tight_layout()
        if save_path:
            plt.savefig(ModelEvaluator._additional_plot_path(save_path),
                        dpi=300, bbox_inches='tight')
        plt.show()


In [None]:
# %% [markdown]
# ## 🚀 Step 8: Complete Training Pipeline

# %%
def train_single_dataset(dataset_name, model_type='lstm', epochs=150):
    """Run the full pipeline for one subset: preprocess, train, evaluate, save.

    The validation set is made of whole engines (20% of the training units)
    rather than a random sample of windows: windows from the same engine overlap
    almost entirely, so a window-level split would put near-copies of the
    training data into validation and make ``val_loss`` over-optimistic.

    Args:
        dataset_name: key of ``CMAPSSConfig.CONFIG`` (e.g. ``'FD001'``).
        model_type: ``'lstm'``, ``'gru'`` or ``'cnn_lstm'`` (case-insensitive).
        epochs: maximum number of training epochs.

    Returns:
        dict with ``model``, ``scaler``, ``config``, ``metrics`` and ``save_dir``.

    Raises:
        ValueError: for an unknown ``model_type`` (checked before any data is
            loaded) or when the number of test windows and RUL labels differ.
    """
    builders = {
        'lstm': create_lstm_model,
        'gru': create_gru_model,
        'cnn_lstm': create_cnn_lstm_model,
    }
    # Fail fast: reject a bad model type before the expensive preprocessing
    if model_type.lower() not in builders:
        raise ValueError(f"Unknown model type: {model_type}")

    print(f"\n{'#'*80}")
    print(f"{'#'*80}")
    print(f"##  TRAINING PROGNOSAI MODEL: {dataset_name} - {model_type.upper()}")
    print(f"{'#'*80}")
    print(f"{'#'*80}\n")

    # Initialize processor
    processor = DataProcessor(dataset_name)

    # Load data
    print("üì• Loading datasets...")
    train_df, test_df, rul_df = processor.load_data()

    # Preprocess training data
    print("\nüîß Preprocessing training data...")
    train_df = processor.add_features(train_df)
    train_df = processor.compute_rul(train_df)
    train_df = processor.select_features(train_df)

    # Preprocess test data
    print("üîß Preprocessing test data...")
    test_df = processor.add_features(test_df)
    test_df = processor.select_features(test_df)

    # Normalize
    print("üìä Normalizing features...")
    train_df, test_df, feature_cols = processor.normalize_features(train_df, test_df)

    print(f"   Selected {len(feature_cols)} features")

    # Split by engine so that no unit contributes windows to both sets
    print("‚úÇÔ∏è Splitting training/validation sets...")
    unit_ids = train_df['unit'].unique()
    train_units, val_units = train_test_split(
        unit_ids, test_size=0.2, random_state=SEED, shuffle=True
    )

    # Create sequences
    print("\nüîÑ Creating sequences...")
    X_train, y_train = processor.create_sequences(
        train_df[train_df['unit'].isin(train_units)], is_test=False
    )
    X_val, y_val = processor.create_sequences(
        train_df[train_df['unit'].isin(val_units)], is_test=False
    )
    X_test = processor.create_sequences(test_df, is_test=True)
    y_test = rul_df['RUL'].values
    y_test = np.clip(y_test, 0, processor.config['rul_clip'])

    # One test window per unit, in order of first appearance in the test frame
    test_unit_ids = test_df['unit'].unique()
    if len(X_test) != len(y_test):
        raise ValueError(
            f"{len(X_test)} test units but {len(y_test)} RUL labels for {dataset_name}"
        )

    print(f"   Training: {X_train.shape}")
    print(f"   Validation: {X_val.shape}")
    print(f"   Test: {X_test.shape}")

    # Create model
    print(f"\nü§ñ Creating {model_type.upper()} model...")
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = builders[model_type.lower()](input_shape)

    print(f"\nüìã Model Summary:")
    model.summary()

    # Train model
    trainer = ModelTrainer(model, dataset_name, model_type)
    history = trainer.train(
        X_train, y_train,
        X_val, y_val,
        epochs=epochs,
        batch_size=processor.config['batch_size']
    )

    # Plot training history
    print("\nüìà Plotting training history...")
    trainer.plot_history()

    # Evaluate on test set
    print("\nüéØ Evaluating on test set...")
    evaluator = ModelEvaluator()
    metrics = evaluator.evaluate_model(model, X_test, y_test)
    evaluator.print_metrics(metrics, dataset_name)

    # Visualize predictions
    print("üìä Creating prediction visualizations...")
    evaluator.plot_predictions(
        metrics,
        dataset_name,
        save_path=str(trainer.save_dir / 'predictions.html')
    )

    # Save artifacts
    print("\nüíæ Saving artifacts...")

    # Save scaler
    scaler_path = trainer.save_dir / 'scaler.pkl'
    with open(scaler_path, 'wb') as f:
        pickle.dump(processor.scaler, f)
    print(f"   ‚úì Scaler saved: {scaler_path}")

    # Save configuration
    config = {
        'dataset_name': dataset_name,
        'model_type': model_type,
        'seq_len': processor.config['seq_len'],
        'rul_clip': processor.config['rul_clip'],
        'feature_cols': feature_cols,
        'input_shape': input_shape,
        'metrics': {
            'rmse': float(metrics['rmse']),
            'mae': float(metrics['mae']),
            'r2': float(metrics['r2'])
        },
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }

    config_path = trainer.save_dir / 'config.json'
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=4)
    print(f"   ‚úì Config saved: {config_path}")

    # Save predictions, keyed by the unit ids found in the test data
    prediction_errors = metrics['predictions'] - y_test
    predictions_df = pd.DataFrame({
        'unit_id': test_unit_ids,
        'actual_rul': y_test,
        'predicted_rul': metrics['predictions'],
        'error': prediction_errors,
        'abs_error': np.abs(prediction_errors)
    })
    predictions_path = trainer.save_dir / 'predictions.csv'
    predictions_df.to_csv(predictions_path, index=False)
    print(f"   ‚úì Predictions saved: {predictions_path}")

    print(f"\n‚úÖ Training complete for {dataset_name}!")
    print(f"üìÅ All artifacts saved to: {trainer.save_dir}\n")

    return {
        'model': model,
        'scaler': processor.scaler,
        'config': config,
        'metrics': metrics,
        'save_dir': trainer.save_dir
    }


In [None]:
# %% [markdown]
# ## 🎯 Step 9: Train All Available Datasets

# %%
def train_all_available_datasets(model_type='lstm', epochs=150, datasets=None):
    """Train one model per dataset and write a cross-dataset comparison report.

    Each dataset is trained with ``train_single_dataset``. A failure on one
    dataset is reported (message + traceback) and does not stop the others.
    When at least one dataset succeeds, a summary CSV and a grouped RMSE/MAE
    bar chart are written under ``./models``.

    Args:
        model_type: Model key forwarded to ``train_single_dataset``.
        epochs: Epoch count forwarded to ``train_single_dataset``.
        datasets: Optional dataset name, or iterable of dataset names, to
            train. When omitted, the notebook-level ``available_datasets``
            is used (the original behaviour).

    Returns:
        dict mapping dataset name to the result dict returned by
        ``train_single_dataset``, for successfully trained datasets only.
        Empty when there was nothing to train or every dataset failed.
    """
    # Function-scope import: traceback is only needed for error reporting here.
    import traceback

    if datasets is None:
        datasets = available_datasets
    elif isinstance(datasets, str):
        # A bare name would otherwise be iterated character by character.
        datasets = [datasets]
    datasets = list(datasets)

    if not datasets:
        print("‚ùå No datasets available. Please upload datasets first.")
        return {}

    print(f"\nüöÄ Starting training for {len(datasets)} dataset(s)")
    print(f"   Model type: {model_type.upper()}")
    print(f"   Epochs: {epochs}")
    print(f"   Datasets: {', '.join(datasets)}\n")

    results = {}
    summary_data = []

    for dataset in datasets:
        try:
            result = train_single_dataset(dataset, model_type, epochs)

            # Build the summary row before recording the result so that
            # `results` and `summary_data` always describe the same datasets.
            summary_row = {
                'Dataset': dataset,
                'Model': model_type.upper(),
                'RMSE': result['metrics']['rmse'],
                'MAE': result['metrics']['mae'],
                'R¬≤': result['metrics']['r2'],
                'Save Dir': str(result['save_dir'])
            }
            results[dataset] = result
            summary_data.append(summary_row)

        except Exception as e:
            # Deliberate best-effort: report and move on to the next dataset.
            print(f"\n‚ùå Error training {dataset}: {e}")
            traceback.print_exc()
            continue

    # Generate comparison report
    if results:
        print(f"\n{'='*80}")
        print("üìä TRAINING SUMMARY - ALL DATASETS")
        print(f"{'='*80}\n")

        summary_df = pd.DataFrame(summary_data)
        print(summary_df.to_string(index=False))

        # Make sure the report directory exists before writing into it.
        models_dir = Path('./models')
        models_dir.mkdir(parents=True, exist_ok=True)

        # Save summary
        summary_path = models_dir / 'training_summary.csv'
        summary_df.to_csv(summary_path, index=False)
        print(f"\n‚úì Summary saved to: {summary_path}")

        # Create comparison visualization
        datasets_list = list(results.keys())
        rmse_values = [results[d]['metrics']['rmse'] for d in datasets_list]
        mae_values = [results[d]['metrics']['mae'] for d in datasets_list]

        fig = go.Figure()
        fig.add_trace(go.Bar(
            name='RMSE',
            x=datasets_list,
            y=rmse_values,
            marker_color='steelblue'
        ))
        fig.add_trace(go.Bar(
            name='MAE',
            x=datasets_list,
            y=mae_values,
            marker_color='coral'
        ))
        fig.update_layout(
            title='Model Performance Comparison Across Datasets',
            xaxis_title='Dataset',
            yaxis_title='Error (cycles)',
            barmode='group',
            template='plotly_white',
            height=500
        )

        comparison_path = models_dir / 'performance_comparison.html'
        fig.write_html(str(comparison_path))
        fig.show()
        print(f"\n‚úì Comparison chart saved to: {comparison_path}")

        print(f"\n{'='*80}")
        print("‚úÖ ALL TRAINING COMPLETE!")
        print(f"{'='*80}\n")

    return results


In [None]:
# %% [markdown]
# ## 🔮 Step 10: Inference Engine (Deployment Ready)

# %%
class PrognosAIInference:
    """Production-ready inference engine

    Loads a saved Keras model together with the pickled scaler and JSON
    config, and turns sensor histories into clipped RUL predictions plus a
    coarse risk assessment.
    """

    # Risk bands checked from highest threshold down: a RUL strictly greater
    # than the threshold gets that band. Anything else (including NaN, for
    # which every comparison is False) falls through to _CRITICAL_BAND.
    _RISK_BANDS = (
        (50, {
            'level': 'LOW',
            'color': 'green',
            'action': 'Normal operation',
            'priority': 1
        }),
        (25, {
            'level': 'MEDIUM',
            'color': 'yellow',
            'action': 'Schedule maintenance',
            'priority': 2
        }),
        (10, {
            'level': 'HIGH',
            'color': 'orange',
            'action': 'Plan immediate maintenance',
            'priority': 3
        }),
    )
    _CRITICAL_BAND = {
        'level': 'CRITICAL',
        'color': 'red',
        'action': 'URGENT: Immediate maintenance required',
        'priority': 4
    }

    def __init__(self, model_path, scaler_path, config_path):
        """
        Initialize inference engine

        Args:
            model_path: Path to saved Keras model (.keras file)
            scaler_path: Path to saved scaler (.pkl file)
            config_path: Path to config JSON file
        """
        # Load configuration
        with open(config_path, 'r') as f:
            self.config = json.load(f)

        # Load scaler
        # SECURITY: pickle.load can execute arbitrary code stored in the file.
        # Only point this at scaler files produced by a trusted training run.
        with open(scaler_path, 'rb') as f:
            self.scaler = pickle.load(f)

        # Load model
        # SECURITY: safe_mode=False permits deserializing Lambda layers, i.e.
        # arbitrary Python code. Only load model files from a trusted source.
        self.model = keras.models.load_model(model_path, safe_mode=False)

        self.seq_len = self.config['seq_len']
        self.rul_clip = self.config['rul_clip']
        self.feature_cols = self.config['feature_cols']

        print(f"‚úÖ Inference engine initialized")
        print(f"   Dataset: {self.config['dataset_name']}")
        print(f"   Model: {self.config['model_type']}")
        print(f"   Sequence length: {self.seq_len}")
        print(f"   Features: {len(self.feature_cols)}")

    def _prepare_sequence(self, sensor_data):
        """Scale one engine's history and fit it to exactly ``seq_len`` rows.

        Args:
            sensor_data: DataFrame (columns are selected by ``feature_cols``)
                or 2-D array-like of shape (n_timesteps, n_features).

        Returns:
            Array of shape (seq_len, n_features).

        Raises:
            ValueError: If the input is not 2-D or has no rows.
            KeyError: If a DataFrame lacks one of ``feature_cols``.
        """
        if isinstance(sensor_data, pd.DataFrame):
            sensor_data = sensor_data[self.feature_cols].values

        sensor_data = np.asarray(sensor_data)
        if sensor_data.ndim != 2 or sensor_data.shape[0] == 0:
            raise ValueError(
                "sensor_data must be a non-empty 2-D array of shape "
                f"(n_timesteps, n_features); got shape {sensor_data.shape}"
            )

        scaled = self.scaler.transform(sensor_data)

        n_rows = len(scaled)
        if n_rows < self.seq_len:
            # Too short: front-pad by repeating the earliest scaled reading.
            pad = np.repeat(scaled[0:1], self.seq_len - n_rows, axis=0)
            return np.vstack([pad, scaled])

        # Long enough: keep only the most recent seq_len timesteps.
        return scaled[-self.seq_len:]

    def predict_rul(self, sensor_data):
        """
        Predict RUL for sensor data

        Args:
            sensor_data: DataFrame or numpy array with sensor readings
                        Shape: (n_timesteps, n_features)

        Returns:
            Predicted RUL value as a float, clipped to [0, rul_clip]

        Raises:
            ValueError: If sensor_data is empty or not 2-D.
        """
        sequence = self._prepare_sequence(sensor_data)

        # The model is fed a batch of one sequence.
        batch = sequence.reshape(1, self.seq_len, -1)
        prediction = self.model.predict(batch, verbose=0)[0][0]

        # Clip to valid range
        prediction = np.clip(prediction, 0, self.rul_clip)

        return float(prediction)

    def predict_batch(self, sensor_data_list):
        """
        Predict RUL for multiple engines

        Args:
            sensor_data_list: List of DataFrames or numpy arrays

        Returns:
            Array of predicted RUL values, clipped to [0, rul_clip], one per
            input in order. An empty input yields an empty array.

        Raises:
            ValueError: If any element is empty or not 2-D.
        """
        sequences = [self._prepare_sequence(data) for data in sensor_data_list]

        if not sequences:
            # Nothing to predict; avoid handing the model an empty batch.
            return np.array([], dtype=np.float32)

        batch = np.array(sequences)
        predictions = self.model.predict(batch, verbose=0).flatten()
        predictions = np.clip(predictions, 0, self.rul_clip)

        return predictions

    def get_risk_level(self, rul):
        """
        Determine risk level based on RUL

        Args:
            rul: Remaining Useful Life value

        Returns:
            Dict with 'level', 'color', 'action' and 'priority' keys. A new
            dict is returned on every call.
        """
        for threshold, band in self._RISK_BANDS:
            if rul > threshold:
                return dict(band)
        return dict(self._CRITICAL_BAND)


In [None]:
# %% [markdown]
# ## 🎮 Step 11: MAIN EXECUTION - Train Selected Datasets

# %%
# @title üöÄ **RUN THIS CELL TO TRAIN MODELS**

# ============================================
# CONFIGURATION
# ============================================
# Notebook-level settings read by the execution section further down.

# Select which datasets to train
# NOTE(review): the execution section uses this value only as a truthiness
# guard; train_all_available_datasets() is called without a dataset list and
# iterates `available_datasets` itself, so a manual subset set here is not
# currently honored.
DATASETS_TO_TRAIN = available_datasets  # Train all available datasets
# Or specify manually: DATASETS_TO_TRAIN = ['FD001', 'FD003']

# Select model type
# Passed to training as `model_type` and printed upper-cased in the run banner.
MODEL_TYPE = 'lstm'  # Options: 'lstm', 'gru', 'cnn_lstm'

# Training epochs
# Passed to training as `epochs`.
EPOCHS = 150  # Recommended: 100-200


In [None]:
# `keras` is already imported in the Step 2 imports cell; this repeat import
# is redundant after that cell has run but harmless.
from tensorflow import keras
# SECURITY: disables Keras safe mode for the whole session, which allows
# deserializing Lambda layers (arbitrary Python code) from saved models.
# Only load model files that come from a trusted source after this call.
keras.config.enable_unsafe_deserialization()


In [None]:
# ============================================
# EXECUTION
# ============================================

print("\n" + "="*80)
print("üöÄ PROGNOSAI TRAINING PIPELINE")
print("="*80)
print(f"\nüìã Configuration:")
print(f"   Datasets to train: {DATASETS_TO_TRAIN}")
print(f"   Model type: {MODEL_TYPE.upper()}")
print(f"   Training epochs: {EPOCHS}")
print(f"   GPU available: {len(tf.config.list_physical_devices('GPU')) > 0}")
print("\n" + "="*80 + "\n")

# Train all selected datasets
if DATASETS_TO_TRAIN:
    training_results = train_all_available_datasets(
        model_type=MODEL_TYPE,
        epochs=EPOCHS
    )

    # Download results
    if training_results:
        print("\nüì¶ Creating downloadable archive...")
        !zip -r prognosai_models.zip ./models

        print("\nüì• Download your trained models:")
        files.download('prognosai_models.zip')

        print("\n" + "="*80)
        print("‚úÖ TRAINING COMPLETE! All models and artifacts are ready.")
        print("="*80)
        print("\nüìÇ Saved artifacts include:")
        print("   ‚úì Trained Keras models (.keras)")
        print("   ‚úì Scalers (.pkl)")
        print("   ‚úì Configurations (.json)")
        print("   ‚úì Training logs (.csv)")
        print("   ‚úì Predictions (.csv)")
        print("   ‚úì Visualizations (.html, .png)")
else:
    print("‚ùå No datasets available for training!")
    print("Please run the upload cell first to upload your datasets.")

# %% [markdown]
# ## 🔍 Step 12: Test Inference Engine (Example)

# %%
# @title üß™ **Test Inference Engine** (Run after training)

# Smoke test: reload the saved artifacts of one trained dataset through
# PrognosAIInference and compare its prediction for test unit 1 with the
# ground-truth RUL. `training_results` comes from the training section above.
# Note that this rebinds notebook-level names (`result`, `processor`,
# `train_df`, `test_df`, `rul_df`, `feature_cols`).

# Select a trained model to test
if training_results:
    # Get first available model
    test_dataset = list(training_results.keys())[0]
    result = training_results[test_dataset]

    print(f"üß™ Testing inference engine with {test_dataset} model\n")

    # Initialize inference engine
    # scaler.pkl and config.json are the files written by train_single_dataset;
    # best_model.keras is presumably written during training - confirm.
    inference = PrognosAIInference(
        model_path=str(result['save_dir'] / 'best_model.keras'),
        scaler_path=str(result['save_dir'] / 'scaler.pkl'),
        config_path=str(result['save_dir'] / 'config.json')
    )

    # Load test data for demonstration
    # A fresh DataProcessor is used here only to load and featurize the data;
    # scaling is done inside the inference engine with the saved scaler.
    processor = DataProcessor(test_dataset)
    train_df, test_df, rul_df = processor.load_data()
    test_df = processor.add_features(test_df)
    test_df = processor.select_features(test_df)

    # Get first test unit
    first_unit = test_df[test_df['unit'] == 1]
    # predict_rul() re-selects the configured feature columns from a
    # DataFrame, so this list only needs to contain them.
    feature_cols = [col for col in first_unit.columns
                   if col not in ['unit', 'cycle']]

    # Predict RUL
    predicted_rul = inference.predict_rul(first_unit[feature_cols])
    # assumes the first row of rul_df holds the RUL for unit 1 - TODO confirm
    actual_rul = rul_df.iloc[0]['RUL']

    print(f"\nüìä Prediction Results for Unit 1:")
    print(f"   Predicted RUL: {predicted_rul:.2f} cycles")
    print(f"   Actual RUL: {actual_rul:.2f} cycles")
    print(f"   Error: {abs(predicted_rul - actual_rul):.2f} cycles")

    # Get risk assessment
    risk = inference.get_risk_level(predicted_rul)
    print(f"\n‚ö†Ô∏è Risk Assessment:")
    print(f"   Level: {risk['level']}")
    print(f"   Action: {risk['action']}")
    print(f"   Priority: {risk['priority']}")

    print("\n‚úÖ Inference engine test complete!")
else:
    print("‚ùå No trained models available. Please train models first.")


In [None]:
# %% [markdown]
# ## 📖 Usage Instructions

# %%
# @markdown ### **üìñ HOW TO USE THIS NOTEBOOK:**
#
# **Step 1: Upload Datasets**
# - Run the upload cell in Step 3
# - Upload your CMAPSS .txt files or a ZIP archive
# - Verify that files are detected correctly
#
# **Step 2: Configure Training**
# - In Step 11, modify the configuration:
#   - `DATASETS_TO_TRAIN`: Select which datasets to train
#   - `MODEL_TYPE`: Choose 'lstm', 'gru', or 'cnn_lstm'
#   - `EPOCHS`: Set training epochs (150 recommended)
#
# **Step 3: Run Training**
# - Click the play button in Step 11
# - Monitor training progress (loss, RMSE, MAE)
# - Wait for completion (may take 15-60 minutes per dataset)
#
# **Step 4: Download Results**
# - After training, prognosai_models.zip is created and offered for download automatically
# - Download contains:
#   - Trained models (.keras)
#   - Scalers (.pkl)
#   - Configurations (.json)
#   - Predictions and visualizations