In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Notebook-wide display / plotting configuration.
pd.set_option("display.max_columns", None)  # never truncate columns when rendering frames
sns.set(palette="pastel", style="whitegrid")
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/seaborn deprecation and copy warnings — confirm this is wanted.
warnings.filterwarnings("ignore")

# Load Data

In [None]:
# Read the raw datalog; the first CSV column becomes the index and is parsed as
# dates (the plotting helper further down requires a DatetimeIndex).
df = pd.read_csv(
    "datalog_ilapak10.csv",
    parse_dates=True,
    index_col=0,
)
df.head()

# Exploratory Data Analysis

## I. Outliers Analysis

In [None]:
# Continuous sensor / timing columns inspected for outliers.
numeric_cols = [
    'suhu_sealing_vertikal_bawah',
    'suhu_sealing_vertikal_atas',
    'suhu_sealing_horizontal_depan',
    'suhu_sealing_horizontal_belakang',
    'downtime_sec',
    'output_time_sec',
    'total_time_sec',
]

box_color = sns.color_palette("pastel")[0]

fig = plt.figure(figsize=(16, 10))
fig.suptitle("Outlier Analysis for Numeric Features", fontsize=18, weight='bold', y=1.02)

# One box plot per column on a 3x3 grid; with seven columns the last two slots stay empty.
for slot, column in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(3, 3, slot)
    sns.boxplot(y=df[column], color=box_color, ax=ax)
    ax.set_title(column.replace('_', ' ').title(), fontsize=12)

fig.tight_layout()
plt.show()

In [None]:
# Work on a copy so the raw frame `df` stays available for comparison.
df_cleaned = df.copy()

numeric_cols = [
    'suhu_sealing_vertikal_bawah',
    'suhu_sealing_vertikal_atas',
    'suhu_sealing_horizontal_depan',
    'suhu_sealing_horizontal_belakang',
    'downtime_sec',
    'output_time_sec',
    'total_time_sec',
]

print(f"Shape before outlier removal: {df_cleaned.shape}")

# Keep only rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], one column at a time.
# NOTE(review): each column's quartiles are computed on the rows that survived
# the previous columns' filters, so the result depends on the column order —
# confirm this sequential behaviour is intended.
for column in numeric_cols:
    q1, q3 = df_cleaned[column].quantile([0.25, 0.75])
    spread = q3 - q1
    low, high = q1 - 1.5 * spread, q3 + 1.5 * spread
    in_range = (df_cleaned[column] >= low) & (df_cleaned[column] <= high)
    df_cleaned = df_cleaned[in_range]

print(f"Shape after outlier removal: {df_cleaned.shape}")
print(f"Percentage of data removed: {1 - df_cleaned.shape[0] / df.shape[0]:.2%}")

In [None]:
# Same grid of box plots as before, drawn from the outlier-filtered frame.
fig = plt.figure(figsize=(16, 10))
fig.suptitle("Numeric Features after Outlier Removed", fontsize=18, weight='bold', y=1.02)

for slot, column in enumerate(numeric_cols, start=1):
    ax = fig.add_subplot(3, 3, slot)
    sns.boxplot(y=df_cleaned[column], color=sns.color_palette("pastel")[0], ax=ax)
    ax.set_title(column.replace('_', ' ').title(), fontsize=12)

fig.tight_layout()
plt.show()

## II. Distribution of Binary Operational Signals

The following plot illustrates the distribution of binary operational columns such as `jaws_position`, `knife_position`, `pump_position_stop`, `doser_drive_enable`, `sealing_enable`, and `machine_alarm`. These features represent the status (ON/OFF or Active/Inactive) of various mechanical components in the packaging process.


In [None]:
binary_cols = [
    'jaws_position', 'knife_position', 'pump_position_stop',
    'doser_drive_enable', 'sealing_enable', 'machine_alarm',
]

# Denominator for the percentage labels: every row of the cleaned frame.
n_rows_total = len(df_cleaned)

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.flatten()

for ax, column in zip(axes, binary_cols):
    sns.countplot(data=df_cleaned, x=column, ax=ax, palette='pastel')

    # Write each bar's share of all rows slightly above the bar.
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar_height + n_rows_total * 0.01,
            f'{100 * bar_height / n_rows_total:.1f}%',
            ha='center',
            fontsize=11,
        )

    ax.set_title(f'Distribution of {column.replace("_", " ").title()}', fontsize=13)
    ax.set_xlabel('')
    ax.set_ylabel('Count')

plt.suptitle('Binary Signal Distributions (Operational Statuses)', fontsize=16, weight='bold')
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


### Binary Signal Analysis for LSTM AE Preparation
The visualization illustrates the distribution of binary operational signals from the datalog_ilapak10 dataset. The key findings are as follows:
| Feature              | Dominant Label | Proportion | Insight                                                   |
| -------------------- | -------------- | ---------- | --------------------------------------------------------- |
| `jaws_position`      | 0              | 100.0%     | No variation, always in position 0 — not informative      |
| `knife_position`     | 0              | 100.0%     | Constant at 0, suggesting no operational switching        |
| `pump_position_stop` | 0              | 100.0%     | No observed activation, redundant for modeling            |
| `doser_drive_enable` | 0              | 71.1%      | Predominantly inactive, but contains valid signal changes |
| `sealing_enable`     | 0              | 68.4%      | Shows both active and inactive states                     |
| `machine_alarm`      | 0              | 67.4%      | Around one-third of data indicates alarm conditions       |

**Interpretation:**
- Features with no variance (jaws_position, knife_position, pump_position_stop) offer no learning value and should be excluded from model training.
- The remaining binary features exhibit meaningful variability, potentially capturing machine operational states.
- Most notably, machine_alarm serves as a proxy for identifying faulty conditions.

**Conclusion for LSTM Autoencoder (LSTM AE) Training:**

Since an LSTM Autoencoder should learn only from normal (non-anomalous) behavior, it is essential to filter the training data accordingly. Based on the machine_alarm feature, data labeled with machine_alarm == 0 represents normal operation and is suitable for training the LSTM AE:

```python
df_normal = df[df["machine_alarm"] == 0]
```
This approach ensures the model is exposed exclusively to baseline behavior. During deployment, the model can then flag unseen patterns with high reconstruction error as potential anomalies.

### Correlation Matrix

In [None]:
# Pairwise correlations of the cleaned frame, annotated with their values.
correlations = df_cleaned.corr()
plt.figure(figsize=(16, 10))
sns.heatmap(correlations, annot=True, cmap="coolwarm", cbar=False)

## III. Trend Analysis of Sealing Temperature Columns During Abnormal Machine States
To gain insight into how temperature behavior correlates with abnormal machine states, we analyze the time-series trend of all sealing-related temperature columns:

- suhu_sealing_vertikal_bawah
- suhu_sealing_vertikal_atas
- suhu_sealing_horizontal_depan
- suhu_sealing_horizontal_belakang

This analysis is segmented based on the machine_alarm flag to highlight deviations during non-zero (abnormal) conditions.

**Objectives:**
- Identify any temperature spikes, fluctuations, or irregularities that correspond with machine alarms.
- Validate whether thermal anomalies could be potential early indicators of machine faults.

**Visualization Strategy:**
- Plot time-series overlays for each temperature column.
- Overlay or color regions where machine_alarm != 0 to highlight abnormal segments.

**Why This Matters:**

Temperature stability is crucial in packaging processes involving sealing. Any inconsistency or drift may result in sealing failures or machine alarms. By observing how these signals behave during alarm conditions, we can:

- Understand root causes of anomalies.
- Improve feature selection for anomaly detection models.
- Potentially predict alarms before they occur using pre-alarm temperature patterns.

In [None]:
# Function helper
def get_abnormal_segments(mask):
    """Find the runs of consecutive truthy entries in ``mask``.

    Args:
        mask: Sized sequence of truthy/falsy flags (e.g. a boolean array).

    Returns:
        List of ``(first_index, last_index)`` tuples, both inclusive, one per
        run of truthy values, in order of appearance.
    """
    runs = []
    run_start = None  # position where the currently open run began, if any
    for pos, flag in enumerate(mask):
        if flag:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            # First falsy entry after a run: the run ended on the previous position.
            runs.append((run_start, pos - 1))
            run_start = None
    # A run still open at the end extends to the last element.
    if run_start is not None:
        runs.append((run_start, len(mask) - 1))
    return runs

def plot_temperature(df: pd.DataFrame, period: str = "all_time", with_segments: bool = False):
    """Plot the four sealing-temperature columns as stacked time series.

    Args:
        df: Frame with a ``DatetimeIndex`` containing the four
            ``suhu_sealing_*`` columns; when ``with_segments`` is true it must
            also contain ``machine_alarm``, ``doser_drive_enable`` and
            ``sealing_enable``.
        period: ``"last_day"``, ``"last_week"``, ``"last_month"`` or
            ``"last_year"`` restricts the plot to that many days before the
            latest timestamp in ``df``; any other value plots every row.
        with_segments: If true, shade in red the stretches where
            ``machine_alarm``, ``doser_drive_enable`` and ``sealing_enable``
            are all non-zero at the same time.

    Raises:
        ValueError: If ``df`` is not indexed by a ``DatetimeIndex``.
    """
    temp_cols = [
        'suhu_sealing_vertikal_bawah', 'suhu_sealing_vertikal_atas',
        'suhu_sealing_horizontal_depan', 'suhu_sealing_horizontal_belakang'
    ]
    # Look-back window in days for each named period.
    lookback_days = {"last_day": 1, "last_week": 7, "last_month": 30, "last_year": 365}

    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("Index of dataframe must be a DatetimeIndex")

    if period in lookback_days:
        cutoff = df.index.max() - pd.Timedelta(days=lookback_days[period])
        df_period = df[df.index >= cutoff]
    else:
        df_period = df

    # The highlighted segments are identical for every subplot, so they are
    # computed once here instead of once per temperature column.
    segments = []
    if with_segments:
        abnormal_mask = (
            (df_period["machine_alarm"] != 0) &
            (df_period["doser_drive_enable"] != 0) &
            (df_period["sealing_enable"] != 0)
        )
        segments = get_abnormal_segments(abnormal_mask.values)

    _, axes = plt.subplots(len(temp_cols), 1, figsize=(18, 10), sharex=True)
    for ax, col in zip(axes, temp_cols):
        # No `palette` here: there is no `hue` variable for it to apply to.
        sns.lineplot(data=df_period, x=df_period.index, y=col, ax=ax)
        ax.set_title(f"{col.replace('_', ' ').title()}", fontsize=12, loc="left")
        ax.set_ylabel("Temp (°C)")
        ax.grid(True)

        # Segment bounds are positional, so map them back to timestamps.
        for start, end in segments:
            ax.axvspan(df_period.index[start], df_period.index[end], color='red', alpha=0.3)

    axes[-1].set_xlabel("Timestamp")
    # Build the title from parts so there is no stray double space when no
    # highlighting is requested.
    title = "Sealing Temperature Trends"
    if with_segments:
        title += " with Machine Alarm Highlighted"
    plt.suptitle(f"{title} ({period.replace('_', ' ').title()})",
                 fontsize=16, weight="bold")
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

### All Time

In [None]:
# Full history ("all_time" is the function's default period), alarm stretches shaded.
plot_temperature(df_cleaned, period="all_time", with_segments=True)

### Last Month

In [None]:
# Last 30 days before the latest timestamp, alarm stretches shaded.
plot_temperature(df=df_cleaned, period="last_month", with_segments=True)

### Last Week

In [None]:
# Last 7 days before the latest timestamp, alarm stretches shaded.
plot_temperature(df=df_cleaned, period="last_week", with_segments=True)

### Last Day

In [None]:
# Last 24 hours before the latest timestamp, alarm stretches shaded.
plot_temperature(df=df_cleaned, period="last_day", with_segments=True)

## IV. Temperature Trends for Normal Data

In [None]:
# "Normal" rows: no alarm, and both the doser drive and sealing are disabled.
normal_mask = (
    (df_cleaned["machine_alarm"] == 0)
    & (df_cleaned["doser_drive_enable"] == 0)
    & (df_cleaned["sealing_enable"] == 0)
)
# Take an explicit copy: `data_normal` is modified in place by the scaling cell
# below, and writing into a filtered slice of `df_cleaned` would otherwise
# trigger pandas' SettingWithCopyWarning (hidden here by the global warning filter).
data_normal = df_cleaned[normal_mask].copy()
plot_temperature(data_normal, period="last_day")

## Data Preprocessing

### Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the continuous columns; the scaler is fitted on every "normal" row.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_normal[numeric_cols])

# Overwrite the original columns with their scaled values.
data_normal[numeric_cols] = scaled_values
data_normal.head(2)

### Feature Selection

In [None]:
# These three signals are constant (always 0) in this dataset, so they carry no
# information for the model and are removed.
constant_cols = ["jaws_position", "knife_position", "pump_position_stop"]
df_final = data_normal.drop(columns=constant_cols)

# Persist the prepared "normal" data next to the notebook.
df_final.to_csv("datalog_ilapak10_normal.csv")
df_final.head(2)

## Training Model

### Create Sequences & Train-Test Split

In [None]:
def create_sequences(data: pd.DataFrame, window_size: int, stride: int = 5) -> np.ndarray:
    """Cut ``data`` into overlapping fixed-length windows.

    Args:
        data: Time-ordered rows (DataFrame or 2-D array-like); one row per time step.
        window_size: Number of consecutive rows in each window.
        stride: Number of rows between the starts of consecutive windows.

    Returns:
        Array of shape ``(n_windows, window_size, n_columns)``. When ``data``
        has fewer than ``window_size`` rows, ``n_windows`` is 0.
    """
    values = np.asarray(data)
    # `+ 1` so that the window ending exactly on the last row is included
    # (the previous bound silently dropped it).
    starts = range(0, len(values) - window_size + 1, stride)
    if len(starts) == 0:
        # Keep the result 3-D even when no window fits.
        return np.empty((0, window_size) + values.shape[1:], dtype=values.dtype)
    return np.array([values[start:start + window_size] for start in starts])

sequences = create_sequences(df_final, window_size=30, stride=5)
print(f"Sequences Shape: {sequences.shape}")

# Ordered 80/20 split without shuffling: the first windows train, the rest validate.
train_size = int(0.8 * len(sequences))
X_train, X_val = sequences[:train_size], sequences[train_size:]

### Model Architecture

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense

class LSTMAutoEncoder(Model):
    """LSTM autoencoder with a fixed 64-32 encoder and 32-64 decoder.

    NOTE(review): a class with the same name is defined again further down in
    this notebook; once that cell runs it replaces this definition.
    """

    def __init__(self, window_size: int, n_features: int):
        super().__init__()
        # Encoder: the second LSTM returns only its final state, i.e. one
        # 32-dimensional vector per input window.
        self.encoder_layer1 = LSTM(64, return_sequences=True, activation='relu')
        self.encoder_layer2 = LSTM(32, return_sequences=False, activation='relu')
        # Decoder: repeat that vector once per time step, then unroll it back
        # into a sequence with `n_features` values per step.
        self.repeat_vector = RepeatVector(window_size)
        self.decoder_layer1 = LSTM(32, return_sequences=True, activation='relu')
        self.decoder_layer2 = LSTM(64, return_sequences=True, activation='relu')
        self.output_layer = TimeDistributed(Dense(n_features))

    def call(self, inputs, training = False):
        """Run ``inputs`` through encoder and decoder and return the reconstruction."""
        stages = (
            self.encoder_layer1,
            self.encoder_layer2,
            self.repeat_vector,
            self.decoder_layer1,
            self.decoder_layer2,
            self.output_layer,
        )
        outputs = inputs
        for stage in stages:
            outputs = stage(outputs)
        return outputs

### Training

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.callbacks import EarlyStopping, Callback, ModelCheckpoint
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta


import os
import joblib
from typing import Tuple, List, Union, Dict

#### Data Preparation

In [None]:
class DataPreprocessor:
    """Scaling, windowing and splitting helpers for time series anomaly detection.

    Attributes are plain values:
      * ``window_size`` - rows per sequence.
      * ``stride`` - rows between the starts of consecutive sequences.
      * ``scaler`` - the ``MinMaxScaler`` used by ``fit_scaler`` / ``transform``.
    """

    def __init__(self, window_size: int = 30, stride: int = 5):
        self.window_size = window_size
        self.stride = stride
        self.scaler = MinMaxScaler()

    def create_sequences(self, data: pd.DataFrame) -> np.ndarray:
        """Create sliding window sequences from time series data.

        Returns:
            Array of shape ``(n_sequences, window_size, n_columns)``. When
            ``data`` has fewer than ``window_size`` rows the result has zero
            sequences but keeps the other two dimensions.
        """
        values = data.values
        starts = range(0, len(values) - self.window_size + 1, self.stride)
        if len(starts) == 0:
            # Keep the result 3-D so downstream `.shape` checks stay meaningful.
            return np.empty((0, self.window_size, values.shape[1]), dtype=values.dtype)
        return np.array([values[start:start + self.window_size] for start in starts])

    def fit_scaler(self, data: pd.DataFrame) -> pd.DataFrame:
        """Fit the scaler on ``data`` and return the scaled frame (same columns and index)."""
        scaled_data = self.scaler.fit_transform(data)
        return pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Scale ``data`` with the already-fitted scaler.

        Raises:
            ValueError: If the scaler has not been fitted yet.
        """
        # `self.scaler` is created in __init__, so a `None` check alone can never
        # detect an unfitted scaler. This relies on scikit-learn's convention
        # that fitted attributes such as `scale_` exist only after `fit`.
        if self.scaler is None or not hasattr(self.scaler, "scale_"):
            raise ValueError("Scaler not fitted. Call fit_scaler first.")
        scaled_data = self.scaler.transform(data)
        return pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

    def split_sequences(self, sequences: np.ndarray, train_size: float = 0.7, val_size: float = 0.15) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Split sequences, in order, into train, validation and test sets.

        ``train_size`` and ``val_size`` are fractions of the total; the test set
        is whatever remains after the first two slices.
        """
        total_len = len(sequences)
        train_end = int(train_size * total_len)
        val_end = train_end + int(val_size * total_len)

        return sequences[:train_end], sequences[train_end:val_end], sequences[val_end:]

#### Model Architecture

In [None]:
# Registering the class lets `tf.keras.models.load_model` find it again when a
# saved `.keras` file is loaded.
@tf.keras.utils.register_keras_serializable()
class LSTMAutoEncoder(Model):
    """LSTM-based autoencoder for anomaly detection.

    The encoder compresses each input window into a single vector (the final
    state of the second encoder LSTM); the decoder repeats that vector once per
    time step and unrolls it back into a sequence with ``n_features`` values
    per step.

    Args:
        window_size: Number of time steps per input window.
        n_features: Number of values per time step.
        encoder_units: Units of the two encoder LSTM layers; only the first two
            entries are used.
        decoder_units: Units of the two decoder LSTM layers; only the first two
            entries are used.
        **kwargs: Forwarded to ``tf.keras.Model`` (for example ``name``); Keras
            supplies these when it rebuilds the model from a saved config.
    """

    def __init__(self, window_size: int, n_features: int,
                 encoder_units: List[int] = [64, 32],
                 decoder_units: List[int] = [32, 64],
                 **kwargs):
        super().__init__(**kwargs)
        self.window_size = window_size
        self.n_features = n_features
        # Stored as tuples so the shared default lists are never aliased.
        self.encoder_units = tuple(encoder_units)
        self.decoder_units = tuple(decoder_units)

        # Encoder
        self.encoder_lstm1 = LSTM(self.encoder_units[0], return_sequences=True, activation='relu')
        self.encoder_lstm2 = LSTM(self.encoder_units[1], return_sequences=False, activation='relu')

        # Decoder
        self.repeat_vector = RepeatVector(window_size)
        self.decoder_lstm1 = LSTM(self.decoder_units[0], return_sequences=True, activation='relu')
        self.decoder_lstm2 = LSTM(self.decoder_units[1], return_sequences=True, activation='relu')
        self.output_layer = TimeDistributed(Dense(n_features))

    def call(self, inputs, training=False):
        """Encode ``inputs`` and return their reconstruction."""
        # Encoder
        x = self.encoder_lstm1(inputs, training=training)
        encoded = self.encoder_lstm2(x, training=training)

        # Decoder
        x = self.repeat_vector(encoded)
        x = self.decoder_lstm1(x, training=training)
        x = self.decoder_lstm2(x, training=training)
        decoded = self.output_layer(x)
        return decoded

    def get_config(self):
        """Return the constructor arguments so a saved model can be rebuilt."""
        config = super().get_config()
        config.update({
            "window_size": self.window_size,
            "n_features": self.n_features,
            "encoder_units": list(self.encoder_units),
            "decoder_units": list(self.decoder_units),
        })
        return config

#### Callback Function

In [None]:
class TrainingCallback(Callback):
    """Callback that plots the loss curves every ``plot_interval`` epochs.

    Args:
        plot_interval: Plot after every this many epochs.
        save_plots: If true, also write each plot to ``training_plots/``.
    """

    def __init__(self, plot_interval: int = 10, save_plots: bool = False):
        super().__init__()
        self.plot_interval = plot_interval
        self.save_plots = save_plots
        # Metric name -> list of per-epoch values, collected by this callback.
        self._history = {}
        if save_plots:
            os.makedirs("training_plots", exist_ok=True)

    def on_train_begin(self, logs=None):
        # Start from a clean slate on every `fit` call.
        self._history = {}

    def on_epoch_end(self, epoch, logs=None):
        # Record the metrics here instead of reading `self.model.history`:
        # Keras' own History callback is updated after user callbacks run, so
        # it would lag one epoch behind and may not exist yet on the first epoch.
        for name, value in (logs or {}).items():
            self._history.setdefault(name, []).append(value)

        if (epoch + 1) % self.plot_interval == 0:
            self._plot_progress(epoch, logs)

    def _plot_progress(self, epoch, logs):
        """Plot the loss curves recorded so far."""
        fig = plt.figure(figsize=(12, 5))

        # Plot only the curves that exist, e.g. there is no `val_loss` when
        # training runs without validation data.
        for key, label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
            if key in self._history:
                plt.plot(self._history[key], label=label)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title(f'Training Progress - Epoch {epoch+1}')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()

        if self.save_plots:
            plt.savefig(f"training_plots/progress_epoch_{epoch+1}.png", dpi=150, bbox_inches='tight')
        plt.show()
        # Release the figure so repeated plots do not accumulate in memory.
        plt.close(fig)

#### Orchestrator 

In [None]:
class AnomalyDetector:
    """Main class for LSTM Autoencoder-based anomaly detection.

    Typical flow: ``prepare_data`` -> ``build_model`` -> ``train`` ->
    ``calculate_threshold`` -> ``predict`` / ``visualize_results`` ->
    ``save_model``. A window is flagged as anomalous when its reconstruction
    error (mean squared error over the whole window) exceeds ``threshold``.
    """

    def __init__(self, window_size: int = 30, stride: int = 5):
        self.window_size = window_size
        self.stride = stride
        self.preprocessor = DataPreprocessor(window_size, stride)
        self.model = None
        self.threshold = None
        self.history = None
        self.feature_names = None

    def build_model(self, n_features: int, encoder_units: List[int] = [64, 32],
                   decoder_units: List[int] = [32, 64]):
        """Build and compile the LSTM autoencoder model (Adam optimizer, MSE loss)."""
        self.model = LSTMAutoEncoder(
            window_size=self.window_size,
            n_features=n_features,
            # Pass copies so the shared default lists are never handed out.
            encoder_units=list(encoder_units),
            decoder_units=list(decoder_units)
        )
        self.model.compile(optimizer='adam', loss='mse', metrics=['mse'])

    def prepare_data(self, data: pd.DataFrame, fit_scaler: bool = True) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Scale ``data``, cut it into windows and split them into train/val/test.

        Args:
            data: Feature frame, one row per time step.
            fit_scaler: Fit the scaler on ``data`` when true; otherwise reuse
                the already-fitted scaler.

        Returns:
            ``(X_train, X_val, X_test)`` arrays of windows.
        """
        # Remember the column order the model is trained on.
        self.feature_names = list(data.columns)

        # Scale data
        # NOTE(review): with fit_scaler=True the scaler sees every row,
        # including those that end up in the validation and test windows.
        if fit_scaler:
            scaled_data = self.preprocessor.fit_scaler(data)
        else:
            scaled_data = self.preprocessor.transform(data)

        # Create sequences
        sequences = self.preprocessor.create_sequences(scaled_data)

        # Split data
        X_train, X_val, X_test = self.preprocessor.split_sequences(sequences)

        return X_train, X_val, X_test

    def train(self, X_train: np.ndarray, X_val: np.ndarray,
              epochs: int = 50, batch_size: int = 32, patience: int = 10,
              save_best: bool = True, plot_progress: bool = True) -> "tf.keras.callbacks.History":
        """Train the autoencoder to reconstruct its own input.

        Args:
            X_train: Training windows (used as both input and target).
            X_val: Validation windows (used as both input and target).
            epochs: Maximum number of epochs.
            batch_size: Batch size passed to ``fit``.
            patience: Early-stopping patience on ``val_loss``.
            save_best: Also checkpoint the best model to ``best_model.keras``.
            plot_progress: Plot the loss curves about five times during training.

        Returns:
            The Keras ``History`` object returned by ``fit``.

        Raises:
            ValueError: If ``build_model`` has not been called.
        """
        if self.model is None:
            raise ValueError("Model not built. Call build_model() first.")

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
        ]

        if save_best:
            callbacks.append(
                ModelCheckpoint("best_model.keras", monitor='val_loss',
                              save_best_only=True, verbose=1)
            )

        if plot_progress:
            callbacks.append(TrainingCallback(plot_interval=max(1, epochs//5)))

        print(f"Training model with {len(X_train)} training samples...")
        self.history = self.model.fit(
            X_train, X_train,
            validation_data=(X_val, X_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )

        return self.history

    def calculate_threshold(self, X_train: np.ndarray, percentile: float = 95) -> float:
        """Set the threshold to a percentile of the training reconstruction errors.

        Raises:
            ValueError: If there is no model.
        """
        if self.model is None:
            raise ValueError("Model not trained. Train model first.")

        print("Calculating threshold from training data...")
        train_pred = self.model.predict(X_train, verbose=0)
        train_errors = self._calculate_errors(X_train, train_pred)

        self.threshold = np.percentile(train_errors, percentile)

        print(f"Threshold set to: {self.threshold:.6f}")
        print(f"Training errors - Mean: {np.mean(train_errors):.6f}, Std: {np.std(train_errors):.6f}")

        return self.threshold

    def _calculate_errors(self, original: np.ndarray, reconstructed: np.ndarray) -> np.ndarray:
        """Return one mean squared reconstruction error per sample.

        The squared difference is averaged over every axis except the first,
        so the result has one value per window.
        """
        squared = np.square(np.asarray(original) - np.asarray(reconstructed))
        per_sample_axes = tuple(range(1, squared.ndim))
        return squared.mean(axis=per_sample_axes, dtype=np.float64)

    def predict(self, X: np.ndarray, return_details: bool = False) -> Union[np.ndarray, Dict]:
        """Flag the windows in ``X`` whose reconstruction error exceeds the threshold.

        Returns:
            A boolean array (one flag per window), or, when ``return_details``
            is true, a dict that also holds the errors, the threshold and
            summary statistics.

        Raises:
            ValueError: If the model or the threshold is missing.
        """
        if self.model is None or self.threshold is None:
            raise ValueError("Model not trained or threshold not set.")

        predictions = self.model.predict(X, verbose=0)
        errors = self._calculate_errors(X, predictions)
        anomalies = errors > self.threshold

        if not return_details:
            return anomalies

        return {
            'anomalies': anomalies,
            'errors': errors,
            'threshold': self.threshold,
            'num_anomalies': np.sum(anomalies),
            'anomaly_rate': np.mean(anomalies),
            'max_error': np.max(errors),
            'mean_error': np.mean(errors)
        }

    def visualize_results(self, X: np.ndarray, title: str = "Anomaly Detection Results"):
        """Plot reconstruction errors (timeline, histogram, box plot, CDF) and print a summary.

        The threshold line and anomaly markers are drawn only when a threshold
        has been set.

        Raises:
            ValueError: If there is no model.
        """
        if self.model is None:
            raise ValueError("Model not trained.")

        predictions = self.model.predict(X, verbose=0)
        errors = self._calculate_errors(X, predictions)

        # Compare against None: a threshold of exactly 0.0 is still a threshold.
        has_threshold = self.threshold is not None
        anomalies = errors > self.threshold if has_threshold else None

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle(title, fontsize=16)

        # Error timeline
        ax = axes[0, 0]
        ax.plot(errors, alpha=0.7)
        if has_threshold:
            ax.axhline(y=self.threshold, color='red', linestyle='--', label='Threshold')
            ax.scatter(np.where(anomalies)[0], errors[anomalies],
                       color='red', s=20, alpha=0.8, label='Anomalies')
            ax.legend()  # only when there is something labelled to show
        ax.set_title('Reconstruction Errors')
        ax.set_xlabel('Sample')
        ax.set_ylabel('Error')
        ax.grid(True, alpha=0.3)

        # Error distribution
        ax = axes[0, 1]
        ax.hist(errors, bins=50, alpha=0.7, edgecolor='black')
        if has_threshold:
            ax.axvline(x=self.threshold, color='red', linestyle='--', label='Threshold')
            ax.legend()
        ax.set_title('Error Distribution')
        ax.set_xlabel('Error')
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

        # Box plot
        ax = axes[1, 0]
        ax.boxplot([errors])
        # Label the tick directly instead of using boxplot's `labels` keyword,
        # which newer Matplotlib releases deprecate.
        ax.set_xticklabels(['Errors'])
        if has_threshold:
            ax.axhline(y=self.threshold, color='red', linestyle='--', label='Threshold')
        ax.set_title('Error Statistics')
        ax.set_ylabel('Error')
        ax.grid(True, alpha=0.3)

        # Cumulative distribution
        ax = axes[1, 1]
        sorted_errors = np.sort(errors)
        cumulative = np.arange(1, len(sorted_errors) + 1) / len(sorted_errors)
        ax.plot(sorted_errors, cumulative)
        if has_threshold:
            ax.axvline(x=self.threshold, color='red', linestyle='--', label='Threshold')
            ax.legend()
        ax.set_title('Cumulative Distribution')
        ax.set_xlabel('Error')
        ax.set_ylabel('Cumulative Probability')
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Print summary
        print(f"\n=== {title} Summary ===")
        print(f"Samples: {len(errors)}")
        print(f"Mean error: {np.mean(errors):.6f}")
        print(f"Std error: {np.std(errors):.6f}")
        if has_threshold:
            print(f"Anomalies: {np.sum(anomalies)} ({np.mean(anomalies)*100:.2f}%)")

    @staticmethod
    def _params_path(filepath: str) -> str:
        """Return the side-car parameter file path that belongs to ``filepath``.

        Only the final extension is replaced, so the result can never equal
        ``filepath`` itself, whatever the model file is called.
        """
        root, _ = os.path.splitext(filepath)
        return f"{root}_params.pkl"

    def save_model(self, filepath: str = "anomaly_detector.keras"):
        """Save the model plus window size, stride, threshold and scaler.

        The model goes to ``filepath``; the remaining parameters go to a
        ``*_params.pkl`` file next to it.

        Raises:
            ValueError: If there is no model.
        """
        if self.model is None:
            raise ValueError("No model to save.")
        self.model.save(filepath)

        # Save additional parameters
        params = {
            'window_size': self.window_size,
            'stride': self.stride,
            'threshold': self.threshold,
            'scaler': self.preprocessor.scaler
        }
        joblib.dump(params, self._params_path(filepath))
        print(f"Model and parameters saved to {filepath}")

    def load_model(self, filepath: str):
        """Load a model saved by ``save_model`` together with its parameters, if present."""
        # The autoencoder is a custom class, so Keras must be told how to rebuild it.
        self.model = tf.keras.models.load_model(
            filepath, custom_objects={"LSTMAutoEncoder": LSTMAutoEncoder}
        )

        # Load additional parameters
        params_file = self._params_path(filepath)
        if os.path.exists(params_file):
            # NOTE(security): joblib.load unpickles the file and can execute
            # arbitrary code; only load parameter files from trusted sources.
            params = joblib.load(params_file)
            self.window_size = params['window_size']
            self.stride = params['stride']
            self.threshold = params['threshold']
            self.preprocessor.scaler = params['scaler']
            # Keep the preprocessor's windowing in sync with the loaded values.
            self.preprocessor.window_size = self.window_size
            self.preprocessor.stride = self.stride
        else:
            print(f"Parameter file {params_file} not found; threshold and scaler were not restored.")

        print(f"Model loaded from {filepath}")

In [None]:
# Seed Python, NumPy and TensorFlow so that training, and therefore the
# threshold and anomaly counts printed below, are repeatable across runs.
tf.keras.utils.set_random_seed(42)

pipeline = AnomalyDetector()

# Preparing data
print("Preparing data...")
X_train, X_val, X_test = pipeline.prepare_data(df_final, fit_scaler=True)
print(f"Training Sequences: {X_train.shape}")
print(f"Validation Sequences: {X_val.shape}")
print(f"Test Sequences: {X_test.shape}\n")

# Build model
print("Building model...")
pipeline.build_model(n_features=df_final.shape[1], encoder_units=[64, 32], decoder_units=[32, 64])
print("Model built successfully.\n")

# Train model
print("Training model...")
history = pipeline.train(X_train, X_val, epochs=50, batch_size=32, patience=10, save_best=True, plot_progress=True)
print("Model trained successfully.\n")

# Calculate threshold: 95th percentile of the training reconstruction errors
print("Calculating threshold...")
threshold = pipeline.calculate_threshold(X_train, percentile=95)
print("Threshold calculated successfully.\n")

# Test on validation data
print("Testing model on validation data...")
val_results = pipeline.predict(X_val, return_details=True)
print(f"Validation anomalies detected: {val_results['num_anomalies']} ({val_results['anomaly_rate']*100:.2f}%)")

pipeline.visualize_results(X_val, "Validation Data Results")

# Test on the held-out test windows
print("Testing on test data...")
test_results = pipeline.predict(X_test, return_details=True)
print(f"Test anomalies detected: {test_results['num_anomalies']} ({test_results['anomaly_rate']*100:.2f}%)")
pipeline.visualize_results(X_test, "Test Data Results")

# Persist the model together with its threshold and scaler
print("\nSaving model...")
pipeline.save_model("trained_anomaly_detector.keras")