---
title: "Multivariate Anomaly Detection"
format: html
---

# 🚨 Anomaly Detection in Sensor Streams
**Portfolio Project 3 — Multivariate Anomaly Detection**

---

## Objective
Detect point anomalies and contextual anomalies in multivariate sensor data
using statistical, isolation-based, and autoencoder approaches.

## Dataset
**NASA SMAP / MSL Benchmark (subset) — simulated equivalent**
Original: https://github.com/nasa/anomaly-detection
We replicate its structure with synthetic channels.

---

In [None]:
# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from scipy.signal import butter, filtfilt
import warnings
# Suppress every warning category so library notices do not clutter cell
# output. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings('ignore')

# Select the 'seaborn-v0_8-whitegrid' matplotlib style for all later figures.
plt.style.use('seaborn-v0_8-whitegrid')
# Simple confirmation that the import/setup cell ran to completion.
print('Imports OK')

## 1. Synthetic Multi-Channel Sensor Data with Known Anomalies

In [None]:
# 2. Generate sensor streams + inject anomalies
def gen_anomaly_data(n=10000, n_ch=5, seed=99):
    """Build a synthetic multi-channel sensor frame with labelled anomalies.

    Every channel is a sinusoid whose period, phase and amplitude depend on
    the channel index, plus Gaussian noise (sigma 0.15).  Three kinds of
    anomaly are then injected and marked in one shared label vector:

    * 80 point anomalies: a single sample on a random channel is pushed
      up or down by a magnitude drawn uniformly from [3, 6).
    * 5 contextual anomalies: channel 0 is shifted by +2.5 for 40 samples.
    * 3 collective anomalies: all channels are raised for 20 samples by
      offsets drawn from N(1.8, 0.3).

    Args:
        n: Number of samples (rows) to generate.
        n_ch: Number of sensor channels.
        seed: Seed for ``np.random.default_rng``; fixes the whole output.

    Returns:
        A DataFrame with columns ``Ch_0`` .. ``Ch_{n_ch-1}``, ``Timestamp``
        (1-minute steps starting 2024-03-01) and ``True_Anomaly`` (0/1).
    """
    rng = np.random.default_rng(seed)
    time_axis = np.linspace(0, 50, n)

    # Baseline signals.  Noise is drawn channel by channel, in index order,
    # so the random stream is consumed in a fixed, reproducible sequence.
    channels = []
    for ch in range(n_ch):
        clean = np.sin(time_axis / (3 + ch) + ch) * (1 + 0.2 * ch)
        channels.append(clean + rng.normal(0, 0.15, n))
    data = np.column_stack(channels)

    is_anomaly = np.zeros(n, dtype=int)  # 0 = normal, 1 = anomalous

    # Point anomalies: isolated spikes on one (row, channel) cell each.
    n_points = 80
    spike_rows = rng.choice(n, n_points, replace=False)
    spike_chans = rng.integers(0, n_ch, n_points)
    spike_signs = rng.choice([-1, 1], n_points)
    spike_sizes = rng.uniform(3, 6, n_points)
    data[spike_rows, spike_chans] += spike_signs * spike_sizes
    is_anomaly[spike_rows] = 1

    # Contextual anomalies: a temporary mean shift confined to channel 0.
    shift_starts = rng.choice(range(100, n-200), 5, replace=False)
    for begin in shift_starts:
        data[begin:begin + 40, 0] += 2.5
        is_anomaly[begin:begin + 40] = 1

    # Collective anomalies: every channel rises together for a short burst.
    burst_starts = rng.choice(range(200, n-100), 3, replace=False)
    for begin in burst_starts:
        data[begin:begin + 20, :] += rng.normal(1.8, 0.3, (20, n_ch))
        is_anomaly[begin:begin + 20] = 1

    frame = pd.DataFrame(data, columns=[f'Ch_{ch}' for ch in range(n_ch)])
    frame['Timestamp'] = pd.date_range('2024-03-01', periods=n, freq='1min')
    frame['True_Anomaly'] = is_anomaly
    return frame


# Generate the default dataset and report its size and labelled-anomaly share.
df = gen_anomaly_data()
anomaly_pct = df['True_Anomaly'].mean() * 100
print(f'Dataset: {df.shape}  |  Anomaly rate: {anomaly_pct:.1f}%')
df.head()

## 2. Baseline Statistical Method — Z-Score

In [None]:
# 3. Per-channel Z-Score anomaly detection
# Sensor channels are the columns whose name carries the 'Ch_' prefix.
ch_cols = [name for name in df.columns if name.startswith('Ch_')]

# Standardise each channel to zero mean / unit variance over the full series.
scaler = StandardScaler()
z_values = scaler.fit_transform(df[ch_cols])
df_scaled = pd.DataFrame(z_values, columns=ch_cols, index=df.index)

# A row is flagged as soon as a single channel exceeds the |Z| threshold.
Z_THRESH = 3.0
exceeds_thresh = df_scaled.abs() > Z_THRESH
df['zscore_anomaly'] = exceeds_thresh.any(axis=1).astype(int)

print('Z-Score Detection:')
zscore_report = classification_report(
    df['True_Anomaly'],
    df['zscore_anomaly'],
    target_names=['Normal', 'Anomaly'],
)
print(zscore_report)

## 3. Isolation Forest

In [None]:
# 4. Isolation Forest
# The expected outlier share is taken from the labelled anomaly rate.
anomaly_fraction = df['True_Anomaly'].mean()
iso = IsolationForest(
    n_estimators=300,
    contamination=anomaly_fraction,
    max_samples=256,
    random_state=42,
)

channel_frame = df[ch_cols]

# fit_predict marks outliers with -1; convert that to a 0/1 flag.
iso_labels = iso.fit_predict(channel_frame)
df['iso_pred'] = (iso_labels == -1).astype(int)

# decision_function is lower for outliers, so negate it:
# higher iso_score = more anomalous.
df['iso_score'] = -iso.decision_function(channel_frame)

print('Isolation Forest Detection:')
iso_report = classification_report(
    df['True_Anomaly'],
    df['iso_pred'],
    target_names=['Normal', 'Anomaly'],
)
print(iso_report)

## 4. PCA Reconstruction Error (Linear Autoencoder Proxy)

In [None]:
# 5. Reconstruction-error detector (no deep-learning dependency)
# PCA stands in for a linear autoencoder: project onto a low-dimensional
# bottleneck, map back, and score each row by how poorly it is rebuilt.
# Each row is scored independently; no windowing is applied here.
from sklearn.decomposition import PCA

N_COMP = 2  # bottleneck dimension
pca = PCA(n_components=N_COMP)

channel_matrix = df[ch_cols].values
recon = pca.fit_transform(channel_matrix)   # low-dimensional projection
recon_full = pca.inverse_transform(recon)   # reconstructed signal

# Reconstruction error per row: mean squared residual across channels.
residual = channel_matrix - recon_full
df['recon_error'] = (residual ** 2).mean(axis=1)

# Threshold: mean + 3*std of the error over the first 80% of rows.
train_end = int(len(df) * 0.8)
train_errors = df['recon_error'].iloc[:train_end]
thresh = train_errors.mean() + 3 * train_errors.std()
df['recon_anomaly'] = (df['recon_error'] > thresh).astype(int)

print(f'Reconstruction threshold: {thresh:.4f}')
print('\nPCA-Autoencoder Detection:')
pca_report = classification_report(
    df['True_Anomaly'],
    df['recon_anomaly'],
    target_names=['Normal', 'Anomaly'],
)
print(pca_report)

## 5. Visual Comparison

In [None]:
# 6. Plot Channel 0 with all three detection layers
# Four stacked panels sharing the time axis: ground truth, Z-Score flags,
# Isolation Forest flags, and the PCA reconstruction error with its threshold.
fig, axes = plt.subplots(4, 1, figsize=(16, 10), sharex=True)

# Boolean row masks selecting the points each panel highlights.
anom_mask = df['True_Anomaly'] == 1
z_mask = df['zscore_anomaly'] == 1
iso_mask = df['iso_pred'] == 1

# The first three panels share one layout (raw Channel 0 trace plus coloured
# markers), so they are drawn from a single table instead of repeated code.
# The first title previously contained a mis-encoded em dash; it is now a
# proper '—' so the figure renders the intended text.
overlay_panels = [
    (axes[0], anom_mask, 'red', 'Channel 0 — True Anomalies (red)'),
    (axes[1], z_mask, 'orange', 'Z-Score Detections (orange)'),
    (axes[2], iso_mask, 'green', 'Isolation Forest Detections (green)'),
]
for panel_ax, panel_mask, marker_color, panel_title in overlay_panels:
    panel_ax.plot(df['Timestamp'], df['Ch_0'], lw=0.7, color='steelblue')
    panel_ax.scatter(df.loc[panel_mask, 'Timestamp'],
                     df.loc[panel_mask, 'Ch_0'],
                     s=15, color=marker_color, zorder=5)
    panel_ax.set_title(panel_title, fontsize=11)
    panel_ax.set_ylabel('Signal')

# Reconstruction error, with the decision threshold as a dashed line.
axes[3].plot(df['Timestamp'], df['recon_error'], lw=0.7, color='purple')
axes[3].axhline(thresh, color='crimson', ls='--',
                lw=1.2, label=f'Threshold={thresh:.3f}')
axes[3].set_title('PCA Reconstruction Error', fontsize=11)
axes[3].set_ylabel('MSE')
axes[3].set_xlabel('Time')
axes[3].legend(loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
# 7. Confusion matrices side-by-side
# One heatmap per detector, each comparing its 0/1 flags with the true labels.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
methods = [
    ('Z-Score', 'zscore_anomaly'),
    ('Isolation Forest', 'iso_pred'),
    ('PCA-AE', 'recon_anomaly'),
]
class_names = ['Normal', 'Anomaly']

for panel_idx, (name, col) in enumerate(methods):
    ax = axes[panel_idx]
    cm = confusion_matrix(df['True_Anomaly'], df[col])
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(name, fontsize=12)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.suptitle('Confusion Matrices', fontsize=14, y=1.03)
plt.tight_layout()
plt.show()

## Summary
- Implemented three complementary anomaly-detection strategies
- **Z-Score** catches large point anomalies; misses contextual shifts
- **Isolation Forest** adapts to multivariate density; good recall on collective anomalies
- **PCA Reconstruction** captures structural deviation across all channels
- An ensemble (OR/AND combination) of these methods is the production-ready approach