# Exploratory Data Analysis for Anomaly Detection

This notebook provides a beginner-friendly interface for:
- Loading and exploring datasets
- Visualizing data patterns
- Running quick anomaly detection experiments
- Understanding model outputs

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import warnings

# Import our custom modules
import sys
sys.path.append('../src')
from data_preprocessing import DataPreprocessor, preprocess_pipeline
from model_training import ModelTrainer, train_ensemble
from evaluation import ModelEvaluator
from drift_detection import ConceptDriftMonitor

# Suppress every warning category for the rest of the session so the demo
# output stays uncluttered (this also hides deprecation/convergence warnings).
warnings.simplefilter('ignore')

# Plot defaults: seaborn's white grid theme and a 10x6 inch default canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Libraries loaded successfully!")

## 1. Generate Synthetic Data for Testing

Let's create a synthetic dataset with normal data and anomalies for demonstration.

In [None]:
# Generate synthetic data with anomalies
def create_anomaly_dataset(n_samples=1000, contamination=0.1, n_features=2, random_state=42):
    """
    Create a synthetic dataset of Gaussian inliers mixed with uniform outliers.

    Inliers are drawn from a standard normal distribution. Outliers are drawn
    uniformly from the cube [-6, 6] in every feature; because that cube also
    contains the inlier region, a fraction of the samples labelled as
    anomalies will fall inside the normal cluster.

    Args:
        n_samples: Total number of samples (non-negative integer)
        contamination: Proportion of anomalies, between 0 and 1 inclusive
        n_features: Number of features
        random_state: Seed passed to ``np.random.seed``; note that this
            reseeds NumPy's global random generator as a side effect

    Returns:
        X: Feature matrix of shape (n_samples, n_features)
        y: Float labels of shape (n_samples,) (0 = normal, 1 = anomaly)

    Raises:
        ValueError: If n_samples is negative or contamination is outside [0, 1]
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if not 0 <= contamination <= 1:
        raise ValueError(f"contamination must be in [0, 1], got {contamination}")

    np.random.seed(random_state)

    # Round the outlier count instead of truncating the inlier count.
    # Truncating n_samples * (1 - contamination) loses a sample whenever the
    # float product lands just below an integer, e.g. 10 * (1 - 0.9) gives
    # 0.9999999999999998, which int() turns into 0 inliers.
    n_outliers = int(round(n_samples * contamination))
    n_inliers = n_samples - n_outliers

    # Generate normal data (inliers)
    X_inliers = np.random.randn(n_inliers, n_features)

    # Generate anomalies (outliers) spread uniformly over a wider range
    X_outliers = np.random.uniform(low=-6, high=6, size=(n_outliers, n_features))

    # Combine data
    X = np.vstack([X_inliers, X_outliers])
    y = np.hstack([np.zeros(n_inliers), np.ones(n_outliers)])

    # Shuffle features and labels with the same permutation so rows stay aligned
    shuffle_idx = np.random.permutation(n_samples)
    X = X[shuffle_idx]
    y = y[shuffle_idx]

    return X, y

# Build the demo dataset and wrap it in a labelled DataFrame
X, y = create_anomaly_dataset(n_samples=1000, contamination=0.1, n_features=2)

# One column per feature, plus the ground-truth label as the last column
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
df = pd.DataFrame(X, columns=feature_names).assign(is_anomaly=y)

# Summary of the class balance
print(
    f"Dataset shape: {df.shape}",
    f"Number of normal samples: {(y == 0).sum()}",
    f"Number of anomalies: {(y == 1).sum()}",
    f"Contamination rate: {y.mean():.2%}",
    "\nFirst few samples:",
    sep="\n",
)
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Basic statistics: numeric summary, null counts per column, and column dtypes
print("Dataset Statistics:", "=" * 50, df.describe(), sep="\n")

print("\nMissing Values:", df.isnull().sum(), sep="\n")

print("\nData Types:", df.dtypes, sep="\n")

In [None]:
# Visualize data distribution on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Separate the two classes once; the scatter and histogram panels reuse them
normal = df[df['is_anomaly'] == 0]
anomalies = df[df['is_anomaly'] == 1]

# Top-left: both classes in the 2D feature space
ax = axes[0, 0]
ax.scatter(normal['feature_0'], normal['feature_1'], c='blue', alpha=0.6, label='Normal')
ax.scatter(
    anomalies['feature_0'], anomalies['feature_1'],
    c='red', alpha=0.8, label='Anomaly', marker='x', s=50,
)
ax.set(xlabel='Feature 0', ylabel='Feature 1', title='Data Distribution in 2D Space')
ax.legend()
ax.grid(True, alpha=0.3)

# Top-right and bottom-left: per-feature density histograms, one panel each
class_styles = ((normal, 30, 'Normal', 'blue'), (anomalies, 20, 'Anomaly', 'red'))
for col, ax in zip(['feature_0', 'feature_1'], [axes[0, 1], axes[1, 0]]):
    for subset, n_bins, name, color in class_styles:
        ax.hist(subset[col], bins=n_bins, alpha=0.7, label=name, color=color, density=True)
    ax.set(xlabel=col, ylabel='Density', title=f'Distribution of {col}')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Bottom-right: correlation between the feature columns (label column excluded)
ax = axes[1, 1]
correlation = df.drop(columns='is_anomaly').corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')

plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Preprocessing: clean the frame, split it, then scale using training data only
from sklearn.model_selection import train_test_split

# Standard scaling, no feature selection (the demo data has only two features)
preprocessor = DataPreprocessor(scaling_method='standard', feature_selection_method=None)

# Drop duplicate rows and rows with missing values
df_clean = preprocessor.clean_data(df, drop_duplicates=True, handle_missing='drop')

# Separate the label column from the feature columns
y_clean = df_clean['is_anomaly'].values
X_clean = df_clean.drop(columns='is_anomaly').values

# 70/30 split, stratified so both parts keep the same anomaly rate
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    test_size=0.3,
    random_state=42,
    stratify=y_clean,
)

# Fit the preprocessor on the training part only, then apply it to the test part
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

print(
    f"Training set shape: {X_train_scaled.shape}",
    f"Test set shape: {X_test_scaled.shape}",
    f"Training anomaly rate: {y_train.mean():.2%}",
    f"Test anomaly rate: {y_test.mean():.2%}",
    sep="\n",
)

## 4. Train Anomaly Detection Models

In [None]:
# Train Isolation Forest with hyperparameter optimization
from sklearn.model_selection import train_test_split

print("Training Isolation Forest with Optuna optimization...")
print("=" * 50)

trainer = ModelTrainer(
    model_type='isolation_forest',
    task_type='anomaly_detection',
    n_trials=20  # Reduced for demo
)

# Carve a validation fold out of the TRAINING data for the hyperparameter
# search. Passing the held-out test set here would let the search tune
# against the very samples that the evaluation cells score afterwards, which
# biases every reported metric optimistically.
# NOTE(review): assumes the 3rd and 4th positional arguments of
# ModelTrainer.optimize are the validation features/labels (the
# "Best validation score" message below suggests so) - confirm against
# src/model_training.py.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Optimize hyperparameters
result = trainer.optimize(X_fit, y_fit, X_val, y_val)

print("\nBest parameters found:")
for param, value in result['best_params'].items():
    print(f"  {param}: {value}")
print(f"\nBest validation score: {result['best_score']:.4f}")

In [None]:
# Make predictions with the tuned model on the held-out test set
model = result['model']
y_pred = model.predict(X_test_scaled)

# Convert predictions to binary (0 = normal, 1 = anomaly)
# Isolation Forest returns -1 for anomalies, 1 for normal
y_pred_binary = (y_pred == -1).astype(int)

# Get anomaly scores
# NOTE(review): if `model` is scikit-learn's IsolationForest, decision_function
# is LOWER for more abnormal samples (negative = outlier). Confirm that
# ModelEvaluator.compute_anomaly_metrics expects this orientation; otherwise
# ranking metrics computed from `scores` would be inverted.
scores = model.decision_function(X_test_scaled)

print(f"Predictions shape: {y_pred_binary.shape}")
print(f"Predicted anomalies: {y_pred_binary.sum()}")
# Labels are created as floats (np.zeros / np.ones), so cast the count to int;
# otherwise it prints as e.g. "30.0" next to the integer predicted count.
print(f"Actual anomalies: {int(y_test.sum())}")

## 5. Model Evaluation

In [None]:
# Score the test-set predictions with the project's evaluator
evaluator = ModelEvaluator(task_type='anomaly_detection')
metrics = evaluator.compute_anomaly_metrics(y_test, y_pred_binary, scores)

print("Anomaly Detection Performance Metrics:")
print("=" * 50)
for metric, value in metrics.items():
    # The confusion matrix and any other array-valued entries are not scalars
    # and cannot be rendered with a float format spec, so skip them here.
    if metric == 'confusion_matrix' or isinstance(value, np.ndarray):
        continue
    print(f"{metric}: {value:.4f}")

In [None]:
# Visualize results: confusion matrix, predicted labels in feature space, score histogram
from matplotlib.patches import Patch

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: confusion matrix (rows = true label, columns = predicted label)
ax = axes[0]
cm = metrics['confusion_matrix']
class_names = ['Normal', 'Anomaly']
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues', ax=ax,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')

# Panel 2: test points in the original (unscaled) feature space, colored by prediction
ax = axes[1]
colors = np.where(y_pred_binary == 0, 'blue', 'red').tolist()
ax.scatter(X_test[:, 0], X_test[:, 1], c=colors, alpha=0.6)
ax.set(xlabel='Feature 0', ylabel='Feature 1', title='Predicted Anomalies')

# The scatter call carries no labels, so build the legend from proxy patches
legend_elements = [
    Patch(facecolor=face, label=text)
    for face, text in (('blue', 'Predicted Normal'), ('red', 'Predicted Anomaly'))
]
ax.legend(handles=legend_elements)

# Panel 3: score distribution, split by the TRUE label
ax = axes[2]
normal_scores, anomaly_scores = scores[y_test == 0], scores[y_test == 1]
score_groups = (
    (normal_scores, 30, 'Normal', 'blue'),
    (anomaly_scores, 20, 'Anomaly', 'red'),
)
for values, n_bins, name, color in score_groups:
    ax.hist(values, bins=n_bins, alpha=0.7, label=name, color=color, density=True)
ax.set(xlabel='Anomaly Score', ylabel='Density', title='Anomaly Score Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Bootstrap Confidence Intervals

In [None]:
# Bootstrap the test-set predictions to get a confidence interval for the score
print("Calculating Bootstrap Confidence Intervals...")
bootstrap_results = evaluator.bootstrap_confidence_intervals(
    y_test,
    y_pred_binary,
    n_bootstraps=100,  # Reduced for demo
    confidence_level=0.95,
)

print(
    f"\nF1 Score: {bootstrap_results['original_score']:.4f}",
    f"95% Confidence Interval: [{bootstrap_results['ci_lower']:.4f}, {bootstrap_results['ci_upper']:.4f}]",
    f"Bootstrap Mean: {bootstrap_results['mean']:.4f}",
    f"Bootstrap Std: {bootstrap_results['std']:.4f}",
    sep="\n",
)

# Histogram of the resampled scores with the point estimate and CI bounds marked
plt.figure(figsize=(10, 5))
plt.hist(bootstrap_results['scores'], bins=30, alpha=0.7, color='blue', edgecolor='black')
reference_lines = (
    ('original_score', 'red', '--', 'Original Score'),
    ('ci_lower', 'green', ':', '95% CI Lower'),
    ('ci_upper', 'green', ':', '95% CI Upper'),
)
for key, color, style, text in reference_lines:
    plt.axvline(bootstrap_results[key], color=color, linestyle=style, linewidth=2, label=text)
plt.xlabel('F1 Score')
plt.ylabel('Frequency')
plt.title('Bootstrap Distribution of F1 Score')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 7. Train and Compare Multiple Models

In [None]:
# Train ensemble of models
from sklearn.metrics import f1_score  # imported once, not on every loop iteration

print("Training ensemble of anomaly detection models...")
print("This may take a few minutes...")
print("=" * 50)

ensemble_results = train_ensemble(
    X_train_scaled, y_train,
    models=['isolation_forest', 'one_class_svm'],  # Reduced for demo
    task_type='anomaly_detection',
    n_trials=10  # Reduced for demo
)

print(f"\nBest model: {ensemble_results['best_model_type']}")

# Compare models on the held-out test set.
# The loop uses its own variable names on purpose: reusing `model`, `y_pred`
# and `y_pred_binary` would silently replace the tuned Isolation Forest and
# its predictions from the earlier cells, and the save cell at the end of the
# notebook would then persist whichever model happened to be iterated last.
model_scores = {}
for model_name, model_info in ensemble_results['models'].items():
    candidate_model = model_info['model']
    candidate_pred = candidate_model.predict(X_test_scaled)
    # Same -1 = anomaly convention as above, mapped to 1 = anomaly
    candidate_pred_binary = (candidate_pred == -1).astype(int)

    score = f1_score(y_test, candidate_pred_binary)
    model_scores[model_name] = score
    print(f"{model_name}: F1 Score = {score:.4f}")

## 8. Drift Detection Simulation

In [None]:
# Simulate data stream with concept drift
def generate_stream_with_drift(n_batches=20, batch_size=50, drift_point=10,
                               n_features=2, shift=2.0, random_state=None):
    """
    Generate a data stream whose distribution shifts at a given batch.

    Batches before ``drift_point`` are standard-normal samples centered at the
    origin; batches from ``drift_point`` onward are the same kind of samples
    with ``shift`` added to them.

    Args:
        n_batches: Number of batches in the stream
        batch_size: Number of samples per batch
        drift_point: Index of the first batch drawn from the shifted distribution
        n_features: Number of features per sample
        shift: Offset added after the drift; a scalar, or an array-like that
            broadcasts against (batch_size, n_features)
        random_state: Optional seed for a private random generator. When None
            (the default) NumPy's global random state is used, as before.

    Returns:
        List of ``n_batches`` arrays, each of shape (batch_size, n_features)
    """
    # A private RandomState keeps the stream reproducible without depending on
    # (or disturbing) whatever state the global generator is in at call time.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    offset = np.asarray(shift, dtype=float)

    stream = []
    for i in range(n_batches):
        X_batch = rng.randn(batch_size, n_features)
        if i >= drift_point:
            # After drift: shifted distribution
            X_batch = X_batch + offset
        stream.append(X_batch)

    return stream

# Build the stream: 20 batches of 50 samples, distribution shifts at batch 10
stream = generate_stream_with_drift(n_batches=20, batch_size=50, drift_point=10)

# Monitor configured with the statistical detection method only
drift_monitor = ConceptDriftMonitor(methods=['statistical'])

# Feed the batches in order and remember the indices at which any detector fired
drift_detected = []
for i, batch in enumerate(stream):
    results = drift_monitor.update_unsupervised(batch)
    if not any(results.values()):
        continue
    drift_detected.append(i)
    print(f"Drift detected at batch {i}!")

# Visualize drift: batches 0-9 on the left, batches 10-14 on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (axis, batches to draw, index of the first batch, panel title)
panels = (
    (axes[0], stream[:10], 0, 'Data Stream Before Drift'),
    (axes[1], stream[10:15], 10, 'Data Stream After Drift'),
)
for ax, batches, first_index, panel_title in panels:
    for i, batch in enumerate(batches):
        # Only the first two batches of each panel get a legend entry
        ax.scatter(
            batch[:, 0], batch[:, 1], alpha=0.3,
            label=f'Batch {i + first_index}' if i < 2 else '',
        )
    ax.set_title(panel_title)
    ax.set_xlabel('Feature 0')
    ax.set_ylabel('Feature 1')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summarize what each detector registered while the stream was processed
drift_report = drift_monitor.get_drift_report()
print("\nDrift Detection Report:", "=" * 50, sep="\n")
for detector_name, info in drift_report['detectors'].items():
    print(
        f"{detector_name}:",
        f"  Number of drifts: {info['n_drifts']}",
        f"  Drift points: {info['drift_points']}",
        sep="\n",
    )

## 9. Load Your Own Data

Use this section to load and analyze your own datasets.

In [None]:
# Example: Load CSV file
# Uncomment the lines below and modify the path to load your own data.
# The example uses its own names (your_data, your_preprocessor) so that running
# it does not overwrite the fitted `preprocessor` used elsewhere in this notebook.
#
# # Load your data
# your_data = pd.read_csv('../data/raw/your_dataset.csv')
#
# # Display basic information
# print(f"Dataset shape: {your_data.shape}")
# print(f"\nFirst few rows:")
# print(your_data.head())
#
# # Preprocess your data
# your_preprocessor = DataPreprocessor(scaling_method='standard')
# your_data_clean = your_preprocessor.clean_data(your_data)
#
# # Continue with analysis...

print("To load your own data:")
print("1. Place your CSV file in the '../data/raw/' directory")
print("2. Uncomment the code above and modify the file path")
print("3. Run the cell to load and analyze your data")

## 10. Save Results and Models

In [None]:
# Save the trained model
import os
import joblib
from datetime import datetime

# Create models directory if it doesn't exist
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# One timestamp shared by every artifact written from this run
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save model
# Take the tuned Isolation Forest from `result` explicitly. The global name
# `model` can be rebound by later cells (e.g. a model-comparison loop), in
# which case a different estimator would be written under this file name.
model_path = f"{models_dir}/isolation_forest_{timestamp}.pkl"
joblib.dump(result['model'], model_path)
print(f"Model saved to: {model_path}")

# Save preprocessor
preprocessor_path = f"{models_dir}/preprocessor_{timestamp}.pkl"
joblib.dump(preprocessor, preprocessor_path)
print(f"Preprocessor saved to: {preprocessor_path}")

# Save evaluation results
results_dir = '../data/processed'
os.makedirs(results_dir, exist_ok=True)

# NOTE(review): the 'metric' column actually holds model names (the keys of
# model_scores) and 'score' holds their F1 scores; the column name is kept
# unchanged in case existing readers of this CSV depend on it.
results_df = pd.DataFrame({
    'metric': list(model_scores.keys()),
    'score': list(model_scores.values())
})
results_path = f"{results_dir}/model_comparison_{timestamp}.csv"
results_df.to_csv(results_path, index=False)
print(f"Results saved to: {results_path}")

## Conclusion

This notebook demonstrated:
1. **Data Generation**: Creating synthetic anomaly detection datasets
2. **Data Exploration**: Visualizing and understanding data patterns
3. **Preprocessing**: Cleaning and scaling data
4. **Model Training**: Using Optuna for hyperparameter optimization
5. **Evaluation**: Computing comprehensive metrics and confidence intervals
6. **Model Comparison**: Training and comparing multiple algorithms
7. **Drift Detection**: Monitoring for concept drift in data streams

### Next Steps:
- Load your own dataset and apply the same analysis pipeline
- Experiment with different preprocessing techniques
- Try other anomaly detection algorithms
- Adjust hyperparameter optimization settings
- Implement real-time anomaly detection for streaming data