## 1. Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# Models / training
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score,
    roc_curve, classification_report, confusion_matrix
)

# Many dependencies have weird warnings/loggings; we disable those since the output
# explodes when put in loops
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

print("All imports successful!")

All imports successful!


## 2. Model Configuration

Using exact model dictionary from step2.ipynb

In [2]:
# Model dictionary with exact hyperparameters from step2.ipynb
model_dict = {
    "Linear Classifier (Logistic Regression)": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVM": SVC(kernel="linear", class_weight='balanced', probability=True),
    "RBF SVM": SVC(kernel='rbf', class_weight='balanced', probability=True),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
}

print(f"Configured {len(model_dict)} models")

Configured 8 models


## 3. Data Loading and Filtering

In [3]:
# Load data
clean_data = pd.read_csv("clean_labeled.csv")
print(f"Loaded {len(clean_data)} rows")
print(f"Columns: {list(clean_data.columns)}")

Loaded 28849 rows
Columns: ['Unnamed: 0.1', 'Unnamed: 0', 'hostname', 'date', 'ping_jitter', 'ping_latency', 'ping_low', 'ping_high', 'day', 'predictions', 'basic_ema_anomaly', 'dspot_anomaly', 'tuned_dspot_anomaly']


In [4]:
# Filter to hosts that have at least one anomaly in ANY detection method
hosts_with_basic_anomaly = clean_data[clean_data['basic_ema_anomaly'] == True]['hostname'].unique()
hosts_with_dspot_anomaly = clean_data[clean_data['dspot_anomaly'] == True]['hostname'].unique()
hosts_with_tuned_dspot_anomaly = clean_data[clean_data['tuned_dspot_anomaly'] == True]['hostname'].unique()

# Union of all hosts with anomalies
hosts_with_any_anomaly = set(hosts_with_basic_anomaly) | set(hosts_with_dspot_anomaly) | set(hosts_with_tuned_dspot_anomaly)

print(f"Hosts with basic_ema_anomaly: {len(hosts_with_basic_anomaly)}")
print(f"Hosts with dspot_anomaly: {len(hosts_with_dspot_anomaly)}")
print(f"Hosts with tuned_dspot_anomaly: {len(hosts_with_tuned_dspot_anomaly)}")
print(f"Total hosts with ANY anomaly: {len(hosts_with_any_anomaly)}")

# Filter dataframe to only these hosts
df = clean_data[clean_data['hostname'].isin(hosts_with_any_anomaly)].copy()
print(f"\nFiltered to {len(df)} rows from {len(hosts_with_any_anomaly)} hosts")

Hosts with basic_ema_anomaly: 38
Hosts with dspot_anomaly: 24
Hosts with tuned_dspot_anomaly: 32
Total hosts with ANY anomaly: 38

Filtered to 28394 rows from 38 hosts


### Train/Test Split

In [5]:
# Convert date column to datetime
df["date"] = pd.to_datetime(df["date"])

# Split into train and test
start_a = pd.Timestamp("2025-04-16")
end_a   = pd.Timestamp("2025-06-20")

start_b = pd.Timestamp("2025-07-01")
end_b   = pd.Timestamp("2025-08-01")

mask_a = (df["date"] >= start_a) & (df["date"] <= end_a)
mask_b = (df["date"] >= start_b) & (df["date"] <= end_b)

train = df[mask_a].copy()
test = df[mask_b].copy()

print(f"Train set: {len(train)} rows")
print(f"Test set: {len(test)} rows")

Train set: 16068 rows
Test set: 12326 rows


## 4. Feature Engineering Functions

Modified to accept a `label_col` parameter to specify which anomaly column to use

In [6]:
def create_lookback_features(lookback_df, label_col='predictions', latency_to_use='ping_latency'):
    """
    Create lookback features for a window of data.
    
    Args:
        lookback_df: DataFrame with historical data
        label_col: Column name to use as labels (e.g., 'basic_ema_anomaly', 'dspot_anomaly')
        latency_to_use: Column name for latency values
    """
    lookback_df = lookback_df.reset_index(drop=True)

    has_anomalies = lookback_df[label_col].any()
    has_normal = (~lookback_df[label_col]).any()
    
    lookback_features = {
        'anomaly_count': lookback_df[label_col].sum(),
        'anomaly_rate': lookback_df[label_col].mean(),
        'recent_anomaly_count': lookback_df[label_col].tail(3).sum(),
        'datapoints_since_anomaly': ((len(lookback_df) - 1 - lookback_df[lookback_df[label_col] == True].index[-1]) if has_anomalies else -1),
        'has_anomaly_history': float(has_anomalies),
        'latency_during_anomalies': (lookback_df[lookback_df[label_col] == True][latency_to_use].mean() if has_anomalies else -1),
        'latency_during_normal': (lookback_df[lookback_df[label_col] == False][latency_to_use].mean() if has_normal else -1),
        'recent_latency_mean': lookback_df[latency_to_use].tail(3).mean(),
        'baseline_latency_mean': lookback_df[latency_to_use].head(5).mean(),
        'recent_vs_baseline': (lookback_df[latency_to_use].tail(3).mean() / lookback_df[latency_to_use].head(5).mean() if lookback_df[latency_to_use].head(5).mean() > 0 else 1.0),
        'recent_latency_max': lookback_df[latency_to_use].tail(3).max(),
        'latency_trend': (lookback_df[latency_to_use].iloc[-1] - lookback_df[latency_to_use].iloc[0]) / len(lookback_df),
        'anomaly_clustering': lookback_df[label_col].rolling(3).sum().max() if len(lookback_df) >= 3 else 0,
        'missing_points': lookback_df[latency_to_use].isna().sum(),
        'completeness': 1 - lookback_df[latency_to_use].isna().mean()
    }
    return lookback_features


def get_feature_df(og_df, label_col='predictions', latency_to_use='ping_latency'):
    """
    Create a feature dataframe with lookback windows.
    
    Args:
        og_df: Original dataframe sorted by date
        label_col: Column name to use as labels
        latency_to_use: Column name for latency values
    """
    initial = create_lookback_features(og_df.iloc[0:10], label_col=label_col, latency_to_use=latency_to_use)
    featured_df = pd.DataFrame(columns=list(initial.keys()) + ['label', 'date', 'hostname'])
    TOL = pd.Timedelta(minutes=2)

    for i, row in og_df.iloc[9:].iterrows():
        end_time = og_df.loc[i, 'date']
        start_time = end_time - pd.Timedelta(hours=30)
        lookback_df = og_df[(og_df['date'] >= start_time + TOL) & (og_df['date'] < end_time - TOL)].copy()
        if len(lookback_df) == 0:
            continue
        lookback_features = create_lookback_features(lookback_df, label_col=label_col, latency_to_use=latency_to_use)
        label = og_df.loc[i, label_col]
        hostname = og_df.loc[i, 'hostname']
        row = {**lookback_features, 'label': label, 'date': end_time, 'hostname': hostname}
        featured_df.loc[len(featured_df)] = row

    return featured_df


def transform_single_df_to_features(df, cur_hostname, label_col='predictions'):
    """
    Transform a single device's data to features.
    
    Args:
        df: Full dataframe
        cur_hostname: Hostname to filter by
        label_col: Column name to use as labels
    """
    host_isolated = df[df['hostname'] == cur_hostname]
    host_isolated = host_isolated.sort_values(by='date', ascending=True)
    return get_feature_df(host_isolated, label_col=label_col, latency_to_use='ping_latency')


print("Feature engineering functions defined")

Feature engineering functions defined


## 5. Model Evaluation Functions

Includes new threshold-based metrics:
- **FPR at 90% Recall**: What false positive rate is needed to catch 90% of anomalies
- **Recall at 10% FPR**: What percentage of anomalies are caught with 10% false positive rate

In [7]:
def calculate_threshold_metrics(y_test, y_proba):
    """
    Calculate threshold-based metrics using ROC curve.
    """
    # Handle edge case
    if y_test.sum() == 0:
        return -1, -1
    
    # Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_proba)
    
    # FPR at 90% Recall (TPR)
    idx_90_recall = np.where(tpr >= 0.90)[0]
    fpr_at_90_recall = fpr[idx_90_recall[0]] if len(idx_90_recall) > 0 else 1.0
    
    # Recall at 10% FPR
    idx_10_fpr = np.where(fpr <= 0.10)[0]
    recall_at_10_fpr = tpr[idx_10_fpr[-1]] if len(idx_10_fpr) > 0 else 0.0
    
    return fpr_at_90_recall, recall_at_10_fpr


def evaluate_model_per_device(X_train, y_train, X_test, y_test, hostname):
    """
    Train all models and evaluate on test set for a single device.
    
    Args:
        X_train, y_train: Training features and labels
        X_test, y_test: Test features and labels
        hostname: Device hostname
    
    Returns:
        List of result dictionaries for each model
    """
    results = []

    for model_name, clf in model_dict.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        try:
            y_proba = clf.predict_proba(X_test)[:, 1]
            has_proba = True
        except:
            y_proba = None
            has_proba = False
        
        # Standard metrics
        accuracy = clf.score(X_test, y_test)
        f1_anomaly = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
        precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        roc_auc = -1
        fpr_at_90_recall = -1
        recall_at_10_fpr = -1
        
        if has_proba:
            roc_auc = roc_auc_score(y_test, y_proba)
            fpr_at_90_recall, recall_at_10_fpr = calculate_threshold_metrics(y_test, y_proba)

        results.append({
            'model': model_name,
            'hostname': hostname,
            'accuracy': accuracy,
            'f1_score': f1_anomaly,
            'precision': precision,
            'recall': recall,
            'roc_auc': roc_auc,
            'fpr_at_90_recall': fpr_at_90_recall,
            'recall_at_10_fpr': recall_at_10_fpr,
            'true_positives': tp,
            'false_positives': fp,
            'true_negatives': tn,
            'false_negatives': fn,
            'caught_anomalies': tp,
            'missed_anomalies': fn,
            'false_alarms': fp,
            'total_test_samples': len(y_test),
            'total_anomalies': y_test.sum()
        })

    return results


print("Model evaluation functions defined")

Model evaluation functions defined


## 6. Main Evaluation Loop

Iterate over all three anomaly detection methods and evaluate per-device models

In [8]:
# Define the three anomaly columns to evaluate
anomaly_columns = ['basic_ema_anomaly', 'dspot_anomaly', 'tuned_dspot_anomaly']

# Store results for each method
all_results_by_method = {}

# Columns to remove when creating features
remove_cols = ['label', 'date', 'hostname']

print("="*80)
print("EVALUATING PER-DEVICE MODELS FOR EACH ANOMALY DETECTION METHOD")
print("="*80)

for label_col in anomaly_columns:
    print(f"\n{'='*80}")
    print(f"ANOMALY DETECTION METHOD: {label_col}")
    print(f"{'='*80}\n")
    
    # Filter to hosts that have anomalies in both train and test for this method
    train_hostnames_with_anomalies = train[train[label_col] == True]['hostname'].unique()
    test_hostnames_with_anomalies = test[test[label_col] == True]['hostname'].unique()
    hostnames_in_both = set(train_hostnames_with_anomalies).intersection(set(test_hostnames_with_anomalies))
    
    print(f"Hosts with anomalies in train: {len(train_hostnames_with_anomalies)}")
    print(f"Hosts with anomalies in test: {len(test_hostnames_with_anomalies)}")
    print(f"Hosts with anomalies in BOTH train and test: {len(hostnames_in_both)}\n")
    
    if len(hostnames_in_both) == 0:
        print(f"WARNING: No hosts have anomalies in both train and test for {label_col}. Skipping.\n")
        continue
    
    # Filter train and test to only these hosts
    train_filtered = train[train['hostname'].isin(hostnames_in_both)]
    test_filtered = test[test['hostname'].isin(hostnames_in_both)]
    
    # Evaluate models for each host
    method_results = []
    
    for i, cur_hostname in enumerate(hostnames_in_both, 1):
        print(f"[{i}/{len(hostnames_in_both)}] Host: {cur_hostname}")
        
        # Create features using this label column
        train_single_w_lookback = transform_single_df_to_features(train_filtered, cur_hostname, label_col=label_col)
        test_single_w_lookback = transform_single_df_to_features(test_filtered, cur_hostname, label_col=label_col)
        
        # Prepare X and y
        X_train = train_single_w_lookback.drop(columns=remove_cols)
        X_test = test_single_w_lookback.drop(columns=remove_cols)
        y_train = train_single_w_lookback["label"].astype(int)
        y_test = test_single_w_lookback["label"].astype(int)
        
        # Evaluate models
        hostname_results = evaluate_model_per_device(X_train, y_train, X_test, y_test, cur_hostname)
        
        method_results.extend(hostname_results)
    
    # Store results for this method
    all_results_by_method[label_col] = pd.DataFrame(method_results)

    print(f"\nCompleted {label_col}\n")
    print("="*80)

print("EVALUATION COMPLETE")
print("\n" + "="*80)

EVALUATING PER-DEVICE MODELS FOR EACH ANOMALY DETECTION METHOD

ANOMALY DETECTION METHOD: basic_ema_anomaly

Hosts with anomalies in train: 37
Hosts with anomalies in test: 35
Hosts with anomalies in BOTH train and test: 34


[1/34] Host: d493afd

[2/34] Host: 6ca8355

[3/34] Host: 9840de6

[4/34] Host: 575f518

[5/34] Host: b340432

[6/34] Host: b2c53ee

[7/34] Host: b407ebe

[8/34] Host: dede9dc

[9/34] Host: b5c8445

[10/34] Host: 33fe84e

[11/34] Host: 592a43c

[12/34] Host: 64b750b

[13/34] Host: 2620a05

[14/34] Host: 24a22bf

[15/34] Host: a2e0486

[16/34] Host: 7f6d63d

[17/34] Host: 8445893

[18/34] Host: 9dc32f2

[19/34] Host: 0f42441

[20/34] Host: f8f4b44

[21/34] Host: c073f39

[22/34] Host: da6d469

[23/34] Host: ed86ea2

[24/34] Host: 5bf17fc

[25/34] Host: 1a21874

[26/34] Host: 9ab8252

[27/34] Host: 953d46d

[28/34] Host: 38b6bf0

[29/34] Host: 972f622

[30/34] Host: 63598f8

[31/34] Host: 43e847f

[32/34] Host: 5c5004f

[33/34] Host: 25b3303

[34/34] Host: 29129b6

C

## 7. Results Summary

Display aggregate statistics for each anomaly detection method

In [9]:
print("\n" + "="*80)
print("AGGREGATE RESULTS BY ANOMALY DETECTION METHOD")
print("="*80 + "\n")

for label_col, results_df in all_results_by_method.items():
    print(f"\n{'='*80}")
    print(f"METHOD: {label_col}")
    print(f"{'='*80}\n")
    
    # Group by model and calculate mean metrics
    grouped = results_df.groupby('model').agg({
        'accuracy': 'mean',
        'fpr_at_90_recall': 'mean',
        'recall_at_10_fpr': 'mean',
    }).round(4)
    
    print("Key Metrics (averaged across devices):")
    print(grouped.to_string())
    
    print("\n" + "-"*80)
    print("Best Models:")
    print("-"*80)
    best_acc = grouped['accuracy'].idxmax()
    best_fpr_90 = grouped['fpr_at_90_recall'].idxmin()  # Lower is better
    best_recall_10 = grouped['recall_at_10_fpr'].idxmax()  # Higher is better
    
    print(f"Best Accuracy: {best_acc} ({grouped.loc[best_acc, 'accuracy']:.4f})")
    print(f"Best FPR at 90% Recall: {best_fpr_90} ({grouped.loc[best_fpr_90, 'fpr_at_90_recall']:.4f})")
    print(f"Best Recall at 10% FPR: {best_recall_10} ({grouped.loc[best_recall_10, 'recall_at_10_fpr']:.4f})")
    
    print("\n")


AGGREGATE RESULTS BY ANOMALY DETECTION METHOD


METHOD: basic_ema_anomaly

Key Metrics (averaged across devices):
                                         accuracy  fpr_at_90_recall  recall_at_10_fpr
model                                                                                
AdaBoost                                   0.8345            0.8752            0.1315
Decision Tree                              0.6737            0.9958            0.0357
Linear Classifier (Logistic Regression)    0.6529            0.7983            0.1566
Linear SVM                                 0.6037            0.8334            0.1527
Naive Bayes                                0.7543            0.8654            0.2127
Nearest Neighbors                          0.8400            1.0000            0.0965
RBF SVM                                    0.5632            0.8265            0.1885
Random Forest                              0.8690            0.8593            0.1617

------------------------

## 8. Save Results to CSV

In [10]:
# Save detailed results for each method
for label_col, results_df in all_results_by_method.items():
    filename = f"per_device_results_{label_col}.csv"
    results_df.to_csv(filename, index=False)
    print(f"Saved results for {label_col} to {filename}")

print("\nAll results saved successfully!")

Saved results for basic_ema_anomaly to per_device_results_basic_ema_anomaly.csv
Saved results for dspot_anomaly to per_device_results_dspot_anomaly.csv
Saved results for tuned_dspot_anomaly to per_device_results_tuned_dspot_anomaly.csv

All results saved successfully!


## 9. Exploratory: Multi-Device Models

Train models on ALL data combined (not split per device) and evaluate with the same metrics

## Summary

This notebook evaluated per-device ML models on three different anomaly detection methods:
1. **basic_ema_anomaly**: Basic EMA-based detection
2. **dspot_anomaly**: DSPOT detection
3. **tuned_dspot_anomaly**: Tuned DSPOT detection

For each method, we trained 8 different ML models per device and calculated:
- Standard classification metrics (accuracy, F1, precision, recall, ROC-AUC)
- **FPR at 90% Recall**: Answers "how many false alarms to catch 90% of issues?"
- **Recall at 10% FPR**: Answers "how many issues caught with 10% false alarm rate?"

The threshold-based metrics are particularly useful for understanding:
- **Customers who want to catch as many anomalies as possible**: Look at FPR at 90% Recall
- **Customers who want to minimize false alarms**: Look at Recall at 10% FPR

### Multi-Device Feature Engineering

Create features from all devices combined for each anomaly detection method

In [11]:
def transform_multidevice_df_to_features(df, label_col='predictions'):
    """
    Create features for all devices combined.
    
    Args:
        df: Full dataframe with multiple devices
        label_col: Column name to use as labels
    """
    hostnames = df["hostname"].unique().tolist()
    featured_dfs = []
    
    for cur_hostname in hostnames:
        host_isolated = df[df['hostname'] == cur_hostname]
        host_isolated = host_isolated.sort_values(by='date', ascending=True)
        feature_df = get_feature_df(host_isolated, label_col=label_col, latency_to_use='ping_latency')
        featured_dfs.append(feature_df)
    
    final_df = pd.concat(featured_dfs, ignore_index=True)
    return final_df

print("Multi-device feature engineering function defined")

Multi-device feature engineering function defined


### Evaluate Multi-Device Models

Train on all data combined and evaluate with the same metrics as per-device models

In [12]:
# Store multi-device results
all_multidevice_results = {}

print("="*80)
print("EVALUATING MULTI-DEVICE MODELS (TRAINED ON ALL DATA COMBINED)")
print("="*80)

for label_col in anomaly_columns:
    print(f"\n{'='*80}")
    print(f"MULTI-DEVICE MODEL FOR: {label_col}")
    print(f"{'='*80}\n")
    
    # Use the same filtered train/test as per-device models
    # Get hosts with anomalies in both train and test
    train_hostnames_with_anomalies = train[train[label_col] == True]['hostname'].unique()
    test_hostnames_with_anomalies = test[test[label_col] == True]['hostname'].unique()
    hostnames_in_both = set(train_hostnames_with_anomalies).intersection(set(test_hostnames_with_anomalies))
    
    if len(hostnames_in_both) == 0:
        print(f"WARNING: No hosts have anomalies in both train and test for {label_col}. Skipping.\n")
        continue
    
    # Filter to hosts with anomalies in both
    train_filtered = train[train['hostname'].isin(hostnames_in_both)]
    test_filtered = test[test['hostname'].isin(hostnames_in_both)]
    
    # Create features for ALL devices combined
    train_multi_w_lookback = transform_multidevice_df_to_features(train_filtered, label_col=label_col)
    test_multi_w_lookback = transform_multidevice_df_to_features(test_filtered, label_col=label_col)
    
    # Prepare X and y
    X_train = train_multi_w_lookback.drop(columns=remove_cols)
    X_test = test_multi_w_lookback.drop(columns=remove_cols)
    y_train = train_multi_w_lookback["label"].astype(int)
    y_test = test_multi_w_lookback["label"].astype(int)
    
    # Normalize features based on training data only (no leakage)
    train_mean = X_train.mean()
    train_std = X_train.std()
    X_train_normalized = (X_train - train_mean) / train_std
    X_test_normalized = (X_test - train_mean) / train_std  # Use train stats
    
    # Fill NaNs with 0 (from division by zero std)
    X_train_normalized = X_train_normalized.fillna(0)
    X_test_normalized = X_test_normalized.fillna(0)
    
    print(f"Train: {len(X_train)} samples, {y_train.sum()} anomalies ({y_train.mean()*100:.1f}%)")
    print(f"Test: {len(X_test)} samples, {y_test.sum()} anomalies ({y_test.mean()*100:.1f}%)")
    
    # Train and evaluate models
    results = []
    
    for model_name, clf in model_dict.items():
        clf.fit(X_train_normalized, y_train)
        y_pred = clf.predict(X_test_normalized)
        
        try:
            y_proba = clf.predict_proba(X_test_normalized)[:, 1]
            has_proba = True
        except:
            y_proba = None
            has_proba = False
        
        # Standard metrics
        accuracy = clf.score(X_test_normalized, y_test)
        f1_anomaly = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
        precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
        
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        roc_auc = -1
        fpr_at_90_recall = -1
        recall_at_10_fpr = -1
        
        if has_proba:
            roc_auc = roc_auc_score(y_test, y_proba)
            fpr_at_90_recall, recall_at_10_fpr = calculate_threshold_metrics(y_test, y_proba)
        
        results.append({
            'model': model_name,
            'accuracy': accuracy,
            'f1_score': f1_anomaly,
            'precision': precision,
            'recall': recall,
            'roc_auc': roc_auc,
            'fpr_at_90_recall': fpr_at_90_recall,
            'recall_at_10_fpr': recall_at_10_fpr,
            'true_positives': tp,
            'false_positives': fp,
            'true_negatives': tn,
            'false_negatives': fn,
            'caught_anomalies': tp,
            'missed_anomalies': fn,
            'false_alarms': fp,
            'total_test_samples': len(y_test),
            'total_anomalies': y_test.sum()
        })
    
    all_multidevice_results[label_col] = pd.DataFrame(results)
    
    # Print results
    results_df = pd.DataFrame(results)
    print("\nResults:")
    for _, row in results_df.iterrows():
        print(f"  {row['model']:45s} | Acc: {row['accuracy']:.3f} | FPR@90%R: {row['fpr_at_90_recall']:.3f} | R@10%FPR: {row['recall_at_10_fpr']:.3f}")
    print()

print("\n" + "="*80)
print("MULTI-DEVICE EVALUATION COMPLETE")
print("="*80)

EVALUATING MULTI-DEVICE MODELS (TRAINED ON ALL DATA COMBINED)

MULTI-DEVICE MODEL FOR: basic_ema_anomaly

Train: 14548 samples, 1769 anomalies (12.2%)
Test: 11071 samples, 1455 anomalies (13.1%)

Results:
  Linear Classifier (Logistic Regression)       | Acc: 0.534 | FPR@90%R: 0.689 | R@10%FPR: 0.293
  Nearest Neighbors                             | Acc: 0.838 | FPR@90%R: 1.000 | R@10%FPR: 0.101
  Linear SVM                                    | Acc: 0.435 | FPR@90%R: 0.689 | R@10%FPR: 0.217
  RBF SVM                                       | Acc: 0.551 | FPR@90%R: 0.747 | R@10%FPR: 0.188
  Decision Tree                                 | Acc: 0.591 | FPR@90%R: 0.745 | R@10%FPR: 0.175
  Random Forest                                 | Acc: 0.867 | FPR@90%R: 0.727 | R@10%FPR: 0.219
  AdaBoost                                      | Acc: 0.868 | FPR@90%R: 0.720 | R@10%FPR: 0.240
  Naive Bayes                                   | Acc: 0.737 | FPR@90%R: 0.703 | R@10%FPR: 0.277


MULTI-DEVICE MODE

### Multi-Device Results Summary

In [13]:
print("\n" + "="*80)
print("MULTI-DEVICE MODEL RESULTS")
print("="*80 + "\n")

for label_col, results_df in all_multidevice_results.items():
    print(f"\n{'='*80}")
    print(f"METHOD: {label_col} (Multi-Device Model)")
    print(f"{'='*80}\n")
    
    print("Key Metrics:")
    print(results_df[['model', 'accuracy', 'fpr_at_90_recall', 'recall_at_10_fpr']].to_string(index=False))
    
    print("\n" + "-"*80)
    print("Best Models:")
    print("-"*80)
    best_acc_idx = results_df['accuracy'].idxmax()
    best_fpr_90_idx = results_df['fpr_at_90_recall'].idxmin()
    best_recall_10_idx = results_df['recall_at_10_fpr'].idxmax()
    
    print(f"Best Accuracy: {results_df.loc[best_acc_idx, 'model']} ({results_df.loc[best_acc_idx, 'accuracy']:.4f})")
    print(f"Best FPR at 90% Recall: {results_df.loc[best_fpr_90_idx, 'model']} ({results_df.loc[best_fpr_90_idx, 'fpr_at_90_recall']:.4f})")
    print(f"Best Recall at 10% FPR: {results_df.loc[best_recall_10_idx, 'model']} ({results_df.loc[best_recall_10_idx, 'recall_at_10_fpr']:.4f})")
    
    print("\n")


MULTI-DEVICE MODEL RESULTS


METHOD: basic_ema_anomaly (Multi-Device Model)

Key Metrics:
                                  model  accuracy  fpr_at_90_recall  recall_at_10_fpr
Linear Classifier (Logistic Regression)  0.534188          0.689164          0.292784
                      Nearest Neighbors  0.837684          1.000000          0.101031
                             Linear SVM  0.434830          0.689060          0.217182
                                RBF SVM  0.550628          0.747400          0.187629
                          Decision Tree  0.591094          0.745320          0.175258
                          Random Forest  0.866950          0.727225          0.218557
                               AdaBoost  0.867943          0.719738          0.239863
                            Naive Bayes  0.736609          0.702891          0.276976

--------------------------------------------------------------------------------
Best Models:
----------------------------------------

### Save Multi-Device Results

In [14]:
# Save multi-device results
for label_col, results_df in all_multidevice_results.items():
    filename = f"multidevice_results_{label_col}.csv"
    results_df.to_csv(filename, index=False)
    print(f"Saved multi-device results for {label_col} to {filename}")

print("\nAll multi-device results saved successfully!")

Saved multi-device results for basic_ema_anomaly to multidevice_results_basic_ema_anomaly.csv
Saved multi-device results for dspot_anomaly to multidevice_results_dspot_anomaly.csv
Saved multi-device results for tuned_dspot_anomaly to multidevice_results_tuned_dspot_anomaly.csv

All multi-device results saved successfully!


### Comparison: Per-Device vs Multi-Device Models

Compare average performance of per-device models with multi-device models

In [15]:
print("\n" + "="*80)
print("COMPARISON: PER-DEVICE vs MULTI-DEVICE MODELS")
print("="*80 + "\n")

for label_col in anomaly_columns:
    if label_col not in all_results_by_method or label_col not in all_multidevice_results:
        continue
    
    print(f"\n{'='*80}")
    print(f"METHOD: {label_col}")
    print(f"{'='*80}\n")
    
    # Per-device average
    per_device_df = all_results_by_method[label_col]
    per_device_avg = per_device_df.groupby('model').agg({
        'accuracy': 'mean',
        'fpr_at_90_recall': 'mean',
        'recall_at_10_fpr': 'mean'
    }).round(4)
    
    # Multi-device
    multi_device_df = all_multidevice_results[label_col].set_index('model')
    
    # Compare
    comparison = pd.DataFrame({
        'Model': per_device_avg.index,
        'Per-Device Acc': per_device_avg['accuracy'].values,
        'Multi-Device Acc': multi_device_df.loc[per_device_avg.index, 'accuracy'].values,
        'Per-Device FPR@90%R': per_device_avg['fpr_at_90_recall'].values,
        'Multi-Device FPR@90%R': multi_device_df.loc[per_device_avg.index, 'fpr_at_90_recall'].values,
        'Per-Device R@10%FPR': per_device_avg['recall_at_10_fpr'].values,
        'Multi-Device R@10%FPR': multi_device_df.loc[per_device_avg.index, 'recall_at_10_fpr'].values,
    })
    
    print("Accuracy Comparison:")
    print(comparison[['Model', 'Per-Device Acc', 'Multi-Device Acc']].to_string(index=False))
    
    print("\n" + "-"*80)
    print("FPR at 90% Recall Comparison (lower is better):")
    print(comparison[['Model', 'Per-Device FPR@90%R', 'Multi-Device FPR@90%R']].to_string(index=False))
    
    print("\n" + "-"*80)
    print("Recall at 10% FPR Comparison (higher is better):")
    print(comparison[['Model', 'Per-Device R@10%FPR', 'Multi-Device R@10%FPR']].to_string(index=False))
    
    print("\n")


COMPARISON: PER-DEVICE vs MULTI-DEVICE MODELS


METHOD: basic_ema_anomaly

Accuracy Comparison:
                                  Model  Per-Device Acc  Multi-Device Acc
                               AdaBoost          0.8345          0.867943
                          Decision Tree          0.6737          0.591094
Linear Classifier (Logistic Regression)          0.6529          0.534188
                             Linear SVM          0.6037          0.434830
                            Naive Bayes          0.7543          0.736609
                      Nearest Neighbors          0.8400          0.837684
                                RBF SVM          0.5632          0.550628
                          Random Forest          0.8690          0.866950

--------------------------------------------------------------------------------
FPR at 90% Recall Comparison (lower is better):
                                  Model  Per-Device FPR@90%R  Multi-Device FPR@90%R
                       