## 1. Imports

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Date/time handling
from datetime import datetime, timedelta

# Plotting
import matplotlib.pyplot as plt

# Sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Sklearn utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score,
    classification_report, confusion_matrix
)

print("All imports successful!")

All imports successful!


## 2. Model Configuration

Using exact model dictionary from step2.ipynb

In [2]:
# Model dictionary with exact hyperparameters from step2.ipynb
model_dict = {
    "Linear Classifier (Logistic Regression)": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Nearest Neighbors": KNeighborsClassifier(3),
    "Linear SVM": SVC(kernel="linear", class_weight='balanced', probability=True),
    "RBF SVM": SVC(kernel='rbf', class_weight='balanced', probability=True),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
}

print(f"Configured {len(model_dict)} models")

Configured 8 models


## 3. Data Loading and Filtering

In [3]:
# Load data
clean_data = pd.read_csv("clean_labeled.csv")
print(f"Loaded {len(clean_data)} rows")
print(f"Columns: {list(clean_data.columns)}")

Loaded 28849 rows
Columns: ['Unnamed: 0.1', 'Unnamed: 0', 'hostname', 'date', 'ping_jitter', 'ping_latency', 'ping_low', 'ping_high', 'day', 'predictions', 'basic_ema_anomaly', 'dspot_anomaly', 'tuned_dspot_anomaly']


In [4]:
# Filter to hosts that have at least one anomaly in ANY detection method
hosts_with_basic_anomaly = clean_data[clean_data['basic_ema_anomaly'] == True]['hostname'].unique()
hosts_with_dspot_anomaly = clean_data[clean_data['dspot_anomaly'] == True]['hostname'].unique()
hosts_with_tuned_dspot_anomaly = clean_data[clean_data['tuned_dspot_anomaly'] == True]['hostname'].unique()

# Union of all hosts with anomalies
hosts_with_any_anomaly = set(hosts_with_basic_anomaly) | set(hosts_with_dspot_anomaly) | set(hosts_with_tuned_dspot_anomaly)

print(f"Hosts with basic_ema_anomaly: {len(hosts_with_basic_anomaly)}")
print(f"Hosts with dspot_anomaly: {len(hosts_with_dspot_anomaly)}")
print(f"Hosts with tuned_dspot_anomaly: {len(hosts_with_tuned_dspot_anomaly)}")
print(f"Total hosts with ANY anomaly: {len(hosts_with_any_anomaly)}")

# Filter dataframe to only these hosts
df = clean_data[clean_data['hostname'].isin(hosts_with_any_anomaly)].copy()
print(f"\nFiltered to {len(df)} rows from {len(hosts_with_any_anomaly)} hosts")

Hosts with basic_ema_anomaly: 38
Hosts with dspot_anomaly: 24
Hosts with tuned_dspot_anomaly: 32
Total hosts with ANY anomaly: 38

Filtered to 28394 rows from 38 hosts


### Apply Time Filtering

Remove gaps greater than 2 hours 15 minutes

In [5]:
# Time filtering: remove gaps > 2h 15min
hostnames = df["hostname"].unique().tolist()
time_filtered_df = []

for host in hostnames:
    hostdf = df[df['hostname'] == host].copy()
    hostdf["date"] = pd.to_datetime(hostdf["date"])
    hostdf = hostdf.sort_values(by='date', ascending=True)
    hostdf['time_diff'] = hostdf['date'] - (hostdf['date'].shift(1))
    mask = ((hostdf['time_diff'] >= timedelta(hours=2, minutes=15)) | (hostdf['time_diff'].isna()))
    filtered_host_df = hostdf[mask]
    time_filtered_df.append(filtered_host_df)

time_filtered_df = pd.concat(time_filtered_df, ignore_index=True)
print(f"After time filtering: {len(time_filtered_df)} rows")

After time filtering: 19804 rows


### Train/Test Split

In [6]:
# Split into train and test
tdf = time_filtered_df

start_a = pd.Timestamp("2025-04-16")
end_a   = pd.Timestamp("2025-06-20")

start_b = pd.Timestamp("2025-07-01")
end_b   = pd.Timestamp("2025-08-01")

mask_a = (tdf["date"] >= start_a) & (tdf["date"] <= end_a)
mask_b = (tdf["date"] >= start_b) & (tdf["date"] <= end_b)

train = tdf[mask_a].copy()
test = tdf[mask_b].copy()

print(f"Train set: {len(train)} rows")
print(f"Test set: {len(test)} rows")

Train set: 11525 rows
Test set: 8279 rows


### Normalize Latency Values per Host

In [7]:
# Train normalization - normalizing using Z-score for each hostname
train["normalized_latency"] = train.groupby("hostname")["ping_latency"].transform(
    lambda x: (x - x.mean()) / x.std()
)
train["normalized_latency"] = train["normalized_latency"].fillna(0)
train["normalized_latency"] = train["normalized_latency"] - train["normalized_latency"].min()

# Test normalization
test["normalized_latency"] = test.groupby("hostname")["ping_latency"].transform(
    lambda x: (x - x.mean()) / x.std()
)
test["normalized_latency"] = test["normalized_latency"].fillna(0)
test["normalized_latency"] = test["normalized_latency"] - test["normalized_latency"].min()

print("Normalization complete")

Normalization complete


## 4. Feature Engineering Functions

Modified to accept a `label_col` parameter to specify which anomaly column to use

In [8]:
def create_lookback_features(lookback_df, label_col='predictions', latency_to_use='ping_latency'):
    """
    Create lookback features for a window of data.
    
    Args:
        lookback_df: DataFrame with historical data
        label_col: Column name to use as labels (e.g., 'basic_ema_anomaly', 'dspot_anomaly')
        latency_to_use: Column name for latency values
    """
    lookback_df = lookback_df.reset_index(drop=True)

    has_anomalies = lookback_df[label_col].any()
    has_normal = (~lookback_df[label_col]).any()
    
    lookback_features = {
        'anomaly_count': lookback_df[label_col].sum(),
        'anomaly_rate': lookback_df[label_col].mean(),
        'recent_anomaly_count': lookback_df[label_col].tail(3).sum(),
        'datapoints_since_anomaly': ((len(lookback_df) - 1 - lookback_df[lookback_df[label_col] == True].index[-1]) if has_anomalies else -1),
        'has_anomaly_history': float(has_anomalies),
        'latency_during_anomalies': (lookback_df[lookback_df[label_col] == True][latency_to_use].mean() if has_anomalies else -1),
        'latency_during_normal': (lookback_df[lookback_df[label_col] == False][latency_to_use].mean() if has_normal else -1),
        'recent_latency_mean': lookback_df[latency_to_use].tail(3).mean(),
        'baseline_latency_mean': lookback_df[latency_to_use].head(5).mean(),
        'recent_vs_baseline': (lookback_df[latency_to_use].tail(3).mean() / lookback_df[latency_to_use].head(5).mean() if lookback_df[latency_to_use].head(5).mean() > 0 else 1.0),
        'recent_latency_max': lookback_df[latency_to_use].tail(3).max(),
        'latency_trend': (lookback_df[latency_to_use].iloc[-1] - lookback_df[latency_to_use].iloc[0]) / len(lookback_df),
        'anomaly_clustering': lookback_df[label_col].rolling(3).sum().max() if len(lookback_df) >= 3 else 0,
        'missing_points': lookback_df[latency_to_use].isna().sum(),
        'completeness': 1 - lookback_df[latency_to_use].isna().mean()
    }
    return lookback_features


def get_feature_df(og_df, label_col='predictions', latency_to_use='ping_latency'):
    """
    Create a feature dataframe with lookback windows.
    
    Args:
        og_df: Original dataframe sorted by date
        label_col: Column name to use as labels
        latency_to_use: Column name for latency values
    """
    initial = create_lookback_features(og_df.iloc[0:10], label_col=label_col, latency_to_use=latency_to_use)
    featured_df = pd.DataFrame(columns=list(initial.keys()) + ['label', 'date', 'hostname'])
    TOL = pd.Timedelta(minutes=2)

    for i, row in og_df.iloc[9:].iterrows():
        end_time = og_df.loc[i, 'date']
        start_time = end_time - pd.Timedelta(hours=30)
        lookback_df = og_df[(og_df['date'] >= start_time + TOL) & (og_df['date'] < end_time - TOL)].copy()
        if len(lookback_df) == 0:
            continue
        lookback_features = create_lookback_features(lookback_df, label_col=label_col, latency_to_use=latency_to_use)
        label = og_df.loc[i, label_col]
        hostname = og_df.loc[i, 'hostname']
        row = {**lookback_features, 'label': label, 'date': end_time, 'hostname': hostname}
        featured_df.loc[len(featured_df)] = row

    return featured_df


def transform_single_df_to_features(df, cur_hostname, label_col='predictions'):
    """
    Transform a single device's data to features.
    
    Args:
        df: Full dataframe
        cur_hostname: Hostname to filter by
        label_col: Column name to use as labels
    """
    host_isolated = df[df['hostname'] == cur_hostname]
    host_isolated = host_isolated.sort_values(by='date', ascending=True)
    return get_feature_df(host_isolated, label_col=label_col, latency_to_use='ping_latency')


print("Feature engineering functions defined")

Feature engineering functions defined


## 5. Model Evaluation Functions

Includes new threshold-based metrics:
- **FPR at 90% Recall**: What false positive rate is needed to catch 90% of anomalies
- **Recall at 10% FPR**: What percentage of anomalies are caught with 10% false positive rate

In [9]:
def calculate_threshold_metrics(y_test, y_proba):
    """
    Calculate threshold-based metrics:
    - FPR at 90% Recall: What FPR is needed to catch 90% of anomalies
    - Recall at 10% FPR: What recall is achieved at 10% FPR
    
    Args:
        y_test: True labels
        y_proba: Predicted probabilities for positive class
    
    Returns:
        fpr_at_90_recall: FPR when recall is 90%
        recall_at_10_fpr: Recall when FPR is 10%
    """
    # Sort by probability descending
    sorted_indices = np.argsort(-y_proba)
    y_test_sorted = y_test.iloc[sorted_indices].values if hasattr(y_test, 'iloc') else y_test[sorted_indices]
    
    total_positives = y_test_sorted.sum()
    total_negatives = len(y_test_sorted) - total_positives
    
    # Edge case: no anomalies
    if total_positives == 0:
        return -1, -1
    
    # Calculate cumulative metrics
    cumulative_tp = np.cumsum(y_test_sorted)
    cumulative_fp = np.cumsum(1 - y_test_sorted)
    
    # Calculate recall and FPR at each threshold
    recalls = cumulative_tp / total_positives
    fprs = cumulative_fp / total_negatives if total_negatives > 0 else np.zeros_like(cumulative_fp)
    
    # FPR at 90% Recall
    target_recall = 0.90
    idx_90_recall = np.where(recalls >= target_recall)[0]
    if len(idx_90_recall) > 0:
        fpr_at_90_recall = fprs[idx_90_recall[0]]
    else:
        fpr_at_90_recall = 1.0  # Would need 100% FPR to reach 90% recall
    
    # Recall at 10% FPR
    target_fpr = 0.10
    idx_10_fpr = np.where(fprs <= target_fpr)[0]
    if len(idx_10_fpr) > 0:
        recall_at_10_fpr = recalls[idx_10_fpr[-1]]  # Last index where FPR <= 10%
    else:
        recall_at_10_fpr = 0.0  # Can't achieve any recall at 10% FPR
    
    return fpr_at_90_recall, recall_at_10_fpr


def evaluate_model_per_device(X_train, y_train, X_test, y_test, hostname):
    """
    Train all models and evaluate on test set for a single device.
    
    Args:
        X_train, y_train: Training features and labels
        X_test, y_test: Test features and labels
        hostname: Device hostname
    
    Returns:
        List of result dictionaries for each model
    """
    results = []

    for model_name, clf in model_dict.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        try:
            y_proba = clf.predict_proba(X_test)[:, 1]
            has_proba = True
        except:
            y_proba = None
            has_proba = False
        
        # Standard metrics
        accuracy = clf.score(X_test, y_test)
        f1_anomaly = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
        precision = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        roc_auc = -1
        fpr_at_90_recall = -1
        recall_at_10_fpr = -1
        
        if has_proba:
            roc_auc = roc_auc_score(y_test, y_proba)
            fpr_at_90_recall, recall_at_10_fpr = calculate_threshold_metrics(y_test, y_proba)

        results.append({
            'model': model_name,
            'hostname': hostname,
            'accuracy': accuracy,
            'f1_score': f1_anomaly,
            'precision': precision,
            'recall': recall,
            'roc_auc': roc_auc,
            'fpr_at_90_recall': fpr_at_90_recall,
            'recall_at_10_fpr': recall_at_10_fpr,
            'true_positives': tp,
            'false_positives': fp,
            'true_negatives': tn,
            'false_negatives': fn,
            'caught_anomalies': tp,
            'missed_anomalies': fn,
            'false_alarms': fp,
            'total_test_samples': len(y_test),
            'total_anomalies': y_test.sum()
        })

    return results


print("Model evaluation functions defined")

Model evaluation functions defined


## 6. Main Evaluation Loop

Iterate over all three anomaly detection methods and evaluate per-device models

In [10]:
# Define the three anomaly columns to evaluate
anomaly_columns = ['basic_ema_anomaly', 'dspot_anomaly', 'tuned_dspot_anomaly']

# Store results for each method
all_results_by_method = {}

# Columns to remove when creating features
remove_cols = ['label', 'date', 'hostname']

print("="*80)
print("EVALUATING PER-DEVICE MODELS FOR EACH ANOMALY DETECTION METHOD")
print("="*80)

for label_col in anomaly_columns:
    print(f"\n{'='*80}")
    print(f"ANOMALY DETECTION METHOD: {label_col}")
    print(f"{'='*80}\n")
    
    # Filter to hosts that have anomalies in both train and test for this method
    train_hostnames_with_anomalies = train[train[label_col] == True]['hostname'].unique()
    test_hostnames_with_anomalies = test[test[label_col] == True]['hostname'].unique()
    hostnames_in_both = set(train_hostnames_with_anomalies).intersection(set(test_hostnames_with_anomalies))
    
    print(f"Hosts with anomalies in train: {len(train_hostnames_with_anomalies)}")
    print(f"Hosts with anomalies in test: {len(test_hostnames_with_anomalies)}")
    print(f"Hosts with anomalies in BOTH train and test: {len(hostnames_in_both)}\n")
    
    if len(hostnames_in_both) == 0:
        print(f"WARNING: No hosts have anomalies in both train and test for {label_col}. Skipping.\n")
        continue
    
    # Filter train and test to only these hosts
    train_filtered = train[train['hostname'].isin(hostnames_in_both)]
    test_filtered = test[test['hostname'].isin(hostnames_in_both)]
    
    # Evaluate models for each host
    method_results = []
    
    for i, cur_hostname in enumerate(hostnames_in_both, 1):
        print(f"[{i}/{len(hostnames_in_both)}] Evaluating models for hostname: {cur_hostname}")
        
        # Create features using this label column
        train_single_w_lookback = transform_single_df_to_features(train_filtered, cur_hostname, label_col=label_col)
        test_single_w_lookback = transform_single_df_to_features(test_filtered, cur_hostname, label_col=label_col)
        
        # Prepare X and y
        X_train = train_single_w_lookback.drop(columns=remove_cols)
        X_test = test_single_w_lookback.drop(columns=remove_cols)
        y_train = train_single_w_lookback["label"].astype(int)
        y_test = test_single_w_lookback["label"].astype(int)
        
        # Evaluate models
        hostname_results = evaluate_model_per_device(X_train, y_train, X_test, y_test, cur_hostname)
        method_results.extend(hostname_results)
    
    # Store results for this method
    all_results_by_method[label_col] = pd.DataFrame(method_results)
    print(f"\nCompleted evaluation for {label_col}: {len(method_results)} total evaluations\n")

print("\n" + "="*80)
print("EVALUATION COMPLETE")
print("="*80)

EVALUATING PER-DEVICE MODELS FOR EACH ANOMALY DETECTION METHOD

ANOMALY DETECTION METHOD: basic_ema_anomaly

Hosts with anomalies in train: 36
Hosts with anomalies in test: 35
Hosts with anomalies in BOTH train and test: 34

[1/34] Evaluating models for hostname: 29129b6




[2/34] Evaluating models for hostname: 972f622




[3/34] Evaluating models for hostname: 5bf17fc




[4/34] Evaluating models for hostname: f8f4b44




[5/34] Evaluating models for hostname: dede9dc




[6/34] Evaluating models for hostname: 5c5004f




[7/34] Evaluating models for hostname: 64b750b




[8/34] Evaluating models for hostname: d493afd




[9/34] Evaluating models for hostname: 9dc32f2




[10/34] Evaluating models for hostname: 33fe84e




[11/34] Evaluating models for hostname: 24a22bf




[12/34] Evaluating models for hostname: a2e0486




[13/34] Evaluating models for hostname: b2c53ee




[14/34] Evaluating models for hostname: ed86ea2




[15/34] Evaluating models for hostname: 2620a05




[16/34] Evaluating models for hostname: 43e847f




[17/34] Evaluating models for hostname: 0f42441




[18/34] Evaluating models for hostname: b340432




[19/34] Evaluating models for hostname: 953d46d




[20/34] Evaluating models for hostname: 592a43c




[21/34] Evaluating models for hostname: 25b3303




[22/34] Evaluating models for hostname: 8445893




[23/34] Evaluating models for hostname: 9ab8252




[24/34] Evaluating models for hostname: da6d469




[25/34] Evaluating models for hostname: 7f6d63d




[26/34] Evaluating models for hostname: c073f39




[27/34] Evaluating models for hostname: 6ca8355




[28/34] Evaluating models for hostname: 1a21874




[29/34] Evaluating models for hostname: b407ebe




[30/34] Evaluating models for hostname: b5c8445




[31/34] Evaluating models for hostname: 63598f8




[32/34] Evaluating models for hostname: 9840de6




[33/34] Evaluating models for hostname: 575f518




[34/34] Evaluating models for hostname: 38b6bf0





Completed evaluation for basic_ema_anomaly: 272 total evaluations


ANOMALY DETECTION METHOD: dspot_anomaly

Hosts with anomalies in train: 23
Hosts with anomalies in test: 24
Hosts with anomalies in BOTH train and test: 23

[1/23] Evaluating models for hostname: 29129b6




[2/23] Evaluating models for hostname: 972f622




[3/23] Evaluating models for hostname: f8f4b44




[4/23] Evaluating models for hostname: dede9dc




[5/23] Evaluating models for hostname: 9dc32f2




[6/23] Evaluating models for hostname: 33fe84e




[7/23] Evaluating models for hostname: 24a22bf




[8/23] Evaluating models for hostname: a2e0486




[9/23] Evaluating models for hostname: b2c53ee




[10/23] Evaluating models for hostname: 2620a05




[11/23] Evaluating models for hostname: 43e847f




[12/23] Evaluating models for hostname: b340432




[13/23] Evaluating models for hostname: 953d46d




[14/23] Evaluating models for hostname: 592a43c




[15/23] Evaluating models for hostname: 25b3303




[16/23] Evaluating models for hostname: 9ab8252




[17/23] Evaluating models for hostname: da6d469




[18/23] Evaluating models for hostname: 7f6d63d




[19/23] Evaluating models for hostname: c073f39




[20/23] Evaluating models for hostname: 6ca8355




[21/23] Evaluating models for hostname: b407ebe




[22/23] Evaluating models for hostname: 575f518




[23/23] Evaluating models for hostname: 38b6bf0





Completed evaluation for dspot_anomaly: 184 total evaluations


ANOMALY DETECTION METHOD: tuned_dspot_anomaly

Hosts with anomalies in train: 29
Hosts with anomalies in test: 24
Hosts with anomalies in BOTH train and test: 22

[1/22] Evaluating models for hostname: 29129b6




[2/22] Evaluating models for hostname: 972f622




[3/22] Evaluating models for hostname: 5bf17fc




[4/22] Evaluating models for hostname: f8f4b44




[5/22] Evaluating models for hostname: dede9dc




[6/22] Evaluating models for hostname: 5c5004f




[7/22] Evaluating models for hostname: 33fe84e




[8/22] Evaluating models for hostname: a2e0486




[9/22] Evaluating models for hostname: ed86ea2




[10/22] Evaluating models for hostname: 2620a05




[11/22] Evaluating models for hostname: 0f42441




[12/22] Evaluating models for hostname: b340432




[13/22] Evaluating models for hostname: 953d46d




[14/22] Evaluating models for hostname: 25b3303




[15/22] Evaluating models for hostname: da6d469




[16/22] Evaluating models for hostname: 7f6d63d




[17/22] Evaluating models for hostname: 6ca8355




[18/22] Evaluating models for hostname: b407ebe




[19/22] Evaluating models for hostname: b5c8445




[20/22] Evaluating models for hostname: 9840de6




[21/22] Evaluating models for hostname: 575f518




[22/22] Evaluating models for hostname: 38b6bf0

Completed evaluation for tuned_dspot_anomaly: 176 total evaluations


EVALUATION COMPLETE




## 7. Results Summary

Display aggregate statistics for each anomaly detection method

In [11]:
print("\n" + "="*80)
print("AGGREGATE RESULTS BY ANOMALY DETECTION METHOD")
print("="*80 + "\n")

for label_col, results_df in all_results_by_method.items():
    print(f"\n{'='*80}")
    print(f"METHOD: {label_col}")
    print(f"{'='*80}\n")
    
    # Group by model and calculate mean metrics
    grouped = results_df.groupby('model').agg({
        'accuracy': 'mean',
        'f1_score': 'mean',
        'precision': 'mean',
        'recall': 'mean',
        'roc_auc': 'mean',
        'fpr_at_90_recall': 'mean',
        'recall_at_10_fpr': 'mean',
        'caught_anomalies': 'sum',
        'missed_anomalies': 'sum',
        'false_alarms': 'sum',
        'total_anomalies': 'sum'
    }).round(4)
    
    print("Standard Metrics:")
    print(grouped[['accuracy', 'f1_score', 'precision', 'recall', 'roc_auc']])
    
    print("\n" + "-"*80)
    print("Threshold-Based Metrics:")
    print("-"*80)
    threshold_metrics = grouped[['fpr_at_90_recall', 'recall_at_10_fpr']].copy()
    print(threshold_metrics)
    print("\nInterpretation:")
    print("  - FPR at 90% Recall: False positive rate needed to catch 90% of anomalies")
    print("    (Lower is better - means fewer false alarms to catch most issues)")
    print("  - Recall at 10% FPR: Percentage of anomalies caught at 10% false positive rate")
    print("    (Higher is better - means catching more issues with acceptable false alarm rate)")
    
    print("\n" + "-"*80)
    print("Anomaly Detection Stats:")
    print("-"*80)
    print(grouped[['caught_anomalies', 'missed_anomalies', 'false_alarms', 'total_anomalies']])
    
    # Best models
    print("\n" + "-"*80)
    print("Best Models:")
    print("-"*80)
    best_f1 = grouped['f1_score'].idxmax()
    best_recall = grouped['recall'].idxmax()
    best_precision = grouped['precision'].idxmax()
    best_fpr_90 = grouped['fpr_at_90_recall'].idxmin()  # Lower is better
    best_recall_10 = grouped['recall_at_10_fpr'].idxmax()  # Higher is better
    
    print(f"Best F1 Score: {best_f1} ({grouped.loc[best_f1, 'f1_score']:.4f})")
    print(f"Best Recall: {best_recall} ({grouped.loc[best_recall, 'recall']:.4f})")
    print(f"Best Precision: {best_precision} ({grouped.loc[best_precision, 'precision']:.4f})")
    print(f"Best FPR at 90% Recall: {best_fpr_90} ({grouped.loc[best_fpr_90, 'fpr_at_90_recall']:.4f})")
    print(f"Best Recall at 10% FPR: {best_recall_10} ({grouped.loc[best_recall_10, 'recall_at_10_fpr']:.4f})")
    
    print("\n")


AGGREGATE RESULTS BY ANOMALY DETECTION METHOD


METHOD: basic_ema_anomaly

Standard Metrics:
                                         accuracy  f1_score  precision  \
model                                                                    
AdaBoost                                   0.8394    0.0575     0.1023   
Decision Tree                              0.6380    0.1480     0.1345   
Linear Classifier (Logistic Regression)    0.6268    0.1921     0.1406   
Linear SVM                                 0.6018    0.1968     0.1473   
Naive Bayes                                0.7532    0.1723     0.1514   
Nearest Neighbors                          0.8433    0.0651     0.1159   
RBF SVM                                    0.5768    0.2134     0.1409   
Random Forest                              0.8760    0.0203     0.0895   

                                         recall  roc_auc  
model                                                     
AdaBoost                                 0.0508

## 8. Save Results to CSV

In [12]:
# Save detailed results for each method
for label_col, results_df in all_results_by_method.items():
    filename = f"per_device_results_{label_col}.csv"
    results_df.to_csv(filename, index=False)
    print(f"Saved results for {label_col} to {filename}")

print("\nAll results saved successfully!")

Saved results for basic_ema_anomaly to per_device_results_basic_ema_anomaly.csv
Saved results for dspot_anomaly to per_device_results_dspot_anomaly.csv
Saved results for tuned_dspot_anomaly to per_device_results_tuned_dspot_anomaly.csv

All results saved successfully!


## Summary

This notebook evaluated per-device ML models on three different anomaly detection methods:
1. **basic_ema_anomaly**: Basic EMA-based detection
2. **dspot_anomaly**: DSPOT detection
3. **tuned_dspot_anomaly**: Tuned DSPOT detection

For each method, we trained 8 different ML models per device and calculated:
- Standard classification metrics (accuracy, F1, precision, recall, ROC-AUC)
- **FPR at 90% Recall**: Answers "how many false alarms to catch 90% of issues?"
- **Recall at 10% FPR**: Answers "how many issues caught with 10% false alarm rate?"

The threshold-based metrics are particularly useful for understanding:
- **Customers who want to catch as many anomalies as possible**: Look at FPR at 90% Recall
- **Customers who want to minimize false alarms**: Look at Recall at 10% FPR