# Explainable Graph Anomaly Detection - Exploratory Analysis

This notebook provides exploratory analysis and visualizations for the graph anomaly detection (GAD) pipeline.

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path

# Add parent directory to path for imports
import sys
sys.path.append('..')

from utils.config import load_config
from models.models import *
from utils.evaluation import *

## Dataset Loading and Basic Statistics

In [None]:
# Pick the dataset to explore and locate its processed file on disk
dataset_name = 'weibo'  # Change as needed
data_path = f'../data/processed/{dataset_name}_static.pt'

# `data` is only bound when the file is present; the later cells check for
# that name before running.
dataset_file = Path(data_path)
if not dataset_file.exists():
    print(f"Dataset {dataset_name} not found. Available datasets in data/processed/")
else:
    data = torch.load(data_path, map_location='cpu')
    # Basic size statistics, followed by the mean of the label tensor
    print(f"Dataset: {dataset_name}")
    print(f"Nodes: {data.x.shape[0]}")
    print(f"Features: {data.x.shape[1]}")
    print(f"Edges: {data.edge_index.shape[1]}")
    print(f"Anomaly ratio: {data.y.float().mean():.4f}")

## Feature Analysis and Visualization

In [None]:
# Analyze feature distributions
def top_k_features(feature_diff, k=10):
    """Return the ``k`` largest entries of a 1-D per-feature score tensor.

    Args:
        feature_diff: 1-D tensor holding one score per feature.
        k: Number of features wanted. It is clamped to the number of
            features so that a narrow feature matrix does not make
            ``torch.topk`` raise.

    Returns:
        The ``torch.topk`` result; ``.values`` are in descending order and
        ``.indices`` are the matching feature indices.
    """
    return torch.topk(feature_diff, min(k, feature_diff.numel()))


if 'data' in locals():
    # Plot feature statistics on a 2x2 grid
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Feature mean distribution (one value per feature column)
    axes[0,0].hist(data.x.mean(dim=0).numpy(), bins=50, alpha=0.7)
    axes[0,0].set_title('Feature Mean Distribution')
    axes[0,0].set_xlabel('Mean Value')

    # Feature std distribution (one value per feature column)
    axes[0,1].hist(data.x.std(dim=0).numpy(), bins=50, alpha=0.7)
    axes[0,1].set_title('Feature Std Distribution')
    axes[0,1].set_xlabel('Std Value')

    # Node degree distribution. Only row 0 of edge_index is counted, i.e. how
    # often each node appears as an edge source.
    from torch_geometric.utils import degree
    degrees = degree(data.edge_index[0], data.x.shape[0])
    axes[1,0].hist(degrees.numpy(), bins=50, alpha=0.7)
    axes[1,0].set_title('Node Degree Distribution')
    axes[1,0].set_xlabel('Degree')
    axes[1,0].set_yscale('log')

    # Anomaly vs normal feature comparison: absolute gap between class means
    normal_features = data.x[data.y == 0].mean(dim=0)
    anomaly_features = data.x[data.y == 1].mean(dim=0)

    feature_diff = (anomaly_features - normal_features).abs()
    top_discriminative = top_k_features(feature_diff, k=10)
    num_top = top_discriminative.values.numel()

    axes[1,1].bar(range(num_top), top_discriminative.values.numpy())
    # Label each bar with the real feature index; without this the axis shows
    # the rank 0..k-1 even though it is titled 'Feature Index'.
    axes[1,1].set_xticks(list(range(num_top)))
    axes[1,1].set_xticklabels([str(idx) for idx in top_discriminative.indices.tolist()])
    axes[1,1].set_title(f'Top {num_top} Discriminative Features')
    axes[1,1].set_xlabel('Feature Index')
    axes[1,1].set_ylabel('Mean Difference')

    plt.tight_layout()
    plt.show()

## Model Training and Evaluation Example

In [None]:
# Simple anomaly detection example
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score

if 'data' in locals():
    # Features and labels as NumPy arrays
    X, y = data.x.numpy(), data.y.numpy()

    # Standardize every feature column before measuring distances
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Anomaly score = Euclidean distance of a node from the feature centroid
    center = X_scaled.mean(axis=0)
    distances = np.linalg.norm(X_scaled - center, axis=1)

    # Score the ranking against the ground-truth labels
    roc_auc = roc_auc_score(y, distances)
    avg_precision = average_precision_score(y, distances)

    print("L2 Distance Method:")
    print(f"ROC-AUC: {roc_auc:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")

    # Two side-by-side panels: score histograms per class, then the ROC curve
    dist_fig, (hist_ax, roc_ax) = plt.subplots(1, 2, figsize=(12, 4))

    for class_value, class_name in ((0, 'Normal'), (1, 'Anomaly')):
        hist_ax.hist(distances[y == class_value], bins=50, alpha=0.7,
                     label=class_name, density=True)
    hist_ax.set_xlabel('L2 Distance from Center')
    hist_ax.set_ylabel('Density')
    hist_ax.legend()
    hist_ax.set_title('Distance Distribution')

    from sklearn.metrics import roc_curve
    fpr, tpr, _ = roc_curve(y, distances)
    roc_ax.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.3f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.legend()
    roc_ax.set_title('ROC Curve')

    plt.tight_layout()
    plt.show()

## Explainability Analysis

In [None]:
# Example explainability analysis
def top_k_anomalies(scores, k=10):
    """Return the indices of the ``k`` highest scores, highest first.

    Args:
        scores: 1-D array of anomaly scores (larger means more anomalous).
        k: Maximum number of indices to return; fewer are returned when
            ``scores`` has fewer than ``k`` entries.

    Returns:
        Integer index array ordered from the highest score downwards.
    """
    # argsort is ascending, so reverse before slicing to put the most
    # anomalous node at position 0.
    return np.argsort(scores)[::-1][:k]


# This cell reads `distances` and `y` from the baseline cell, so check for
# them as well as for the loaded dataset.
if 'data' in locals() and 'distances' in locals() and 'y' in locals():
    # Find most anomalous nodes
    anomaly_scores = distances
    top_anomalies = top_k_anomalies(anomaly_scores, k=10)

    print(f"Top {len(top_anomalies)} Most Anomalous Nodes:")
    for rank, node_idx in enumerate(top_anomalies, start=1):
        score = anomaly_scores[node_idx]
        true_label = y[node_idx]
        print(f"{rank:2d}. Node {node_idx:5d}: Score={score:.4f}, True Label={'Anomaly' if true_label else 'Normal'}")

    # Mean of the labels over the selected nodes, i.e. the fraction of the
    # top-ranked nodes that are labelled anomalous.
    print(f"\nAccuracy on top anomalies: {y[top_anomalies].mean():.2f}")

## Advanced Analysis Functions

In [None]:
def analyze_dataset_comprehensive(dataset_name):
    """Print summary statistics for one processed dataset and return it.

    The dataset is read from ``../data/processed/<dataset_name>_static.pt``.
    If that file does not exist, a notice is printed and ``None`` is
    returned. Otherwise the loaded object is returned after printing its
    node/feature/edge counts, the label mean and sum, degree statistics and
    the range, mean and std of the feature values.
    """
    data_path = f'../data/processed/{dataset_name}_static.pt'
    if not Path(data_path).exists():
        print(f"Dataset {dataset_name} not found")
        return

    graph = torch.load(data_path, map_location='cpu')

    # Sizes and label balance
    print(f"\n=== {dataset_name.upper()} Dataset Analysis ===")
    node_feats = graph.x
    num_nodes = node_feats.shape[0]
    print(f"Nodes: {num_nodes:,}")
    print(f"Features: {node_feats.shape[1]:,}")
    print(f"Edges: {graph.edge_index.shape[1]:,}")
    labels = graph.y
    anomaly_ratio = labels.float().mean()
    anomaly_count = labels.sum().item()
    print(f"Anomaly ratio: {anomaly_ratio:.4f} ({anomaly_count:,} anomalies)")

    # Graph connectivity, counted over row 0 of edge_index
    from torch_geometric.utils import degree
    node_degrees = degree(graph.edge_index[0], num_nodes)
    print(f"Average degree: {node_degrees.mean().item():.2f}")
    print(f"Max degree: {node_degrees.max().item()}")

    # Statistics over all entries of the feature matrix
    lowest, highest = node_feats.min().item(), node_feats.max().item()
    print(f"Feature range: [{lowest:.4f}, {highest:.4f}]")
    print(f"Feature mean: {node_feats.mean().item():.4f}")
    print(f"Feature std: {node_feats.std().item():.4f}")

    return graph

# Run the summary for each dataset name listed below. An error raised for one
# dataset is printed and the loop moves on to the next name.
datasets = [
    'weibo',
    'amazon',
    'elliptic',
    'tfinance',
    'yelp',
]
for dataset in datasets:
    try:
        analyze_dataset_comprehensive(dataset)
    except Exception as err:
        print(f"Error analyzing {dataset}: {err}")