# MultiWeb Metrics Analysis

This notebook demonstrates how to analyze operational metrics collected from the MultiWeb marketplace platform.

## Learning Objectives
- Load and clean operational data
- Perform exploratory data analysis (EDA)
- Visualize traffic patterns and anomalies
- Identify performance bottlenecks
- Detect security incidents

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
from pathlib import Path

# Configure visualization
# The matplotlib style sheet is applied first; seaborn's "husl" palette is then
# installed on top of it as the default color cycle, so the order matters.
# NOTE(review): the 'seaborn-v0_8' style name was introduced in matplotlib 3.6;
# older releases only know it as 'seaborn' — confirm the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

# Configure pandas
# None removes the column limit; at most 100 rows are shown per displayed frame.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Load Data

Load the most recent HTTP and system metrics CSV exports (collected from Prometheus) from `../data`.

In [None]:
# Make sure the data directory exists before looking for exports in it
data_dir = Path('../data')
data_dir.mkdir(exist_ok=True)


def _read_last_csv(csv_paths):
    """Parse the last path in `csv_paths` as a metrics CSV.

    The 'timestamp' column is parsed as datetimes. Returns None when
    `csv_paths` is empty so the caller can report the missing data itself.
    """
    if not csv_paths:
        return None
    return pd.read_csv(csv_paths[-1], parse_dates=['timestamp'])


# HTTP metrics: files are sorted by name and the last one is treated as the latest
http_files = sorted(data_dir.glob('http_metrics_*.csv'))
http_df = _read_last_csv(http_files)
if http_df is None:
    print("No HTTP metrics found. Run the collector first.")
    http_df = pd.DataFrame()
else:
    print(f"Loaded HTTP metrics: {len(http_df)} records")
    print(f"Time range: {http_df['timestamp'].min()} to {http_df['timestamp'].max()}")

# System metrics: same selection rule as above
system_files = sorted(data_dir.glob('system_metrics_*.csv'))
system_df = _read_last_csv(system_files)
if system_df is None:
    print("No system metrics found.")
    system_df = pd.DataFrame()
else:
    print(f"\nLoaded system metrics: {len(system_df)} records")

## 2. Data Exploration

Explore the structure and content of the data.

In [None]:
if not http_df.empty:
    # Each call prints a heading line followed by the object on the next line.
    print("HTTP Metrics Overview:", http_df.head(), sep="\n")
    print("\nData Types:", http_df.dtypes, sep="\n")
    # How many samples were recorded for each metric name
    print("\nMetrics Available:", http_df['metric'].value_counts(), sep="\n")

## 3. HTTP Traffic Analysis

Analyze request patterns and traffic trends.

In [None]:
if not http_df.empty:
    # Keep only the request-rate samples
    request_rate = http_df.loc[http_df['metric'] == 'request_rate']

    if not request_rate.empty:
        rate_values = request_rate['value']
        fig, (ax_trend, ax_hist) = plt.subplots(1, 2, figsize=(15, 5))

        # Left panel: request rate as a time series
        ax_trend.plot(request_rate['timestamp'], rate_values)
        ax_trend.set_title('Request Rate Over Time')
        ax_trend.set_xlabel('Time')
        ax_trend.set_ylabel('Requests/sec')
        plt.setp(ax_trend.get_xticklabels(), rotation=45)
        ax_trend.grid(True, alpha=0.3)

        # Right panel: histogram of the same values
        ax_hist.hist(rate_values, bins=50, edgecolor='black')
        ax_hist.set_title('Request Rate Distribution')
        ax_hist.set_xlabel('Requests/sec')
        ax_hist.set_ylabel('Frequency')
        ax_hist.grid(True, alpha=0.3)

        fig.tight_layout()
        plt.show()

        # Summary statistics for the plotted values
        print("Request Rate Statistics:")
        print(rate_values.describe())

## 4. Error Rate Analysis

Identify error patterns and anomalies.

In [None]:
# Plot the error-rate series against the 2xx series and report the share of
# errors among the two. Nothing is drawn or printed unless both series exist.
if not http_df.empty:
    # Compare error rates and success rates
    error_rate = http_df[http_df['metric'] == 'error_rate']
    success_rate = http_df[http_df['metric'] == 'status_2xx']

    if not error_rate.empty and not success_rate.empty:
        plt.figure(figsize=(15, 5))

        plt.plot(error_rate['timestamp'], error_rate['value'],
                 label='5xx Errors', color='red', linewidth=2)
        plt.plot(success_rate['timestamp'], success_rate['value'],
                 label='2xx Success', color='green', linewidth=2)

        plt.title('HTTP Status Rates Over Time')
        plt.xlabel('Time')
        plt.ylabel('Rate (req/sec)')
        plt.legend()
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        # Calculate error percentage.
        # Only the error and 2xx series enter the denominator; any other
        # status classes present in the data are not counted here.
        total_errors = error_rate['value'].sum()
        total_success = success_rate['value'].sum()
        total_tracked = total_errors + total_success

        if total_tracked > 0:
            error_percentage = (total_errors / total_tracked) * 100
            print(f"\nOverall Error Rate: {error_percentage:.2f}%")
        else:
            # Both series sum to zero (idle window): 0/0 would print "nan%"
            # and emit a NumPy RuntimeWarning, so report it explicitly instead.
            error_percentage = float('nan')
            print("\nOverall Error Rate: n/a (no error or 2xx traffic recorded)")

## 5. Performance Analysis

Analyze response time and latency.

In [None]:
if not http_df.empty:
    # Keep only the request-duration samples
    duration = http_df.loc[http_df['metric'] == 'request_duration']

    if not duration.empty:
        latency = duration['value']
        fig, (ax_trend, ax_box) = plt.subplots(1, 2, figsize=(15, 5))

        # Left panel: duration as a time series
        ax_trend.plot(duration['timestamp'], latency, color='orange')
        ax_trend.set_title('P95 Request Duration Over Time')
        ax_trend.set_xlabel('Time')
        ax_trend.set_ylabel('Duration (seconds)')
        plt.setp(ax_trend.get_xticklabels(), rotation=45)
        ax_trend.grid(True, alpha=0.3)

        # Right panel: box plot of the same values
        ax_box.boxplot(latency)
        ax_box.set_title('Request Duration Distribution')
        ax_box.set_ylabel('Duration (seconds)')
        ax_box.grid(True, alpha=0.3)

        fig.tight_layout()
        plt.show()

        # Samples above the 95th percentile of the recorded values count as slow
        threshold = latency.quantile(0.95)
        slow_requests = duration.loc[latency > threshold]
        print(f"\nSlow requests (>P95): {len(slow_requests)}")
        print(f"P95 threshold: {threshold:.3f}s")

## 6. System Resource Analysis

Analyze CPU usage, memory usage, and open file descriptors.

In [None]:
def _draw_system_panel(fig, slot, timestamps, values, title, ylabel, color):
    """Add one time-series panel at position `slot` of a 2x2 grid on `fig`."""
    ax = fig.add_subplot(2, 2, slot)
    ax.plot(timestamps, values, color=color)
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel(ylabel)
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.grid(True, alpha=0.3)


if not system_df.empty:
    fig = plt.figure(figsize=(15, 10))

    # A panel is only added for a metric that has samples.

    # CPU usage
    cpu_data = system_df.loc[system_df['metric'] == 'cpu_usage']
    if not cpu_data.empty:
        _draw_system_panel(fig, 1, cpu_data['timestamp'], cpu_data['value'],
                           'CPU Usage', 'CPU Rate', 'blue')

    # Memory usage, with the raw value divided by 1024**3 for the GB axis
    mem_data = system_df.loc[system_df['metric'] == 'memory_usage']
    if not mem_data.empty:
        _draw_system_panel(fig, 2, mem_data['timestamp'], mem_data['value'] / (1024**3),
                           'Memory Usage', 'Memory (GB)', 'purple')

    # Open file descriptors
    fd_data = system_df.loc[system_df['metric'] == 'open_fds']
    if not fd_data.empty:
        _draw_system_panel(fig, 3, fd_data['timestamp'], fd_data['value'],
                           'Open File Descriptors', 'Count', 'green')

    fig.tight_layout()
    plt.show()

## 7. Anomaly Detection

Detect traffic anomalies with a Z-score test: request-rate samples more than three standard deviations from the mean are flagged.

In [None]:
if not http_df.empty:
    # Work on a copy so the z_score column is not added to a slice of http_df
    request_rate = http_df.loc[http_df['metric'] == 'request_rate'].copy()

    if not request_rate.empty:
        rate = request_rate['value']

        # Absolute distance from the mean, in units of standard deviation
        request_rate['z_score'] = ((rate - rate.mean()) / rate.std()).abs()

        # Samples more than 3 standard deviations out are flagged
        anomalies = request_rate.loc[request_rate['z_score'].gt(3)]

        fig, ax = plt.subplots(figsize=(15, 5))
        ax.plot(request_rate['timestamp'], rate, label='Normal', alpha=0.7)
        ax.scatter(anomalies['timestamp'], anomalies['value'],
                   color='red', s=100, label='Anomaly', zorder=5)
        ax.set_title('Traffic Anomaly Detection')
        ax.set_xlabel('Time')
        ax.set_ylabel('Requests/sec')
        ax.legend()
        plt.setp(ax.get_xticklabels(), rotation=45)
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        plt.show()

        print(f"\nAnomalies detected: {len(anomalies)}")
        if not anomalies.empty:
            print("\nAnomaly details:")
            print(anomalies[['timestamp', 'value', 'z_score']])

## 8. Summary Report

Generate a comprehensive summary of the analysis.

In [None]:
# Final report: print a banner-framed recap of the loaded data. Each section is
# skipped when the corresponding DataFrame is empty.
print("="*60)
print("MULTIWEB METRICS ANALYSIS SUMMARY")
print("="*60)

if not http_df.empty:
    # The heading emoji had been stored as mojibake (UTF-8 bytes read as
    # cp1252); \N{...} escapes keep the source ASCII-only so it cannot recur.
    print("\n\N{BAR CHART} HTTP Metrics:")
    print(f"  Total records: {len(http_df)}")
    print(f"  Time range: {http_df['timestamp'].min()} to {http_df['timestamp'].max()}")
    print(f"  Unique metrics: {http_df['metric'].nunique()}")

if not system_df.empty:
    print("\n\N{PERSONAL COMPUTER} System Metrics:")
    print(f"  Total records: {len(system_df)}")
    print(f"  Metrics tracked: {system_df['metric'].nunique()}")

print("\n" + "="*60)
print("Analysis complete!")
print("="*60)