In [1]:
"""
Notebook 1: Raw Data ETL & Anomaly Flagging (Multi-Router)
Purpose:
  - Iterate through individual raw router Parquet files.
  - For each router:
    - Load raw data (Dask DataFrame, with column selection).
    - Perform Dask-native initial cleaning and hourly aggregation.
    - Compute the hourly aggregated data to a Pandas DataFrame (memory-safe).
    - Run Isolation Forest anomaly detection.
    - Save the processed hourly data with anomaly info to a dedicated Parquet file.
  - Generate global (cross-router) and illustrative (single-router) EDA plots.

Outputs:
  - outputs/ch4/hourly_<router>_processed_with_anomalies.parquet (10 files)
  - Figures 4-2a, 4-2b, 4-2c, 4-3a, 4-3b, 4-3c saved to outputs/ch4/figs
  - Printed Table 4-1 (Router Inventory).
"""

#######################################################################
# 0. Environment set‑up                                               #
#######################################################################
# ☑️  Install all dependencies.
# !pip install --quiet pandas pyarrow dask[dataframe] matplotlib seaborn scikit-learn tqdm

import os
import glob # To find individual Parquet files
import warnings
import random
import gc
from pathlib import Path
from typing import List, Dict, Tuple, Optional

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm # For progress bars

# For Dask DataFrames
import dask.dataframe as dd

# For Anomaly Detection
from sklearn.ensemble import IsolationForest

# Suppress minor warnings for cleaner output in Jupyter
warnings.filterwarnings("ignore")

# Ensure plots appear inline in Jupyter Notebook
%matplotlib inline

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 150 # Increase resolution for better quality plots
plt.rcParams['savefig.dpi'] = 300 # Save plots with higher resolution

# Seed for reproducibility
SEED = 42
random.seed(SEED); np.random.seed(SEED); 

print("Environment setup complete.")

#######################################################################
# 1. Configuration                                                    #
#######################################################################

# --- Paths & File Names ---
# Input: Directory containing your INDIVIDUAL router Parquet files
INPUT_ROUTERS_DIR = Path("/mnt/nrdstor/ramamurthy/mhnarfth/internet2/parquet")

# Output root directory for processed data and figures
OUT_DIR           = Path("outputs/ch4").absolute(); OUT_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR           = OUT_DIR / "figs"; FIG_DIR.mkdir(exist_ok=True, parents=True)

# --- Data Specifics ---
TARGET_COL        = "in_packets"           # The volume metric (packets)
TIMESTAMP_COL     = "t_first"              # Earliest packet timestamp for flow
# 't_last' is also needed for initial loading/cleaning, even if not directly used later.
ADDITIONAL_RAW_COLS = ['t_last'] # Columns to load besides TIMESTAMP_COL and TARGET_COL
FREQ              = "h"                    # Hourly resampling frequency ('h' for hourly)
ROUTER_COL        = "router"               # Column for router name (will be added to processed files)

# --- Anomaly Detection Parameters ---
IF_CONTAMINATION  = 0.01 # Initial contamination estimate
ANOM_SCORE_COL    = "if_score"             # Column name for Isolation Forest anomaly scores
ANOM_FLAG_COL     = "if_flag"              # Column name for binary anomaly flags (1 = anomaly, 0 = normal)

# --- Global EDA Parameters ---
# Fraction of total rows for the global flow-size histogram sample.
# This will be sampled from EACH router's raw data for representation.
GLOBAL_HISTOGRAM_SAMPLE_PER_ROUTER_COUNT = 10_000 # Sample up to 10,000 raw rows from each router

print(f"Configuration loaded.")
print(f"Input individual router Parquet files from: {INPUT_ROUTERS_DIR}")
print(f"Outputs will be saved to: {OUT_DIR} (figs: {FIG_DIR})")
print(f"Anomaly Detection contamination (initial): {IF_CONTAMINATION}")

#######################################################################
# Helper Functions                                                    #
#######################################################################

# Helper function to save plots
def save_plot(fig_name: str, router_label: str = ""): 
    """Saves the current matplotlib figure in both PNG and PDF formats."""
    plt.tight_layout()
    if router_label:
        png_path = FIG_DIR / f"{fig_name}_{router_label}.png"
        pdf_path = FIG_DIR / f"{fig_name}_{router_label}.pdf"
    else: # For global plots, no router_label in filename
        png_path = FIG_DIR / f"{fig_name}.png"
        pdf_path = FIG_DIR / f"{fig_name}.pdf"

    plt.savefig(png_path)
    plt.savefig(pdf_path)
    plt.close()
    print(f"  Saved {png_path.name} and {pdf_path.name}")


def plot_eda_single_router(df_hourly: pd.DataFrame, router_label: str, target_col: str):
    """
    Generates single-router EDA plots (Fig 4-2b, 4-3a/b/c).
    """
    print(f"\n--- Generating Single-Router EDA Plots for {router_label} ---")

    # Make a copy to avoid SettingWithCopyWarning when adding columns
    df_hourly_copy = df_hourly.copy() 
    df_hourly_copy['hour_of_day'] = df_hourly_copy.index.hour
    df_hourly_copy['day_of_week_num'] = df_hourly_copy.index.dayofweek
    df_hourly_copy['day_name'] = df_hourly_copy.index.day_name()
    df_hourly_copy['is_weekend'] = df_hourly_copy['day_of_week_num'].isin([5, 6])
    
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Fig 4-2b: Week-of-day heat-map (hour × weekday) - Illustrative for one router
    print(f"  Generating Figure 4-2b (Week-of-Day Heatmap for {router_label})...")
    plt.figure(figsize=(12, 8))
    pivot_table = df_hourly_copy.pivot_table(index='hour_of_day', columns='day_name', values=target_col, aggfunc='mean')
    pivot_table = pivot_table[day_order]
    sns.heatmap(pivot_table, cmap='viridis', annot=False, fmt=".0f", linewidths=.5, linecolor='lightgray')
    plt.title(f"4-2b Average {target_col} Heatmap: Hour of Day vs. Day of Week ({router_label})")
    plt.xlabel("Day of Week")
    plt.ylabel("Hour of Day")
    save_plot(f"4_2b_weekday_heatmap", router_label)

    # Anomaly Detection Plots (Figures 4-3a/b/c)
    if ANOM_SCORE_COL in df_hourly_copy.columns and ANOM_FLAG_COL in df_hourly_copy.columns:
        print(f"\n--- Generating Anomaly Detection Plots for {router_label} ---")
        # Fig 4-3a: Isolation-Forest score histogram
        plt.figure(figsize=(10, 6))
        sorted_scores = np.sort(df_hourly_copy[ANOM_SCORE_COL].values)
        threshold_idx = int((1 - IF_CONTAMINATION) * len(sorted_scores))
        threshold_score = sorted_scores[threshold_idx]
        sns.histplot(df_hourly_copy[ANOM_SCORE_COL], bins=100, kde=True, color='teal', alpha=0.7)
        plt.axvline(x=threshold_score, color='red', linestyle='--', label=f'Threshold (Contamination={IF_CONTAMINATION})')
        plt.title(f"4-3a Isolation Forest Anomaly Score Histogram ({router_label})")
        plt.xlabel("Anomaly Score (Higher = More Anomalous)")
        plt.ylabel("Number of Observations")
        plt.legend()
        save_plot(f"4_3a_if_score_histogram", router_label)

        # Fig 4-3b: Time-series plot with anomalies marked
        plt.figure(figsize=(15, 6))
        plt.plot(df_hourly_copy.index, df_hourly_copy[target_col], label='Original Data', color='blue', alpha=0.7)
        anomalies = df_hourly_copy[df_hourly_copy[ANOM_FLAG_COL] == 1]
        plt.scatter(anomalies.index, anomalies[target_col], color='red', s=50, label='Detected Anomaly', zorder=5)
        plt.title(f"4-3b {target_col} Time Series with Detected Anomalies ({router_label})")
        plt.xlabel("Time")
        plt.ylabel(f"{target_col}")
        plt.legend()
        plt.grid(True, linestyle=':', alpha=0.7)
        save_plot(f"4_3b_timeseries_anomalies", router_label)

        # Fig 4-3c: Anomaly count bar-chart by day of week
        anomaly_counts_per_day = df_hourly_copy[df_hourly_copy[ANOM_FLAG_COL] == 1]['day_name'].value_counts().reindex(day_order).fillna(0)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=anomaly_counts_per_day.index, y=anomaly_counts_per_day.values, palette='viridis')
        plt.title(f"4-3c Anomaly Counts by Day of Week ({router_label})")
        plt.xlabel("Day of Week")
        plt.ylabel("Number of Anomalies")
        plt.grid(axis='y', linestyle=':', alpha=0.7)
        save_plot(f"4_3c_anomaly_counts_by_day", router_label)
    else:
        print(f"  Anomaly columns not found in df_hourly for {router_label}. Skipping anomaly plots.")

def plot_global_eda(all_hourly_mean_per_hour: pd.DataFrame, all_raw_samples_target_col: pd.Series, target_col: str):
    """
    Generates global (cross-router) EDA plots (Fig 4-2a, 4-2c).
    """
    print("\n--- Generating Global EDA Plots (Figures 4-2a, 4-2c) ---")

    # Fig 4-2a: 10-router daily curves (mean traffic per hour)
    if not all_hourly_mean_per_hour.empty: # Only plot if data is provided
        print("  Generating Figure 4-2a (10-Router Daily Curves)...")
        plt.figure(figsize=(12, 7))
        sns.lineplot(x='hour_of_day', y='mean_packets', hue='router', data=all_hourly_mean_per_hour, palette='tab10')
        plt.title(f"4-2a Daily Utilization Curves (Mean {target_col} per Hour Across Routers)")
        plt.xlabel("Hour of Day (0-23)")
        plt.ylabel(f"Mean {target_col}")
        plt.xticks(range(24))
        plt.legend(title='Router', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True, linestyle=':', alpha=0.7)
        save_plot(f"4_2a_daily_curves_all_routers", router_label="")
    else:
        print("  No hourly mean data collected. Skipping Fig 4-2a.")

    # Fig 4-2c: Flow-size distribution (log-histogram of raw packets)
    if not all_raw_samples_target_col.empty: # Only plot if data is provided
        print("  Generating Figure 4-2c (Flow Size Distribution - All Routers Sample)...")
        plt.figure(figsize=(10, 6))
        sns.histplot(np.log1p(all_raw_samples_target_col), bins=50, kde=True, color='purple', alpha=0.7)
        plt.title(f"4-2c Log-Histogram of Flow Sizes (All Routers Sample)")
        plt.xlabel(f"log(1 + {target_col})")
        plt.ylabel("Number of Flows (Count)")
        plt.grid(True, linestyle=':', alpha=0.7)
        save_plot(f"4_2c_flow_size_histogram_all_routers", router_label="")
    else:
        print(f"  No raw samples data for flow-size histogram. Skipping Fig 4-2c.")


#######################################################################
# Anomaly Detection (Isolation Forest)                                #
#######################################################################

def run_isolation_forest(series: pd.Series, contamination: float, router_label: str) -> pd.DataFrame:
    """
    Fits Isolation Forest on the time series and returns anomaly scores and flags.
    The input `series` should be the hourly aggregated data (e.g., in_packets).
    """
    print(f"  Running Isolation Forest for Anomaly Detection for {router_label}...")
    
    if series.empty:
        print(f"  Warning: Input series for Isolation Forest is empty for {router_label}. Skipping.")
        return pd.DataFrame(columns=[series.name, ANOM_SCORE_COL, ANOM_FLAG_COL])

    X = series.values.reshape(-1, 1)
    
    iforest = IsolationForest(n_estimators=200, contamination=contamination, random_state=SEED, n_jobs=-1) 
    iforest.fit(X)
    
    scores = -iforest.decision_function(X)
    flags = iforest.predict(X)
    flags = np.where(flags == -1, 1, 0)
    
    out_df = pd.DataFrame({
        series.name: series.values,
        ANOM_SCORE_COL: scores,
        ANOM_FLAG_COL: flags
    }, index=series.index)
    
    print(f"  Isolation Forest detection complete for {router_label}. Found {out_df[ANOM_FLAG_COL].sum()} anomalies.")
    return out_df


#######################################################################
# Main Execution for Notebook 1                                       #
#######################################################################

def main():
    print("\n--- Initiating Notebook 1: Raw Data ETL & Anomaly Flagging (Multi-Router) ---")

    # 1. Get all individual router Parquet file paths
    all_router_file_paths = sorted(list(INPUT_ROUTERS_DIR.glob("*.parquet")))
    if not all_router_file_paths:
        print(f"Error: No .parquet files found in {INPUT_ROUTERS_DIR}. Please check the path.")
        return

    print(f"\nFound {len(all_router_file_paths)} individual router Parquet files in {INPUT_ROUTERS_DIR}.")

    # Data structures to collect info for global EDA plots and Table 4-1
    router_inventory_data = [] 
    all_hourly_mean_per_hour_list = [] 
    all_raw_samples_target_col_list = [] # For Fig 4-2c (Flow-size Distribution)

    # Process each router file sequentially
    for file_path_idx, file_path in enumerate(tqdm(all_router_file_paths, desc="Processing Routers")):
        router_label = file_path.stem # Extract router name from filename (e.g., "atlanta")
        print(f"\n==== Starting Full Processing for Router: {router_label} ({file_path_idx + 1}/{len(all_router_file_paths)}) ====")
        
        # 1.1. Load Individual Raw File (Dask, with column selection)
        print(f"  Loading raw data for {router_label} from {file_path.name} with selected columns...")
        try:
            # Load only necessary columns for initial processing and memory efficiency
            # Ensure 't_last' is also loaded if it's used in initial cleaning (e.g., for flow duration analysis, though not in this pipeline)
            ddf_raw_single_router = dd.read_parquet(file_path, engine="pyarrow", columns=[TIMESTAMP_COL, TARGET_COL] + ADDITIONAL_RAW_COLS)
            # Ensure router column is present and correctly typed (though it's derived later)
            ddf_raw_single_router[ROUTER_COL] = router_label
            print(f"  Loaded Dask DataFrame (lazy) for {router_label}. Partitions: {ddf_raw_single_router.npartitions}")
        except Exception as e:
            print(f"  Error loading Dask DataFrame for {router_label}: {e}. Skipping this router.")
            continue # Skip to next router

        # 1.2. Collect raw data stats for Table 4-1 (Dask-native count)
        num_flows_raw = ddf_raw_single_router.shape[0].compute() 
        print(f"  Raw data for {router_label}: {num_flows_raw} records.")

        # 1.3. Collect a sample of raw 'in_packets' for the global histogram (Fig 4-2c)
        if num_flows_raw > 0 and TARGET_COL in ddf_raw_single_router.columns:
            # Dask's .sample(frac=...) is memory-efficient.
            # Using a fixed fraction across routers to ensure representation.
            raw_sample_per_router = ddf_raw_single_router[TARGET_COL].sample(frac=min(1.0, GLOBAL_HISTOGRAM_SAMPLE_PER_ROUTER_COUNT / num_flows_raw), random_state=SEED).compute()
            all_raw_samples_target_col_list.append(raw_sample_per_router)
            print(f"  Collected {len(raw_sample_per_router)} raw samples for global histogram from {router_label}.")
        else:
            print(f"  No raw data or target column '{TARGET_COL}' found for {router_label}. Skipping raw sample collection for histogram.")

        # 2. Dask-native Initial Clean and Hourly Resampling
        print(f"  Performing Dask-native initial cleaning and hourly resampling for {router_label}...")
        
        # Convert timestamps to datetime in Dask and drop NaNs
        ddf_cleaned_and_filtered = ddf_raw_single_router.copy()
        ddf_cleaned_and_filtered[TIMESTAMP_COL] = dd.to_datetime(ddf_cleaned_and_filtered[TIMESTAMP_COL], errors='coerce')
        ddf_cleaned_and_filtered = ddf_cleaned_and_filtered.dropna(subset=[TIMESTAMP_COL])

        if ddf_cleaned_and_filtered.npartitions == 0 or ddf_cleaned_and_filtered.shape[0].compute() == 0:
            print(f"  Warning: No valid records after initial cleaning for {router_label}. Skipping further processing for this router.")
            num_hourly_rows = 0
            router_inventory_data.append({
                "Router ID": router_label,
                "Raw Flows (#)": num_flows_raw,
                "Hourly Rows (#)": num_hourly_rows
            })
            del ddf_raw_single_router, ddf_cleaned_and_filtered
            gc.collect()
            continue

        # Set index and resample (still Dask)
        # Note: If timestamps are not already sorted, this can be slow or incorrect.
        # `sorted=True` is a hint to Dask.
        ddf_hourly_router = ddf_cleaned_and_filtered.set_index(TIMESTAMP_COL, sorted=True)[TARGET_COL].resample(FREQ).sum()
        
        # Convert back to DataFrame and add router column (still Dask)
        ddf_hourly_router = ddf_hourly_router.to_frame()
        ddf_hourly_router[ROUTER_COL] = router_label

        # 3. Compute Hourly Aggregated Data to Pandas (Memory-Safe)
        # This is the crucial step: df_hourly_resampled should be small (max ~1368 rows).
        df_hourly_resampled = ddf_hourly_router.compute() 
        print(f"  Computed hourly data for {router_label}: {len(df_hourly_resampled)} rows.")
        # Explicitly delete Dask DataFrames to free up graph memory
        del ddf_raw_single_router, ddf_cleaned_and_filtered, ddf_hourly_router 
        gc.collect()

        # 4. Pandas Post-Resampling (Interpolation/Fill)
        if df_hourly_resampled.empty:
            print(f"  Warning: Hourly resampled data for {router_label} is empty after Dask compute. Skipping further processing for this router.")
            num_hourly_rows = 0
            router_inventory_data.append({
                "Router ID": router_label,
                "Raw Flows (#)": num_flows_raw,
                "Hourly Rows (#)": num_hourly_rows
            })
            continue
        
        missing_before_interp = df_hourly_resampled[TARGET_COL].isnull().sum()
        df_hourly_resampled[TARGET_COL] = df_hourly_resampled[TARGET_COL].interpolate(method='linear')
        df_hourly_resampled[TARGET_COL].fillna(0, inplace=True)
        if missing_before_interp > 0:
            print(f"  Interpolated {missing_before_interp} missing values linearly for hourly data.")
        
        # Collect hourly data stats for Table 4-1
        num_hourly_rows = len(df_hourly_resampled)
        router_inventory_data.append({
            "Router ID": router_label,
            "Raw Flows (#)": num_flows_raw,
            "Hourly Rows (#)": num_hourly_rows
        })

        # 5. Run Anomaly Detection (Isolation Forest) (on Pandas df_hourly_resampled)
        df_hourly_with_anomalies = run_isolation_forest(
            df_hourly_resampled[TARGET_COL], IF_CONTAMINATION, router_label
        )
        # Add router column back, ensuring it's consistently set if it was lost in resampling.
        df_hourly_with_anomalies[ROUTER_COL] = router_label 
        del df_hourly_resampled 
        gc.collect()

        # 6. Save Processed Hourly Data for this router
        processed_hourly_parquet_path = OUT_DIR / f"hourly_{router_label.lower().replace(' ', '_')}_processed_with_anomalies.parquet"
        print(f"  Saving processed hourly data for {router_label} to: {processed_hourly_parquet_path}")
        df_hourly_with_anomalies.to_parquet(processed_hourly_parquet_path, index=True) 
        print(f"  Processed hourly data for {router_label} saved successfully.")

        # 7. Collect data for global Fig 4-2a (Daily Utilization Curves)
        hourly_mean_per_hour = df_hourly_with_anomalies.groupby(df_hourly_with_anomalies.index.hour)[TARGET_COL].mean().reset_index()
        hourly_mean_per_hour.columns = ['hour_of_day', 'mean_packets']
        hourly_mean_per_hour[ROUTER_COL] = router_label 
        all_hourly_mean_per_hour_list.append(hourly_mean_per_hour)

        # 8. Generate Illustrative EDA Plots (Single Router: 4-2b, 4-3a/b/c)
        # Choose "elpaso" as the illustrative router
        if router_label == "elpaso": 
            plot_eda_single_router(df_hourly_with_anomalies.copy(), router_label, TARGET_COL)
        else:
            print(f"  Skipping single-router EDA plots for {router_label} (generating for El Paso as representative).")

        del df_hourly_with_anomalies 
        gc.collect()

    # After processing all routers, plot global EDA
    print("\n==== All Routers Processed. Generating Global EDA Plots ====")
    
    # Fig 4-2a: 10-router daily curves
    if all_hourly_mean_per_hour_list:
        combined_hourly_mean_per_hour_df = pd.concat(all_hourly_mean_per_hour_list, ignore_index=True)
        # Pass empty Series for raw samples as they are handled by a separate list
        plot_global_eda(combined_hourly_mean_per_hour_df, pd.Series(dtype=float), TARGET_COL) 
    else:
        print("  No hourly mean data collected. Skipping Fig 4-2a.")
    
    # Fig 4-2c: Flow-size distribution
    if all_raw_samples_target_col_list:
        final_all_raw_samples_series = pd.concat(all_raw_samples_target_col_list, ignore_index=True)
        plot_global_eda(pd.DataFrame(), final_all_raw_samples_series, TARGET_COL) 
    else:
        print("  No raw samples collected. Skipping Fig 4-2c.")

    # Print Table 4-1: Router Inventory
    print("\n--- Table 4-1: Router Inventory (Raw Data & Hourly Aggregates) ---")
    router_inventory_df = pd.DataFrame(router_inventory_data)
    inventory_cols = ["Router ID", "Raw Flows (#)", "Hourly Rows (#)"]
    router_inventory_df = router_inventory_df[inventory_cols] 
    print(router_inventory_df.to_string())
    
    print("\n--- Notebook 1 Complete: Raw Data ETL & Anomaly Flagging ---")
    print(f"Check the '{OUT_DIR}' directory for processed hourly Parquet files (e.g., hourly_atlanta_processed_with_anomalies.parquet).")
    print(f"Check the '{FIG_DIR}' directory for generated figures.")

# Execute the main pipeline for this notebook
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


Environment setup complete.
Configuration loaded.
Input individual router Parquet files from: /mnt/nrdstor/ramamurthy/mhnarfth/internet2/parquet
Outputs will be saved to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4 (figs: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/figs)
Anomaly Detection contamination (initial): 0.01

--- Initiating Notebook 1: Raw Data ETL & Anomaly Flagging (Multi-Router) ---

Found 10 individual router Parquet files in /mnt/nrdstor/ramamurthy/mhnarfth/internet2/parquet.


Processing Routers:   0%|          | 0/10 [00:00<?, ?it/s]


==== Starting Full Processing for Router: atlanta (1/10) ====
  Loading raw data for atlanta from atlanta.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for atlanta. Partitions: 8
  Raw data for atlanta: 28032240 records.
  Collected 10000 raw samples for global histogram from atlanta.
  Performing Dask-native initial cleaning and hourly resampling for atlanta...
  Computed hourly data for atlanta: 1369 rows.
  Running Isolation Forest for Anomaly Detection for atlanta...


Processing Routers:  10%|█         | 1/10 [00:29<04:22, 29.18s/it]

  Isolation Forest detection complete for atlanta. Found 13 anomalies.
  Saving processed hourly data for atlanta to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_atlanta_processed_with_anomalies.parquet
  Processed hourly data for atlanta saved successfully.
  Skipping single-router EDA plots for atlanta (generating for El Paso as representative).

==== Starting Full Processing for Router: batonrouge (2/10) ====
  Loading raw data for batonrouge from batonrouge.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for batonrouge. Partitions: 9
  Raw data for batonrouge: 80056602 records.
  Collected 10001 raw samples for global histogram from batonrouge.
  Performing Dask-native initial cleaning and hourly resampling for batonrouge...
  Computed hourly data for batonrouge: 1343 rows.
  Running Isolation Forest for Anomaly Detection for batonrouge...


Processing Routers:  20%|██        | 2/10 [01:41<07:18, 54.79s/it]

  Isolation Forest detection complete for batonrouge. Found 14 anomalies.
  Saving processed hourly data for batonrouge to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_batonrouge_processed_with_anomalies.parquet
  Processed hourly data for batonrouge saved successfully.
  Skipping single-router EDA plots for batonrouge (generating for El Paso as representative).

==== Starting Full Processing for Router: boston (3/10) ====
  Loading raw data for boston from boston.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for boston. Partitions: 1
  Raw data for boston: 392891 records.
  Collected 10000 raw samples for global histogram from boston.
  Performing Dask-native initial cleaning and hourly resampling for boston...
  Computed hourly data for boston: 789 rows.
  Running Isolation Forest for Anomaly Detection for boston...


Processing Routers:  30%|███       | 3/10 [01:42<03:31, 30.25s/it]

  Isolation Forest detection complete for boston. Found 8 anomalies.
  Saving processed hourly data for boston to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_boston_processed_with_anomalies.parquet
  Processed hourly data for boston saved successfully.
  Skipping single-router EDA plots for boston (generating for El Paso as representative).

==== Starting Full Processing for Router: dallas (4/10) ====
  Loading raw data for dallas from dallas.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for dallas. Partitions: 17
  Raw data for dallas: 149784626 records.
  Collected 10001 raw samples for global histogram from dallas.
  Performing Dask-native initial cleaning and hourly resampling for dallas...
  Computed hourly data for dallas: 1322 rows.
  Running Isolation Forest for Anomaly Detection for dallas...


Processing Routers:  40%|████      | 4/10 [03:53<06:58, 69.73s/it]

  Isolation Forest detection complete for dallas. Found 14 anomalies.
  Saving processed hourly data for dallas to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_dallas_processed_with_anomalies.parquet
  Processed hourly data for dallas saved successfully.
  Skipping single-router EDA plots for dallas (generating for El Paso as representative).

==== Starting Full Processing for Router: elpaso (5/10) ====
  Loading raw data for elpaso from elpaso.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for elpaso. Partitions: 3
  Raw data for elpaso: 9357137 records.
  Collected 10000 raw samples for global histogram from elpaso.
  Performing Dask-native initial cleaning and hourly resampling for elpaso...
  Computed hourly data for elpaso: 1378 rows.
  Running Isolation Forest for Anomaly Detection for elpaso...
  Isolation Forest detection complete for elpaso. Found 14 anomalies.
  Saving processed hourly data for elpaso to: /home/rama

Processing Routers:  50%|█████     | 5/10 [04:06<04:06, 49.30s/it]

  Saved 4_3c_anomaly_counts_by_day_elpaso.png and 4_3c_anomaly_counts_by_day_elpaso.pdf

==== Starting Full Processing for Router: jackson (6/10) ====
  Loading raw data for jackson from jackson.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for jackson. Partitions: 4
  Raw data for jackson: 13401496 records.
  Collected 10000 raw samples for global histogram from jackson.
  Performing Dask-native initial cleaning and hourly resampling for jackson...
  Computed hourly data for jackson: 1369 rows.
  Running Isolation Forest for Anomaly Detection for jackson...


Processing Routers:  60%|██████    | 6/10 [04:21<02:30, 37.58s/it]

  Isolation Forest detection complete for jackson. Found 14 anomalies.
  Saving processed hourly data for jackson to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_jackson_processed_with_anomalies.parquet
  Processed hourly data for jackson saved successfully.
  Skipping single-router EDA plots for jackson (generating for El Paso as representative).

==== Starting Full Processing for Router: jacksonville (7/10) ====
  Loading raw data for jacksonville from jacksonville.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for jacksonville. Partitions: 5
  Raw data for jacksonville: 19425447 records.
  Collected 10000 raw samples for global histogram from jacksonville.
  Performing Dask-native initial cleaning and hourly resampling for jacksonville...
  Computed hourly data for jacksonville: 1338 rows.
  Running Isolation Forest for Anomaly Detection for jacksonville...


Processing Routers:  70%|███████   | 7/10 [04:42<01:37, 32.42s/it]

  Isolation Forest detection complete for jacksonville. Found 14 anomalies.
  Saving processed hourly data for jacksonville to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_jacksonville_processed_with_anomalies.parquet
  Processed hourly data for jacksonville saved successfully.
  Skipping single-router EDA plots for jacksonville (generating for El Paso as representative).

==== Starting Full Processing for Router: louisville (8/10) ====
  Loading raw data for louisville from louisville.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for louisville. Partitions: 1
  Raw data for louisville: 3150312 records.
  Collected 10000 raw samples for global histogram from louisville.
  Performing Dask-native initial cleaning and hourly resampling for louisville...
  Computed hourly data for louisville: 1466 rows.
  Running Isolation Forest for Anomaly Detection for louisville...


Processing Routers:  80%|████████  | 8/10 [04:47<00:47, 23.54s/it]

  Isolation Forest detection complete for louisville. Found 15 anomalies.
  Saving processed hourly data for louisville to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_louisville_processed_with_anomalies.parquet
  Processed hourly data for louisville saved successfully.
  Skipping single-router EDA plots for louisville (generating for El Paso as representative).

==== Starting Full Processing for Router: phoenix (9/10) ====
  Loading raw data for phoenix from phoenix.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for phoenix. Partitions: 3
  Raw data for phoenix: 8765751 records.
  Collected 10000 raw samples for global histogram from phoenix.
  Performing Dask-native initial cleaning and hourly resampling for phoenix...
  Computed hourly data for phoenix: 1369 rows.
  Running Isolation Forest for Anomaly Detection for phoenix...


Processing Routers:  90%|█████████ | 9/10 [04:57<00:19, 19.20s/it]

  Isolation Forest detection complete for phoenix. Found 14 anomalies.
  Saving processed hourly data for phoenix to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_phoenix_processed_with_anomalies.parquet
  Processed hourly data for phoenix saved successfully.
  Skipping single-router EDA plots for phoenix (generating for El Paso as representative).

==== Starting Full Processing for Router: reno (10/10) ====
  Loading raw data for reno from reno.parquet with selected columns...
  Loaded Dask DataFrame (lazy) for reno. Partitions: 4
  Raw data for reno: 35499401 records.
  Collected 10000 raw samples for global histogram from reno.
  Performing Dask-native initial cleaning and hourly resampling for reno...
  Computed hourly data for reno: 861 rows.
  Running Isolation Forest for Anomaly Detection for reno...


Processing Routers: 100%|██████████| 10/10 [05:32<00:00, 33.27s/it]

  Isolation Forest detection complete for reno. Found 9 anomalies.
  Saving processed hourly data for reno to: /home/ramamurthy/mhnarfth/network_analysis/individually process kora/outputs/ch4/hourly_reno_processed_with_anomalies.parquet
  Processed hourly data for reno saved successfully.
  Skipping single-router EDA plots for reno (generating for El Paso as representative).

==== All Routers Processed. Generating Global EDA Plots ====

--- Generating Global EDA Plots (Figures 4-2a, 4-2c) ---
  Generating Figure 4-2a (10-Router Daily Curves)...





  Saved 4_2a_daily_curves_all_routers.png and 4_2a_daily_curves_all_routers.pdf
  No raw samples data for flow-size histogram. Skipping Fig 4-2c.

--- Generating Global EDA Plots (Figures 4-2a, 4-2c) ---
  No hourly mean data collected. Skipping Fig 4-2a.
  Generating Figure 4-2c (Flow Size Distribution - All Routers Sample)...
  Saved 4_2c_flow_size_histogram_all_routers.png and 4_2c_flow_size_histogram_all_routers.pdf

--- Table 4-1: Router Inventory (Raw Data & Hourly Aggregates) ---
      Router ID  Raw Flows (#)  Hourly Rows (#)
0       atlanta       28032240             1369
1    batonrouge       80056602             1343
2        boston         392891              789
3        dallas      149784626             1322
4        elpaso        9357137             1378
5       jackson       13401496             1369
6  jacksonville       19425447             1338
7    louisville        3150312             1466
8       phoenix        8765751             1369
9          reno       354994