# Individual Analysis of Online Anomaly Detectors with Pfeiffer Vacuum (PV) Datasets

In [2]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import itertools
# Notebook-wide matplotlib text sizes (in points).
params = {
    # Base text, axis labels and figure title use the larger size.
    'font.size': 12,
    'axes.labelsize': 12,
    'figure.titlesize': 12,
    # Tick labels and legend entries use the smaller size.
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
}
plt.rcParams.update(params)

from pathlib import Path

import matplotlib.cm as cm
import matplotlib.dates as mdates
from pathlib import Path
import re
from sklearn.metrics import roc_auc_score, auc, precision_recall_curve, roc_curve

from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:

# Default configuration key per detector, written as
# "<method>_<window>_<parameter dict repr>". Every detector uses window 120 here.
_fixed_window_keys = [
    "HStree_120_{'anomaly_threshold': 0.5, 'max_depth': 15, 'number_of_trees': 25, 'size_limit': 0.1}",
    "xStream_120_{'depth': 25, 'n_chains': 100, 'num_components': 100}",
    "RSHash_120_{'decay': 0.015, 'feature_maxes': [10000], 'feature_mins': [0], 'num_components': 100, 'num_hash_fns': 1}",
    "ExactStorm_120_{'max_radius': 0.1}",
    "RobustRandomCutForest_120_{'num_trees': 4, 'tree_size': 256}",
    "OnlineBootKNN_120_{'algorithm': 'brute', 'alpha': 0.05, 'chunk_size': 240, 'dmetric': 'cityblock', 'ensemble_size': 240, 'n_jobs': -1, 'transf': 'ZNORM'}",
    "oIF_120_{'growth_criterion': 'adaptive', 'max_leaf_samples': 32, 'n_jobs': -1, 'num_trees': 32}",
    "IForestASD_120_{'initial_window_X': None}",
    "KitNet_120_{'hidden_ratio': 0.75, 'learning_rate': 0.1, 'max_size_ae': 10}",
]
DEFAULT_PARAMS_FIXED_WINDOW = pd.array(_fixed_window_keys, dtype=object)

In [None]:

# Default configuration key per detector, written as
# "<method>_<window>_<parameter dict repr>". The window differs per detector
# here (60, 120 or 240).
_per_method_window_keys = [
    "HStree_60_{'anomaly_threshold': 0.5, 'max_depth': 15, 'number_of_trees': 25, 'size_limit': 0.1}",
    "xStream_60_{'depth': 25, 'n_chains': 100, 'num_components': 100}",
    "RSHash_120_{'decay': 0.015, 'feature_maxes': [10000], 'feature_mins': [0], 'num_components': 100, 'num_hash_fns': 1}",
    "ExactStorm_120_{'max_radius': 0.1}",
    "OnlineBootKNN_120_{'algorithm': 'brute', 'alpha': 0.05, 'chunk_size': 240, 'dmetric': 'cityblock', 'ensemble_size': 240, 'n_jobs': -1, 'transf': 'ZNORM'}",
    "RobustRandomCutForest_240_{'num_trees': 4, 'tree_size': 256}",
    "oIF_240_{'growth_criterion': 'adaptive', 'max_leaf_samples': 32, 'n_jobs': -1, 'num_trees': 32}",
    "IForestASD_240_{'initial_window_X': None}",
    "KitNet_240_{'hidden_ratio': 0.75, 'learning_rate': 0.1, 'max_size_ae': 10}",
]
DEFAULT_PARAMS_NO_FIXED_WINDOW = pd.array(_per_method_window_keys, dtype=object)


In [None]:
# Active configuration set: process_file() keeps only rows whose
# "<method>_<window>_<param>" key is contained in PARAMS.
PARAMS=DEFAULT_PARAMS_NO_FIXED_WINDOW

In [None]:
# Distinct window sizes referenced by the active configuration: the window is
# the first run of digits enclosed by underscores in each key. The values stay
# strings, so np.unique orders them lexicographically.
_window_pattern = re.compile(r'_(\d+)_')
_windows = []
for _key in PARAMS:
    _windows.append(_window_pattern.search(_key).group(1))
WINDOW_LIST = np.unique(_windows)

print(WINDOW_LIST)

In [None]:
# Project root = parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a first-level
# sub-folder of the project (figures are later saved under
# current_dir / 'notebooks') — confirm before running from elsewhere.
current_dir = Path.cwd().parent

DATA_PATH = current_dir / 'datasets' / 'processed'

# Catalogue of every processed scenario folder known to this notebook.
ALL_DATASET_PATHS = [
    DATA_PATH / 'processed_server22_A1',
    DATA_PATH / 'processed_server22_A2',
    DATA_PATH / 'processed_server22_A3',
    DATA_PATH / 'processed_server21_A4',
    DATA_PATH / 'processed_server21_A5',
    DATA_PATH / 'processed_L40S02_A6',
    DATA_PATH / 'processed_server18_A7',
    DATA_PATH / 'processed_server18_A8',
    DATA_PATH / 'processed_server18_A9',
]

# Scenario suffixes (last "_"-separated token of the folder name) analysed in
# this run. Previously the full list above was silently overwritten by a second
# `dataset_paths` assignment; the selection is now explicit.
SELECTED_SCENARIOS = ('A6',)

dataset_paths = [
    path for path in ALL_DATASET_PATHS
    if path.name.split('_')[-1] in SELECTED_SCENARIOS
]

def process_file(file_path, scenario):
    """Load one result workbook and keep only the configurations listed in ``PARAMS``.

    Parameters
    ----------
    file_path : pathlib.Path
        Excel file to read. The window size is taken from the last
        ``_``-separated token of the file name, with ``.xlsx`` removed.
    scenario : str
        Label written to the ``scenario`` column of the returned frame.

    Returns
    -------
    pandas.DataFrame or None
        Rows whose ``"<method>_<window_size>_<param>"`` key is in the
        module-level ``PARAMS``, with the added columns ``window_size``,
        ``method_window_and_param`` and ``scenario``. ``None`` when no row
        matches or when reading/processing the file raises.
    """
    try:
        print(f"Processing file: {file_path.name} for scenario: {scenario}")
        df = pd.read_excel(file_path)
        df["window_size"] = file_path.name.split("_")[-1].replace(".xlsx", "")
        df["method_window_and_param"] = df["method"] + "_" + df["window_size"] + "_" + df["param"]

        # Explicit copy: assigning a column onto a boolean-mask slice would
        # otherwise trigger pandas' SettingWithCopyWarning.
        selected = df[df["method_window_and_param"].isin(PARAMS)].copy()
        if selected.empty:
            return None

        selected["scenario"] = scenario
        return selected
    except Exception as e:
        # Best-effort by design: one unreadable workbook must not abort the
        # whole load, so the error is reported and the file is skipped.
        print(f"Error processing {file_path.name}: {e}")
        return None

def process_path(path):
    """Load every matching result workbook found directly inside ``path``.

    Parameters
    ----------
    path : pathlib.Path
        Dataset folder. Its scenario label is the last ``_``-separated token
        of the folder name.

    Returns
    -------
    list
        The non-``None`` results of ``process_file`` for each ``*.xlsx`` file
        whose name starts with ``'A'``, ordered by sorted file path. Empty
        when ``path`` is not a directory or nothing matches.
    """
    scenario = path.name.split("_")[-1]
    print(f"Processing path: {path.name}")
    if not path.is_dir():
        return []

    # Sorted so the result order does not depend on the directory listing order.
    files = sorted(
        f for f in path.iterdir()
        if f.suffix == '.xlsx' and f.name.startswith('A')
    )
    print(f"Total Files for the Scenario {scenario}: {len(files)}")

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order (as_completed yields
        # them in completion order), so repeated runs give the same row order.
        results = list(executor.map(lambda f: process_file(f, scenario), files))

    return [df for df in results if df is not None]

# Main execution: gather the per-file frames of every selected dataset folder.
all_dfs = []
for dataset_path in dataset_paths:
    all_dfs.extend(process_path(dataset_path))

# pd.concat raises a bare "No objects to concatenate" on an empty list; fail
# with a message that names the folders that were searched instead.
if not all_dfs:
    raise ValueError(
        "No rows matching PARAMS were loaded from: "
        f"{[str(p) for p in dataset_paths]}"
    )

# Concatenate all DataFrames at once
concatenated_df = pd.concat(all_dfs, ignore_index=True)


In [None]:
# Scenario label used in the output file names: the first distinct value of
# the 'scenario' column. Any further scenarios in the data are ignored here.
SCENARIO = concatenated_df['scenario'].drop_duplicates().iloc[0]
SCENARIO

In [None]:
# Normalise dtypes once: timestamps as datetime64, scores as float64.
concatenated_df = concatenated_df.assign(
    timestamp=pd.to_datetime(concatenated_df["timestamp"]),
    cleaned_score=concatenated_df["cleaned_score"].astype('float64'),
)


In [None]:
# Exclude detectors whose method name matches any of these patterns.
regex_filter = r"OnlineBootMGV|LODA|Py"
is_excluded = concatenated_df['method'].str.contains(regex_filter, regex=True)

# .copy() makes filtered_data an independent frame. This cell and later ones
# assign columns on it, which on a plain boolean-mask slice raises
# SettingWithCopyWarning.
filtered_data = concatenated_df[~is_excluded].copy()
#filtered_data = filtered_data[filtered_data['method'].str.contains("std_p")]
#filtered_data = filtered_data[filtered_data['method'].str.contains("None|FOD|SOD|DIL|QUANT")]
#filtered_data = filtered_data[filtered_data['method'].str.contains("QUANT")]
#filtered_data = filtered_data[filtered_data.method.str.contains("OnlineBootGP|OnlineBootKNN")]

# Idempotent dtype normalisation, so this cell also works on its own.
filtered_data["timestamp"] = pd.to_datetime(filtered_data["timestamp"])
filtered_data["cleaned_score"] = filtered_data["cleaned_score"].astype('float64')


In [None]:
# Preferred plotting order of the detectors (fixed so colours stay stable
# between runs).
METHOD_ORDER = ['IForestASD', 'KitNet', 'oIF', 'RobustRandomCutForest', 'xStream', 'HStree',
                'RSHash', 'ExactStorm', 'OnlineBootKNN']

# Keep only detectors that actually have rows in filtered_data. A hard-coded
# name without data would otherwise give empty plots or fail in the
# idxmax-based annotations further down.
available_methods = set(filtered_data.method.unique())
methods = [m for m in METHOD_ORDER if m in available_methods]

missing_methods = [m for m in METHOD_ORDER if m not in available_methods]
if missing_methods:
    print(f"No data loaded for: {missing_methods}")
print(methods)


In [None]:
# Show the configuration keys that were used to filter the loaded results.
print(PARAMS)

In [None]:
# Distinct "<method>_<window>_<param>" keys that survived filtering, in order
# of first appearance.
method_window_and_param = (
    filtered_data["method_window_and_param"]
    .drop_duplicates()
    .to_numpy()
)
print(method_window_and_param)

In [None]:
# Number of distinct method/window/parameter configurations present in the
# filtered data; the colour list built later has this many entries.
num_colors = len(method_window_and_param)
print("Total Hyperparameter Tuning: ", num_colors)


In [None]:
#methods = np.array(['RSHash', 'RobustRandomCutForest', 'KitNet', 'ExactStorm', 'IForestASD', 'oIF', 'HStree', 'OnlineBootKNN','xStream'], dtype=object)

In [None]:
# Ten-colour qualitative palette.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']


# One colour per configuration. The index wraps around the palette so that
# more than len(colors) configurations reuse colours instead of raising
# IndexError.
color_list = [colors[i % len(colors)] for i in range(num_colors)]

print(color_list)  # Display the colours that will be used

In [None]:
# Rows with a missing 'error_type_score' are labelled "No Error"; then count
# the non-null cleaned scores per method and error type.
filtered_data['error_type_score'] = filtered_data['error_type_score'].fillna("No Error")
pd.pivot_table(
    filtered_data,
    values='cleaned_score',
    index='method',
    columns='error_type_score',
    aggfunc='count',
)


In [None]:
# At most the first ten distinct iteration identifiers, in order of appearance.
iterations = filtered_data["iteration"].unique()[:10]
iterations

## Individual Score of Online Anomaly Detectors with Pfeiffer Vacuum Datasets 

In [None]:
# For every (method, timestamp) pair, aggregate over all matching rows:
# mean and standard deviation of the cleaned score, and mean ground truth.
average_results = (
    filtered_data
    .groupby(['method', 'timestamp'])
    .agg(
        mean_score=pd.NamedAgg(column='cleaned_score', aggfunc='mean'),
        std_score=pd.NamedAgg(column='cleaned_score', aggfunc='std'),
        mean_gt=pd.NamedAgg(column='ground_truth', aggfunc='mean'),
    )
    .reset_index()
)


In [None]:
# For every detector: plot the averaged score with a ±1 std band against the
# mean ground truth (secondary y-axis) and save one PDF per detector.
score_dir = current_dir / 'notebooks' / 'score_functions'
score_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

for m, color in zip(methods, color_list):
    fig, ax1 = plt.subplots(figsize=(12, 8))

    # Rows of the current method
    method_data = average_results[average_results["method"] == m]

    # Mean score
    ax1.plot(
        method_data['timestamp'],
        method_data['mean_score'],
        label=f'{m} (Averaged Iterations) with Param',
        color=color
    )

    # Standard deviation as a shaded band around the mean
    # (label fixed: the "±" sign was mis-encoded as "Â±").
    ax1.fill_between(
        method_data['timestamp'],
        method_data['mean_score'] - method_data['std_score'],
        method_data['mean_score'] + method_data['std_score'],
        color=color, alpha=0.3, label=f'Method {m} (± Std Dev)'
    )

    ax1.set_xlabel('Time')
    ax1.set_ylabel('Average Score in Time')
    ax1.set_title(f"Method {m} Score")

    # Ground truth on a secondary y-axis sharing the same x-axis
    ax2 = ax1.twinx()
    ax2.plot(
        method_data['timestamp'],
        method_data['mean_gt'],
        label='Ground Truth',
        color='grey', linestyle='--',
        linewidth=0.9
    )
    ax2.set_ylabel('Ground Truth')

    # Fixed range for the ground-truth axis; the small headroom above 1 keeps
    # a line at 1.0 from sitting exactly on the frame.
    ax2.set_ylim(0, 1.01)

    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')
    ax1.grid(False)

    # Show only hours:minutes on the x-axis
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))

    # Rotate the visible tick labels. plt.xticks() acts on the current axes,
    # which is the twin axis after twinx(), so the rotation is set on ax1.
    ax1.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    fig.savefig(score_dir / f"{SCENARIO}_score_{m}.pdf", format="pdf")

    plt.show()
    plt.close(fig)  # release the figure; one is created per method


In [None]:

# For every detector: draw the ROC curve (left) and the precision-recall curve
# (right) of each iteration on one figure.
for m, color in zip(methods, color_list):
    f_method_data = filtered_data[filtered_data['method'] == m]

    fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

    # Chance-level diagonal, drawn once per figure instead of once per iteration
    ax_roc.plot([0, 1], [0, 1], color='gray', linestyle='--')

    for i in f_method_data['iteration'].unique():
        print(f"Method: {m}, Iteration: {i}")
        method_data = f_method_data[f_method_data['iteration'] == i]

        # True labels and anomaly scores of this iteration
        y_true = method_data['ground_truth'].values
        y_scores = method_data['cleaned_score'].values

        # ROC curve and its area
        fpr, tpr, _ = roc_curve(y_true, y_scores)
        roc_auc = auc(fpr, tpr)

        # Precision-recall curve and its area
        precision, recall, _ = precision_recall_curve(y_true, y_scores)
        pr_auc = auc(recall, precision)

        # Line width grows with the iteration id so iterations can be told
        # apart (assumes numeric iteration ids — TODO confirm).
        line_width = 1 + i * 0.5
        ax_roc.plot(fpr, tpr, color=color, lw=line_width,
                    label=f'ROC (AUC = {roc_auc:.3f}) Iteration {i}')
        ax_pr.plot(recall, precision, color=color, lw=line_width,
                   label=f'PR (AUC = {pr_auc:.3f}) Iteration {i}')

    # Static decorations are set once, after all curves have been added
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'ROC Curve for {m}')
    ax_roc.legend(loc='lower right')

    ax_pr.set_xlabel('Recall')
    ax_pr.set_ylabel('Precision')
    ax_pr.set_title(f'Precision-Recall Curve for {m}')
    ax_pr.legend(loc='lower left')

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; one is created per method




## Individual Training Time of Online Anomaly Detectors with PV Datasets 

In [None]:
# Accumulated mean training time per detector. The shaded band is the running
# sum of (mean - std) and (mean + std) across iterations.
output_dir = current_dir / 'notebooks' / 'img_training_and_scoring_time'
output_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots(figsize=(12, 8))

# Mean and standard deviation of the training time per (method, timestamp)
grouped_data = filtered_data.groupby(['method', 'timestamp']).agg(
    mean_training_time_t=('training_time', 'mean'),
    std_training_time_t=('training_time', 'std')
).reset_index()

for m, color in zip(methods, color_list):

    method_data = grouped_data[grouped_data["method"] == m]
    if method_data.empty:
        # Nothing loaded for this method: skip it rather than plot an empty series.
        continue

    mean_t = method_data['mean_training_time_t']
    std_t = method_data['std_training_time_t']
    instance_index = range(len(method_data))  # x-axis: 0 .. n-1

    # Running total of the mean training time
    cumulative_training_time = mean_t.expanding().sum()

    ax.plot(
        instance_index,
        cumulative_training_time,
        label=f'{m}',
        color=color
    )

    ax.fill_between(
        instance_index,
        (mean_t - std_t).expanding().sum(),
        (mean_t + std_t).expanding().sum(),
        color=color,
        alpha=0.2
    )

    if m in ["OnlineBootKNN", "xStream", "KitNet"]:
        max_time = cumulative_training_time.max()
        # The last plotted point is at index n-1. The previous anchor (n) was
        # one step past the end of the curve.
        last_index = len(method_data) - 1
        ax.annotate(
            f'Max: {max_time:,.2f}',  # thousands separator, 2 decimals
            xy=(last_index, max_time),
            xytext=(last_index, max_time * 1.03),  # slightly above the curve end
            color=color,
            ha='center'
        )

ax.set_xlabel('# Instances')
ax.set_ylabel('Accumulative Training Time in Seconds')
ax.legend(loc='upper left')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

fig.savefig(output_dir / f"{SCENARIO}_accum_training_time.pdf", format="pdf")
plt.show()


In [None]:
# Mean training time per timestamp for every detector, with a ±1 std band and
# an annotation at each detector's maximum.
fig, ax = plt.subplots(figsize=(12, 8))

# Mean and standard deviation of the training time per (method, timestamp)
grouped_data = filtered_data.groupby(['method', 'timestamp']).agg(
    mean_training_time_t=('training_time', 'mean'),
    std_training_time_t=('training_time', 'std')
).reset_index()


for m, color in zip(methods, color_list):

    method_data = grouped_data[grouped_data["method"] == m]
    training_time = method_data['mean_training_time_t']

    if training_time.dropna().empty:
        # No usable values: idxmax() below would fail, so skip this method.
        continue

    ax.plot(
        method_data['timestamp'],
        training_time,
        label=f'{m}',
        color=color
    )

    std_t = method_data['std_training_time_t']
    ax.fill_between(
        method_data['timestamp'],
        training_time - std_t,
        training_time + std_t,
        color=color,
        alpha=0.2
    )

    # Locate the maximum and the timestamp at which it occurs
    max_time = training_time.max()
    max_timestamp = method_data.loc[training_time.idxmax(), 'timestamp']

    ax.annotate(
        f'Max: {max_time:,.2f}',  # thousands separator, 2 decimals
        xy=(max_timestamp, max_time),
        xytext=(max_timestamp, max_time * 1.03),  # slightly above the peak
        color=color,
        ha='center'
    )

# Show only hours:minutes on the x-axis
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax.set_xlabel('Time')
ax.set_ylabel('Training Time in Seconds')
ax.legend(loc='best')
ax.set_title("Training Time")
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()

# Optional export (relative to the project root instead of an absolute home path):
# fig.savefig(current_dir / 'notebooks' / 'img_training_and_scoring_time' / f"{SCENARIO}_training_time.png", format="png")
plt.show()


In [None]:
# Step 1: mean training time per (method, timestamp).
per_timestamp_training = (
    filtered_data
    .groupby(['method', 'timestamp'])['training_time']
    .mean()
    .rename('mean_training_time_t')
    .reset_index()
)

# Step 2: summarise those per-timestamp means for each method and order the
# table from fastest to slowest mean.
training_time_stats = (
    per_timestamp_training
    .groupby('method')['mean_training_time_t']
    .agg(
        mean_time='mean',
        median_time='median',
        min_time='min',
        max_time='max',
        std_dev_time='std',
    )
    .reset_index()
    .sort_values(by='mean_time', ascending=True)
)

# Display summary
training_time_stats

## Individual Scoring Time of Online Anomaly Detectors with PV Datasets 

In [None]:
# Accumulated mean scoring time per detector. The shaded band is the running
# sum of (mean - std) and (mean + std) across iterations.
output_dir = current_dir / 'notebooks' / 'img_training_and_scoring_time'
output_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots(figsize=(12, 8))

# Mean and standard deviation of the scoring time per (method, timestamp)
grouped_data = filtered_data.groupby(['method', 'timestamp']).agg(
    mean_scoring_time_t=('scoring_time', 'mean'),
    std_scoring_time_t=('scoring_time', 'std')
).reset_index()

for m, color in zip(methods, color_list):

    method_data = grouped_data[grouped_data["method"] == m]
    if method_data.empty:
        # Nothing loaded for this method: skip it rather than plot an empty series.
        continue

    mean_t = method_data['mean_scoring_time_t']
    std_t = method_data['std_scoring_time_t']
    instance_index = range(len(method_data))  # x-axis: 0 .. n-1

    # Running total of the mean scoring time
    cumulative_scoring_time = mean_t.expanding().sum()

    ax.plot(
        instance_index,
        cumulative_scoring_time,
        label=f'{m}',
        color=color
    )

    ax.fill_between(
        instance_index,
        (mean_t - std_t).expanding().sum(),
        (mean_t + std_t).expanding().sum(),
        color=color,
        alpha=0.2
    )

    if m in ["OnlineBootKNN", "xStream", "KitNet"]:
        max_time = cumulative_scoring_time.max()
        # The last plotted point is at index n-1. The previous anchor (n) was
        # one step past the end of the curve.
        last_index = len(method_data) - 1
        ax.annotate(
            f'Max: {max_time:,.2f}',  # thousands separator, 2 decimals
            xy=(last_index, max_time),
            xytext=(last_index, max_time * 1.03),  # slightly above the curve end
            color=color,
            ha='center'
        )

ax.set_xlabel('# Instances')
ax.set_ylabel('Accumulative Scoring Time in Seconds')
ax.legend(loc='upper left')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

fig.savefig(output_dir / f"{SCENARIO}_accum_scoring_time.pdf", format="pdf")
plt.show()


In [None]:
# Mean scoring time per timestamp for every detector, with a ±1 std band and
# an annotation at each detector's maximum.
fig, ax = plt.subplots(figsize=(12, 8))

# Mean and standard deviation of the scoring time per (method, timestamp)
grouped_data = filtered_data.groupby(['method', 'timestamp']).agg(
    mean_scoring_time_t=('scoring_time', 'mean'),
    std_scoring_time_t=('scoring_time', 'std')
).reset_index()


for m, color in zip(methods, color_list):

    method_data = grouped_data[grouped_data["method"] == m]
    scoring_time = method_data['mean_scoring_time_t']

    if scoring_time.dropna().empty:
        # No usable values: idxmax() below would fail, so skip this method.
        continue

    ax.plot(
        method_data['timestamp'],
        scoring_time,
        label=f'{m}',
        color=color
    )

    std_t = method_data['std_scoring_time_t']
    ax.fill_between(
        method_data['timestamp'],
        scoring_time - std_t,
        scoring_time + std_t,
        color=color,
        alpha=0.2
    )

    # Locate the maximum and the timestamp at which it occurs
    max_time = scoring_time.max()
    max_timestamp = method_data.loc[scoring_time.idxmax(), 'timestamp']

    ax.annotate(
        f'Max: {max_time:,.2f}',  # thousands separator, 2 decimals
        xy=(max_timestamp, max_time),
        xytext=(max_timestamp, max_time * 1.03),  # slightly above the peak
        color=color,
        ha='center'
    )

# Show only hours:minutes on the x-axis
ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
ax.set_xlabel('Time')
ax.set_ylabel('Scoring Time in Seconds')
ax.legend(loc='best')
# Title is set before tight_layout so the layout pass accounts for it.
ax.set_title("Scoring Time")
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()

# Optional export (relative to the project root instead of an absolute home path):
# fig.savefig(current_dir / 'notebooks' / 'img_training_and_scoring_time' / f"{SCENARIO}_scoring_time.png", format="png")
plt.show()


In [None]:
# Step 1: mean scoring time per (method, timestamp).
per_timestamp_scoring = (
    filtered_data
    .groupby(['method', 'timestamp'])['scoring_time']
    .mean()
    .rename('mean_scoring_time_t')
    .reset_index()
)

# Step 2: summarise those per-timestamp means for each method and order the
# table from fastest to slowest mean.
scoring_time_stats = (
    per_timestamp_scoring
    .groupby('method')['mean_scoring_time_t']
    .agg(
        mean_time='mean',
        median_time='median',
        min_time='min',
        max_time='max',
        std_dev_time='std',
    )
    .reset_index()
    .sort_values(by='mean_time', ascending=True)
)

# Display summary
scoring_time_stats

In [None]:
# Per-row total cost: training time plus scoring time.
filtered_data['total_time'] = filtered_data['training_time'].add(filtered_data['scoring_time'])

In [None]:

# --- 1. SETUP PUBLICATION STYLE ---
# Font sizes follow the target document (10pt main text, nothing below 7pt).
# NOTE: this updates matplotlib's global rcParams, so every cell (re-)run
# after this one is rendered with the publication style as well.
params = {
    'font.size': 10,              # Main text size
    'axes.labelsize': 10,         # X/Y label size
    'axes.titlesize': 10,         # Title size
    'xtick.labelsize': 8,         # Tick size (slightly smaller, but >7pt)
    'ytick.labelsize': 8,         # Tick size
    'legend.fontsize': 8,         # Legend size
    'figure.titlesize': 12,
    'lines.linewidth': 1.5,
    'lines.markersize': 6,
    'font.family': 'serif',       # Serif face, as in typical academic papers
    'text.usetex': False          # Set to True if LaTeX is installed for math rendering
}
plt.rcParams.update(params)

# Distinct line-style/marker pairs for grayscale safety. The two lists have
# coprime lengths (7 and 9) and are cycled independently, so a pair only
# repeats after 63 methods. Zipping two 7-element lists repeated it at the 8th.
line_styles = ['-', '--', '-.', ':', (0, (3, 1, 1, 1)), (0, (5, 10)), (0, (1, 1))]
markers = ['o', 's', '^', 'v', 'D', 'X', '*', 'P', 'h']
style_cycler = zip(itertools.cycle(line_styles), itertools.cycle(markers))

# --- 2. DATA PROCESSING ---
# Mean and standard deviation of the total (training + scoring) time per
# (method, timestamp).
grouped_data = filtered_data.groupby(['method', 'timestamp']).agg(
    mean_total_time_t=('total_time', 'mean'),
    std_total_time_t=('total_time', 'std')
).reset_index()

# --- 3. PLOTTING ---
# figsize=(3.5, 2.5) targets a single-column figure; use (7, 4.5) for a
# full-width (two-column span) figure.
fig, ax = plt.subplots(figsize=(3.5, 2.5))

for m, color in zip(methods, color_list):

    method_data = grouped_data[grouped_data["method"] == m]
    if method_data.empty:
        # Nothing loaded for this method: skip it rather than plot an empty series.
        continue

    ls, mk = next(style_cycler)

    mean_t = method_data['mean_total_time_t']
    std_t = method_data['std_total_time_t']
    instance_index = range(len(method_data))  # x-axis: 0 .. n-1

    # Running total of the mean total time
    cumulative_total_time = mean_t.expanding().sum()

    # Error band: running sums of mean ± std. Standard deviations do not
    # strictly add linearly; this is a visual approximation only.
    upper_bound = (mean_t + std_t).expanding().sum()
    lower_bound = (mean_t - std_t).expanding().sum()

    # Roughly ten markers per line keeps the curve readable
    marker_interval = max(1, len(method_data) // 10)

    ax.plot(
        instance_index,
        cumulative_total_time,
        label=f'{m}',
        color=color,
        linestyle=ls,
        marker=mk,
        markevery=marker_interval,
        markersize=6,
        linewidth=1.5
    )

    # Shaded error region
    ax.fill_between(
        instance_index,
        lower_bound,
        upper_bound,
        color=color,
        alpha=0.15,
        edgecolor=None
    )

    # Annotate the final accumulated value for the methods of interest
    if m in ["OnlineBootKNN", "xStream", "KitNet"]:
        max_time = cumulative_total_time.max()
        # The last plotted point is at index n-1. The previous anchor (n) was
        # one step past the end of the curve.
        last_index = len(method_data) - 1
        ax.annotate(
            f'Max: {max_time:,.2f}',
            xy=(last_index, max_time),
            # Offset the text slightly so it does not overlap the line end
            xytext=(last_index, max_time * 1.05),
            color='black',
            fontsize=8,        # Compliant: > 7pt
            fontweight='bold',
            ha='center',
            # White background keeps the text readable over grid lines
            bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="none", alpha=0.7)
        )

# --- 4. FORMATTING ---
ax.set_xlabel('# Total Processed Instances')
ax.set_ylabel('Accumulative Time (s)')

# Grid helps reading values in grayscale
ax.grid(True, which='major', linestyle='--', linewidth=0.5, alpha=0.7)

# Legend settings chosen so it fits the small figure
ax.legend(loc='upper left', frameon=True, fontsize=8, fancybox=False, edgecolor='black')

ax.tick_params(axis='x', labelrotation=45)

fig.tight_layout()

# bbox_inches='tight' ensures no labels are cut off in the saved file
output_path = current_dir / 'notebooks' / 'img_training_and_scoring_time' / f"{SCENARIO}_accum_total_time.pdf"
output_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders
fig.savefig(output_path, format="pdf", bbox_inches='tight')

plt.show()

NameError: name 'filtered_data' is not defined

<Figure size 700x450 with 0 Axes>