# Model Evaluation Pipeline for K-Means Anomaly Detection

This notebook implements an evaluation step for our K-Means clustering-based anomaly detection model. We'll assess the model's performance by analyzing the distances between data points and their assigned cluster centroids, using percentile-based thresholds to identify anomalies.

## Distance-Based Anomaly Scoring
- Calculate the Euclidean distance between each point and its assigned cluster centroid
- Establish anomaly thresholds based on distance percentiles
- Points with distances above the threshold are flagged as potential anomalies

## Expected Outputs

1. **Metric Reports**
   - Overall model performance metrics
   - Anomaly detection results

2. **Visualizations**
   - Confusion matrix

## Use Cases

This evaluation pipeline helps validate the effectiveness of our K-Means clustering model for anomaly detection and provides insights for threshold tuning and model optimization.

Use cases for anomaly detection:
- Quality control in manufacturing
- Network traffic analysis
- System performance monitoring
- Fraud detection
- Outlier identification in sensor data

In [None]:
!pip install sagemaker==2.227.0 --quiet

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.function_step import step
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.parameters import ParameterFloat, ParameterInteger, ParameterBoolean, ParameterString
from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig
from sagemaker import get_execution_role
import sagemaker
import os

In [None]:
# Execution role, session and default bucket used by the rest of the notebook.
role = get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix = 'pipelines'

# Defaults for the SageMaker Python SDK's RemoteFunction module, rendered with
# the bucket/prefix resolved above.
config_yaml = f"""
SchemaVersion: '1.0'
SageMaker:
  PythonSDK:
    Modules:
      RemoteFunction:
        # role arn is not required if in SageMaker Notebook instance or SageMaker Studio
        # Uncomment the following line and replace with the right execution role if in a local IDE
        # RoleArn: <replace the role arn here>
        S3RootUri: s3://{bucket}/{prefix}
        InstanceType: ml.m5.xlarge
        Dependencies: ./requirements.txt
        IncludeLocalWorkDir: true
        PreExecutionCommands:
        - "sudo chmod -R 777 /opt/ml/model"
        CustomFileFilter:
          IgnoreNamePatterns:
          - "data/*"
          - "models/*"
          - "*.ipynb"
          - "__pycache__"

"""

# Write the config next to the notebook. The context manager guarantees the
# file is flushed and closed before anything else tries to read it (the
# original passed an unclosed open() handle straight to print()).
with open('config.yaml', 'w') as config_file:
    print(config_yaml, file=config_file)

In [None]:
%store -r

env_variables={
    'MLFLOW_TRACKING_URI': mlflow_arn,
    'MLFLOW_EXPERIMENT_NAME': ExecutionVariables.PIPELINE_NAME,
    'MLFLOW_RUN_NAME': ExecutionVariables.PIPELINE_EXECUTION_ID}

In [None]:
%%writefile ./steps/evaluation.py
import os
import numpy as np
import mlflow
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from mlflow import MlflowClient


def evaluation(x_test, y_test, model, percentile=95, run_id=None):
    """Evaluate a clustering model as an anomaly detector and log results to MLflow.

    Every test row is scored with the value ``model.transform`` reports for
    the cluster that ``model.predict`` assigns it to (for scikit-learn
    ``KMeans`` this is the distance to the assigned centroid). Rows whose
    score is strictly greater than the ``percentile``-th percentile of all
    scores are predicted as anomalies (1); the rest are predicted normal (0).

    The tracking URI and experiment name are read from the
    ``MLFLOW_TRACKING_URI`` and ``MLFLOW_EXPERIMENT_NAME`` environment
    variables. Metrics and the confusion-matrix figure are logged to a nested
    run named ``evaluation`` under the run identified by ``run_id``.

    Args:
        x_test: Test features passed to ``model.predict`` / ``model.transform``.
        y_test: Ground-truth labels where 1 marks an anomaly. Must support
            ``reset_index`` and ``.loc`` (presumably a pandas Series).
        model: Fitted model exposing ``predict`` and ``transform``.
        percentile: Percentile (0-100) of the scores used as the threshold.
        run_id: Parent MLflow run id. When falsy, a new parent run named
            after the ``MLFLOW_RUN_NAME`` environment variable is created.

    Returns:
        None. All results are printed and logged to MLflow.

    Raises:
        KeyError: If a required ``MLFLOW_*`` environment variable is missing.
    """
    # set mlflow experiment and server
    mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
    mlflow.set_experiment(os.environ['MLFLOW_EXPERIMENT_NAME'])

    if not run_id:
        # No parent run supplied: create one so the nested run has a parent.
        experiment = mlflow.get_experiment_by_name(os.environ['MLFLOW_EXPERIMENT_NAME'])
        parent_run = MlflowClient().create_run(
            experiment_id=experiment.experiment_id,
            run_name=os.environ['MLFLOW_RUN_NAME'],
        )
        run_id = parent_run.info.run_id

    with mlflow.start_run(run_id=run_id):

        with mlflow.start_run(run_name='evaluation', nested=True):

            mlflow.log_param("percentile", percentile)

            test_pred = np.asarray(model.predict(x_test))
            test_dist = np.asarray(model.transform(x_test))
            # Positional index so label positions line up with prediction positions.
            y_test = y_test.reset_index(drop=True)
            test_anomaly_indexes = y_test.loc[y_test == 1].index.tolist()

            # For each row pick the column of its assigned cluster.
            dist_to_centroid = test_dist[np.arange(len(test_pred)), test_pred]
            threshold = np.percentile(dist_to_centroid, percentile)
            print(f"Threshold: {threshold}")

            # Boolean mask instead of per-row membership tests (was O(n^2)).
            is_anomaly = dist_to_centroid > threshold
            anomaly_idx = np.flatnonzero(is_anomaly)
            y_pred = is_anomaly.astype(int)
            print(f"Detected anomalies {len(anomaly_idx)}")
            print(f"Groundtruth anomalies {len(test_anomaly_indexes)}")
            print(f"Anomalies detected matching groundtruth {len(np.intersect1d(anomaly_idx, test_anomaly_indexes))}")

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            # NOTE(review): ROC AUC is computed on the thresholded 0/1 labels,
            # not on the continuous distance score - confirm this is intended.
            roc_auc = roc_auc_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)

            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1", f1)
            mlflow.log_metric("roc_auc", roc_auc)

            # log confusion_matrix to figure and mlflow
            fig, ax = plt.subplots()
            ax.set_title('Confusion Matrix', pad=20, size=14)
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set(xlabel='Predicted', ylabel='Actual')

            mlflow.log_figure(fig, "confusion_matrix.png")
            # Release the figure so repeated calls do not accumulate open figures.
            plt.close(fig)

In [None]:
%%writefile requirements.txt
s3fs
seaborn

# Extending SageMaker Pipeline with Evaluation Step

This notebook extends our existing SageMaker pipeline by adding a crucial evaluation step for our K-Means anomaly detection model. The pipeline now consists of three main steps: preprocessing, training, and evaluation. The evaluation step takes the trained K-Means model and test data as inputs, computing distance-based anomaly scores using the specified percentile threshold. The pipeline parameters allow for flexible configuration of the evaluation criteria, including the percentile threshold for anomaly detection and the instance type for computation.

The evaluation step is integrated seamlessly with the previous steps, using the preprocessed test data and the trained model artifacts as inputs. The pipeline handles the data flow between steps, ensuring that the evaluation metrics are computed consistently and stored for later analysis. This modular approach allows for easy modification of the evaluation criteria and enables systematic comparison of different model versions. The evaluation results provide insights into the model's effectiveness at identifying anomalies and help in fine-tuning the anomaly detection thresholds for optimal performance in production scenarios.

In [None]:
# Create a SageMaker Pipeline
from steps.preprocess import preprocess_data
from steps.training_kmeans import train_kmeans
from steps.evaluation import evaluation


# Tell the SageMaker SDK to look for its user config (config.yaml) in the
# current working directory.
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

pipeline_name = "anomaly-detection-pipeline"

# Pipeline parameters: each can be overridden per execution.
training_instance_type = ParameterString(
    name="training_instance_type", default_value="ml.m5.xlarge"
)

kmeans_nclusters = ParameterInteger(
    name="kmeans_nclusters", default_value=2
)

input_data_s3_uri = ParameterString(
    name="input_data_s3_uri", default_value=data_s3_uri
)

percentile = ParameterFloat(
    name="percentile", default_value=95.0
)

# Settings every step shares: the MLflow environment and the instance type.
shared_step_settings = {
    "environment_variables": env_variables,
    "instance_type": training_instance_type,
}

processing_step = step(
    preprocess_data,
    name="Preprocess",
    job_name_prefix=f"{pipeline_name}-Preprocess",
    **shared_step_settings,
)(input_data_s3_uri)

# processing_step[4] is forwarded as the MLflow run_id so that training and
# evaluation log under the same parent run.
# NOTE(review): the meaning of the other preprocess outputs (indices 0, 1, 3)
# is defined in steps/preprocess.py - confirm they match the arguments below.
training_step = step(
    train_kmeans,
    name="Train",
    job_name_prefix=f"{pipeline_name}-Train",
    **shared_step_settings,
)(processing_step[0], kmeans_nclusters, run_id=processing_step[4])

# Positional arguments map to evaluation(x_test, y_test, model, ...).
evaluation_step = step(
    evaluation,
    name="Evaluation",
    job_name_prefix=f"{pipeline_name}-Evaluation",
    **shared_step_settings,
)(
    processing_step[1],
    processing_step[3],
    training_step[0],
    percentile=percentile,
    run_id=processing_step[4],
)

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        training_instance_type,
        kmeans_nclusters,
        input_data_s3_uri,
        percentile,
    ],
    steps=[processing_step, training_step, evaluation_step],
    pipeline_definition_config=PipelineDefinitionConfig(use_custom_job_prefix=True),
)

# Create or update the pipeline definition in SageMaker. This does not run
# the pipeline; an execution is started with pipeline.start().
pipeline.upsert(role_arn=role)

In [None]:
# Per-execution overrides for the pipeline parameters declared on the pipeline.
execution_parameters = {
    "input_data_s3_uri": data_s3_uri,
    "kmeans_nclusters": 2,
    "training_instance_type": "ml.m5.large",
    "percentile": 96,
}

# Kick off one execution; left as the last expression so the notebook shows
# the returned execution object.
pipeline.start(parameters=execution_parameters)