# End-to-End MLOps Workflow with SageMaker

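This notebook demonstrates the complete MLOps workflow for training and deploying a YOLOv11 object detection model on drone imagery using Amazon SageMaker. The workflow is modeled on the six-stage process outlined in the [AWS SageMaker MLOps reference implementation](https://github.com/aws-samples/amazon-sagemaker-from-idea-to-production), listed below. Note that the notebook's own "Step 1" to "Step 6" sections are organized around the data and model lifecycle (exploration, labeling, preprocessing, training, pipeline, monitoring) and do not map one-to-one onto these stages: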

1. **Experiment in a notebook** - Initial development and experimentation
2. **Scale with SageMaker processing jobs and Python SDK** - Moving computation to SageMaker
3. **Operationalize with ML pipeline and model registry** - Building automation
4. **Add a model building CI/CD pipeline** - Automating the model building process
5. **Add a model deployment pipeline** - Automating model deployment
6. **Add model and data monitoring** - Ensuring ongoing quality

## Prerequisites

- AWS account with appropriate permissions
- AWS CLI configured with a named profile called "ab" (the setup cell below creates its boto3 session from this profile)
- SageMaker Studio access
- IAM roles for SageMaker execution

## Setup

Let's start by importing the necessary libraries and setting up our environment.

In [None]:
import os
import boto3
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from IPython.display import display, HTML
import ipywidgets as widgets
from tqdm.notebook import tqdm

# Set up AWS session with "ab" profile
# Every AWS client used in this notebook is derived from this one session, so
# the "ab" profile must exist in the local AWS configuration.
session = boto3.Session(profile_name='ab')
sagemaker_session = sagemaker.Session(boto_session=session)
# region and account_id are used later to build the ECR URI of the training image.
# NOTE(review): region will be None if the profile has no default region — confirm.
region = session.region_name
account_id = session.client('sts').get_caller_identity()['Account']

# Import project modules
import sys
sys.path.append('..')
from src.data.s3_utils import S3DataAccess
from src.data.data_profiler import DroneImageryProfiler
from src.data.ground_truth_utils import create_labeling_job_config, monitor_labeling_job
from src.pipeline.mlflow_integration import MLFlowSageMakerIntegration
from src.pipeline.sagemaker_pipeline import create_training_pipeline
from src.pipeline.model_monitor import setup_model_monitoring

# Set up visualization
# Notebook-wide plotting defaults: seaborn grid style and a wide default figure.
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# Load project configuration
# DATA_BUCKET and ROLE_ARN are read by the later cells of this notebook.
# NOTE(review): presumably resolved through the sys.path.append('..') above — confirm.
from configs.project_config import PROJECT_NAME, DATA_BUCKET, ROLE_ARN

# Echo the resolved environment so a wrong profile or bucket is noticed early.
print(f"Project: {PROJECT_NAME}")
print(f"Data Bucket: {DATA_BUCKET}")
print(f"Region: {region}")
print(f"Account ID: {account_id}")

## Step 1: Data Exploration and Profiling

Let's start by exploring the drone imagery dataset stored in S3.

In [None]:
# Initialize S3 data access
# Project helper bound to the shared session and the configured data bucket.
s3_data_access = S3DataAccess(session=session, bucket_name=DATA_BUCKET)

# List available images
# raw_images is sliced and iterated by later cells.
# NOTE(review): presumably a list of S3 object keys, and every object under the
# prefix is counted as an image regardless of its type — confirm.
raw_images = s3_data_access.list_objects(prefix="raw-images/")
print(f"Found {len(raw_images)} raw images in the bucket")

# Display sample images
def display_sample_images(num_samples=5):
    """Show the first ``num_samples`` raw images side by side.

    The grid is sized to the number of images actually available, so a bucket
    holding fewer than ``num_samples`` objects does not leave empty frames.

    Args:
        num_samples: Maximum number of images to display.
    """
    sample_images = raw_images[:num_samples]
    if not sample_images:
        # plt.subplots cannot build a grid with zero columns.
        print("No images available to display")
        return

    # squeeze=False always returns a 2-D array of axes; without it a single
    # column yields a bare Axes object that cannot be indexed.
    _, axes = plt.subplots(1, len(sample_images), figsize=(20, 4), squeeze=False)

    for ax, image_key in zip(axes[0], sample_images):
        image_data = s3_data_access.get_object(image_key)
        image = plt.imread(image_data)
        ax.imshow(image)
        ax.set_title(os.path.basename(image_key))
        ax.axis('off')

    plt.tight_layout()
    plt.show()

display_sample_images()

Now, let's profile the dataset to understand its characteristics.

In [None]:
# Initialize data profiler
# The profiler reads images through the shared S3 helper.
data_profiler = DroneImageryProfiler(s3_data_access)

# Profile the dataset
profile_results = data_profiler.profile_dataset(raw_images[:100])  # Profile first 100 images

# Display profile results
# profile_results is indexed by the keys below.
# NOTE(review): assumes 'avg_file_size_mb' is numeric (it is formatted with :.2f)
# and that 'widths' / 'heights' are per-image sequences — confirm against the profiler.
print("Dataset Profile:")
print(f"Total images: {profile_results['total_images']}")
print(f"Average dimensions: {profile_results['avg_width']}x{profile_results['avg_height']}")
print(f"Average file size: {profile_results['avg_file_size_mb']:.2f} MB")
print(f"Image formats: {profile_results['formats']}")

# Plot image dimension distribution
# Two side-by-side histograms: widths on the left, heights on the right.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.hist(profile_results['widths'], bins=20, alpha=0.7)
plt.title('Image Width Distribution')
plt.xlabel('Width (pixels)')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
plt.hist(profile_results['heights'], bins=20, alpha=0.7)
plt.title('Image Height Distribution')
plt.xlabel('Height (pixels)')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

## Step 2: Data Labeling with Ground Truth

Now, let's create a Ground Truth labeling job to annotate the drone imagery.

In [None]:
# Create interactive widgets for labeling job configuration
# The default job name is timestamped once, when this cell runs; re-run the
# cell (or edit the field) to get a fresh name for another job.
job_name_widget = widgets.Text(
    value=f"drone-detection-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    description='Job Name:',
    style={'description_width': 'initial'}
)

# NOTE(review): create_labeling_job does not currently restrict the job to this
# many images; the job input is the whole raw-images/ prefix.
num_images_widget = widgets.IntSlider(
    value=10,
    min=5,
    max=100,
    step=5,
    description='Number of Images:',
    style={'description_width': 'initial'}
)

worker_type_widget = widgets.Dropdown(
    options=['private', 'public'],
    value='private',
    description='Worker Type:',
    style={'description_width': 'initial'}
)

# Passed to create_labeling_job_config as max_budget_usd.
budget_widget = widgets.FloatSlider(
    value=50.0,
    min=10.0,
    max=500.0,
    step=10.0,
    description='Max Budget (USD):',
    style={'description_width': 'initial'}
)

display(job_name_widget, num_images_widget, worker_type_widget, budget_widget)

In [None]:
# Create labeling job configuration
def create_labeling_job():
    """Create a Ground Truth bounding-box labeling job from the widget values.

    Reads the job name, worker type and budget from the configuration widgets,
    builds the request with ``create_labeling_job_config`` and submits it
    through the SageMaker client of the shared session.

    Returns:
        The ARN of the created labeling job.
    """
    job_name = job_name_widget.value
    worker_type = worker_type_widget.value
    max_budget = budget_widget.value

    # NOTE(review): the "Number of Images" slider is not applied. The job input
    # is the whole raw-images/ prefix; honouring the slider would need a
    # manifest restricted to the selected keys — confirm what
    # create_labeling_job_config supports before wiring it in.

    # Configure input and output paths
    input_path = f"s3://{DATA_BUCKET}/raw-images/"
    output_path = f"s3://{DATA_BUCKET}/labeled-data/{job_name}/"

    # Create labeling job configuration
    labeling_job_config = create_labeling_job_config(
        job_name=job_name,
        input_path=input_path,
        output_path=output_path,
        task_type="BoundingBox",
        worker_type=worker_type,
        labels=["drone", "vehicle", "person", "building"],
        instructions="Label all drones and other objects visible in the image.",
        max_budget_usd=max_budget
    )

    # Create the labeling job
    sagemaker_client = session.client('sagemaker')
    response = sagemaker_client.create_labeling_job(**labeling_job_config)

    print(f"Created labeling job: {job_name}")
    print(f"Job ARN: {response['LabelingJobArn']}")

    return response['LabelingJobArn']

# Create button to start labeling job
create_job_button = widgets.Button(
    description='Create Labeling Job',
    button_style='success',
    tooltip='Click to create the labeling job'
)

# Output area that receives everything printed by the click handler.
job_output = widgets.Output()

def on_create_job_clicked(b):
    """Create the labeling job, then monitor it, printing into ``job_output``.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    with job_output:
        job_arn = create_labeling_job()
        
        # Monitor job progress
        # NOTE(review): presumably blocks until the job finishes — confirm.
        print("\nMonitoring job progress...")
        monitor_labeling_job(job_arn, session=session)

create_job_button.on_click(on_create_job_clicked)

display(create_job_button, job_output)

## Step 3: Data Preprocessing for YOLOv11

Once the labeling job is complete, we need to preprocess the data for YOLOv11 training.

In [None]:
# Convert Ground Truth output to YOLOv11 format
from src.data.ground_truth_utils import convert_ground_truth_to_yolo

def convert_labels_to_yolo(job_name):
    """Convert a labeling job's output manifest into YOLO-format annotations.

    Args:
        job_name: Name of the labeling job whose output should be converted.

    Returns:
        The S3 URI prefix the converted annotations were written to.
    """
    # NOTE(review): this manifest location is hard-coded. Ground Truth normally
    # writes its output manifest under
    # <output-path>/<job-name>/manifests/output/output.manifest — confirm that
    # this path matches what the labeling job actually produces.
    manifest_uri = f"s3://{DATA_BUCKET}/labeled-data/{job_name}/output/manifest.json"
    output_directory = f"s3://{DATA_BUCKET}/training-data/yolo-format/{job_name}/"

    # Class ids follow the order of the label names.
    label_names = ("drone", "vehicle", "person", "building")
    label_ids = {label: index for index, label in enumerate(label_names)}

    convert_ground_truth_to_yolo(
        input_manifest=manifest_uri,
        output_directory=output_directory,
        class_mapping=label_ids,
        session=session,
    )

    print(f"Converted annotations saved to: {output_directory}")
    return output_directory

# Create widget for job name input
# Name of the labeling job to convert; read by on_convert_clicked.
conversion_job_name = widgets.Text(
    value="",
    description='Job Name:',
    placeholder='Enter the completed labeling job name',
    style={'description_width': 'initial'}
)

convert_button = widgets.Button(
    description='Convert to YOLO',
    button_style='info',
    tooltip='Click to convert Ground Truth output to YOLOv11 format'
)

# Output area that receives everything printed by the click handler.
conversion_output = widgets.Output()

def on_convert_clicked(b):
    """Run the YOLO conversion for the job name typed into the text box.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    Output is printed into ``conversion_output``.
    """
    with conversion_output:
        # Strip stray whitespace so a blank-looking entry is rejected instead of
        # being embedded in the S3 paths.
        job_name = conversion_job_name.value.strip()
        if not job_name:
            print("Please enter a valid job name")
            return

        convert_labels_to_yolo(job_name)

convert_button.on_click(on_convert_clicked)

display(conversion_job_name, convert_button, conversion_output)

## Step 4: Model Training with MLFlow Tracking

Now, let's train a YOLOv11 model using SageMaker Training Jobs with MLFlow tracking.

In [None]:
# Initialize MLFlow integration
# Project wrapper used by start_training_job to open a run and log the
# hyperparameters and the SageMaker job name.
mlflow_integration = MLFlowSageMakerIntegration(experiment_name="yolov11-drone-detection")

# Create interactive widgets for training configuration
# Passed to estimator.fit as the "training" input channel.
training_data_path = widgets.Text(
    value="",
    description='Training Data Path:',
    placeholder='s3://bucket/training-data/yolo-format/job-name/',
    style={'description_width': 'initial'}
)

instance_type = widgets.Dropdown(
    options=['ml.g4dn.xlarge', 'ml.g4dn.2xlarge', 'ml.g5.xlarge'],
    value='ml.g4dn.xlarge',
    description='Instance Type:',
    style={'description_width': 'initial'}
)

# When checked, the estimator is created with use_spot_instances=True and a max_wait.
use_spot = widgets.Checkbox(
    value=True,
    description='Use Spot Instances',
    style={'description_width': 'initial'}
)

# Logarithmic slider: min and max are exponents of `base`, value is the actual rate.
learning_rate = widgets.FloatLogSlider(
    value=0.001,
    base=10,
    min=-4,  # 10^-4 = 0.0001
    max=-2,  # 10^-2 = 0.01
    step=0.1,
    description='Learning Rate:',
    style={'description_width': 'initial'}
)

batch_size = widgets.IntSlider(
    value=16,
    min=4,
    max=64,
    step=4,
    description='Batch Size:',
    style={'description_width': 'initial'}
)

epochs = widgets.IntSlider(
    value=50,
    min=10,
    max=300,
    step=10,
    description='Epochs:',
    style={'description_width': 'initial'}
)

display(training_data_path, instance_type, use_spot, learning_rate, batch_size, epochs)

In [None]:
# Create function to start training job
def start_training_job():
    """Launch a SageMaker training job from the current widget values.

    Validates the training data path, then, inside an MLFlow run, logs the
    hyperparameters, builds an ``Estimator`` for the custom training image and
    starts the job without waiting for it to finish.

    Returns:
        The training job name, or ``None`` when the data path is missing or is
        not an S3 URI.
    """
    from sagemaker.estimator import Estimator

    # Get widget values
    data_path = training_data_path.value.strip()
    instance = instance_type.value
    spot = use_spot.value
    lr = learning_rate.value
    bs = batch_size.value
    num_epochs = epochs.value

    # Validate inputs before opening an MLFlow run, so a bad path does not
    # leave an empty run behind.
    if not data_path:
        print("Please enter a valid training data path")
        return
    if not data_path.startswith("s3://"):
        # Every instance type offered by the dropdown is a remote ml.* instance,
        # which reads its input channel from S3.
        print("Training data path must be an S3 URI (s3://bucket/prefix/)")
        return

    # Configure hyperparameters
    hyperparameters = {
        "learning_rate": lr,
        "batch_size": bs,
        "epochs": num_epochs,
        "model_variant": "yolov11n",  # Use the nano variant for faster training
        "image_size": 640,
        "enable_mlflow": True
    }

    # Start MLFlow run
    with mlflow_integration.start_run():
        # Log parameters
        mlflow_integration.log_parameters(hyperparameters)

        # Create SageMaker estimator
        estimator = Estimator(
            image_uri=f"{account_id}.dkr.ecr.{region}.amazonaws.com/yolov11-training:latest",
            role=ROLE_ARN,
            instance_count=1,
            instance_type=instance,
            use_spot_instances=spot,
            # max_wait only applies to spot training and must not be smaller
            # than max_run.
            max_wait=36000 if spot else None,
            max_run=3600,
            hyperparameters=hyperparameters,
            output_path=f"s3://{DATA_BUCKET}/model-artifacts/",
            sagemaker_session=sagemaker_session
        )

        # Start training job
        job_name = f"yolov11-training-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
        estimator.fit(
            inputs={"training": data_path},
            job_name=job_name,
            wait=False
        )

        # Log SageMaker job
        mlflow_integration.log_sagemaker_job(job_name)

        print(f"Started training job: {job_name}")
        return job_name

# Create button to start training job
train_button = widgets.Button(
    description='Start Training',
    button_style='warning',
    tooltip='Click to start the training job'
)

# Output area that receives everything printed by the click handler.
training_output = widgets.Output()

def on_train_clicked(b):
    """Start a training job, printing into ``training_output``.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    with training_output:
        # The returned job name is not used further here.
        job_name = start_training_job()

train_button.on_click(on_train_clicked)

display(train_button, training_output)

## Step 5: Create and Execute SageMaker Pipeline

Now, let's create a SageMaker Pipeline to automate the entire workflow.

In [None]:
# Create interactive widgets for pipeline configuration
# The default name carries the date only and is computed once, when this cell runs.
# This widget is also read by the cleanup cell to decide which pipeline to delete.
pipeline_name = widgets.Text(
    value=f"yolov11-pipeline-{datetime.now().strftime('%Y-%m-%d')}",
    description='Pipeline Name:',
    style={'description_width': 'initial'}
)

preprocessing_instance = widgets.Dropdown(
    options=['ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge'],
    value='ml.m5.large',
    description='Preprocessing Instance:',
    style={'description_width': 'initial'}
)

training_instance = widgets.Dropdown(
    options=['ml.g4dn.xlarge', 'ml.g4dn.2xlarge', 'ml.g5.xlarge'],
    value='ml.g4dn.xlarge',
    description='Training Instance:',
    style={'description_width': 'initial'}
)

# Passed to create_training_pipeline as data_path.
pipeline_data_path = widgets.Text(
    value="",
    description='Data Path:',
    placeholder='s3://bucket/training-data/yolo-format/job-name/',
    style={'description_width': 'initial'}
)

display(pipeline_name, preprocessing_instance, training_instance, pipeline_data_path)

In [None]:
# Create function to create and execute pipeline
def create_and_execute_pipeline():
    """Build, register and start the training pipeline from the widget values.

    Returns:
        The ARN of the started pipeline execution, or ``None`` when the
        pipeline name or data path is empty.
    """
    name = pipeline_name.value
    data_path = pipeline_data_path.value
    processing_type = preprocessing_instance.value
    training_type = training_instance.value

    # Both fields are required; stop before building anything.
    if not (name and data_path):
        print("Please enter valid pipeline name and data path")
        return

    # Fixed hyperparameters for pipeline runs (not taken from the training widgets).
    pipeline_hyperparameters = {
        "learning_rate": 0.001,
        "batch_size": 16,
        "epochs": 50,
        "model_variant": "yolov11n",
        "image_size": 640,
        "enable_mlflow": True,
    }

    pipeline = create_training_pipeline(
        pipeline_name=name,
        role_arn=ROLE_ARN,
        preprocessing_instance_type=processing_type,
        training_instance_type=training_type,
        data_path=data_path,
        hyperparameters=pipeline_hyperparameters,
        session=session,
    )

    # Create or update the pipeline definition, then launch a run.
    pipeline.upsert(role_arn=ROLE_ARN)
    print(f"Created and registered pipeline: {name}")

    execution = pipeline.start()
    print(f"Started pipeline execution: {execution.arn}")

    return execution.arn

# Create button to create and execute pipeline
pipeline_button = widgets.Button(
    description='Create & Execute Pipeline',
    button_style='danger',
    tooltip='Click to create and execute the pipeline'
)

# Output area that receives everything printed by the click handler.
pipeline_output = widgets.Output()

def on_pipeline_clicked(b):
    """Create and start the pipeline, printing into ``pipeline_output``.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    with pipeline_output:
        # The returned execution ARN is not used further here.
        execution_arn = create_and_execute_pipeline()

pipeline_button.on_click(on_pipeline_clicked)

display(pipeline_button, pipeline_output)

## Step 6: Model Monitoring and Drift Detection

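Finally, let's set up model monitoring for a deployed endpoint. This notebook does not deploy an endpoint itself, so this step assumes one already exists; enter its name below.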

In [None]:
# Create interactive widgets for monitoring configuration
# endpoint_name and monitoring_schedule are also read by the cleanup cell to
# decide what to delete.
endpoint_name = widgets.Text(
    value="",
    description='Endpoint Name:',
    placeholder='Enter the deployed endpoint name',
    style={'description_width': 'initial'}
)

# Passed to setup_model_monitoring as baseline_dataset.
baseline_path = widgets.Text(
    value="",
    description='Baseline Data:',
    placeholder='s3://bucket/baseline-data/',
    style={'description_width': 'initial'}
)

monitoring_schedule = widgets.Text(
    value="",
    description='Schedule Name:',
    placeholder='Enter a name for the monitoring schedule',
    style={'description_width': 'initial'}
)

display(endpoint_name, baseline_path, monitoring_schedule)

In [None]:
# Create function to set up model monitoring
def setup_monitoring():
    """Configure model monitoring for the endpoint named in the widgets.

    Prints a prompt and returns without doing anything when the endpoint name,
    baseline data path or schedule name is empty.
    """
    endpoint = endpoint_name.value
    schedule = monitoring_schedule.value
    baseline_uri = baseline_path.value

    # All three fields are mandatory; stop before calling the helper.
    if not all((endpoint, baseline_uri, schedule)):
        print("Please enter valid endpoint name, baseline data path, and schedule name")
        return

    monitoring_args = {
        "endpoint_name": endpoint,
        "baseline_dataset": baseline_uri,
        "monitoring_schedule_name": schedule,
        "session": session,
    }
    setup_model_monitoring(**monitoring_args)

    print(f"Set up model monitoring for endpoint: {endpoint}")
    print(f"Monitoring schedule: {schedule}")

# Create button to set up monitoring
monitoring_button = widgets.Button(
    description='Setup Monitoring',
    button_style='primary',
    tooltip='Click to set up model monitoring'
)

# Output area that receives everything printed by the click handler.
monitoring_output = widgets.Output()

def on_monitoring_clicked(b):
    """Run the monitoring setup, printing into ``monitoring_output``.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    with monitoring_output:
        setup_monitoring()

monitoring_button.on_click(on_monitoring_clicked)

display(monitoring_button, monitoring_output)

## Conclusion

In this notebook, we've demonstrated the complete MLOps workflow for training and deploying a YOLOv11 object detection model on drone imagery using Amazon SageMaker. The workflow includes:

1. Data exploration and profiling
2. Data labeling with Ground Truth
3. Data preprocessing for YOLOv11
4. Model training with MLFlow tracking
5. Pipeline orchestration with SageMaker Pipelines
6. Model monitoring and drift detection

This end-to-end workflow showcases the MLOps capabilities of Amazon SageMaker for computer vision applications.

## Cleanup

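To avoid ongoing costs, make sure to clean up the resources created in this notebook. The cell below deletes the monitoring schedule, endpoint, and pipeline named in the widgets above; S3 data, model artifacts, and labeling or training jobs are not removed and must be cleaned up separately.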

In [None]:
# Create function to clean up resources
def cleanup_resources():
    """Delete the monitoring schedule, endpoint and pipeline named in the widgets.

    Each resource is handled independently: a blank field is skipped, and a
    failed deletion is reported without stopping the remaining ones. The final
    message states whether every attempted deletion succeeded.
    """
    sagemaker_client = session.client('sagemaker')

    schedule = monitoring_schedule.value.strip()
    endpoint = endpoint_name.value.strip()
    pipeline = pipeline_name.value.strip()

    # (label, resource name, delete call). The schedule comes first so it is
    # gone before the endpoint it monitors is deleted.
    steps = [
        (
            "monitoring schedule",
            schedule,
            lambda: sagemaker_client.delete_monitoring_schedule(
                MonitoringScheduleName=schedule
            ),
        ),
        (
            "endpoint",
            endpoint,
            lambda: sagemaker_client.delete_endpoint(EndpointName=endpoint),
        ),
        (
            "pipeline",
            pipeline,
            lambda: sagemaker_client.delete_pipeline(PipelineName=pipeline),
        ),
    ]

    failures = 0
    for label, name, delete in steps:
        if not name:
            # Nothing was entered for this resource, so there is nothing to delete.
            continue
        try:
            delete()
        except Exception as e:
            # Deliberate best-effort: report the failure and keep cleaning up.
            failures += 1
            print(f"Error deleting {label}: {e}")
        else:
            print(f"Deleted {label}: {name}")

    if failures:
        print(f"Cleanup finished with {failures} error(s)")
    else:
        print("Cleanup complete")

# Create button to clean up resources
cleanup_button = widgets.Button(
    description='Cleanup Resources',
    button_style='danger',
    tooltip='Click to clean up all resources'
)

# Output area that receives everything printed by the click handler.
cleanup_output = widgets.Output()

def on_cleanup_clicked(b):
    """Run the resource cleanup, printing into ``cleanup_output``.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    with cleanup_output:
        cleanup_resources()

cleanup_button.on_click(on_cleanup_clicked)

display(cleanup_button, cleanup_output)