# Data Scientist Workflow Demo

This notebook demonstrates the typical workflow for a Data Scientist working with the MLOps SageMaker Demo project. It focuses on data exploration, labeling, and model development tasks that are accessible with the Data Scientist IAM role.

## Workflow Overview

1. **Data Exploration**: Analyze and visualize the drone imagery dataset
2. **Data Labeling**: Create and manage Ground Truth labeling jobs
3. **Model Development**: Experiment with YOLOv11 models and track experiments with MLflow

## Prerequisites

- AWS account with appropriate permissions
- AWS CLI configured with a named profile called "ab"
- SageMaker Studio access with Data Scientist role
- Access to the drone imagery dataset in S3

Let's start by importing the necessary libraries and setting up our environment.

In [None]:
import os
import boto3
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from IPython.display import display, HTML
import ipywidgets as widgets
from tqdm.notebook import tqdm

# Set up AWS session with "ab" profile
# Every AWS client in this notebook is created from this one boto3 session,
# so all calls use the same named profile.
session = boto3.Session(profile_name='ab')
# SageMaker SDK session bound to the same boto3 session.
sagemaker_session = sagemaker.Session(boto_session=session)
# NOTE(review): region_name is None when the profile has no region configured;
# the ECR image URI built in the training cell interpolates this value — confirm
# that the "ab" profile sets a region.
region = session.region_name
# Account ID of the calling identity, looked up through STS.
account_id = session.client('sts').get_caller_identity()['Account']

# Import project modules
import sys
sys.path.append('..')
from src.data.s3_utils import S3DataAccess
from src.data.data_profiler import DroneImageryProfiler
from src.data.ground_truth_utils import create_labeling_job_config, monitor_labeling_job
from src.pipeline.mlflow_integration import MLFlowSageMakerIntegration

# Set up visualization: seaborn grid theme and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# Load project configuration
# NOTE(review): this import relies on the '..' entry appended to sys.path above,
# i.e. it assumes the notebook runs one level below the project root — confirm.
from configs.project_config import PROJECT_NAME, DATA_BUCKET, ROLE_ARN

# Echo the resolved settings so a wrong profile or bucket is noticed before the
# later cells start creating AWS resources.
print(f"Project: {PROJECT_NAME}")
print(f"Data Bucket: {DATA_BUCKET}")
print(f"Region: {region}")
print(f"Account ID: {account_id}")

## 1. Data Exploration

Let's start by exploring the drone imagery dataset stored in S3.

In [None]:
# Initialize S3 data access
# Uses the shared boto3 session so S3 calls go through the "ab" profile.
s3_data_access = S3DataAccess(session=session, bucket_name=DATA_BUCKET)

# List available images
# NOTE(review): later cells slice and index raw_images and pass its items to
# get_object, so list_objects presumably returns a list of object keys — confirm.
raw_images = s3_data_access.list_objects(prefix="raw-images/")
print(f"Found {len(raw_images)} raw images in the bucket")

# Display sample images
def display_sample_images(num_samples=5):
    """Show the first ``num_samples`` raw images side by side.

    Args:
        num_samples: Maximum number of images to show. Fewer panels are drawn
            when ``raw_images`` holds fewer entries.

    Returns:
        None. The figure is rendered with ``plt.show()``; a message is printed
        instead when there is nothing to display.
    """
    sample_images = raw_images[:num_samples]
    if not sample_images:
        print("No images available to display")
        return

    # squeeze=False always yields a 2-D array of axes, so the loop below also
    # works when a single image is plotted (subplots would otherwise return a
    # bare Axes object that cannot be indexed).
    fig, axes = plt.subplots(1, len(sample_images), figsize=(20, 4), squeeze=False)

    for ax, image_key in zip(axes[0], sample_images):
        # NOTE(review): assumes get_object returns something plt.imread accepts
        # (a path or file-like object) — confirm against S3DataAccess.
        image_data = s3_data_access.get_object(image_key)
        image = plt.imread(image_data)
        ax.imshow(image)
        ax.set_title(os.path.basename(image_key))
        ax.axis('off')

    plt.tight_layout()
    plt.show()

display_sample_images()

### 1.1 Data Profiling

Let's profile the dataset to understand its characteristics.

In [None]:
# Initialize data profiler
# The profiler is handed the S3 accessor created earlier; presumably it fetches
# image bytes through it — confirm against DroneImageryProfiler.
data_profiler = DroneImageryProfiler(s3_data_access)

# Create interactive widget for number of images to profile
# The slider value is read when the "Profile Dataset" button is clicked, not
# when this cell runs.
num_images_widget = widgets.IntSlider(
    value=50,
    min=10,
    max=200,
    step=10,
    description='Number of Images:',
    # 'initial' lets the description take its natural width instead of being truncated
    style={'description_width': 'initial'}
)

display(num_images_widget)

In [None]:
# Profile the dataset
def profile_dataset():
    """Profile the first N raw images and plot the resulting distributions.

    N is the current value of ``num_images_widget``, capped at the number of
    entries actually present in ``raw_images`` so that the printed count and
    the progress bar total match the work being done.

    Returns:
        The object returned by ``data_profiler.profile_dataset`` (indexed by
        string key below), or ``None`` when there are no images to profile.
    """
    selected_images = raw_images[:num_images_widget.value]
    if not selected_images:
        print("No images available to profile")
        return None

    num_images = len(selected_images)
    print(f"Profiling {num_images} images...")

    # Profile the dataset with progress bar
    # NOTE(review): assumes the profiler invokes progress_callback once per
    # image — confirm against DroneImageryProfiler.profile_dataset.
    with tqdm(total=num_images) as pbar:
        profile_results = data_profiler.profile_dataset(
            selected_images,
            progress_callback=lambda i: pbar.update(1)
        )

    # Display profile results
    print("\nDataset Profile:")
    print(f"Total images: {profile_results['total_images']}")
    print(f"Average dimensions: {profile_results['avg_width']}x{profile_results['avg_height']}")
    print(f"Average file size: {profile_results['avg_file_size_mb']:.2f} MB")
    print(f"Image formats: {profile_results['formats']}")

    # One histogram per metric, laid out on a 2x2 grid:
    # (result key, panel title, x-axis label)
    histograms = [
        ('widths', 'Image Width Distribution', 'Width (pixels)'),
        ('heights', 'Image Height Distribution', 'Height (pixels)'),
        ('aspect_ratios', 'Aspect Ratio Distribution', 'Aspect Ratio (width/height)'),
        ('file_sizes_mb', 'File Size Distribution', 'File Size (MB)'),
    ]

    plt.figure(figsize=(15, 10))
    for position, (key, title, xlabel) in enumerate(histograms, start=1):
        plt.subplot(2, 2, position)
        plt.hist(profile_results[key], bins=20, alpha=0.7)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Count')

    plt.tight_layout()
    plt.show()

    return profile_results

# Output area that captures whatever the profiling run prints or plots
profile_output = widgets.Output()

# Button that triggers a profiling run
profile_button = widgets.Button(
    tooltip='Click to profile the dataset',
    button_style='info',
    description='Profile Dataset',
)


def on_profile_clicked(b):
    """Click handler: wipe the previous results, then profile again."""
    with profile_output:
        profile_output.clear_output()
        profile_dataset()


profile_button.on_click(on_profile_clicked)

display(profile_button, profile_output)

### 1.2 Image Visualization

Let's create an interactive image browser to explore the dataset.

In [None]:
# Create interactive image browser
def create_image_browser():
    """Display a slider-driven browser over ``raw_images``.

    Each slider position fetches the corresponding image through
    ``s3_data_access``, renders it, and prints its key, dimensions, channel
    count and dtype. The first image is shown immediately.

    Returns:
        None. A message is printed instead when ``raw_images`` is empty.
    """
    if not raw_images:
        # With no images the slider would be built with max=-1 below min=0,
        # which is not a valid range.
        print("No images available to browse")
        return

    # Create widgets
    image_index = widgets.IntSlider(
        value=0,
        min=0,
        max=len(raw_images) - 1,
        step=1,
        description='Image Index:',
        style={'description_width': 'initial'}
    )

    # Create output widget for image display
    image_output = widgets.Output()

    # Function to display image
    def display_image(index):
        with image_output:
            # wait=True defers clearing until new output arrives, which avoids flicker
            image_output.clear_output(wait=True)

            image_key = raw_images[index]
            # NOTE(review): assumes get_object returns something plt.imread
            # accepts (a path or file-like object) — confirm.
            image_data = s3_data_access.get_object(image_key)
            image = plt.imread(image_data)

            plt.figure(figsize=(10, 8))
            plt.imshow(image)
            plt.title(f"Image: {os.path.basename(image_key)}")
            plt.axis('off')
            plt.show()

            # Display image metadata
            print(f"Image Path: {image_key}")
            print(f"Dimensions: {image.shape[1]}x{image.shape[0]}")
            # A 2-D array has no channel axis, so report a single channel
            print(f"Channels: {image.shape[2] if len(image.shape) > 2 else 1}")
            print(f"Data Type: {image.dtype}")

    # Connect slider to display function by observing its value directly; the
    # slider and output are laid out by hand below, so no interact container
    # is needed.
    image_index.observe(lambda change: display_image(change['new']), names='value')

    # Display widgets
    display(image_index, image_output)

    # Initialize with first image
    display_image(0)

create_image_browser()

## 2. Data Labeling with Ground Truth

Now, let's create a Ground Truth labeling job to annotate the drone imagery.

In [None]:
# Create interactive widgets for labeling job configuration
# The default job name embeds the time this cell was run, so re-running the
# cell yields a fresh name.
job_name_widget = widgets.Text(
    value=f"drone-detection-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    description='Job Name:',
    style={'description_width': 'initial'}
)

# NOTE(review): this rebinds num_images_widget, the same global that
# profile_dataset() reads at click time. After this cell runs, the
# "Profile Dataset" button uses this slider's value (5-100) instead of the
# profiling slider's — consider giving this widget a distinct name.
num_images_widget = widgets.IntSlider(
    value=10,
    min=5,
    max=100,
    step=5,
    description='Number of Images:',
    style={'description_width': 'initial'}
)

# Workforce selection; passed through as worker_type when the job is created
worker_type_widget = widgets.Dropdown(
    options=['private', 'public'],
    value='private',
    description='Worker Type:',
    style={'description_width': 'initial'}
)

# Spending cap; passed through as max_budget_usd when the job is created
budget_widget = widgets.FloatSlider(
    value=50.0,
    min=10.0,
    max=500.0,
    step=10.0,
    description='Max Budget (USD):',
    style={'description_width': 'initial'}
)

# Create multi-select widget for labels
# Four of the six available labels are pre-selected.
label_widget = widgets.SelectMultiple(
    options=['drone', 'vehicle', 'person', 'building', 'tree', 'animal'],
    value=['drone', 'vehicle', 'person', 'building'],
    description='Labels:',
    style={'description_width': 'initial'}
)

# Create text area for instructions
instructions_widget = widgets.Textarea(
    value='Label all drones and other objects visible in the image. Draw tight bounding boxes around each object.',
    placeholder='Enter instructions for workers',
    description='Instructions:',
    style={'description_width': 'initial'},
    layout={'width': '100%', 'height': '100px'}
)

# Render all inputs; their values are read when "Create Labeling Job" is clicked
display(job_name_widget, num_images_widget, worker_type_widget, budget_widget, label_widget, instructions_widget)

In [None]:
# Create labeling job configuration
def create_labeling_job():
    """Create a Ground Truth bounding-box labeling job from the widget values.

    The request is built by ``create_labeling_job_config`` and submitted
    through a SageMaker client created from the shared boto3 session.

    Returns:
        The ARN of the newly created labeling job.

    Raises:
        ValueError: If the job name field is empty or only whitespace.
    """
    job_name = job_name_widget.value.strip()
    if not job_name:
        # Fail before any AWS call; the S3 output prefix below is derived from the name
        raise ValueError("Job name must not be empty")

    # NOTE(review): the "Number of Images" slider is not applied here — the job
    # input is the entire raw-images/ prefix. Limiting the job to a subset
    # would need support in create_labeling_job_config (e.g. a manifest of
    # selected keys); confirm the intended behavior.
    # Configure input and output paths
    input_path = f"s3://{DATA_BUCKET}/raw-images/"
    output_path = f"s3://{DATA_BUCKET}/labeled-data/{job_name}/"

    # Create labeling job configuration
    labeling_job_config = create_labeling_job_config(
        job_name=job_name,
        input_path=input_path,
        output_path=output_path,
        task_type="BoundingBox",
        worker_type=worker_type_widget.value,
        # SelectMultiple exposes its selection as a tuple; pass a plain list
        labels=list(label_widget.value),
        instructions=instructions_widget.value,
        max_budget_usd=budget_widget.value
    )

    # Create the labeling job
    sagemaker_client = session.client('sagemaker')
    response = sagemaker_client.create_labeling_job(**labeling_job_config)

    print(f"Created labeling job: {job_name}")
    print(f"Job ARN: {response['LabelingJobArn']}")

    return response['LabelingJobArn']

# Output area that captures the job creation and monitoring messages
job_output = widgets.Output()

# Button that submits the labeling job
create_job_button = widgets.Button(
    tooltip='Click to create the labeling job',
    button_style='success',
    description='Create Labeling Job',
)


def on_create_job_clicked(b):
    """Click handler: create the labeling job, then monitor it."""
    with job_output:
        job_output.clear_output()
        new_job_arn = create_labeling_job()

        # Follow the job that was just created
        print("\nMonitoring job progress...")
        monitor_labeling_job(new_job_arn, session=session)


create_job_button.on_click(on_create_job_clicked)

display(create_job_button, job_output)

### 2.1 Monitor Existing Labeling Jobs

Let's create a tool to monitor existing labeling jobs.

In [None]:
# Function to list existing labeling jobs
def list_labeling_jobs():
    """List the Ground Truth labeling jobs and display them as a table.

    Returns:
        The list of job summary dicts returned by ``ListLabelingJobs`` across
        all result pages, or an empty list when there are none.
    """
    sagemaker_client = session.client('sagemaker')

    # ListLabelingJobs is paginated; a single call returns only the first page
    paginator = sagemaker_client.get_paginator('list_labeling_jobs')
    jobs = [
        job
        for page in paginator.paginate()
        for job in page['LabelingJobSummaryList']
    ]

    if not jobs:
        print("No labeling jobs found")
        return []

    def task_type_of(job):
        # The summary has no dedicated task-type field. For built-in task types
        # the pre-annotation Lambda is named "PRE-<TaskType>", so the function
        # name at the end of its ARN is used as the best available indicator.
        function_name = job.get('PreHumanTaskLambdaArn', '').rsplit(':', 1)[-1]
        if function_name.startswith('PRE-'):
            return function_name[len('PRE-'):]
        return function_name or 'Unknown'

    # Create a DataFrame for better display
    job_data = [
        {
            'Name': job['LabelingJobName'],
            'Status': job['LabelingJobStatus'],
            'Created': job['CreationTime'].strftime('%Y-%m-%d %H:%M:%S'),
            'Task Type': task_type_of(job),
            'Total Objects': job.get('LabelCounters', {}).get('TotalLabeled', 0),
            'ARN': job['LabelingJobArn']
        }
        for job in jobs
    ]

    display(pd.DataFrame(job_data))
    return jobs

# Output area that holds the rendered job table
list_jobs_output = widgets.Output()

# Button that refreshes the job table
list_jobs_button = widgets.Button(
    tooltip='Click to list existing labeling jobs',
    button_style='info',
    description='List Labeling Jobs',
)


def on_list_jobs_clicked(b):
    """Click handler: clear the old table and list the jobs again."""
    with list_jobs_output:
        list_jobs_output.clear_output()
        list_labeling_jobs()


list_jobs_button.on_click(on_list_jobs_clicked)

display(list_jobs_button, list_jobs_output)

### 2.2 Convert Ground Truth Output to YOLOv11 Format

Once the labeling job is complete, we need to convert the output to YOLOv11 format.

In [None]:
# Convert Ground Truth output to YOLOv11 format
from src.data.ground_truth_utils import convert_ground_truth_to_yolo

# Create widget for job name input
# Starts empty; the conversion function rejects an empty value.
conversion_job_name = widgets.Text(
    value="",
    description='Job Name:',
    placeholder='Enter the completed labeling job name',
    style={'description_width': 'initial'}
)

# Create widget for class mapping
# The text is parsed with json.loads when "Convert to YOLO" is clicked. The
# default maps the four labels pre-selected in the labeling cell to ids 0-3.
class_mapping_widget = widgets.Textarea(
    value='{"drone": 0, "vehicle": 1, "person": 2, "building": 3}',
    placeholder='Enter class mapping as JSON',
    description='Class Mapping:',
    style={'description_width': 'initial'},
    layout={'width': '100%', 'height': '100px'}
)

display(conversion_job_name, class_mapping_widget)

In [None]:
def convert_labels_to_yolo():
    """Convert a labeling job's output manifest into YOLO-format annotations.

    The job name and class mapping are read from ``conversion_job_name`` and
    ``class_mapping_widget``. Problems are reported by printing a message
    rather than raising, because this runs inside a button callback.

    Returns:
        The S3 output directory on success, otherwise ``None``.
    """
    import json

    # Validate inputs; surrounding whitespace would otherwise end up in the S3 paths
    job_name = conversion_job_name.value.strip()
    if not job_name:
        print("Please enter a valid job name")
        return None

    # Parse class mapping
    try:
        class_mapping = json.loads(class_mapping_widget.value)
    except json.JSONDecodeError as e:
        print(f"Error parsing class mapping: {e}")
        return None

    # json.loads also accepts lists, numbers and strings; only a JSON object
    # can map label names to class ids.
    if not isinstance(class_mapping, dict):
        print('Error parsing class mapping: expected a JSON object such as {"drone": 0}')
        return None

    # NOTE(review): Ground Truth normally writes its manifest under
    # <output>/<job-name>/manifests/output/output.manifest — confirm that this
    # path is what convert_ground_truth_to_yolo expects.
    input_manifest = f"s3://{DATA_BUCKET}/labeled-data/{job_name}/output/manifest.json"
    output_directory = f"s3://{DATA_BUCKET}/training-data/yolo-format/{job_name}/"

    # Perform conversion
    try:
        convert_ground_truth_to_yolo(
            input_manifest=input_manifest,
            output_directory=output_directory,
            class_mapping=class_mapping,
            region_name=region
        )
    except Exception as e:
        # Deliberately broad: report any conversion failure in the output area
        print(f"Error converting labels: {e}")
        return None

    print(f"Converted annotations saved to: {output_directory}")
    return output_directory

# Output area that captures the conversion messages
conversion_output = widgets.Output()

# Button that runs the Ground Truth -> YOLO conversion
convert_button = widgets.Button(
    tooltip='Click to convert Ground Truth output to YOLOv11 format',
    button_style='warning',
    description='Convert to YOLO',
)


def on_convert_clicked(b):
    """Click handler: clear the old messages and run the conversion."""
    with conversion_output:
        conversion_output.clear_output()
        convert_labels_to_yolo()


convert_button.on_click(on_convert_clicked)

display(convert_button, conversion_output)

## 3. Model Development with MLFlow Tracking

Now, let's experiment with YOLOv11 models and track our experiments with MLflow.

In [None]:
# Initialize MLFlow integration
# The same experiment name is the default of the run-listing tool further down.
mlflow_integration = MLFlowSageMakerIntegration(experiment_name="yolov11-drone-detection")

# Create interactive widgets for training configuration
# All values are read when "Start Training" is clicked, not when this cell runs.

# S3 location of the training data; starts empty and is rejected if left empty
training_data_path = widgets.Text(
    value="",
    description='Training Data Path:',
    placeholder='s3://bucket/training-data/yolo-format/job-name/',
    style={'description_width': 'initial'}
)

# SageMaker instance type passed to the estimator
instance_type = widgets.Dropdown(
    options=['ml.g4dn.xlarge', 'ml.g4dn.2xlarge', 'ml.g5.xlarge'],
    value='ml.g4dn.xlarge',
    description='Instance Type:',
    style={'description_width': 'initial'}
)

# When checked, the estimator is created with use_spot_instances=True and a max_wait
use_spot = widgets.Checkbox(
    value=True,
    description='Use Spot Instances',
    style={'description_width': 'initial'}
)

# Forwarded to the training container as the "model_variant" hyperparameter
model_variant = widgets.Dropdown(
    options=['yolov11n', 'yolov11s', 'yolov11m', 'yolov11l', 'yolov11x'],
    value='yolov11n',
    description='Model Variant:',
    style={'description_width': 'initial'}
)

# Logarithmic slider: min/max/step are exponents of `base`, value is the actual rate
learning_rate = widgets.FloatLogSlider(
    value=0.001,
    base=10,
    min=-4,  # 10^-4 = 0.0001
    max=-2,  # 10^-2 = 0.01
    step=0.1,
    description='Learning Rate:',
    style={'description_width': 'initial'}
)

batch_size = widgets.IntSlider(
    value=16,
    min=4,
    max=64,
    step=4,
    description='Batch Size:',
    style={'description_width': 'initial'}
)

epochs = widgets.IntSlider(
    value=50,
    min=10,
    max=300,
    step=10,
    description='Epochs:',
    style={'description_width': 'initial'}
)

# Forwarded as the "image_size" hyperparameter; moves in steps of 32 from 320 to 1280
image_size = widgets.IntSlider(
    value=640,
    min=320,
    max=1280,
    step=32,
    description='Image Size:',
    style={'description_width': 'initial'}
)

display(training_data_path, instance_type, use_spot, model_variant, learning_rate, batch_size, epochs, image_size)

In [None]:
# Create function to start training job
def start_training_job():
    """Launch a SageMaker training job configured from the training widgets.

    The hyperparameters and the resulting job name are logged through
    ``mlflow_integration`` inside a single run. The job is started without
    waiting for it to finish.

    Returns:
        The training job name, or ``None`` when no training data path was entered.
    """
    from sagemaker.estimator import Estimator

    # Bail out early when the one mandatory field is missing
    s3_training_data = training_data_path.value
    if not s3_training_data:
        print("Please enter a valid training data path")
        return

    spot_enabled = use_spot.value

    # Hyperparameters forwarded to the training container
    hyperparameters = {
        "learning_rate": learning_rate.value,
        "batch_size": batch_size.value,
        "epochs": epochs.value,
        "model_variant": model_variant.value,
        "image_size": image_size.value,
        "enable_mlflow": True
    }

    with mlflow_integration.start_run():
        mlflow_integration.log_parameters(hyperparameters)

        # max_wait only applies to spot training
        if spot_enabled:
            spot_max_wait = 36000
        else:
            spot_max_wait = None

        estimator = Estimator(
            image_uri=f"{account_id}.dkr.ecr.{region}.amazonaws.com/yolov11-training:latest",
            role=ROLE_ARN,
            instance_count=1,
            instance_type=instance_type.value,
            use_spot_instances=spot_enabled,
            max_wait=spot_max_wait,
            max_run=3600,
            hyperparameters=hyperparameters,
            output_path=f"s3://{DATA_BUCKET}/model-artifacts/",
            sagemaker_session=sagemaker_session
        )

        # Timestamped name keeps successive launches distinct
        job_name = f"yolov11-training-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"

        # wait=False returns right after submission instead of blocking on the job
        estimator.fit(
            inputs={"training": s3_training_data},
            job_name=job_name,
            wait=False
        )

        # Record the job name on the MLFlow run
        mlflow_integration.log_sagemaker_job(job_name)

        print(f"Started training job: {job_name}")
        return job_name

# Output area that captures the training launch messages
training_output = widgets.Output()

# Button that launches the training job
train_button = widgets.Button(
    tooltip='Click to start the training job',
    button_style='danger',
    description='Start Training',
)


def on_train_clicked(b):
    """Click handler: clear the old messages and launch a training job."""
    with training_output:
        training_output.clear_output()
        start_training_job()


train_button.on_click(on_train_clicked)

display(train_button, training_output)

### 3.1 View MLFlow Experiments

Let's create a tool to view and compare MLflow experiments.

In [None]:
# Function to list MLFlow experiments
def list_mlflow_experiments():
    """Display the experiments known to ``mlflow_integration`` as a table.

    Returns:
        The experiments returned by ``mlflow_integration.list_experiments()``,
        or an empty list when there are none.
    """
    experiments = mlflow_integration.list_experiments()

    if not experiments:
        print("No MLFlow experiments found")
        return []

    # One table row per experiment
    rows = [
        {
            'Name': experiment.name,
            'ID': experiment.experiment_id,
            'Artifact Location': experiment.artifact_location,
            'Lifecycle Stage': experiment.lifecycle_stage
        }
        for experiment in experiments
    ]

    display(pd.DataFrame(rows))
    return experiments

# Output area that holds the rendered experiment table
list_exp_output = widgets.Output()

# Button that refreshes the experiment table
list_exp_button = widgets.Button(
    tooltip='Click to list MLFlow experiments',
    button_style='info',
    description='List Experiments',
)


def on_list_exp_clicked(b):
    """Click handler: clear the old table and list the experiments again."""
    with list_exp_output:
        list_exp_output.clear_output()
        list_mlflow_experiments()


list_exp_button.on_click(on_list_exp_clicked)

display(list_exp_button, list_exp_output)

In [None]:
# Function to list runs for an experiment
def list_experiment_runs(experiment_name="yolov11-drone-detection"):
    """Display the runs of one MLFlow experiment as a table.

    Args:
        experiment_name: Name of the experiment whose runs are listed.

    Returns:
        The runs returned by ``mlflow_integration.list_runs``, or an empty
        list when there are none.
    """
    runs = mlflow_integration.list_runs(experiment_name)

    if not runs:
        print(f"No runs found for experiment: {experiment_name}")
        return []

    def as_text(epoch_ms):
        # Run timestamps are divided by 1000 to get seconds before formatting
        return datetime.fromtimestamp(epoch_ms / 1000.0).strftime('%Y-%m-%d %H:%M:%S')

    rows = []
    for run in runs:
        info = run.info
        # A run without an end time is reported as still running
        finished = as_text(info.end_time) if info.end_time else 'Running'
        rows.append({
            'Run ID': info.run_id,
            'Status': info.status,
            'Start Time': as_text(info.start_time),
            'End Time': finished,
            'Artifact URI': info.artifact_uri
        })

    display(pd.DataFrame(rows))
    return runs

# Text box naming the experiment whose runs should be listed
experiment_name_input = widgets.Text(
    style={'description_width': 'initial'},
    description='Experiment Name:',
    value="yolov11-drone-detection",
)

# Output area that holds the rendered run table
list_runs_output = widgets.Output()

# Button that refreshes the run table
list_runs_button = widgets.Button(
    tooltip='Click to list experiment runs',
    button_style='info',
    description='List Runs',
)


def on_list_runs_clicked(b):
    """Click handler: clear the old table and list the runs of the named experiment."""
    with list_runs_output:
        list_runs_output.clear_output()
        list_experiment_runs(experiment_name_input.value)


list_runs_button.on_click(on_list_runs_clicked)

display(experiment_name_input, list_runs_button, list_runs_output)