# Using SageMaker Experiments to Train, Tune, and Deploy a Model

---
## Runtime

This notebook takes approximately 60 minutes to run.

---
## Contents
1. [Introduction](#Introduction)
2. [Setup](#Setup)
3. [Prepare the Dataset](#Prepare-the-Dataset)
4. [Create an Experiment](#Create-an-Experiment)
5. [Train the XGBoost Model](#Train-the-XGBoost-Model)
    1. [Automatic Model Tuning](#Automatic-Model-Tuning)
6. [Deploy an Endpoint for the Best Training Job](#Deploy-an-Endpoint-for-the-Best-Training-Job)
7. [Lab Cleanup](#Lab-Cleanup)
---
## Introduction

This notebook demonstrates how to use the Amazon [SageMaker Experiments Python SDK](https://sagemaker-experiments.readthedocs.io/en/latest/) together with SageMaker’s implementation of the XGBoost algorithm to train, tune, and deploy a model.

---

## Setup

In [None]:
!pip install -qU sagemaker-experiments>=0.1.24

In [7]:
import boto3
import io
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import sagemaker
import sys
import time
import zipfile

from IPython.display import display
from IPython.display import Image
from sagemaker.analytics import ExperimentAnalytics
from sagemaker.inputs import TrainingInput
from sagemaker.session import Session
from sagemaker.xgboost.estimator import XGBoost
from smexperiments.experiment import Experiment
from smexperiments.tracker import Tracker
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from time import gmtime, strftime

# Notebook-wide AWS handles and S3 locations used by every later cell.
role = sagemaker.get_execution_role()

# A single boto3 session provides both the region name and the SageMaker client.
sess = boto3.Session()
region = sess.region_name
sm = sess.client("sagemaker")

# Artifacts are stored under <default bucket>/<prefix>.
bucket = sagemaker.Session().default_bucket()
prefix = "sagemaker/lab-xgboost-experiments"

## Prepare the Dataset

In [None]:
adult_columns = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital Status",
                 "Occupation", "Relationship", "Ethnic group", "Sex", "Capital Gain", "Capital Loss",
                 "Hours per week", "Country", "Target"]

!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data -O /tmp/adult.data

!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test -O /tmp/adult.test

In [None]:
# Quick look at the raw training file.
# The file has no header row, so pass header=None together with explicit
# column names; otherwise pandas consumes the first data record as the header
# and the preview is off by one row with meaningless column labels.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 20)  # long frames are truncated when displayed

data = pd.read_csv(
    '/tmp/adult.data',
    header=None,
    names=adult_columns,
    skipinitialspace=True,  # fields are separated by ", "
)
data

In [None]:
# Parser settings shared by both splits: explicit column names, a separator
# regex that swallows whitespace around commas (requires the python engine),
# and "?" treated as a missing value.
csv_options = dict(
    names=adult_columns,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?",
)

# Rows with any missing value are discarded.
training_data = pd.read_csv("/tmp/adult.data", **csv_options).dropna()

# The first line of adult.test is skipped before parsing.
testing_data = pd.read_csv("/tmp/adult.test", skiprows=1, **csv_options).dropna()

training_data.head()

In [17]:
from sklearn import preprocessing


def number_encode_features(df, encoders=None):
    """Label-encode every object-dtype column of ``df``.

    Args:
        df: DataFrame to encode; it is copied, not modified.
        encoders: Optional mapping of column name -> already fitted
            ``LabelEncoder``. Columns found in the mapping are transformed
            with the existing encoder so that several frames share the same
            integer codes. Columns not in the mapping get a freshly fitted
            encoder (the original behaviour).

    Returns:
        A ``(encoded_df, encoders)`` tuple, where ``encoders`` maps each
        encoded column to the encoder used for it.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` when a reused
            encoder meets a value it was not fitted on.
    """
    result = df.copy()
    fitted = dict(encoders) if encoders else {}
    for column in result.columns:
        if result.dtypes[column] == object:
            values = result[column].fillna('None')
            if column not in fitted:
                fitted[column] = preprocessing.LabelEncoder().fit(values)
            result[column] = fitted[column].transform(values)
    return result, fitted


# Move the label to the first column and write a header-less CSV, which is the
# layout SageMaker XGBoost expects for CSV training data.
training_data = pd.concat([training_data['Target'], training_data.drop(['Target'], axis=1)], axis=1)
training_data, encoders = number_encode_features(training_data)
training_data.to_csv('train_data.csv', index=False, header=False)

# Labels in adult.test carry a trailing period ("<=50K."); normalise them so
# the encoders fitted on the training split apply unchanged.
testing_data = testing_data.replace({'Target': {'<=50K.': '<=50K', '>50K.': '>50K'}})

# Reuse the training encoders: fitting separate encoders on the test split
# could assign different integer codes to the same category.
testing_data, _ = number_encode_features(testing_data, encoders)
test_features = testing_data.drop(['Target'], axis = 1)
test_target = testing_data['Target']
test_features.to_csv('test_features.csv', index=False, header=False)

In [None]:
# Preview the encoded training frame; the previous cell moved "Target" to the first column.
training_data.head()

In [32]:
from sagemaker.s3 import S3Uploader
from sagemaker.inputs import TrainingInput

sagemaker_session = sagemaker.Session()
s3_data_uri = 's3://{}/{}'.format(bucket, prefix)

# The validation channel needs the same layout as the training CSV: label in
# the first column, no header. test_features.csv has the label removed, so
# using it for validation would make XGBoost read "Age" as the label. Write a
# dedicated label-first validation file from the encoded test split instead.
validation_data = pd.concat(
    [testing_data['Target'], testing_data.drop(['Target'], axis=1)], axis=1
)
validation_data.to_csv('validation_data.csv', index=False, header=False)

train_path = S3Uploader.upload('train_data.csv', s3_data_uri)
validation_path = S3Uploader.upload('validation_data.csv', s3_data_uri)

train_input = TrainingInput(train_path, content_type='text/csv')
validation_input = TrainingInput(validation_path, content_type='text/csv')

# Channel name -> input, as passed to estimator.fit() / tuner.fit().
data_inputs = {
    'train': train_input,
    'validation': validation_input
}

## Create an Experiment

In [20]:
# The Unix-timestamp suffix gives every run of this cell a distinct experiment name.
example_experiment = Experiment.create(
    description="Using SM Experiments with the Adult dataset.",
    experiment_name=f"lab-experiment-adult-{int(time.time())}",
)

In [21]:
# Create a trial, named with a Unix timestamp, inside the experiment created above.
trial_name = f"adult-xgboost-{int(time.time())}"
trial = Trial.create(
    experiment_name=example_experiment.experiment_name,
    trial_name=trial_name,
)

## Train the XGBoost Model

We've created an experiment and a trial. We will now configure the estimator, start training, and wait until the job completes. Training should take between 5 and 10 minutes for this example.

In [None]:
# Look up the XGBoost container image for the notebook's region.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='latest',
)

# Single-instance estimator; model artifacts are written under <prefix>/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
)

# Static hyperparameters for the baseline run.
baseline_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 800,
}
xgb.set_hyperparameters(**baseline_hyperparameters)

# Attach the training job to the trial so it is tracked in the experiment.
baseline_experiment_config = {
    "TrialName": trial.trial_name,
    "TrialComponentDisplayName": "AdultTrainingXGBoost",
}
xgb.fit(inputs=data_inputs, experiment_config=baseline_experiment_config)

## Automatic Model Tuning

In [None]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner


# Search space explored by the tuner.
hyperparameter_ranges = {
    'eta': ContinuousParameter(0, 1),
    'min_child_weight': ContinuousParameter(1, 10),
    'alpha': ContinuousParameter(0, 2),
    'max_depth': IntegerParameter(1, 10),
    'num_round': IntegerParameter(100, 1000)
}

# Maximise AUC on the validation channel.
objective_metric_name = 'validation:auc'
objective_type='Maximize'

tuner = HyperparameterTuner(
    estimator = xgb,
    objective_metric_name = objective_metric_name,
    hyperparameter_ranges = hyperparameter_ranges,
    objective_type = objective_type,
    max_jobs=12,
    max_parallel_jobs=4,
    early_stopping_type='Auto'
)

# Tuning job names must be unique, so a fixed name makes every rerun of this
# cell fail. Append a Unix timestamp (the result stays within the 32-character
# limit on tuning job names).
tuning_job_name = f"Adult-HPO-XGBoost-{int(time.time())}"

tuner.fit(
    inputs = data_inputs,
    job_name = tuning_job_name
)

# Block until the tuning job has finished so the following cells see all of its
# training jobs; returns immediately if fit() already waited.
tuner.wait()

In [43]:
from smexperiments.search_expression import Filter, Operator, SearchExpression
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent


hpo_job_name = tuner.latest_tuning_job.job_name

# A separate trial, named after the tuning job, groups the tuning results.
trial_name = f"{hpo_job_name}-{int(time.time())}"
trial = Trial.create(
    experiment_name=example_experiment.experiment_name,
    trial_name=trial_name,
)

# Match every trial component whose name contains the tuning job name.
name_filter = Filter('TrialComponentName', Operator.CONTAINS, hpo_job_name)
search_expression = SearchExpression(filters=[name_filter])

# Attach each match to the trial, pausing briefly between calls.
trial_component_search_results = TrialComponent.search(search_expression=search_expression)
for component in trial_component_search_results:
    trial.add_trial_component(component.trial_component_name)
    time.sleep(0.5)


In [None]:
# Collect the tuning job's trial components into a DataFrame.
# The experiment object created earlier in this notebook is `example_experiment`;
# the previously referenced `xgboost_adult_experiment` is never defined and
# raised a NameError.
trial_component_analytics = ExperimentAnalytics(
    sagemaker_session=Session(sess, sm),
    experiment_name=example_experiment.experiment_name,
    search_expression={"Filters": search_expression.filters},
)

trial_component_analytics.dataframe()

In [None]:
# `df` was never assigned anywhere in the notebook; build the frame from the
# analytics object before plotting.
hpo_results = trial_component_analytics.dataframe()

# Final objective value of each tuning training job against its num_round setting.
ax = hpo_results.plot.scatter(x='num_round', y='ObjectiveMetric - Last')
ax.set_title('Tuning objective (last value) vs. num_round');

## Deploy an Endpoint for the Best Training Job

In [None]:

# Show which training job the tuner selected as best for the objective metric.
tuner.best_training_job()

In [None]:

# Deploy the tuner's best model to a single-instance real-time endpoint.
# Delete the endpoint when you are finished with it.
tuner_predictor = tuner.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)