In [None]:
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker
from sagemaker import get_execution_role
from sagemaker.processing import ProcessingInput, ProcessingOutput
import boto3
import re

# SageMaker session plus the S3 bucket/prefix under which this demo keeps its data.
sess = sagemaker.Session()
bucket = sess.default_bucket()
prefix = "sagemaker/DEMO-xgboost-dm"

# IAM role handed to the SageMaker jobs launched from this notebook.
sagemaker_role = get_execution_role()

In [None]:
# Restore every variable previously persisted with `%store`.
# NOTE(review): `mlflow_arn`, used further down, is not defined anywhere in this
# notebook; presumably it is restored here from an earlier notebook. Confirm.
%store -r

# List what is currently held in the store.
%store

## Pre-Processing

In [None]:
# Configure the preview first: show every column, but keep the output to one page of rows.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 20)

# Load the raw CSV and let the notebook render it.
data = pd.read_csv('./bank-data/bank-additional-full.csv')
data

In [None]:
# Upload the raw CSV to S3; the returned URI is the input of the processing job.
input_source = sess.upload_data(
    './bank-data/bank-additional-full.csv',
    bucket=bucket,
    key_prefix=f'{prefix}/input_data',
)
input_source

In [None]:
# Create the directory that will hold the job scripts. `exist_ok=True` keeps this
# cell idempotent, so re-running the notebook does not fail when ./src already exists.
os.makedirs("src", exist_ok=True)

In [None]:
%%writefile ./src/preprocessing.py

import pandas as pd
import numpy as np
import argparse
import os
from sklearn.preprocessing import OrdinalEncoder

import mlflow

def _parse_args():
    """Parse the command-line options of the preprocessing script.

    Every option is a string with a default, so the script can run without any
    arguments. ``--categorical_features`` is accepted here, but the visible
    script never reads it.

    Returns:
        The ``(namespace, unrecognised_arguments)`` tuple produced by
        ``ArgumentParser.parse_known_args``; unknown flags are therefore
        tolerated instead of aborting the script.
    """
    # Option name -> default value: input location, input file, output location, column list.
    string_options = {
        '--filepath': '/opt/ml/processing/input/',
        '--filename': 'bank-additional-full.csv',
        '--outputpath': '/opt/ml/processing/output/',
        '--categorical_features': 'y, job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome',
    }

    cli = argparse.ArgumentParser()
    for flag, default_value in string_options.items():
        cli.add_argument(flag, type=str, default=default_value)

    return cli.parse_known_args()

# Columns removed from the data before it is encoded.
DROPPED_COLUMNS = ['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']


def build_features(df):
    """Turn the raw data frame into an all-numeric feature frame.

    Args:
        df: Raw data. Must contain the ``pdays`` and ``job`` columns as well as
            every column listed in ``DROPPED_COLUMNS``.

    Returns:
        A new data frame (the input is not modified) in which dots inside string
        values are replaced by underscores, two indicator columns are added,
        ``DROPPED_COLUMNS`` are removed, and the remaining non-numeric columns
        are one-hot encoded as 0/1 integers.
    """
    # Change the value . into _ and strip a trailing _ (e.g. "admin." -> "admin").
    # Only cell values are touched; column names keep their dots.
    df = df.replace(regex=r'\.', value='_')
    df = df.replace(regex=r'\_$', value='')
    # Add two new indicators
    df["no_previous_contact"] = (df["pdays"] == 999).astype(int)
    df["not_working"] = df["job"].isin(["student", "retired", "unemployed"]).astype(int)
    df = df.drop(DROPPED_COLUMNS, axis=1)
    # Encode the categorical features. The explicit integer dtype matters: newer
    # pandas versions default to booleans, which would be written to the
    # header-less CSV files as "True"/"False" instead of 1/0.
    return pd.get_dummies(df, dtype=int)


def split_dataset(df, cut_points=(0.7, 0.9), random_state=42):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Args:
        df: Data frame to split.
        cut_points: Cumulative fractions at which the shuffled rows are cut; the
            default yields the first 70%, the next 20% and the last 10%.
        random_state: Seed of the shuffle, so the split is reproducible.

    Returns:
        ``(train, validation, test)`` data frames.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    train_end, validation_end = (int(fraction * len(shuffled)) for fraction in cut_points)
    # Positional slices give the same partitions as np.split on the shuffled
    # frame, without relying on numpy's handling of DataFrames.
    return shuffled.iloc[:train_end], shuffled.iloc[train_end:validation_end], shuffled.iloc[validation_end:]


def _label_first(frame):
    """Return ``frame`` with ``y_yes`` as first column and without ``y_no``."""
    return pd.concat([frame['y_yes'], frame.drop(['y_yes', 'y_no'], axis=1)], axis=1)


if __name__ == "__main__":
    # Process arguments
    args, _ = _parse_args()

    # Both environment variables are mandatory; a missing one fails fast with KeyError.
    tracking_arn = os.environ['MLFLOW_TRACKING_ARN']
    mlflow.set_tracking_uri(tracking_arn)
    print(f"logging experiment to {tracking_arn}")
    mlflow.set_experiment(os.environ['EXPERIMENT_NAME'])

    with mlflow.start_run(run_name="Preprocessing"):

        mlflow.autolog()

        # Load data
        df = pd.read_csv(os.path.join(args.filepath, args.filename))

        mlflow.log_input(
            mlflow.data.from_pandas(df, args.filepath, targets='y'),
            context="InputData",
        )

        df = build_features(df)

        # Train, test, validation split
        train_data, validation_data, test_data = split_dataset(df)

        mlflow.log_param("train_split", 0.7)
        mlflow.log_param("validation_split", 0.2)
        mlflow.log_param("test_split", 0.1)

        mlflow.log_input(
            mlflow.data.from_pandas(train_data, args.outputpath),
            context="TrainingData",
        )
        mlflow.log_input(
            mlflow.data.from_pandas(validation_data, args.outputpath),
            context="ValidationData",
        )
        mlflow.log_input(
            mlflow.data.from_pandas(test_data, args.outputpath),
            context="TestData",
        )

        # Local store. Make sure the per-split directories exist so the script
        # also works with an --outputpath that has not been prepared beforehand.
        for split_name in ('train', 'validation', 'test'):
            os.makedirs(os.path.join(args.outputpath, split_name), exist_ok=True)

        # Train/validation files carry the label in the first column; the test
        # set is stored as separate label and feature files. No header, no index.
        _label_first(train_data).to_csv(os.path.join(args.outputpath, 'train/train.csv'), index=False, header=False)
        _label_first(validation_data).to_csv(os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=False)
        test_data['y_yes'].to_csv(os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=False)
        test_data.drop(['y_yes', 'y_no'], axis=1).to_csv(os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=False)

In [None]:
%%writefile ./src/requirements.txt
mlflow==2.17.0
sagemaker-mlflow

In [None]:
# Let's get the MLflow tracking server ARN and pass it to the processing job below
import boto3
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn import SKLearn

# S3 destinations of the three dataset splits produced by the processing job.
train_path, validation_path, test_path = (
    f"s3://{bucket}/{prefix}/{split}" for split in ("train", "validation", "test")
)

# Environment of the processing container: where to log to MLflow and under which experiment.
processing_env = {"MLFLOW_TRACKING_ARN": mlflow_arn, "EXPERIMENT_NAME": "bank-marketing"}

sklearn_processor = FrameworkProcessor(
    estimator_cls=SKLearn,
    framework_version="1.2-1",
    role=sagemaker_role,
    instance_type="ml.m5.large",
    instance_count=1,
    base_job_name='sm-immday-skprocessing',
    env=processing_env,
)

# One output per split: the container-side directory is uploaded to its S3 destination.
processing_outputs = [
    ProcessingOutput(
        output_name=f"{split}_data",
        source=f"/opt/ml/processing/output/{split}",
        destination=s3_destination,
    )
    for split, s3_destination in (
        ("train", train_path),
        ("validation", validation_path),
        ("test", test_path),
    )
]

# Run ./src/preprocessing.py against the uploaded CSV.
sklearn_processor.run(
    code='preprocessing.py',
    source_dir='./src',
    inputs=[
        ProcessingInput(
            source=input_source,
            destination="/opt/ml/processing/input",
            s3_input_mode="File",
        )
    ],
    outputs=processing_outputs,
)

## Training
Now we know that most of our features have skewed distributions, some are highly correlated with one another, and some appear to have non-linear relationships with our target variable.  Also, for targeting future prospects, good predictive accuracy is preferred to being able to explain why that prospect was targeted.  Taken together, these aspects make gradient boosted trees a good candidate algorithm.

There are several intricacies to understanding the algorithm, but at a high level, gradient boosted trees work by combining predictions from many simple models, each of which tries to address the weaknesses of the previous models.  By doing this, the collection of simple models can actually outperform large, complex models.  Other Amazon SageMaker notebooks elaborate further on gradient boosted trees and how they differ from similar algorithms.

`xgboost` is an extremely popular, open-source package for gradient boosted trees.  It is computationally powerful, fully featured, and has been successfully used in many machine learning competitions.  Let's start with a simple `xgboost` model, trained using Amazon SageMaker's managed, distributed training framework.

In [None]:
import boto3
import sagemaker
import pandas as pd

In [None]:
# Restore every variable previously persisted with `%store`, so this section
# can pick up stored values in a fresh kernel.
%store -r

# List what is currently held in the store.
%store

Then, because we're training with the CSV file format, we'll create `TrainingInput`s that our training job can use as pointers to the files in S3, and which also specify that the content type is CSV.

In [None]:
# `train_path` and `validation_path` are already complete S3 URIs, so they are
# passed through unchanged (no further string formatting is needed).
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_path, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=validation_path, content_type='csv')

First we'll need to specify training parameters to the estimator.  This includes:
1. The `xgboost` algorithm container
1. The IAM role to use
1. Training instance type and count
1. S3 location for output data
1. Algorithm hyperparameters

And then a `.fit()` function which specifies:
1. S3 location for input data.  In this case we have both a training and a validation set, which are passed in as separate channels.

In [None]:
%%writefile ./src/training.py
# Training algorithm
import os
import xgboost
import numpy as np
import pandas as pd
import mlflow
import argparse
import pickle as pkl

from sklearn.metrics import roc_auc_score,accuracy_score,precision_score,recall_score

 
def _parse_args():
    """Parse the hyperparameters and data/model locations of the training script.

    The location options default to the ``SM_*`` environment variables. When
    such a variable is absent, the matching option can still be given on the
    command line; ``--train`` and ``--validation`` then become required so a
    missing location is reported by argparse instead of failing later.

    Returns:
        The ``(namespace, unrecognised_arguments)`` tuple produced by
        ``ArgumentParser.parse_known_args``.
    """
    parser = argparse.ArgumentParser()

    # XGBoost hyperparameters
    # max_depth defaults to 6 so that omitting the flag keeps the tree depth
    # the script used before this option existed.
    parser.add_argument('--max_depth', type=int, default=6, help='Maximum depth of a tree (default: 6)')
    parser.add_argument('--eta', type=float, default=0.2, help='Learning rate (default: 0.2)')
    parser.add_argument('--gamma', type=float, default=4, help='Minimum loss reduction required to make a further partition (default: 4)')
    parser.add_argument('--min_child_weight', type=int, default=6, help='Minimum sum of instance weight needed in a child (default: 6)')
    parser.add_argument('--subsample', type=float, default=0.8, help='Subsample ratio of training instances (default: 0.8)')
    parser.add_argument('--silent', type=int, default=0, help='Logging mode - quiet means silent mode (default: 0)')
    parser.add_argument('--objective', type=str, default='binary:logistic', help='Learning task objective (default: binary:logistic)')
    parser.add_argument('--num_round', type=int, default=100, help='Number of boosting rounds/trees (default: 100)')

    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'),
                        required='SM_CHANNEL_TRAIN' not in os.environ)
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'),
                        required='SM_CHANNEL_VALIDATION' not in os.environ)

    return parser.parse_known_args()

if __name__ == "__main__":
    # Process arguments
    args, _ = _parse_args()

    # The files carry no header row.
    train_data = pd.read_csv(os.path.join(args.train, 'train.csv'), header=None)
    validation_data = pd.read_csv(os.path.join(args.validation, 'validation.csv'), header=None)

    # labels are in the first column
    train_y = train_data.iloc[:, 0]
    train_X = train_data.iloc[:, 1:]
    validation_y = validation_data.iloc[:, 0]
    validation_X = validation_data.iloc[:, 1:]

    # Both environment variables are mandatory; a missing one fails fast with KeyError.
    tracking_arn = os.environ['MLFLOW_TRACKING_ARN']
    mlflow.set_tracking_uri(tracking_arn)
    print(f"logging experiment to {tracking_arn}")
    mlflow.set_experiment(os.environ['EXPERIMENT_NAME'])

    with mlflow.start_run(run_name="Training"):
        mlflow.autolog()

        # Creating DMatrix(es)
        dtrain = xgboost.DMatrix(train_X, label=train_y)
        dval = xgboost.DMatrix(validation_X, label=validation_y)
        watchlist = [(dtrain, "train"), (dval, "validation")]

        # Only genuine booster parameters go in here: the number of rounds is
        # passed to xgboost.train() directly, and the legacy `silent` switch is
        # translated to `verbosity` (0 = silent, 1 = warnings).
        booster_params = {
            "max_depth": args.max_depth,
            "eta": args.eta,
            "gamma": args.gamma,
            "min_child_weight": args.min_child_weight,
            "subsample": args.subsample,
            "verbosity": 0 if args.silent else 1,
            "objective": str(args.objective),
        }

        booster = xgboost.train(
            params=booster_params,
            dtrain=dtrain,
            evals=watchlist,
            num_boost_round=args.num_round)

        # Scores on the validation set; rounding turns them into 0/1 class predictions.
        predictions = booster.predict(dval)
        rounded_predict = np.round(predictions)

        print(pd.crosstab(index=validation_y, columns=rounded_predict,
                          rownames=['Actuals'], colnames=['Predictions'], margins=True))

        val_accuracy = accuracy_score(validation_y, rounded_predict)
        val_precision = precision_score(validation_y, rounded_predict)
        val_recall = recall_score(validation_y, rounded_predict)

        print("Accuracy Model A: %.2f%%" % (val_accuracy * 100.0))
        print("Precision Model A: %.2f" % (val_precision))
        print("Recall Model A: %.2f" % (val_recall))

        # Log additional metrics, next to the default ones logged automatically
        mlflow.log_metric("Accuracy Model A", val_accuracy * 100.0)
        mlflow.log_metric("Precision Model A", val_precision)
        mlflow.log_metric("Recall Model A", val_recall)

        # AUC is computed on the raw scores, not on the rounded classes.
        val_auc = roc_auc_score(validation_y, predictions)

        print("Validation AUC A: %.2f" % (val_auc))
        mlflow.log_metric("Validation AUC A", val_auc)

        # Pickle the booster into the model directory. The directory comes from
        # --model-dir / SM_MODEL_DIR and falls back to /opt/ml/model.
        model_file_path = os.path.join(args.model_dir, "xgboost-model")
        os.makedirs(args.model_dir, exist_ok=True)

        # The context manager guarantees the file is flushed and closed.
        with open(model_file_path, 'wb') as model_file:
            pkl.dump(booster, model_file)
    

In [None]:
%%writefile ./src/requirements.txt
mlflow==2.17.0
sagemaker-mlflow

In [None]:
from sagemaker.xgboost.estimator import XGBoost

# Hyperparameters forwarded to ./src/training.py as command-line arguments.
training_hyperparameters = {
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "silent": 0,
    "num_round": 100,
}

# Estimator that runs the training script on a single ml.m5.xlarge instance and
# passes the MLflow settings to the container through environment variables.
xgb = XGBoost(
    entry_point='training.py',
    source_dir='./src',
    framework_version='1.7-1',
    role=sagemaker_role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    environment={"MLFLOW_TRACKING_ARN": mlflow_arn, "EXPERIMENT_NAME": "bank-marketing"},
    keep_alive_period_in_seconds=3600,
)
xgb.set_hyperparameters(**training_hyperparameters)

# Start the training job with the train and validation channels.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [None]:
# Keep the estimator's output location and the name of its latest training job.
model_output_path = xgb.output_path
training_job_name = xgb.latest_training_job.name

# Persist both values with %store so they can be restored later with `%store -r`.
%store model_output_path
%store training_job_name