## Using @remote Decorator and ModelBuilder

In [None]:
# Restore every variable previously persisted with %store into this kernel.
# Later cells reference train_path, validation_path and tracking_server_arn,
# which are not defined in this section -- presumably they are restored here
# (confirm against the earlier notebooks in this series).
%store -r

# With no arguments, %store lists the variables currently held in the store.
%store

In [None]:
import pandas as pd

In [None]:
!aws s3 cp $train_path/train.csv /tmp/train.csv
!aws s3 cp $validation_path/validation.csv /tmp/val.csv

train = pd.read_csv('/tmp/train.csv')
val = pd.read_csv('/tmp/val.csv')

train_x = train.iloc[:, 1:]
train_y = train.iloc[:, 0]

val_x = val.iloc[:, 1:]
val_y = val.iloc[:, 0]

In this section, you will use XGBoost to train a binary classification model (using the `binary:logistic` objective) on the preprocessed data generated in the previous step. Again, you will use a standard Python function that accepts some of the XGBoost hyperparameters as input and returns the model.

The following cell annotates the training function with the `@remote` decorator to run the Python function as a SageMaker job without requiring any other modifications to the function code. Feel free to comment out the `@remote` decorator in the cells below to seamlessly move from running the function remotely via SageMaker Training to local execution. If you comment out the decorator to run the function locally, you will need to run this command in the terminal to give permission to the output directory where the function will save the models: `sudo chmod -R 777 /opt/ml/model`. You don't need to run this command if you leave the `@remote` decorator in, since the `config.yaml` file runs that command before executing the training job.

Running the training function will initiate a SageMaker training job when the function is decorated with the `@remote` decorator; with the decorator commented out, the function runs locally in the notebook kernel instead.

In [None]:
!sudo mkdir /opt/ml/model
!sudo chmod -R 777 /opt/ml/model

In [None]:
import mlflow
import os
import pickle as pkl
import xgboost
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
import xgboost
from sagemaker.remote_function import remote


# Uncomment the decorator below to run train() as a SageMaker training job.
#@remote(instance_type='ml.m5.xlarge', job_name_prefix="decorator-train")
def train(X_train, y_train, X_val, y_val, experiment_name, tracking_server_arn,
          eta=0.15,
          max_depth=3,
          gamma=0.0,
          min_child_weight=1,
          verbosity=0,
          objective='binary:logistic',
          eval_metric='auc',
          num_boost_round=50,
          model_dir='/opt/ml/model'):
    """Train an XGBoost model, log the run to MLflow, and save the model to disk.

    Args:
        X_train: Training features; must expose ``.shape`` and ``.values``
            (e.g. a pandas DataFrame).
        y_train: Training labels; must expose ``.shape``.
        X_val: Validation features, same requirements as ``X_train``.
        y_val: Validation labels, same requirements as ``y_train``.
        experiment_name: Name passed to ``mlflow.set_experiment``.
        tracking_server_arn: Value passed to ``mlflow.set_tracking_uri``.
        eta, max_depth, gamma, min_child_weight, verbosity, objective,
            eval_metric: Forwarded unchanged to XGBoost as training params.
        num_boost_round: Number of boosting rounds.
        model_dir: Directory the model file ``xgboost-model`` is written to.
            It is created if missing. The default matches the previously
            hard-coded location.

    Returns:
        The object returned by ``xgboost.train`` (the trained booster).
    """
    # Function-scope import: the notebook's import cell does not bring in
    # roc_auc_score, so it is imported here where it is used.
    from sklearn.metrics import roc_auc_score

    print('Train features shape: {}'.format(X_train.shape))
    print('Train labels shape: {}'.format(y_train.shape))
    print('Validation features shape: {}'.format(X_val.shape))
    print('Validation labels shape: {}'.format(y_val.shape))

    mlflow.set_tracking_uri(tracking_server_arn)
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name="Training") as run:
        mlflow.autolog()

        # Creating DMatrix(es)
        dtrain = xgboost.DMatrix(X_train.values, label=y_train)
        dval = xgboost.DMatrix(X_val.values, label=y_val)
        watchlist = [(dtrain, "train"), (dval, "validation")]

        print('')
        print(f'===Starting training with max_depth {max_depth}===')

        param_dist = {
            "max_depth": max_depth,
            "eta": eta,
            "gamma": gamma,
            "min_child_weight": min_child_weight,
            "verbosity": verbosity,
            "objective": objective,
            "eval_metric": eval_metric,
        }

        booster = xgboost.train(
            params=param_dist,
            dtrain=dtrain,
            evals=watchlist,
            num_boost_round=num_boost_round)

        # Raw model outputs on the validation set, plus their rounded 0/1 form.
        # The rounded values are computed once and reused for the confusion
        # table and for every thresholded metric.
        predictions = booster.predict(dval)
        rounded_predict = np.round(predictions)

        print("Metrics for validation set")
        print('')
        print(pd.crosstab(index=y_val, columns=rounded_predict,
                          rownames=['Actuals'], colnames=['Predictions'], margins=True))

        val_accuracy = accuracy_score(y_val, rounded_predict)
        val_precision = precision_score(y_val, rounded_predict)
        val_recall = recall_score(y_val, rounded_predict)

        print("Accuracy Model A: %.2f%%" % (val_accuracy * 100.0))
        print("Precision Model A: %.2f" % (val_precision))
        print("Recall Model A: %.2f" % (val_recall))

        # Log additional metrics, next to the default ones logged automatically.
        # Accuracy is logged as a percentage; precision and recall as fractions.
        mlflow.log_metric("Accuracy Model A", val_accuracy * 100.0)
        mlflow.log_metric("Precision Model A", val_precision)
        mlflow.log_metric("Recall Model A", val_recall)

        # AUC is computed from the raw (unrounded) predictions.
        val_auc = roc_auc_score(y_val, predictions)

        print("Validation AUC A: %.2f" % (val_auc))
        mlflow.log_metric("Validation AUC A", val_auc)

        # Persist the trained model; the directory is created if needed.
        model_file_path = os.path.join(model_dir, "xgboost-model")
        os.makedirs(model_dir, exist_ok=True)
        booster.save_model(model_file_path)

    return booster


In [None]:
# MLflow experiment under which this training run is recorded.
experiment_name = 'new-bank-experiment'

# Train on the prepared splits with the default hyperparameters; arguments are
# passed by keyword so each one is matched to its parameter explicitly.
booster = train(
    X_train=train_x,
    y_train=train_y,
    X_val=val_x,
    y_val=val_y,
    experiment_name=experiment_name,
    tracking_server_arn=tracking_server_arn,
)