# Trainning

## Benchmark

## XGBoost model

In [1]:
import pandas as pd
import numpy as np
import sagemaker
import os
from utils import standardize_data

In [2]:
session = sagemaker.Session() # Store the current SageMaker session

# S3 prefix (which folder will we use)
prefix = 'wildfire'
data_dir = 'wildfire_data'
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [3]:
from sagemaker import get_execution_role
role = get_execution_role()

In [4]:
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(session.boto_region_name, 'xgboost', 'latest')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [6]:
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='multi:softmax',
                        num_class=13,
                        early_stopping_rounds=10,
                        num_round=30)



train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [7]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner
xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:merror', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 3, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

In [8]:
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [None]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

.................................................................................................................................................................................................................................................................................................................................................................................................................................!


In [10]:
xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())


2021-02-03 05:05:27 Starting - Preparing the instances for training
2021-02-03 05:05:27 Downloading - Downloading input data
2021-02-03 05:05:27 Training - Training image download completed. Training in progress.
2021-02-03 05:05:27 Uploading - Uploading generated training model
2021-02-03 05:05:27 Completed - Training job completed


In [30]:
type(xgb_attached)

sagemaker.estimator.Estimator

In [20]:
xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge', strategy = 'SingleRecord', max_payload=100)

In [None]:
xgb_transformer.transform(test_location, content_type = 'text/csv', split_type = 'Line')

In [22]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-west-1-170667047098/xgboost-2021-02-03-05-50-37-185/test.csv.out to wildfire_data/test.csv.out


In [23]:
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
test_y = pd.read_csv(os.path.join(data_dir, 'test_y.csv'), header=None)
accuracy_xgb = accuracy_score(test_y, predictions)
f1_score_xgb = f1_score(test_y, predictions, average=None)

In [28]:
print(accuracy_xgb)

0.5211490805441222


In [29]:
print(f1_score_xgb )

[0.69039575 0.22378871 0.00417616 0.17545987 0.55823482 0.40904198
 0.43111773 0.06368279 0.45533198 0.43131313 0.02306214 0.02590674
 0.85320755]


Benchmark

In [37]:
(test_y==4).sum().item()/len(test_y)

0.22669453218893248

# SVM Model

In [None]:
from sagemaker.sklearn.estimator import SKLearn

sklearn_svm = SKLearn(entry_point='train_svc.py', 
                 source_dir='source_sklearn',
                 role=role,
                 train_instance_count=1,
                 train_instance_type='ml.c4.xlarge',
                 sagemaker_session=sagemaker_session,
                 py_version='py3',
                 framework_version='0.23-1')

In [None]:
%%time

# Train your estimator on S3 training data
sklearn_svm.fit({'train': output_path})

In [None]:
svm_predictor = sklearn_svm.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

In [None]:
test_x = pd.read_csv(os.path.join(data_dir, "test.csv"), header=None, names=None)
test_y = pd.read_csv(os.path.join(data_dir, "test_y.csv"), header=None, names=None)

In [None]:
test_y_preds = predictor.predict(test_x)
assert len(test_y_preds)==len(test_y)

In [None]:
accuracy = sum(test_y_preds==test_y)/len(test_y)

In [None]:
predictor.delete_endpoint()