# XGBoost Model

In [1]:
import os
import gc
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

from utils import print_f1_scores

import warnings
warnings.filterwarnings('ignore')

## Data Upload For Training

In [3]:
session = sagemaker.Session()

prefix = 'wildfire'
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [4]:
role = get_execution_role()

In [5]:
container = get_image_uri(session.boto_region_name, 'xgboost', 'latest')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


## Estimator Definition

In [6]:
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='multi:softmax',
                        num_class=13,
                        early_stopping_rounds=10,
                        num_round=30)



train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


## Hyperparameter Tuner Definition And Training

In [7]:
xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:merror', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 3, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

In [8]:
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

Fitting the hyperparameter tuner

In [9]:
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

..................................................................................................................................................................................................................................................................................................................................................................................................................................!


Saving the best model:

In [10]:
xgb_attached = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())


2021-02-13 05:32:35 Starting - Preparing the instances for training
2021-02-13 05:32:35 Downloading - Downloading input data
2021-02-13 05:32:35 Training - Training image download completed. Training in progress.
2021-02-13 05:32:35 Uploading - Uploading generated training model
2021-02-13 05:32:35 Completed - Training job completed


In [11]:
print(xgb_attached.model_data)

s3://sagemaker-us-west-1-170667047098/wildfire/output/xgboost-210213-0501-001-79f78e61/output/model.tar.gz


## Deployment

In [12]:
xgb_predictor = xgb_attached.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

-----------------!

In [62]:
predictions = []
with open(os.path.join(data_dir, 'test.csv'), "r") as file:
    data = file.readlines()
    for i in range(0, len(data), 10000):
        batch = "".join(data[i:i+10000])
        batch_predictions = xgb_predictor.predict(batch, initial_args={'ContentType': 'text/csv'}).decode("utf-8").split(",")
        batch_predictions = [int(float(p)) for p in batch_predictions]
        predictions += batch_predictions

## Accuracy And F1 Score

In [63]:
test_y = pd.read_csv(os.path.join(data_dir, 'test_y.csv'), header=None)
accuracy_xgb = accuracy_score(test_y, predictions)
f1_score_xgb = f1_score(test_y, predictions, average=None)

In [65]:
print(accuracy_xgb)

0.3869524158538648


In [66]:
df_f1_score_xgb = print_f1_scores(f1_score_xgb)

In [67]:
df_f1_score_xgb

Unnamed: 0,Causes Description,F1 scores
0,Lightning,0.581958
1,Equipment Use,0.042096
2,Smoking,0.0
3,Campfire,0.112746
4,Debris Burning,0.511186
5,Railroad,0.042674
6,Arson,0.276718
7,Children,0.002845
8,Miscellaneous,0.420798
9,Fireworks,0.384193


In [69]:
xgb_predictor.delete_endpoint()