# Refinement MLP model

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, f1_score

import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn
import warnings
warnings.filterwarnings('ignore')

In [2]:
prefix = 'wildfire'
data_dir = 'wildfire_data'
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
output_path = 's3://{}/{}'.format(bucket, prefix)
role = get_execution_role()

In [3]:
test_ref_location = sagemaker_session.upload_data(os.path.join(data_dir, 'test_ref.csv'), key_prefix=prefix)
val_ref_location = sagemaker_session.upload_data(os.path.join(data_dir, 'validation_ref.csv'), key_prefix=prefix)
train_ref_location = sagemaker_session.upload_data(os.path.join(data_dir, 'train_ref.csv'), key_prefix=prefix)

In [4]:
sklearn_mlp = SKLearn(entry_point='train_mlp.py', 
                 source_dir='source_sklearn',
                 role=role,
                 train_instance_count=1,
                 train_instance_type='ml.c4.xlarge',
                 sagemaker_session=sagemaker_session,
                 py_version='py3',
                 framework_version='0.23-1')

train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [5]:
%%time
output_path='s3://{}/{}/'.format(bucket, prefix)

# Train your estimator on S3 training data
sklearn_mlp.fit({'train': output_path})

2021-02-10 06:45:37 Starting - Starting the training job...
2021-02-10 06:45:40 Starting - Launching requested ML instancesProfilerReport-1612939537: InProgress
......
2021-02-10 06:46:50 Starting - Preparing the instances for training......
2021-02-10 06:48:00 Downloading - Downloading input data...
2021-02-10 06:48:33 Training - Training image download completed. Training in progress....[34m2021-02-10 06:48:34,363 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-02-10 06:48:34,365 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-02-10 06:48:34,374 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-02-10 06:48:34,768 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-02-10 06:48:34,781 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-02-10 06:48:34,792 sa

In [6]:
mlp_predictor = sklearn_mlp.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

-------------------!

In [7]:
val = pd.read_csv(os.path.join(data_dir, 'validation_ref.csv'), header=None)
test_x = pd.read_csv(os.path.join(data_dir, 'test_ref.csv'), header=None)
test_y = pd.read_csv(os.path.join(data_dir, 'test_ref_y.csv'), header=None)
val_x = val.iloc[:,1:]
val_y = val.iloc[:,0]

In [8]:
prediction_batches_test = [mlp_predictor.predict(batch) for batch in np.array_split(test_x, 100)]
prediction_batches_val = [mlp_predictor.predict(batch) for batch in np.array_split(val_x, 100)]
test_y_preds = np.concatenate([batch for batch in prediction_batches_test])
val_y_preds = np.concatenate([batch for batch in prediction_batches_val])

In [9]:
from sklearn.metrics import accuracy_score, f1_score
accuracy_mlp_test = accuracy_score(test_y, test_y_preds)
accuracy_mlp_val = accuracy_score(val_y, val_y_preds)
f1_score_mlp_test = f1_score(test_y, test_y_preds, average=None)
f1_score_mlp_val = f1_score(val_y, val_y_preds, average=None)

In [10]:
from utils import print_f1_scores
cause_for_code_refinement = {0: 'Lightning', 1: 'Debris Burning', 2: 'Arson', 3: 'Miscellaneous', 4:'Missing/Undefined', 5: 'Other'}
df_f1_score_mlp_test = print_f1_scores(f1_score_mlp_test, cause_for_code_refinement)
df_f1_score_mlp_val = print_f1_scores(f1_score_mlp_val, cause_for_code_refinement)

In [11]:
print(f"The accuracy for val set is: {accuracy_mlp_val}")
print(f"The accuracy for test set is: {accuracy_mlp_test}")

The accuracy for val set is: 0.4085368075137896
The accuracy for test set is: 0.3815188588825619


In [12]:
df_f1_score_mlp_test

Unnamed: 0,Causes Description,F1 scores
0,Lightning,0.579014
1,Debris Burning,0.483936
2,Arson,0.272922
3,Miscellaneous,0.300102
4,Missing/Undefined,0.089003
5,Other,0.26253


In [13]:
df_f1_score_mlp_val

Unnamed: 0,Causes Description,F1 scores
0,Lightning,0.582622
1,Debris Burning,0.51968
2,Arson,0.364454
3,Miscellaneous,0.257893
4,Missing/Undefined,0.114709
5,Other,0.287345


In [13]:
mlp_predictor.delete_endpoint()