# LINEAR LEARNER

In [1]:
import pandas as pd
import numpy as np
import sagemaker
import os
from utils import standardize_data
from sagemaker import LinearLearner
from sagemaker import get_execution_role
import gc

# specify an output path
prefix = 'wildfire'
data_dir = 'wildfire_data'
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
output_path = 's3://{}/{}'.format(bucket, prefix)
role = get_execution_role()

In [2]:
multiclass_estimator = LinearLearner(role=role,
                       train_instance_count=1, 
                       train_instance_type='ml.c4.xlarge',
                       predictor_type='multiclass_classifier',
                       num_classes=13,
                       balance_multiclass_weights=True,
                       output_path=output_path,
                       sagemaker_session=sagemaker_session,
                       epochs=50)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [3]:
gc.collect()
train = pd.read_csv(os.path.join(data_dir, 'train.csv'), header=None, dtype="float32")
val = pd.read_csv(os.path.join(data_dir, 'validation.csv'), header=None, dtype="float32")
test_x = pd.read_csv(os.path.join(data_dir, 'test.csv'), header=None, dtype="float32")
test_y = pd.read_csv(os.path.join(data_dir, 'test_y.csv'), header=None, dtype="float32")

In [4]:
train_y_np = train.iloc[:,0].values
train.drop([0], axis = 1, inplace = True)
train_x_np = train.values
train=None
gc.collect()
val_y_np = val.iloc[:,0].values
val.drop([0], axis = 1, inplace = True)
val_x_np = val.values
del val

In [5]:
formatted_train_data = multiclass_estimator.record_set(train_x_np, labels=train_y_np, channel='train')
formatted_val_data = multiclass_estimator.record_set(val_x_np, labels=val_y_np, channel='validation')

In [6]:
multiclass_estimator.fit([formatted_train_data, formatted_val_data])

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-02-07 06:54:15 Starting - Starting the training job...
2021-02-07 06:54:20 Starting - Launching requested ML instancesProfilerReport-1612680855: InProgress
.........
2021-02-07 06:56:16 Starting - Preparing the instances for training......
2021-02-07 06:57:17 Downloading - Downloading input data...
2021-02-07 06:57:41 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/07/2021 06:57:58 INFO 140064705316672] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto',

In [7]:
%%time 
# deploy and create a predictor
multiclass_predictor = multiclass_estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-----------------!CPU times: user 322 ms, sys: 17.8 ms, total: 340 ms
Wall time: 8min 32s


In [16]:
test_x_np = test_x.values
result = multiclass_predictor.predict(test_x_np[0])

In [17]:
result

[label {
   key: "predicted_label"
   value {
     float32_tensor {
       values: 12.0
     }
   }
 }
 label {
   key: "score"
   value {
     float32_tensor {
       values: 2.6244238142680842e-06
       values: 1.000000013351432e-10
       values: 4.3790746029337413e-10
       values: 6.731854682584526e-07
       values: 1.000000013351432e-10
       values: 6.794044771218921e-10
       values: 1.000000013351432e-10
       values: 4.790722263336988e-10
       values: 5.5270419352382305e-09
       values: 0.0246282871812582
       values: 0.09284412115812302
       values: 0.004488704260438681
       values: 0.8780355453491211
     }
   }
 }]

In [18]:
prediction_batches = [multiclass_predictor.predict(batch) for batch in np.array_split(test_x_np, 100)]
test_y_preds = np.concatenate([np.array([x.label['predicted_label'].float32_tensor.values[0] for x in batch]) 
                                 for batch in prediction_batches])

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
test_y_np = test_y.values.astype('float32')
accuracy_multiclass = accuracy_score(test_y_np, test_y_preds)
f1_score_multiclass = f1_score(test_y_np, test_y_preds, average=None)

In [20]:
accuracy_multiclass

0.08657749337542989

In [21]:
from utils import print_f1_scores
df_f1_score_multiclass = print_f1_scores(f1_score_multiclass)

In [22]:
df_f1_score_multiclass

Unnamed: 0,Causes Description,F1 scores
0,Lightning,0.0
1,Equipment Use,0.0
2,Smoking,0.0
3,Campfire,0.0
4,Debris Burning,0.071673
5,Railroad,0.035833
6,Arson,0.0
7,Children,0.0
8,Miscellaneous,0.004566
9,Fireworks,0.123243


In [23]:
multiclass_estimator.delete_endpoint()

The function delete_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
