# Linear Learner: multiclass wildfire-cause classification (SageMaker)

In [1]:
import pandas as pd
import numpy as np
import sagemaker
import os
from utils import standardize_data
from sagemaker import LinearLearner
from sagemaker import get_execution_role

# S3 key prefix for this project's output and the local folder the CSV splits are read from.
prefix = 'wildfire'
data_dir = 'wildfire_data'

# Session, default bucket and execution role passed to the estimator below.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = get_execution_role()

# Training output location: s3://<default bucket>/<prefix>
output_path = f's3://{bucket}/{prefix}'

In [2]:
# Multiclass Linear Learner estimator.
# `instance_count` / `instance_type` are the sagemaker>=2 names of the former
# `train_instance_count` / `train_instance_type` keyword arguments; using them
# avoids the rename warnings emitted by the SDK.
multiclass_estimator = LinearLearner(
    role=role,
    instance_count=1,
    instance_type='ml.c4.xlarge',
    predictor_type='multiclass_classifier',
    num_classes=13,
    balance_multiclass_weights=True,
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    epochs=15,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [3]:
# Read the four header-less CSV splits from data_dir in one pass;
# the unpacking order matches the order of the file names.
train, val, test_x, test_y = (
    pd.read_csv(os.path.join(data_dir, csv_name), header=None)
    for csv_name in ('train.csv', 'validation.csv', 'test.csv', 'test_y.csv')
)

In [4]:
# Column 0 of each split is used as the label; the remaining columns are the features.
# Slice with iloc instead of dropping the column in place, so `train` / `val`
# stay exactly as loaded and re-running this cell yields the same arrays
# (the in-place drop made a second run pick a feature column as the label).
train_y_np = train.iloc[:, 0].values.astype('float32')
train_x_np = train.iloc[:, 1:].values.astype('float32')
val_y_np = val.iloc[:, 0].values.astype('float32')
val_x_np = val.iloc[:, 1:].values.astype('float32')

In [8]:
# Wrap the float32 arrays as record sets, one per input channel.
formatted_train_data = multiclass_estimator.record_set(
    train_x_np,
    labels=train_y_np,
    channel='train',
)
formatted_val_data = multiclass_estimator.record_set(
    val_x_np,
    labels=val_y_np,
    channel='validation',
)

In [9]:
# Launch the training job with both record sets (train and validation channels).
training_channels = [formatted_train_data, formatted_val_data]
multiclass_estimator.fit(training_channels)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-02-05 07:01:08 Starting - Starting the training job...
2021-02-05 07:01:33 Starting - Launching requested ML instancesProfilerReport-1612508468: InProgress
......
2021-02-05 07:02:34 Starting - Preparing the instances for training.........
2021-02-05 07:03:55 Downloading - Downloading input data...
2021-02-05 07:04:35 Training - Downloading the training image[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/05/2021 07:04:34 INFO 139783852615488] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u

In [10]:
%%time 
# deploy and create a predictor
multiclass_predictor = multiclass_estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


---------------!CPU times: user 304 ms, sys: 609 µs, total: 304 ms
Wall time: 7min 32s


In [11]:
# Cast the test features to float32, then send only the first row to the endpoint.
test_x_np = test_x.values.astype('float32')
first_test_row = test_x_np[0]
result = multiclass_predictor.predict(first_test_row)

In [12]:
# Display the raw endpoint response for the single-row request.
result

[label {
   key: "predicted_label"
   value {
     float32_tensor {
       values: 12.0
     }
   }
 }
 label {
   key: "score"
   value {
     float32_tensor {
       values: 3.0731650895177154e-06
       values: 1.000000013351432e-10
       values: 3.835055828460554e-09
       values: 8.309261829708703e-07
       values: 1.000000013351432e-10
       values: 4.985261647938444e-10
       values: 1.000000013351432e-10
       values: 2.8947733099471407e-10
       values: 1.0261609340034283e-09
       values: 0.026925604790449142
       values: 0.10333773493766785
       values: 0.005169513635337353
       values: 0.8645632266998291
     }
   }
 }]

In [13]:
# Query the endpoint with the test features split into 100 chunks, then gather the
# first 'predicted_label' value of every returned record into one flat array.
prediction_batches = [
    multiclass_predictor.predict(chunk)
    for chunk in np.array_split(test_x_np, 100)
]
test_preds = np.array([
    record.label['predicted_label'].float32_tensor.values[0]
    for response in prediction_batches
    for record in response
])

In [14]:
# Spot-check: first entry of the collected predicted labels.
test_preds[0]

12.0

In [15]:
from sklearn.metrics import accuracy_score, f1_score

# Score the endpoint predictions against the held-out labels:
# overall accuracy plus one F1 value per class (average=None).
# NOTE(review): the recorded accuracy is ~0.08, close to 1/13 chance for 13 classes -
# confirm that test.csv rows line up with test_y.csv and were preprocessed like train.csv.
test_y_np = test_y.values.astype('float32')
accuracy_linear = accuracy_score(y_true=test_y_np, y_pred=test_preds)
f1_score_linear = f1_score(y_true=test_y_np, y_pred=test_preds, average=None)

In [16]:
# Display the overall test accuracy computed by accuracy_score.
accuracy_linear

0.07994713309524479

In [17]:
from utils import print_f1_scores
# Pass the per-class F1 array to the project helper and keep what it returns.
# NOTE(review): presumably a DataFrame pairing each cause description with its F1 score
# (judging by the displayed output) - confirm in utils.print_f1_scores.
df_f1_score_linear = print_f1_scores(f1_score_linear)

In [18]:
# Display the object returned by print_f1_scores.
df_f1_score_linear

Unnamed: 0,Causes Description,F1 scores
0,Lightning,0.0
1,Equipment Use,0.016398
2,Smoking,0.0
3,Campfire,0.0
4,Debris Burning,0.0
5,Railroad,0.042616
6,Arson,0.055314
7,Children,0.0
8,Miscellaneous,0.004077
9,Fireworks,0.129707


In [None]:
# Tear down the hosted endpoint through the predictor returned by deploy();
# sagemaker>=2 does not support delete_endpoint() on the estimator.
multiclass_predictor.delete_endpoint()