# Introduction to SageMaker

Let's initialize the IAM role and the S3 bucket where we will store our data.

In [None]:
import sagemaker

# IAM execution role; it is handed to the estimator further down.
role = sagemaker.get_execution_role()
print(role)

# SageMaker session and its default S3 bucket, used for the training data and model output.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
print(bucket)

# Training an anomaly detection model

We will fetch data from the NYC Taxi dataset, which records New York City taxi ridership over the course of six months in 30-minute intervals. We will use pandas, a common library for manipulating datasets, to load and visualize the data.

In [None]:
%%time

import pandas as pd

import urllib.request

# The CSV is pulled from the numenta/NAB repository on GitHub.
data_source = 'https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/nyc_taxi.csv'
data_filename = 'nyc_taxi.csv'

# Keep a local copy of the file: the batch-transform section uploads it to S3 later on.
urllib.request.urlretrieve(url=data_source, filename=data_filename)

taxi_data = pd.read_csv(data_filename, sep=',')
taxi_data.head()

In [None]:
%matplotlib inline

import matplotlib

import matplotlib.pyplot as plt

# Render figures at 100 DPI from here on.
matplotlib.rcParams.update({'figure.dpi': 100})

# Quick look at the raw data before any modelling.
taxi_data.plot()

Using this data, let's train a model for anomaly detection. We use the SageMaker Random Cut Forest (RCF) algorithm.

Particular to a SageMaker RCF training job are the following hyperparameters:

* **`num_samples_per_tree`** - the number of randomly sampled data points sent to each tree. As a general rule, `1/num_samples_per_tree` should approximate the estimated ratio of anomalies to normal points in the dataset.
* **`num_trees`** - the number of trees to create in the forest. Each tree learns a separate model from different samples of data. The full forest model uses the mean predicted anomaly score from each constituent tree.
* **`feature_dim`** - the dimension of each data point.

* Recommended instance type: `ml.m4`, `ml.c4`, or `ml.c5`
* Current limitations:
  * The RCF algorithm does not take advantage of GPU hardware.

In [None]:
from sagemaker import RandomCutForest

prefix = 'taxidata'

# Training records and model artifacts both live under the same bucket/prefix.
data_location = 's3://{}/{}/'.format(bucket, prefix)
output_path = 's3://{}/{}/output'.format(bucket, prefix)

# Run time and total wait (spot training) are both capped at five minutes.
max_seconds = 5 * 60

# specify general training job information
rcf = RandomCutForest(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    train_use_spot_instances=True,
    train_max_run=max_seconds,
    train_max_wait=max_seconds,
    data_location=data_location,
    output_path=output_path,
    num_samples_per_tree=512,
    num_trees=50,
)

# reshape(-1, 1) turns the 1-D `value` column into one single-feature row per observation.
train_features = taxi_data['value'].to_numpy().reshape(-1, 1)
record_set = rcf.record_set(train_features)

In [None]:

# Train the Random Cut Forest estimator on the records prepared in the previous cell.
rcf.fit(record_set)


## Predictions

### SageMaker Endpoint

In [None]:
# Deploy the trained estimator to a single-instance endpoint and keep the returned predictor.
rcf_inference = rcf.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

endpoint_name = rcf_inference.endpoint
print('Endpoint name: {}'.format(endpoint_name))

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

# Send requests as CSV and parse the endpoint's responses as JSON.
rcf_inference.serializer = csv_serializer
rcf_inference.content_type = 'text/csv'
rcf_inference.deserializer = json_deserializer
rcf_inference.accept = 'application/json'

# One single-feature row per observation, the same layout that was used for training.
taxi_data_numpy = taxi_data['value'].to_numpy().reshape(-1, 1)

# Let's calculate the anomaly score for the first 6 samples
first_rows = taxi_data_numpy[:6]
print(first_rows)
sample_results = rcf_inference.predict(first_rows)
print(sample_results)

# Let's calculate the anomaly score for the entire dataset
results = rcf_inference.predict(taxi_data_numpy)
scores = []
for entry in results['scores']:
    scores.append(entry['score'])

# add scores to taxi data frame and print first few values
taxi_data['score'] = pd.Series(scores, index=taxi_data.index)
taxi_data.head(50)

### Batch Transform

In [None]:

# Destination for the batch-transform results. The template must be filled in with the
# bucket and prefix; left unformatted, the literal 's3://{}/{}/batch_output' string would be
# passed to the transformer as its output path.
batch_output = 's3://{}/{}/batch_output'.format(bucket, prefix)

# Upload the CSV downloaded earlier under the same key prefix used for the rest of the data.
batch_input = sagemaker_session.upload_data(path=data_filename, key_prefix=prefix)

rcf_batch = rcf.transformer(instance_count=1, instance_type='ml.m4.xlarge', output_path=batch_output)

print(batch_input)

# Alternative (disabled): run the transform over the uploaded CSV instead of `record_set`.
#rcf_batch.transform(data=batch_input, data_type='S3Prefix', content_type='text/csv', split_type='Line')

# Run the transform over the data referenced by `record_set`, waiting for the job (wait=True).
rcf_batch.transform(data=record_set.s3_data, data_type='ManifestFile', wait=True)

The plots below overlay the anomaly scores on the taxi ridership data.

In [None]:
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

#
# *Try this out* - change `start` and `end` to zoom in on the
# anomaly found earlier in this notebook
#
start, end = 0, len(taxi_data)
#start, end = 5500, 6500
taxi_data_subset = taxi_data.iloc[start:end]

# Left axis shows ridership, right axis shows the anomaly score; each axis label and its
# tick marks are coloured to match the line drawn on it.
series_styles = (
    (ax1, 'value', 'Taxi Ridership', 'C0', {'alpha': 0.8}),
    (ax2, 'score', 'Anomaly Score', 'C1', {}),
)
for ax, column, label, colour, extra in series_styles:
    ax.plot(taxi_data_subset[column], color=colour, **extra)
    ax.set_ylabel(label, color=colour)
    ax.tick_params(axis='y', colors=colour)

ax1.grid(which='major', axis='both')

ax1.set_ylim(0, 40000)
# The score axis runs from the lowest score up to 1.4x the highest one.
ax2.set_ylim(min(scores), 1.4 * max(scores))
fig.set_figwidth(10)

In [None]:
# Flag points whose score is more than three standard deviations above the mean score.
score_column = taxi_data['score']
score_mean = score_column.mean()
score_std = score_column.std()
score_cutoff = score_mean + 3 * score_std

# The cutoff is computed over the full data frame but applied to the plotted subset.
is_anomalous = taxi_data_subset['score'] > score_cutoff
anomalies = taxi_data_subset[is_anomalous]
anomalies

In [None]:
# Mark the flagged points on the score axis ('ko' = black circles), then show the figure again.
ax2.plot(anomalies.index, anomalies['score'], 'ko')
fig

In [None]:
# Delete the endpoint created above, reusing the notebook's existing session object
# rather than constructing a second sagemaker.Session() just for this call.
sagemaker_session.delete_endpoint(rcf_inference.endpoint)