# Spam classification pipeline on Amazon SageMaker

The pipeline consists of three stages: dataset preprocessing, model training, and model serving.

In [None]:
import sagemaker
import datetime
import numpy as np
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.tensorflow import TensorFlow

Download the dataset:

In [None]:
# Fetch the SMS Spam Collection archive from the UCI repository.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
# Unpack into ./data; only if that succeeds, remove the bundled readme and the archive.
!unzip smsspamcollection.zip -d data && rm data/readme smsspamcollection.zip

In [None]:
# Open a SageMaker session and look up the name of its default S3 bucket.
session = sagemaker.Session()
bucket_name = session.default_bucket()

# A second-resolution timestamp is embedded in the processing job name so
# that the name differs from run to run.
time = f"{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}"
processing_job_name = "spam-classification-" + time

## Data processing

Preprocess the dataset using `SKLearnProcessor` and save the vocabulary, training data, and test data to S3.

In [None]:
# The processing job's results are uploaded under a prefix named after the job.
output_path = f"s3://{bucket_name}/{processing_job_name}/output"

# Container-side directories used for the job's input and output.
container_input_dir = "/opt/ml/processing/input"
container_output_dir = "/opt/ml/processing/output"

sklearn_processor = SKLearnProcessor(
    framework_version="0.20.0",
    role=get_execution_role(),
    instance_type="ml.t3.medium",
    instance_count=1,
)

# Local ./data is supplied as the job input; the container output directory
# is sent to output_path.
processing_inputs = [
    ProcessingInput(source="./data", destination=container_input_dir),
]
processing_outputs = [
    ProcessingOutput(
        output_name="output",
        source=container_output_dir,
        destination=output_path,
    ),
]

# Command-line arguments handed to processing/data_processing.py.
script_arguments = [
    "--data_path",
    container_input_dir,
    "--output_path",
    container_output_dir,
]

sklearn_processor.run(
    code="processing/data_processing.py",
    job_name=processing_job_name,
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=script_arguments,
)

# Keep the description of the most recent job started by this processor.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

## Model training

Train the model:

In [None]:
# Train and test data locations under the processing job's output prefix.
train_path = output_path + "/train"
test_path = output_path + "/test"

# Values forwarded to the training script as hyperparameters.
hyperparameters = dict(epochs=5, batch_size=32, learning_rate=0.01)

# TensorFlow estimator that runs training/train.py on a single ml.m5.large instance.
estimator = TensorFlow(
    base_job_name="spam-classification",
    role=get_execution_role(),
    source_dir="training",
    entry_point="train.py",
    model_dir="/opt/ml/model",
    hyperparameters=hyperparameters,
    train_instance_type="ml.m5.large",
    train_instance_count=1,
    framework_version="2.1",
    py_version="py3",
    script_mode=True,
)

# Named inputs ("train" / "test") passed to fit(), each pointing at an S3 prefix.
inputs = dict(train=train_path, test=test_path)

estimator.fit(inputs)

## Model deployment

Deploy the trained model:

In [None]:
# Deploy the trained estimator to an endpoint on one ml.t2.medium instance
# and keep the returned predictor for the test request below.
predictor = estimator.deploy(
    instance_type="ml.t2.medium",
    initial_instance_count=1,
)

Test the model:

In [None]:
def get_results(score):
    """Turn a prediction score into a binary label.

    Returns 1 when ``score`` is strictly greater than 0.5, otherwise 0.
    """
    if score > 0.5:
        return 1
    return 0


# S3 key of the test array under the processing job's output prefix
# (presumably written by the processing script -- confirm against it).
prefix = f"{processing_job_name}/output/test/X_test.npy"
session.download_data("data/test", key_prefix=prefix, bucket=bucket_name)

test_samples = np.load("data/test/X_test.npy")

# Send only the first test sample to the endpoint, then read the first
# value of the first entry under "predictions" in the response.
response = predictor.predict(test_samples[0])
result = response["predictions"][0][0]
print(f"\n\nPrediction result: {get_results(result)}")

Delete the endpoint (this stops the serving container):

In [None]:
# Tear down the endpoint behind the predictor returned by estimator.deploy()
# once testing is finished.
predictor.delete_endpoint()