In [None]:
! pip install --upgrade boto3

In [None]:
import os
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# Region of the default boto3 session and the execution role used for the job.
region = boto3.Session().region_name
role = get_execution_role()

# S3 location for the dataset and the job output.
# Replace `bucket` with your own bucket name if you do not want the default one.
prefix = "sagemaker/DEMO-automl-parquet"
bucket = sagemaker.Session().default_bucket()

We will use the [PyArrow](https://arrow.apache.org/docs/python/) library, through pandas' `to_parquet`, to store the Abalone dataset in the Parquet format.

In [None]:
import pyarrow

In [None]:
%%time

import numpy as np
import pandas as pd

# Local file name for the downloaded CSV copy of the Abalone dataset.
FILE_NAME = "abalone.csv"

# Fetch the CSV from the sample-files bucket into the working directory.
s3 = boto3.client("s3")
s3.download_file(
    Bucket="sagemaker-sample-files",
    Key="datasets/tabular/uci_abalone/abalone.csv",
    Filename=FILE_NAME,
)

# The CSV is read without a header row, so column names are supplied here.
feature_names = [
    "Sex",
    "Length", "Diameter", "Height",
    "Whole weight", "Shucked weight", "Viscera weight", "Shell weight",
    "Rings",
]

# Load into a pandas dataframe, then write it back out as a Parquet file.
data = pd.read_csv(
    FILE_NAME,
    header=None,
    names=feature_names,
)
data.to_parquet("abalone.parquet")

In [None]:
%%time
# Upload the local Parquet file under s3://{bucket}/{prefix}/.
# Left as the last expression so the call's return value is shown as cell output.
sagemaker.Session().upload_data(
    path="abalone.parquet",
    bucket=bucket,
    key_prefix=prefix,
)

After setting the parameters, we start the AutoML job and poll its status once a minute until it finishes, which in this example takes under one hour.

In [None]:
%%time
import time
from time import gmtime, strftime

# Job name carries a UTC month-day-hour-minute suffix.
job_name = f"autopilot-parquet-{strftime('%m-%d-%H-%M', gmtime())}"
print("AutoML job:", job_name)

# S3 locations: the uploaded Parquet file (input) and the output prefix.
s3_input_uri = f"s3://{bucket}/{prefix}/abalone.parquet"
s3_output_uri = f"s3://{bucket}/{prefix}/output"

# Single input channel: the uncompressed Parquet file, with "Rings" as the target column.
input_data_config = [
    {
        "ContentType": "x-application/vnd.amazon+parquet",
        "CompressionType": "None",
        "DataSource": {
            "S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": s3_input_uri},
        },
        "TargetAttributeName": "Rings",
    }
]

# Request passed to create_auto_ml_job; the job is capped at 5 candidates.
create_auto_ml_job_params = {
    "AutoMLJobConfig": {"CompletionCriteria": {"MaxCandidates": 5}},
    "AutoMLJobName": job_name,
    "InputDataConfig": input_data_config,
    "OutputDataConfig": {"S3OutputPath": s3_output_uri},
    "RoleArn": role,
}

# Submit the job through the low-level SageMaker client.
client = boto3.client("sagemaker", region_name=region)
client.create_auto_ml_job(**create_auto_ml_job_params)

# Poll the AutoML job once a minute until it reaches a terminal state.
# The loop must stop on every terminal AutoMLJobStatus, not only "Completed":
# otherwise a job that ends as "Stopped" would keep this cell polling forever.
TERMINAL_STATUSES = ("Completed", "Failed", "Stopped")
POLL_INTERVAL_SECONDS = 60

while True:
    response = client.describe_auto_ml_job(AutoMLJobName=job_name)
    status = response["AutoMLJobStatus"]
    secondary_status = response["AutoMLJobSecondaryStatus"]
    print(f"{status} - {secondary_status}")

    # A "Failed" secondary status also ends polling, as in the original loop condition.
    if status in TERMINAL_STATUSES or secondary_status == "Failed":
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# Surface the reason when the job did not complete successfully.
if status != "Completed":
    print("AutoML job did not complete:", response.get("FailureReason", "no failure reason reported"))