In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time
# allow interactions with aws services
import boto3
# To work with regular expressions, we import re
import re
# for working with JSON data in Python we import json
import json
# smac is a libray for working with aws SageMaker: an ML service
import sagemaker.amazon.common as smac

In [2]:
!pip install sagemaker --upgrade

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [3]:
#import sagemaker to work with aws sagemaker: this library allows for
#easy access to SageMaker's functionalities such as training, deploying, and managing models, as well
#as providing tools for data pre-processing and model evaluation.""""""
import sagemaker
# get the execution role for the sagemaker session
role = sagemaker.get_execution_role()
# get the region of the current session
region = boto3.Session().region_name

# get the name of the bucket for the current session: were we will upload and store data for training,modeling ..
bucket = sagemaker.Session().default_bucket()
# fix the file name within the bucket were all data and tasks can be located together
prefix = (
    "sagemaker/breast-cancer-prediction"  # place to upload training files within the bucket
)


In [4]:
#get the data
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data')
# set columns name
data.columns = ["id","diagnosis","radius","texture","perimeter","area","smoothness",
                "compactness","concavity","concave points","symmetry","fractal_dimension",
                "radius_se","texture_se","perimeter_se","area_se","smoothness_se","compactness_se","concavity_se",
                "concave points_se","symmetry_se","fractal_dimension_se","radius_worst","texture_worst",
                "perimeter_worst","area_worst","smoothness_worst","compactness_worst","concavity_worst",
                "concave points_worst","symmetry_worst","fractal_dimension_worst"] 

#save the data
data.to_csv("data.csv", sep=',', index=False)

In [5]:
print(data.shape)

(568, 32)


In [6]:
display(data.head())

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave points,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [7]:
display(data.diagnosis.value_counts())

B    357
M    211
Name: diagnosis, dtype: int64

In [8]:
data["diagnosis"].isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
563    False
564    False
565    False
566    False
567    False
Name: diagnosis, Length: 568, dtype: bool

In [9]:
data["diagnosis"].duplicated()

0      False
1       True
2       True
3       True
4       True
       ...  
563     True
564     True
565     True
566     True
567     True
Name: diagnosis, Length: 568, dtype: bool

In [10]:
data.dropna()

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave points,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
1,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
3,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
4,843786,M,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,...,15.470,23.75,103.40,741.6,0.17910,0.52490,0.5355,0.1741,0.3985,0.12440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
564,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
565,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
566,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [11]:
data.shape

(568, 32)

Data Preparation

In [12]:
# split the dataset 
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.8
validation_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

# create training dataset
data_train = data[train_list]
# create validation dataset
data_validation = data[validation_list]
# create test dataset
data_test = data[test_list]
# use the second column of the data to determine if the value is "M" or not 
y_train = ((data_train.iloc[:, 1] == "M") + 0).to_numpy()
X_train = data_train.iloc[:, 2:].to_numpy()

y_val = ((data_validation.iloc[:, 1] == "M") + 0).to_numpy()
X_val = data_validation.iloc[:, 2:].to_numpy()

y_test = ((data_test.iloc[:, 1] == "M") + 0).to_numpy()
X_test = data_test.iloc[:, 2:].to_numpy()

We will convert the datasets to the recordIO-wrapped protobuf format used by the Amazon SageMaker algorithms, and then upload this data to S3. 

In [15]:
# let's create a train_file
train_file = "linear_train.data"

# let's create a binary file f: it allows us to read and write the data in memory without being written to disk
f = io.BytesIO()
# write the X_train and y_train to the f file in a dense tensor format
smac.write_numpy_to_dense_tensor(f, X_train.astype("float32"), y_train.astype("float32"))
# point to the beginning of the file
f.seek(0)

#let's interact with s3 and upload the content of the f file to the bucket we already specified
boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train", train_file)
).upload_fileobj(f)

In [14]:
import io

In [16]:
# the same for the validation dataset
validation_file = "linear_validation.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, X_val.astype("float32"), y_val.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "validation", validation_file)
).upload_fileobj(f)

In [17]:
# we will create a container for our algorithm
from sagemaker import image_uris

container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [20]:
linear_job = "linearjob1"

print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(bucket, prefix)},
    "HyperParameters": {
        "feature_dim": "30",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "32",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is: linearjob1


### Training

In [21]:
%%time

region = boto3.Session().region_name
sm = boto3.client("sagemaker")

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)["TrainingJobStatus"]
print(status)
sm.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=linear_job)
if status == "Failed":
    message = sm.describe_training_job(TrainingJobName=linear_job)["FailureReason"]
    print("Training failed with the following error: {}".format(message))
    raise Exception("Training job failed")

InProgress
CPU times: user 69.8 ms, sys: 6.37 ms, total: 76.2 ms
Wall time: 4min


In [22]:
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

arn:aws:sagemaker:us-east-1:138973825716:model/linearjob1


In [23]:
linear_endpoint_config = "DEMO-linear-endpoint-config-" + time.strftime(
    "%Y-%m-%d-%H-%M-%S", time.gmtime()
)
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-linear-endpoint-config-2023-01-16-18-07-39
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:138973825716:endpoint-config/demo-linear-endpoint-config-2023-01-16-18-07-39


In [24]:
%%time

linear_endpoint = "DEMO-linear-endpoint-" + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

DEMO-linear-endpoint-202301161807
arn:aws:sagemaker:us-east-1:138973825716:endpoint/demo-linear-endpoint-202301161807
Status: Creating
Arn: arn:aws:sagemaker:us-east-1:138973825716:endpoint/demo-linear-endpoint-202301161807
Status: InService
CPU times: user 133 ms, sys: 8.7 ms, total: 142 ms
Wall time: 5min 1s


In [25]:
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [26]:
runtime = boto3.client("runtime.sagemaker")

payload = np2csv(X_test)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [29]:
test_mae_linear = np.mean(np.abs(y_test - test_pred))
test_mae_baseline = np.mean(
    np.abs(y_test - np.median(y_train))
)  ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 0.396
Test MAE Linear: 0.197


In [31]:
test_pred_class = (test_pred > 0.5) + 0
test_pred_baseline = np.repeat(np.median(y_train), len(y_test))

prediction_accuracy = np.mean((y_test == test_pred_class)) * 100
baseline_accuracy = np.mean((y_test == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 96.2 %
Baseline Accuracy: 60.4 %


In [32]:
sm.delete_endpoint(EndpointName=linear_endpoint)

{'ResponseMetadata': {'RequestId': '2ec84f4e-b15c-4ebc-bf65-14a5546c1866',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2ec84f4e-b15c-4ebc-bf65-14a5546c1866',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 16 Jan 2023 18:14:49 GMT'},
  'RetryAttempts': 0}}

==> Our linear model does a good job of predicting breast cancer and has an overall accuracy of close to 92%. 