In [53]:
import os
import boto3
import re
import sagemaker
import pandas as pd

role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Declare bucket name, remote file, and destination
my_bucket = 'ccprojectjmps'
data_key = 'Original_Dataset.csv'
training_data = ''
data_location = 's3://{}/{}'.format(my_bucket, data_key)

#pd.read_csv(data_location)

In [54]:
import pickle
import numpy as np #used to import mathematical operations
import matplotlib.pyplot as plt #used to plot different things in python
import pandas as pd #import data sets and manage data sets
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import io
import time
import json
import sagemaker.amazon.common as smac

In [55]:
#Taking the input
dataset = pd.read_csv(data_location)

In [56]:
rand_split = np.random.rand(len(dataset))
train_list = rand_split < 0.75
test_list = rand_split >= 0.75

data_train = dataset[train_list]
data_test = dataset[test_list]

train_y = ((data_train.iloc[:, -1] == 0) + 0).to_numpy()
train_x = data_train.iloc[:, :-1].to_numpy()

test_y = ((data_test.iloc[:, -1] == 0) + 0).to_numpy()
test_x = data_test.iloc[:, :-1].to_numpy();

#print(train_x)

In [57]:
#Training the classifier
classifier = LogisticRegression(random_state = 0)
classifier.fit(train_x, train_y)

LogisticRegression(random_state=0)

In [58]:
train_file = "linear_train.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_x.astype("float32"), train_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(my_bucket).Object(
    os.path.join(training_data, "train", train_file)
).upload_fileobj(f)

In [59]:
from sagemaker import image_uris

container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [63]:
linear_job = "DEMO-linear-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}train/".format(my_bucket, training_data),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(my_bucket, training_data)},
    "HyperParameters": {
        "feature_dim": "31",
        "mini_batch_size": "32",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "32",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is: DEMO-linear-2022-11-20-19-00-47


In [64]:
%%time

region = boto3.Session().region_name
sm = boto3.client("sagemaker")

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)["TrainingJobStatus"]
print(status)
sm.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=linear_job)
if status == "Failed":
    message = sm.describe_training_job(TrainingJobName=linear_job)["FailureReason"]
    print("Training failed with the following error: {}".format(message))
    raise Exception("Training job failed")

InProgress
CPU times: user 71.1 ms, sys: 1.41 ms, total: 72.5 ms
Wall time: 4min


In [65]:
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

arn:aws:sagemaker:us-east-2:843351136705:model/demo-linear-2022-11-20-19-00-47


In [66]:
linear_endpoint_config = "DEMO-linear-endpoint-config-" + time.strftime(
    "%Y-%m-%d-%H-%M-%S", time.gmtime()
)
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.m4.xlarge",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

DEMO-linear-endpoint-config-2022-11-20-19-09-40
Endpoint Config Arn: arn:aws:sagemaker:us-east-2:843351136705:endpoint-config/demo-linear-endpoint-config-2022-11-20-19-09-40


In [67]:
%%time

linear_endpoint = "DEMO-linear-endpoint-" + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

DEMO-linear-endpoint-202211201910
arn:aws:sagemaker:us-east-2:843351136705:endpoint/demo-linear-endpoint-202211201910
Status: Creating
Arn: arn:aws:sagemaker:us-east-2:843351136705:endpoint/demo-linear-endpoint-202211201910
Status: InService
CPU times: user 126 ms, sys: 6.37 ms, total: 132 ms
Wall time: 4min 1s


In [69]:
#Function to convert array to a csv
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [70]:
runtime = boto3.client("runtime.sagemaker")

payload = np2csv(test_x)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [71]:
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(
    np.abs(test_y - np.median(train_y))
)  ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 0.331
Test MAE Linear: 0.322


In [72]:
test_pred_class = (test_pred > 0.5) + 0
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class)) * 100
baseline_accuracy = np.mean((test_y == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 70.3 %
Baseline Accuracy: 66.9 %
