In [None]:
import os
import boto3
import re
import sagemaker

bucket = "ENTER-BUCKET-NAME"

prefix = (
    "sagemaker/liver"  # place to upload training files within the bucket
)


# Define IAM Role 
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Enter data filename
data_key = 'ilpd.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)


In [6]:
import pandas as pd
import numpy as np
import io
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import sagemaker.amazon.common as smac


In [7]:
df = pd.read_csv(data_location)

In [8]:
df.head()

Unnamed: 0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
0,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
1,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
2,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
3,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1
4,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.3,1


In [9]:
print(df.shape)

(582, 11)


In [10]:
# Check for missing values - we only have 4 so I decided to remove them in Step2

print(df.isnull().sum())


65        0
Female    0
0.7       0
0.1       0
187       0
16        0
18        0
6.8       0
3.3       0
0.9       4
1         0
dtype: int64


In [11]:
# Specify the column names
column_names = ['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot', 'TP', 'ALB', 'A_G_Ratio', 'Selector']
df.columns = column_names

In [12]:
# Step 2: Perform label encoding on 'Gender' column
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])
df['Selector'] = df['Selector'].replace({2: 0})
df = df.dropna()

# Print the preprocessed dataset
print(df.head())

   Age  Gender    TB   DB  Alkphos  Sgpt  Sgot   TP  ALB  A_G_Ratio  Selector
0   62       1  10.9  5.5      699    64   100  7.5  3.2       0.74         1
1   62       1   7.3  4.1      490    60    68  7.0  3.3       0.89         1
2   58       1   1.0  0.4      182    14    20  6.8  3.4       1.00         1
3   72       1   3.9  2.0      195    27    59  7.3  2.4       0.40         1
4   46       1   1.8  0.7      208    19    14  7.6  4.4       1.30         1


In [13]:
print(df.isnull().sum())

Age          0
Gender       0
TB           0
DB           0
Alkphos      0
Sgpt         0
Sgot         0
TP           0
ALB          0
A_G_Ratio    0
Selector     0
dtype: int64


In [14]:
# Split the data into 80% training, 10% validation and 10% testing

rand_split = np.random.rand(len(df))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = df[train_list]
data_val = df[val_list]
data_test = df[test_list]

train_y = data_train.iloc[:, -1].values
train_X = data_train.iloc[:, :-1].values

val_y = data_val.iloc[:, -1].values
val_X = data_val.iloc[:, :-1].values

test_y = data_test.iloc[:, -1].values
test_X = data_test.iloc[:, :-1].values


In [15]:
train_file = "linear_train.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype("float32"), train_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "train", train_file)
).upload_fileobj(f)


In [16]:
validation_file = "linear_validation.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype("float32"), val_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(bucket).Object(
    os.path.join(prefix, "validation", validation_file)
).upload_fileobj(f)


In [17]:
from sagemaker import image_uris

container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [20]:
linear_job = "liver-prediction-api"

print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.m5.large", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(bucket, prefix)},
    "HyperParameters": {
        "feature_dim": "10",
        "mini_batch_size": "100",
        "predictor_type": "binary_classifier",
        "epochs": "10",
        "num_models": "32",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is: liver-prediction-api


In [None]:
# Setup hosting container and create model
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

In [None]:
linear_endpoint_config = "liver-prediction-api-endpoint-config"
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.m5.large",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

In [23]:
linear_endpoint = "liver-prediction-api"
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

liver-prediction-api
arn:aws:sagemaker:eu-central-1:736126753921:endpoint/liver-prediction-api
Status: Creating
Arn: arn:aws:sagemaker:eu-central-1:736126753921:endpoint/liver-prediction-api
Status: InService


In [24]:
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [25]:
runtime = boto3.client("runtime.sagemaker")

payload = np2csv(test_X)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [26]:
test_pred_class = (test_pred > 0.5) + 0
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class)) * 100
baseline_accuracy = np.mean((test_y == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 73.9 %
Baseline Accuracy: 73.9 %
