In [2]:
import os
import boto3
import re
import sagemaker

role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# S3 bucket for saving code and model artifacts.
# Feel free to specify a different bucket and prefix
bucket = sagemaker.Session().default_bucket()

prefix = (
    "sagemaker/titanic-prediction"  # place to upload training files within the bucket
)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac

In [4]:
s3 = boto3.client("s3")

filename = "titanic-train.csv"
# s3://datascience-sagemaker-mk/titanic/train.csv
s3.download_file(
    f"datascience-sagemaker-mk", "titanic/train.csv", filename
)
data = pd.read_csv(filename)


In [5]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# data.columns = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
#        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

In [8]:

# save the data
data.to_csv("data-titanic.csv", sep=",", index=False)

# print the shape of the data file
print(data.shape)

# show the top few rows
display(data.head())

# describe the data object
display(data.describe())

# we will also summarize the categorical field diganosis
display(data.Survived.value_counts())

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


0    549
1    342
Name: Survived, dtype: int64

In [9]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:
cols_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]

In [11]:
data = data.drop(columns=cols_to_drop)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [13]:
X = pd.get_dummies(data)

In [14]:
cols_to_convert = list(X.select_dtypes(exclude='float64'))

In [15]:
cols_to_convert

['Survived',
 'Pclass',
 'SibSp',
 'Parch',
 'Sex_female',
 'Sex_male',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         714 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   Sex_female  891 non-null    uint8  
 7   Sex_male    891 non-null    uint8  
 8   Embarked_C  891 non-null    uint8  
 9   Embarked_Q  891 non-null    uint8  
 10  Embarked_S  891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 46.2 KB


In [17]:
for col in cols_to_convert:
    X[col] = X[col].astype('float64')

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    float64
 1   Pclass      891 non-null    float64
 2   Age         714 non-null    float64
 3   SibSp       891 non-null    float64
 4   Parch       891 non-null    float64
 5   Fare        891 non-null    float64
 6   Sex_female  891 non-null    float64
 7   Sex_male    891 non-null    float64
 8   Embarked_C  891 non-null    float64
 9   Embarked_Q  891 non-null    float64
 10  Embarked_S  891 non-null    float64
dtypes: float64(11)
memory usage: 76.7 KB


In [19]:
X.dropna(inplace=True)

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    714 non-null    float64
 1   Pclass      714 non-null    float64
 2   Age         714 non-null    float64
 3   SibSp       714 non-null    float64
 4   Parch       714 non-null    float64
 5   Fare        714 non-null    float64
 6   Sex_female  714 non-null    float64
 7   Sex_male    714 non-null    float64
 8   Embarked_C  714 non-null    float64
 9   Embarked_Q  714 non-null    float64
 10  Embarked_S  714 non-null    float64
dtypes: float64(11)
memory usage: 66.9 KB


In [23]:
rand_split = np.random.rand(len(X))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) 

data_train = X[train_list]
data_val = X[val_list]

In [71]:
X

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.9250,1.0,0.0,0.0,0.0,1.0
3,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,0.0,1.0
4,0.0,3.0,35.0,0.0,0.0,8.0500,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
885,0.0,3.0,39.0,0.0,5.0,29.1250,1.0,0.0,0.0,1.0,0.0
886,0.0,2.0,27.0,0.0,0.0,13.0000,0.0,1.0,0.0,0.0,1.0
887,1.0,1.0,19.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0,1.0
889,1.0,1.0,26.0,0.0,0.0,30.0000,0.0,1.0,1.0,0.0,0.0


In [72]:
data_train

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,1.0,0.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.9250,1.0,0.0,0.0,0.0,1.0
3,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,54.0,0.0,0.0,51.8625,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
884,0.0,3.0,25.0,0.0,0.0,7.0500,0.0,1.0,0.0,0.0,1.0
885,0.0,3.0,39.0,0.0,5.0,29.1250,1.0,0.0,0.0,1.0,0.0
886,0.0,2.0,27.0,0.0,0.0,13.0000,0.0,1.0,0.0,0.0,1.0
887,1.0,1.0,19.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0,1.0


In [24]:
train_y = ((data_train.iloc[:, 1]) ).to_numpy()
train_X = data_train.iloc[:, 2:].to_numpy()

val_y = ((data_val.iloc[:, 1])).to_numpy()
val_X = data_val.iloc[:, 2:].to_numpy()

# test_y = ((data_test.iloc[:, 1]) + 0).to_numpy()
# test_X = data_test.iloc[:, 2:].to_numpy();

In [73]:
train_X

array([[22.,  1.,  0., ...,  0.,  0.,  1.],
       [38.,  1.,  0., ...,  1.,  0.,  0.],
       [26.,  0.,  0., ...,  0.,  0.,  1.],
       ...,
       [27.,  0.,  0., ...,  0.,  0.,  1.],
       [19.,  0.,  0., ...,  0.,  0.,  1.],
       [32.,  0.,  0., ...,  0.,  1.,  0.]])

In [27]:
train_file = "titanic_linear_train.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype("float32"), train_y.astype("float32"))
f.seek(0)

preprocess_bucket = 'datascience-sagemaker-mk'
prefix = 'titanic'

boto3.Session().resource("s3").Bucket(preprocess_bucket).Object(
    os.path.join(prefix, "train", train_file)
).upload_fileobj(f)

In [28]:
validation_file = "titanic-linear_validation.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype("float32"), val_y.astype("float32"))
f.seek(0)

boto3.Session().resource("s3").Bucket(preprocess_bucket).Object(
    os.path.join(prefix, "validation", validation_file)
).upload_fileobj(f)

In [29]:
from sagemaker import image_uris

container = image_uris.retrieve(region=boto3.Session().region_name, framework="linear-learner")

In [30]:
bucket = 'datascience-sagemaker-mk'
prefix = 'titanic'

In [31]:
linear_job = "titanic-linear-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {"TrainingImage": container, "TrainingInputMode": "File"},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.c4.2xlarge", "VolumeSizeInGB": 10},
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None",
        },
    ],
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/".format(bucket, prefix)},
    "HyperParameters": {
        "feature_dim": "9",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "32",
        "loss": "absolute_loss",
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 60 * 60},
}

Job name is: titanic-linear-2023-05-22-13-56-52


In [32]:
%%time

region = boto3.Session().region_name
sm = boto3.client("sagemaker")

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)["TrainingJobStatus"]
print(status)
sm.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=linear_job)
if status == "Failed":
    message = sm.describe_training_job(TrainingJobName=linear_job)["FailureReason"]
    print("Training failed with the following error: {}".format(message))
    raise Exception("Training job failed")

InProgress
CPU times: user 188 ms, sys: 6.41 ms, total: 195 ms
Wall time: 4min


In [33]:
linear_hosting_container = {
    "Image": container,
    "ModelDataUrl": sm.describe_training_job(TrainingJobName=linear_job)["ModelArtifacts"][
        "S3ModelArtifacts"
    ],
}

create_model_response = sm.create_model(
    ModelName=linear_job, ExecutionRoleArn=role, PrimaryContainer=linear_hosting_container
)

print(create_model_response["ModelArn"])

arn:aws:sagemaker:us-east-1:287758680514:model/titanic-linear-2023-05-22-13-56-52


## host on EC2 for inference

In [35]:
linear_job

'titanic-linear-2023-05-22-13-56-52'

In [38]:
linear_endpoint_config = "titanic-linear-endpoint-config-" + time.strftime(
    "%Y-%m-%d-%H-%M-%S", time.gmtime()
)
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[
        {
            "InstanceType": "ml.p2.xlarge",
            "InitialInstanceCount": 1,
            "ModelName": linear_job,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])

titanic-linear-endpoint-config-2023-05-22-14-53-30
Endpoint Config Arn: arn:aws:sagemaker:us-east-1:287758680514:endpoint-config/titanic-linear-endpoint-config-2023-05-22-14-53-30


In [39]:
%%time

linear_endpoint = "titanic-linear-endpoint-" + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint, EndpointConfigName=linear_endpoint_config
)
print(create_endpoint_response["EndpointArn"])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Status: " + status)

sm.get_waiter("endpoint_in_service").wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp["EndpointStatus"]
print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

if status != "InService":
    raise Exception("Endpoint creation did not succeed")

titanic-linear-endpoint-202305221457
arn:aws:sagemaker:us-east-1:287758680514:endpoint/titanic-linear-endpoint-202305221457
Status: Creating
Arn: arn:aws:sagemaker:us-east-1:287758680514:endpoint/titanic-linear-endpoint-202305221457
Status: InService
CPU times: user 113 ms, sys: 17.5 ms, total: 130 ms
Wall time: 4min 32s


## test data

In [79]:
def transform (bucket, path):
    s3 = boto3.client("s3")

    filename = "titanic-test.csv"
 #f"datascience-sagemaker-mk", "titanic/test.csv", filename
    s3.download_file(
        bucket, path, filename
    )
    data = pd.read_csv(filename)
    data = data.drop(columns=cols_to_drop)
    X = pd.get_dummies(data)

    cols_to_convert = list(X.select_dtypes(exclude='float64'))
 
    for col in cols_to_convert:
        X[col] = X[col].astype('float64')
 
    X.dropna(inplace=True)

    print (X)
    
#     test_y = ((X.iloc[:, 1])).to_numpy()
    test_X = X.iloc[:, 1:].to_numpy();
#     test_X = X.to_numpy();
    return test_X, test_y

In [80]:
test_X, test_y = transform('datascience-sagemaker-mk', 'titanic/test.csv')

     Pclass   Age  SibSp  Parch      Fare  Sex_female  Sex_male  Embarked_C  \
0       3.0  34.5    0.0    0.0    7.8292         0.0       1.0         0.0   
1       3.0  47.0    1.0    0.0    7.0000         1.0       0.0         0.0   
2       2.0  62.0    0.0    0.0    9.6875         0.0       1.0         0.0   
3       3.0  27.0    0.0    0.0    8.6625         0.0       1.0         0.0   
4       3.0  22.0    1.0    1.0   12.2875         1.0       0.0         0.0   
..      ...   ...    ...    ...       ...         ...       ...         ...   
409     3.0   3.0    1.0    1.0   13.7750         1.0       0.0         0.0   
411     1.0  37.0    1.0    0.0   90.0000         1.0       0.0         0.0   
412     3.0  28.0    0.0    0.0    7.7750         1.0       0.0         0.0   
414     1.0  39.0    0.0    0.0  108.9000         1.0       0.0         1.0   
415     3.0  38.5    0.0    0.0    7.2500         0.0       1.0         0.0   

     Embarked_Q  Embarked_S  
0           1.0      

In [81]:
test_X, test_y

(array([[34.5,  0. ,  0. , ...,  0. ,  1. ,  0. ],
        [47. ,  1. ,  0. , ...,  0. ,  0. ,  1. ],
        [62. ,  0. ,  0. , ...,  0. ,  1. ,  0. ],
        ...,
        [28. ,  0. ,  0. , ...,  0. ,  0. ,  1. ],
        [39. ,  0. ,  0. , ...,  1. ,  0. ,  0. ],
        [38.5,  0. ,  0. , ...,  0. ,  0. ,  1. ]]),
 array([34.5 , 47.  , 62.  , 27.  , 22.  , 14.  , 30.  , 26.  , 18.  ,
        21.  , 46.  , 23.  , 63.  , 47.  , 24.  , 35.  , 21.  , 27.  ,
        45.  , 55.  ,  9.  , 21.  , 48.  , 50.  , 22.  , 22.5 , 41.  ,
        50.  , 24.  , 33.  , 30.  , 18.5 , 21.  , 25.  , 39.  , 41.  ,
        30.  , 45.  , 25.  , 45.  , 60.  , 36.  , 24.  , 27.  , 20.  ,
        28.  , 10.  , 35.  , 25.  , 36.  , 17.  , 32.  , 18.  , 22.  ,
        13.  , 18.  , 47.  , 31.  , 60.  , 24.  , 21.  , 29.  , 28.5 ,
        35.  , 32.5 , 55.  , 30.  , 24.  ,  6.  , 67.  , 49.  , 27.  ,
        18.  ,  2.  , 22.  , 27.  , 25.  , 25.  , 76.  , 29.  , 20.  ,
        33.  , 43.  , 27.  , 26.  , 16. 

In [82]:
len(test_X)

331

In [83]:
len(test_X[0])

9

In [84]:
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=",", fmt="%g")
    return csv.getvalue().decode().rstrip()

In [85]:
linear_endpoint

'titanic-linear-endpoint-202305221457'

In [87]:
runtime = boto3.client("runtime.sagemaker")

payload = np2csv(test_X)
response = runtime.invoke_endpoint(
    EndpointName=linear_endpoint, ContentType="text/csv", Body=payload
)
result = json.loads(response["Body"].read().decode())
test_pred = np.array([r["score"] for r in result["predictions"]])

In [88]:
test_pred

array([ 3.50970697,  2.00015283,  2.84324455,  2.85731125,  2.58432174,
        3.15541196,  3.12896466,  2.78249931,  2.66011643,  2.97703457,
        2.20796108,  1.67712879,  1.89013672,  1.36273408,  2.35902858,
        3.44479561,  3.07842112,  2.45806766,  2.02728796,  1.74823129,
        3.40422797,  2.50163054, -0.78330183,  2.33016372,  1.98255587,
        3.04326129,  2.27221394,  2.19485569,  2.82023525,  2.28935575,
        2.35364294,  3.13697147,  2.50937557,  2.89433718,  2.64403486,
        2.53871226,  2.24738169,  1.51104546,  2.9128685 ,  2.25142407,
        0.86305571,  2.19338751,  2.40425229,  2.84590983,  2.58581352,
       -0.28448915,  4.21642065,  2.67881274,  2.91610408, -0.76382732,
        3.10073137,  2.68319392,  3.07900691,  3.31535673,  0.54594111,
        3.40730214,  1.9915601 ,  2.59326077, -1.07510853,  3.26818275,
        3.00697184,  2.3305335 ,  2.66146994, -0.14179635,  0.40537858,
        1.67326498,  2.73595667,  3.26818275,  3.47668552, -0.50

In [89]:
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(
    np.abs(test_y - np.median(train_y))
)  ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear, 3))

Test MAE Baseline : 28.242
Test MAE Linear: 27.966


In [90]:
test_pred_class = (test_pred > 0.5) + 0
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class)) * 100
baseline_accuracy = np.mean((test_y == test_pred_baseline)) * 100

print("Prediction Accuracy:", round(prediction_accuracy, 1), "%")
print("Baseline Accuracy:", round(baseline_accuracy, 1), "%")

Prediction Accuracy: 0.9 %
Baseline Accuracy: 0.6000000000000001 %
