# Random Forest using Custom Image

In this segment, you will use the heart disease data to build a random forest model that predicts whether or not a person is suffering from heart disease.
- Heart disease = 0 means that the person does not have heart disease.
- Heart disease = 1 means that the person has heart disease.
- sex = 0 means that the person is female.
- sex = 1 means that the person is male.
- BP: Blood Pressure
- cholestrol: Cholesterol level (the dataset spells this column `cholestrol`)

In [1]:
# Import required libraries:
import os
import sys

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import itertools

## Fetch data

In [2]:
# Read the heart-disease CSV into a DataFrame and preview the first rows:
heart_csv = 'heart_v2.csv'
df = pd.read_csv(heart_csv)
df.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


## Prepare data

In [5]:
df.columns[:-1]

Index(['age', 'sex', 'BP', 'cholestrol'], dtype='object')

In [6]:
# Build the column order with the target first, followed by the features.
# The target is selected by name rather than by position so that re-running
# this cell after `df` has already been reordered yields the same list
# (a positional "move last column to front" would rotate a feature column
# into the target slot on the second run).
target_col = 'heart disease'
columnReq = [target_col] + [col for col in df.columns if col != target_col]
columnReq

['heart disease', 'age', 'sex', 'BP', 'cholestrol']

In [7]:
# Reorder the dataframe columns to match columnReq (target column first):
df = df.loc[:, columnReq]

In [8]:
# View the updated dataframe:
df.head()

Unnamed: 0,heart disease,age,sex,BP,cholestrol
0,1,70,1,130,322
1,0,67,0,115,564
2,1,57,1,124,261
3,0,64,1,128,263
4,0,74,0,120,269


In [9]:
# Feature matrix X: every column after the first one (the target sits at position 0):
feature_cols = df.columns[1:]
X = df[feature_cols]
X.head()

Unnamed: 0,age,sex,BP,cholestrol
0,70,1,130,322
1,67,0,115,564
2,57,1,124,261
3,64,1,128,263
4,74,0,120,269


In [10]:
# Target y: the first column, kept as a one-column DataFrame so it can be joined later:
target_name = df.columns[0]
y = df[[target_name]]
y.head()

Unnamed: 0,heart disease
0,1
1,0
2,1
3,0
4,0


In [12]:
# Import required libraries:
from sklearn.model_selection import train_test_split

In [13]:
# 70/30 train-test split with a fixed seed so the split is reproducible:
split_options = {'train_size': 0.7, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
(X_train.shape, X_test.shape)

((189, 4), (81, 4))

In [14]:
# Assemble the training table: target column first, then the features (aligned on the row index):
train = y_train.join(X_train, how='left')
train.head()

Unnamed: 0,heart disease,age,sex,BP,cholestrol
84,0,57,1,110,201
251,0,44,1,130,219
92,1,54,1,124,266
201,1,58,1,125,300
126,1,62,1,120,267


In [15]:
# Assemble the test table: target column first, then the features (aligned on the row index):
test = y_test.join(X_test, how='left')
test.head()

Unnamed: 0,heart disease,age,sex,BP,cholestrol
30,1,57,1,128,229
116,1,46,1,120,249
79,0,56,1,120,236
127,0,52,0,136,196
196,0,58,0,100,248


In [16]:
# Write the training table to disk as bare values (no header row, no index column):
train.to_csv('rftrain.csv', header=False, index=False)

In [17]:
# Write the test table to disk as bare values (no header row, no index column):
test.to_csv('rftest.csv', header=False, index=False)

In [18]:
!/bin/bash ./setup.sh

SageMaker instance route table setup is ok. We are good to go.
SageMaker instance routing for Docker is ok. We are good to go!


In [19]:
import sagemaker
from sagemaker import get_execution_role
from sklearn import preprocessing
from sklearn.metrics import classification_report,confusion_matrix

In [20]:
from sagemaker.sklearn.estimator import SKLearn

In [21]:
# Open a SageMaker session and look up the execution role used by the estimator below:
sagemaker_session = sagemaker.Session()
role = get_execution_role()

In [22]:
region=sagemaker_session.boto_session.region_name

In [23]:
region

'us-east-1'

In [24]:
# Local CSV files written by the to_csv cells above, and the S3 bucket they
# are uploaded to (replace bucket_name with your own bucket):
train_file="rftrain.csv"
test_file="rftest.csv"
bucket_name="ec2-east-sagemaker-demo"

In [25]:
import re

In [27]:
# S3 key prefixes (folders) inside the bucket for the training data,
# the test data and the model output:
training_folder=r'BYOA/train'
test_folder=r'BYOA/test'
model_folder=r'BYOA/model/'

In [28]:
# Full s3:// URI of the training folder:
training_data_uri = 's3://{}/{}'.format(bucket_name, training_folder)

In [29]:
training_data_uri

's3://ec2-east-sagemaker-demo/BYOA/train'

In [30]:
# Full s3:// URIs of the test folder and of the model output folder:
testing_data_uri = 's3://{}/{}'.format(bucket_name, test_folder)
model_data_uri = 's3://{}/{}'.format(bucket_name, model_folder)

In [32]:
# Upload the training CSV under the training prefix; the cell output is the resulting S3 URI:
sagemaker_session.upload_data(
    path=train_file,
    key_prefix=training_folder,
    bucket=bucket_name,
)

's3://ec2-east-sagemaker-demo/BYOA/train/rftrain.csv'

In [33]:
# Upload the test CSV under the test prefix; the cell output is the resulting S3 URI:
sagemaker_session.upload_data(
    path=test_file,
    key_prefix=test_folder,
    bucket=bucket_name,
)

's3://ec2-east-sagemaker-demo/BYOA/test/rftest.csv'

In [34]:
!pygmentize 'rfcustom.py'

[37m#!/usr/bin/env python[39;49;00m
[37m# coding: utf-8[39;49;00m

[37m# In[1]:[39;49;00m


[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m [34mimport[39;49;00m ensemble
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mexternals[39;49;00m [34mimport[39;49;00m joblib


[37m# In[ ]:[39;49;00m


[37m#Define the Parameter in envirorment variable [39;49;00m


[37m# In[7]:[39;49;00m


[34mdef[39;49;00m [32mparse_args[39;49;00m():
    parser = argparse.ArgumentParser()

    [37m# Hyperparameters used for Training[39;49;00m
    parser.add_argument([33m'[39;49;00m[33m--n_estimators[39;49;00m[33m'[39;49;00m, [36mtype[39;49;00m=[36mint[39;49;00m, default=[34m100

In [35]:
# Instance type passed to both the estimator (training) and deploy() below;
# 'local' is used in this run.
instance_type='local'

In [37]:
# Configure the scikit-learn estimator that runs rfcustom.py as the training script.
# No framework_version is passed, so the SDK falls back to its default
# scikit-learn version (this is what triggers the warning printed below).
rf_hyperparameters = {'n_estimators': 5, 'max_depth': 3}
estimator = SKLearn(
    entry_point='rfcustom.py',
    role=role,
    train_instance_type=instance_type,
    base_job_name='Custom-Random-Forest-v1',
    output_path=model_data_uri,
    hyperparameters=rf_hyperparameters,
)

This is not the latest supported version. If you would like to use version 0.23-1, please add framework_version=0.23-1 to your constructor.


In [38]:
# Train the model. The dict keys are the input channel names; the training log
# below shows them mounted in the container as /opt/ml/input/data/training
# and /opt/ml/input/data/testing.
estimator.fit({'training':training_data_uri,'testing':testing_data_uri})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


Creating tmp_562d32e_algo-1-f5tiw_1 ... 
[1BAttaching to tmp_562d32e_algo-1-f5tiw_12mdone[0m
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:16,250 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:16,253 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:16,265 sagemaker_sklearn_container.training INFO     Invoking user training script.
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:16,404 sagemaker-containers INFO     Module rfcustom does not provide a setup.py. 
[36malgo-1-f5tiw_1  |[0m Generating setup.py
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:16,404 sagemaker-containers INFO     Generating setup.cfg
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:16,404 sagemaker-containers INFO     Generating MANIFEST.in
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:16,404 sagemaker-containers INFO     Installing module with the following command:
[36m

[36malgo-1-f5tiw_1  |[0m   import imp
[36malgo-1-f5tiw_1  |[0m ####Starting Main Program#########
[36malgo-1-f5tiw_1  |[0m ####ARG######### Namespace(current_host='algo-1-f5tiw', hosts=['algo-1-f5tiw'], max_depth=3, model_dir='/opt/ml/model', n_estimators=5, output_data_dir='/opt/ml/output/data', test='/opt/ml/input/data/testing', train='/opt/ml/input/data/training')
[36malgo-1-f5tiw_1  |[0m #######Successfully Loaded Train
[36malgo-1-f5tiw_1  |[0m ####### Test Successfully Loaded
[36malgo-1-f5tiw_1  |[0m ####### Model Started Training
[36malgo-1-f5tiw_1  |[0m Training Accuracy: 0.730
[36malgo-1-f5tiw_1  |[0m Testing Accuracy: 0.630
[36malgo-1-f5tiw_1  |[0m 2020-08-02 13:20:18,562 sagemaker-containers INFO     Reporting training SUCCESS
[36mtmp_562d32e_algo-1-f5tiw_1 exited with code 0
[0mAborting on container exit...
===== Job Complete =====


In [40]:
# Deploy the trained model with a single instance of the same instance type
# used for training, and keep the returned predictor for inference.
# NOTE(review): no visible cell tears the deployment down afterwards —
# consider cleaning it up once the evaluation is finished.
predictor=estimator.deploy(instance_type=instance_type,initial_instance_count=1)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


Attaching to tmppo8jm0h3_algo-1-sjopc_1
[36malgo-1-sjopc_1  |[0m Processing /opt/ml/code
[36malgo-1-sjopc_1  |[0m Building wheels for collected packages: rfcustom
[36malgo-1-sjopc_1  |[0m   Building wheel for rfcustom (setup.py) ... [?25ldone
[36malgo-1-sjopc_1  |[0m [?25h  Created wheel for rfcustom: filename=rfcustom-1.0.0-py2.py3-none-any.whl size=6401 sha256=fdf0eae7a7ece2830cde105ca0a16d1fbb80e043433e49dd2000412944f1b332
[36malgo-1-sjopc_1  |[0m   Stored in directory: /tmp/pip-ephem-wheel-cache-_ooei1xf/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3
[36malgo-1-sjopc_1  |[0m Successfully built rfcustom
[36malgo-1-sjopc_1  |[0m Installing collected packages: rfcustom
[36malgo-1-sjopc_1  |[0m Successfully installed rfcustom-1.0.0
[36malgo-1-sjopc_1  |[0m   import imp
[36malgo-1-sjopc_1  |[0m [2020-08-02 13:27:18 +0000] [30] [INFO] Starting gunicorn 19.9.0
[36malgo-1-sjopc_1  |[0m [2020-08-02 13:27:18 +0000] [30] [INFO] Listening at: unix:/t

In [41]:
# Reload the test CSV for inference. The file was written with header=False,
# so the column names are supplied explicitly from columnReq.
dfinf=pd.read_csv(test_file,names=columnReq)

In [42]:
dfinf.head()

Unnamed: 0,heart disease,age,sex,BP,cholestrol
0,1,57,1,128,229
1,1,46,1,120,249
2,0,56,1,120,236
3,0,52,0,136,196
4,0,58,0,100,248


In [43]:
# Feature columns only: drop the first column, which holds the target.
xtest=dfinf.iloc[:,1:]

In [44]:
xtest.head()

Unnamed: 0,age,sex,BP,cholestrol
0,57,1,128,229
1,46,1,120,249
2,56,1,120,236
3,52,0,136,196
4,58,0,100,248


In [45]:
# Send the test features to the deployed model; the result (shown in the next
# cell) is an array with one predicted label per row.
pred=predictor.predict(xtest)

[36malgo-1-sjopc_1  |[0m 2020-08-02 13:29:53,998 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)
[36malgo-1-sjopc_1  |[0m   import imp
[36malgo-1-sjopc_1  |[0m ###### Uploaded model########
[36malgo-1-sjopc_1  |[0m 172.18.0.1 - - [02/Aug/2020:13:29:54 +0000] "POST /invocations HTTP/1.1" 200 776 "-" "-"


In [46]:
pred

array([1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])

In [48]:
# True labels: the first column of the test data, kept as a one-column DataFrame.
ytest=dfinf.iloc[:,0:1]

In [50]:
# Convert the true labels to a NumPy array. np.asarray is used instead of
# `.values` so that re-running this cell is harmless: an ndarray has no
# `.values` attribute, whereas np.asarray returns an existing array unchanged.
ytest = np.asarray(ytest)

In [51]:
ytest

array([[1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0]])

In [52]:
# Confusion matrix of the true labels (ytest) against the model's predictions (pred):
confusion_matrix(ytest,pred)

array([[37, 12],
       [18, 14]])

In [53]:
# classification_report returns one multi-line string; print() renders it as a
# readable table, whereas a bare expression shows the raw repr with literal \n.
print(classification_report(ytest, pred))

'              precision    recall  f1-score   support\n\n           0       0.67      0.76      0.71        49\n           1       0.54      0.44      0.48        32\n\n    accuracy                           0.63        81\n   macro avg       0.61      0.60      0.60        81\nweighted avg       0.62      0.63      0.62        81\n'

In [None]:
!/bin/bash ./setup.sh

In [None]:
# Import required libraries:
import sagemaker
from sagemaker import get_execution_role
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix

# SageMaker SKLearn Estimator
from sagemaker.sklearn.estimator import SKLearn

In [None]:
# Open a SageMaker session, resolve the execution role, and read the session's AWS region:
sagemaker_session = sagemaker.Session()
role = get_execution_role()

boto_sess = sagemaker_session.boto_session
region = boto_sess.region_name

In [None]:
# Local CSV files to upload, and the S3 bucket to upload them to
# (replace bucket_name with your own bucket):
train_file='rftrain.csv'
test_file='rftest.csv'
bucket_name = 'ec2-east-sagemaker-demo'

# S3 key prefixes (folders) for the training data, the test data and the model output:
training_folder = r'BYOA/train'
test_folder = r'BYOA/test'
model_folder = r'BYOA/model/'

# Build the full s3:// URIs from the bucket and prefixes (nothing is uploaded in this cell):
training_data_uri = r's3://' + bucket_name + r'/' + training_folder
testing_data_uri = r's3://' + bucket_name + r'/' + test_folder
model_data_uri = r's3://' + bucket_name + r'/' + model_folder

In [None]:
# Upload the training CSV under the training prefix of the bucket:
sagemaker_session.upload_data(
    path=train_file,
    key_prefix=training_folder,
    bucket=bucket_name,
)

In [None]:
# Upload the test CSV under the test prefix of the bucket:
sagemaker_session.upload_data(
    path=test_file,
    key_prefix=test_folder,
    bucket=bucket_name,
)

In [None]:
!pygmentize 'rfcustom.py'

In [None]:
# Instance type passed to both the estimator (training) and deploy().
# 'local' is used here; set instance_type='ml.m5.xlarge' to use that
# instance type instead.
instance_type='local'

In [None]:
# Build the scikit-learn estimator that runs rfcustom.py as the training script.
estimator = SKLearn(entry_point='rfcustom.py',
                    train_instance_type=instance_type,
                    role=role,
                    output_path=model_data_uri,
                    # Job names may only contain letters, digits and hyphens.
                    # The previous value ('Custom Random-forest') contained a
                    # space; use the same name as the executed cell above.
                    base_job_name='Custom-Random-Forest-v1',
                    # Pin the scikit-learn container version explicitly:
                    # rfcustom.py imports sklearn.externals.joblib, which newer
                    # scikit-learn releases no longer provide.
                    framework_version='0.20.0',
                    hyperparameters={'n_estimators': 5, 'max_depth': 3})

## Train the model

In [None]:
# Train the model. The dict keys ('training', 'testing') are the input channel
# names, each pointing at the S3 folder holding that split.
estimator.fit({'training':training_data_uri,'testing':testing_data_uri})

## Deploy the Model

In [None]:
# Deploy the trained model on a single instance of the configured instance type
# and keep the returned predictor for inference:
deploy_options = {'instance_type': instance_type, 'initial_instance_count': 1}
predictor = estimator.deploy(**deploy_options)

In [None]:
# Reload the test CSV; it has no header row, so column names come from columnReq:
dftst = pd.read_csv(test_file, header=None, names=columnReq)
dftst.head()

In [None]:
# Select the feature columns (everything after the first column, which holds
# the target) and preview them:
xtest=dftst.iloc[:,1:]
xtest.head()

In [None]:
# Select the true labels: the first column, kept as a one-column DataFrame
# (this cell only assigns; it does not display anything):
ytest=dftst.iloc[:,0:1]

In [None]:
# Import required libraries:
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Send the test features to the deployed model and store the predictions in k:
k=predictor.predict(xtest)

In [None]:
# Confusion matrix of the true labels (ytest) against the predictions (k):
confusion_matrix(ytest,k)

In [None]:
# Print classification report:
# classification_report returns one multi-line string; print() renders it as a
# readable table, whereas a bare expression shows the raw repr with literal \n.
print(classification_report(ytest, k))