# Chapter 03. Interactive Model Training

## 3.1 Built-in Algorithms
---

### 3.1.1 Setup
---

In [1]:
%cd /root/sagemaker-course/notebooks/

/root/sagemaker-course/notebooks


In [3]:
import boto3
import pandas as pd

pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 10)         # Keep the output on one page

BUCKET = 'sagemaker-course-20200812'
PREFIX = 'churn'
LOCAL_DATA_DIRECTORY = f'../data/{PREFIX}'

print(f'Artifacts will be written to s3://{BUCKET}/{PREFIX}')

Artifacts will be written to s3://sagemaker-course-20200812/churn


In [3]:
import sagemaker

In [4]:
sagemaker_session = sagemaker.Session()
boto_session = sagemaker_session.boto_session

In [5]:
from sagemaker import get_execution_role

role = get_execution_role()
print(role)

arn:aws:iam::209970524256:role/service-role/AmazonSageMaker-ExecutionRole-20200618T144956


### 3.1.2 Data
---

To see how the dataset was preprocessed, see this notebook: [XGBoost customer churn notebook that starts with the original dataset](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_applying_machine_learning/xgboost_customer_churn/xgboost_customer_churn.ipynb). 

In [6]:
local_data_path = f'{LOCAL_DATA_DIRECTORY}/training-dataset-with-header.csv'
data = pd.read_csv(local_data_path)

data

Unnamed: 0,Churn,Account Length,VMail Message,Day Mins,Day Calls,Eve Mins,Eve Calls,Night Mins,Night Calls,Intl Mins,Intl Calls,CustServ Calls,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,State_GA,State_HI,State_IA,State_ID,State_IL,State_IN,State_KS,State_KY,State_LA,State_MA,State_MD,State_ME,State_MI,State_MN,State_MO,State_MS,State_MT,State_NC,State_ND,State_NE,State_NH,State_NJ,State_NM,State_NV,State_NY,State_OH,State_OK,State_OR,State_PA,State_RI,State_SC,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Area Code_408,Area Code_415,Area Code_510,Int'l Plan_no,Int'l Plan_yes,VMail Plan_no,VMail Plan_yes
0,0,106,0,274.4,120,198.6,82,160.8,62,6.0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0
1,0,28,0,187.8,94,248.6,86,208.8,124,10.6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0
2,1,148,0,279.3,104,201.6,87,280.8,99,7.9,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
3,0,132,0,191.9,107,206.9,127,272.0,88,12.6,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0
4,0,92,29,155.4,110,188.5,104,254.9,118,8.0,4,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2328,0,106,0,194.8,133,213.4,73,190.8,92,11.5,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0
2329,1,125,0,143.2,80,88.1,94,233.2,135,8.8,7,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0
2330,0,129,0,143.7,114,297.8,98,212.6,86,11.4,8,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0
2331,0,159,0,198.8,107,195.5,91,213.3,120,16.5,7,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0


In [7]:
s3_input_train = sagemaker_session.upload_data(f'{LOCAL_DATA_DIRECTORY}/train.csv',
                                               bucket=BUCKET,
                                               key_prefix=PREFIX)

s3_input_validation = sagemaker_session.upload_data(f'{LOCAL_DATA_DIRECTORY}/validation.csv',
                                                    bucket=BUCKET,
                                                    key_prefix=PREFIX)

s3_input_train = sagemaker.TrainingInput(s3_data=s3_input_train, content_type='csv')
s3_input_validation = sagemaker.TrainingInput(s3_data=s3_input_validation, content_type='csv')

### 3.1.3 Training
---

In [12]:
from sagemaker import image_uris

xgboost_image_uri = image_uris.retrieve(framework='xgboost', region=boto_session.region_name, version='0.90-2')
xgboost_image_uri

'257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3'

In [13]:
from sagemaker import estimator

In [16]:
xgb_model = estimator.Estimator(image_uri=xgboost_image_uri,
                                role=role,
                                instance_count=1,
                                instance_type='ml.m4.xlarge',
                                output_path=f's3://{BUCKET}/{PREFIX}',
                                base_job_name='builtin-xgboost',
                                sagemaker_session=sagemaker_session)

In [17]:
xgb_model.set_hyperparameters(max_depth=5,
                              subsample=0.8,
                              num_round=600,
                              eta=0.2,
                              gamma=4,
                              min_child_weight=6,
                              silent=0,
                              objective='binary:logistic')

In [19]:
xgb_model.fit({'train': s3_input_train,
               'validation': s3_input_validation})

2020-08-12 12:05:53 Starting - Starting the training job...
2020-08-12 12:05:54 Starting - Launching requested ML instances......
2020-08-12 12:07:16 Starting - Preparing the instances for training......
2020-08-12 12:08:15 Downloading - Downloading input data...
2020-08-12 12:08:35 Training - Downloading the training image...
2020-08-12 12:09:22 Uploading - Uploading generated training model.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[12:09:1

## 3.2 Custom Code with Pre-built Docker Images
---

### 3.2.1 Create a custom Scikit-learn script to train a model
---

See the [sagemaker-training-toolkit](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md) repository for a comphrehensive list of environment variables used by SageMaker Docker images.

In [20]:
!pygmentize '../scripts/sklearn/sklearn_rf.py'

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m print_function

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m [34mimport[39;49;00m ensemble
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mexternals[39;49;00m [34mimport[39;49;00m joblib


[34mif[39;49;00m [31m__name__[39;49;00m == [33m'[39;49;00m[33m__main__[39;49;00m[33m'[39;49;00m:
    parser = argparse.ArgumentParser()

    [37m# Hyperparameters are described here. In this simple example we are just including one hyperparameter.[39;49;00m
    parser.add_argument([33m'[39;49;00m[33m--n_estimators[39;49;00m[33m'[39;49;00m, [36mtype[39;49;00m=[36mint[39;49;00m, default=[34m100[39;49;00m)

    [37m# Sagemaker specific arguments. Defaults are set in the 

### 3.2.2 Train a SageMaker Scikit Estimator
---

In [21]:
from sagemaker.sklearn.estimator import SKLearn

sklearn_estimator = SKLearn(
    framework_version='0.20.0',
    py_version='py3',
    entry_point='../scripts/sklearn/sklearn_rf.py',
    code_location=f's3://{BUCKET}/{PREFIX}',
    hyperparameters={'n_estimators': 50},
    role=role,
    instance_type='ml.c4.xlarge',
    output_path=f's3://{BUCKET}/{PREFIX}',
    base_job_name='custom-code-sklearn',
    sagemaker_session=sagemaker_session)

In [22]:
sklearn_estimator.fit({'train': s3_input_train})

2020-08-12 12:16:45 Starting - Starting the training job...
2020-08-12 12:16:47 Starting - Launching requested ML instances......
2020-08-12 12:18:09 Starting - Preparing the instances for training......
2020-08-12 12:19:10 Downloading - Downloading input data...
2020-08-12 12:19:42 Training - Downloading the training image...
2020-08-12 12:20:09 Uploading - Uploading generated training model
2020-08-12 12:20:09 Completed - Training job completed
[34m2020-08-12 12:19:57,228 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-08-12 12:19:57,230 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-12 12:19:57,241 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-08-12 12:19:57,533 sagemaker-containers INFO     Module sklearn_rf does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-08-12 12:19:57,533 sagemaker-containers INFO     Generating 

In [23]:
# Head to the URL in the following cell to view the details of this training job.

f'https://{boto_session.region_name}.console.aws.amazon.com/sagemaker/home?region={boto_session.region_name}#/jobs/{sklearn_estimator.jobs[0].job_name}'

'https://us-east-2.console.aws.amazon.com/sagemaker/home?region=us-east-2#/jobs/custom-code-sklearn-2020-08-12-12-16-44-914'

### 3.3 Installing custom Python requirements
---

In [28]:
sklearn_estimator = SKLearn(
    framework_version='0.20.0',
    py_version='py3',
    entry_point='../scripts/sklearn/sklearn_rf.py',
    code_location=f's3://{BUCKET}/{PREFIX}',
    hyperparameters={'n_estimators': 50},
    role=role,
    instance_type='ml.c4.xlarge',
    output_path=f's3://{BUCKET}/{PREFIX}',
    base_job_name='custom-code-sklearn',
    sagemaker_session=sagemaker_session)

print(sklearn_estimator.image_uri)

257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3


The [sagemaker-scikit-learn-container](https://github.com/aws/sagemaker-scikit-learn-container/) repo contains the source code, including the Dockerfile, for creating this image.

In [30]:
!cat ../scripts/sklearn/requirements.txt

eli5

In [31]:
sklearn_estimator = SKLearn(
    framework_version='0.20.0',
    py_version='py3',
    entry_point='sklearn_rf.py',
    source_dir='../scripts/sklearn',
    code_location=f's3://{BUCKET}/{PREFIX}',
    hyperparameters={'n_estimators': 50},
    role=role,
    instance_type='ml.c4.xlarge',
    output_path=f's3://{BUCKET}/{PREFIX}',
    base_job_name='install-libs-sklearn',
    sagemaker_session=sagemaker_session)

In [32]:
sklearn_estimator.fit({'train': s3_input_train})

2020-08-12 12:22:23 Starting - Starting the training job...
2020-08-12 12:22:25 Starting - Launching requested ML instances......
2020-08-12 12:23:28 Starting - Preparing the instances for training...
2020-08-12 12:24:13 Downloading - Downloading input data...
2020-08-12 12:24:45 Training - Downloading the training image..[34m2020-08-12 12:24:59,577 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-08-12 12:24:59,579 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-08-12 12:24:59,589 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-08-12 12:24:59,879 sagemaker-containers INFO     Module sklearn_rf does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-08-12 12:24:59,879 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-08-12 12:24:59,879 sagemaker-containers INFO     Generating MANIFEST.in[0m
[34m2020-08-12 12:24:59,87

## 3.4 Preprocessing Data with SageMaker Processing Jobs
---

In [33]:
df = pd.read_csv(f'{LOCAL_DATA_DIRECTORY}/raw_churn.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
State,KS,OH,NJ,OH,OK
Account Length,128,107,137,84,75
Area Code,415,415,415,408,415
Phone,382-4657,371-7191,358-1921,375-9999,330-6626
Int'l Plan,no,no,no,yes,yes
...,...,...,...,...,...
Intl Mins,10,13.7,12.2,6.6,10.1
Intl Calls,3,3,5,7,3
Intl Charge,2.7,3.7,3.29,1.78,2.73
CustServ Calls,1,1,0,2,3


In [34]:
s3_raw_data = sagemaker_session.upload_data(f'{LOCAL_DATA_DIRECTORY}/raw_churn.csv',
                                            bucket=BUCKET,
                                            key_prefix=PREFIX)

print(f'Raw data S3 URI: {s3_raw_data}')

Raw data S3 URI: s3://sagemaker-course-20200812/churn/raw_churn.csv


In [35]:
from sagemaker.sklearn.processing import SKLearnProcessor

SKLearnProcessor?

[0;31mInit signature:[0m
[0mSKLearnProcessor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mframework_version[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrole[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstance_type[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minstance_count[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcommand[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvolume_size_in_gb[0m[0;34m=[0m[0;36m30[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvolume_kms_key[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moutput_kms_key[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_runtime_in_seconds[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbase_job_name[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msagemaker_session[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;

In [36]:
%%writefile ../scripts/sklearn/preprocessing.py

import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-test-split-ratio', type=float, default=0.3)
    args, _ = parser.parse_known_args()
    
    print('Received arguments {}'.format(args))

    input_data_path = os.path.join('/opt/ml/processing/input', 'raw_churn.csv')
    
    print('Reading input data from {}'.format(input_data_path))
    df = pd.read_csv(input_data_path)
    df = pd.DataFrame(data=df)
    
    # Encode target
    lb = LabelBinarizer()
    label = lb.fit_transform(df['Churn?'])
    df['Churn?'] = label.flatten()
    
    negative_examples, positive_examples = np.bincount(df['Churn?'])
    print('Data after cleaning: {}, {} positive examples, {} negative examples'.format(df.shape, positive_examples, negative_examples))
    
    split_ratio = args.train_test_split_ratio
    print('Splitting data into train and test sets with ratio {}'.format(split_ratio))
    X_train, X_test, y_train, y_test = train_test_split(df.drop('Churn?', axis=1), df['Churn?'], test_size=split_ratio, random_state=0)

    
    numerical_cols = ['Account Length', 'VMail Message', 'Day Mins', 'Day Calls', 'Eve Mins',
                      'Eve Calls', 'Night Mins', 'Night Calls', 'Intl Mins', 'Intl Calls',
                      'CustServ Calls']
    categorical_cols = ["State", "Int'l Plan", "VMail Plan"]

    num_proc = make_pipeline(SimpleImputer(strategy='median'))
    cat_proc = make_pipeline(
        SimpleImputer(strategy='constant', fill_value='missing'),
        OneHotEncoder(handle_unknown='ignore', sparse=False))    
    preprocessor = make_column_transformer((numerical_cols, num_proc),
                                           (categorical_cols, cat_proc))
    print('Running preprocessing and feature engineering transformations')
    train_features = preprocessor.fit_transform(X_train)
    test_features = preprocessor.transform(X_test)
    
    print('Train data shape after preprocessing: {}'.format(train_features.shape))
    print('Test data shape after preprocessing: {}'.format(test_features.shape))
    
    one_hot_encoder = preprocessor.named_transformers_['pipeline-2'].named_steps['onehotencoder']
    encoded_cat_cols = one_hot_encoder.get_feature_names(input_features=categorical_cols).tolist()
    processed_cols = numerical_cols + encoded_cat_cols

    train_df = pd.DataFrame(train_features, columns=processed_cols)
    train_df.insert(0, 'churn', y_train)

    test_df = pd.DataFrame(test_features, columns=processed_cols)
    test_df.insert(0, 'churn', y_test)
    
    train_output_path = os.path.join('/opt/ml/processing/train', 'train.csv')
    test_output_path = os.path.join('/opt/ml/processing/test', 'test.csv')

    print('Saving training features to {}'.format(train_output_path))
    train_df.to_csv(train_output_path, header=True, index=False)

    print('Saving test features to {}'.format(test_output_path))
    test_df.to_csv(test_output_path, header=True, index=False)

Writing ../scripts/sklearn/preprocessing.py


In [37]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

processing_input = ProcessingInput(source=s3_raw_data, destination='/opt/ml/processing/input')

processing_output_train = ProcessingOutput(output_name='train.csv', source='/opt/ml/processing/train',
                                           destination=f's3://{BUCKET}/{PREFIX}/processing/')
processing_output_test = ProcessingOutput(output_name='test.csv', source='/opt/ml/processing/test',
                                          destination=f's3://{BUCKET}/{PREFIX}/processing/')

In [38]:
sklearn_processor = SKLearnProcessor(framework_version='0.20.0',
                                     role=role,
                                     instance_type='ml.m4.xlarge',
                                     instance_count=1,
                                     sagemaker_session=sagemaker_session)

sklearn_processor.run(code='../scripts/sklearn/preprocessing.py',
                      inputs=[processing_input],
                      outputs=[processing_output_train, processing_output_test],
                      arguments=['--train-test-split-ratio', '0.2'])


Job Name:  sagemaker-scikit-learn-2020-08-12-12-26-02-630
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://sagemaker-course-20200812/churn/raw_churn.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-209970524256/sagemaker-scikit-learn-2020-08-12-12-26-02-630/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train.csv', 'S3Output': {'S3Uri': 's3://sagemaker-course-20200812/churn/processing/', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test.csv', 'S3Output': {'S3Uri': 's3://sagemaker-course-20200812/churn/processing/', 'LocalPath': '/opt/ml/processing/test', 'S3UploadM

In [39]:
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

s3_output_uris = []

output_config = preprocessing_job_description['ProcessingOutputConfig']
for output in output_config['Outputs']:
    s3_output_uris.append(output['S3Output']['S3Uri'] + output['OutputName'])
    
s3_output_uris

['s3://sagemaker-course-20200812/churn/processing/train.csv',
 's3://sagemaker-course-20200812/churn/processing/test.csv']

In [40]:
for output_uri in s3_output_uris:
    key_prefix = output_uri.split(BUCKET)[1][1:]
    print(f'Downloading file: {key_prefix}')
    sagemaker_session.download_data(f'{LOCAL_DATA_DIRECTORY}/processed',
                                    bucket=BUCKET,
                                    key_prefix=key_prefix)

Downloading file: churn/processing/train.csv
Downloading file: churn/processing/test.csv


In [41]:
processed_train_df = pd.read_csv(f'{LOCAL_DATA_DIRECTORY}/processed/train.csv')
processed_test_df = pd.read_csv(f'{LOCAL_DATA_DIRECTORY}/processed/test.csv')

In [42]:
print('Train data shape after preprocessing: {}'.format(processed_train_df.shape))
processed_train_df.head()

Train data shape after preprocessing: (2666, 67)


Unnamed: 0,churn,Account Length,VMail Message,Day Mins,Day Calls,Eve Mins,Eve Calls,Night Mins,Night Calls,Intl Mins,Intl Calls,CustServ Calls,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,State_GA,State_HI,State_IA,State_ID,State_IL,State_IN,State_KS,State_KY,State_LA,State_MA,State_MD,State_ME,State_MI,State_MN,State_MO,State_MS,State_MT,State_NC,State_ND,State_NE,State_NH,State_NJ,State_NM,State_NV,State_NY,State_OH,State_OK,State_OR,State_PA,State_RI,State_SC,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Int'l Plan_no,Int'l Plan_yes,VMail Plan_no,VMail Plan_yes
0,0.0,80.0,0.0,198.1,160.0,156.7,87.0,182.1,76.0,9.3,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,,28.0,0.0,168.2,87.0,161.7,92.0,192.4,112.0,10.1,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,120.0,0.0,252.0,120.0,150.2,106.0,151.8,96.0,9.6,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,105.0,0.0,251.6,88.0,175.1,103.0,184.4,112.0,5.4,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,,134.0,34.0,247.2,105.0,225.5,133.0,186.3,76.0,6.1,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [43]:
print('Test data shape after preprocessing: {}'.format(processed_test_df.shape))
processed_test_df.head()

Test data shape after preprocessing: (667, 67)


Unnamed: 0,churn,Account Length,VMail Message,Day Mins,Day Calls,Eve Mins,Eve Calls,Night Mins,Night Calls,Intl Mins,Intl Calls,CustServ Calls,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,State_GA,State_HI,State_IA,State_ID,State_IL,State_IN,State_KS,State_KY,State_LA,State_MA,State_MD,State_ME,State_MI,State_MN,State_MO,State_MS,State_MT,State_NC,State_ND,State_NE,State_NH,State_NJ,State_NM,State_NV,State_NY,State_OH,State_OK,State_OR,State_PA,State_RI,State_SC,State_SD,State_TN,State_TX,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Int'l Plan_no,Int'l Plan_yes,VMail Plan_no,VMail Plan_yes
0,,92.0,0.0,264.3,91.0,160.9,115.0,198.6,73.0,9.3,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,112.0,36.0,113.7,117.0,157.5,82.0,177.6,118.0,10.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,,69.0,0.0,195.3,70.0,216.7,108.0,259.9,119.0,12.5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,,95.0,0.0,175.2,91.0,244.4,109.0,75.8,95.0,7.5,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,115.0,0.0,345.3,81.0,203.4,106.0,217.5,107.0,11.8,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [44]:
preprocessing_job_description['ProcessingInputs']

[{'InputName': 'input-1',
  'S3Input': {'S3Uri': 's3://sagemaker-course-20200812/churn/raw_churn.csv',
   'LocalPath': '/opt/ml/processing/input',
   'S3DataType': 'S3Prefix',
   'S3InputMode': 'File',
   'S3DataDistributionType': 'FullyReplicated',
   'S3CompressionType': 'None'}},
 {'InputName': 'code',
  'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-209970524256/sagemaker-scikit-learn-2020-08-12-12-26-02-630/input/code/preprocessing.py',
   'LocalPath': '/opt/ml/processing/input/code',
   'S3DataType': 'S3Prefix',
   'S3InputMode': 'File',
   'S3DataDistributionType': 'FullyReplicated',
   'S3CompressionType': 'None'}}]

In [45]:
preprocessing_job_description['ProcessingOutputConfig']

{'Outputs': [{'OutputName': 'train.csv',
   'S3Output': {'S3Uri': 's3://sagemaker-course-20200812/churn/processing/',
    'LocalPath': '/opt/ml/processing/train',
    'S3UploadMode': 'EndOfJob'}},
  {'OutputName': 'test.csv',
   'S3Output': {'S3Uri': 's3://sagemaker-course-20200812/churn/processing/',
    'LocalPath': '/opt/ml/processing/test',
    'S3UploadMode': 'EndOfJob'}}]}