# Build a SageMaker Pipeline to train and register the injury narrative BERT classifier

In [1]:
%%capture
#!pip install tensorflow
!pip install transformers
!pip install nltk

In [2]:
import pandas as pd
import tensorflow as tf
import re
import nltk
import string
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras import activations, optimizers, losses
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np
import sagemaker
from sagemaker import get_execution_role
import joblib 
import collections

In [3]:
bucket = 'cdc-cdh-sagemaker-s3fs-dev'
sagemaker_session = sagemaker.Session(default_bucket=bucket)
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()
default_bucket

'cdc-cdh-sagemaker-s3fs-dev'

In [4]:
%%time
%run ./src/pre-processing.py --data_path ./data/raw  --train_percentage 1

INFO: 08/20/2021 01:13:31 PM Start.....
INFO: 08/20/2021 01:13:31 PM Parsing arguments
INFO: 08/20/2021 01:13:31 PM Getting and splitting data
INFO: 08/20/2021 01:13:31 PM nb classes in final data:43
INFO: 08/20/2021 01:13:31 PM  X (153944,) , y : (153944,)
INFO: 08/20/2021 01:13:31 PM X_train shape (138549,) y_train shape : (138549,)
INFO: 08/20/2021 01:13:31 PM X_valid shape (15395,) y_valid shape : (15395,)


False


INFO: 08/20/2021 01:13:31 PM Preprocessing...


[62, 13, 71, 42, 60, 64, 55, 70, 31, 43, 63, 73, 24, 50, 53, 41, 66, 26, 78, 11, 99, 12, 65, 72, 23, 27, 44, 52, 69, 22, 51, 25, 32, 21, 40, 61, 56, 54, 49, 79, 67, 45, 20]


INFO: 08/20/2021 01:14:00 PM Tokenization and encoding...
INFO: 08/20/2021 01:14:09 PM Encoding Labels .....
INFO: 08/20/2021 01:14:09 PM Create TF Dataset....
INFO: 08/20/2021 01:14:09 PM Saving train and valid TF Records...
INFO: 08/20/2021 01:32:39 PM Saving test dataset...
INFO: 08/20/2021 01:32:54 PM Complete


In [5]:
#%run ./src/evaluate_model_metrics.py --input_data './data/test' \
#            --input_model './output/model/training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017' \
#            --max_len 45 \
#            --output_data './output/model'

In [6]:
num_records = 138549
num_valid_records = 15395
max_len = 45
epochs = 5
batch_size = 16
valid_batch_size = 16
steps_per_epoch = num_records // batch_size
validation_steps = num_valid_records // valid_batch_size
learning_rate = 5e-5
optimizer = 'adam'

In [7]:
print(num_records)
print(steps_per_epoch)
print(validation_steps)

138549
8659
962


In [8]:
%%time
%run ./src/train.py --train ./data/train --validation ./data/valid --epochs 5 --num_records 138549 --steps_per_epoch 8659 --validation_steps 962

input train:  ./data/train
input valid:  ./data/valid
loading data...
train_dir :  ./data/train
train_file :  ./data/train/train.tfrecord
valid_dir: ./data/valid
valid_file: ./data/valid/valid.tfrecord
loading encoder...
Building model...
[2021-08-20 13:34:04.000 tensorflow-2-3-gpu--ml-g4dn-xlarge-c85184389676cdfa7bdf06745c9b:69 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-08-20 13:34:04.025 tensorflow-2-3-gpu--ml-g4dn-xlarge-c85184389676cdfa7bdf06745c9b:69 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 45)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 45)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 43, 512)      1180160     bert[0][0]            















Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
saving model...
CPU times: user 2h 6min 26s, sys: 55min 53s, total: 3h 2min 20s
Wall time: 3h 28min 20s


### pipeline name

In [9]:
import time
timestamp = int(time.time())

pipeline_name = 'BaseBERT-Injury-Coding-pipeline-{}'.format(timestamp)

## Step 1 - Dataset and preprocessing step

## Upload raw dataset

In [10]:
prefix = 'projects/project006/injury-data/raw'
input_data_train = sagemaker_session.upload_data(path = './data/raw',
                                                      bucket = bucket,
                                                      key_prefix = prefix)
input_data_train

's3://cdc-cdh-sagemaker-s3fs-dev/projects/project006/injury-data/raw'

## Configure pre-processing step

In [11]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

In [12]:
# 7 params
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.c5.2xlarge"
)

processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1
)

train_percentage = ParameterFloat(
    name="TrainPercentage",
    default_value=0.05,
)

is_sample_dataset = ParameterString(
    name="SampleDataset",
    default_value="True",
)
input_data = ParameterString(
    name="InputData",
    default_value=input_data_train,
)

transformer_model = ParameterString(
    name="TransformerModel",
    default_value='bert-base-uncased',
)
max_seq_length = ParameterInteger(
    name="MaxSeqLength",
    default_value=45,
)

In [13]:
region = sagemaker_session.boto_region_name

In [14]:
from sagemaker.sklearn.processing import SKLearnProcessor

processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    env={'AWS_DEFAULT_REGION': sagemaker_session.boto_region_name},                             
)

INFO: 08/20/2021 05:07:09 PM Same images used for training and inference. Defaulting to image scope: inference.
INFO: 08/20/2021 05:07:09 PM Defaulting to only available Python version: py3


In [15]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep


processing_inputs=[
    ProcessingInput(
        input_name='raw-input-data',
        source=input_data,
        destination='/opt/ml/processing/input/data/',
        s3_data_distribution_type='ShardedByS3Key'
    )
]

processing_outputs=[
    ProcessingOutput(output_name='processed-train',
                     source='/opt/ml/processing/output/processed/train',
                     s3_upload_mode='EndOfJob'),
    ProcessingOutput(output_name='processed-validation',
                     source='/opt/ml/processing/output/processed/validation',
                     s3_upload_mode='EndOfJob'),
    ProcessingOutput(output_name='processed-test',
                     source='/opt/ml/processing/output/processed/test',
                     s3_upload_mode='EndOfJob'),
]        

processing_step = ProcessingStep(
    name='Pre-Processing', 
    code='./src/pre-processing.py',
    processor=processor,
    inputs=processing_inputs,
    outputs=processing_outputs,
    job_arguments=['--train_percentage', str(train_percentage.default_value),                   
                   '--is_sample_dataset', str(is_sample_dataset.default_value),
                   '--max_len',str(max_seq_length.default_value),
                   '--transformer_model',str(transformer_model.default_value)
                  ]
)        

print(processing_step)

ProcessingStep(name='Pre-Processing', step_type=<StepTypeEnum.PROCESSING: 'Processing'>)


In [16]:
import json

# print out the list of the processing job properties
print(json.dumps(
    processing_step.properties.__dict__,
    indent=4, sort_keys=True, default=str
))

{
    "AppSpecification": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9c90>",
    "AutoMLJobArn": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9c10>",
    "CreationTime": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9350>",
    "Environment": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9ed0>",
    "ExitMessage": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a97d0>",
    "ExperimentConfig": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9310>",
    "FailureReason": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a92d0>",
    "LastModifiedTime": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9210>",
    "MonitoringScheduleArn": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9050>",
    "NetworkConfig": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9910>",
    "ProcessingEndTime": "<sagemaker.workflow.properties.Properties

In [17]:
print(json.dumps(
    processing_step.properties.ProcessingOutputConfig.Outputs['processed-train'].__dict__, 
    indent=4, sort_keys=True, default=str
))

{
    "AppManaged": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a93d0>",
    "FeatureStoreOutput": "<sagemaker.workflow.properties.Properties object at 0x7ffa350a9dd0>",
    "OutputName": "<sagemaker.workflow.properties.Properties object at 0x7ffa7da9be10>",
    "S3Output": "<sagemaker.workflow.properties.Properties object at 0x7ffa7da9bed0>",
    "_path": "Steps.Pre-Processing.ProcessingOutputConfig.Outputs['processed-train']",
    "_shape_name": "ProcessingOutput"
}


In [18]:
print(json.dumps(
    processing_step.properties.ProcessingOutputConfig.Outputs['processed-train'].S3Output.S3Uri.__dict__,
    indent=4, sort_keys=True, default=str
))

{
    "__str__": "S3Uri",
    "_path": "Steps.Pre-Processing.ProcessingOutputConfig.Outputs['processed-train'].S3Output.S3Uri",
    "_shape_name": "S3Uri"
}


## Step 3 - Training Step

In [19]:
# 12 params

epochs = ParameterInteger(
    name="Epochs",
    default_value=3
)

num_records = ParameterInteger(
    name="NumRecords"
)
   

learning_rate = ParameterFloat(
    name="LearningRate",
    default_value=5e-5
) 
    
train_batch_size = ParameterInteger(
    name="TrainBatchSize",
    default_value=16
)

train_steps_per_epoch = ParameterInteger(
    name="TrainStepsPerEpoch",
    default_value=500
)

validation_batch_size = ParameterInteger(
    name="ValidationBatchSize",
    default_value=16
)

validation_steps_per_epoch = ParameterInteger(
    name="ValidationStepsPerEpoch",
    default_value=500
)


train_instance_count = ParameterInteger(
    name="TrainInstanceCount",
    default_value=1
)

train_instance_type = ParameterString(
    name="TrainInstanceType",
    default_value="ml.p3.2xlarge"
)


max_seq_length = ParameterInteger(
    name="MaxSeqLength",
    default_value=45,
)

optimizer = ParameterString(
    name="optimizer",
    default_value='Adam'
)

input_mode = ParameterString(
    name="InputMode",
    default_value="File",
)

In [20]:
hyperparameters={
    'max_seq_length': max_seq_length,
    'epochs': epochs,
    'num_records':num_records,
    'learning_rate': learning_rate,
    'batch_size': train_batch_size,
    'steps_per_epoch': train_steps_per_epoch,
    'validation_batch_size': validation_batch_size,
    'validation_steps': validation_steps_per_epoch,
    'optimizer':optimizer
}

In [21]:
metric_definitions = [{'Name':'train:loss','Regex':'loss: ([0-9\\.]+)'},
                                    {'Name':'train:accuracy','Regex':'acc: ([0-9\\.]+)'},
                                    {'Name':'validation:loss','Regex':'val_loss: ([0-9\\.]+)'},
                                    {'Name':'validation:accuracy','Regex':'val_acc: ([0-9\\.]+)'}]

In [22]:
from sagemaker.huggingface import HuggingFace

estimator = HuggingFace(
        entry_point="train.py",
        source_dir = "./src/",
        role=role,
        instance_count=train_instance_count,
        volume_size = 50,
        max_run = 18000,
        instance_type=train_instance_type,
        transformers_version = "4.4",
        tensorflow_version  = "2.4",
        py_version="py37",
        hyperparameters = hyperparameters,
        metric_definitions = metric_definitions,
        enable_sagemaker_metrics = True
    )



In [23]:
from sagemaker.workflow.steps import CacheConfig

cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") # PT1H represents `one hour`

In [24]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

training_step = TrainingStep(
    name='Train',
    estimator=estimator,
    inputs={
        'train': TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                'processed-train'
            ].S3Output.S3Uri
        ),
        'validation': TrainingInput(
            s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
                'processed-validation'
            ].S3Output.S3Uri
        )
    },
    cache_config=cache_config
)

print(training_step)

TrainingStep(name='Train', step_type=<StepTypeEnum.TRAINING: 'Training'>)


## Evaluation Step

In [25]:
from sagemaker.sklearn.processing import SKLearnProcessor

evaluation_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    env={'AWS_DEFAULT_REGION': region},
    max_runtime_in_seconds=7200
)

INFO: 08/20/2021 05:07:35 PM Same images used for training and inference. Defaulting to image scope: inference.
INFO: 08/20/2021 05:07:35 PM Defaulting to only available Python version: py3


In [26]:
from sagemaker.workflow.properties import PropertyFile

evaluation_report = PropertyFile(
    name='EvaluationReport',
    output_name='metrics',
    path='evaluation.json'
)

In [27]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

evaluation_step = ProcessingStep(
    name='EvaluateModel',
    processor=evaluation_processor,
    code='src/evaluate_model_metrics.py',
    inputs=[
        ProcessingInput(
            source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/input/model'
        ),
        ProcessingInput(
            source=processing_step.properties.ProcessingOutputConfig.Outputs['processed-test'].S3Output.S3Uri,
            destination='/opt/ml/processing/input/data'
        )
    ],
    outputs=[
        ProcessingOutput(output_name='metrics', 
                         s3_upload_mode='EndOfJob',
                         source='/opt/ml/processing/output/metrics/'),
    ],
    job_arguments=[
        '--max_len', str(max_seq_length.default_value),
    ],
    property_files=[evaluation_report],
)

## Register model step

In [28]:
# 3 parameters
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval"
)

deploy_instance_type = ParameterString(
    name="DeployInstanceType",
    default_value="ml.m5.large"
)

deploy_instance_count = ParameterInteger(
    name="DeployInstanceCount",
    default_value=1
)

In [29]:
model_package_group_name = f"BaseBERT-Injury-Coding-{timestamp}"

print(model_package_group_name)

BaseBERT-Injury-Coding-1629479214


In [30]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics 

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(
            evaluation_step.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
        ),
        content_type="application/json"
    )
)

print(model_metrics)

S3UploadFailedError: Failed to upload src/evaluate_model_metrics.py to sagemaker-us-east-1-924778591724/sagemaker-scikit-learn-2021-08-20-17-07-41-343/input/code/evaluate_model_metrics.py: An error occurred (AccessDenied) when calling the PutObject operation: Access Denied

Define deployment image for inference.

In [31]:
inference_image_uri = sagemaker.image_uris.retrieve(
    framework="tensorflow",
    region=region,
    version="2.4.0",
    py_version="py37",
    instance_type=deploy_instance_type,
    image_scope="inference"
)
print(inference_image_uri)

ValueError: Unsupported tensorflow version: 2.4.0. You may need to upgrade your SDK version (pip install -U sagemaker) for newer tensorflow versions. Supported tensorflow version(s): 1.10.0, 1.11.0, 1.12.0, 1.13.0, 1.14.0, 1.15.0, 1.15.2, 1.15.3, 1.15.4, 1.15.5, 1.4.1, 1.5.0, 1.6.0, 1.7.0, 1.8.0, 1.9.0, 2.0.0, 2.0.1, 2.0.2, 2.0.3, 2.0.4, 2.1.0, 2.1.1, 2.1.2, 2.1.3, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1, 2.3.2, 2.4.1, 1.10, 1.11, 1.12, 1.13, 1.14, 1.15, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4.

### Register model

In [None]:
from sagemaker.workflow.step_collections import RegisterModel

register_step = RegisterModel(
    name="RegisterModel",
    estimator=estimator,
    image_uri=inference_image_uri, # Replace None 
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["application/jsonlines"],
    response_types=["application/jsonlines"],
    inference_instances=[deploy_instance_type],
    transform_instances=[deploy_instance_type], # batch transform is not used in this lab
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics
)

## Deployment Step

In [None]:
from sagemaker.model import Model

model_name = 'bert-model-{}'.format(timestamp)

model = Model(
    name=model_name,
    image_uri=inference_image_uri, # Replace None
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    sagemaker_session=sagemaker_session,
    role=role,
)

In [None]:
from sagemaker.inputs import CreateModelInput

create_inputs = CreateModelInput(
    instance_type=deploy_instance_type, 
)

In [None]:
from sagemaker.workflow.steps import CreateModelStep

create_step = CreateModelStep(
    name="CreateModel",
    model=model, # Replace None
    inputs=create_inputs, # Replace None
)

# 6. Check accuracy condition step

In [None]:
min_accuracy_value = ParameterFloat(
    name="MinAccuracyValue",
    default_value=0.85 # random choice from three classes
)

In [None]:
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import (
    ConditionStep,
    JsonGet,
)

minimum_accuracy_condition = ConditionGreaterThanOrEqualTo(
    left=JsonGet(
        step=evaluation_step,
        property_file=evaluation_report,
        json_path="metrics.accuracy.value",
    ),
    right=min_accuracy_value # minimum accuracy threshold
)

minimum_accuracy_condition_step = ConditionStep(
    name="AccuracyCondition",
    conditions=[minimum_accuracy_condition],
    if_steps=[register_step, create_step], # successfully exceeded or equaled the minimum accuracy, continue with model registration
    else_steps=[], # did not exceed the minimum accuracy, the model will not be registered
)

# 7. Create pipeline

In [None]:
from sagemaker.workflow.pipeline import Pipeline

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        input_data,
        processing_instance_count,
        processing_instance_type,
        max_seq_length,
        is_sample_dataset,
        transformer_model,
        train_percentage,       
        epochs,
        num_records,
        learning_rate,
        optimizer,
        train_batch_size,
        train_steps_per_epoch,
        validation_batch_size,
        validation_steps_per_epoch,
        input_mode,
        train_instance_count,
        train_instance_type,   
        min_accuracy_value, 
        model_approval_status,
        deploy_instance_type,
        deploy_instance_count
    ],
    steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],
    sagemaker_session=sess,
)

In [None]:
import json
from pprint import pprint

definition = json.loads(pipeline.definition())

pprint(definition)

In [None]:
response = pipeline.create(role_arn=role)

pipeline_arn = response["PipelineArn"]
print(pipeline_arn)

In [None]:
execution = pipeline.start(
    parameters=dict(
        InputData=raw_input_data_s3_uri,
        ProcessingInstanceCount=1,
        ProcessingInstanceType='ml.c5.2xlarge',
        MaxSeqLength=45,
        SampleDataset='True',
        TransformerModel = 'bert-based-uncased',
        TrainPercentage=0.9,
        Epochs=5,
        num_records = 138549,
        LearningRate=5e-5,
        optimizer = 'Adam'
        TrainBatchSize=16,,
        TrainStepsPerEpoch=8659,
        ValidationBatchSize=16,
        ValidationStepsPerEpoch=962,
        InputMode= 'File',
        TrainInstanceCount=1,
        TrainInstanceType='ml.p3.2xlarge',
        MinAccuracyValue=0.75,
        ModelApprovalStatus='PendingManualApproval', 
        DeployInstanceType='ml.m5.large',
        DeployInstanceCount=1 
    )
)

print(execution.arn)