# Build a SageMaker Pipeline to train and register the injury narrative BERT classifier

In [3]:
%%capture
!pip install tensorflow
!pip install transformers
!pip install nltk
!pip install -U sagemaker

In [4]:
import pandas as pd
import tensorflow as tf
import re
import nltk
import string
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras import activations, optimizers, losses
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np
import sagemaker
from sagemaker import get_execution_role
import joblib 
import collections

In [5]:
# Print the version of TensorFlow that the kernel imported.
print(tf.__version__)

2.6.0


In [6]:

# Open a SageMaker session, then resolve the execution role and the session's
# default S3 bucket. `bucket` and `default_bucket` refer to the same name.
sagemaker_session = sagemaker.Session()
bucket = default_bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
bucket

'sagemaker-us-east-1-979294212144'

In [7]:
# Display the execution role ARN resolved in the previous cell.
role

'arn:aws:iam::979294212144:role/service-role/AmazonSageMaker-ExecutionRole-20210423T122185'

In [40]:
#%%time
#%run ./src/pre-processing.py --data_path ./data/raw  --train_percentage 0.05 --is_sample_dataset

INFO: 08/25/2021 08:42:58 PM Start.....
INFO: 08/25/2021 08:42:58 PM Parsing arguments
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO: 08/25/2021 08:42:59 PM Getting and splitting data
INFO: 08/25/2021 08:42:59 PM nb classes in final data:28
INFO: 08/25/2021 08:42:59 PM  X (7668,) , y : (7668,)
INFO: 08/25/2021 08:42:59 PM X_train shape (6901,) y_train shape : (6901,)
INFO: 08/25/2021 08:42:59 PM X_valid shape (767,) y_valid shape : (767,)


True
True


INFO: 08/25/2021 08:42:59 PM Preprocessing...


nb classes in final data: 28
test_data_small.shape (3768, 2)
[62, 71, 63, 11, 43, 55, 42, 52, 60, 73, 13, 66, 12, 53, 64, 27, 24, 99, 26, 72, 70, 51, 44, 41, 31, 78, 32, 23]


INFO: 08/25/2021 08:43:01 PM Tokenization and encoding...
INFO: 08/25/2021 08:43:02 PM Encoding Labels .....
INFO: 08/25/2021 08:43:02 PM Create TF Dataset....
INFO: 08/25/2021 08:43:02 PM Saving train and valid TF Records...


Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`


Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`


NotFoundError: /opt/ml/processing/output/processed/validation/valid.tfrecord; No such file or directory [Op:DatasetToTFRecord]

CPU times: user 1min 24s, sys: 597 ms, total: 1min 25s
Wall time: 1min 41s


In [8]:
#%run ./src/evaluate_model_metrics.py --input_data './data/test' \
#            --input_model './output/model/training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017' \
#            --max_len 45 \
#            --output_data './output/model'

In [9]:
# Plain-Python training settings.
# NOTE: `num_records`, `epochs`, `learning_rate` and `optimizer` are rebound to
# pipeline Parameter objects in the "Training Step" section further down.
max_len = 45
epochs = 5
learning_rate = 5e-5
optimizer = 'adam'

# Record counts for the train / validation splits and their batch sizes.
num_records, num_valid_records = 6901, 767
batch_size = valid_batch_size = 16

# Number of whole batches per pass (integer division drops any remainder).
steps_per_epoch = num_records // batch_size
validation_steps = num_valid_records // valid_batch_size

In [10]:
# Show the record count and the derived step counts, one value per line.
for value in (num_records, steps_per_epoch, validation_steps):
    print(value)

6901
431
47


In [8]:
#%%time
#%run ./src/train.py --train ./data/train --validation ./data/valid --epochs 5 --num_records 138549 --steps_per_epoch 8659 --validation_steps 962

input train:  ./data/train
input valid:  ./data/valid
loading data...
train_dir :  ./data/train
train_file :  ./data/train/train.tfrecord
valid_dir: ./data/valid
valid_file: ./data/valid/valid.tfrecord
loading encoder...
Building model...
[2021-08-20 13:34:04.000 tensorflow-2-3-gpu--ml-g4dn-xlarge-c85184389676cdfa7bdf06745c9b:69 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-08-20 13:34:04.025 tensorflow-2-3-gpu--ml-g4dn-xlarge-c85184389676cdfa7bdf06745c9b:69 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 45)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 45)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 43, 512)      1180160     bert[0][0]            















Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
saving model...
CPU times: user 2h 6min 26s, sys: 55min 53s, total: 3h 2min 20s
Wall time: 3h 28min 20s


### pipeline name

In [167]:
import time

# Suffix the pipeline name with the current Unix time in whole seconds.
# `timestamp` is reused later for the model package group and model names.
timestamp = int(time.time())
pipeline_name = f'BaseBERT-Injury-Coding-pipeline-{timestamp}'

## Step 1 - Dataset and preprocessing step

## Upload raw dataset

In [168]:
# Upload the local ./data/raw directory to the bucket under `prefix` and keep
# the returned S3 URI; it becomes the default value of the InputData parameter.
prefix = 'injury-data/raw'
input_data_train = sagemaker_session.upload_data(
    path='./data/raw',
    bucket=bucket,
    key_prefix=prefix,
)
input_data_train

's3://sagemaker-us-east-1-979294212144/injury-data/raw'

## Configure pre-processing step

In [169]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

In [170]:
# Pipeline parameters for the pre-processing step (7 in total).

# -- compute resources for the processing job
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.c5.2xlarge")
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# -- dataset selection
input_data = ParameterString(name="InputData", default_value=input_data_train)
train_percentage = ParameterFloat(name="TrainPercentage", default_value=0.05)
is_sample_dataset = ParameterString(name="SampleDataset", default_value="True")

# -- model checkpoint and sequence length
transformer_model = ParameterString(name="TransformerModel", default_value='bert-base-uncased')
max_seq_length = ParameterInteger(name="MaxSeqLength", default_value=45)

In [171]:
# AWS region of the current session; used for image lookups and processor env.
region = sagemaker_session.boto_region_name

In [172]:
# Look up the TensorFlow 2.4.1 / py37 training image for the processing
# instance type.
#
# The image URI is resolved now, while the notebook runs, so it needs a
# concrete instance type string. Pass the parameter's default value instead of
# the pipeline Parameter object itself: pipeline variables are only resolved at
# execution time and are not valid input for an image lookup.
#
# NOTE(review): `processing_image_uri` is not referenced again in the visible
# notebook (the processing step uses SKLearnProcessor) - confirm whether it is
# still needed.
processing_image_uri = sagemaker.image_uris.retrieve(
    framework="tensorflow",
    region=region,
    version="2.4.1",
    py_version="py37",
    instance_type=processing_instance_type.default_value,
    image_scope="training"
)
print(processing_image_uri)

763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.4.1-cpu-py37


In [173]:
from sagemaker.sklearn.processing import SKLearnProcessor,ScriptProcessor

# Processor used by the 'Pre-Processing' step to run ./src/pre-processing.py.
# Instance type and count come from the pipeline parameters.
processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    max_runtime_in_seconds=18000,
    env={'AWS_DEFAULT_REGION': sagemaker_session.boto_region_name},
)

INFO: 08/25/2021 08:56:43 PM Same images used for training and inference. Defaulting to image scope: inference.
INFO: 08/25/2021 08:56:43 PM Defaulting to only available Python version: py3


In [174]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep


# Raw data is copied from `input_data` (an S3 location) into the container.
processing_inputs = [
    ProcessingInput(
        input_name='raw-input-data',
        source=input_data,
        destination='/opt/ml/processing/input/data/',
        s3_data_distribution_type='ShardedByS3Key'
    )
]

# One output per split; each directory is uploaded when the job ends.
# The output names are referenced by the training and evaluation steps.
processing_outputs = [
    ProcessingOutput(output_name='processed-train',
                     source='/opt/ml/processing/output/processed/train',
                     s3_upload_mode='EndOfJob'),
    ProcessingOutput(output_name='processed-validation',
                     source='/opt/ml/processing/output/processed/validation',
                     s3_upload_mode='EndOfJob'),
    ProcessingOutput(output_name='processed-test',
                     source='/opt/ml/processing/output/processed/test',
                     s3_upload_mode='EndOfJob'),
]

# Command-line arguments for pre-processing.py.
# NOTE: the values are the parameters' *defaults*, converted to strings when
# this cell runs, so they are fixed in the pipeline definition.
processing_job_arguments = [
    '--train_percentage', str(train_percentage.default_value),
    '--max_len', str(max_seq_length.default_value),
    '--transformer_model', str(transformer_model.default_value),
]

# Only pass the sampling flag when the SampleDataset parameter defaults to
# "True"; previously the flag was always passed and the parameter was ignored.
if str(is_sample_dataset.default_value).strip().lower() == 'true':
    processing_job_arguments.append('--is_sample_dataset')

processing_step = ProcessingStep(
    name='Pre-Processing',
    code='./src/pre-processing.py',
    processor=processor,
    inputs=processing_inputs,
    outputs=processing_outputs,
    job_arguments=processing_job_arguments,
)

print(processing_step)

ProcessingStep(name='Pre-Processing', step_type=<StepTypeEnum.PROCESSING: 'Processing'>, depends_on=None)


In [175]:
import json

# Dump the attribute dictionary of the step's properties object as JSON.
# Values that json cannot serialise are rendered with str().
processing_properties_json = json.dumps(
    processing_step.properties.__dict__,
    default=str,
    sort_keys=True,
    indent=4,
)
print(processing_properties_json)

{
    "AppSpecification": "<sagemaker.workflow.properties.Properties object at 0x7f5efe106690>",
    "AutoMLJobArn": "<sagemaker.workflow.properties.Properties object at 0x7f5efcb5c690>",
    "CreationTime": "<sagemaker.workflow.properties.Properties object at 0x7f5efcb5c710>",
    "Environment": "<sagemaker.workflow.properties.Properties object at 0x7f5efe106390>",
    "ExitMessage": "<sagemaker.workflow.properties.Properties object at 0x7f5efcb5cdd0>",
    "ExperimentConfig": "<sagemaker.workflow.properties.Properties object at 0x7f5efe106dd0>",
    "FailureReason": "<sagemaker.workflow.properties.Properties object at 0x7f5efcb5c9d0>",
    "LastModifiedTime": "<sagemaker.workflow.properties.Properties object at 0x7f5efcb5c950>",
    "MonitoringScheduleArn": "<sagemaker.workflow.properties.Properties object at 0x7f5efcb5c290>",
    "NetworkConfig": "<sagemaker.workflow.properties.Properties object at 0x7f5efe106190>",
    "ProcessingEndTime": "<sagemaker.workflow.properties.Properties

In [176]:
# Inspect the attributes of the 'processed-train' output property.
processed_train_output = processing_step.properties.ProcessingOutputConfig.Outputs['processed-train']
print(json.dumps(processed_train_output.__dict__, default=str, sort_keys=True, indent=4))

{
    "AppManaged": "<sagemaker.workflow.properties.Properties object at 0x7f5efe1062d0>",
    "FeatureStoreOutput": "<sagemaker.workflow.properties.Properties object at 0x7f5efe1063d0>",
    "OutputName": "<sagemaker.workflow.properties.Properties object at 0x7f5efe106c90>",
    "S3Output": "<sagemaker.workflow.properties.Properties object at 0x7f5efe106450>",
    "_path": "Steps.Pre-Processing.ProcessingOutputConfig.Outputs['processed-train']",
    "_shape_names": [
        "ProcessingOutput"
    ]
}


In [177]:
# Inspect the S3Uri property of the 'processed-train' output; its `_path` is
# the reference that downstream steps receive.
processed_train_s3_uri = (
    processing_step.properties
    .ProcessingOutputConfig
    .Outputs['processed-train']
    .S3Output
    .S3Uri
)
print(json.dumps(processed_train_s3_uri.__dict__, default=str, sort_keys=True, indent=4))

{
    "__str__": "S3Uri",
    "_path": "Steps.Pre-Processing.ProcessingOutputConfig.Outputs['processed-train'].S3Output.S3Uri",
    "_shape_names": [
        "S3Uri"
    ]
}


## Step 2 - Training Step

In [178]:
# Pipeline parameters for the training step (12 in total).

# -- optimisation
epochs = ParameterInteger(name="Epochs", default_value=3)
learning_rate = ParameterFloat(name="LearningRate", default_value=5e-5)
optimizer = ParameterString(name="optimizer", default_value='Adam')

# -- data volume and batching
# The step defaults equal 6901 // 16 and 767 // 16 computed earlier.
num_records = ParameterInteger(name="NumRecords", default_value=6901)
train_batch_size = ParameterInteger(name="TrainBatchSize", default_value=16)
train_steps_per_epoch = ParameterInteger(name="TrainStepsPerEpoch", default_value=431)
validation_batch_size = ParameterInteger(name="ValidationBatchSize", default_value=16)
validation_steps_per_epoch = ParameterInteger(name="ValidationStepsPerEpoch", default_value=47)

# -- training job resources
train_instance_count = ParameterInteger(name="TrainInstanceCount", default_value=1)
train_instance_type = ParameterString(name="TrainInstanceType", default_value="ml.p3.2xlarge")
input_mode = ParameterString(name="InputMode", default_value="File")

# -- sequence length
# Declared again here with the same name and default as in the
# pre-processing parameter cell.
max_seq_length = ParameterInteger(name="MaxSeqLength", default_value=45)

In [179]:
# Display the default value carried by the TrainBatchSize parameter.
train_batch_size.default_value

16

In [180]:
# Hyperparameters given to the estimator that runs train.py.
# Every value is the parameter's default, read when this cell runs, so the
# dict holds plain Python values rather than pipeline Parameter objects.
# NOTE(review): overrides passed to pipeline.start() for these parameters may
# therefore not reach the training job - confirm.
hyperparameters = dict(
    max_seq_length=max_seq_length.default_value,
    epochs=epochs.default_value,
    num_records=num_records.default_value,
    learning_rate=learning_rate.default_value,
    batch_size=train_batch_size.default_value,
    steps_per_epoch=train_steps_per_epoch.default_value,
    validation_batch_size=validation_batch_size.default_value,
    validation_steps=validation_steps_per_epoch.default_value,
    optimizer=optimizer.default_value,
)

In [181]:
# Metric definitions passed to the estimator: a metric name plus a regex whose
# single group captures the number that follows the label.
# NOTE(review): the 'loss: ' and 'acc: ' patterns are unanchored, so they would
# also match inside 'val_loss: ' / 'val_acc: ' - confirm against train.py logs.
_metric_patterns = (
    ('train:loss', 'loss: ([0-9\\.]+)'),
    ('train:accuracy', 'acc: ([0-9\\.]+)'),
    ('validation:loss', 'val_loss: ([0-9\\.]+)'),
    ('validation:accuracy', 'val_acc: ([0-9\\.]+)'),
)
metric_definitions = [
    {'Name': metric_name, 'Regex': pattern}
    for metric_name, pattern in _metric_patterns
]

In [182]:
from sagemaker.huggingface import HuggingFace

# Hugging Face estimator that runs ./src/train.py
# (transformers 4.4, TensorFlow 2.4, py37).
# Instance count, instance type and input mode are the parameters' default
# values, read when this cell runs.
estimator = HuggingFace(
    # training script and its arguments
    source_dir="./src/",
    entry_point="train.py",
    hyperparameters=hyperparameters,
    # framework versions
    transformers_version="4.4",
    tensorflow_version="2.4",
    py_version="py37",
    # job resources
    role=role,
    instance_type=train_instance_type.default_value,
    instance_count=train_instance_count.default_value,
    volume_size=50,
    max_run=18000,
    input_mode=input_mode.default_value,
    # metrics
    metric_definitions=metric_definitions,
    enable_sagemaker_metrics=True,
)



In [183]:
from sagemaker.workflow.steps import CacheConfig

# Step caching for the training step; "PT1H" is an ISO 8601 duration of one hour.
cache_config = CacheConfig(
    enable_caching=True,
    expire_after="PT1H",
)

In [184]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Wire the pre-processing outputs into the estimator as the 'train' and
# 'validation' channels. The S3 URIs are step properties, so this also makes
# the training step consume the outputs of the 'Pre-Processing' step.
processed_outputs = processing_step.properties.ProcessingOutputConfig.Outputs

training_step = TrainingStep(
    name='Train',
    estimator=estimator,
    cache_config=cache_config,
    inputs={
        'train': TrainingInput(
            s3_data=processed_outputs['processed-train'].S3Output.S3Uri
        ),
        'validation': TrainingInput(
            s3_data=processed_outputs['processed-validation'].S3Output.S3Uri
        ),
    },
)

print(training_step)

TrainingStep(name='Train', step_type=<StepTypeEnum.TRAINING: 'Training'>, depends_on=None)


## Evaluation Step

In [185]:
from sagemaker.sklearn.processing import SKLearnProcessor,ScriptProcessor

# Processor used by the 'EvaluateModel' step; it reuses the processing
# instance type/count parameters and has a 7200-second runtime limit.
evaluation_processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    max_runtime_in_seconds=7200,
    env={'AWS_DEFAULT_REGION': region},
)

INFO: 08/25/2021 08:56:48 PM Same images used for training and inference. Defaulting to image scope: inference.
INFO: 08/25/2021 08:56:48 PM Defaulting to only available Python version: py3


In [186]:
from sagemaker.workflow.properties import PropertyFile

# Property file exposing evaluation.json from the 'metrics' output of the
# evaluation step; the accuracy condition reads its value from here.
evaluation_report = PropertyFile(
    name='EvaluationReport',
    path='evaluation.json',
    output_name='metrics',
)

In [187]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Inputs: the trained model artifact and the 'processed-test' output of the
# pre-processing step, each mounted under /opt/ml/processing/input.
evaluation_inputs = [
    ProcessingInput(
        source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        destination='/opt/ml/processing/input/model'
    ),
    ProcessingInput(
        source=processing_step.properties.ProcessingOutputConfig.Outputs['processed-test'].S3Output.S3Uri,
        destination='/opt/ml/processing/input/data'
    ),
]

# Output: the metrics directory, which `evaluation_report` refers to by name.
evaluation_outputs = [
    ProcessingOutput(
        output_name='metrics',
        source='/opt/ml/processing/output/metrics/',
        s3_upload_mode='EndOfJob',
    ),
]

# Run evaluate_model_metrics.py; --max_len is the parameter's default value.
evaluation_step = ProcessingStep(
    name='EvaluateModel',
    code='./src/evaluate_model_metrics.py',
    processor=evaluation_processor,
    inputs=evaluation_inputs,
    outputs=evaluation_outputs,
    job_arguments=['--max_len', str(max_seq_length.default_value)],
    property_files=[evaluation_report],
)

## Register model step

In [188]:
# Pipeline parameters for model registration and deployment (3 in total).
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")
deploy_instance_type = ParameterString(name="DeployInstanceType", default_value="ml.m5.large")
deploy_instance_count = ParameterInteger(name="DeployInstanceCount", default_value=1)

In [189]:
# Model package group name, suffixed with the same timestamp as the pipeline name.
model_package_group_name = "BaseBERT-Injury-Coding-{}".format(timestamp)
print(model_package_group_name)

BaseBERT-Injury-Coding-1629925000


In [190]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics 

# S3 location of the first (and only) output of the evaluation step.
evaluation_output_uri = evaluation_step.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]

# Attach evaluation.json from that location to the registered model as its
# statistics.
evaluation_metrics_source = MetricsSource(
    s3_uri=f"{evaluation_output_uri}/evaluation.json",
    content_type="application/json",
)
model_metrics = ModelMetrics(model_statistics=evaluation_metrics_source)

print(model_metrics)

<sagemaker.model_metrics.ModelMetrics object at 0x7f5efd2ac110>


Define deployment image for inference.

In [191]:
# Look up the TensorFlow 2.4.1 inference image for the deployment instance type.
#
# The image URI is resolved now, while the notebook runs, so it needs a
# concrete instance type string. Pass the parameter's default value instead of
# the pipeline Parameter object itself: pipeline variables are only resolved at
# execution time and are not valid input for an image lookup.
inference_image_uri = sagemaker.image_uris.retrieve(
    framework="tensorflow",
    region=region,
    version="2.4.1",
    py_version="py37",
    instance_type=deploy_instance_type.default_value,
    image_scope="inference"
)
print(inference_image_uri)

INFO: 08/25/2021 08:56:51 PM Ignoring unnecessary Python version: py37.


763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-inference:2.4.1-cpu


### Register model

In [192]:
from sagemaker.workflow.step_collections import RegisterModel

# Register the trained model in the model package group, together with the
# evaluation metrics and the approval status parameter.
register_step = RegisterModel(
    name="RegisterModel",
    estimator=estimator,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    image_uri=inference_image_uri,
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
    # request / response formats
    content_types=["application/jsonlines"],
    response_types=["application/jsonlines"],
    # instance types for real-time inference and batch transform
    # (batch transform is not used in this lab)
    inference_instances=[deploy_instance_type],
    transform_instances=[deploy_instance_type],
)

## Deployment Step

In [193]:
from sagemaker.model import Model

# Model built from the training step's artifact and the inference image.
# The name carries the same timestamp as the pipeline name.
model_name = f'bert-model-{timestamp}'

model = Model(
    name=model_name,
    role=role,
    sagemaker_session=sagemaker_session,
    image_uri=inference_image_uri,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
)

In [194]:
from sagemaker.inputs import CreateModelInput

# Input for the CreateModel step: the deployment instance type parameter.
create_inputs = CreateModelInput(instance_type=deploy_instance_type)

In [195]:
from sagemaker.workflow.steps import CreateModelStep

# Pipeline step that creates the SageMaker model defined above.
create_step = CreateModelStep(
    name="CreateModel",
    inputs=create_inputs,
    model=model,
)

# 6. Check accuracy condition step

In [196]:
# Minimum accuracy the evaluated model must reach before it is registered;
# used as the right-hand side of the accuracy condition.
min_accuracy_value = ParameterFloat(name="MinAccuracyValue", default_value=0.75)

In [197]:
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import (
    ConditionStep,
    JsonGet,
)

# Accuracy reported by the evaluation step, read from the evaluation report at
# metrics.accuracy.value.
evaluated_accuracy = JsonGet(
    step=evaluation_step,
    property_file=evaluation_report,
    json_path="metrics.accuracy.value",
)

# accuracy >= MinAccuracyValue
minimum_accuracy_condition = ConditionGreaterThanOrEqualTo(
    left=evaluated_accuracy,
    right=min_accuracy_value,
)

# When the condition holds, register the model and create it; otherwise run
# no further steps.
minimum_accuracy_condition_step = ConditionStep(
    name="AccuracyCondition",
    conditions=[minimum_accuracy_condition],
    if_steps=[register_step, create_step],
    else_steps=[],
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


# 7. Create pipeline

In [198]:
from sagemaker.workflow.pipeline import Pipeline

# Every parameter the pipeline accepts, in the order it appears in the
# pipeline definition.
pipeline_parameters = [
    # pre-processing
    input_data,
    processing_instance_count,
    processing_instance_type,
    max_seq_length,
    is_sample_dataset,
    transformer_model,
    train_percentage,
    # training
    epochs,
    num_records,
    learning_rate,
    optimizer,
    train_batch_size,
    train_steps_per_epoch,
    validation_batch_size,
    validation_steps_per_epoch,
    input_mode,
    train_instance_count,
    train_instance_type,
    # accuracy gate, registration and deployment
    min_accuracy_value,
    model_approval_status,
    deploy_instance_type,
    deploy_instance_count,
]

# Top-level steps; the register and create-model steps are reached through
# the accuracy condition step.
pipeline_steps = [
    processing_step,
    training_step,
    evaluation_step,
    minimum_accuracy_condition_step,
]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
    sagemaker_session=sagemaker_session,
)


In [199]:
import json
from pprint import pprint

# Parse the pipeline's JSON definition and pretty-print it for inspection.
definition_json = pipeline.definition()
definition = json.loads(definition_json)
pprint(definition)

INFO: 08/25/2021 08:56:57 PM Defaulting to the only supported framework/algorithm version: latest.
INFO: 08/25/2021 08:56:57 PM Ignoring unnecessary instance type: None.


{'Metadata': {},
 'Parameters': [{'DefaultValue': 's3://sagemaker-us-east-1-979294212144/injury-data/raw',
                 'Name': 'InputData',
                 'Type': 'String'},
                {'DefaultValue': 1,
                 'Name': 'ProcessingInstanceCount',
                 'Type': 'Integer'},
                {'DefaultValue': 'ml.c5.2xlarge',
                 'Name': 'ProcessingInstanceType',
                 'Type': 'String'},
                {'DefaultValue': 45, 'Name': 'MaxSeqLength', 'Type': 'Integer'},
                {'DefaultValue': 'True',
                 'Name': 'SampleDataset',
                 'Type': 'String'},
                {'DefaultValue': 'bert-base-uncased',
                 'Name': 'TransformerModel',
                 'Type': 'String'},
                {'DefaultValue': 0.05,
                 'Name': 'TrainPercentage',
                 'Type': 'Float'},
                {'DefaultValue': 3, 'Name': 'Epochs', 'Type': 'Integer'},
                {'DefaultValue

In [200]:
# Create the pipeline in SageMaker using the execution role; the service
# response is kept in `response`.
response = pipeline.create(role_arn=role)

# Optional: print the ARN of the created pipeline.
#pipeline_arn = response["PipelineArn"]
#print(pipeline_arn)

INFO: 08/25/2021 08:57:05 PM Defaulting to the only supported framework/algorithm version: latest.
INFO: 08/25/2021 08:57:05 PM Ignoring unnecessary instance type: None.


In [201]:
# Start a pipeline execution, passing a value for every declared parameter.
# Keys are the parameters' `name` values, not the Python variable names.
#
# Fix: TransformerModel was 'bert-based-uncased', which does not match the
# 'bert-base-uncased' checkpoint used as the parameter default.
#
# NOTE(review): hyperparameters, estimator settings and job arguments were
# built from `.default_value`, so verify that overrides given here (for
# example Epochs=1 versus the default of 3) actually reach the jobs.
execution = pipeline.start(
    parameters=dict(
        # pre-processing
        InputData=input_data_train,
        ProcessingInstanceCount=1,
        ProcessingInstanceType='ml.c5.2xlarge',
        MaxSeqLength=45,
        SampleDataset='True',
        TransformerModel='bert-base-uncased',
        TrainPercentage=0.05,
        # training
        Epochs=1,
        NumRecords=6901,
        LearningRate=5e-5,
        optimizer='Adam',
        TrainBatchSize=16,
        TrainStepsPerEpoch=431,
        ValidationBatchSize=16,
        ValidationStepsPerEpoch=47,
        InputMode='File',
        TrainInstanceCount=1,
        TrainInstanceType='ml.p3.2xlarge',
        # accuracy gate, registration and deployment
        MinAccuracyValue=0.75,
        ModelApprovalStatus='PendingManualApproval',
        DeployInstanceType='ml.m5.large',
        DeployInstanceCount=1
    )
)

print(execution.arn)

arn:aws:sagemaker:us-east-1:979294212144:pipeline/basebert-injury-coding-pipeline-1629925000/execution/pbe0shuwfy1a
