# Build a SageMaker Pipeline to train and register the injury narrative BERT classifier

In [1]:
%%capture
!pip install tensorflow
!pip install transformers
!pip install nltk

In [2]:
import pandas as pd
import tensorflow as tf
import re
import nltk
import string
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras import activations, optimizers, losses
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np
import sagemaker
from sagemaker import get_execution_role
import joblib 
import collections

In [3]:
# Project bucket used as the default bucket of the SageMaker session below.
bucket = 'cdc-cdh-sagemaker-s3fs-dev'

# Session configured with the project bucket as its default bucket.
sagemaker_session = sagemaker.Session(default_bucket=bucket)

# Execution role of this notebook; passed as `role` to the jobs defined later.
role = get_execution_role()

# Display the bucket the session resolved, as a sanity check.
default_bucket = sagemaker_session.default_bucket()
default_bucket

'cdc-cdh-sagemaker-s3fs-dev'

In [11]:
# Local run of the pre-processing script (the same file the pipeline's
# Pre-Processing step uses) against ./data/raw.
%run ./src/pre-processing.py --data_path ./data/raw  --train_percentage 0.05 --is_sample_dataset

INFO: 08/18/2021 05:35:25 PM Start.....
INFO: 08/18/2021 05:35:25 PM Parsing arguments
INFO: 08/18/2021 05:35:25 PM Getting and splitting data
INFO: 08/18/2021 05:35:25 PM nb classes in final data:28
INFO: 08/18/2021 05:35:25 PM  X (7668,) , y : (7668,)
INFO: 08/18/2021 05:35:25 PM X_train shape (6901,) y_train shape : (6901,)
INFO: 08/18/2021 05:35:25 PM X_valid shape (767,) y_valid shape : (767,)


True
True


INFO: 08/18/2021 05:35:25 PM Preprocessing...


[62, 71, 63, 11, 43, 55, 42, 52, 60, 73, 13, 66, 12, 53, 64, 27, 24, 99, 26, 72, 70, 51, 44, 41, 31, 78, 32, 23]


INFO: 08/18/2021 05:35:27 PM Tokenization and encoding...
INFO: 08/18/2021 05:35:28 PM Encoding Labels .....
INFO: 08/18/2021 05:35:28 PM Create TF Dataset....
INFO: 08/18/2021 05:35:28 PM Saving train and valid TF Records...


Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`


Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`
INFO: 08/18/2021 05:36:44 PM Saving test dataset...
INFO: 08/18/2021 05:36:56 PM Complete


In [7]:
# Local run of the evaluation script (the same file the pipeline's EvaluateModel
# step uses) on ./data/test with a model directory under ./output/model.
%run ./src/evaluate_model_metrics.py --input_data './data/test' \
            --input_model './output/model/training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017' \
            --max_len 45 \
            --output_data './output/model'

INFO: 08/17/2021 09:25:54 PM Start.....
INFO: 08/17/2021 09:25:54 PM Parsing arguments
INFO: 08/17/2021 09:25:54 PM Create sagemaker session
INFO: 08/17/2021 09:25:54 PM input_data: ./data/test
INFO: 08/17/2021 09:25:54 PM input_model: ./output/model/training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017
INFO: 08/17/2021 09:25:54 PM Listing contents of input model dir: ./output/model/training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017
INFO: 08/17/2021 09:25:54 PM Loading model..


Tensorflow: 2.6.0 

Hugging Face transfomers: 4.9.2 

NLTK: 3.4.5 

BaseBERT
.ipynb_checkpoints
model.tar.gz


INFO: 08/17/2021 09:26:37 PM Listing contents of input data dir: ./data/test
INFO: 08/17/2021 09:26:37 PM Loading encoder..
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 1025, in emit
    msg = self.format(record)
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 869, in format
    return fmt.format(record)
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 608, in format
    record.message = record.getMessage()
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_insta

{'text': ['f puncture wound fiinger attaching cap insulin syring used home care patient'], 'predict_proba': 0.9990896, 'predicted_class': 55}
75687
                                                    text  true_event  \
0      f puncture wound fiinger attaching cap insulin...          55   
1      contusion lt lower leg p mvc hit car guiding c...          24   
2      pt works quarry attempting dislodge large rock...          71   
3      walking work twisted lt ankle later right knee...          73   
4      c low back pain lifting box work today dx left...          71   
...                                                  ...         ...   
75682  coaching football collided player pain rt leg ...          12   
75683  male using wire brush work piece got eye dx fo...          66   
75684            lifting work back px dx thoracic strain          71   
75685  f pt work poked lt thumb wth needle drawing bl...          55   
75686        picking car hurt back work dx low back pain    

In [12]:
# Split sizes, matching the X_train / X_valid shapes logged by the pre-processing run.
num_records, num_valid_records = 6901, 767

# Sequence length and optimisation settings for the local training run.
max_len = 45
epochs = 5
learning_rate = 5e-5
optimizer = 'adam'

# Training and validation use the same batch size.
batch_size = valid_batch_size = 16

# Number of whole batches per pass (floor division ignores a partial last batch).
steps_per_epoch = num_records // batch_size
validation_steps = num_valid_records // valid_batch_size

In [13]:
# Print the record count and the two derived step counts, one per line.
for _value in (num_records, steps_per_epoch, validation_steps):
    print(_value)

6901
431
47


In [21]:
%%time
%run ./src/train.py --train ./data/train --validation ./data/valid --epochs 1 --num_records 6901 --steps_per_epoch 767 --validation_steps 47

input train:  ./data/train
input valid:  ./data/valid
train_dir :  ./data/train
train_file :  ./data/train/train.tfrecord
valid_dir: ./data/valid
valid_file: ./data/valid/valid.tfrecord


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 45)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 45)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 43, 512)      1180160     bert[0][0]                 







saving model...


NameError: name 'odel' is not defined

CPU times: user 2h 50min 38s, sys: 7min 39s, total: 2h 58min 17s
Wall time: 26min 31s


In [18]:
%%time
%run ./src/train_old.py --train ./data/train  --epochs 1 --num_records 6901 --steps_per_epoch 767 --validation_steps 47

input train:  ./data/train


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 45)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 45)]         0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 43, 512)      1180160     bert[0][0]                 







saving model...
CPU times: user 2h 50min 48s, sys: 7min 39s, total: 2h 58min 28s
Wall time: 26min 47s


### Pipeline name

In [4]:
import time

# Whole-second epoch timestamp; used as a suffix for the resource names in this notebook.
timestamp = int(time.time())

pipeline_name = f'BERT-pipeline-{timestamp}'

## Step 1 - Dataset and preprocessing step

## Upload raw dataset

In [5]:
# Key prefix inside the project bucket for the raw injury data.
prefix = 'projects/project006/injury-data/raw'

# Upload the local ./data/raw directory; the call returns the S3 URI shown below.
input_data_train = sagemaker_session.upload_data(
    path='./data/raw',
    bucket=bucket,
    key_prefix=prefix,
)
input_data_train

's3://cdc-cdh-sagemaker-s3fs-dev/projects/project006/injury-data/raw'

## Configure pre-processing step

In [6]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

In [7]:
# Pipeline parameters for the pre-processing step (7 in total).

# Compute used by the processing jobs.
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.c5.2xlarge")
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)

# Input location and data selection.
input_data = ParameterString(name="InputData", default_value=input_data_train)
train_percentage = ParameterFloat(name="TrainPercentage", default_value=0.05)
is_sample_dataset = ParameterString(name="SampleDataset", default_value="True")

# Tokenizer / model settings.
transformer_model = ParameterString(name="TransformerModel", default_value='bert-base-uncased')
max_seq_length = ParameterInteger(name="MaxSeqLength", default_value=45)

In [22]:
# AWS region name taken from the SageMaker session.
region = sagemaker_session.boto_region_name

In [23]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn processor used by the Pre-Processing pipeline step.
processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    # Same region value the evaluation processor passes to its container.
    env={'AWS_DEFAULT_REGION': region},
    # Use the notebook's session (default bucket = project bucket), as the
    # Model definition further down already does.
    sagemaker_session=sagemaker_session,
)

In [24]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep


# Raw data location (the InputData parameter) mounted into the processing container.
processing_inputs = [
    ProcessingInput(
        input_name='raw-input-data',
        source=input_data,
        destination='/opt/ml/processing/input/data/',
        s3_data_distribution_type='ShardedByS3Key',
    )
]

# One output per split; each is uploaded at the end of the job.
processing_outputs = [
    ProcessingOutput(output_name=_name, source=_path, s3_upload_mode='EndOfJob')
    for _name, _path in (
        ('processed-train', '/opt/ml/processing/output/processed/train'),
        ('processed-validation', '/opt/ml/processing/output/processed/validation'),
        ('processed-test', '/opt/ml/processing/output/processed/test'),
    )
]

# NOTE(review): these arguments are built from each parameter's *default value*
# when this cell runs, so overriding TrainPercentage / SampleDataset /
# MaxSeqLength / TransformerModel at pipeline start presumably does not reach
# the script — confirm, and pass the parameters themselves if the installed
# SDK version supports it.
# NOTE(review): the local run passes --is_sample_dataset as a bare flag, while
# here it is followed by the string 'True' — verify against the script's parser.
_preprocessing_args = [
    '--train_percentage', str(train_percentage.default_value),
    '--is_sample_dataset', str(is_sample_dataset.default_value),
    '--max_len', str(max_seq_length.default_value),
    '--transformer_model', str(transformer_model.default_value),
]

processing_step = ProcessingStep(
    name='Pre-Processing',
    code='./src/pre-processing.py',
    processor=processor,
    inputs=processing_inputs,
    outputs=processing_outputs,
    job_arguments=_preprocessing_args,
)

print(processing_step)

ProcessingStep(name='Pre-Processing', step_type=<StepTypeEnum.PROCESSING: 'Processing'>, depends_on=None)


In [25]:
import json

# List the properties exposed by the processing step. The values are property
# objects, so default=str is used to render them.
_step_properties = processing_step.properties.__dict__
print(json.dumps(_step_properties, indent=4, sort_keys=True, default=str))

{
    "AppSpecification": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c456d0>",
    "AutoMLJobArn": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45e50>",
    "CreationTime": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45dd0>",
    "Environment": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45850>",
    "ExitMessage": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45c90>",
    "ExperimentConfig": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45a90>",
    "FailureReason": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45cd0>",
    "LastModifiedTime": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45d90>",
    "MonitoringScheduleArn": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45e10>",
    "NetworkConfig": "<sagemaker.workflow.properties.Properties object at 0x7fd3e2c45950>",
    "ProcessingEndTime": "<sagemaker.workflow.properties.Properties

In [26]:
# Inspect the property object for the 'processed-train' output of the processing step.
_train_output = processing_step.properties.ProcessingOutputConfig.Outputs['processed-train']
print(json.dumps(_train_output.__dict__, indent=4, sort_keys=True, default=str))

{
    "AppManaged": "<sagemaker.workflow.properties.Properties object at 0x7fd3e9e5b250>",
    "FeatureStoreOutput": "<sagemaker.workflow.properties.Properties object at 0x7fd3e9e5b050>",
    "OutputName": "<sagemaker.workflow.properties.Properties object at 0x7fd3e9e5b0d0>",
    "S3Output": "<sagemaker.workflow.properties.Properties object at 0x7fd3e9e5b190>",
    "_path": "Steps.Pre-Processing.ProcessingOutputConfig.Outputs['processed-train']",
    "_shape_names": [
        "ProcessingOutput"
    ]
}


In [27]:
# Inspect the S3Uri property of the 'processed-train' output; its `_path` is the
# reference that downstream steps receive.
_train_s3_uri = processing_step.properties.ProcessingOutputConfig.Outputs['processed-train'].S3Output.S3Uri
print(json.dumps(_train_s3_uri.__dict__, indent=4, sort_keys=True, default=str))

{
    "__str__": "S3Uri",
    "_path": "Steps.Pre-Processing.ProcessingOutputConfig.Outputs['processed-train'].S3Output.S3Uri",
    "_shape_names": [
        "S3Uri"
    ]
}


## Step 2 - Training Step

In [28]:
# Pipeline parameters for the training step.
#
# `max_seq_length` ("MaxSeqLength") is intentionally NOT redefined here: it is
# already declared with the processing parameters, and a second object with the
# same parameter name would only shadow the first.
#
# NOTE: `epochs`, `num_records`, `learning_rate` and `optimizer` rebind names
# that held plain Python values in the local-training configuration cell; from
# here on they are pipeline parameter objects.

epochs = ParameterInteger(
    name="Epochs",
    default_value=3
)

# No default value is declared for this parameter — presumably it must be
# supplied when the pipeline is started (TODO confirm).
num_records = ParameterInteger(
    name="NumRecords"
)

learning_rate = ParameterFloat(
    name="LearningRate",
    default_value=5e-5
)

train_batch_size = ParameterInteger(
    name="TrainBatchSize",
    default_value=16
)

train_steps_per_epoch = ParameterInteger(
    name="TrainStepsPerEpoch",
    default_value=500
)

validation_batch_size = ParameterInteger(
    name="ValidationBatchSize",
    default_value=16
)

validation_steps_per_epoch = ParameterInteger(
    name="ValidationStepsPerEpoch",
    default_value=500
)

train_instance_count = ParameterInteger(
    name="TrainInstanceCount",
    default_value=1
)

train_instance_type = ParameterString(
    name="TrainInstanceType",
    default_value="ml.p3.2xlarge"
)

# The parameter name is lower-case, unlike the others; the key used when
# starting the pipeline must match it exactly.
optimizer = ParameterString(
    name="optimizer",
    default_value='Adam'
)

input_mode = ParameterString(
    name="InputMode",
    default_value="File",
)

In [29]:
# Hyperparameters handed to the estimator; every value is a pipeline parameter.
hyperparameters = dict(
    max_seq_length=max_seq_length,
    epochs=epochs,
    num_records=num_records,
    learning_rate=learning_rate,
    batch_size=train_batch_size,
    steps_per_epoch=train_steps_per_epoch,
    validation_batch_size=validation_batch_size,
    validation_steps=validation_steps_per_epoch,
    optimizer=optimizer,
)

In [30]:
# Metric name -> regular expression applied to the training log output.
# NOTE(review): 'loss: ' and 'acc: ' also occur inside 'val_loss: ' / 'val_acc: ';
# confirm the train metrics pick up the intended values.
metric_definitions = [
    {'Name': _metric_name, 'Regex': _metric_regex}
    for _metric_name, _metric_regex in (
        ('train:loss', 'loss: ([0-9\\.]+)'),
        ('train:accuracy', 'acc: ([0-9\\.]+)'),
        ('validation:loss', 'val_loss: ([0-9\\.]+)'),
        ('validation:accuracy', 'val_acc: ([0-9\\.]+)'),
    )
]

In [31]:
from sagemaker.huggingface import HuggingFace

# Hugging Face (TensorFlow) estimator that runs ./src/train.py in the Train step.
# NOTE(review): the container versions here (transformers 4.4 / TensorFlow 2.4)
# differ from the versions reported by the local runs in this notebook — confirm
# the training script works with both.
estimator = HuggingFace(
    entry_point="train.py",
    source_dir="./src/",
    role=role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    volume_size=50,
    max_run=18000,
    transformers_version="4.4",
    tensorflow_version="2.4",
    py_version="py37",
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    enable_sagemaker_metrics=True,
    # Consume the InputMode pipeline parameter, which was otherwise declared and
    # exposed on the pipeline but never used (its default is "File").
    input_mode=input_mode,
    # Use the notebook's session (default bucket = project bucket), as the
    # Model definition further down already does.
    sagemaker_session=sagemaker_session,
)



In [32]:
from sagemaker.workflow.steps import CacheConfig

# Step caching enabled with a one-hour expiry ("PT1H").
cache_config = CacheConfig(
    enable_caching=True,
    expire_after="PT1H",
)

In [33]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Outputs of the Pre-Processing step; the train / validation S3 URIs below are
# references into that step rather than literal paths.
_processed_outputs = processing_step.properties.ProcessingOutputConfig.Outputs

training_step = TrainingStep(
    name='Train',
    estimator=estimator,
    inputs={
        'train': TrainingInput(
            s3_data=_processed_outputs['processed-train'].S3Output.S3Uri
        ),
        'validation': TrainingInput(
            s3_data=_processed_outputs['processed-validation'].S3Output.S3Uri
        ),
    },
    cache_config=cache_config,
)

print(training_step)

TrainingStep(name='Train', step_type=<StepTypeEnum.TRAINING: 'Training'>, depends_on=None)


## Evaluation Step

In [34]:
from sagemaker.sklearn.processing import SKLearnProcessor

# scikit-learn processor used by the EvaluateModel step; same instance settings
# as the pre-processing processor, with a 7200-second runtime limit.
evaluation_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    env={'AWS_DEFAULT_REGION': region},
    max_runtime_in_seconds=7200,
    # Use the notebook's session (default bucket = project bucket), as the
    # Model definition further down already does.
    sagemaker_session=sagemaker_session,
)

In [35]:
from sagemaker.workflow.properties import PropertyFile

# Property file pointing at evaluation.json inside the 'metrics' output; the
# accuracy condition further down reads from it.
evaluation_report = PropertyFile(name='EvaluationReport', output_name='metrics', path='evaluation.json')

In [36]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Inputs: the trained model artifact and the processed test split.
_evaluation_inputs = [
    ProcessingInput(
        source=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        destination='/opt/ml/processing/input/model',
    ),
    ProcessingInput(
        source=processing_step.properties.ProcessingOutputConfig.Outputs['processed-test'].S3Output.S3Uri,
        destination='/opt/ml/processing/input/data',
    ),
]

# Single output: the metrics directory, uploaded when the job ends.
_evaluation_outputs = [
    ProcessingOutput(
        output_name='metrics',
        s3_upload_mode='EndOfJob',
        source='/opt/ml/processing/output/metrics/',
    ),
]

evaluation_step = ProcessingStep(
    name='EvaluateModel',
    processor=evaluation_processor,
    code='src/evaluate_model_metrics.py',
    inputs=_evaluation_inputs,
    outputs=_evaluation_outputs,
    # Built from the parameter's default value at definition time.
    job_arguments=['--max_len', str(max_seq_length.default_value)],
    property_files=[evaluation_report],
)

## Register model step

In [None]:
# Pipeline parameters for model registration and deployment (3 in total).

# Approval status given to the registered model package.
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")

# Instance settings for deployment.
deploy_instance_type = ParameterString(name="DeployInstanceType", default_value="ml.m5.large")
deploy_instance_count = ParameterInteger(name="DeployInstanceCount", default_value=1)

In [None]:
# Model package group name, suffixed with the notebook's timestamp.
model_package_group_name = "BERT-Reviews-{}".format(timestamp)

print(model_package_group_name)

In [None]:
from sagemaker.model_metrics import MetricsSource, ModelMetrics 

# S3 URI of the evaluation step's first (and only) output, i.e. 'metrics'.
_metrics_output_uri = evaluation_step.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]

# Attach evaluation.json from that location to the model as its statistics.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri="{}/evaluation.json".format(_metrics_output_uri),
        content_type="application/json",
    )
)

print(model_metrics)

Define deployment image for inference.

In [None]:
# Look up the TensorFlow inference container image for this region and the
# deployment instance type.
_image_request = dict(
    framework="tensorflow",
    region=region,
    version="2.4.0",
    py_version="py37",
    instance_type=deploy_instance_type,
    image_scope="inference",
)
inference_image_uri = sagemaker.image_uris.retrieve(**_image_request)
print(inference_image_uri)

### Register model

In [None]:
from sagemaker.workflow.step_collections import RegisterModel

# Register the trained model artifact in the model package group, together with
# the inference image and the evaluation metrics.
register_step = RegisterModel(
    name="RegisterModel",
    estimator=estimator,
    image_uri=inference_image_uri,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["application/jsonlines"],
    response_types=["application/jsonlines"],
    # The same instance type is listed for real-time inference and batch transform.
    inference_instances=[deploy_instance_type],
    transform_instances=[deploy_instance_type],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)

## Deployment Step

In [None]:
from sagemaker.model import Model

# Model name, suffixed with the notebook's timestamp.
model_name = f'bert-model-{timestamp}'

# Model built from the Train step's artifact and the inference image.
model = Model(
    name=model_name,
    image_uri=inference_image_uri,
    model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
    sagemaker_session=sagemaker_session,
    role=role,
)

In [None]:
from sagemaker.inputs import CreateModelInput

# Input for the CreateModel step: the deployment instance type.
create_inputs = CreateModelInput(instance_type=deploy_instance_type)

In [None]:
from sagemaker.workflow.steps import CreateModelStep

# Pipeline step that creates the model defined above.
create_step = CreateModelStep(name="CreateModel", model=model, inputs=create_inputs)

## Step 6 - Check accuracy condition step

In [None]:
# Accuracy threshold compared against the evaluation report in the condition
# step; defaults to 0.85 and can be overridden per execution.
min_accuracy_value = ParameterFloat(name="MinAccuracyValue", default_value=0.85)

In [None]:
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import (
    ConditionStep,
    JsonGet,
)

# Accuracy value read from the evaluation report at "metrics.accuracy.value".
_reported_accuracy = JsonGet(
    step=evaluation_step,
    property_file=evaluation_report,
    json_path="metrics.accuracy.value",
)

# Condition: reported accuracy >= MinAccuracyValue.
minimum_accuracy_condition = ConditionGreaterThanOrEqualTo(
    left=_reported_accuracy,
    right=min_accuracy_value,
)

# When the condition holds, register and create the model; otherwise run nothing.
minimum_accuracy_condition_step = ConditionStep(
    name="AccuracyCondition",
    conditions=[minimum_accuracy_condition],
    if_steps=[register_step, create_step],
    else_steps=[],
)

## Step 7 - Create pipeline

In [None]:
from sagemaker.workflow.pipeline import Pipeline

# Assemble the pipeline: every parameter declared above is exposed, and the
# steps are pre-processing -> training -> evaluation -> accuracy condition
# (the condition step itself contains the register / create-model steps).
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        # Pre-processing
        input_data,
        processing_instance_count,
        processing_instance_type,
        max_seq_length,
        is_sample_dataset,
        transformer_model,
        train_percentage,

        # Training
        epochs,
        num_records,
        learning_rate,
        optimizer,
        train_batch_size,
        train_steps_per_epoch,
        validation_batch_size,
        validation_steps_per_epoch,
        input_mode,
        train_instance_count,
        train_instance_type,

        # Accuracy condition
        min_accuracy_value,

        # Registration / deployment
        model_approval_status,
        deploy_instance_type,
        deploy_instance_count,
    ],
    steps=[processing_step, training_step, evaluation_step, minimum_accuracy_condition_step],
    # The session created at the top of the notebook; `sess` was never defined.
    sagemaker_session=sagemaker_session,
)

In [None]:
import json
from pprint import pprint

# Parse the pipeline definition (a JSON string) and pretty-print it for inspection.
definition = json.loads(pipeline.definition())

pprint(definition)

In [None]:
# Create the pipeline using the notebook's execution role and print the ARN
# from the response.
response = pipeline.create(role_arn=role)

pipeline_arn = response["PipelineArn"]
print(pipeline_arn)

In [None]:
# Start one execution of the pipeline. Each key must equal the `name=` of the
# corresponding pipeline parameter (e.g. "NumRecords", and the lower-case
# "optimizer"), not the Python variable that holds it.
execution = pipeline.start(
    parameters=dict(
        # Pre-processing
        InputData=input_data_train,  # S3 URI returned by the raw-data upload cell
        ProcessingInstanceCount=1,
        ProcessingInstanceType='ml.c5.2xlarge',
        MaxSeqLength=45,
        SampleDataset='True',
        TransformerModel='bert-base-uncased',
        TrainPercentage=0.9,
        # Training
        Epochs=3,
        NumRecords=138549,
        LearningRate=5e-5,
        optimizer='Adam',
        TrainBatchSize=16,
        TrainStepsPerEpoch=50,
        ValidationBatchSize=16,
        ValidationStepsPerEpoch=64,
        InputMode='File',
        TrainInstanceCount=1,
        TrainInstanceType='ml.p3.2xlarge',
        # Accuracy condition
        MinAccuracyValue=0.75,
        # Registration / deployment
        ModelApprovalStatus='PendingManualApproval',
        DeployInstanceType='ml.m5.large',
        DeployInstanceCount=1,
    )
)

print(execution.arn)