# Injury classification with SageMaker

In [1]:
#https://wandb.ai/jack-morris/david-vs-goliath/reports/Does-Model-Size-Matter-A-Comparison-of-BERT-and-DistilBERT--VmlldzoxMDUxNzU
#https://towardsdatascience.com/working-with-hugging-face-transformers-and-tf-2-0-89bf35e3555a
#https://towardsdatascience.com/scale-neural-network-training-with-sagemaker-distributed-8cf3aefcff51
#https://towardsdatascience.com/how-to-reduce-training-time-for-a-deep-learning-model-using-tf-data-43e1989d2961
#https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379
#https://towardsdatascience.com/to-distil-or-not-to-distil-bert-roberta-and-xlnet-c777ad92f8
#https://ymeadows.com/articles/fine-tuning-transformer-based-language-models

### Smart Batching
#https://towardsdatascience.com/divide-hugging-face-transformers-training-time-by-2-or-more-21bf7129db9q-21bf7129db9e
#https://www.youtube.com/watch?v=ynOZUNnbEWU

In [2]:
!python --version

Python 3.7.10


In [3]:
%%capture
!pip install tensorflow
!pip install transformers
!pip install nltk

In [4]:
import transformers
import nltk
print(transformers.__version__)

4.9.2


In [5]:
import nltk 

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import pandas as pd
import tensorflow as tf
import re
import nltk
import string
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras import activations, optimizers, losses
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np
import sagemaker
from sagemaker import get_execution_role
import joblib 
import collections

In [7]:
print(tf.__version__)

2.6.0


In [8]:
bucket = 'cdc-cdh-sagemaker-s3fs-dev'
sagemaker_session = sagemaker.Session(default_bucket=bucket)
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()
default_bucket

'cdc-cdh-sagemaker-s3fs-dev'

In [9]:
%store -z
%store

Stored variables and their in-db values:


## Step 1 - Import and split data

In [10]:
def get_samples(ratio,train):
    great_than2_classes = train['event'].value_counts()[train['event'].value_counts() >2].index
    train = train[train['event'].isin(great_than2_classes.to_list())]
    train_samples, _ = train_test_split(train,train_size=ratio,random_state=42,stratify=train['event'])
    print("nb classes in samples",train_samples['event'].nunique())
    print("nb oservations:",train_samples.shape)
    #print(train_samples['event'].value_counts())

    return train_samples

In [49]:
def get_data(is_sample=None,ratio=None,is_test_split=False):
  
    train = pd.read_csv('train.csv')
    if is_sample:
        train = get_samples(ratio = ratio, train=train)
    great_than2_classes = train['event'].value_counts()[train['event'].value_counts() >5].index 
    train_filter = train[train['event'].isin(great_than2_classes.to_list())]
    
    print("nb classes in final data:",train_filter['event'].nunique())

    X = train_filter['text']
    y = train_filter['event']

    print(f"X.shape {X.shape} y.shape : {y.shape}")

    X_train_valid,X_test,y_train_valid, y_test = train_test_split(X,y,train_size=0.9,random_state=42,stratify=y)
    
    if is_test_split:
        X_train,X_valid,y_train,y_valid = train_test_split(X_train_valid,y_train_valid,train_size=0.8,random_state=42,stratify=y_train_valid)

    if is_test_split :
        print(f"X_train shape {X_train.shape} y_train shape : {y_train.shape}")
        print(f"X_valid shape {X_valid.shape} y_valid shape : {y_valid.shape}")
        print(f"X_test shape {X_test.shape} y_test shape : {y_test.shape}")
        
        return {
          'train': (X_train,y_train),
          'valid': (X_valid,y_valid),
          'test': (X_test,y_test)
      }

    else:
        print(f"X_train shape {X_train_valid.shape} y_train shape : {y_train_valid.shape}")
        print(f"X_valid shape {X_test.shape} y_valid shape : {y_test.shape}")
        
        return {
          'train': (X_train_valid,y_train_valid),
          'valid': (X_test,y_test),
      }


In [50]:
pd.options.display.max_colwidth = 3100
data = get_data(is_sample=False,ratio=1)
#data = get_data(is_test_split=True)
X_train, y_train = data['train']
X_valid,y_valid = data['valid']
#X_test,y_test = data['test']

nb classes in final data: 43
X.shape (153944,) y.shape : (153944,)
X_train shape (138549,) y_train shape : (138549,)
X_valid shape (15395,) y_valid shape : (15395,)


## Step 2 - Preprocess Data

In [51]:
import numpy as np
print('classes in train :',len(np.unique(y_train)))
print('classes in valid :',len(np.unique(y_valid)))

CLASSES = y_train.unique().tolist()
print(CLASSES)

classes in train : 43
classes in valid : 43
[62, 13, 71, 42, 60, 64, 55, 70, 31, 43, 63, 73, 24, 50, 53, 41, 66, 26, 78, 11, 99, 12, 65, 72, 23, 27, 44, 52, 69, 22, 51, 25, 32, 21, 40, 61, 56, 54, 49, 79, 67, 45, 20]


In [52]:
def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """

    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)

    # convert text to lowercase
    text = text.strip().lower()

    # remove all non-ASCII characters:
    text = re.sub(r'[^\x00-\x7f]', r'', text)

    # replace punctuation characters with spaces
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    text = " ".join(text.split())
    return text

In [53]:
def remove_useless_words(text,useless_words):
    sentence = [word for word in word_tokenize(text)]
    sentence_stop = [word for word in sentence if word not in useless_words]

    text = " ".join(sentence_stop)

    return text

In [54]:
def preprocess_data(X):
    """ Preprocess : cleaning and remove stop words"""

    X = X.apply(lambda x: re.sub(r'\d+', '', x))
    X = X.apply(lambda x: clean_text(x))

    stopwords = nltk.corpus.stopwords.words('english')
    useless_words = stopwords + list(string.punctuation) + ['yom', 'yof', 'yowm', 'yf', 'ym', 'yo']
    # print("useless word : ",useless_words)
    X = X.apply(lambda x: remove_useless_words(x,useless_words))

    return X

In [55]:
%%time
X_train_processed = preprocess_data(X_train)
X_valid_processed = preprocess_data(X_valid)

print("after preprocessing...")
print(X_train_processed.head(5))

after preprocessing...
69539                     work hammering metal pole hit finger dx r index finger pain communited intraarticular fx pip
76664     work pet groomer grooming dog another dog jumped upand dog bit pt r cheek abrasions jawline dx dog bite face
71708                                                                                     hit chisel eyebrown lac face
3165                      c upper arm shoulder pain w movement since yest think strained work lifting dx shoulder pain
136917                                                           fracture pubic rami semi load pvc pipe fell unloading
Name: text, dtype: object
CPU times: user 24 s, sys: 11.8 ms, total: 24 s
Wall time: 24 s


In [98]:
test_data = pd.read_csv('./data/raw/test_data.csv')
test_data = test_data[~test_data['event'].isin([10,29,30,59,74])]
X_test, y_test = test_data['text'], test_data['event']
X_test_processed = preprocess_data(X_test)
X_test_processed

0                                   f puncture wound fiinger attaching cap insulin syring used home care patient
1               contusion lt lower leg p mvc hit car guiding car gas pedal got stuck pushing door work yesterday
2                        pt works quarry attempting dislodge large rock developed chest pains dx chest wall pain
3                                       walking work twisted lt ankle later right knee dx left ankle knee sprain
4                                             c low back pain lifting box work today dx left sided low back pain
                                                          ...                                                   
75692    coaching football collided player pain rt leg diff breathing dx fx rib left side cld fx sprain rt ankle
75693                                               male using wire brush work piece got eye dx foreign body eye
75694                                                                    lifting work back px dx

In [99]:
len(X_test_processed)

75687

## Step 3 - Feature Extraction

### Tokenize and Encode Train and Test data

In [56]:
%%time
from transformers import  AutoTokenizer, DistilBertTokenizerFast,BertTokenizerFast,RobertaTokenizerFast
MAX_LEN = 45


MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

x_train = X_train_processed.to_list()
x_valid = X_valid_processed.to_list()


train_encodings = tokenizer(x_train, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')
valid_encodings = tokenizer(x_valid, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')


CPU times: user 15.7 s, sys: 1.16 s, total: 16.9 s
Wall time: 4.97 s


In [57]:
print(train_encodings['input_ids'][0])
print(train_encodings['attention_mask'][0])

tf.Tensor(
[  101  2147 27883  3384  6536  2718  4344  1040  2595  1054  5950  4344
  3255  4012 23041 17572 26721  8445 21412 23292 28315   102     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0], shape=(45,), dtype=int32)
tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0], shape=(45,), dtype=int32)


### Encoding labels

In [58]:
from sklearn.preprocessing import LabelEncoder

print("Encoding Labels .....")
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_encode = np.asarray(encoder.transform(y_train))
y_valid_encode = np.asarray(encoder.transform(y_valid))

Encoding Labels .....


In [59]:
id2label = { id:str(label) for id, label in enumerate(encoder.classes_)}
label2id = { str(label):id for id, label in enumerate(encoder.classes_)}

In [60]:
print('classes in train :',len(np.unique(y_train_encode)))
print('classes in valid :',len(np.unique(y_valid_encode)))

classes in train : 43
classes in valid : 43


In [61]:
import os
data_path='./data/train/'
os.makedirs(data_path,exist_ok=True)

from pickle import dump
dump(encoder,open(os.path.join(data_path,'encode.pkl'),'wb'))

### Create TF Dataset

In [64]:
%%time
def construct_tfdataset(encodings, y=None):
    if y is not None:
        return tf.data.Dataset.from_tensor_slices((dict(encodings),y))
    else:
        # this case is used when making predictions on unseen samples after training
        return tf.data.Dataset.from_tensor_slices(dict(encodings))
    
train_tfdataset = construct_tfdataset(train_encodings, y_train_encode)
valid_tfdataset = construct_tfdataset(valid_encodings, y_valid_encode)

CPU times: user 3.64 ms, sys: 0 ns, total: 3.64 ms
Wall time: 3.39 ms


### Create TF Records File

In [25]:

def _save_feature_as_tfrecord(tfdataset,file_path):
    """ Helper function to save the tf dataset as tfrecords"""
    

    def single_example_data(encoding,label_id):

        input_ids =encoding['input_ids']
        attention_mask = encoding['attention_mask']

        tfrecord_features = collections.OrderedDict()

        def _int64_feature(value):
            """Returns an int64_list from a bool / enum / int / uint."""
            return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

        tfrecord_features['input_ids'] = _int64_feature(input_ids)
        tfrecord_features['attention_mask'] = _int64_feature(attention_mask)
        tfrecord_features['label_ids'] =  _int64_feature([label_id])

        _example = tf.train.Example(features=tf.train.Features(feature=tfrecord_features))

        return _example.SerializeToString()

    def data_generator():
        for features in tfdataset:
            yield single_example_data(*features)

    serialized_tfdataset = tf.data.Dataset.from_generator(
        data_generator, output_types=tf.string, output_shapes=())        

    writer = tf.data.experimental.TFRecordWriter(file_path)
    writer.write(serialized_tfdataset)


        

In [26]:
%time

# training data
_save_feature_as_tfrecord(train_tfdataset,'./data/train/train.tfrecord')

#validation data
_save_feature_as_tfrecord(valid_tfdataset,'./data/valid/valid.tfrecord')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


### Upload data to S3

In [27]:
prefix = 'projects/project006/injury-data/training'
tfrecord_train_location = sagemaker_session.upload_data(path = './data/train',
                                                      bucket = bucket,
                                                      key_prefix = prefix)
tfrecord_train_location

's3://cdc-cdh-sagemaker-s3fs-dev/projects/project006/injury-data/training'

In [28]:
prefix = 'projects/project006/injury-data/validation'
tfrecord_valid_location = sagemaker_session.upload_data(path = './data/valid',
                                                      bucket = bucket,
                                                      key_prefix = prefix)
tfrecord_valid_location

's3://cdc-cdh-sagemaker-s3fs-dev/projects/project006/injury-data/validation'

## Step 4 - Run BERT Training Job

### Training output path

In [29]:
prefix = 'projects/project006/output'
output_path = 's3://{}/{}/'.format(bucket, prefix )
output_path

's3://cdc-cdh-sagemaker-s3fs-dev/projects/project006/output/'

### Training script

In [30]:
# TensorFlow 2.3 script
!pygmentize './src/train.py'

[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mtensorflow[39;49;00m [34mas[39;49;00m [04m[36mtf[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m  TFDistilBertForSequenceClassification,DistilBertConfig
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m TFBertForSequenceClassification, BertConfig
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m [34mimport[39;49;00m TFRobertaForSequenceClassification, RobertaConfig
[34mfrom[39;49;00m [04m[36mtensorflow[39;49;00m[04m[36m.[39;49;00m[04m[36mkeras[39;49;00m[04m[36m.[39;49;00m[04m[36moptimizers[39;49;00m[04m[36m.[39;49;00m[04m[36mschedules[39;49;00m [34mimport[39;49;00m PolynomialDecay
[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m[04m[36m.[39;49;00m[04

### Hyperparameters

In [105]:
num_records = len(X_train)
num_valid_records = len(X_valid)
max_len = MAX_LEN
epochs = 5
batch_size = 16
valid_batch_size = 16
steps_per_epoch = num_records // batch_size
validation_steps = num_valid_records // valid_batch_size
learning_rate = 5e-5
optimizer = 'adam'

In [106]:
print(num_records)
print(steps_per_epoch)
print(validation_steps)

138549
8659
962


In [33]:
%run ./src/train.py --train ./data/train --validation ./data/validation --epochs 1 --num_records 138549 --steps_per_epoch 8659 --validation_steps 962

### Using Hugging Face Estimator to start training

In [34]:
import time
from time import gmtime, strftime
from sagemaker.huggingface import HuggingFace

model_name = 'BaseBERT'
job_name_prefix = f"training-{model_name}"
timestamp = strftime("-%m-%d-%M-%S", gmtime())

job_name = job_name_prefix + timestamp

_estimator = HuggingFace(
        base_job_name  = job_name,
        entry_point="train.py",
        source_dir = "./src/",
        role=role,
        instance_count=1,
        volume_size = 5,
        max_run = 18000,
        instance_type='ml.p3.2xlarge',
        transformers_version = "4.4",
        tensorflow_version  = "2.4",
        py_version="py37",
        output_path = output_path,
        hyperparameters = {
                "model_name": model_name,
                "num_records":  num_records,
                "max_len":max_len,
                "epochs":int(epochs),
                "learning_rate":float(learning_rate),
                "batch_size":int(batch_size),
                "valid_batch_size":valid_batch_size,
                "steps_per_epoch": steps_per_epoch,
                "validation_steps": validation_steps,
                "optimizer":optimizer
                },
        metric_definitions = [{'Name':'train:loss','Regex':'loss: ([0-9\\.]+)'},
                                    {'Name':'train:accuracy','Regex':'acc: ([0-9\\.]+)'},
                                    {'Name':'validation:loss','Regex':'val_loss: ([0-9\\.]+)'},
                                    {'Name':'validation:accuracy','Regex':'val_acc: ([0-9\\.]+)'}],
        enable_sagemaker_metrics = True
    )


train_data = sagemaker.inputs.TrainingInput(
    tfrecord_train_location, # Replace None
    distribution='FullyReplicated'
)

validation_data = sagemaker.inputs.TrainingInput(
    tfrecord_valid_location, # Replace None
    distribution='FullyReplicated'
)

_estimator.fit({'train':train_data,'validation':validation_data}, wait=False)

### Download the trained distilbert model

In [35]:
print(_estimator.latest_training_job.name)

training-BaseBERT-08-10-24-42-2021-08-10-13-24-42-457


In [36]:
from sagemaker.s3  import S3Downloader
import os

def download_model(sagemaker_session,job_name):
    
    if job_name is not None :
        modeldesc = sagemaker_session.describe_training_job(job_name)
        s3_model_path = modeldesc['ModelArtifacts']['S3ModelArtifacts']
    
    os.makedirs(f"./output/model/{job_name}/",exist_ok=True)

    S3Downloader.download(
        s3_uri=s3_model_path, # s3 uri where the trained model is located
        local_path=f"./output/model/{job_name}/", # local path where *.targ.gz is saved
        sagemaker_session=sagemaker_session # sagemaker session used for training the model
    )
    
    return s3_model_path, modeldesc['HyperParameters']

In [37]:
#downloading model
job_name = 'training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017'
s3_model_path, hp = download_model(sagemaker_session,job_name)

print(s3_model_path)
print(hp)

s3://cdc-cdh-sagemaker-s3fs-dev/projects/project006/output/training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017/output/model.tar.gz
{'batch_size': '16', 'epochs': '5', 'learning_rate': '5e-05', 'max_len': '45', 'model_name': '"BaseBERT"', 'num_records': '138549', 'optimizer': '"adam"', 'sagemaker_container_log_level': '20', 'sagemaker_job_name': '"training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017"', 'sagemaker_program': '"train_keras.py"', 'sagemaker_region': '"us-east-1"', 'sagemaker_submit_directory': '"s3://cdc-cdh-sagemaker-s3fs-dev/training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017/source/sourcedir.tar.gz"', 'steps_per_epoch': '8659', 'valid_batch_size': '16', 'validation_steps': '962'}


### Extract and load the model

In [38]:
import tarfile
from transformers.optimization_tf import AdamWeightDecay
from tensorflow.keras.optimizers.schedules import PolynomialDecay

def extract_data_load_model(job_name,model_name):

    t = tarfile.open(f'./output/model/{job_name}/model.tar.gz', 'r:gz')
    t.extractall(path=f'./output/model/{job_name}')
    _model = tf.keras.models.load_model(f"./output/model/{job_name}/{model_name}",custom_objects={'AdamWeightDecay':AdamWeightDecay})
    
    return _model

In [39]:
model_name = 'BaseBERT'
job_name = 'training-BaseBERT-08-02-58-54-2021-08-02-19-58-55-017'
loaded_model = extract_data_load_model(job_name,model_name)




## Step 5 - Evaluate the model

### Load training and validation data

In [44]:
def _load_data(train_dir,valid_dir,MAX_LEN,epochs,batch_size,valid_batch_size,steps_per_epoch,validation_steps):
    """ Helper function to load,parse and create input data pipeline from TFRecords"""
          
    train_file = os.path.join(train_dir,"train.tfrecord") 
    valid_file = os.path.join(valid_dir,"valid.tfrecord")
    
    # Create a description of the features.
    feature_description = {
        'input_ids': tf.io.FixedLenFeature([MAX_LEN], tf.int64),
        'attention_mask': tf.io.FixedLenFeature([MAX_LEN], tf.int64),
        'label_ids': tf.io.FixedLenFeature([], tf.int64),
    }
        
    def _parse_function(example_proto):

        # Parse the input `tf.train.Example` proto using the dictionary above.
        parsed  = tf.io.parse_single_example(example_proto, feature_description)

        return {'input_ids':parsed['input_ids'],'attention_mask':parsed['attention_mask']},parsed['label_ids']
        
    
    train_dataset = tf.data.TFRecordDataset(train_file)
    train_dataset = train_dataset.repeat(epochs * steps_per_epoch)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    train_dataset = train_dataset.map(_parse_function,num_parallel_calls=tf.data.AUTOTUNE)
    
    train_dataset = train_dataset.batch(batch_size)
    
    
    
    valid_dataset = tf.data.TFRecordDataset(valid_file)
    valid_dataset = valid_dataset.repeat(epochs * validation_steps)
    valid_dataset = valid_dataset.prefetch(tf.data.AUTOTUNE)
    valid_dataset = valid_dataset.map(_parse_function,num_parallel_calls=tf.data.AUTOTUNE)
    valid_dataset = valid_dataset.batch(valid_batch_size)
    
   
    return train_dataset, valid_dataset

In [45]:
train_dir = './data/train'
valid_dir= './data/valid'
train_dataset,valid_dataset = _load_data(train_dir,valid_dir,MAX_LEN,epochs,batch_size,valid_batch_size,steps_per_epoch,validation_steps)

### Evaluate the model

In [46]:

def _evaluate_model(loaded_model):
    print("Evaluating Training data...")
    train_score = loaded_model.evaluate(train_dataset,
                                     steps = steps_per_epoch,
                                     batch_size=batch_size)

    print("Training loss: ", train_score[0])
    print("Training accuracy: ", train_score[1])

    print("Evaluating Validation data...")
    valid_score = loaded_model.evaluate(valid_dataset, steps = validation_steps,batch_size=valid_batch_size)
    print("Validation loss: ", valid_score[0])
    print("Validation accuracy: ", valid_score[1])
    
_evaluate_model(loaded_model)

Evaluating Training data...
Training loss:  0.11638274043798447
Training accuracy:  0.9641124606132507
Evaluating Validation data...
Validation loss:  0.535329282283783
Validation accuracy:  0.8525857329368591


### Using the fine-tuned model to predict new samples

In [65]:
MODEL_NAME = 'bert-base-uncased'

def _create_predictor(model, encoder,model_name, max_len,text):
    tkzr = AutoTokenizer.from_pretrained(model_name)
    x = [text]
    encodings =  tkzr(x, max_length=max_len, truncation=True, padding='max_length',return_tensors='tf')
    tfdataset = construct_tfdataset(encodings)
    tfdataset = tfdataset.batch(1)
    preds = model.predict(tfdataset)
    categories = encoder.classes_.tolist()
    enc = np.argmax(preds[0])
    
    return {     'text' : x,
                 'predict_proba' : preds[0][np.argmax(preds[0])],
                 'predicted_class' : categories[np.argmax(preds)]                             
                }

In [66]:
x = 'sp lt ankle going steps twisted ankle work'
_create_predictor(loaded_model, encoder,MODEL_NAME, MAX_LEN,x)

  [n for n in tensors.keys() if n not in ref_input_names])


{'text': ['sp lt ankle going steps twisted ankle work'],
 'predict_proba': 0.9985917,
 'predicted_class': 73}

## Step 6 - Save Predictions and calculate metrics

In [67]:
%%time
import numpy as np

model_name = 'bert-base-uncased'
x = X_valid_processed.tolist()
y = y_valid.tolist()

def _batch_predict(model, encoder,model_name, max_len,x,y) :
    tkzr = AutoTokenizer.from_pretrained(model_name)
    encodings_x =  tkzr(x, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')
    tfdataset = construct_tfdataset(encodings_x).batch(32)
    preds = loaded_model.predict(tfdataset)
    predictions_encode = pd.DataFrame(data=preds).apply(lambda x: np.argmax(x),axis=1)
    categories = encoder.classes_.tolist()
    predictions_event= predictions_encode.apply(lambda x:categories[x])


    print(len(predictions_event))
    results = pd.DataFrame({'text': x,
                      'true_event':y,
                      'preds_encode': predictions_encode,
                      'preds_event':predictions_event
                            })


    return results

results = _batch_predict(loaded_model,encoder,model_name,MAX_LEN,x,y)
results.to_csv('solution.csv')
results.head(10)

15395
CPU times: user 28min 27s, sys: 20 s, total: 28min 47s
Wall time: 4min 32s


Unnamed: 0,text,true_event,preds_encode,preds_event
0,concussion cow kicked gate struck forehead,62,29,62
1,fractured hand work,99,27,60
2,corneal abrasion kickback x bruising eye work,62,29,62
3,complains lt thumb pain slamming door last nightat work dx acute lt thumb sparin injury,62,29,62
4,sts works microbio lab pt smelled paint thinner became dizzy lightheaded dx inhalant exposure w dizziness,55,25,55
5,l shoulder pain lifting pt work x shoulder strain,71,37,71
6,pain l eye working piece wood went eye dx corneal abrasion,66,33,66
7,dx forearm laceration p cut x work,62,27,60
8,c lbp lifting heavy equipment engine block work dx acute lbp,71,37,71
9,work tonight pt scratched pt way hospital dx superficial fingernail scratches forearm,12,0,11


### Calculate metrics

In [68]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,balanced_accuracy_score
import pandas as pd

def compute_metrics(pred):
    labels = pred.true_event
    preds = pred.preds_event
    acc = accuracy_score(labels, preds)
    bal_acc = balanced_accuracy_score(labels, preds)
    precision = precision_score(labels,preds,average='macro')
    recall = recall_score(labels,preds,average='macro')
    f1 = f1_score(labels,preds,average='macro')
    return {
        'accuracy': acc,
        'balanced_accuracy':bal_acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

compute_metrics(results)

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.8526144852224748,
 'balanced_accuracy': 0.6106155926140897,
 'f1': 0.6171119378801808,
 'precision': 0.6472400138296469,
 'recall': 0.6106155926140897}

## Evaluate Test data

In [100]:
%%time
MAX_LEN = 45
model_name = 'bert-base-uncased'

x_test = X_test_processed.tolist()
y_test = y_test.tolist()

results_test = _batch_predict(loaded_model,encoder,model_name,MAX_LEN,x_test,y_test)
results_test.head()

75687
CPU times: user 2h 20min 12s, sys: 1min 34s, total: 2h 21min 47s
Wall time: 22min 11s


Unnamed: 0,text,true_event,preds_encode,preds_event
0,f puncture wound fiinger attaching cap insulin syring used home care patient,55,25,55
1,contusion lt lower leg p mvc hit car guiding car gas pedal got stuck pushing door work yesterday,24,9,26
2,pt works quarry attempting dislodge large rock developed chest pains dx chest wall pain,71,37,71
3,walking work twisted lt ankle later right knee dx left ankle knee sprain,73,39,73
4,c low back pain lifting box work today dx left sided low back pain,71,37,71


In [101]:
results_test.head()

Unnamed: 0,text,true_event,preds_encode,preds_event
0,f puncture wound fiinger attaching cap insulin syring used home care patient,55,25,55
1,contusion lt lower leg p mvc hit car guiding car gas pedal got stuck pushing door work yesterday,24,9,26
2,pt works quarry attempting dislodge large rock developed chest pains dx chest wall pain,71,37,71
3,walking work twisted lt ankle later right knee dx left ankle knee sprain,73,39,73
4,c low back pain lifting box work today dx left sided low back pain,71,37,71


In [102]:
compute_metrics(results_test)

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.8556687409991148,
 'balanced_accuracy': 0.5998814422117884,
 'f1': 0.6066925496506665,
 'precision': 0.6256779684299058,
 'recall': 0.5998814422117884}

In [107]:
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

encodings_x_test =  tokenizer(x_test, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')
y_test_encode = np.asarray(encoder.transform(y_test))
tfdataset_test = construct_tfdataset(encodings_x_test,y_test_encode).batch(16)

print("Evaluating Test data...")
test_score = loaded_model.evaluate(tfdataset_test, steps = validation_steps,batch_size=16)
print("Test loss: ", test_score[0])
print("Test accuracy: ", test_score[1])


Evaluating Test data...


  [n for n in tensors.keys() if n not in ref_input_names])


Test loss:  0.5342479348182678
Test accuracy:  0.8543398976325989
