# Injury classification with SageMaker

In [1]:
#https://wandb.ai/jack-morris/david-vs-goliath/reports/Does-Model-Size-Matter-A-Comparison-of-BERT-and-DistilBERT--VmlldzoxMDUxNzU
#https://towardsdatascience.com/working-with-hugging-face-transformers-and-tf-2-0-89bf35e3555a
#https://towardsdatascience.com/scale-neural-network-training-with-sagemaker-distributed-8cf3aefcff51
#https://towardsdatascience.com/how-to-reduce-training-time-for-a-deep-learning-model-using-tf-data-43e1989d2961
#https://towardsdatascience.com/hugging-face-transformers-fine-tuning-distilbert-for-binary-classification-tasks-490f1d192379
#https://towardsdatascience.com/to-distil-or-not-to-distil-bert-roberta-and-xlnet-c777ad92f8
#https://ymeadows.com/articles/fine-tuning-transformer-based-language-models

### Smart Batching
#https://towardsdatascience.com/divide-hugging-face-transformers-training-time-by-2-or-more-21bf7129db9q-21bf7129db9e
#https://www.youtube.com/watch?v=ynOZUNnbEWU

In [2]:
!python --version

Python 3.7.10


In [3]:
%%capture
!pip install tensorflow
!pip install transformers
!pip install nltk

In [4]:
import transformers
import nltk
print(transformers.__version__)

4.9.2


In [5]:
import nltk 

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
import pandas as pd
import tensorflow as tf
import re
import nltk
import string
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras import activations, optimizers, losses
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np
import sagemaker
from sagemaker import get_execution_role
import joblib 
import collections

In [7]:
print(tf.__version__)

2.6.0


In [8]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
bucket

Couldn't call 'get_role' to get Role ARN from role name AmazonSageMaker-ExecutionRole-20210423T122185 to get Role path.
Assuming role was created in SageMaker AWS console, as the name contains `AmazonSageMaker-ExecutionRole`. Defaulting to Role ARN with service-role in path. If this Role ARN is incorrect, please add IAM read permissions to your role or supply the Role Arn directly.


'sagemaker-us-east-1-979294212144'

In [9]:
%store -z
%store

Stored variables and their in-db values:


## Step 1 - Import and split data

In [10]:
def get_samples(ratio,train):
    great_than2_classes = train['event'].value_counts()[train['event'].value_counts() >2].index
    train = train[train['event'].isin(great_than2_classes.to_list())]
    train_samples, _ = train_test_split(train,train_size=ratio,random_state=42,stratify=train['event'])
    print("nb classes in samples",train_samples['event'].nunique())
    print("nb oservations:",train_samples.shape)
    #print(train_samples['event'].value_counts())

    return train_samples

In [11]:
def get_data(data_file,is_sample=None,ratio=None,is_test_split=False):
  
    train = pd.read_csv(data_file)
    if is_sample:
        train = get_samples(ratio = ratio, train=train)
    great_than2_classes = train['event'].value_counts()[train['event'].value_counts() >5].index 
    train_filter = train[train['event'].isin(great_than2_classes.to_list())]
    
    print("nb classes in final data:",train_filter['event'].nunique())

    X = train_filter['text']
    y = train_filter['event']

    print(f"X.shape {X.shape} y.shape : {y.shape}")

    X_train_valid,X_test,y_train_valid, y_test = train_test_split(X,y,train_size=0.9,random_state=42,stratify=y)
    
    if is_test_split:
        X_train,X_valid,y_train,y_valid = train_test_split(X_train_valid,y_train_valid,train_size=0.8,random_state=42,stratify=y_train_valid)

    if is_test_split :
        print(f"X_train shape {X_train.shape} y_train shape : {y_train.shape}")
        print(f"X_valid shape {X_valid.shape} y_valid shape : {y_valid.shape}")
        print(f"X_test shape {X_test.shape} y_test shape : {y_test.shape}")
        
        return {
          'train': (X_train,y_train),
          'valid': (X_valid,y_valid),
          'test': (X_test,y_test)
      }

    else:
        print(f"X_train shape {X_train_valid.shape} y_train shape : {y_train_valid.shape}")
        print(f"X_valid shape {X_test.shape} y_valid shape : {y_test.shape}")
        
        return {
          'train': (X_train_valid,y_train_valid),
          'valid': (X_test,y_test),
      }


In [12]:
pd.options.display.max_colwidth = 3100
data = get_data(data_file='./data/raw/train.csv',is_sample=True,ratio=0.05)
#data = get_data(is_test_split=True)
X_train, y_train = data['train']
X_valid,y_valid = data['valid']
#X_test,y_test = data['test']

nb classes in samples 41
nb oservations: (7697, 4)
nb classes in final data: 28
X.shape (7668,) y.shape : (7668,)
X_train shape (6901,) y_train shape : (6901,)
X_valid shape (767,) y_valid shape : (767,)


## Step 2 - Preprocess Data

In [13]:
import numpy as np
print('classes in train :',len(np.unique(y_train)))
print('classes in valid :',len(np.unique(y_valid)))

CLASSES = y_train.unique().tolist()
print(CLASSES)

classes in train : 28
classes in valid : 28
[62, 71, 63, 11, 43, 55, 42, 52, 60, 73, 13, 66, 12, 53, 64, 27, 24, 99, 26, 72, 70, 51, 44, 41, 31, 78, 32, 23]


In [14]:
def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """

    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)

    # convert text to lowercase
    text = text.strip().lower()

    # remove all non-ASCII characters:
    text = re.sub(r'[^\x00-\x7f]', r'', text)

    # replace punctuation characters with spaces
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    text = " ".join(text.split())
    return text

In [15]:
def remove_useless_words(text,useless_words):
    sentence = [word for word in word_tokenize(text)]
    sentence_stop = [word for word in sentence if word not in useless_words]

    text = " ".join(sentence_stop)

    return text

In [16]:
def preprocess_data(X):
    """ Preprocess : cleaning and remove stop words"""

    X = X.apply(lambda x: re.sub(r'\d+', '', x))
    X = X.apply(lambda x: clean_text(x))

    stopwords = nltk.corpus.stopwords.words('english')
    useless_words = stopwords + list(string.punctuation) + ['yom', 'yof', 'yowm', 'yf', 'ym', 'yo']
    # print("useless word : ",useless_words)
    X = X.apply(lambda x: remove_useless_words(x,useless_words))

    return X

In [17]:
%%time
X_train_processed = preprocess_data(X_train)
X_valid_processed = preprocess_data(X_valid)

print("after preprocessing...")
print(X_train_processed.head(5))

after preprocessing...
78811                                                yomcontusion foot metal fell foot work
29443            c hip pain pulling heavy trash bags work friday dx strain right hip flexor
146033                                  hurt chest leaning fish aquarium dx contusion chest
92679                                   assaulted work punched face customer contusion face
124907    reports sus laceration rt palm lost footing whilecoming ladder dx palm laceration
Name: text, dtype: object
CPU times: user 1.45 s, sys: 2.89 ms, total: 1.46 s
Wall time: 1.46 s


## Test Data

In [18]:

##TODO Needs to fix sampling strategy to work for percentage/ no har coded classes

def get_test_data(test_data_file, is_sample, ratio):
    
    test_data = pd.read_csv(test_data_file)
    test_data = test_data[~test_data['event'].isin([10,29,30,59,74])]
    test_data = test_data[['text','event']]
    
    if is_sample:
        test_data = get_samples(ratio = ratio, train=test_data) 
        great_than5_classes = test_data['event'].value_counts()[test_data['event'].value_counts() >5].index
        test_data = test_data[test_data['event'].isin(great_than5_classes.to_list())]
    
    print("nb classes in final data:",test_data['event'].nunique())
    print(f"test_data_small.shape {test_data.shape}")
    
    return test_data

In [19]:
test_data = get_test_data('./data/raw/test_data.csv',is_sample=True, ratio=0.05)
X_test, y_test = test_data['text'], test_data['event']
X_test_processed = preprocess_data(X_test)
X_test_processed

nb classes in samples 39
nb oservations: (3784, 2)
nb classes in final data: 28
test_data_small.shape (3768, 2)


37427                                                              male hurt bending work dx knee pain b
3526                                    works construction door fell hitting head loc c neck pain chi ms
8292       c l finger pain work l th digit removing door panel crushed dx finger contu subungal hematoma
63604                                                 wks lows heavy lifting h worsening lbp atypical cp
11228    f pt work yesterday slipped fell onto floor hitting head loc altered mental status today dx chi
                                                      ...                                               
36432                               drives subject bus lots lifting pushing people wheelchairs back pain
16967                                          work handling concrete got rash hands contact dermat itis
49205                                                                                sexual assault work
2439                               work hit open freeze

In [20]:
train_classes =y_train.unique().tolist()
test_classes = y_test.unique().tolist()

print(sorted(train_classes))
print(sorted(test_classes))

[11, 12, 13, 23, 24, 26, 27, 31, 32, 41, 42, 43, 44, 51, 52, 53, 55, 60, 62, 63, 64, 66, 70, 71, 72, 73, 78, 99]
[11, 12, 13, 23, 24, 26, 27, 31, 32, 41, 42, 43, 44, 51, 52, 53, 55, 60, 62, 63, 64, 66, 70, 71, 72, 73, 78, 99]


In [21]:
text_processed = pd.DataFrame({'text':X_test_processed,'event':y_test})
text_processed.to_csv('./data/test/test_processed.csv')

## Step 3 - Feature Extraction

### Tokenize and Encode Train and Test data

In [22]:
%%time
from transformers import  AutoTokenizer, DistilBertTokenizerFast,BertTokenizerFast,RobertaTokenizerFast
MAX_LEN = 45


MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

x_train = X_train_processed.to_list()
x_valid = X_valid_processed.to_list()


train_encodings = tokenizer(x_train, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')
valid_encodings = tokenizer(x_valid, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')


CPU times: user 1.11 s, sys: 512 ms, total: 1.63 s
Wall time: 1.37 s


In [23]:
print(train_encodings['input_ids'][0])
print(train_encodings['attention_mask'][0])

tf.Tensor(
[  101 10930 12458 12162 14499  3329  3384  3062  3329  2147   102     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0], shape=(45,), dtype=int32)
tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0], shape=(45,), dtype=int32)


### Encoding labels

In [24]:
from sklearn.preprocessing import LabelEncoder

print("Encoding Labels .....")
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_encode = np.asarray(encoder.transform(y_train))
y_valid_encode = np.asarray(encoder.transform(y_valid))

Encoding Labels .....


In [25]:
id2label = { id:str(label) for id, label in enumerate(encoder.classes_)}
label2id = { str(label):id for id, label in enumerate(encoder.classes_)}

In [26]:
print('classes in train :',len(np.unique(y_train_encode)))
print('classes in valid :',len(np.unique(y_valid_encode)))

classes in train : 28
classes in valid : 28


In [27]:
import os
data_path='./data/train/'
test_data_path = './data/test'
os.makedirs(data_path,exist_ok=True)

from pickle import dump
#for training process
dump(encoder,open(os.path.join(data_path,'encode.pkl'),'wb'))

#for evaluation process
dump(encoder,open(os.path.join(test_data_path,'encode.pkl'),'wb'))

In [28]:
print(encoder.classes_.tolist())

[11, 12, 13, 23, 24, 26, 27, 31, 32, 41, 42, 43, 44, 51, 52, 53, 55, 60, 62, 63, 64, 66, 70, 71, 72, 73, 78, 99]


### Create TF Dataset

In [29]:
%%time
def construct_tfdataset(encodings, y=None):
    if y is not None:
        return tf.data.Dataset.from_tensor_slices((dict(encodings),y))
    else:
        # this case is used when making predictions on unseen samples after training
        return tf.data.Dataset.from_tensor_slices(dict(encodings))
    
train_tfdataset = construct_tfdataset(train_encodings, y_train_encode)
valid_tfdataset = construct_tfdataset(valid_encodings, y_valid_encode)

CPU times: user 4.15 ms, sys: 20 µs, total: 4.17 ms
Wall time: 3.63 ms


### Create TF Records File

In [30]:

def _save_feature_as_tfrecord(tfdataset,file_path):
    """ Helper function to save the tf dataset as tfrecords"""
    

    def single_example_data(encoding,label_id):

        input_ids =encoding['input_ids']
        attention_mask = encoding['attention_mask']

        tfrecord_features = collections.OrderedDict()

        def _int64_feature(value):
            """Returns an int64_list from a bool / enum / int / uint."""
            return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

        tfrecord_features['input_ids'] = _int64_feature(input_ids)
        tfrecord_features['attention_mask'] = _int64_feature(attention_mask)
        tfrecord_features['label_ids'] =  _int64_feature([label_id])

        _example = tf.train.Example(features=tf.train.Features(feature=tfrecord_features))

        return _example.SerializeToString()

    def data_generator():
        for features in tfdataset:
            yield single_example_data(*features)

    serialized_tfdataset = tf.data.Dataset.from_generator(
        data_generator, output_types=tf.string, output_shapes=())        

    writer = tf.data.experimental.TFRecordWriter(file_path)
    writer.write(serialized_tfdataset)


        

In [30]:
%%time

# training data
_save_feature_as_tfrecord(train_tfdataset,'./data/train/train.tfrecord')

#validation data
_save_feature_as_tfrecord(valid_tfdataset,'./data/valid/valid.tfrecord')

Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`
CPU times: user 1min 26s, sys: 224 ms, total: 1min 26s
Wall time: 1min 26s


### Upload data to S3

In [31]:
prefix = 'injury-data/training'
tfrecord_train_location = sagemaker_session.upload_data(path = './data/train',
                                                      bucket = bucket,
                                                      key_prefix = prefix)
tfrecord_train_location

's3://sagemaker-us-east-1-979294212144/injury-data/training'

In [32]:
prefix = 'injury-data/validation'
tfrecord_valid_location = sagemaker_session.upload_data(path = './data/valid',
                                                      bucket = bucket,
                                                      key_prefix = prefix)
tfrecord_valid_location

's3://sagemaker-us-east-1-979294212144/injury-data/validation'

## Step 4 - Run BERT Training Job

### Training output path

In [31]:
prefix = 'output'
output_path = 's3://{}/{}/'.format(bucket, prefix )
output_path

's3://sagemaker-us-east-1-979294212144/output/'

### Training script

In [32]:
# TensorFlow 2.3 script
!pygmentize './src/train.py'

[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
    
[37m# implement pip as a subprocess:[39;49;00m
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mtransformers[39;49;00m[33m'[39;49;00m])
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mnltk[39;49;00m[33m'[39;49;00m])
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mtensorflow[39;49;00m[33m'[39;49;00m])

[34mimport[39;49;00m [04m[36mos

### Hyperparameters

In [33]:
num_records = len(X_train)
num_valid_records = len(X_valid)
max_len = MAX_LEN
epochs = 5
batch_size = 16
valid_batch_size = 16
steps_per_epoch = num_records // batch_size
validation_steps = num_valid_records // valid_batch_size
learning_rate = 5e-5
optimizer = 'adam'

In [34]:
print(num_records)
print(steps_per_epoch)
print(validation_steps)

6901
431
47


In [38]:
%%time
#%run ./src/train.py --train ./data/train --validation ./data/valid --epochs 1 --num_records 6901 --steps_per_epoch 431 --validation_steps 47

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs


### Using Hugging Face Estimator to start training

In [39]:
import time
from time import gmtime, strftime
from sagemaker.huggingface import HuggingFace

model_name = 'BaseBERT'
job_name_prefix = f"training-{model_name}"
timestamp = strftime("-%m-%d-%M-%S", gmtime())

job_name = job_name_prefix + timestamp

_estimator = HuggingFace(
        base_job_name  = job_name,
        entry_point="train.py",
        source_dir = "./src/",
        role=role,
        instance_count=1,
        volume_size = 20,
        max_run = 18000,
        instance_type='ml.p3.2xlarge',
        transformers_version = "4.4",
        tensorflow_version  = "2.4",
        py_version="py37",
        output_path = output_path,
        hyperparameters = {
                "model_name": model_name,
                "num_records":  num_records,
                "max_len":max_len,
                "epochs":int(epochs),
                "learning_rate":float(learning_rate),
                "batch_size":int(batch_size),
                "valid_batch_size":valid_batch_size,
                "steps_per_epoch": steps_per_epoch,
                "validation_steps": validation_steps,
                "optimizer":optimizer
                },
        metric_definitions = [{'Name':'train:loss','Regex':'loss: ([0-9\\.]+)'},
                                    {'Name':'train:accuracy','Regex':'acc: ([0-9\\.]+)'},
                                    {'Name':'validation:loss','Regex':'val_loss: ([0-9\\.]+)'},
                                    {'Name':'validation:accuracy','Regex':'val_acc: ([0-9\\.]+)'}],
        enable_sagemaker_metrics = True
    )


train_data = sagemaker.inputs.TrainingInput(
    tfrecord_train_location, # Replace None
    distribution='FullyReplicated'
)

validation_data = sagemaker.inputs.TrainingInput(
    tfrecord_valid_location, # Replace None
    distribution='FullyReplicated'
)

_estimator.fit({'train':train_data,'validation':validation_data}, wait=False)

### Download the trained distilbert model

In [74]:
print(_estimator.latest_training_job.name)

training-BaseBERT-08-25-44-56-2021-08-25-15-44-56-919


In [75]:
from sagemaker.s3  import S3Downloader
import os

def download_model(sagemaker_session,job_name):
    
    if job_name is not None :
        modeldesc = sagemaker_session.describe_training_job(job_name)
        s3_model_path = modeldesc['ModelArtifacts']['S3ModelArtifacts']
    
    os.makedirs(f"./output/model/{job_name}/",exist_ok=True)

    S3Downloader.download(
        s3_uri=s3_model_path, # s3 uri where the trained model is located
        local_path=f"./output/model/{job_name}/", # local path where *.targ.gz is saved
        sagemaker_session=sagemaker_session # sagemaker session used for training the model
    )
    
    return s3_model_path, modeldesc['HyperParameters']

In [76]:
#downloading model
job_name = 'training-BaseBERT-08-25-44-56-2021-08-25-15-44-56-919'
s3_model_path, hp = download_model(sagemaker_session,job_name)

print(s3_model_path)
print(hp)

s3://sagemaker-us-east-1-979294212144/output/training-BaseBERT-08-25-44-56-2021-08-25-15-44-56-919/output/model.tar.gz
{'batch_size': '16', 'epochs': '5', 'learning_rate': '5e-05', 'max_len': '45', 'model_name': '"BaseBERT"', 'num_records': '6901', 'optimizer': '"adam"', 'sagemaker_container_log_level': '20', 'sagemaker_job_name': '"training-BaseBERT-08-25-44-56-2021-08-25-15-44-56-919"', 'sagemaker_program': '"train.py"', 'sagemaker_region': '"us-east-1"', 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-979294212144/training-BaseBERT-08-25-44-56-2021-08-25-15-44-56-919/source/sourcedir.tar.gz"', 'steps_per_epoch': '431', 'valid_batch_size': '16', 'validation_steps': '47'}


### Extract and load the model

In [35]:
import tarfile
from transformers.optimization_tf import AdamWeightDecay
from tensorflow.keras.optimizers.schedules import PolynomialDecay

def extract_data_load_model(job_name,model_name):

    t = tarfile.open(f'./output/model/{job_name}/model.tar.gz', 'r:gz')
    t.extractall(path=f'./output/model/{job_name}')
    _model = tf.keras.models.load_model(f"./output/model/{job_name}/{model_name}",custom_objects={'AdamWeightDecay':AdamWeightDecay})
    
    return _model

In [36]:
model_name = 'BaseBERT'
job_name = 'training-BaseBERT-08-25-44-56-2021-08-25-15-44-56-919'
loaded_model = extract_data_load_model(job_name,model_name)


## Step 5 - Evaluate the model

### Load training and validation data

In [37]:
def _load_data(train_dir,valid_dir,MAX_LEN,epochs,batch_size,valid_batch_size,steps_per_epoch,validation_steps):
    """ Helper function to load,parse and create input data pipeline from TFRecords"""
          
    train_file = os.path.join(train_dir,"train.tfrecord") 
    valid_file = os.path.join(valid_dir,"valid.tfrecord")
    
    # Create a description of the features.
    feature_description = {
        'input_ids': tf.io.FixedLenFeature([MAX_LEN], tf.int64),
        'attention_mask': tf.io.FixedLenFeature([MAX_LEN], tf.int64),
        'label_ids': tf.io.FixedLenFeature([], tf.int64),
    }
        
    def _parse_function(example_proto):

        # Parse the input `tf.train.Example` proto using the dictionary above.
        parsed  = tf.io.parse_single_example(example_proto, feature_description)

        return {'input_ids':parsed['input_ids'],'attention_mask':parsed['attention_mask']},parsed['label_ids']
        
    
    train_dataset = tf.data.TFRecordDataset(train_file)
    train_dataset = train_dataset.repeat(epochs * steps_per_epoch)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    train_dataset = train_dataset.map(_parse_function,num_parallel_calls=tf.data.AUTOTUNE)
    
    train_dataset = train_dataset.batch(batch_size)
    
    
    
    valid_dataset = tf.data.TFRecordDataset(valid_file)
    valid_dataset = valid_dataset.repeat(epochs * validation_steps)
    valid_dataset = valid_dataset.prefetch(tf.data.AUTOTUNE)
    valid_dataset = valid_dataset.map(_parse_function,num_parallel_calls=tf.data.AUTOTUNE)
    valid_dataset = valid_dataset.batch(valid_batch_size)
    
   
    return train_dataset, valid_dataset

In [38]:
train_dir = './data/train'
valid_dir= './data/valid'
train_dataset,valid_dataset = _load_data(train_dir,valid_dir,MAX_LEN,epochs,batch_size,valid_batch_size,steps_per_epoch,validation_steps)

### Evaluate the model

In [39]:

def _evaluate_model(loaded_model):
    print("Evaluating Training data...")
    train_score = loaded_model.evaluate(train_dataset,
                                     steps = steps_per_epoch,
                                     batch_size=batch_size)

    print("Training loss: ", train_score[0])
    print("Training accuracy: ", train_score[1])

    print("Evaluating Validation data...")
    valid_score = loaded_model.evaluate(valid_dataset, steps = validation_steps,batch_size=valid_batch_size)
    print("Validation loss: ", valid_score[0])
    print("Validation accuracy: ", valid_score[1])
    
_evaluate_model(loaded_model)

Evaluating Training data...
Training loss:  0.07934041321277618
Training accuracy:  0.9795533418655396
Evaluating Validation data...
Validation loss:  0.860435426235199
Validation accuracy:  0.7646276354789734


### Using the fine-tuned model to predict new samples

In [None]:
MODEL_NAME = 'bert-base-uncased'

def _create_predictor(model, encoder,model_name, max_len,text):
    tkzr = AutoTokenizer.from_pretrained(model_name)
    x = [text]
    encodings =  tkzr(x, max_length=max_len, truncation=True, padding='max_length',return_tensors='tf')
    tfdataset = construct_tfdataset(encodings)
    tfdataset = tfdataset.batch(1)
    preds = model.predict(tfdataset)
    categories = encoder.classes_.tolist()
    enc = np.argmax(preds[0])
    
    return {     'text' : x,
                 'predict_proba' : preds[0][np.argmax(preds[0])],
                 'predicted_class' : categories[np.argmax(preds)]                             
                }

In [None]:
x = 'sp lt ankle going steps twisted ankle work'
_create_predictor(loaded_model, encoder,MODEL_NAME, MAX_LEN,x)

## Step 6 - Save Predictions and calculate metrics

In [42]:
%%time
import numpy as np

model_name = 'bert-base-uncased'
x = X_valid_processed.tolist()
y = y_valid.tolist()

def _batch_predict(model, encoder,model_name, max_len,x,y) :
    tkzr = AutoTokenizer.from_pretrained(model_name)
    encodings_x =  tkzr(x, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')
    tfdataset = construct_tfdataset(encodings_x).batch(32)
    preds = loaded_model.predict(tfdataset)
    predictions_encode = pd.DataFrame(data=preds).apply(lambda x: np.argmax(x),axis=1)
    categories = encoder.classes_.tolist()
    predictions_event= predictions_encode.apply(lambda x:categories[x])


    print(len(predictions_event))
    results = pd.DataFrame({'text': x,
                      'true_event':y,
                      'preds_encode': predictions_encode,
                      'preds_event':predictions_event
                            })


    return results

results = _batch_predict(loaded_model,encoder,model_name,MAX_LEN,x,y)
results.to_csv('solution.csv')
results.head(10)

767
CPU times: user 1min 28s, sys: 1.93 s, total: 1min 30s
Wall time: 25.5 s


Unnamed: 0,text,true_event,preds_encode,preds_event
0,work lifting heavy objects work strauined upper arm,71,23,71
1,slipped fell ice hitting back head pavement work dx chi,42,10,42
2,lbp rad post leg p mech fall parking lot work sciatica,42,10,42
3,j slipped fell wet floor neck shoulder pain loc dx cervical strain,42,10,42
4,work c low back pain,70,22,70
5,breaking fight work pulling person another thumb pain sprain,11,0,11
6,ago delivering papers walking back car jumped twisted foot fx,73,25,73
7,acc shot nail nail gun thigh fb removal,62,18,62
8,neck back pain rearended mva sb driver mailbox work,26,5,26
9,unspecified lifting work felt pull lower back lumbar strain,71,23,71


### Calculate metrics

In [43]:
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,balanced_accuracy_score
import pandas as pd

def compute_metrics(pred):
    labels = pred.true_event
    preds = pred.preds_event
    acc = accuracy_score(labels, preds)
    bal_acc = balanced_accuracy_score(labels, preds)
    precision = precision_score(labels,preds,average='macro')
    recall = recall_score(labels,preds,average='macro')
    f1 = f1_score(labels,preds,average='macro')
    return {
        'accuracy': acc,
        'balanced_accuracy':bal_acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

compute_metrics(results)

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.7679269882659713,
 'balanced_accuracy': 0.6554404070118658,
 'f1': 0.6423576584345757,
 'precision': 0.6657096900587514,
 'recall': 0.6554404070118658}

## Evaluate Test data

In [89]:
y_test.index

Int64Index([37427,  3526,  8292, 63604, 11228, 52758,  4024,  3022, 68461,
            15150,
            ...
            48480, 66479, 48145, 13776, 38542, 36432, 16967, 49205,  2439,
             9297],
           dtype='int64', length=3768)

In [90]:
%%time
MAX_LEN = 45
model_name = 'bert-base-uncased'

x_test = X_test_processed.reset_index(drop=True).tolist()
y_test = y_test.reset_index(drop=True)

tkzr = AutoTokenizer.from_pretrained(model_name)
encodings_x =  tkzr(x_test, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')
tfdataset = construct_tfdataset(encodings_x).batch(32)
preds = loaded_model.predict(tfdataset)
preds

CPU times: user 8min 54s, sys: 8.02 s, total: 9min 2s
Wall time: 2min 30s


array([[1.7098183e-04, 2.0497383e-04, 1.5861157e-04, ..., 9.5997047e-01,
        6.7987824e-03, 1.0380523e-03],
       [2.1427622e-06, 1.9612321e-06, 1.1960501e-05, ..., 1.4669994e-06,
        5.5458358e-06, 8.3575196e-06],
       [1.6551414e-04, 2.6177961e-04, 1.9675975e-04, ..., 1.6934944e-05,
        1.6356539e-04, 6.3731556e-04],
       ...,
       [7.9736876e-01, 1.1166642e-01, 1.4137506e-03, ..., 3.2624155e-03,
        2.2344638e-04, 5.7809362e-03],
       [6.6859531e-05, 1.3940266e-04, 2.6153182e-04, ..., 1.2606778e-04,
        2.8488834e-05, 1.4067366e-04],
       [9.9847263e-01, 8.1823656e-04, 5.6473287e-05, ..., 5.2038631e-05,
        2.2533372e-06, 5.7162630e-05]], dtype=float32)

In [91]:
preds.shape

(3768, 28)

In [92]:
predictions_encode.isna().sum()

0

In [93]:
categories = encoder.classes_.tolist()
predictions_event= predictions_encode.apply(lambda x:categories[x])
predictions_event.shape


(3768,)

In [94]:
print(len(predictions_event))
len(x_test)

3768


3768

In [95]:
results = pd.DataFrame.from_dict({
                       'text': x_test,
                      'true_event':y_test,
                      'preds_encode': predictions_encode,
                      'preds_event':predictions_event})

In [96]:
results

Unnamed: 0,true_event,preds_encode,preds_event,text
0,73,25,73,male hurt bending work dx knee pain b
1,62,18,62,works construction door fell hitting head loc c neck pain chi ms
2,64,18,62,c l finger pain work l th digit removing door panel crushed dx finger contu subungal hematoma
3,71,23,71,wks lows heavy lifting h worsening lbp atypical cp
4,42,10,42,f pt work yesterday slipped fell onto floor hitting head loc altered mental status today dx chi
...,...,...,...,...
3763,71,23,71,drives subject bus lots lifting pushing people wheelchairs back pain
3764,55,16,55,work handling concrete got rash hands contact dermat itis
3765,11,0,11,sexual assault work
3766,63,19,63,work hit open freezer door c r shoulder pain dx acute r shoulder pain


In [97]:
%%time
MAX_LEN = 45
model_name = 'bert-base-uncased'

x_test = X_test_processed.reset_index(drop=True).tolist()
y_test = y_test.reset_index(drop=True)

results_test = _batch_predict(loaded_model,encoder,model_name,MAX_LEN,x_test,y_test)
results_test.head()

3768
CPU times: user 8min 53s, sys: 7.24 s, total: 9min
Wall time: 2min 30s


Unnamed: 0,text,true_event,preds_encode,preds_event
0,male hurt bending work dx knee pain b,73,25,73
1,works construction door fell hitting head loc c neck pain chi ms,62,18,62
2,c l finger pain work l th digit removing door panel crushed dx finger contu subungal hematoma,64,18,62
3,wks lows heavy lifting h worsening lbp atypical cp,71,23,71
4,f pt work yesterday slipped fell onto floor hitting head loc altered mental status today dx chi,42,10,42


In [98]:
compute_metrics(results_test)

{'accuracy': 0.8118365180467091,
 'balanced_accuracy': 0.7090264179661682,
 'f1': 0.7124110587776313,
 'precision': 0.7296497658918498,
 'recall': 0.7090264179661682}

In [99]:
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

encodings_x_test =  tokenizer(x_test, max_length=MAX_LEN, truncation=True, padding='max_length',return_tensors='tf')
y_test_encode = np.asarray(encoder.transform(y_test))
tfdataset_test = construct_tfdataset(encodings_x_test,y_test_encode).batch(16)

print("Evaluating Test data...")
test_score = loaded_model.evaluate(tfdataset_test, steps = validation_steps,batch_size=16)
print("Test loss: ", test_score[0])
print("Test accuracy: ", test_score[1])


Evaluating Test data...


  [n for n in tensors.keys() if n not in ref_input_names])


Test loss:  0.7126179337501526
Test accuracy:  0.8125
