In [None]:
import sagemaker

# Bootstrap a session first so the default bucket can be looked up.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role, then rebuild the session bound to the chosen bucket.
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summarise the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
!pip install transformers datasets==1.18.4 accelerate==0.20.3

In [None]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.debugger import ProfilerConfig, DebuggerHookConfig, Rule, ProfilerRule, rule_configs
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace
import transformers
from transformers import AutoTokenizer
from datasets import Dataset


import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from textwrap import wrap

import boto3
import pprint
import time

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
%%time
# Load the labelled training CSV from S3 and apply the basic clean-up.

bucket = 'llmtraining'
data_key1 = 'LLM_TrainingData_071423.csv'
s3 = boto3.client('s3')

# Stream the object body straight into pandas, then drop the 'Unnamed: 0'
# column (presumably a stray index column from a previous CSV export - verify)
# and every row that contains a missing value.
obj_props = s3.get_object(Bucket=bucket, Key=data_key1)
df_orig = (
    pd.read_csv(obj_props['Body'], index_col=False)
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

# Quick sanity check of dtypes / non-null counts, then preview the first rows.
df_orig.info()
df_orig.head()

In [None]:
# Narrow the cleaned frame down to the two columns used for classification.
df = df_orig
keep_cols = ['text', 'labels']
df2 = df.loc[:, keep_cols]
# If the raw columns carry other names (e.g. 'summary' / 'code'), rename them
# to 'text' / 'labels' before this selection.
df2.info()
df2.head()

In [None]:
# Build a Hugging Face Dataset from the pandas frame and turn the string class
# labels into integer ids via class_encode_column.
#
# preserve_index=False: rows were dropped upstream with dropna(), so the pandas
# index may no longer be a default range; without this flag that index would be
# carried into the dataset as an extra '__index_level_0__' column.

df_dataset = Dataset.from_pandas(df2, preserve_index=False)
df_dataset = df_dataset.class_encode_column('labels')

In [None]:
# Hold out 20% of the rows for evaluation using the datasets split helper.
# A fixed seed keeps the train/test partition identical across kernel restarts,
# so uploaded data and reported metrics are reproducible.
SPLIT_SEED = 42

# NOTE: df_dataset is rebound here from the encoded dataset to the container
# holding both splits; to re-run this cell, re-run the encoding cell first.
df_dataset = df_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = df_dataset['train']
test_dataset = df_dataset['test']

In [None]:
# S3 key prefix under which the prepared splits are stored
s3_prefix = 'samples/datasets/scibert_multiclass'

# short label for this dataset (used to build the training job name)
dataset_name = 'sample_data'

# checkpoint id used for the tokenizer and passed on as the model to fine-tune
model_id = 'allenai/scibert_scivocab_uncased'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint; the maximum
# sequence length is fixed at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    model_max_length=512,
)

# Batch-wise tokenization helper for Dataset.map
def tokenize(batch):
    """Tokenize the 'text' entries of a batch.

    Every sequence is padded to the tokenizer's maximum length and longer
    inputs are truncated, using the module-level ``tokenizer``.
    """
    texts = batch['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Run the tokenizer over both splits, one batch at a time.
train_dataset = train_dataset.map(function=tokenize, batched=True)
test_dataset = test_dataset.map(function=tokenize, batched=True)

# Expose only the model inputs and labels, formatted as PyTorch tensors.
torch_columns = ('input_ids', 'attention_mask', 'labels')
train_dataset.set_format(type='torch', columns=list(torch_columns))
test_dataset.set_format(type='torch', columns=list(torch_columns))

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

# Filesystem handle used by save_to_disk to write directly to S3.
s3 = S3FileSystem()

# Destination URIs for the two splits inside the session bucket.
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# Persist the train split first, then the test split.
train_dataset.save_to_disk(dataset_path=training_input_path, fs=s3)
test_dataset.save_to_disk(dataset_path=test_input_path, fs=s3)

In [None]:
# Report where each split was written.
print(
    f'Uploaded training data to {training_input_path}',
    f'Uploaded testing data to {test_input_path}',
    sep='\n',
)

In [None]:
# START HERE IF DATA IS ALREADY IN S3
# Re-declares the settings and S3 locations used by the training cells, so the
# preprocessing cells can be skipped (the session cell must still have been run
# because `sess` is needed).

dataset_name = 'sample_data'
s3_prefix = 'samples/datasets/scibert_multiclass'
model_id = 'allenai/scibert_scivocab_uncased'

test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'

In [None]:
from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig

# Training Compiler configuration with its default settings.
compiler_config = TrainingCompilerConfig()

# Hyperparameters handed to the estimator and forwarded to the training job.
hyperparameters = dict(
    epochs=3,                # number of training epochs
    train_batch_size=20,     # batch size for training
    eval_batch_size=24,      # batch size for evaluation
    learning_rate=3e-5,      # learning rate used during training
    model_id=model_id,       # pre-trained model
    fp16=True,               # use 16-bit (mixed) precision training
)

# Base name for the SageMaker training job (passed as base_job_name).
# SageMaker training job names accept only alphanumerics and hyphens, so any
# underscore coming from dataset_name (e.g. 'sample_data') is mapped to a
# hyphen; otherwise job creation is rejected when fit() is called.
job_name = f"scibert-{dataset_name}".replace('_', '-')

In [None]:
# Regexes used to scrape metrics from the training logs.
#
# Every metric is logged as "'<name>': <number>". The number pattern accepts an
# integer part, an optional fractional part and an optional exponent, so values
# such as 0.6931, 3e-05 and 2.94e-05 are captured in full. (The previous pattern
# used an unescaped '.', which cut '2.94e-05' down to '2.94'.) A raw string is
# used so that the backslash is not treated as a Python string escape.
_METRIC_NUMBER = r"([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"

metric_definitions = [
    {'Name': metric_name, 'Regex': f"'{metric_name}': {_METRIC_NUMBER},?"}
    for metric_name in (
        'loss',
        'learning_rate',
        'eval_loss',
        'eval_accuracy',
        'eval_runtime',
        'eval_samples_per_second',
        'epoch',
    )
]

In [None]:
# Hugging Face estimator describing the fine-tuning job.
huggingface_estimator = HuggingFace(
    # fine-tuning script and the directory it lives in
    entry_point='train.py',
    source_dir='./scripts',
    # compute: instance type and number of instances
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    # base name of the training job and the IAM role it runs under
    base_job_name=job_name,
    role=role,
    # transformers / PyTorch / Python versions used in the training job
    transformers_version='4.11.0',
    pytorch_version='1.9.0',
    py_version='py38',
    # hyperparameters and Training Compiler configuration for the job
    hyperparameters=hyperparameters,
    compiler_config=compiler_config,
    # profiler disabled and debugger hook turned off
    disable_profiler=True,
    debugger_hook_config=False,
    # regexes used to extract metrics from the job logs
    metric_definitions=metric_definitions,
)

In [None]:
# Map each input channel to the S3 URI of its uploaded split.
data = dict(train=training_input_path, test=test_input_path)

# Launch the training job with both channels as input.
huggingface_estimator.fit(data)

In [None]:
from sagemaker import TrainingJobAnalytics

# Fetch the metrics recorded for the estimator's latest training job as a DataFrame.
latest_job_name = huggingface_estimator.latest_training_job.name
df_metrics = TrainingJobAnalytics(training_job_name=latest_job_name).dataframe()

In [None]:
import pandas as pd
# Reshape the long metrics table to wide form: one row per timestamp, one
# column per metric name, with 'timestamp' restored as a regular column.
df_wide = df_metrics.pivot(
    index=['timestamp'],
    columns=['metric_name'],
    values='value',
).reset_index()