# Custom Script with Supported Framework - HuggingFace

## 1. Training Process

In [None]:
import sagemaker

# S3 bucket used for this notebook's training outputs and checkpoints.
bucket = 'implementation-unsmile'

# IAM role that the training job and endpoints will run under.
role = sagemaker.get_execution_role()
# Build the session once, already bound to the project bucket. The earlier
# bare `sagemaker.Session()` was overwritten right away and only cost an
# extra client setup.
sess = sagemaker.Session(default_bucket=bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
from datetime import datetime

# Preview of the compact run timestamp (YYYYMMDDHHMMSS) used to tag runs.
now = datetime.now()
now.strftime('%Y%m%d%H%M%S')

In [None]:
from datetime import datetime

# Compact timestamp (YYYYMMDDHHMMSS) so every run gets its own S3 prefix.
now = datetime.now().strftime('%Y%m%d%H%M%S')

# Model ids considered for training; only `model_name` is used below.
model_list = [
    'beomi/KcELECTRA-base',
    'beomi/kcbert-base',
    'beomi/kcbert-large',
]
model_name = 'beomi/kcbert-base'

num_train_epochs = 20
per_device_train_batch_size = 16
data_preprocessed = 'preprocessed'

# Layout: <timestamp>_<model id after the slash>_<epochs>_<data tag>
prefix = '_'.join(
    [now, model_name.split('/')[1], str(num_train_epochs), data_preprocessed]
)

output_path = f's3://{bucket}/{prefix}/output'
checkpoint_url = f's3://{bucket}/{prefix}/checkpoints'

In [None]:
# Settings handed to the training job as hyperparameters:
# number of epochs, per-device batch size, and the model id to train.
hyperparameters = dict(
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    model_name=model_name,
)

In [None]:
from sagemaker.huggingface import HuggingFace

# Estimator that runs scripts/train.py on one ml.g4dn.xlarge spot instance.
huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='./scripts',
    instance_type='ml.g4dn.xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.12',
    pytorch_version='1.9',
    py_version='py38',
    # The estimator argument for the artifact location is `output_path`.
    # `output_dir` is not a documented estimator parameter, so the custom
    # S3 prefix computed above was not being applied.
    output_path=output_path,
    # Checkpoints are synced here so an interrupted spot job can resume.
    checkpoint_s3_uri=checkpoint_url,
    hyperparameters=hyperparameters,
    sagemaker_session=sess,
    use_spot_instances=True,
    # Per the SageMaker SDK docs both limits are in seconds, and max_wait
    # (spot waiting + training) must not be smaller than max_run.
    max_wait=360000,
    max_run=100000,
)

In [None]:
# Launch the training job. No input channels are passed to fit() here.
huggingface_estimator.fit()

## 2. Deployment Process

In [None]:
# Host the trained estimator's model on a single ml.m5.large instance.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
)

In [None]:
import re

def pre_processing(text: str) -> str:
    """Clean a raw comment before it is sent to the classifier.

    The text goes through three passes:

    1. Every character outside the ranges ``ㄱ-힣``, ``a-z``, ``A-Z``,
       ``0-9`` and the space character is replaced by a space.
    2. Runs of spaces are collapsed to a single space.
    3. Runs of the same character longer than three are cut to exactly
       three (e.g. ``"ㅋㅋㅋㅋㅋ"`` becomes ``"ㅋㅋㅋ"``).

    Leading/trailing spaces are kept as-is.

    Args:
        text: The raw input string. May be empty.

    Returns:
        The cleaned string. An empty input yields an empty string; the
        previous implementation indexed ``text[0]`` and raised
        ``IndexError`` in that case.
    """
    text = re.sub(r'[^ㄱ-힣a-zA-Z0-9 ]', ' ', text)
    text = re.sub(r' +', ' ', text)

    # One regex pass replaces the character-by-character loop: a character
    # followed by two or more copies of itself is rewritten as three copies.
    # After the first substitution no newline can remain, so `.` covers every
    # character that can appear here.
    return re.sub(r'(.)\1{2,}', r'\1\1\1', text)

In [None]:
# Sample comment, cleaned with pre_processing before it is sent.
inputs = '급식충아 꺼져!'
inputs = pre_processing(inputs)

sentiment_input = {"inputs": inputs}
# `predictor2` is never defined in this notebook and raised NameError;
# the endpoint handle returned by deploy() is `predictor`.
predictor.predict(sentiment_input)

In [None]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker 

# Execution role plus the S3 location of an existing model archive.
role = sagemaker.get_execution_role()
model_data = 's3://implementation-unsmile/huggingface-pytorch-training-2022-05-26-02-22-36-270/output/model.tar.gz'

# Wrap the archive in a Hugging Face model object. The three version
# arguments match the ones used for the training estimator.
huggingface_model = HuggingFaceModel(
    role=role,
    model_data=model_data,
    py_version="py38",
    pytorch_version="1.9",
    transformers_version="4.12",
)

In [None]:
# Deploy the model built from the S3 archive to a SageMaker endpoint
# backed by one ml.m5.large instance.
# NOTE(review): this rebinds `predictor`, so the handle to the endpoint
# deployed from the estimator earlier in the notebook is lost. Confirm that
# endpoint is deleted separately.
predictor = huggingface_model.deploy(
   initial_instance_count=1,
   instance_type="ml.m5.large"
)

In [None]:
# Example payload: the text to classify goes under the "inputs" key.
# The raw string is sent as-is, without going through pre_processing.
data = dict(inputs="조용히해!!!")

# Send the payload to the endpoint; the notebook shows the response.
predictor.predict(data)