# Evaluate Each Model

In [2]:
import re

def pre_processing(text):
    """Normalize a raw sentence before it is sent to the classifier.

    Three steps are applied in order:

    1. Every character outside the ``ㄱ-힣`` range, ASCII letters, digits
       and the space character is replaced by a space.
    2. Runs of spaces are collapsed into a single space.
    3. Runs of the same character are capped at three occurrences
       (e.g. ``"aaaaa"`` becomes ``"aaa"``).

    Args:
        text: The raw input string.

    Returns:
        The normalized string. An empty input yields an empty string.
    """
    text = re.sub('[^ㄱ-힣a-zA-Z0-9 ]', ' ', text)
    text = re.sub(' +', ' ', text)

    # Nothing left to scan; indexing text[0] below would raise IndexError.
    if not text:
        return text

    # Collect kept characters in a list and join once at the end instead of
    # growing a string with += on every iteration.
    kept = [text[0]]
    repeat_count = 0

    for char in text[1:]:
        # Count how many times in a row `char` repeats the last kept one.
        repeat_count = repeat_count + 1 if char == kept[-1] else 0

        # Keep the first character of a run plus at most two repeats.
        if repeat_count < 3:
            kept.append(char)

    return ''.join(kept)

def get_predicated_label(output_labels, min_score):
    """Binarize per-label scores against a threshold.

    Args:
        output_labels: Iterable of mappings, each with a ``'score'`` entry.
        min_score: Threshold; a score must be strictly greater to count.

    Returns:
        A list with ``1`` where the score exceeds ``min_score`` and ``0``
        otherwise, in the same order as ``output_labels``.
    """
    return [1 if entry['score'] > min_score else 0 for entry in output_labels]

In [None]:
from datetime import datetime

# Compact timestamp (YYYYMMDDHHMMSS) used to make the run prefix unique
now = datetime.now().strftime('%Y%m%d%H%M%S')

# Candidate Hugging Face checkpoints; index 2 selects KcELECTRA-base
model_list = ['beomi/kcbert-base', 'beomi/kcbert-large', 'beomi/KcELECTRA-base']
model_name = model_list[2]

# Run settings that are recorded in the prefix below
num_train_epochs = 20
per_device_train_batch_size = 64
data_preprocessed = 'preprocessed'

# <timestamp>_<model name without the org part>_<epochs>_<preprocessing tag>
prefix = '_'.join([
    now,
    model_name.split("/")[1],
    str(num_train_epochs),
    data_preprocessed,
])

In [None]:
import sagemaker

# S3 bucket used as the default bucket of the SageMaker session
bucket = 'implementation-unsmile'

# The throwaway `sagemaker.Session()` that used to be created first was
# overwritten before being used, so only the bucket-bound session is built.
role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
import pandas as pd

# Object keys of the train and validation TSV files inside the bucket
train_file = 'data/train/unsmile_train_v1.0.tsv'
test_file = 'data/valid/unsmile_valid_v1.0.tsv'

# Full S3 URIs built from the bucket name and the object keys
s3_uri_train = f's3://{bucket}/{train_file}'
s3_uri_test = f's3://{bucket}/{test_file}'

# Read both tab-separated files into DataFrames
train_df = pd.read_csv(s3_uri_train, sep='\t')
test_df = pd.read_csv(s3_uri_test, sep='\t')


In [None]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker 

# S3 locations of the trained model archives, one per architecture
model_list = [
    # kcBERT-base
    's3://implementation-unsmile/huggingface-pytorch-training-2022-05-26-11-55-44-694/output/model.tar.gz',
    # kcBERT-large
    's3://implementation-unsmile/huggingface-pytorch-training-2022-05-26-09-35-57-477/output/model.tar.gz',
    # kcELECTRA-base
    's3://implementation-unsmile/huggingface-pytorch-training-2022-05-26-02-22-36-270/output/model.tar.gz',
]
# Index 2 picks the kcELECTRA-base archive
model_data = model_list[2]

# IAM role with permissions to create an Endpoint
role = sagemaker.get_execution_role()

# Describe the model to deploy: the trained archive plus the framework
# versions of the serving container
huggingface_model = HuggingFaceModel(
    model_data=model_data,
    role=role,
    transformers_version="4.12",
    pytorch_version="1.9",
    py_version="py38",
)

In [None]:
# Deploy the model to a single ml.m5.large instance and keep the returned
# predictor for the inference cells below
predictor_kcelectra_base = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
)

In [None]:
import tqdm

# Label names as they appear in the dataset columns / endpoint responses,
# and their English counterparts (same order) used for reporting
unsmile_labels = ["여성/가족","남성","성소수자","인종/국적","연령","지역","종교","기타 혐오","악플/욕설","clean"]
unsmile_labels_eng = ["female/family","male","sexual minority","race/country","age","region","religion","etc","malicious comments/abuse","clean"]
predicated_labels_kcelectra_base = []

# Iterate over the sentence column directly (the row index was never used)
# and give tqdm the total so it can show real progress; a bare `iterrows()`
# generator has no length.
for sentence in tqdm.tqdm(test_df['문장'], total=len(test_df)):
    data = {"inputs": pre_processing(sentence)}
    # Only the first entry of the endpoint response is used
    response = predictor_kcelectra_base.predict(data)[0]

    # One-hot vector sized from the label list instead of a hard-coded 10;
    # `.index` raises ValueError if the returned label is not in the list.
    one_hot = [0] * len(unsmile_labels)
    one_hot[unsmile_labels.index(response['label'])] = 1

    predicated_labels_kcelectra_base.append(one_hot)

# Notebook display: peek at the first few predictions
predicated_labels_kcelectra_base[:5]

In [None]:
def make_label_list(row):
    """Return the values of `row` for every name in `unsmile_labels`, in order."""
    return [row[label_name] for label_name in unsmile_labels]

# Build one label vector per row and store it in a new 'labels' column
test_df['labels'] = test_df.apply(make_label_list, axis=1)

In [None]:
# Plain Python list of the per-row label vectors
test_df_list = test_df['labels'].tolist()

In [None]:
# Write one predicted label vector per line. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open('./inference_file/kcElectra-base.txt', 'w', encoding='utf-8') as f:
    for label in predicated_labels_kcelectra_base:
        f.write(str(label) + '\n')

In [None]:
from sklearn.metrics import classification_report

# Per-label metrics, reported under the English label names
report = classification_report(
    test_df_list,
    predicated_labels_kcelectra_base,
    target_names=unsmile_labels_eng,
)
print(report)

In [None]:
# Reduce every label vector to the position of its first 1.
# `.index(1)` raises ValueError for a vector that contains no 1.
y_test = [label_vector.index(1) for label_vector in test_df_list]
y_pred_kcelectra_base = [
    label_vector.index(1) for label_vector in predicated_labels_kcelectra_base
]

In [None]:
#importing confusion matrix
from sklearn.metrics import confusion_matrix
# Compare the true classes with the kcELECTRA-base predictions computed
# above; `y_pred_kcbert_large` is not defined in the visible cells.
confusion = confusion_matrix(y_test, y_pred_kcelectra_base)
print('Confusion Matrix\n')
print(confusion)

In [None]:
# Plot the raw (non-normalized) confusion matrix with English label names.
# NOTE(review): `plot_confusion_matrix` is neither defined nor imported in the
# visible cells; confirm where this helper comes from before running.
plot_confusion_matrix(confusion, unsmile_labels_eng, normalize=False)

In [None]:
# Tear down the endpoint deployed above. The predictor is bound to
# `predictor_kcelectra_base`; no `predictor` name exists in the visible cells.
predictor_kcelectra_base.delete_endpoint()