# Built-in Algorithm - BlazingText

https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/blazingtext.html  
https://github.com/daekeun-ml/blazingtext-workshop-korean  
[튜토리얼](https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/blazingtext_text_classification_dbpedia/blazingtext_text_classification_dbpedia.html)

## 1. 입력 데이터 형식 맞추기

```
__label__0 아 더빙.. 진짜 짜증나네요 목소리
__label__1 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
```

In [7]:
import pandas as pd

# Name of the S3 bucket holding the dataset. Empty placeholder: set it
# before running, otherwise the URIs below are malformed.
bucket = ''

# Object keys of the training and validation splits (tab-separated files).
train_file = 'data/unsmile_train_v1.0.tsv'
test_file = 'data/unsmile_valid_v1.0.tsv'

s3_uri_train = f's3://{bucket}/{train_file}'
s3_uri_test = f's3://{bucket}/{test_file}'

# Load both splits as DataFrames directly from their S3 URIs.
train_df = pd.read_csv(s3_uri_train, sep='\t')
test_df = pd.read_csv(s3_uri_test, sep='\t')

In [11]:
import re

# Foreign-language words that are deleted from every sentence before the
# character-level cleaning below.
foreign_dict = ['Hình', 'xăm', 'đẹp']

# Compiled once at import time instead of on every call.
# Anything that is not Hangul (ㄱ-힣), an ASCII letter, a digit or a space.
_DISALLOWED_CHARS = re.compile(r'[^ㄱ-힣a-zA-Z0-9 ]')
_SPACE_RUN = re.compile(r' +')


def pre_processing(text, max_repeat=3):
    """Clean a raw sentence for use as BlazingText input.

    The cleaning steps are, in order:

    1. Remove every word listed in ``foreign_dict``.
    2. Replace each character that is not Hangul, an ASCII letter, a digit
       or a space with a single space.
    3. Collapse runs of spaces into one space.
    4. Truncate runs of the same character to at most ``max_repeat``
       occurrences (e.g. ``"ㅋㅋㅋㅋㅋ"`` becomes ``"ㅋㅋㅋ"``).

    Leading/trailing spaces are not stripped.

    Args:
        text: The sentence to clean. May be empty.
        max_repeat: Maximum number of consecutive identical characters to
            keep. Defaults to 3.

    Returns:
        The cleaned sentence; an empty string when ``text`` is empty.
    """
    for word in foreign_dict:
        # str.replace is a no-op when the word is absent, so no membership
        # test is needed first.
        text = text.replace(word, '')

    text = _DISALLOWED_CHARS.sub(' ', text)
    text = _SPACE_RUN.sub(' ', text)

    # Iterating over the whole string (rather than seeding the result with
    # text[0]) keeps an empty sentence from raising IndexError.
    kept = []
    previous = None
    run_length = 0
    for char in text:
        run_length = run_length + 1 if char == previous else 1
        previous = char
        if run_length <= max_repeat:
            kept.append(char)

    return ''.join(kept)

In [12]:
# Store the cleaned sentence in a new '문장2' column; the raw '문장' column
# is left as it is.
train_df['문장2'] = train_df['문장'].apply(pre_processing)
test_df['문장2'] = test_df['문장'].apply(pre_processing)

In [13]:
def make_label_list(row):
    """Return the row's values for the label columns as a list.

    The columns are read in the order given by the module-level
    ``unsmile_labels``, which must be defined before this is called.

    Args:
        row: A mapping-like object indexable by each name in
            ``unsmile_labels`` (a DataFrame row when used with
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        A list with one entry per label, in ``unsmile_labels`` order.
    """
    return [row[col] for col in unsmile_labels]

In [14]:
# Row-wise (axis=1): gather each sample's per-label values into one list
# stored in the 'labels' column.
train_df['labels'] = train_df.apply(make_label_list, axis=1)
test_df['labels'] = test_df.apply(make_label_list, axis=1)

In [15]:
# Label column names of the dataset. The order matters: it fixes the
# position of each entry in the per-row 'labels' lists.
unsmile_labels = [
    "여성/가족",
    "남성",
    "성소수자",
    "인종/국적",
    "연령",
    "지역",
    "종교",
    "기타 혐오",
    "악플/욕설",
    "clean",
]

In [17]:
# Remove two training rows by index label.
# NOTE(review): presumably these rows have no label set to 1, which would
# make the later `labels.index(1)` lookup fail — confirm against the data.
train_df = train_df.drop([5876, 11942])

In [18]:
def _first_active_label(flags):
    """Return the name of the first label whose flag equals 1.

    Raises ValueError (from ``list.index``) when no flag is 1. When several
    flags are 1, only the earliest one in ``unsmile_labels`` order is used.
    """
    return unsmile_labels[flags.index(1)]


# Reduce each multi-hot 'labels' list to a single class name.
train_df['label'] = train_df['labels'].apply(_first_active_label)
test_df['label'] = test_df['labels'].apply(_first_active_label)

In [19]:
def _write_blazingtext_file(df, path):
    """Write ``df`` to ``path`` in BlazingText supervised-mode format.

    Each output line is ``__label__<label> <sentence>``, built from the
    'label' and '문장2' columns of ``df``.

    Args:
        df: DataFrame with string columns 'label' and '문장2'.
        path: Destination file path; an existing file is overwritten.
    """
    # Explicit UTF-8 so the Korean text is written identically regardless of
    # the platform's default encoding. The `with` block closes the file, so
    # no manual close() is needed.
    with open(path, 'w', encoding='utf-8') as f:
        for label, sentence in zip(df['label'], df['문장2']):
            # A label must be one whitespace-free token: with "기타 혐오"
            # written as-is, only "__label__기타" would be read as the label
            # and "혐오" would become part of the sentence text.
            label_token = '__label__' + label.replace(' ', '_')
            f.write(label_token + ' ' + sentence + '\n')


_write_blazingtext_file(train_df, './unsmile.train')
_write_blazingtext_file(test_df, './unsmile.validation')


In [None]:
import sagemaker

sess = sagemaker.Session()

# Key prefix under which everything for this experiment is stored in S3.
prefix = 'blazingtext/unsmile'

# One sub-prefix per data channel.
train_channel = f"{prefix}/train"
validation_channel = f"{prefix}/validation"

# Upload the locally written files to s3://<bucket>/<channel>/.
sess.upload_data(
    path="./unsmile.train",
    bucket=bucket,
    key_prefix=train_channel,
)
sess.upload_data(
    path="./unsmile.validation",
    bucket=bucket,
    key_prefix=validation_channel,
)

In [None]:
import boto3
from sagemaker import get_execution_role

# IAM role the training job and endpoint will run under.
role = get_execution_role()

# Region of the default boto3 session; used to pick the algorithm image.
region_name = boto3.Session().region_name

# S3 location where the training job writes its output.
s3_output_location = f"s3://{bucket}/{prefix}/output"


In [None]:
# Look up the ECR image URI of the built-in BlazingText algorithm for this
# region. `image_uris.retrieve` is the SDK v2 replacement for the deprecated
# `sagemaker.amazon.amazon_estimator.get_image_uri`; the rest of this file
# already uses the v2 API (`instance_count`, `sagemaker.serializers`).
container = sagemaker.image_uris.retrieve("blazingtext", region_name, "1")
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

In [None]:
# Hyperparameters passed to the BlazingText training job (text
# classification, i.e. "supervised" mode).
# NOTE(review): "epochs" (1) is lower than "min_epochs" (5), so the early
# stopping settings may never take effect — confirm this is intended.
bt_hyperparameters = {
    "mode": "supervised",
    "epochs": 1,
    "min_count": 2,
    "learning_rate": 0.05,
    "vector_dim": 10,
    "early_stopping": True,
    "patience": 4,
    "min_epochs": 5,
    "word_ngrams": 2,
}

# Estimator describing the training job: which algorithm image to run, under
# which role, on what hardware, and where to write the results.
bt_model = sagemaker.estimator.Estimator(
    container,  # algorithm image to train with
    role,
    instance_count=1,  # number of training instances
    instance_type="ml.c4.4xlarge",
    volume_size=30,
    max_run=360000,  # upper bound on training time, in seconds
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters=bt_hyperparameters,
)

In [None]:
# `data_channels` was used here without being defined, so this cell raised
# NameError. Build it from the S3 prefixes the files were uploaded to.
train_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}".format(bucket, train_channel),
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    "s3://{}/{}".format(bucket, validation_channel),
    distribution="FullyReplicated",
    content_type="text/plain",
    s3_data_type="S3Prefix",
)

# Channel names for BlazingText supervised training: "train" and
# "validation".
data_channels = {"train": train_data, "validation": validation_data}

# Start the training job and stream its logs to the notebook.
bt_model.fit(inputs=data_channels, logs=True)

In [None]:
from sagemaker.serializers import JSONSerializer

# Deploy the trained model to a real-time endpoint served by one
# ml.m4.xlarge instance. Requests are serialized as JSON.
unsmile_classifier = bt_model.deploy(
    initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=JSONSerializer()
)

In [None]:
# Request body: the sentences to classify go under the "instances" key.
# NOTE(review): the training text was cleaned with pre_processing();
# presumably inference input should be cleaned the same way — confirm.
payload = {"instances": ['한남']}

# Send the request to the deployed endpoint.
unsmile_classifier.predict(payload)

In [None]:
# Tear down the endpoint once it is no longer needed.
unsmile_classifier.delete_endpoint()