In [1]:
import re
import sentencepiece as spm
import os
import json

In [8]:
def clean_str(text):
    """Scrub raw text and replace numbers with placeholder tokens.

    The substitutions run in this order:

    1. remove e-mail addresses;
    2. remove the scheme and host part of http/ftp/https URLs (anything
       after the first '/' following the host is not matched by the pattern);
    3. remove runs of standalone Hangul consonant/vowel jamo;
    4. remove HTML tags;
    5. remove every character that is not a word character, whitespace,
       or one of '.', '?', '!';
    6. collapse each whitespace run into a single space;
    7. replace decimal numbers with 'USRNUM';
    8. replace "digits followed by a dot" with 'USRSEQ';
    9. replace any remaining digit run with 'USRNUM'.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    # NOTE(review): this notebook defines clean_str in two cells; whichever
    # cell was executed last is the one in effect -- confirm which cleaner
    # each corpus is supposed to use.
    steps = (
        # E-mail addresses.
        (r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', ''),
        # URL scheme + host.
        (r'(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', ''),
        # Standalone Hangul consonants / vowels.
        (r'([ㄱ-ㅎㅏ-ㅣ]+)', ''),
        # HTML tags.
        (r'<[^>]*>', ''),
        # Special characters (everything except word chars, whitespace, . ? !).
        (r'[^\w\s.?!]', ''),
        # Tabs / newlines / repeated blanks -> one space.
        (r'\s+', ' '),
        # Decimal numbers. The integer part may have several digits; matching
        # only one digit here would leave the leading digits behind and turn
        # "12.5" into two USRNUM tokens.
        (r'\d+\.\d+', 'USRNUM'),
        # Enumeration markers such as "1." or "333.".
        (r'\d+\.', 'USRSEQ'),
        # Any digits that are left.
        (r'\d+', 'USRNUM'),
    )
    for pattern, repl in steps:
        text = re.sub(pattern=pattern, repl=repl, string=text)
    return text

In [2]:
def clean_str(text):
    """Keep only Hangul syllables and spaces.

    Every whitespace run is first collapsed into a single space, then every
    character that is neither a space nor in the range 가-힣 is removed.
    Because removal happens after the collapse, a removed token that stood
    between two spaces leaves two consecutive spaces behind, and
    leading/trailing spaces are not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # NOTE(review): this notebook defines clean_str in two cells; whichever
    # cell was executed last is the one in effect -- confirm which cleaner
    # each corpus is supposed to use.
    # Raw literals: '\s' in a plain string is an invalid escape sequence.
    text = re.sub(pattern=r'\s+', repl=' ', string=text)
    text = re.sub(pattern=r'[^ 가-힣]+', repl='', string=text)
    return text

In [3]:
# Try the cleaner on a sentence that mixes a decimal, enumeration markers
# ("1.", "2.", "333."), bare integers and a comma-grouped number.
clean_str('구급대원 분들은 0.000001초가 시급한 분들입니다. 1.여러분들 2. 안녕하세요 333.우리나라 333 22영화 123,000')

'구급대원 분들은 초가 시급한 분들입니다 여러분들  안녕하세요 우리나라  영화 '

# Create Train dataset

In [4]:
# Collect the path of every file found at any depth under the petitions
# directory (os.walk yields nothing if the directory does not exist).
file_names = [
    os.path.join(dirpath, basename)
    for dirpath, _subdirs, basenames in os.walk('./data/petitions/all')
    for basename in basenames
]

In [5]:
# Display the collected input paths for a quick visual check.
file_names

['./data/petitions/all\\petitions_2017-08',
 './data/petitions/all\\petitions_2017-09',
 './data/petitions/all\\petitions_2017-10',
 './data/petitions/all\\petitions_2017-11',
 './data/petitions/all\\petitions_2017-12']

In [6]:
# Build the full training corpus: one cleaned petition body per output line.
out_file_path = "./data/petitions/petition.sp.all.train"

# Mode 'w' rather than 'a': this cell regenerates the whole corpus from
# `file_names`, so re-running it must replace the file instead of appending
# a second copy of every record.
with open(out_file_path, 'w', encoding='UTF8') as out_file:
    for file_name in file_names:
        converted = 0
        # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
        with open(file_name, "r", encoding='utf-8', errors='ignore') as in_file:
            # Each non-blank line is parsed as a JSON object with a 'content' field.
            for line in in_file:
                if not line.strip():
                    # A blank line would make json.loads raise.
                    continue
                data = json.loads(line)
                # str.strip() returns a new string; keep the result so the
                # stripped text is what actually gets written.
                content = clean_str(data['content']).strip()
                out_file.write(content + "\n")
                converted += 1

        print("Total {0} items has been converted".format(converted))

Total 662 items has been converted
Total 17010 items has been converted
Total 5932 items has been converted
Total 29426 items has been converted
Total 18383 items has been converted


In [12]:
import pandas as pd

# Build a smaller corpus from the 'contents' column of a tab-separated file.
# NOTE(review): absolute, machine-specific path -- the cell only runs where
# this exact file exists; consider a path relative to the notebook.
filename = 'C:\\Users\\kkk\\PycharmProjects\\hug-face\\hug\\test\\train_10000.tsv'
out_file_path = "./petition.sp.10k.train"

# Read the table once (it was previously parsed twice with identical arguments).
df = pd.read_csv(filename, sep='\t', encoding='utf-8')

# Mode 'w' rather than 'a': re-running the cell must not append a duplicate
# copy of the corpus to the existing file.
with open(out_file_path, 'w', encoding='UTF8') as out_file:
    for text in df['contents']:
        # str.strip() returns a new string; keep the result so the stripped
        # text is what actually gets written.
        content = clean_str(text).strip()
        out_file.write(content + "\n")
    print("Total {0} items has been converted".format(len(df)))

Total 9663 items has been converted


# Train

In [11]:
# ---- SentencePiece trainer settings ---------------------------------------
# NOTE(review): the corpus cell writes to "./data/petitions/..." while this
# cell reads from "../data/petitions/..." -- confirm the working directory.
input_file = '../data/petitions/petition.sp.all.train'
prefix = '../data/petitions/sp-all-30000'  # passed as --model_prefix
vocab_size = 30000
character_coverage = 0.9998
model_type = 'bpe'

# Special tokens. BOS/EOS piece names are not overridden by this command.
user_defined_symbols = '[CLS],[MASK],[SEP]'
pad_id = 3
unk_piece = '[UNK]'
pad_piece = '[PAD]'

# One positional placeholder per trainer flag, in the order used below.
templates = (
    '--input={} --model_prefix={} --vocab_size={} --character_coverage={} '
    '--model_type={} --user_defined_symbols={} --pad_id={} --unk_piece={} --pad_piece={}'
)

# Final flag string handed to the trainer.
cmd = templates.format(
    input_file,
    prefix,
    vocab_size,
    character_coverage,
    model_type,
    user_defined_symbols,
    pad_id,
    unk_piece,
    pad_piece,
)

In [12]:
# Run SentencePiece training with the flag string assembled in `cmd`.
# As the cell's last expression, the call's return value is displayed.
spm.SentencePieceTrainer.Train(cmd)

True

# Load Model and Inference

In [13]:
# Create a processor and load the trained model file; `spp` is reused by the
# vocabulary cell below. The return value of Load(...) is displayed.
spp = spm.SentencePieceProcessor()
spp.Load('../data/petitions/sp-all-30000.model')

True

In [14]:
#sentence = "은행간 인수합병 반대와 장시간노동 해소를 요구중인 금융노조가 조합원 쟁의행위 찬반투표를 벌여 압도적 찬성률로 파업을 가결했다. USRSEQ일반적인 나 너 그리고 우리나라 대한민국 내 나이 USRNUM살이다."
#spp.EncodeAsPieces(sentence)

# Create vocabulary file

In [15]:
# Dump every piece of the loaded model, in id order, one piece per line.
out_file_path = "../data/petitions/vocab.txt"

# Build the piece -> id mapping before touching the output file, so a failure
# here cannot leave a truncated vocab file behind.
vocab = {spp.IdToPiece(i): i for i in range(spp.GetPieceSize())}

# Mode 'w' rather than 'a': re-running the cell must not append a second copy
# of the vocabulary, which would duplicate every entry in the file.
with open(out_file_path, 'w', encoding='UTF8') as f_out:
    for word in vocab:
        # Keep only the text before any tab and trim surrounding whitespace.
        f_out.write(word.split('\t')[0].strip() + "\n")