# Env

In [2]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# Runtime configuration shared by the notebook cells
# seed: random seed value
args = argparse.Namespace(seed=1234)

print(args)

In [4]:
# Random seed setup
def set_seed(seed):
    """
    Seed the Python, NumPy and TensorFlow random number generators.

    :param seed: seed value passed unchanged to each generator
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

In [None]:
# data dir
# NOTE(review): absolute path under /content/drive/MyDrive - presumably a mounted
# Google Drive folder; adjust it when running in another environment.
data_dir = '/content/drive/MyDrive/문서/강의계획서/삼성전기/삼성전기.20220228/data'
# List the directory contents (shown as the cell output)
os.listdir(data_dir)

In [None]:
# Directory that holds the Korean wiki corpus; created on first use
kowiki_dir = os.path.join(data_dir, "kowiki")
# exist_ok=True replaces the separate isdir() check, so there is no gap between
# testing for the directory and creating it
os.makedirs(kowiki_dir, exist_ok=True)
os.listdir(kowiki_dir)

# 파일확인

In [None]:
# Store the path of the zipped wiki corpus on the shared config namespace
args.corpus = os.path.join(kowiki_dir , 'kowiki.txt.zip')
# Display the updated configuration
args

In [None]:
# Count the lines of the wiki corpus
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    count = sum(1 for _ in tqdm(f))
print(count)

In [None]:
# Print the first 50 lines of the wiki corpus
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for idx, raw in enumerate(tqdm(f)):
        if idx >= 50:
            break
        print(raw.decode('utf-8').strip())

# Char Tokenizer

In [None]:
# Count how often each character occurs in the corpus
char_counter = collections.Counter()
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for raw in tqdm(f):
        # Counter.update iterates a string character by character
        char_counter.update(raw.decode('utf-8').strip())

In [None]:
# Number of distinct characters
print(len(char_counter))

In [None]:
# Characters ordered from most to least frequent
most_freq = char_counter.most_common()

# Show the 10 most frequent characters
most_freq[:10]

In [None]:
# Bar chart of the 10 most frequent characters
top_10 = most_freq[:10]

# Font that can render Hangul tick labels; earlier notes also listed
# 'AppleGothic' and the font at c:/Windows/Fonts/malgun.ttf as options
font_name = 'NanumBarunGothic'
plt.rc('font', family=font_name)
# Keeps the minus sign from rendering as a broken glyph when a Korean font is used
plt.rcParams["axes.unicode_minus"] = False
positions = range(len(top_10))
plt.bar(positions, [freq for _, freq in top_10])
plt.xticks(positions, [token for token, _ in top_10])
plt.show()

In [None]:
# Characters ordered from least to most frequent (ties keep counter order)
least_freq = sorted(char_counter.items(), key=lambda item: item[1])

# Show the 10 least frequent characters
least_freq[:10]

In [None]:
# Total number of character tokens (sum of all frequencies)
count = sum(char_counter.values())
print(count)

In [None]:
# Assign a sequential id to every character; ids 0 and 1 are reserved
char_to_id = {'[PAD]': 0, '[UNK]': 1}
for ch in char_counter:
    char_to_id[ch] = len(char_to_id)
print(len(char_to_id))

In [None]:
# Display the character -> id mapping
char_to_id

In [None]:
# Character-level tokenization of the first 5 wiki lines
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for idx, raw in enumerate(f):
        if idx >= 5:
            break
        text = raw.decode('utf-8').strip()
        print(text)
        print([char_to_id[ch] for ch in text])

# Word Tokenizer

In [None]:
# Demo: surround each listed punctuation mark with spaces so it splits off as its own word
re.sub('([.,!?()·\"\'])', r' \1 ', "안녕.나는,만나서!반가워?너는(누구니)저는\"인공지능'입니다")

In [None]:
# Count how often each whitespace-separated word occurs in the corpus
word_counter = collections.Counter()
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for raw in tqdm(f):
        text = raw.decode('utf-8').strip()
        # Pad punctuation with spaces so it is counted as a separate word
        spaced = re.sub('([.,!?()·\"\'])', r' \1 ', text)
        word_counter.update(spaced.split())

In [None]:
# Number of distinct words
print(len(word_counter))

In [None]:
# Words ordered from most to least frequent
most_freq = word_counter.most_common()
# Show the 20 most frequent words
most_freq[:20]

In [None]:
# Bar chart of the 20 most frequent words
top_20 = most_freq[:20]

plt.figure(figsize=(16, 4))
# Font that can render Hangul tick labels; earlier notes also listed
# 'AppleGothic' and the font at c:/Windows/Fonts/malgun.ttf as options
font_name = 'NanumBarunGothic'
plt.rc('font', family=font_name)
# Keeps the minus sign from rendering as a broken glyph when a Korean font is used
plt.rcParams["axes.unicode_minus"] = False
positions = range(len(top_20))
plt.bar(positions, [freq for _, freq in top_20])
plt.xticks(positions, [token for token, _ in top_20])
plt.show()

In [None]:
# Words ordered from least to most frequent (ties keep counter order)
least_freq = sorted(word_counter.items(), key=lambda item: item[1])
# Show the 10 least frequent words
least_freq[:10]

In [None]:
# Total number of word tokens (sum of all frequencies)
count = sum(word_counter.values())
print(count)

In [None]:
# Assign a sequential id to every word; ids 0 and 1 are reserved
word_to_id = {'[PAD]': 0, '[UNK]': 1}
for w in word_counter:
    # A corpus word that literally equals a reserved token keeps the reserved id.
    # Assigning unconditionally would overwrite that id and hand the same number
    # to the next new word as well.
    if w not in word_to_id:
        word_to_id[w] = len(word_to_id)
print(len(word_to_id))

In [None]:
# Word-level tokenization of the first 5 wiki lines
with zipfile.ZipFile(args.corpus) as z:
    with z.open('kowiki.txt') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            line = line.decode('utf-8').strip()
            # Pad punctuation with spaces. Raw string with the same character set
            # used when the vocabulary was counted; the earlier '\(' and '\)' in a
            # non-raw literal were invalid string escapes.
            line = re.sub(r'([.,!?()·"\'])', r' \1 ', line)
            print(line)
            _id = [word_to_id[w] for w in line.split()]
            print(_id)

# Morp Tokenizer (Linux 환경에서 확인)

In [None]:
# Install the morphological analyzer: konlpy via pip, then the mecab.sh script from the konlpy repository
# NOTE(review): the script is fetched from the master branch and piped straight into bash - review or pin it before running
!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

In [29]:
import konlpy
# Create the Mecab tagger used by the morpheme cells below
mecab = konlpy.tag.Mecab()

In [None]:
# Count how often each morpheme occurs in the corpus
morph_counter = collections.Counter()
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for raw in tqdm(f):
        text = raw.decode('utf-8').strip()
        morph_counter.update(mecab.morphs(text))

In [None]:
# Number of distinct morphemes
print(len(morph_counter))

In [None]:
# Morphemes ordered from most to least frequent
most_freq = morph_counter.most_common()
# Show the 20 most frequent morphemes
most_freq[:20]

In [None]:
# Bar chart of the 20 most frequent morphemes
top_20 = most_freq[:20]

plt.figure(figsize=(16, 4))
# Font that can render Hangul tick labels; earlier notes also listed
# 'AppleGothic' and the font at c:/Windows/Fonts/malgun.ttf as options
font_name = 'NanumBarunGothic'
plt.rc('font', family=font_name)
# Keeps the minus sign from rendering as a broken glyph when a Korean font is used
plt.rcParams["axes.unicode_minus"] = False
positions = range(len(top_20))
plt.bar(positions, [freq for _, freq in top_20])
plt.xticks(positions, [token for token, _ in top_20])
plt.show()

In [None]:
# Morphemes ordered from least to most frequent (ties keep counter order)
least_freq = sorted(morph_counter.items(), key=lambda item: item[1])
# Show the 10 least frequent morphemes
least_freq[:10]

In [None]:
# Total number of morpheme tokens (sum of all frequencies)
count = sum(morph_counter.values())
print(count)

In [None]:
# Assign a sequential id to every morpheme; ids 0 and 1 are reserved
morph_to_id = {'[PAD]': 0, '[UNK]': 1}
for morph in morph_counter:
    morph_to_id[morph] = len(morph_to_id)
print(len(morph_to_id))

In [None]:
# Morpheme-level tokenization of the first 5 wiki lines
with zipfile.ZipFile(args.corpus) as z:
    with z.open('kowiki.txt') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            line = line.decode('utf-8').strip()
            print(line)
            morphs = mecab.morphs(line)
            print(morphs)
            # Reuse the analysis above instead of running Mecab on the line a second time
            _id = [morph_to_id[m] for m in morphs]
            print(_id)

# BPE

In [38]:
# Toy corpus for the BPE walkthrough: whitespace-separated words,
# where the number of repetitions of a word is its frequency
corpus = """
low lower newest widest
low lower newest widest
low       newest widest
low       newest
low       newest
          newest
"""

In [None]:
# Word frequencies of the toy corpus
word_counter = collections.Counter(corpus.strip().split())

print(word_counter)

In [None]:
# Initial BPE counter: every word is prefixed with U+2581 and split into
# space-separated characters, keeping the word's frequency
bpe_counter = collections.defaultdict(int)
bpe_counter.update((" ".join(f"\u2581{w}"), n) for w, n in word_counter.items())

print(bpe_counter)

In [41]:
def update_vocab(vocab, counter):
    """
    Add every symbol found in the BPE counter keys that the vocabulary lacks.

    :param vocab: vocabulary dict (symbol -> id); modified in place
    :param counter: BPE counter whose keys are space-separated symbol strings
    :return: the same vocab object after the update
    """
    for entry in counter:
        for symbol in entry.split():
            # Known symbols are left alone; a new one gets the current vocab size as its id
            vocab.setdefault(symbol, len(vocab))
    return vocab

In [None]:
# Assign ids to the initial BPE symbols; ids 0 and 1 are reserved
bpe_to_id = update_vocab({'[PAD]': 0, '[UNK]': 1}, bpe_counter)

print(bpe_to_id)

In [43]:
def get_stats(counter):
    """
    Count how often each adjacent symbol pair (bi-gram) occurs.

    :param counter: BPE counter mapping a space-separated symbol string to its frequency
    :return: defaultdict mapping (left, right) symbol tuples to their summed frequency
    """
    pair_freq = collections.defaultdict(int)
    for entry, weight in counter.items():
        pieces = entry.split()
        # Pair each symbol with its right-hand neighbour
        for left, right in zip(pieces, pieces[1:]):
            pair_freq[left, right] += weight
    return pair_freq

In [44]:
def merge_vocab(pair, v_in):
    """
    Merge every occurrence of one symbol pair (bi-gram) into a single symbol.

    :param pair: (left, right) tuple of symbols to merge
    :param v_in: BPE counter mapping a space-separated symbol string to its frequency
    :return: new dict in which the pair is merged in every key; frequencies are carried over
    """
    merged = ''.join(pair)
    # The lookarounds restrict matches to whole symbols, never the tail or head of one
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    # A callable replacement inserts `merged` verbatim. A plain string would be
    # parsed as a regex replacement template, so a symbol containing a backslash
    # would be mangled or rejected with "bad escape".
    return {
        pattern.sub(lambda _match: merged, word): freq
        for word, freq in v_in.items()
    }

In [None]:
# Count the adjacent symbol pairs (bi-grams) in the current BPE counter
pairs = get_stats(bpe_counter)

print(pairs)

In [None]:
# Pick the most frequent bi-gram pair (on a tie, max keeps the first one it meets)
best = max(pairs, key=pairs.get)

print(best)

In [None]:
# Merge the most frequent bi-gram pair into one symbol in every counter key
bpe_counter = merge_vocab(best, bpe_counter)

print(bpe_counter)

In [None]:
# Add the newly merged symbol to the vocabulary
bpe_to_id = update_vocab(bpe_to_id, bpe_counter)

print(bpe_to_id)

In [None]:
# One more BPE iteration; re-running this cell performs another merge step
# 1) count the bi-gram pairs
pairs = get_stats(bpe_counter)
print(pairs)
# 2) find the most frequent bi-gram pair
best = max(pairs, key=pairs.get)
print(best)
# 3) merge that pair in every counter key
bpe_counter = merge_vocab(best, bpe_counter)
print(bpe_counter)
# 4) add the merged symbol to the vocabulary
bpe_to_id = update_vocab(bpe_to_id, bpe_counter)
print(bpe_to_id)

# Sentencepiece

In [50]:
def train_sentencepiece(corpus, prefix, vocab_size=32000):
    """
    Train a SentencePiece unigram vocabulary.

    :param corpus: path of the text corpus to train on
    :param prefix: model prefix used for the generated model/vocab files
    :param vocab_size: number of regular vocabulary entries; the 7 special tokens are added on top
    """
    # [PAD], [UNK], [BOS], [EOS] plus the user-defined [SEP], [CLS], [MASK]
    num_special_tokens = 7
    # Keyword arguments instead of one flag string, so a corpus path or prefix
    # that contains whitespace is passed through intact.
    spm.SentencePieceTrainer.train(
        input=corpus,
        model_prefix=prefix,
        vocab_size=vocab_size + num_special_tokens,
        model_type="unigram",
        max_sentence_length=999999,  # maximum sentence length
        pad_id=0, pad_piece="[PAD]",  # pad token and its id
        unk_id=1, unk_piece="[UNK]",  # unknown token and its id
        bos_id=2, bos_piece="[BOS]",  # begin-of-sequence token and its id
        eos_id=3, eos_piece="[EOS]",  # end-of-sequence token and its id
        user_defined_symbols=["[SEP]", "[CLS]", "[MASK]"],  # extra tokens SEP: 4, CLS: 5, MASK: 6
        input_sentence_size=100000,  # train on a sample of the corpus
        shuffle_input_sentence=True,  # shuffle the sampled sentences
    )

In [None]:
# Copy the zipped corpus into the current working directory
shutil.copy(args.corpus, 'kowiki.txt.zip')

In [None]:
# Extract the archive into the current working directory.
# zipfile replaces the `unzip` shell command: it needs no external binary and
# overwrites an earlier extraction instead of stopping to ask on a re-run.
with zipfile.ZipFile('kowiki.txt.zip') as z:
    z.extractall()

print(os.listdir("./"))

In [55]:
# Train the vocab on the extracted corpus
train_sentencepiece("kowiki.txt", "kowiki_32000")

In [None]:
# Check the generated files
print(os.listdir("./"))

In [None]:
# Back up the generated files next to the corpus
corpus_dir = os.path.dirname(args.corpus)
for name in ("kowiki_32000.model", "kowiki_32000.vocab"):
    # Naming the destination file (not just the directory) lets a re-run replace
    # an earlier backup; shutil.move raises when the target directory already
    # holds a file with the same name.
    shutil.move(name, os.path.join(corpus_dir, name))

print(os.listdir(corpus_dir))

In [None]:
# Load the backed-up SentencePiece model from the corpus directory
spm_vocab = spm.SentencePieceProcessor()
spm_vocab.load(os.path.join(corpus_dir, "kowiki_32000.model"))

In [None]:
# SentencePiece tokenization of the first 5 wiki lines: pieces first, then ids
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for idx, raw in enumerate(f):
        if idx >= 5:
            break
        line = raw.decode('utf-8').strip()
        print(line)
        for encode in (spm_vocab.encode_as_pieces, spm_vocab.encode_as_ids):
            print(encode(line))

In [None]:
# Split a string into tokens (pieces)
tokens = spm_vocab.encode_as_pieces("아름다운 대한민국 우리나라 금수강산")

print(tokens)

In [None]:
# Restore the string from the tokens
print(spm_vocab.decode_pieces(tokens))

In [None]:
# Split a string into token ids
ids = spm_vocab.encode_as_ids("아름다운 대한민국 우리나라 금수강산")

print(ids)

In [None]:
# Restore the string from the token ids
print(spm_vocab.decode_ids(ids))

In [None]:
# Convert tokens to ids
print(spm_vocab.piece_to_id(tokens))

In [None]:
# Convert ids to tokens
print(spm_vocab.id_to_piece(ids))

# Sentencepiece with Morph (Linux 환경에서 확인)

In [None]:
# Write a morpheme-segmented copy of the corpus, one output line per input line.
# The encoding is pinned to UTF-8 so the Korean text is written the same way
# regardless of the platform's default encoding.
with open("kowiki-mecab.txt", "w", encoding="utf-8") as o_f:
    with zipfile.ZipFile(args.corpus) as z:
        with z.open('kowiki.txt') as f:
            for raw in tqdm(f):
                tokens = mecab.morphs(raw.decode('utf-8').strip())
                o_f.write(" ".join(tokens) + "\n")

In [None]:
# Check the files in the working directory
print(os.listdir("./"))

In [68]:
# Train the vocab on the morpheme-segmented corpus
train_sentencepiece("kowiki-mecab.txt", "kowiki_mecab_32000")

In [None]:
# Check the generated files
print(os.listdir("./"))

In [None]:
# Back up the generated files next to the corpus
corpus_dir = os.path.dirname(args.corpus)
for name in ("kowiki_mecab_32000.model", "kowiki_mecab_32000.vocab"):
    # Naming the destination file (not just the directory) lets a re-run replace
    # an earlier backup; shutil.move raises when the target directory already
    # holds a file with the same name.
    shutil.move(name, os.path.join(corpus_dir, name))

print(os.listdir(corpus_dir))

In [None]:
# Load the backed-up morpheme SentencePiece model from the corpus directory
spm_morph_vocab = spm.SentencePieceProcessor()
spm_morph_vocab.load(os.path.join(corpus_dir, "kowiki_mecab_32000.model"))

In [None]:
# Morpheme + SentencePiece tokenization of the first 5 wiki lines
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for idx, raw in enumerate(f):
        if idx >= 5:
            break
        line = raw.decode('utf-8').strip()
        print(line)
        # Segment into morphemes first, then feed the space-joined result to the model
        string = " ".join(mecab.morphs(line))
        print(string)
        for encode in (spm_morph_vocab.encode_as_pieces, spm_morph_vocab.encode_as_ids):
            print(encode(string))

# 실습
- data_dir / kowiki / kowiki.txt.zip의 '지미카터' 문서를 encode_as_pieces를 이용해 줄단위로 tokenize 해서 배열에 저장해 보세요.
- encode_as_pieces를 이용해 tokenize 한 내용을 piece_to_id를 이용해 숫자로 변경해 보세요.
- data_dir / kowiki / kowiki.txt.zip의 '지미카터' 문서를 encode_as_ids를 이용해 줄단위로 tokenize 해서 배열에 저장해 보세요.
- encode_as_ids를 이용해 tokenize 한 내용을 id_to_piece를 이용해 token으로 변경해 보세요.
