In [1]:
!pip install -r requirements.txt


Collecting hgtk==0.1.3 (from -r requirements.txt (line 1))
  Downloading hgtk-0.1.3.tar.gz (6.2 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting matplotlib==3.2.2 (from -r requirements.txt (line 2))
  Downloading matplotlib-3.2.2.tar.gz (40.3 MB)
     ---------------------------------------- 0.0/40.3 MB ? eta -:--:--
     -- ------------------------------------- 2.1/40.3 MB 44.9 MB/s eta 0:00:01
     ------ --------------------------------- 7.0/40.3 MB 73.9 MB/s eta 0:00:01
     ----------- -------------------------- 12.3/40.3 MB 108.8 MB/s eta 0:00:01
     --------------- ---------------------- 16.8/40.3 MB 131.2 MB/s eta 0:00:01
     ------------------- ------------------- 20.6/40.3 MB 93.0 MB/s eta 0:00:01
     ------------------------ ------------- 26.0/40.3 MB 108.8 MB/s eta 0:00:01
     ----------------------------- -------- 31.4/40.3 MB 131.2 MB/s eta 0:00:01
     -------------------------------- ----- 34.6/40.3

  error: subprocess-exited-with-error
  
  × Preparing metadata (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [66 lines of output]
      Running from numpy source directory.
      Cythonizing sources
      Processing numpy/random\_bounded_integers.pxd.in
      Processing numpy/random\mtrand.pyx
        required_version = LooseVersion('0.29.14')
        if LooseVersion(cython_version) < required_version:
      Processing numpy/random\_bit_generator.pyx
      Processing numpy/random\_bounded_integers.pyx.in
      Processing numpy/random\_common.pyx
      Processing numpy/random\_generator.pyx
      Processing numpy/random\_mt19937.pyx
      
      Error compiling Cython file:
      ------------------------------------------------------------
      ...
              for i in range(1, RK_STATE_LEN):
                  self.rng_state.key[i] = val[i]
              self.rng_state.pos = i
      
              self._bitgen.state = &self.rng_state
              self._bitgen.ne

In [3]:
import os
import pickle
from string import ascii_lowercase, ascii_uppercase
from collections.abc import Iterable

import numpy as np
from jamo import h2j, j2hcj
from tensorflow import reduce_sum
from tensorflow.keras import callbacks, layers, metrics
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Model artefacts are stored in a `model` folder next to wherever the relative
# name 'test.ipynb' resolves to (i.e. the current working directory).
directory = os.path.split(os.path.abspath('test.ipynb'))[0]
path = os.path.join(directory, 'model')


def get_path(filename):
    """Return the location of `filename` inside the `model` directory."""
    target = os.path.join(path, filename)
    return target

# Load the cached character -> integer-code table when it exists; otherwise
# build it from scratch and cache it for the next run.
if os.path.isfile(get_path('chardict.pkl')):
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # chardict.pkl that was produced by this notebook / a trusted source.
    with open(get_path('chardict.pkl'), 'rb') as f:
        char_dict = pickle.load(f)

else:
    jaem = ['ㄱ', 'ㄴ', 'ㄷ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅅ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ', 'ㄲ', 'ㄸ', 'ㅃ', 'ㅆ', 'ㅉ', 'ㄳ', 'ㄵ', 'ㄶ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅄ']
    moem = ['ㅏ', 'ㅑ', 'ㅓ', 'ㅕ', 'ㅗ', 'ㅛ', 'ㅜ', 'ㅠ', 'ㅡ', 'ㅣ', 'ㅐ', 'ㅒ', 'ㅔ', 'ㅖ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅢ']
    english = list(ascii_lowercase) + list(ascii_uppercase)
    # Raw string: the backslash near the end is a literal character of the
    # vocabulary; a non-raw literal would rely on an invalid escape sequence.
    sign = list(r''' `~!@#$%^&*()+-/=_,.?;:'"[]{}<>\|''')

    # The order of this list defines the integer codes, so it must not change.
    link_list = sign + jaem + moem + english + [str(i) for i in range(10)]
    char_dict = {k: code + 2 for code, k in enumerate(link_list)}  # 0 is padding, 1 is OOV

    # The target directory may not exist yet (e.g. on a fresh checkout).
    os.makedirs(os.path.dirname(get_path('chardict.pkl')), exist_ok=True)
    with open(get_path('chardict.pkl'), 'wb') as f:
        pickle.dump(char_dict, f, pickle.HIGHEST_PROTOCOL)

vocab_size = len(char_dict) + 2  # +2 for the reserved padding (0) and OOV (1) codes
maxlen = 60  # sequence length passed to pad_sequences


def encode(text: str) -> list:
    """
    Encode a single string as a list of integer codes. No padding is applied.

    The text is first converted with jamo's h2j followed by j2hcj, then every
    resulting character is looked up in char_dict.

    argument
    text: the string to encode.

    return: a list with one integer code per converted character; characters
            that are missing from char_dict are encoded as 1.
    """
    assert isinstance(text, str), "text argument must be str."

    converted = j2hcj(h2j(str(text)))

    code = []
    for symbol in converted:
        code.append(char_dict.get(symbol, 1))
    return code


def preprocessing(data: Iterable) -> np.ndarray:
    """
    Encode a single string, or an iterable of strings, into model input.

    Every string is encoded with encode(), the resulting sequences are padded
    with pad_sequences(..., maxlen) and the padded codes are one-hot encoded
    with to_categorical(..., vocab_size).

    argument
    data: a string, or an iterable object that yields strings.

    return: the encoded, padded and one-hot-encoded data as a 3-dimensional
            numpy array.

    raises
    TypeError: if data is neither a str nor an Iterable.
    """
    if isinstance(data, str):
        # A lone string is treated as a batch containing one sample.
        encoded = [encode(data)]
    elif isinstance(data, Iterable):
        encoded = [encode(t) for t in data]
    else:
        # Reject unsupported input explicitly; an `assert True, ...` can never
        # fail, so invalid data would otherwise reach pad_sequences.
        raise TypeError("data argument must be str or Iterable object.")

    padded = pad_sequences(encoded, maxlen)

    return to_categorical(padded, vocab_size)


def load_badword_model() -> Model:
    """
    Load the saved model from get_path('model.h5') and compile it.

    return: the loaded tf.keras.Model, compiled with binary cross-entropy loss,
            the adam optimizer and binary-accuracy / recall / precision metrics.
    """
    model = load_model(get_path('model.h5'))

    tracked_metrics = [
        metrics.BinaryAccuracy(name="acc"),
        metrics.Recall(name="recall"),
        metrics.Precision(name="prec"),
    ]
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=tracked_metrics)

    return model

In [8]:
# Smoke test: load the compiled model and check the shape of the encoded
# input produced for a single sentence.
model = load_badword_model()
data = preprocessing("그게 뭔데 씹덕아...")
data.shape  # last expression of the cell, so the shape is displayed

(1, 60, 148)

In [2]:
import os

import pandas as pd

# Location of the evaluation CSV. The default is the original machine-specific
# path; set the TEST_CSV_PATH environment variable to run the notebook elsewhere.
TEST_CSV_PATH = os.environ.get(
    'TEST_CSV_PATH',
    r'C:\code\skkukdt_minzy\DeepLearning\data\test_final.csv',
)
test = pd.read_csv(TEST_CSV_PATH)

In [9]:
model = load_badword_model()
def pred(data):
    """Run preprocessing() on `data` and return model.predict() for the result."""
    data = preprocessing(data)
    return model.predict(data)

# Element-wise: pred (and therefore model.predict) is called once per value of
# test['text'], and each call's return value is stored in the 'pred' column.
# NOTE(review): a single batched predict call would likely be much faster —
# confirm nothing relies on one array per row before changing this.
test['pred'] = test['text'].apply(pred)



In [None]:
# NOTE(review): not referenced by the visible cells; padding uses maxlen.
MAX_SEQUENCE_LENGTH = 100

# Encode the whole test set through preprocessing() so the batch gets the same
# treatment as single-sentence input: encode -> pad to maxlen -> one-hot.
# Passing only the padded integer codes would not match the one-hot input
# that the rest of this notebook feeds to the model.
# str() keeps non-string cells from tripping encode()'s type check.
test_t = preprocessing([str(x) for x in test['text']])


model = load_badword_model()
model.predict(test_t)