# Env

In [None]:
# imports
import argparse
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm, trange

In [None]:
# Environment configuration
args = argparse.Namespace(
    # random seed value
    seed=1234,
)

print(args)

Namespace(seed=1234)


In [None]:
# Seed every random number generator in use (Python, NumPy, TensorFlow) from args.seed
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(args.seed)

# Soyspacing (파일확인)

In [None]:
# Install soyspacing straight from its GitHub repository
!pip install git+https://github.com/lovit/soyspacing.git

Collecting git+https://github.com/lovit/soyspacing.git
  Cloning https://github.com/lovit/soyspacing.git to /tmp/pip-req-build-np6a9cmb
  Running command git clone -q https://github.com/lovit/soyspacing.git /tmp/pip-req-build-np6a9cmb
Building wheels for collected packages: soyspacing
  Building wheel for soyspacing (setup.py) ... [?25l[?25hdone
  Created wheel for soyspacing: filename=soyspacing-1.0.17-cp37-none-any.whl size=10484 sha256=f55ea2d09c81e18aeaeac1bf503d459016a1c690f0109f68c59424c9b222da42
  Stored in directory: /tmp/pip-ephem-wheel-cache-_t147dak/wheels/6f/8d/52/ef635e70e93c1ed89773d05527c94597d2d8a4cde7094bd20d
Successfully built soyspacing
Installing collected packages: soyspacing
Successfully installed soyspacing-1.0.17


In [None]:
# Download the training corpus (soyspacing demo file) into the working directory
!wget https://raw.githubusercontent.com/lovit/soyspacing/master/demo_model/134963_norm.txt

--2021-04-22 11:13:28--  https://raw.githubusercontent.com/lovit/soyspacing/master/demo_model/134963_norm.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1660890 (1.6M) [text/plain]
Saving to: ‘134963_norm.txt’


2021-04-22 11:13:29 (20.2 MB/s) - ‘134963_norm.txt’ saved [1660890/1660890]



In [None]:
# import
from soyspacing.countbase import CountSpace

In [None]:
# Train a CountSpace spacing model on the downloaded corpus file
corpus_fname = "134963_norm.txt"
model = CountSpace()
model.train(corpus_fname)

all tags length = 694236 --> 57795, (num_doc = 15602)

In [None]:
# Save the trained model to disk (json_format=False selects the non-JSON format)
model.save_model("soyspacing.model", json_format=False)

In [None]:
# Use the trained model: load the saved file into a fresh CountSpace instance
model = CountSpace()
model.load_model("soyspacing.model", json_format=False)

In [None]:
# Statistical approach: check how the model corrects spacing.
# Progression of techniques: rule-based -> statistical -> deep learning

sent_corrected, tags = model.correct("ㅋㅋㅋㅋㅋㅋ어이가 없네 ㅋㅋ")
sent_corrected, tags

# A None tag means the model had no data for that position (author's note; TODO confirm against the soyspacing docs)

('ㅋㅋㅋㅋㅋㅋ어이가 없네 ㅋㅋ', [0, 0, 0, 0, 0, None, None, None, 1, None, 1, None, 1])

In [None]:
sent_corrected, tags = model.correct("너무너무재밌는라라랜드 또 보러 오고 싶어요")
sent_corrected, tags

('너무너무 재밌는 라라랜드 또 보러 오고 싶어요',
 [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, None, 1, 0, 0, 1])

# PyCrfsuite

In [None]:
# Install pycrfsuite_spacing straight from its GitHub repository
!pip install git+https://github.com/lovit/pycrfsuite_spacing.git

Collecting git+https://github.com/lovit/pycrfsuite_spacing.git
  Cloning https://github.com/lovit/pycrfsuite_spacing.git to /tmp/pip-req-build-pfwxuvsh
  Running command git clone -q https://github.com/lovit/pycrfsuite_spacing.git /tmp/pip-req-build-pfwxuvsh
Collecting python-crfsuite>=0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 11.4MB/s 
[?25hBuilding wheels for collected packages: pycrfsuite-spacing
  Building wheel for pycrfsuite-spacing (setup.py) ... [?25l[?25hdone
  Created wheel for pycrfsuite-spacing: filename=pycrfsuite_spacing-1.0.2-cp37-none-any.whl size=3816 sha256=121dc5f6d3ffe25ee93f58f98c667c2ba1e12bd73fea248ed3618a187c4e9e57
  Stored in directory: /tmp/pip-ephem-wheel-cache-58vvth1o/wheels/4b/c1/29/1006fb42a117fe79cd6021e897a97c6a376d5c8c3fd3bce13b
Successfully built py

In [None]:
from pycrfsuite_spacing import TemplateGenerator
from pycrfsuite_spacing import CharacterFeatureTransformer
from pycrfsuite_spacing import sent_to_xy

In [None]:
# Widening the window makes the model look at more characters before and after
# each position when deciding whether to insert a space there.
# Author's note: among statistical techniques, CRF is the model that improves performance.


to_feature = CharacterFeatureTransformer(
    TemplateGenerator(
        begin=-2,
        end=2,
        min_range_length=3,
        max_range_length=3)
    )

# Convert one spaced sentence into per-character features (x) and space labels (y)
x, y = sent_to_xy('이것도 너프해 보시지', to_feature)
x, y

([['X[0,2]=이것도'],
  ['X[-1,1]=이것도', 'X[0,2]=것도너'],
  ['X[-2,0]=이것도', 'X[-1,1]=것도너', 'X[0,2]=도너프'],
  ['X[-2,0]=것도너', 'X[-1,1]=도너프', 'X[0,2]=너프해'],
  ['X[-2,0]=도너프', 'X[-1,1]=너프해', 'X[0,2]=프해보'],
  ['X[-2,0]=너프해', 'X[-1,1]=프해보', 'X[0,2]=해보시'],
  ['X[-2,0]=프해보', 'X[-1,1]=해보시', 'X[0,2]=보시지'],
  ['X[-2,0]=해보시', 'X[-1,1]=보시지'],
  ['X[-2,0]=보시지']],
 ['0', '0', '1', '0', '0', '1', '0', '0', '1'])

In [None]:
from pycrfsuite_spacing import PyCRFSuiteSpacing

In [None]:
# Read the training corpus into a list of non-empty, stripped lines.
# The file is Korean text, so decode it explicitly as UTF-8 instead of relying
# on the platform's default encoding (which is not UTF-8 everywhere).
corpus = []
with open("./134963_norm.txt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            corpus.append(line)

In [None]:
# Train the CRF spacing model on the corpus and write it to "pycrfsuite.model"
correct = PyCRFSuiteSpacing(to_feature, verbose=True, feature_minfreq=3, max_iterations=50, l1_cost=0, l2_cost=1.0)
correct.train(corpus, "pycrfsuite.model")

feature scanning: begin with min_count=3
feature scanning ... 463332 -> 84198 with min_count=3
begin appending data to trainer
all data are appended to trainer. begin training
Feature generation
type: CRF1d
feature.minfreq: 3.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 84893
Seconds required: 0.443

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 317384.847182
Feature norm: 1.000000
Error norm: 106777.950221
Active features: 84067
Line search trials: 1
Line search step: 0.000006
Seconds required for this iteration: 0.233

***** Iteration #2 *****
Loss: 297470.000395
Feature norm: 0.926881
Error norm: 34767.927026
Active features: 84893
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.131

***** 

In [None]:
# Load the model
# This one is CRF-based rather than count-based
# Useful when cleaning data, e.g. fixing spacing
correct = PyCRFSuiteSpacing(to_feature)
correct.load_tagger("pycrfsuite.model")

In [None]:
correct('동탄목욕탕수건도둑엄준식무야호')

'동탄목욕탕수건도둑엄준식무야호'

# PyKoSpacing

In [None]:
!pip install git+https://github.com/haven-jeon/PyKoSpacing.git
# After installation finishes, restart the runtime: 'Menu' >> 'Runtime' >> 'Restart runtime'
# The package pulls in an older TensorFlow, so the runtime has to be restarted once

Collecting git+https://github.com/haven-jeon/PyKoSpacing.git
  Cloning https://github.com/haven-jeon/PyKoSpacing.git to /tmp/pip-req-build-2w12s_23
  Running command git clone -q https://github.com/haven-jeon/PyKoSpacing.git /tmp/pip-req-build-2w12s_23
Collecting tensorflow==2.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/94/0a/012cc33c643d844433d13001dd1db179e7020b05ddbbd0a9dc86c38a8efa/tensorflow-2.4.0-cp37-cp37m-manylinux2010_x86_64.whl (394.7MB)
[K     |████████████████████████████████| 394.7MB 42kB/s 
Collecting argparse>=1.4.0
  Downloading https://files.pythonhosted.org/packages/f2/94/3af39d34be01a24a6e65433d19e107099374224905f1e0cc6bbe1fd22a2f/argparse-1.4.0-py2.py3-none-any.whl
Building wheels for collected packages: pykospacing
  Building wheel for pykospacing (setup.py) ... [?25l[?25hdone
  Created wheel for pykospacing: filename=pykospacing-0.4-cp37-none-any.whl size=2255638 sha256=407697381f97a616c2becdf4aa3d2a3524133e9fea0954921905e7aa3598db04
  Store

In [None]:
from pykospacing import spacing

In [None]:
spacing("김형호영화시장분석가는'1987'의네이버영화정보네티즌10점평에서언급된단어들을지난해12월27일부터올해1월10일까지통계프로그램R과KoNLP패키지로텍스트마이닝하여분석했다.")

"김형호 영화시장 분석가는 '1987'의 네이버 영화 정보 네티즌 10점 평에서 언급된 단어들을 지난해 12월 27일부터 올해 1월 10일까지 통계 프로그램 R과 KoNLP 패키지로 텍스트마이닝하여 분석했다."

In [None]:
spacing('수건도둑엄준식')

'수건 도둑 엄준식'

# Py-hanspell

In [None]:
!pip install git+https://github.com/ssut/py-hanspell.git

Collecting git+https://github.com/ssut/py-hanspell.git
  Cloning https://github.com/ssut/py-hanspell.git to /tmp/pip-req-build-kizg74sb
  Running command git clone -q https://github.com/ssut/py-hanspell.git /tmp/pip-req-build-kizg74sb
Building wheels for collected packages: py-hanspell
  Building wheel for py-hanspell (setup.py) ... [?25l[?25hdone
  Created wheel for py-hanspell: filename=py_hanspell-1.1-cp37-none-any.whl size=4854 sha256=3c83476c50162b51d86d7d064ee8f7d5fc823fb647fca48120245031d2f41d5e
  Stored in directory: /tmp/pip-ephem-wheel-cache-dfoahozi/wheels/0a/25/d1/e5e96476dbb1c318cc26c992dd493394fe42b0c204b3e65588
Successfully built py-hanspell
Installing collected packages: py-hanspell
Successfully installed py-hanspell-1.1


In [None]:
from hanspell import spell_checker

In [None]:
# spelled_sent = spell_checker.check("안녕 하세요. 저는 한국인 입니다. 이문장은 한글로 작성됬습니다.")
spelled_sent = spell_checker.check('동탄목욕탕수건도둑엄준식')
spelled_sent.checked

'동탄 목욕탕 수건 도둑 엄 준식'

In [None]:
spelled_sent.words
# Per-word difference from the original sentence
# NOTE(review): 2 looks like the code for a spacing correction — confirm against hanspell's result constants
# and 1 presumably marks a spelling error

OrderedDict([('동탄', 2), ('목욕탕', 2), ('수건', 2), ('도둑', 2), ('엄', 2), ('준식', 2)])

# KoNLPy

In [None]:
# Install the morphological analyzers: konlpy from PyPI, then mecab via konlpy's install script
# NOTE(review): the last step pipes a remote script straight into bash — review the script before running
!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

+ pip install konlpy
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.4MB/s 
[?25hCollecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 30.4MB/s 
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 8.1MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collecte

In [None]:
import konlpy

In [None]:
# Morphological analyzers: one instance of each tagger KoNLPy provides
hannanum = konlpy.tag.Hannanum()
kkma = konlpy.tag.Kkma()
komoran = konlpy.tag.Komoran()
mecab = konlpy.tag.Mecab()
okt = konlpy.tag.Okt()

# Author's notes:
# - mecab is on the faster side, though its accuracy is somewhat lower
# - there is no single right choice; pick by preference
# - noun splitter
# - mecab is the one used most often

In [None]:
# string = "롯데마트의 흑마늘 양념치킨이 논란이 되고 있다."
# string = "롯데마트의 흑마늘 양념 치킨이 논란이 되고 있다."
string = '동탄목욕탕수건도둑엄준식'

## morphs

In [None]:
hannanum.morphs(string)

['동탄목욕탕수건도둑엄준식']

In [None]:
kkma.morphs(string)

['동', '타', 'ㄴ', '목욕탕', '수건', '도둑', '어', 'ㅁ', '줄', 'ㄴ', '식']

In [None]:
komoran.morphs(string)

['동탄', '목욕탕', '수건', '도둑', '엄준', '식']

In [None]:
mecab.morphs(string)

['동탄', '목욕탕', '수건', '도둑', '엄준', '식']

In [None]:
okt.morphs(string)

['동탄', '목욕탕', '수건', '도둑', '엄준', '식']

## nouns

In [None]:
hannanum.nouns(string)

['동탄목욕탕수건도둑엄준식']

In [None]:
kkma.nouns(string)

['동', '목욕탕', '목욕탕수건도둑', '수건', '도둑', '식']

In [None]:
komoran.nouns(string)

['동탄', '목욕탕', '수건', '도둑', '엄준', '식']

In [None]:
mecab.nouns(string)

['동탄', '목욕탕', '수건', '도둑', '엄준']

In [None]:
okt.nouns(string)

['동탄', '목욕탕', '수건', '도둑', '엄준']

## pos

In [None]:
hannanum.pos(string)

[('동탄목욕탕수건도둑엄준식', 'N')]

In [None]:
hannanum.tagset
# Inspect the tag set Hannanum uses

{'E': '어미',
 'EC': '연결 어미',
 'EF': '종결 어미',
 'EP': '선어말어미',
 'ET': '전성 어미',
 'F': '외국어',
 'I': '독립언',
 'II': '감탄사',
 'J': '관계언',
 'JC': '격조사',
 'JP': '서술격 조사',
 'JX': '보조사',
 'M': '수식언',
 'MA': '부사',
 'MM': '관형사',
 'N': '체언',
 'NB': '의존명사',
 'NC': '보통명사',
 'NN': '수사',
 'NP': '대명사',
 'NQ': '고유명사',
 'P': '용언',
 'PA': '형용사',
 'PV': '동사',
 'PX': '보조 용언',
 'S': '기호',
 'X': '접사',
 'XP': '접두사',
 'XS': '접미사'}

In [None]:
kkma.pos(string)

[('동', 'NNG'),
 ('타', 'VV'),
 ('ㄴ', 'ETD'),
 ('목욕탕', 'NNG'),
 ('수건', 'NNG'),
 ('도둑', 'NNG'),
 ('어', 'VV'),
 ('ㅁ', 'ETN'),
 ('줄', 'VA'),
 ('ㄴ', 'ETD'),
 ('식', 'NNB')]

In [None]:
kkma.tagset
# Inspect the tag set Kkma uses

{'EC': '연결 어미',
 'ECD': '의존적 연결 어미',
 'ECE': '대등 연결 어미',
 'ECS': '보조적 연결 어미',
 'EF': '종결 어미',
 'EFA': '청유형 종결 어미',
 'EFI': '감탄형 종결 어미',
 'EFN': '평서형 종결 어미',
 'EFO': '명령형 종결 어미',
 'EFQ': '의문형 종결 어미',
 'EFR': '존칭형 종결 어미',
 'EP': '선어말 어미',
 'EPH': '존칭 선어말 어미',
 'EPP': '공손 선어말 어미',
 'EPT': '시제 선어말 어미',
 'ET': '전성 어미',
 'ETD': '관형형 전성 어미',
 'ETN': '명사형 전성 어미',
 'IC': '감탄사',
 'JC': '접속 조사',
 'JK': '조사',
 'JKC': '보격 조사',
 'JKG': '관형격 조사',
 'JKI': '호격 조사',
 'JKM': '부사격 조사',
 'JKO': '목적격 조사',
 'JKQ': '인용격 조사',
 'JKS': '주격 조사',
 'JX': '보조사',
 'MA': '부사',
 'MAC': '접속 부사',
 'MAG': '일반 부사',
 'MD': '관형사',
 'MDN': '수 관형사',
 'MDT': '일반 관형사',
 'NN': '명사',
 'NNB': '일반 의존 명사',
 'NNG': '보통명사',
 'NNM': '단위 의존 명사',
 'NNP': '고유명사',
 'NP': '대명사',
 'NR': '수사',
 'OH': '한자',
 'OL': '외국어',
 'ON': '숫자',
 'SE': '줄임표',
 'SF': '마침표, 물음표, 느낌표',
 'SO': '붙임표(물결,숨김,빠짐)',
 'SP': '쉼표,가운뎃점,콜론,빗금',
 'SS': '따옴표,괄호표,줄표',
 'SW': '기타기호 (논리수학기호,화폐기호)',
 'UN': '명사추정범주',
 'VA': '형용사',
 'VC': '지정사',
 'VCN': "부정 지정사, 형용사 '아니다'",
 'VC

In [None]:
komoran.pos(string)

[('동탄', 'NNP'),
 ('목욕탕', 'NNP'),
 ('수건', 'NNP'),
 ('도둑', 'NNP'),
 ('엄준', 'NNP'),
 ('식', 'NNB')]

In [None]:
komoran.tagset

{'EC': '연결 어미',
 'EF': '종결 어미',
 'EP': '선어말어미',
 'ETM': '관형형 전성 어미',
 'ETN': '명사형 전성 어미',
 'IC': '감탄사',
 'JC': '접속 조사',
 'JKB': '부사격 조사',
 'JKC': '보격 조사',
 'JKG': '관형격 조사',
 'JKO': '목적격 조사',
 'JKQ': '인용격 조사',
 'JKS': '주격 조사',
 'JKV': '호격 조사',
 'JX': '보조사',
 'MAG': '일반 부사',
 'MAJ': '접속 부사',
 'MM': '관형사',
 'NA': '분석불능범주',
 'NF': '명사추정범주',
 'NNB': '의존 명사',
 'NNG': '일반 명사',
 'NNP': '고유 명사',
 'NP': '대명사',
 'NR': '수사',
 'NV': '용언추정범주',
 'SE': '줄임표',
 'SF': '마침표, 물음표, 느낌표',
 'SH': '한자',
 'SL': '외국어',
 'SN': '숫자',
 'SO': '붙임표(물결,숨김,빠짐)',
 'SP': '쉼표,가운뎃점,콜론,빗금',
 'SS': '따옴표,괄호표,줄표',
 'SW': '기타기호 (논리수학기호,화폐기호)',
 'VA': '형용사',
 'VCN': '부정 지정사',
 'VCP': '긍정 지정사',
 'VV': '동사',
 'VX': '보조 용언',
 'XPN': '체언 접두사',
 'XR': '어근',
 'XSA': '형용사 파생 접미사',
 'XSN': '명사파생 접미사',
 'XSV': '동사 파생 접미사'}

In [None]:
mecab.pos(string)

[('동탄', 'NNP'),
 ('목욕탕', 'NNG'),
 ('수건', 'NNG'),
 ('도둑', 'NNG'),
 ('엄준', 'NNP'),
 ('식', 'XSN')]

In [None]:
mecab.tagset

{'EC': '연결 어미',
 'EF': '종결 어미',
 'EP': '선어말어미',
 'ETM': '관형형 전성 어미',
 'ETN': '명사형 전성 어미',
 'IC': '감탄사',
 'JC': '접속 조사',
 'JKB': '부사격 조사',
 'JKC': '보격 조사',
 'JKG': '관형격 조사',
 'JKO': '목적격 조사',
 'JKQ': '인용격 조사',
 'JKS': '주격 조사',
 'JKV': '호격 조사',
 'JX': '보조사',
 'MAG': '일반 부사',
 'MAJ': '접속 부사',
 'MM': '관형사',
 'NNB': '의존 명사',
 'NNBC': '단위를 나타내는 명사',
 'NNG': '일반 명사',
 'NNP': '고유 명사',
 'NP': '대명사',
 'NR': '수사',
 'SC': '구분자 , · / :',
 'SE': '줄임표 …',
 'SF': '마침표, 물음표, 느낌표',
 'SH': '한자',
 'SL': '외국어',
 'SN': '숫자',
 'SSC': '닫는 괄호 ), ]',
 'SSO': '여는 괄호 (, [',
 'SY': '기타 기호',
 'VA': '형용사',
 'VCN': '부정 지정사',
 'VCP': '긍정 지정사',
 'VV': '동사',
 'VX': '보조 용언',
 'XPN': '체언 접두사',
 'XR': '어근',
 'XSA': '형용사 파생 접미사',
 'XSN': '명사파생 접미사',
 'XSV': '동사 파생 접미사'}

In [None]:
okt.pos(string)

[('동탄', 'Noun'),
 ('목욕탕', 'Noun'),
 ('수건', 'Noun'),
 ('도둑', 'Noun'),
 ('엄준', 'Noun'),
 ('식', 'Suffix')]

In [None]:
okt.tagset

{'Adjective': '형용사',
 'Adverb': '부사',
 'Alpha': '알파벳',
 'Conjunction': '접속사',
 'Determiner': '관형사',
 'Eomi': '어미',
 'Exclamation': '감탄사',
 'Foreign': '외국어, 한자 및 기타기호',
 'Hashtag': '트위터 해쉬태그',
 'Josa': '조사',
 'KoreanParticle': '(ex: ㅋㅋ)',
 'Noun': '명사',
 'Number': '숫자',
 'PreEomi': '선어말어미',
 'Punctuation': '구두점',
 'ScreenName': '트위터 아이디',
 'Suffix': '접미사',
 'Unknown': '미등록어',
 'Verb': '동사'}

# SoyNLP

In [None]:
!pip install git+https://github.com/lovit/soynlp.git

Collecting git+https://github.com/lovit/soynlp.git
  Cloning https://github.com/lovit/soynlp.git to /tmp/pip-req-build-16kjggm0
  Running command git clone -q https://github.com/lovit/soynlp.git /tmp/pip-req-build-16kjggm0
Building wheels for collected packages: soynlp
  Building wheel for soynlp (setup.py) ... [?25l[?25hdone
  Created wheel for soynlp: filename=soynlp-0.0.493-cp37-none-any.whl size=420197 sha256=c2aadd5566f651e2c2eac091608e0d82c97117b7068a7982136f2fc49b7260da
  Stored in directory: /tmp/pip-ephem-wheel-cache-e9u5otfi/wheels/0c/bf/1a/ea7353ec5c14dd9e3a6a1ab01c4fdc1b474998469ba9229b7d
Successfully built soynlp
Installing collected packages: soynlp
Successfully installed soynlp-0.0.493


In [None]:
!wget https://github.com/lovit/soynlp/raw/master/tutorials/2016-10-20.txt

--2021-04-22 12:04:30--  https://github.com/lovit/soynlp/raw/master/tutorials/2016-10-20.txt
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt [following]
--2021-04-22 12:04:30--  https://raw.githubusercontent.com/lovit/soynlp/master/tutorials/2016-10-20.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 43694449 (42M) [text/plain]
Saving to: ‘2016-10-20.txt’


2021-04-22 12:04:32 (132 MB/s) - ‘2016-10-20.txt’ saved [43694449/43694449]



In [None]:
# Read the news corpus into a list of non-empty, stripped lines, echoing every
# line that contains "수건" along the way. Swap in another keyword (e.g.
# '아이오아이') to probe the corpus for a different term.
# The file is Korean text, so decode it explicitly as UTF-8 instead of relying
# on the platform's default encoding.
corpus = []
with open("./2016-10-20.txt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            corpus.append(line)
            if "수건" in line:
                print(line)

민교협의 정치시평 두 개의 존엄  김진해 경희대 후마니타스 칼리지 교수  궁금했어요 하지 않아도 될 말을 유족들에게 굳이 남긴 이유가 뭘까 제 이름으로 진단서가 나가지만 사망 원인에 대해서는 저에게 권한이 없습니다 부원장과 주치의가 협의한 대로 써야 합니다  더 궁금했어요 잠수 를 타기로 마음먹고 남긴 메모에 진실만을 깨달으려 하세요 라고 적은 이유가 뭘까 아무 말 없이 사라졌어도 이상하지 않았을 텐데 당신은 굳이 흔적을 남겼더군요  어쩌면 317일 동안 당신도 백남기 어른의 가장 가까운 곳에서 그분의 사투를 지켜보았을 거고 눈물 흘리는 가족들에게 말없이 응원의 눈길을 주었을 겁니다 그러니 당신이 그 말도 안 되는 상황에서 진실만을 보라 고 하지 않았을까 싶어요  나는 당신이 남긴 흔적을 보며 존엄성에 대해 얘기해 보고 싶었어요 하나의 존엄성은 다른 분들이 많이 애도하는 고 백남기 어른과 그 가족들의 존엄성입니다 존엄성이란 게 정의하는 게 어려운 일이라 도리어 각각의 상황을 갖고 생각하는 게 좀 더 나을 거 같군요 알다시피 지금 백남기 어른과 가족의 존엄성은 완전히 짓밟혔습니다 모든 평범한 사람들은 돌아가신 아버지를 고이 보내드리고 싶어합니다 잠시만 상상해 봐도 그 동안 백남기 어른의 부인이 어떻게 병상을 지켰을지 두 딸 도라지와 민주화 씨는 어땠을지 당신은 가까이에서 직접 보았겠지요 무탈하게 농사짓던 사람이 물대포를 맞고 순식간에 죽음의 몸으로 바뀌었을 때 그분의 부인과 자식들이 어떤 마음으로 몇 백 개의 밤과 낮을 보냈는지 의식 없는 아버지의 손을 주무르고 젖은 수건으로 머리카락과 얼굴을 조심스레 닦아주고는 이내 어두운 병실 복도 구석으로 물러나와 초점 잃고 앉아있었을 딸들을요 기자회견장 말구요 집회 현장의 단상 말구요 아버지를 잃었는데도 인간의 입에서 나올 수 없는 말로 조롱당할 때에도 인간이기를 포기하지 말라 고 눈물로 타이르는 모습을요 당신이 어디에 있건 이 분들의 짓밟힌 존엄성을 잘 알고 있을 겁니다  다른 하나는 바로 당신의 존엄성입니다 지금 당신

## LRNounExtractor

In [None]:
from soynlp.noun import LRNounExtractor
from soynlp.noun import NewsNounExtractor

In [None]:
# Author's note: deep-learning approaches do better than this!
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(corpus)
nouns

[Noun Extractor] used default noun predictor; Sejong corpus predictor
[Noun Extractor] used noun_predictor_sejong
[Noun Extractor] All 2398 r features was loaded
[Noun Extractor] scanning was done (L,R) has (160030, 81637) tokens
[Noun Extractor] building L-R graph was done
[Noun Extractor] 26634 nouns are extracted


{'자력': NounScore_v1(frequency=19, score=0.9996493333333334, known_r_ratio=1.0),
 '신축': NounScore_v1(frequency=112, score=0.9526965909090908, known_r_ratio=0.30985915492957744),
 '선배': NounScore_v1(frequency=217, score=0.7094869618320611, known_r_ratio=0.3898809523809524),
 '수로': NounScore_v1(frequency=15, score=0.999707, known_r_ratio=0.6666666666666666),
 '쎄씨': NounScore_v1(frequency=12, score=0.967547, known_r_ratio=1.0),
 '유지': NounScore_v1(frequency=2115, score=0.9904441107465135, known_r_ratio=0.7277611940298507),
 '입법': NounScore_v1(frequency=77, score=0.8071580555555555, known_r_ratio=0.16981132075471697),
 '손해': NounScore_v1(frequency=223, score=0.8958285801526719, known_r_ratio=0.42810457516339867),
 '상시': NounScore_v1(frequency=253, score=0.99883, known_r_ratio=0.056338028169014086),
 '정신': NounScore_v1(frequency=380, score=0.7075274236641221, known_r_ratio=0.5550847457627118),
 '억양': NounScore_v1(frequency=16, score=0.9096055454545454, known_r_ratio=1.0),
 '비롯': NounScore_v1

In [None]:
# "뉴스" is not among the nouns LRNounExtractor returned, so plain indexing
# raises KeyError and stops Restart & Run All at this cell. A membership test
# shows the same fact without raising.
"뉴스" in nouns

KeyError: ignored

In [None]:
nouns["도둑"]
# Appeared 40 times in the corpus (frequency=40)

NounScore_v1(frequency=40, score=0.8191917741935484, known_r_ratio=0.7380952380952381)

In [None]:
# Extract nouns with NewsNounExtractor, trained on the same corpus as the
# LRNounExtractor run above. Train once: a paste-garbled duplicate line used to
# run the whole extraction a second time for an identical result.
noun_extractor = NewsNounExtractor()
nouns = noun_extractor.train_extract(corpus)
nouns

used default noun predictor; Sejong corpus based logistic predictor
/usr/local/lib/python3.7/dist-packages/soynlp
local variable 'f' referenced before assignment
local variable 'f' referenced before assignment
scan vocabulary ... 
done (Lset, Rset, Eojeol) = (658116, 363342, 403882)
predicting noun score was done                                        
before postprocessing 237871
_noun_scores_ 50196
checking hardrules ... done
after postprocessing 36027
extracted 2365 compounds from eojeols

{'독자제공': NewsNounScore(score=0, frequency=26, feature_proportion=0, eojeol_proportion=1.0, n_positive_feature=0, unique_positive_feature_proportion=0),
 '총격현장': NewsNounScore(score=0, frequency=5, feature_proportion=0, eojeol_proportion=1.0, n_positive_feature=0, unique_positive_feature_proportion=0),
 '연합뉴스자료사진': NewsNounScore(score=0, frequency=40, feature_proportion=0, eojeol_proportion=1.0, n_positive_feature=0, unique_positive_feature_proportion=0),
 '원전사태': NewsNounScore(score=0, frequency=4, feature_proportion=0, eojeol_proportion=1.0, n_positive_feature=0, unique_positive_feature_proportion=0),
 '자료사진': NewsNounScore(score=0, frequency=377, feature_proportion=0.0, eojeol_proportion=0.9973474801061007, n_positive_feature=0, unique_positive_feature_proportion=0),
 '군사기술': NewsNounScore(score=0, frequency=5, feature_proportion=0, eojeol_proportion=1.0, n_positive_feature=0, unique_positive_feature_proportion=0),
 '잠수함발사탄도미사일': NewsNounScore(score=0, frequency=18, feature_proportio

In [None]:
nouns["뉴스"]

In [None]:
nouns["도둑"]

## Noun Extractor ver 2

In [None]:
from soynlp.utils import DoublespaceLineCorpus
from soynlp.noun import LRNounExtractor_v2

In [None]:
# Stream the corpus file through DoublespaceLineCorpus; with iter_sent=True it
# presumably yields one sentence at a time rather than one line — TODO confirm
sents = DoublespaceLineCorpus("./2016-10-20.txt", iter_sent=True)

# Train LRNounExtractor_v2 on those sentences and display the extracted nouns
noun_extractor = LRNounExtractor_v2(verbose=True)
nouns = noun_extractor.train_extract(sents)
nouns

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 403896 from 223357 sents. mem=2.514 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=4434442, mem=2.568 Gb
[Noun Extractor] batch prediction was completed for 119705 words
[Noun Extractor] checked compounds. discovered 70639 compounds
[Noun Extractor] postprocessing detaching_features : 109312 -> 92205
[Noun Extractor] postprocessing ignore_features : 92205 -> 91999
[Noun Extractor] postprocessing ignore_NJ : 91999 -> 90643
[Noun Extractor] 90643 nouns (70639 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=2.712 Gb                    
[Noun Extractor] 76.63 % eojeols are covered


{'남북보건의료교육재단': NounScore(frequency=19, score=1.0),
 '대전사이언스페스티벌': NounScore(frequency=14, score=1.0),
 '상호출자제한기업집단': NounScore(frequency=66, score=1.0),
 '부산섬유패션정책포럼': NounScore(frequency=16, score=1.0),
 '공주시립박찬호야구장': NounScore(frequency=5, score=1.0),
 '서울남부고용노동지청': NounScore(frequency=3, score=1.0),
 '한국과학기술연구원장': NounScore(frequency=10, score=1.0),
 '미래창조과학부장관상': NounScore(frequency=3, score=1.0),
 '한국가구수출협동조합': NounScore(frequency=2, score=1.0),
 '대통령연설기획비서관': NounScore(frequency=4, score=1.0),
 '충북도선거관리위원회': NounScore(frequency=4, score=1.0),
 '새마을운동글로벌리그': NounScore(frequency=11, score=1.0),
 '경기창조경제혁신센터': NounScore(frequency=41, score=1.0),
 '전국새마을지도자대회': NounScore(frequency=25, score=1.0),
 '경기도교육행정협의회': NounScore(frequency=15, score=1.0),
 '현대어린이자연학습원': NounScore(frequency=13, score=1.0),
 '대한여성과학기술인회': NounScore(frequency=13, score=1.0),
 '위치추적중앙관제센터': NounScore(frequency=14, score=1.0),
 '광명장애인종합복지관': NounScore(frequency=11, score=1.0),
 '경남창조경제혁신센터': NounScore(frequency=4,

In [None]:
nouns["뉴스"]

NewsNounScore(score=0.4397700934065934, frequency=11340, feature_proportion=0.025440313111545987, eojeol_proportion=0.3691358024691358, n_positive_feature=15, unique_positive_feature_proportion=0.8333333333333334)

In [None]:
nouns["아이오아이"]
# A candidate counts as a noun when its score exceeds a threshold;
# what is shown here is the score — the threshold itself is not stated explicitly.
# NOTE(review): the saved output is a NewsNounScore although, in cell order, `nouns`
# was last assigned by LRNounExtractor_v2 — looks like out-of-order execution; re-run to confirm

NewsNounScore(score=0.5472699319727891, frequency=270, feature_proportion=1.0, eojeol_proportion=0.45555555555555555, n_positive_feature=14, unique_positive_feature_proportion=0.9333333333333333)

## Word Extraction

In [None]:
from soynlp.word import WordExtractor

In [None]:
# Train a WordExtractor on the news corpus and extract per-word statistics
# (cohesion, branching entropy, accessor variety) for words passing the filters below
word_extractor = WordExtractor(min_frequency=100,
    min_cohesion_forward=0.05, 
    min_right_branching_entropy=0.0
)
word_extractor.train(corpus) # list of str or like
words = word_extractor.extract()
words

training was done. used memory 3.408 Gb
all cohesion probabilities was computed. # words = 16942
all branching entropies was computed # words = 356080
all accessor variety was computed # words = 356080


{'쉬': Scores(cohesion_forward=0, cohesion_backward=0, left_branching_entropy=3.0967928563209224, right_branching_entropy=1.9700126304991492, left_accessor_variety=45, right_accessor_variety=23, leftside_frequency=452, rightside_frequency=169),
 '츠': Scores(cohesion_forward=0, cohesion_backward=0, left_branching_entropy=0.5013016219850168, right_branching_entropy=3.679030434386326, left_accessor_variety=2, right_accessor_variety=150, leftside_frequency=0, rightside_frequency=3208),
 '낮': Scores(cohesion_forward=0, cohesion_backward=0, left_branching_entropy=3.4135084192572367, right_branching_entropy=1.7549125011776283, left_accessor_variety=84, right_accessor_variety=14, leftside_frequency=1877, rightside_frequency=0),
 '농': Scores(cohesion_forward=0, cohesion_backward=0, left_branching_entropy=4.346505365239051, right_branching_entropy=2.3163335407155334, left_accessor_variety=181, right_accessor_variety=32, leftside_frequency=3090, rightside_frequency=159),
 '굉': Scores(cohesion_forw

In [None]:
len(words)

9048

In [None]:
words['아이오아이']

Scores(cohesion_forward=0.30063636035733476, cohesion_backward=0, left_branching_entropy=3.052466601803942, right_branching_entropy=2.7566594458546225, left_accessor_variety=32, right_accessor_variety=22, leftside_frequency=270, rightside_frequency=0)

In [None]:
def word_score(score):
    """Combine forward cohesion with right branching entropy into one ranking value.

    Args:
        score: object exposing ``cohesion_forward`` and
            ``right_branching_entropy`` attributes.

    Returns:
        ``cohesion_forward * exp(right_branching_entropy)``.
    """
    cohesion = score.cohesion_forward
    entropy_factor = math.exp(score.right_branching_entropy)
    return cohesion * entropy_factor

# Print the 30 highest-scoring words with the values that feed word_score.
# The header previously misspelled "score" as "socre".
print('단어   (빈도수, cohesion, branching entropy, score)\n')
top_words = sorted(words.items(), key=lambda x: word_score(x[1]), reverse=True)[:30]
for word, score in top_words:
    print('%s     (%d, %.3f, %.3f, %.3f)' % (
        word,
        score.leftside_frequency,
        score.cohesion_forward,
        score.right_branching_entropy,
        word_score(score),
    ))

단어   (빈도수, cohesion, branching entropy, socre)

으로     (1634, 0.953, 5.334, 197.507)
까지     (654, 0.691, 5.347, 145.155)
함께     (7946, 0.912, 5.053, 142.587)
통해     (8471, 0.578, 5.278, 113.310)
된다     (2681, 0.982, 4.733, 111.584)
에서     (7494, 0.604, 5.187, 108.020)
먼저     (1112, 0.903, 4.666, 96.025)
면서     (1944, 0.458, 5.337, 95.291)
밝혔다     (8360, 0.836, 4.580, 81.505)
했다     (7070, 0.689, 4.768, 81.073)
됐다     (2219, 0.750, 4.668, 79.865)
또한     (2180, 0.440, 5.086, 71.244)
같은     (4429, 0.568, 4.832, 71.184)
였다     (211, 0.632, 4.724, 71.170)
됩니다     (247, 0.967, 4.275, 69.497)
새로운     (2334, 0.578, 4.784, 69.071)
관계자는     (2942, 0.501, 4.860, 64.636)
덧붙였다     (1093, 0.928, 4.168, 59.938)
예정이다     (3586, 0.607, 4.591, 59.899)
말했다     (8345, 0.706, 4.429, 59.232)
금지     (19959, 0.743, 4.378, 59.206)
때문에     (4742, 0.696, 4.438, 58.897)
과정에서     (990, 0.497, 4.738, 56.793)
위해     (8888, 0.367, 5.016, 55.333)
따라     (3669, 0.366, 4.975, 53.038)
냈다     (340, 0.659, 4.386, 52.925)
따

In [None]:
cohesion_scores = word_extractor.all_cohesion_scores()
cohesion_scores['아이오아이'] # (cohesion_forward, cohesion_backward)

 cohesion probabilities ... (1 in 17876)all cohesion probabilities was computed. # words = 16942


(0.30063636035733476, 0)

In [None]:
branching_entropy = word_extractor.all_branching_entropy()
branching_entropy['아이오아이'] # (left_branching_entropy, right_branching_entropy)

all branching entropies was computed # words = 356080


(3.052466601803942, 2.7566594458546225)

In [None]:
# Reuse the branching_entropy mapping computed in the previous cell instead of
# recomputing all entropies; only the looked-up key differs (a prefix of '아이오아이').
branching_entropy['아이오아'] # (left_branching_entropy, right_branching_entropy)

all branching entropies was computed # words = 356080


(3.052466601803942, -0.0)

In [None]:
accessor_variety = word_extractor.all_accessor_variety()
accessor_variety['아이오아이'] # (left_accessor_variety, right_accessor_variety)

all accessor variety was computed # words = 356080


(32, 22)

## Tokenizer

### LTokenizer

In [None]:
from soynlp.tokenizer import LTokenizer

In [None]:
# Hand-written word scores fed to an LTokenizer
scores = {'데이':0.5, '데이터':0.5, '데이터마이닝':0.5, '공부':0.5, '공부중':0.45}
tokenizer = LTokenizer(scores=scores)

# Same sentence with and without the space, to compare the tokenizer's output
print(tokenizer.tokenize("데이터마이닝을 공부한다"))
print(tokenizer.tokenize("데이터마이닝을공부한다"))

['데이터마이닝', '을', '공부', '한다']
['데이터마이닝', '을공부한다']


In [None]:
from soynlp.word import WordExtractor

# Train the WordExtractor
# NOTE(review): same settings and corpus as the WordExtractor cell in the "Word Extraction" section,
# so this retrains an identical extractor
word_extractor = WordExtractor(min_frequency=100,
    min_cohesion_forward=0.05, 
    min_right_branching_entropy=0.0
)
word_extractor.train(corpus) # list of str or like
words = word_extractor.extract()

training was done. used memory 3.413 Gb
all cohesion probabilities was computed. # words = 16942
all branching entropies was computed # words = 356080
all accessor variety was computed # words = 356080


In [None]:
# WordExtractor score
cohesion_score = {word:score.cohesion_forward for word, score in words.items()}
cohesion_score

{'쉬': 0,
 '츠': 0,
 '낮': 0,
 '농': 0,
 '굉': 0,
 '립': 0,
 '닝': 0,
 '웅': 0,
 '앞': 0,
 '둔': 0,
 '퇴': 0,
 '엠': 0,
 '삼': 0,
 '외': 0,
 '맞': 0,
 '토': 0,
 '려': 0,
 '급': 0,
 '죽': 0,
 '뿜': 0,
 '낯': 0,
 '내': 0,
 '젠': 0,
 '형': 0,
 '속': 0,
 '괜': 0,
 '표': 0,
 '첨': 0,
 '병': 0,
 '캘': 0,
 '묶': 0,
 '틸': 0,
 '런': 0,
 '싱': 0,
 '닉': 0,
 '럽': 0,
 '겉': 0,
 '갈': 0,
 '쏟': 0,
 '칼': 0,
 '쿄': 0,
 '린': 0,
 '흑': 0,
 '억': 0,
 '봤': 0,
 '흘': 0,
 '뱅': 0,
 '돈': 0,
 '멈': 0,
 '태': 0,
 '낸': 0,
 '오': 0,
 '샘': 0,
 '켜': 0,
 '쾌': 0,
 '셀': 0,
 '정': 0,
 '폰': 0,
 '위': 0,
 '트': 0,
 '구': 0,
 '둘': 0,
 '념': 0,
 '적': 0,
 '샀': 0,
 '좀': 0,
 '샵': 0,
 '겪': 0,
 '걱': 0,
 '친': 0,
 '몽': 0,
 '입': 0,
 '갚': 0,
 '촉': 0,
 '범': 0,
 '칙': 0,
 '컨': 0,
 '류': 0,
 '메': 0,
 '찢': 0,
 '성': 0,
 '평': 0,
 '엄': 0,
 '웰': 0,
 '힌': 0,
 '라': 0,
 '격': 0,
 '끼': 0,
 '얼': 0,
 '델': 0,
 '풀': 0,
 '앨': 0,
 '받': 0,
 '샌': 0,
 '넣': 0,
 '섬': 0,
 '연': 0,
 '깨': 0,
 '밑': 0,
 '겸': 0,
 '풍': 0,
 '밖': 0,
 '부': 0,
 '중': 0,
 '원': 0,
 '섞': 0,
 '웃': 0,
 '너': 0,
 '벼': 0,
 '랄': 0,
 '건': 0,
 

In [None]:
tokenizer = LTokenizer(scores=cohesion_score)

print(tokenizer.tokenize("데이터마이닝을 공부한다"))
print(tokenizer.tokenize("데이터마이닝을공부한다"))

['데이터', '마이닝을', '공부한다']
['데이터', '마이닝을공부한다']


In [None]:
from soynlp.noun import LRNounExtractor_v2

# Train LRNounExtractor_v2 on the corpus and extract nouns in a single call.
# The result is kept in `nouns` for the scoring cell that follows.
noun_extractor = LRNounExtractor_v2()
nouns = noun_extractor.train_extract(corpus) # corpus: list of str (or similar)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 403896 from 30013 sents. mem=3.687 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=4434442, mem=3.685 Gb
[Noun Extractor] batch prediction was completed for 119705 words
[Noun Extractor] checked compounds. discovered 70639 compounds
[Noun Extractor] postprocessing detaching_features : 109312 -> 92205
[Noun Extractor] postprocessing ignore_features : 92205 -> 91999
[Noun Extractor] postprocessing ignore_NJ : 91999 -> 90643
[Noun Extractor] 90643 nouns (70639 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=3.776 Gb                    
[Noun Extractor] 76.63 % eojeols are covered


In [None]:
# Noun score for every noun returned by the noun extractor.
noun_scores = {noun: stats.score for noun, stats in nouns.items()}

# Author's open note: build the scores first, then tokenize with them?

# Display only a bounded preview (first 20 entries, in insertion order).
# Echoing the whole mapping floods the cell output; the full dict remains
# available as `noun_scores` for the cells below.
dict(list(noun_scores.items())[:20])

{'남북보건의료교육재단': 1.0,
 '대전사이언스페스티벌': 1.0,
 '상호출자제한기업집단': 1.0,
 '부산섬유패션정책포럼': 1.0,
 '공주시립박찬호야구장': 1.0,
 '서울남부고용노동지청': 1.0,
 '한국과학기술연구원장': 1.0,
 '미래창조과학부장관상': 1.0,
 '한국가구수출협동조합': 1.0,
 '대통령연설기획비서관': 1.0,
 '충북도선거관리위원회': 1.0,
 '새마을운동글로벌리그': 1.0,
 '경기창조경제혁신센터': 1.0,
 '전국새마을지도자대회': 1.0,
 '경기도교육행정협의회': 1.0,
 '현대어린이자연학습원': 1.0,
 '대한여성과학기술인회': 1.0,
 '위치추적중앙관제센터': 1.0,
 '광명장애인종합복지관': 1.0,
 '경남창조경제혁신센터': 1.0,
 '한국지역대학연합회의': 1.0,
 '서울창조경제혁신센터': 0.5,
 '채무보증제한기업집단': 1.0,
 '제주도립김창열미술관': 1.0,
 '한국원자력안전기술원': 1.0,
 '세종푸드트럭페스티벌': 1.0,
 '부산울산경남지역본부': 1.0,
 '달성산업단지관리공단': 1.0,
 '리틀도쿄마켓플레이스': 1.0,
 '광주창조경제혁신센터': 1.0,
 '513억5000만원': 1.0,
 '101억5000만원': 1.0,
 '전세시장소비심리지수': 1.0,
 '고고도미사일방어체계': 1.0,
 '도시재생문화클러스터': 1.0,
 '강남순환도시고속도로': 1.0,
 '국립줄기세포재생센터': 1.0,
 '울진범군민대책위원회': 1.0,
 '111조1817억원': 1.0,
 '아우디폭스바겐코리아': 1.0,
 '도서관정보정책위원회': 1.0,
 '전주시사회복지관협회': 1.0,
 '다우존스산업평균지수': 1.0,
 '마이크로소프트연구소': 1.0,
 '고덕상업업무복합단지': 1.0,
 '전국이동통신유통협회': 1.0,
 '한국시각장애인연합회': 1.0,
 '문경시개발자문위원회': 1.0,
 '경기도노인복지관협회': 1.0,
 '사회복지공동모금회장': 1.0,


In [None]:
# Combine both signals. Every noun gets its noun score plus its cohesion
# score; a noun without a cohesion score contributes 0 for that part.
combined_scores = {
    noun: noun_score + cohesion_score.get(noun, 0)
    for noun, noun_score in noun_scores.items()
}

# Words that only have a cohesion score are appended with that score as-is.
combined_scores.update(
    {
        subword: cohesion
        for subword, cohesion in cohesion_score.items()
        if subword not in combined_scores
    }
)

# Display only a bounded preview (first 20 entries, in insertion order).
# Echoing the whole mapping floods the cell output; the full dict remains
# available as `combined_scores` for the cells below.
dict(list(combined_scores.items())[:20])

{'남북보건의료교육재단': 1.0,
 '대전사이언스페스티벌': 1.0,
 '상호출자제한기업집단': 1.0,
 '부산섬유패션정책포럼': 1.0,
 '공주시립박찬호야구장': 1.0,
 '서울남부고용노동지청': 1.0,
 '한국과학기술연구원장': 1.0,
 '미래창조과학부장관상': 1.0,
 '한국가구수출협동조합': 1.0,
 '대통령연설기획비서관': 1.0,
 '충북도선거관리위원회': 1.0,
 '새마을운동글로벌리그': 1.0,
 '경기창조경제혁신센터': 1.0,
 '전국새마을지도자대회': 1.0,
 '경기도교육행정협의회': 1.0,
 '현대어린이자연학습원': 1.0,
 '대한여성과학기술인회': 1.0,
 '위치추적중앙관제센터': 1.0,
 '광명장애인종합복지관': 1.0,
 '경남창조경제혁신센터': 1.0,
 '한국지역대학연합회의': 1.0,
 '서울창조경제혁신센터': 0.5,
 '채무보증제한기업집단': 1.0,
 '제주도립김창열미술관': 1.0,
 '한국원자력안전기술원': 1.0,
 '세종푸드트럭페스티벌': 1.0,
 '부산울산경남지역본부': 1.0,
 '달성산업단지관리공단': 1.0,
 '리틀도쿄마켓플레이스': 1.0,
 '광주창조경제혁신센터': 1.0,
 '513억5000만원': 1.0,
 '101억5000만원': 1.0,
 '전세시장소비심리지수': 1.0,
 '고고도미사일방어체계': 1.0,
 '도시재생문화클러스터': 1.0,
 '강남순환도시고속도로': 1.0,
 '국립줄기세포재생센터': 1.0,
 '울진범군민대책위원회': 1.0,
 '111조1817억원': 1.0,
 '아우디폭스바겐코리아': 1.5444771680686054,
 '도서관정보정책위원회': 1.0,
 '전주시사회복지관협회': 1.0,
 '다우존스산업평균지수': 1.0,
 '마이크로소프트연구소': 1.0,
 '고덕상업업무복합단지': 1.0,
 '전국이동통신유통협회': 1.0,
 '한국시각장애인연합회': 1.0,
 '문경시개발자문위원회': 1.0,
 '경기도노인복지관협회': 1.0,
 '사회복

In [None]:
# LTokenizer driven by the combined (noun + cohesion) scores, tried on the
# same sentence with and without the space between the two eojeols.
tokenizer = LTokenizer(scores=combined_scores)

for sample_sentence in ("데이터마이닝을 공부한다", "데이터마이닝을공부한다"):
    print(tokenizer.tokenize(sample_sentence))

['데이터', '마이닝을', '공부', '한다']
['데이터', '마이닝을공부한다']


### MaxScoreTokenizer

In [None]:
from soynlp.tokenizer import MaxScoreTokenizer

In [None]:
# Hand-written toy scores, fed to MaxScoreTokenizer.
scores = {
    '데이': 0.5,
    '데이터': 0.5,
    '데이터마이닝': 0.5,
    '공부': 0.5,
    '공부중': 0.45,
}
tokenizer = MaxScoreTokenizer(scores=scores)

# Same sentence with and without the space between the two eojeols.
for sample_sentence in ("데이터마이닝을 공부한다", "데이터마이닝을공부한다"):
    print(tokenizer.tokenize(sample_sentence))

['데이터마이닝', '을', '공부', '한다']
['데이터마이닝', '을', '공부', '한다']


In [None]:
# MaxScoreTokenizer driven by the learned cohesion scores, tried on the same
# sentence with and without the space between the two eojeols.
tokenizer = MaxScoreTokenizer(scores=cohesion_score)

for sample_sentence in ("데이터마이닝을 공부한다", "데이터마이닝을공부한다"):
    print(tokenizer.tokenize(sample_sentence))

['데이터', '마이', '닝을', '공부', '한다']
['데이터', '마이', '닝을공부', '한다']


In [None]:
# MaxScoreTokenizer driven by the combined (noun + cohesion) scores, tried on
# the same sentence with and without the space between the two eojeols.
tokenizer = MaxScoreTokenizer(scores=combined_scores)

for sample_sentence in ("데이터마이닝을 공부한다", "데이터마이닝을공부한다"):
    print(tokenizer.tokenize(sample_sentence))

['데이터', '마이', '닝을', '공부', '한다']
['데이터', '마이', '닝을', '공부', '한다']


### RegexTokenizer

In [None]:
from soynlp.tokenizer import RegexTokenizer

In [None]:
# Author's notes: RegexTokenizer splits by rules, at points where the form of
# the text changes. In the recorded output an unbroken run of Hangul stays a
# single token, while digits, Latin letters, jamo such as "ㅋㅋ" and symbols
# are separated from it.
tokenizer = RegexTokenizer()

for sample_sentence in (
    '이렇게연속된문장은잘리지않습니다만',
    '숫자123이영어abc에섞여있으면ㅋㅋ잘리겠죠',
    'ㅋㅋ루삥뽕빵뽕ㅋㅋㅋㅋ무야호~~',
):
    print(tokenizer.tokenize(sample_sentence))

['이렇게연속된문장은잘리지않습니다만']
['숫자', '123', '이영어', 'abc', '에섞여있으면', 'ㅋㅋ', '잘리겠죠']
['ㅋㅋ', '루삥뽕빵뽕', 'ㅋㅋㅋㅋ', '무야호', '~~']


## Part of Speech Tagger

In [None]:
from soynlp.postagger import Dictionary
from soynlp.postagger import LRTemplateMatcher
from soynlp.postagger import LREvaluator
from soynlp.postagger import SimpleTagger
from soynlp.postagger import UnknowLRPostprocessor

In [None]:
# Hand-written part-of-speech lexicon: tag name -> set of surface forms.
# Note that '이' is listed under both 'Noun' and 'Josa'.
pos_dict = {
    'Adverb': {
        '너무', '매우',
    },
    'Noun': {
        '너무너무너무', '아이오아이', '아이', '노래',
        '오', '이', '고양',
    },
    'Josa': {
        '는', '의', '이다', '입니다', '이',
        '이는', '를', '라', '라는',
    },
    'Verb': {
        '하는', '하다', '하고',
    },
    'Adjective': {
        '예쁜', '예쁘다',
    },
    'Exclamation': {
        '우와',
    },
}

In [None]:
# Assemble the SimpleTagger from its three collaborators.
# The evaluator and postprocessor take no arguments, so they are built first.
evaluator = LREvaluator()
postprocessor = UnknowLRPostprocessor()

# The template matcher is built on top of the hand-written POS dictionary.
dictionary = Dictionary(pos_dict)
generator = LRTemplateMatcher(dictionary)

tagger = SimpleTagger(generator, evaluator, postprocessor)

In [None]:
# Alternative input kept for reference; apart from the trailing "!!" it is
# made up entirely of entries from pos_dict:
# tagger.tag("너무너무너무는아이오아이의노래입니다!!")
# Slang input whose eojeols are not pos_dict entries. The recorded result of
# this call is an empty list, presumably because nothing matches the
# dictionary (TODO: confirm against SimpleTagger's handling of unknown words).
tagger.tag('ㅋㅋ루삥봉 역시동탄 도둑수건엄준식')

[]