In [1]:
import fast_hangle_levenshtein

In [2]:
# Display the package's title string.
fast_hangle_levenshtein.__title__

'빠른 한글 수정 거리 검색을 위한 inverted index '

In [3]:
# Display the installed package version; the outputs in this notebook were produced with it.
fast_hangle_levenshtein.__version__

'0.0.2'

## Toy example

In [4]:
from fast_hangle_levenshtein import LevenshteinIndex
# Create the index with no words yet; they are added by indexing() in the next cell.
# With verbose enabled, each search prints a summary line
# (query, candidate counts, elapsed time), as the search cells below show.
indexer = LevenshteinIndex(verbose=True)

In [5]:
# Index eight toy words, obtained by whitespace-splitting a single string.
indexer.indexing('아이고 어이고 아이고야 아이고야야야야 어이구야 지화자 징화자 쟝화장'.split())

In [6]:
# Syllable-level inverted index: each syllable maps to the set of indexed words containing it.
indexer._index

{'고': {'아이고', '아이고야', '아이고야야야야', '어이고'},
 '구': {'어이구야'},
 '아': {'아이고', '아이고야', '아이고야야야야'},
 '야': {'아이고야', '아이고야야야야', '어이구야'},
 '어': {'어이고', '어이구야'},
 '이': {'아이고', '아이고야', '아이고야야야야', '어이고', '어이구야'},
 '자': {'지화자', '징화자'},
 '장': {'쟝화장'},
 '쟝': {'쟝화장'},
 '지': {'지화자'},
 '징': {'징화자'},
 '화': {'쟝화장', '지화자', '징화자'}}

In [7]:
# Inverted index keyed by initial consonant (choseong): each consonant maps to the
# set of indexed words that have at least one syllable starting with it.
indexer._cho_index

{'ㄱ': {'아이고', '아이고야', '아이고야야야야', '어이고', '어이구야'},
 'ㅇ': {'아이고', '아이고야', '아이고야야야야', '어이고', '어이구야'},
 'ㅈ': {'쟝화장', '지화자', '징화자'},
 'ㅎ': {'쟝화장', '지화자', '징화자'}}

In [8]:
# verbose was already enabled when the indexer was constructed; setting it again is harmless.
indexer.verbose = True
# Syllable-level edit-distance search. '아이코' differs from '아이고' in one syllable,
# so that word is returned with distance 1.
# NOTE(review): only distance <= 1 results appear, so the default cutoff looks like 1 — confirm.
indexer.levenshtein_search('아이코')

query=아이코, candidates=5 -> 2, time=0.000798 sec.


[('아이고', 1)]

In [9]:
# verbose was already enabled when the indexer was constructed; setting it again is harmless.
indexer.verbose = True
# Jamo-level search: distances come back in fractions of a syllable. In the output
# below, '아이고' (one differing jamo, ㅋ vs ㄱ) scores 1/3 and '어이고' scores 2/3.
indexer.jamo_levenshtein_search('아이코')

query=아이코, candidates=8 -> 3, time=0.00143 sec.


[('아이고', 0.3333333333333333), ('어이고', 0.6666666666666666)]

## Financial text example

In [10]:
import json

# Load the noun dictionary extracted from financial news.
# The file is read as UTF-8 explicitly so the Korean keys decode the same way
# regardless of the platform's default encoding.
# NOTE(review): values are presumably per-noun scores (judging by the name) — confirm.
with open('./data/nouns_from_financial_news.json', encoding='utf-8') as f:
    noun_scores = json.load(f)

# Number of distinct nouns loaded.
len(noun_scores)

132864

In [11]:
# Peek at the first ten nouns; iterating a dict yields its keys directly.
list(noun_scores)[:10]

['양식어가',
 '식품유통사',
 'ETN전담팀',
 '도로주행',
 '로우프라이스펀드',
 '국가브랜드',
 '대체부지',
 '한화솔라원',
 '박준영씨',
 '온라인마트']

In [12]:
# Build an index over the financial nouns by passing the dictionary to the constructor;
# the searches below run without a separate indexing() call.
# verbose is not passed here, so the search cells switch it on explicitly.
financial_word_indexer = LevenshteinIndex(noun_scores)

문자 집합(character set)을 기준으로 질의어의 글자가 등장하는 단어를 후보로 찾기 때문에, 검색 결과의 순서는 달라질 수 있습니다.

In [13]:
# Enable the per-search summary line (query, candidate counts, elapsed time).
financial_word_indexer.verbose = True
# Exact-match query: the word itself comes back with distance 0, plus neighbours at distance 1.
financial_word_indexer.levenshtein_search('분식회계')

query=분식회계, candidates=10137 -> 7, time=0.00606 sec.


[('분식회계', 0), ('분식회', 1), ('분식회계설', 1), ('분석회계', 1)]

In [14]:
# Enable the per-search summary line (already on if the previous cell was run).
financial_word_indexer.verbose = True
# Query with a stray trailing 'a': no exact match exists, so the closest words
# are returned at distance 1.
financial_word_indexer.levenshtein_search('분식회계a')

query=분식회계a, candidates=10451 -> 3, time=0.00534 sec.


[('분식회계설', 1), ('분식회계', 1)]

In [15]:
# Jamo-level search for a query whose last syllable has an extra final consonant ('곙' vs '계').
# The summary line is printed because verbose was switched on in an earlier cell.
# In the recorded output this search considers far more candidates than the
# syllable-level one and takes correspondingly longer.
financial_word_indexer.jamo_levenshtein_search('분식회곙')

query=분식회곙, candidates=129447 -> 162, time=0.235 sec.


[('분식회계', 0.3333333333333333),
 ('분석회계', 0.6666666666666666),
 ('분식회', 1),
 ('부실회계', 1.0)]

## Compare times

In [16]:
import time
from fast_hangle_levenshtein import levenshtein, jamo_levenshtein

query = '분식회계'

# Brute-force baseline: compute the syllable-level distance from the query to
# every noun, without using the inverted index.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system clock
# is adjusted.
start = time.perf_counter()
distance = {word: levenshtein(word, query) for word in noun_scores}
search_time = time.perf_counter() - start
print('search time = {:.2f} sec'.format(search_time))

# Keep only words within distance 1, nearest first. sorted() is stable, so ties
# keep the dictionary's iteration order.
similars = sorted(
    ((word, dist) for word, dist in distance.items() if dist <= 1),
    key=lambda pair: pair[1],
)
print(similars)

search time = 2.27 sec
[('분식회계', 0), ('분식회', 1), ('분식회계설', 1), ('분석회계', 1)]


In [17]:
# Same brute-force scan as the previous cell, now with the jamo-level distance.
# Relies on `time`, `jamo_levenshtein`, `query` and `noun_scores` from earlier cells.
# perf_counter() is used because it is monotonic and meant for interval timing.
start = time.perf_counter()
distance = {word: jamo_levenshtein(word, query) for word in noun_scores}
search_time = time.perf_counter() - start
print('search time = {:.2f} sec'.format(search_time))

# Keep only words within distance 1, nearest first (stable sort keeps tie order).
similars = sorted(
    ((word, dist) for word, dist in distance.items() if dist <= 1),
    key=lambda pair: pair[1],
)
print(similars)

search time = 27.39 sec
[('분식회계', 0), ('분석회계', 0.3333333333333333), ('부실회계', 0.6666666666666666), ('분식회', 1), ('분석체계', 1.0), ('분식회계설', 1)]
