In [1]:
import soynlp

## 한글 초/중/종성 처리

In [2]:
# Print the Unicode code point of sample Latin letters, Hangul syllables
# and standalone jamo, using ord().
for char in 'azAZ가힣ㄱㄴㅎㅏ':
    print('{} == {}'.format(char, ord(char)))

a == 97
z == 122
A == 65
Z == 90
가 == 44032
힣 == 55203
ㄱ == 12593
ㄴ == 12596
ㅎ == 12622
ㅏ == 12623


In [3]:
# Inverse direction: turn code points back into characters with chr().
for idx in [97, 122, 65, 90, 44032, 55203]:
    print('{} == {}'.format(idx, chr(idx)))

97 == a
122 == z
65 == A
90 == Z
44032 == 가
55203 == 힣


In [4]:
# Code-point bounds of the precomposed Hangul syllables ('가' .. '힣').
kor_begin = ord('가')
kor_end = ord('힣')

# Code-point stride between consecutive initial consonants (21 medials x 28
# finals) and between consecutive medial vowels (28 finals).
chosung_base = 21 * 28
jungsung_base = 28

# Code-point bounds of the standalone consonant jamo ('ㄱ' .. 'ㅎ') and of the
# standalone vowel jamo ('ㅏ' .. 'ㅣ').
jaum_begin = ord('ㄱ')
jaum_end = ord('ㅎ')
moum_begin = ord('ㅏ')
moum_end = ord('ㅣ')

# Initial consonants, in the order used by the syllable code-point layout.
chosung_list = list('ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')

# Medial vowels, in the order used by the syllable code-point layout.
jungsung_list = list('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')

# Final consonants; the leading ' ' (index 0) is the "no final" slot.
jongsung_list = list(' ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')

# Every standalone consonant jamo, in code-point order.
jaum_list = list('ㄱㄲㄳㄴㄵㄶㄷㄸㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅃㅄㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ')

# Every standalone vowel jamo, in code-point order.
moum_list = list('ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ')

def compose(chosung, jungsung, jongsung):
    """Assemble one precomposed Hangul syllable from its three jamo.

    Args:
        chosung: Initial consonant; must be an element of ``chosung_list``.
        jungsung: Medial vowel; must be an element of ``jungsung_list``.
        jongsung: Final consonant; must be an element of ``jongsung_list``
            (``' '`` is the entry for a syllable without a final).

    Returns:
        The single-character string of the composed syllable.

    Raises:
        ValueError: If an argument is not in its list (raised by ``list.index``).
    """
    initial_idx = chosung_list.index(chosung)
    medial_idx = jungsung_list.index(jungsung)
    final_idx = jongsung_list.index(jongsung)
    # Syllables are laid out initial-major, then medial, then final.
    code_point = (kor_begin
                  + chosung_base * initial_idx
                  + jungsung_base * medial_idx
                  + final_idx)
    return chr(code_point)

def decompose(c):
    """Split one Hangul character into (chosung, jungsung, jongsung).

    Args:
        c: A single-character string.

    Returns:
        ``None`` when ``character_is_korean(c)`` is false. For a standalone
        consonant jamo, ``(c, ' ', ' ')``; for a standalone vowel jamo,
        ``(' ', c, ' ')``. Otherwise a 3-tuple of jamo taken from
        ``chosung_list``, ``jungsung_list`` and ``jongsung_list``.
    """
    if not character_is_korean(c):
        return None

    code = ord(c)
    # Standalone jamo fill only their own slot; the others are blanks.
    if jaum_begin <= code <= jaum_end:
        return (c, ' ', ' ')
    if moum_begin <= code <= moum_end:
        return (' ', c, ' ')

    # Peel the syllable offset apart: initial index first, then medial/final.
    cho, remainder = divmod(code - kor_begin, chosung_base)
    jung, jong = divmod(remainder, jungsung_base)
    return (chosung_list[cho], jungsung_list[jung], jongsung_list[jong])

def character_is_korean(c):
    """Return True when ``c`` is a Hangul syllable or a standalone jamo.

    Args:
        c: A single-character string (``ord`` is applied to it).

    Returns:
        bool: True if the code point lies in the syllable range, the
        consonant-jamo range or the vowel-jamo range defined in this notebook.
    """
    code = ord(c)
    hangul_ranges = (
        (kor_begin, kor_end),
        (jaum_begin, jaum_end),
        (moum_begin, moum_end),
    )
    return any(low <= code <= high for low, high in hangul_ranges)

In [5]:
# Split one syllable into its (chosung, jungsung, jongsung) jamo.
decompose('감')

('ㄱ', 'ㅏ', 'ㅁ')

In [6]:
# Build a syllable back from an initial, a medial and a final jamo.
compose('ㄲ', 'ㅜ', 'ㅁ')

'꿈'

In [7]:
from soynlp.hangle import compose
from soynlp.hangle import decompose

In [8]:
# `decompose` now refers to the function imported from soynlp.hangle, which
# replaced the notebook-local definition of the same name.
decompose('꼭')

('ㄲ', 'ㅗ', 'ㄱ')

## Levenshtein

### Basic

In [9]:
def levenshtein(s1, s2, debug=False):
    """Unit-cost edit distance between two sequences.

    Insertion, deletion and substitution each cost 1. The inputs only need
    ``len()`` and iteration, so both strings and lists of tokens work.

    Args:
        s1: First sequence.
        s2: Second sequence.
        debug: When true, print every computed row of the table (without its
            leading boundary cell).

    Returns:
        int: The edit distance.
    """
    # The longer sequence drives the rows; the shorter one sets the row width.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if len(shorter) == 0:
        return len(longer)

    row_above = list(range(len(shorter) + 1))
    for row_no, item_a in enumerate(longer, start=1):
        row = [row_no]
        for col, item_b in enumerate(shorter):
            row.append(min(
                row_above[col + 1] + 1,               # insertion
                row[col] + 1,                         # deletion
                row_above[col] + (item_a != item_b),  # substitution (0 on match)
            ))

        if debug:
            print(row[1:])

        row_above = row

    return row_above[-1]

In [10]:
# Character-level distance; debug=True prints each row of the table.
s1 = '꿈을꾸는아이'
s2 = '아이오아이'
levenshtein(s1, s2, debug=True)

[1, 2, 3, 4, 5]
[2, 2, 3, 4, 5]
[3, 3, 3, 4, 5]
[4, 4, 4, 4, 5]
[4, 5, 5, 4, 5]
[5, 4, 5, 5, 4]


4

In [11]:
# s1 is shorter than s2 here, so levenshtein swaps the arguments before
# filling the table.
s1 = '아이돌'
s2 = '아이오아이'
levenshtein(s1, s2, debug=True)

[0, 1, 2]
[1, 0, 1]
[2, 1, 1]
[3, 2, 2]
[4, 3, 3]


3

In [12]:
# Sentences with spaces: the space is compared like any other character.
s1 = '꿈을 꾸는 아이'
s2 = '아이는 꿈을 꿔요'
levenshtein(s1, s2, debug=True)

[1, 2, 3, 4, 5, 6, 6, 7]
[2, 2, 3, 4, 5, 6, 7, 6]
[3, 3, 3, 4, 4, 5, 6, 7]
[4, 4, 3, 4, 5, 4, 5, 6]
[4, 5, 4, 4, 5, 5, 5, 6]
[5, 4, 5, 5, 5, 6, 6, 6]
[6, 5, 4, 5, 6, 5, 6, 7]
[7, 6, 5, 5, 6, 6, 6, 7]
[8, 7, 6, 6, 6, 7, 7, 7]


7

In [13]:
# Eojeol (space-delimited word) level: compare lists of tokens, not characters
levenshtein(s1.split(), s2.split(), debug=True)

[1, 1, 2]
[2, 2, 2]
[3, 3, 3]


3

### User defined cost

In [14]:
def levenshtein(s1, s2, cost=None, debug=False):
    """Edit distance with optional per-pair substitution costs.

    Insertions and deletions always cost 1. Substituting ``a`` (from ``s1``)
    by ``b`` (from ``s2``) costs ``cost[(a, b)]`` when that key exists, 0 when
    ``a == b``, and 1 otherwise.

    Args:
        s1: First sequence (string or list of tokens).
        s2: Second sequence.
        cost: Optional dict mapping ``(item_of_s1, item_of_s2)`` to a
            substitution cost. ``None`` means unit cost everywhere.
        debug: When true, print every computed row of the table (without its
            leading boundary cell).

    Returns:
        The edit distance; an int unless a non-integer cost was applied.
    """
    # Keep the longer sequence as the row driver. The swap is done in place
    # instead of through a recursive call, which used to drop `cost`; the
    # flag lets substitution_cost keep the caller's (s1, s2) key orientation.
    swapped = len(s1) < len(s2)
    if swapped:
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    if cost is None:
        cost = {}

    def substitution_cost(c1, c2):
        if c1 == c2:
            return 0
        key = (c2, c1) if swapped else (c1, c2)
        return cost.get(key, 1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # Only this term differs from the unit-cost version.
            substitutions = previous_row[j] + substitution_cost(c1, c2)
            current_row.append(min(insertions, deletions, substitutions))

        if debug:
            print(current_row[1:])

        previous_row = current_row

    return previous_row[-1]

In [15]:
# No cost table is passed, so every substitution costs 1.
s1 = '아이쿠야'
s2 = '아이쿵야'
levenshtein(s1, s2, debug=True)

[0, 1, 2, 3]
[1, 0, 1, 2]
[2, 1, 1, 2]
[3, 2, 2, 1]


1

In [16]:
# Make substituting '쿠' by '쿵' cheap (0.1 instead of 1).
cost = {('쿠', '쿵'):0.1}
s1 = '아이쿠야'
s2 = '아이쿵야'
levenshtein(s1, s2, cost, debug=True)

[0, 1, 2, 3]
[1, 0, 1, 2]
[2, 1, 0.1, 1.1]
[3, 2, 1.1, 0.1]


0.1

### Jamo levenshtein

In [17]:
def jamo_levenshtein(s1, s2, debug=False):
    """Edit distance where substituting a syllable is priced by its jamo.

    Insertions and deletions cost 1. Substituting one character by a different
    one costs the edit distance between their ``decompose`` results divided by
    3, so syllables differing in one jamo slot cost 1/3.

    Args:
        s1: First string.
        s2: Second string.
        debug: When true, print every computed row (values formatted with
            three decimals, without the leading boundary cell).

    Returns:
        The distance; an int, or a float when a fractional cost was applied.
    """
    if len(s1) < len(s2):
        return jamo_levenshtein(s2, s1, debug)

    if len(s2) == 0:
        return len(s1)

    def substitution_cost(c1, c2):
        if c1 == c2:
            return 0
        jamo1 = decompose(c1)
        jamo2 = decompose(c2)
        # decompose yields None for a character it does not treat as Hangul
        # (space, digit, Latin letter, ...). Passing None on would fail at
        # len(None), so such a pair is charged one full substitution.
        if jamo1 is None or jamo2 is None:
            return 1
        return levenshtein(jamo1, jamo2)/3

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # Only this term differs from the plain Levenshtein recurrence.
            substitutions = previous_row[j] + substitution_cost(c1, c2)
            current_row.append(min(insertions, deletions, substitutions))

        if debug:
            print(['%.3f'%v for v in current_row[1:]])

        previous_row = current_row

    return previous_row[-1]

In [18]:
# '쿠' and '쿵' differ only in the final-consonant slot, so the substitution
# is charged 1/3 instead of 1.
s1 = '아이쿠야'
s2 = '아이쿵야'
jamo_levenshtein(s1, s2, debug=True)

['0.000', '1.000', '2.000', '3.000']
['1.000', '0.000', '1.000', '2.000']
['2.000', '1.000', '0.333', '1.333']
['3.000', '2.000', '1.333', '0.333']


0.3333333333333333

In [19]:
# Inputs of different length: s1 is the shorter one, so jamo_levenshtein
# swaps the arguments before filling the table.
s1 = '아이쿵야'
s2 = '훍앜이쿠야'
jamo_levenshtein(s1, s2, debug=True)

['1.000', '2.000', '2.667', '3.667']
['1.333', '1.667', '2.667', '3.333']
['2.333', '1.333', '2.333', '3.000']
['3.333', '2.333', '1.667', '2.667']
['4.333', '3.333', '2.667', '1.667']


1.6666666666666665

In [20]:
# Cross-check: flatten every syllable into its three jamo slots and run
# levenshtein on the flattened strings, then scale by 1/3.
s1 = '아이쿠야'
s2 = '아이쿵야'

# Each syllable contributes exactly three characters (a blank marks an empty slot).
s1_ = ''.join([comp for c in s1 for comp in decompose(c)])
s2_ = ''.join([comp for c in s2 for comp in decompose(c)])

print(s1_)
print(s2_)
print(levenshtein(s1_, s2_)/3)

ㅇㅏ ㅇㅣ ㅋㅜ ㅇㅑ 
ㅇㅏ ㅇㅣ ㅋㅜㅇㅇㅑ 
0.3333333333333333


In [21]:
# Same flattened-jamo cross-check for inputs of different length.
s1 = '아이쿵야'
s2 = '훍앜이쿠야'

# Each syllable contributes exactly three characters (a blank marks an empty slot).
s1_ = ''.join([comp for c in s1 for comp in decompose(c)])
s2_ = ''.join([comp for c in s2 for comp in decompose(c)])

print(s1_)
print(s2_)
print(levenshtein(s1_, s2_)/3)

ㅇㅏ ㅇㅣ ㅋㅜㅇㅇㅑ 
ㅎㅜㄺㅇㅏㅋㅇㅣ ㅋㅜ ㅇㅑ 
1.6666666666666667


In [22]:
from soynlp.hangle import levenshtein
from soynlp.hangle import jamo_levenshtein

In [23]:
# Same inputs, now through the levenshtein / jamo_levenshtein functions
# imported from soynlp.hangle.
s1 = '아이쿠야'
s2 = '아이쿵야'

print(levenshtein(s1, s2))
print(jamo_levenshtein(s1, s2))

1
0.3333333333333333
