In [1]:
import itertools
import logging

import hangul_romanize
import hangul_romanize.rule
import jamo
import kroman
import networkx
import pandas as pd

from deromanize_hangul import to_jamo_groups, jamo_to_hangul, to_hangul

In [2]:
# Configure the root logger so records of DEBUG level and above reach the cell output.
logging.basicConfig(level=logging.DEBUG)

# Logger named after the running module (recorded traces show it as `__main__`).
_LOG = logging.getLogger(__name__)

In [4]:
# Set differences in both directions between SINGLE_HEAD_JAMOS and SINGLE_TAIL_JAMOS;
# the recorded output shows 'ㅇ' as the only member present in the tail set alone.
# NOTE(review): neither name is bound by any visible cell (the import cell only pulls
# three functions from deromanize_hangul) — presumably a removed cell defined or
# imported them; confirm and import them explicitly so Restart & Run All works.
SINGLE_HEAD_JAMOS - SINGLE_TAIL_JAMOS, SINGLE_TAIL_JAMOS - SINGLE_HEAD_JAMOS

(set(), {'ㅇ'})

In [5]:
# Vowels: each romanized base vowel combined with an optional 'y' or 'w' prefix.

In [6]:
# One row per vowel prefix, one column per base vowel; a romanization that
# ELEMENTS does not know is rendered as ❌.
vowel_grid = [
    [ELEMENTS.get(prefix + base, '❌') for base in VOWEL_BASES]
    for prefix in VOWEL_PREFIXES
]
pd.DataFrame(vowel_grid, index=VOWEL_PREFIXES, columns=VOWEL_BASES)

Unnamed: 0,a,i,u,e,o,eo,eu,ae,oe,ui
,ㅏ,ㅣ,ㅜ,ㅔ,ㅗ,ㅓ,ㅡ,ㅐ,ㅚ,ㅢ
y,ㅑ,❌,ㅠ,ㅖ,ㅛ,ㅕ,❌,ㅒ,❌,❌
w,ㅘ,ㅟ,❌,ㅞ,ㅝ,❌,❌,ㅙ,❌,❌


In [7]:
# Row 1 looks up each consonant as written, row 2 looks it up with the letter
# doubled; a romanization that ELEMENTS does not know is rendered as ❌.
repeat_counts = (1, 2)
doubled_grid = [
    [ELEMENTS.get(base * count, '❌') for base in DOUBLED_CONSONANTS]
    for count in repeat_counts
]
pd.DataFrame(doubled_grid, index=repeat_counts, columns=DOUBLED_CONSONANTS)

Unnamed: 0,g,d,s,j,b
1,ㄱ,ㄷ,ㅅ,ㅈ,ㅂ
2,ㄲ,ㄸ,ㅆ,ㅉ,ㅃ


In [8]:
# Single-row table: the ELEMENTS entry for every remaining consonant, with ❌
# standing in for a romanization that ELEMENTS does not know.
remaining_row = [ELEMENTS.get(base, '❌') for base in REMAINING_CONSONANTS]
pd.DataFrame([remaining_row], index=('',), columns=REMAINING_CONSONANTS)

Unnamed: 0,k,t,ch,h,p,m,ng,r,n
,ㅋ,ㅌ,ㅊ,ㅎ,ㅍ,ㅁ,ㅇ,ㄹ,ㄴ


In [14]:
# Lower the notebook logger's threshold to DEBUG.
# NOTE(review): the traces recorded at the end of the notebook are emitted under the
# `deromanize_hangul` logger, not this one — confirm this call still controls them.
_LOG.setLevel(logging.DEBUG)

In [20]:
# Raise the notebook logger's threshold to INFO so its DEBUG records are dropped.
# NOTE(review): execution counts show this cell ran after the example cells below —
# confirm the intended position for a top-to-bottom run.
_LOG.setLevel(logging.INFO)

In [11]:
# Romanized inputs for the conversion checks, grouped as
# (hyphen-separated spelling, same text with fewer or no hyphens).
# Currently disabled: 'san-sung'/'sansung', 'gat-i'/'gati', 'cha', 'ne, joha-yo.'
example_pairs = (
    ('han-gug-eo', 'hangugeo'),
    ('ye-bbeu-da', 'ye-bbeuda'),
    ('sa-rang', 'sarang'),
    ('sa-rang-ha-da', 'saranghada'),
    ('ji-geum', 'jigeum'),
    ('mu-seun yeong-hwa-reur bur-gga-yo?', 'museun yeonghwareur burgga-yo?'),
    ('bo-da', 'boda'),
)
# Flatten into the flat list that the cells using `examples` iterate over.
examples = list(itertools.chain.from_iterable(example_pairs))

In [17]:
# For every example, print each jamo group composed into Hangul, followed by
# the begin/end values that to_jamo_groups returned alongside it.
for example in examples:
    for jamo_group, begin, end in to_jamo_groups(example):
        composed = jamo_to_hangul(jamo_group)
        print(f'{composed} {begin} {end}')

DEBUG:__main__:[] "" "han-gug-eo"
DEBUG:__main__:[] "ㅎ" "an-gug-eo"
DEBUG:__main__:[] "ㅎㅏ" "n-gug-eo"
DEBUG:__main__:[] "ㅎㅏㄴ" "-gug-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "" "gug-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "ㄱ" "ug-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "ㄱㅜ" "g-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "ㄱㅜㄱ" "-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4), ('ㄱㅜㄱ', 4, 8)] "" "eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4), ('ㄱㅜㄱ', 4, 8)] "ㅓ" ""
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4), ('ㄱㅜㄱ', 4, 8), ('ㅓ', 8, 11)] "" ""
DEBUG:__main__:"" "ㅎㅏㄴ"
DEBUG:__main__:"한" ""
DEBUG:__main__:"" "ㄱㅜㄱ"
DEBUG:__main__:"국" ""
DEBUG:__main__:"" "ㅓ"
DEBUG:__main__:"" "ㅇㅓ"
DEBUG:__main__:"어" ""
DEBUG:__main__:[] "" "hangugeo"
DEBUG:__main__:[] "ㅎ" "angugeo"
DEBUG:__main__:[] "ㅎㅏ" "ngugeo"
DEBUG:__main__:[] "ㅎㅏㅇ" "ugeo"
DEBUG:__main__:[] "ㅎㅏㅇㅜ" "geo"
DEBUG:__main__:[] "ㅎㅏㅇㅜㄱ" "eo"
DEBUG:__main__:[] "ㅎㅏㅇㅜㄱㅓ" ""
DEBUG:__main__:[('ㅎㅏㅇㅜㄱㅓ', 0, 9)] "" ""
DEBUG:__main__:"" "ㅎㅏㅇㅜㄱㅓ"
DEBUG:__main__:"항" "ㅜㄱㅓ"
DEBUG:__main__:"항" "ㅇㅜㄱㅓ"
DEBUG:__main__:"항욱" "ㅓ"

한 0 4
국 4 8
어 8 11
항욱어 0 9
예 0 3
쁘 3 8
다 8 12
예 0 3
쁟아 3 11
사 0 3
랑 3 8
살앙 0 7
사 0 3


DEBUG:__main__:"" "ㅎㅏ"
DEBUG:__main__:"하" ""
DEBUG:__main__:"" "ㄷㅏ"
DEBUG:__main__:"다" ""
DEBUG:__main__:[] "" "saranghada"
DEBUG:__main__:[] "ㅅ" "aranghada"
DEBUG:__main__:[] "ㅅㅏ" "ranghada"
DEBUG:__main__:[] "ㅅㅏㄹ" "anghada"
DEBUG:__main__:[] "ㅅㅏㄹㅏ" "nghada"
DEBUG:__main__:[] "ㅅㅏㄹㅏㅇ" "hada"
DEBUG:__main__:[] "ㅅㅏㄹㅏㅇㅎ" "ada"
DEBUG:__main__:[] "ㅅㅏㄹㅏㅇㅎㅏ" "da"
DEBUG:__main__:[] "ㅅㅏㄹㅏㅇㅎㅏㄷ" "a"
DEBUG:__main__:[] "ㅅㅏㄹㅏㅇㅎㅏㄷㅏ" ""
DEBUG:__main__:[('ㅅㅏㄹㅏㅇㅎㅏㄷㅏ', 0, 12)] "" ""
DEBUG:__main__:"" "ㅅㅏㄹㅏㅇㅎㅏㄷㅏ"
DEBUG:__main__:"살" "ㅏㅇㅎㅏㄷㅏ"
DEBUG:__main__:"살" "ㅇㅏㅇㅎㅏㄷㅏ"
DEBUG:__main__:"살앙" "ㅎㅏㄷㅏ"
DEBUG:__main__:"살앙핟" "ㅏ"
DEBUG:__main__:"살앙핟" "ㅇㅏ"
DEBUG:__main__:"살앙핟아" ""
DEBUG:__main__:[] "" "ji-geum"
DEBUG:__main__:[] "ㅈ" "i-geum"
DEBUG:__main__:[] "ㅈㅣ" "-geum"
DEBUG:__main__:[('ㅈㅣ', 0, 3)] "" "geum"
DEBUG:__main__:[('ㅈㅣ', 0, 3)] "ㄱ" "eum"
DEBUG:__main__:[('ㅈㅣ', 0, 3)] "ㄱㅡ" "m"
DEBUG:__main__:[('ㅈㅣ', 0, 3)] "ㄱㅡㅁ" ""
DEBUG:__main__:[('ㅈㅣ', 0, 3), ('ㄱㅡㅁ', 3, 9)] "" ""
DEBUG:__main__:"" "ㅈㅣ"
DEBUG:__main__:"

랑 3 8
하 8 11
다 11 15
살앙핟아 0 12
지 0 3
금 3 9
직음 0 8
무 0 3
슨 3 7
영 7 13
화 13 17
를 17 21
불 21 25
까 25 29
요 29 31


DEBUG:__main__:[('ㅁㅜㅅㅡㄴ', 0, 6), ('ㅕㅇㅎㅘㄹㅡㄹ', 6, 18)] "ㅂㅜㄹ" "gga-yo?"
DEBUG:__main__:[('ㅁㅜㅅㅡㄴ', 0, 6), ('ㅕㅇㅎㅘㄹㅡㄹ', 6, 18)] "ㅂㅜㄹㄲ" "a-yo?"
DEBUG:__main__:[('ㅁㅜㅅㅡㄴ', 0, 6), ('ㅕㅇㅎㅘㄹㅡㄹ', 6, 18)] "ㅂㅜㄹㄲㅏ" "-yo?"
DEBUG:__main__:[('ㅁㅜㅅㅡㄴ', 0, 6), ('ㅕㅇㅎㅘㄹㅡㄹ', 6, 18), ('ㅂㅜㄹㄲㅏ', 18, 25)] "" "yo?"
DEBUG:__main__:[('ㅁㅜㅅㅡㄴ', 0, 6), ('ㅕㅇㅎㅘㄹㅡㄹ', 6, 18), ('ㅂㅜㄹㄲㅏ', 18, 25)] "ㅛ" "?"
DEBUG:__main__:[('ㅁㅜㅅㅡㄴ', 0, 6), ('ㅕㅇㅎㅘㄹㅡㄹ', 6, 18), ('ㅂㅜㄹㄲㅏ', 18, 25), ('ㅛ', 25, 27)] "" ""
DEBUG:__main__:"" "ㅁㅜㅅㅡㄴ"
DEBUG:__main__:"뭇" "ㅡㄴ"
DEBUG:__main__:"뭇" "ㅇㅡㄴ"
DEBUG:__main__:"뭇은" ""
DEBUG:__main__:"" "ㅕㅇㅎㅘㄹㅡㄹ"
DEBUG:__main__:"" "ㅇㅕㅇㅎㅘㄹㅡㄹ"
DEBUG:__main__:"영" "ㅎㅘㄹㅡㄹ"
DEBUG:__main__:"영활" "ㅡㄹ"
DEBUG:__main__:"영활" "ㅇㅡㄹ"
DEBUG:__main__:"영활을" ""
DEBUG:__main__:"" "ㅂㅜㄹㄲㅏ"
DEBUG:__main__:"불" "ㄲㅏ"
DEBUG:__main__:"불까" ""
DEBUG:__main__:"" "ㅛ"
DEBUG:__main__:"" "ㅇㅛ"
DEBUG:__main__:"요" ""
DEBUG:__main__:[] "" "bo-da"
DEBUG:__main__:[] "ㅂ" "o-da"
DEBUG:__main__:[] "ㅂㅗ" "-da"
DEBUG:__main__:[('ㅂㅗ', 0, 3)] "" "da"
DEBUG:__main__:[('ㅂ

뭇은 0 6
영활을 6 18
불까 18 25
요 25 27
보 0 3
다 3 7
볻아 0 6


In [18]:
# Round-trip check: convert each example to Hangul, romanize it back with
# hangul_romanize's academic rule, and compare with the normalized input.
# NOTE(review): the recorded run fails on the sentence example (output lacks the
# spaces and ends in "yoyo?") — the cause appears to lie in to_hangul; confirm.
transliter = hangul_romanize.core.Transliter(hangul_romanize.rule.academic)
# Spelling substitutions applied, in this order, to each example before comparing.
substitutions = (('-', ''), ('r', 'l'), ('gg', 'kk'), ('bb', 'pp'))
for example in examples:
    hangul = to_hangul(example)
    romanized = transliter.translit(hangul).replace('-', '')
    for old, new in substitutions:
        example = example.replace(old, new)
    assert example == romanized, (example, romanized)

DEBUG:__main__:[] "" "han-gug-eo"
DEBUG:__main__:[] "ㅎ" "an-gug-eo"
DEBUG:__main__:[] "ㅎㅏ" "n-gug-eo"
DEBUG:__main__:[] "ㅎㅏㄴ" "-gug-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "" "gug-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "ㄱ" "ug-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "ㄱㅜ" "g-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4)] "ㄱㅜㄱ" "-eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4), ('ㄱㅜㄱ', 4, 8)] "" "eo"
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4), ('ㄱㅜㄱ', 4, 8)] "ㅓ" ""
DEBUG:__main__:[('ㅎㅏㄴ', 0, 4), ('ㄱㅜㄱ', 4, 8), ('ㅓ', 8, 11)] "" ""
DEBUG:__main__:"han-gug-eo" ㅓ 8:11
DEBUG:__main__:"" "ㅓ"
DEBUG:__main__:"" "ㅇㅓ"
DEBUG:__main__:"어" ""
DEBUG:__main__:"han-gug-어" ㄱㅜㄱ 4:8
DEBUG:__main__:"" "ㄱㅜㄱ"
DEBUG:__main__:"국" ""
DEBUG:__main__:"han-국어" ㅎㅏㄴ 0:4
DEBUG:__main__:"" "ㅎㅏㄴ"
DEBUG:__main__:"한" ""
DEBUG:__main__:"한국어"
DEBUG:__main__:[] "" "hangugeo"
DEBUG:__main__:[] "ㅎ" "angugeo"
DEBUG:__main__:[] "ㅎㅏ" "ngugeo"
DEBUG:__main__:[] "ㅎㅏㅇ" "ugeo"
DEBUG:__main__:[] "ㅎㅏㅇㅜ" "geo"
DEBUG:__main__:[] "ㅎㅏㅇㅜㄱ" "eo"
DEBUG:__main__:[] "ㅎㅏㅇㅜㄱㅓ" ""
DEBUG:__main__:[('

DEBUG:__main__:[('ㅁㅜ', 0, 3), ('ㅅㅡㄴ', 3, 7), ('ㅕㅇ', 7, 13), ('ㅎㅘ', 13, 17), ('ㄹㅡㄹ', 17, 21), ('ㅂㅜㄹ', 21, 25), ('ㄲㅏ', 25, 29), ('ㅛ', 29, 31)] "" ""
DEBUG:__main__:"mu-seun yeong-hwa-reur bur-gga-yo?" ㅛ 29:31
DEBUG:__main__:"" "ㅛ"
DEBUG:__main__:"" "ㅇㅛ"
DEBUG:__main__:"요" ""
DEBUG:__main__:"mu-seun yeong-hwa-reur bur-gg요yo?" ㄲㅏ 25:29
DEBUG:__main__:"" "ㄲㅏ"
DEBUG:__main__:"까" ""
DEBUG:__main__:"mu-seun yeong-hwa-reur bu까요yo?" ㅂㅜㄹ 21:25
DEBUG:__main__:"" "ㅂㅜㄹ"
DEBUG:__main__:"불" ""
DEBUG:__main__:"mu-seun yeong-hwa-reu불까요yo?" ㄹㅡㄹ 17:21
DEBUG:__main__:"" "ㄹㅡㄹ"
DEBUG:__main__:"를" ""
DEBUG:__main__:"mu-seun yeong-hwa를불까요yo?" ㅎㅘ 13:17
DEBUG:__main__:"" "ㅎㅘ"
DEBUG:__main__:"화" ""
DEBUG:__main__:"mu-seun yeong화를불까요yo?" ㅕㅇ 7:13
DEBUG:__main__:"" "ㅕㅇ"
DEBUG:__main__:"" "ㅇㅕㅇ"
DEBUG:__main__:"영" ""
DEBUG:__main__:"mu-seun영화를불까요yo?" ㅅㅡㄴ 3:7
DEBUG:__main__:"" "ㅅㅡㄴ"
DEBUG:__main__:"슨" ""
DEBUG:__main__:"mu-슨영화를불까요yo?" ㅁㅜ 0:3
DEBUG:__main__:"" "ㅁㅜ"
DEBUG:__main__:"무" ""
DEBUG:__main__:"무슨영화를불까요yo?"


AssertionError: ('museun yeonghwaleul bulkkayo?', 'museunyeonghwaleulbulkkayoyo?')

In [27]:
# Decompose a full syllable and a lone compound vowel into jamo; in the recorded
# output the compound vowel stays a single jamo in both cases.
list(jamo.hangul_to_jamo('화')), list(jamo.hangul_to_jamo('ᅪ'))
# , list(jamo.hcj_to_jamo('ㅗ', 'ㅏ'))

(['ᄒ', 'ᅪ'], ['ᅪ'])

In [28]:
# Compose a syllable from ㅎ and the already-combined vowel ᅪ (recorded output: '화').
list(jamo.jamo_to_hangul('ㅎ', 'ᅪ'))

['화']

In [29]:
# Try composing from the vowel's parts ㅗ + ㅏ instead of the combined ᅪ.
# The recorded run raised InvalidJamoError ("Could not synthesize characters to Hangul.").
# NOTE(review): the third argument is presumably treated as the tail jamo rather than
# merged into the vowel — confirm against the jamo documentation.
list(jamo.jamo_to_hangul('ㅎ', 'ㅗ', 'ㅏ'))
# Open question: is ᅪ written below or to the right?
#'ㅎㅗㅏ', '화'

Could not parse jamo: U+0


InvalidJamoError: Could not synthesize characters to Hangul.

In [24]:
# Decompose a syllable into jamo; the recorded output keeps the doubled lead and
# the two-consonant tail as single jamo each.
list(jamo.hangul_to_jamo('뜫'))

['ᄄ', 'ᅳ', 'ᆪ']

In [21]:
# 'w' + 'a' is read as one compound vowel (recorded output: a single syllable).
to_hangul('hwa')

'화'

In [22]:
# 'o' followed by 'a' is not merged into a compound vowel (recorded output: two syllables).
to_hangul('hoa')

'호아'

In [30]:
# Doubled lead consonant plus a two-consonant tail (recorded output: a single syllable).
to_hangul('ddurm')

'뚦'

Testing out some existing romanization packages (`hangul_romanize`, `kroman`, `krt`) for comparison.

In [31]:
# NOTE(review): both imports repeat the ones already in the first cell.
import hangul_romanize
import hangul_romanize.rule

# Romanize with hangul_romanize's academic rule (recorded output: 'salang').
hangul_romanize.core.Transliter(hangul_romanize.rule.academic).translit('사랑')

'salang'

In [13]:
# kroman hyphenates between syllables and writes ㄹ as 'r' here (recorded output: 'sa-rang').
kroman.parse('사랑')

'sa-rang'

In [15]:
# Same letters split differently across syllables; recorded output: 'sal-ang'.
kroman.parse('살앙')

'sal-ang'

In [7]:
# Try the krt package. The recorded run failed with
# AttributeError: 'str' object has no attribute 'decode'.
# NOTE(review): that error suggests krt expects Python 2 byte strings — confirm.
import krt

krt.romanize('사랑')

AttributeError: 'str' object has no attribute 'decode'

In [9]:
# The recorded run failed with NameError: name 'unichr' is not defined.
# NOTE(review): `unichr` is a Python 2 builtin, so krt presumably does not support
# Python 3 — confirm before relying on it.
krt.hangulize('salang', 'utf-8')

NameError: name 'unichr' is not defined

In [36]:
# Non-Korean input: the recorded result is one jamo group covering the whole word.
# NOTE(review): the recorded end offset is 7 although 'world' has 5 characters —
# looks like an offset issue in to_jamo_groups; confirm.
to_jamo_groups('world')

DEBUG:deromanize_hangul:[] "" "world"
DEBUG:deromanize_hangul:[] "ㅝ" "rld"
DEBUG:deromanize_hangul:[] "ㅝㄹ" "ld"
DEBUG:deromanize_hangul:[] "ㅝㄹㄹ" "d"
DEBUG:deromanize_hangul:[] "ㅝㄹㄹㄷ" ""
DEBUG:deromanize_hangul:[('ㅝㄹㄹㄷ', 0, 7)] "" ""


[('ㅝㄹㄹㄷ', 0, 7)]