In [1]:
import re

# Characters accepted by is_korean: Hangul compatibility jamo (ㄱ-ㅎ, ㅏ-ㅣ),
# precomposed Hangul syllables (가-힣), and ASCII digits.
korean_pattern = re.compile('[ㄱ-ㅎㅏ-ㅣ가-힣0-9]+')


def is_korean(s):
    """Return True when ``s`` consists solely of Hangul characters and digits.

    The whole string must match: ``fullmatch`` is used rather than ``match``,
    because ``match`` only anchors at the start and would accept strings such
    as ``'주)'`` or ``'한글abc'`` that merely *begin* with a Korean character.

    Args:
        s: The word to test.

    Returns:
        bool: True if ``s`` is non-empty and every character is in the
        accepted set; False otherwise (including for the empty string).
        Note that digit-only strings such as ``'123'`` are accepted.
    """
    return korean_pattern.fullmatch(s) is not None


# Quick sanity check on a few representative words.
for word in '아하 ㅋ ㅠ 알랄ㄹ랴 1호점 a (주)'.split():
    print(word, is_korean(word))

아하 True
ㅋ True
ㅠ True
알랄ㄹ랴 True
1호점 True
a False
(주) False


In [2]:
from collections import defaultdict

# Nested counter: posdic[pos][word] -> number of times `word` was seen with
# tag `pos` in the tagged corpus file.
posdic = defaultdict(lambda: defaultdict(int))
with open('../data/processed/tagger/fulltag.txt', encoding='utf-8') as f:
    for line in f:
        for token in line.split():
            parts = token.split('/')
            # Keep only tokens of the exact form "word/POS"; tokens with no
            # slash or with more than one slash are skipped.
            if len(parts) != 2:
                continue
            word, pos = parts
            if is_korean(word):
                posdic[pos][word] += 1
# NOTE: the former bare `except: continue` was removed on purpose. Nothing in
# the loop raises for malformed input (bad tokens are filtered above), so the
# handler could only hide real faults -- e.g. a NameError when `is_korean` is
# not defined yet, which silently produced an empty `posdic` -- and it also
# swallowed KeyboardInterrupt.

In [3]:
# Report the number of distinct words per tag, largest vocabulary first.
# (A reversed sort is still stable, so ties keep their original order.)
for pos, words in sorted(posdic.items(), key=lambda item: len(item[1]), reverse=True):
    print('%s has %d words' % (pos, len(words)))

NNG has 90250 words
NNP has 59915 words
VV has 6600 words
MAG has 6312 words
XR has 2372 words
EF has 1650 words
VA has 1612 words
EC has 1484 words
IC has 1197 words
NNB has 553 words
NR has 421 words
NP has 277 words
MAJ has 258 words
MM has 228 words
ETM has 145 words
JX has 136 words
VX has 125 words
XSN has 124 words
JKB has 121 words
XPN has 77 words
EP has 74 words
JC has 51 words
XSV has 24 words
JKS has 20 words
XSA has 20 words
JKV has 16 words
ETN has 16 words
VCP has 13 words
JKG has 10 words
JKQ has 10 words
VCN has 9 words
JKO has 8 words
JKC has 5 words
NG has 3 words
NNG" has 3 words
NNGG has 2 words
VA" has 1 words
MAC has 1 words
VSV has 1 words
VX" has 1 words
V has 1 words
MAAG has 1 words
JKBB has 1 words
NN has 1 words
JG has 1 words
JKSS has 1 words
NNg has 1 words
MMM has 1 words
EEC has 1 words


In [4]:
# Tags to export. Only the tags listed here get a dictionary file; any other
# key present in `posdic` is ignored by the loop below.
tagset = [
    'NNG', 'NNP', 'VV', 'MAG', 'XR',
    'EF', 'EC', 'VA', 'IC', 'NNB',
    'NR', 'NP', 'MAJ', 'MM', 'ETM',
    'JX', 'VX', 'JKB', 'XSN', 'XPN',
    'EP', 'JC', 'XSV', 'XSA', 'JKS',
    'ETN', 'JKV', 'VCP', 'JKO', 'JKQ',
    'JKG', 'VCN', 'JKC'
]

# Write one file per tag, each holding "<word> <count>" lines sorted by word.
for pos in tagset:
    with open('../dictionary/sejong/{}.txt'.format(pos), 'w', encoding='utf-8') as f:
        f.writelines(
            '{} {}\n'.format(word, count)
            for word, count in sorted(posdic[pos].items())
        )

In [5]:
# Lookup table from each fine-grained tag in `tagset` to the coarse category
# used for the simplified dictionaries.
# NOTE(review): the keys look like Sejong-corpus POS tags (the per-tag files
# are written under dictionary/sejong) -- confirm against the tagset spec.
mapper = dict(
    NNG='Noun',
    NNP='Noun',
    VV='Verb',
    MAG='Adverb',
    XR='Noun',
    EF='Eomi',
    EC='Eomi',
    VA='Adjective',
    IC='Exclamation',
    NNB='Noun',
    NR='Noun',
    NP='Noun',
    MAJ='Adverb',
    MM='Determiner',
    ETM='Eomi',
    JX='Josa',
    VX='Adjective',
    JKB='Josa',
    XSN='Eomi',
    XPN='Determiner',
    EP='Eomi',
    JC='Josa',
    XSV='Noun',
    XSA='Noun',
    JKS='Josa',
    ETN='Eomi',
    JKV='Josa',
    VCP='Noun',
    JKO='Josa',
    JKQ='Josa',
    JKG='Josa',
    VCN='Noun',
    JKC='Josa',
)

# Merge the per-tag counts into per-category counts: words that share a
# coarse category under several fine-grained tags have their counts summed.
posdic_ = defaultdict(lambda: defaultdict(int))
for pos in tagset:
    pos_ = mapper[pos]
    for word, count in posdic[pos].items():
        posdic_[pos_][word] = posdic_[pos_][word] + count

# Write one file per coarse category, "<word> <count>" lines sorted by word.
for pos, word_counts in posdic_.items():
    with open('../dictionary/simplified/{}.txt'.format(pos), 'w', encoding='utf-8') as f:
        f.writelines(
            '{} {}\n'.format(word, count)
            for word, count in sorted(word_counts.items())
        )