세종말뭉치의 통계를 계산하기 위하여 만든 Jupyter notebook 파일입니다. 

In [1]:
from glob import glob
fnames = glob('../data/processed/cd1/02_말뭉치/현대/*/*/형태분석_말뭉치/*.txt')
len(fnames)

479

In [2]:
class Documents:
    def __init__(self, fnames, yield_text=True, yield_tag=True):
        self.fnames = fnames
        self.yield_text = yield_text
        self.yield_tag = yield_tag
    def __iter__(self):
        if self.yield_text or self.yield_tag:
            for fname in self.fnames:
                with open(fname, encoding='utf-8') as f:
                    for row in f:
                        sent, tag = row.strip().split('\t')
                        if self.yield_text and self.yield_tag:
                            yield (sent.split(), tag.split())
                        elif self.yield_text:
                            yield sent.split()
                        elif self.yield_tag:
                            yield tag.split()
                            
docs = Documents(fnames, yield_text=True, yield_tag=True)
for i, (words, tags) in enumerate(docs):
    if i > 3: break
    print(words, tags)

['뭐', '타고', '가?'] ['뭐/NP', '타/VV+고/EC', '가/VV+ㅏ/EF+?/SF']
['지하철.'] ['지하철/NNG+./SF']
['기차?'] ['기차/NNG+?/SF']
['아침에', '몇', '시에', '타고', '가는데?'] ['아침/NNG+에/JKB', '몇/MM', '시/NNB+에/JKB', '타/VV+고/EC', '가/VV+는데/EF+?/SF']


### number of sentences

In [7]:
fnames = glob('../data/processed/cd1/02_말뭉치/현대/구어/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

docs = Documents(fnames, yield_text=True, yield_tag=False)
for n_sent, _ in enumerate(docs):
    continue
print('num sents = %d' % (n_sent+1))

num files = 200
num sents = 216723


In [8]:
fnames = glob('../data/processed/cd1/02_말뭉치/현대/문어/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

docs = Documents(fnames, yield_text=True, yield_tag=False)
for n_sent, _ in enumerate(docs):
    continue
print('num sents = %d' % (n_sent+1))

num files = 279
num sents = 837843


### number of eojeol

In [15]:
fnames = glob('../data/processed/cd1/02_말뭉치/현대/*/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

from collections import Counter
docs = Documents(fnames, yield_text=True, yield_tag=False)
counter = Counter((eojeol for sent in docs for eojeol in sent))
        
print('num eojeols = %d' % (sum(counter.values())))
print('unique num eojeols = %d' % (len(counter)))

print('Top 50 eojeols')

from pprint import pprint
pprint(sorted(counter.items(), key=lambda x:x[1], reverse=True)[:50])

num eojeols = 10807777
unique num eojeols = 1560437
Top 50 eojeols
[('그', 81457),
 ('수', 68306),
 ('있다.', 56845),
 ('있는', 55802),
 ('이', 47680),
 ('한', 39459),
 ('것이다.', 37810),
 ('하는', 25381),
 ('것은', 24816),
 ('것이', 24642),
 ('그러나', 24511),
 ('할', 22909),
 ('나는', 22162),
 ('한다.', 21743),
 ('같은', 21198),
 ('있었다.', 20835),
 ('대한', 20529),
 ('또', 19782),
 ('그런', 19575),
 ('그리고', 18481),
 ('안', 17729),
 ('더', 17374),
 ('것을', 16830),
 ('우리', 16461),
 ('하고', 16312),
 ('그는', 15906),
 ('다른', 15238),
 ('때', 14866),
 ('없는', 14781),
 ('했다.', 14724),
 ('등', 14473),
 ('이런', 14375),
 ('두', 13645),
 ('내가', 13533),
 ('것', 13433),
 ('잘', 13230),
 ('다시', 13112),
 ('어떤', 12771),
 ('때문에', 12745),
 ('내', 12236),
 ('다', 11823),
 ('그의', 11120),
 ('것으로', 11046),
 ('없다.', 10544),
 ('이렇게', 10499),
 ('게', 10452),
 ('위해', 10374),
 ('가장', 10309),
 ('아니라', 10125),
 ('있을', 9922)]


### number of (word,tag) pair

In [5]:
fnames = glob('../data/processed/cd1/02_말뭉치/현대/*/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

from collections import Counter
docs = Documents(fnames, yield_text=False, yield_tag=True)
counter_wordtag = Counter((w for sent in docs for eojeol in sent for w in eojeol.split('+')))
        
print('num (word,tag) = %d' % (sum(counter_wordtag.values())))
print('unique (word,tag) = %d' % (len(counter_wordtag)))

print('Top 50 (word,tag)')

from pprint import pprint
pprint(sorted(counter_wordtag.items(), key=lambda x:x[1], reverse=True)[:50])

num files = 479
num (word,tag) = 24464414
unique (word,tag) = 233023
Top 50 (word,tag)
[('./SF', 844440),
 ('의/JKG', 520895),
 ('을/JKO', 518468),
 ('ᆫ/ETM', 504474),
 ('다/EF', 488114),
 ('하/XSV', 458052),
 ('이/VCP', 450704),
 (',/SP', 447595),
 ('에/JKB', 416730),
 ('이/JKS', 368869),
 ('는/ETM', 355747),
 ('고/EC', 334472),
 ('는/JX', 331517),
 ('를/JKO', 299948),
 ('었/EP', 262254),
 ('은/JX', 260232),
 ('가/JKS', 245540),
 ('았/EP', 219846),
 ('하/XSA', 213519),
 ('것/NNB', 213480),
 ('어/EC', 212189),
 ('아/EC', 197251),
 ('도/JX', 176239),
 ('ᆯ/ETM', 172974),
 ('들/XSN', 172217),
 ('"/SS', 168212),
 ('하/VV', 161492),
 ('으로/JKB', 155012),
 ('에서/JKB', 145061),
 ("'/SS", 130833),
 ('게/EC', 129173),
 ('있/VV', 129074),
 ('있/VX', 127462),
 ('적/XSN', 126644),
 ('로/JKB', 113025),
 ('되/XSV', 106876),
 ('은/ETM', 102896),
 ('지/EC', 99173),
 ('ᆫ다/EF', 97583),
 ('기/ETN', 96926),
 (')/SS', 96347),
 ('(/SS', 94872),
 ('수/NNB', 87306),
 ('되/VV', 85798),
 ('그/MM', 85468),
 ('하/VX', 84812),
 ('없/VA', 71620),
 ('?/

### write [token, wordtag, frequency] table

In [19]:
from collections import Counter
def to_eojeol_wordtag(sent, tags):
    return [(word, tag) for word, tag in zip(sent, tags)]
docs = Documents(fnames, yield_text=True, yield_tag=True)
counter_pair = Counter((pair for sent, tags in docs for pair in to_eojeol_wordtag(sent,tags)))
print('unique (eojeol,word,tag) = %d' % (len(counter_pair)))


unique (eojeol,word,tag) = 1642217


In [22]:
with open('../data/processed/lr/tokentable.txt', 'w', encoding='utf-8') as f:
    for (eojeol, wordtag), freq in sorted(counter_pair.items(), key=lambda x:x[1], reverse=True):
        f.write('%s\t%s\t%d\n' % (eojeol, wordtag, freq))

## Character statistics

In [66]:
docs = Documents(fnames, yield_text=True, yield_tag=False)
character_counter = Counter((char for doc in docs for eojeol in doc for char in eojeol))
character_counter = sorted(character_counter.items(), key=lambda x:x[1], reverse=True)

In [71]:
hangle_counter = [(c, f) for c, f in character_counter if 44032 <= ord(c) <= 55203]
hangle_sum = sum((f for _, f in hangle_counter))

print('n character= %d' % len(character_counter))
print('n hangle= %d\n\n' % len(hangle_counter))

cum_ = 0
for i, (c, f) in enumerate(hangle_counter[:500]):
    cum_ += f
    if i % 4 == 3:
        print('%s (%d, %.2f %s)' % (c, f, 100*cum_/hangle_sum, '%') ,end='\n')
    else:
        print('%s (%d, %.2f %s)' % (c, f, 100*cum_/hangle_sum, '%') ,end=', ')

n character= 6624
n hangle= 2317


이 (1167221, 3.71 %), 다 (977169, 6.81 %), 는 (850962, 9.52 %), 에 (621290, 11.49 %)
의 (609623, 13.43 %), 을 (582238, 15.28 %), 가 (551022, 17.03 %), 고 (548996, 18.77 %)
하 (511458, 20.40 %), 지 (481319, 21.92 %), 그 (415148, 23.24 %), 한 (404041, 24.53 %)
로 (402448, 25.81 %), 서 (389041, 27.04 %), 은 (387046, 28.27 %), 어 (364381, 29.43 %)
기 (356798, 30.56 %), 나 (323632, 31.59 %), 도 (322734, 32.62 %), 아 (306072, 33.59 %)
를 (304788, 34.56 %), 사 (300428, 35.51 %), 리 (278356, 36.39 %), 있 (271760, 37.26 %)
들 (257388, 38.08 %), 자 (257294, 38.89 %), 대 (237144, 39.65 %), 으 (233996, 40.39 %)
인 (227498, 41.11 %), 것 (227277, 41.83 %), 시 (219237, 42.53 %), 라 (213233, 43.21 %)
게 (207331, 43.87 %), 해 (204475, 44.52 %), 수 (202083, 45.16 %), 니 (183153, 45.74 %)
정 (175165, 46.30 %), 보 (168677, 46.83 %), 만 (167919, 47.37 %), 적 (167133, 47.90 %)
일 (164002, 48.42 %), 면 (161772, 48.93 %), 었 (157697, 49.43 %), 과 (156109, 49.93 %)
부 (151394, 50.41 %), 제 (150376, 50.89 %), 주 (147128, 5

## (word, tag) statistics

In [129]:
def is_tag(tag):
    for t in tag:
        if not (65 <= ord(t) <= 90):
            return False
    return True

docs = Documents(fnames, yield_text=False, yield_tag=True)
wordtag_counter = Counter((wt for doc in docs for eojeol in doc for wt in eojeol.split('+')))
print('n (word/tag) = %d' % len(wordtag_counter))

tag_counter = Counter((wt.split('/')[-1] for doc in docs for eojeol in doc for wt in eojeol.split('+')))
print('n tag = %d' % len({tag for tag in tag_counter if is_tag(tag)}))

tag_l0_counter = Counter((wt.split('/')[-1][0] for doc in docs for eojeol in doc for wt in eojeol.split('+') if wt.split('/')[0]))
print('n tag (level 0) = %d' % len({tag for tag in tag_l0_counter if is_tag(tag)}))

n (word/tag) = 233023
n tag = 60
n tag (level 0) = 9


In [127]:
sum_ = sum(tag_counter.values())
for tag, freq in sorted(tag_counter.items(), key=lambda x:x[1], reverse=True):
    if not is_tag(tag):
        continue
    print('%s: (%d, %.3f %s)' % (tag, freq, 100*freq/sum_, '%'))

NNG: (5423295, 22.168 %)
VV: (1847882, 7.553 %)
EC: (1774216, 7.252 %)
ETM: (1356148, 5.543 %)
JKB: (1023071, 4.182 %)
JX: (953858, 3.899 %)
SF: (930944, 3.805 %)
EF: (869235, 3.553 %)
JKO: (830502, 3.395 %)
NNB: (751990, 3.074 %)
MAG: (713017, 2.915 %)
JKS: (623600, 2.549 %)
XSV: (582283, 2.380 %)
SS: (569874, 2.329 %)
EP: (558982, 2.285 %)
JKG: (521486, 2.132 %)
VX: (514576, 2.103 %)
SP: (494628, 2.022 %)
NNP: (469068, 1.917 %)
VCP: (467462, 1.911 %)
XSN: (439658, 1.797 %)
VA: (429078, 1.754 %)
NP: (386861, 1.581 %)
MM: (346899, 1.418 %)
SN: (253364, 1.036 %)
XSA: (230358, 0.942 %)
JC: (149972, 0.613 %)
XR: (133908, 0.547 %)
ETN: (129824, 0.531 %)
MAJ: (117480, 0.480 %)
NR: (91895, 0.376 %)
IC: (82047, 0.335 %)
SL: (73109, 0.299 %)
SH: (59171, 0.242 %)
JKC: (58901, 0.241 %)
VCN: (49741, 0.203 %)
XPN: (44791, 0.183 %)
SW: (32540, 0.133 %)
SE: (27716, 0.113 %)
JKQ: (18922, 0.077 %)
SO: (7423, 0.030 %)
UNC: (6506, 0.027 %)
UNA: (6361, 0.026 %)
UNT: (5768, 0.024 %)
JKV: (3920, 0.016 %)
N

In [130]:
sum_ = sum(tag_l0_counter.values())
for tag, freq in sorted(tag_l0_counter.items(), key=lambda x:x[1], reverse=True):
    if not is_tag(tag):
        continue
    print('%s: (%d, %.3f)' % (tag, freq, 100*freq/sum_))

N: (7124644, 29.128)
E: (4688406, 19.168)
J: (4184235, 17.107)
V: (3308743, 13.527)
S: (2444442, 9.994)
X: (1430998, 5.850)
M: (1177399, 4.814)
I: (82047, 0.335)
U: (18635, 0.076)


## Noun length statistics

In [124]:
def is_noun(pair):
    if ('NG' in pair) or ('NN' in pair) or ('NP' in pair) or ('NR' in pair):
        return True
    return False

noun_counter = {pair:freq for pair, freq in wordtag_counter.items() if is_noun(pair)}

noun_length_counter = {}
for pair, freq in noun_counter.items():
    n = len(pair.split('/')[0])
    noun_length_counter[n] = noun_length_counter.get(n, 0) + freq

sum_ = sum(noun_length_counter.values())
for n, freq in sorted(noun_length_counter.items()):
    print('length = %d (%d, %.3f %s)' % (n, freq, 100*freq/sum_, '%'))
    if n == 11: break

print('\n\nnoun (length is 11)\n')
for pair, freq in noun_counter.items():
    n = len(pair.split('/')[0])
    if n == 11:
        print(pair)

length = 1 (1647186, 23.118 %)
length = 2 (4491998, 63.044 %)
length = 3 (763624, 10.717 %)
length = 4 (171364, 2.405 %)
length = 5 (32734, 0.459 %)
length = 6 (10796, 0.152 %)
length = 7 (4715, 0.066 %)
length = 8 (1641, 0.023 %)
length = 9 (608, 0.009 %)
length = 10 (275, 0.004 %)
length = 11 (138, 0.002 %)


noun (length is 11)

이십일세기세종전자사전/NNP
한국여성경영자총연합회/NNP
한국대학로켓연구연합회/NNP
전국한약관련학과협의회/NNP
MCI커뮤니케이션즈사/NNP
한국여성중소기업인협회/NNP
남북인간띠잇기대회본부/NNP
유니벌씨티오브알라바마/NNP
부산부일외국어고등학교/NNP
대한무역투자진흥공공사/NNP
다임러크라이슬러코리아/NNP
전남동부지역사회연구소/NNP
한국한국가정법률상담소/NNP
런던스쿨오브이코노믹스/NNP
오프토테크놀러지컴퍼니/NNP
니꼴스끄­우쑤리이스끄/NNP
서울지역신문노조협의회/NNP
조선문화건설중앙협의회/NNP
영월댐백지화투쟁위원회/NNP
슬로베니아민주야당연합/NNP
한국방송프로듀서연합회/NNP
전민족유일당조직협의회/NNP
전국신문통신노조협의회/NNP
한국교정교화사업연구소/NNP
한국민속종합조사보고서/NNP
지장보살점찰선악업보경/NNP
황룡사9층목탑찰주본기/NNP
한국자동차공업협동조합/NNP
중화민국출판도서목록회/NNP
서울초등교육정책연구회/NNP
전국여성자원봉사자대회/NNP
대구시립소년소녀합창단/NNP
사우스차이나모닝포스트/NNP
네바다라스베이거스대학/NNP
한국신문방송편집인협회/NNP
한국정보통신기술사협회/NNP
국제자유도시추진기획단/NNP
한국근대문예비평사연구/NNP
한국바이오벤처기업협회/NNP
밀양댐맑은물대책위원회/NNP
한국능률협회매니지먼트/NNP
한국광복운동단체연합

## compound noun as noun

In [131]:
def parse_compound_nouns(eojeol):
    eojeol = [wt.split('/') for wt in eojeol.split('+') if len(wt.split('/')) == 2]
    nouns = ''
    for i, (wi, ti) in enumerate(eojeol):
        if ti[:2] == 'NN' or ti[:2] == 'NP' or ti[:2] == 'NG' or ti[:2] == 'NR' or ti == 'XSN':
            nouns += wi
        else:
            break
    return nouns

# docs = Documents(fnames, yield_text=False, yield_tag=True)
# eojeol_counter = Counter((eojeol for sent in docs for eojeol in sent))
compound_nouns = {}

for i, (eojeol, freq) in enumerate(eojeol_counter.items()):
    noun = parse_compound_nouns(eojeol)
    if not noun: continue
    compound_nouns[noun] = compound_nouns.get(noun, 0) + freq
print('n noun + compound= %d' % len(compound_nouns))

n noun + compound= 266732


In [132]:
noun_length_counter = {}
for noun, freq in compound_nouns.items():
    n = len(noun)
    noun_length_counter[n] = noun_length_counter.get(n, 0) + freq

sum_ = sum(noun_length_counter.values())
for n, freq in sorted(noun_length_counter.items()):
    print('length = %d (%d, %.3f %s)' % (n, freq, 100*freq/sum_, '%'))
#     if n == 11: break

length = 1 (1263396, 20.172 %)
length = 2 (3522828, 56.247 %)
length = 3 (973362, 15.541 %)
length = 4 (375754, 5.999 %)
length = 5 (80711, 1.289 %)
length = 6 (31213, 0.498 %)
length = 7 (9749, 0.156 %)
length = 8 (3955, 0.063 %)
length = 9 (1281, 0.020 %)
length = 10 (557, 0.009 %)
length = 11 (222, 0.004 %)
length = 12 (74, 0.001 %)
length = 13 (23, 0.000 %)
length = 14 (10, 0.000 %)
length = 15 (7, 0.000 %)
length = 16 (4, 0.000 %)
length = 17 (3, 0.000 %)
length = 22 (1, 0.000 %)


In [116]:
for noun, freq in compound_nouns.items():
    n = len(noun)
    if n >= 14:
        print(noun)


한국사교양국사연구회청아출판사
소니컴퓨터엔터테인먼트코리아
에스에스더불유오에이아이케이아이
민속교육자료집봉천놀이마당우리교육
꼬사끄(Cossack)인들
전국남녀중고등학교대항테니스대회
파라다이스여자인비테이셔널골프대회
발레오만도전장시스템스코리아
사구체요세관집합관신우요관방광요도
서울경인사무서비스직노동조합
월드어소시에이션어브엔지오스
서울YMCA시청자시민운동본부
역사민속학연구한국역사민속학회
한국음식료품도매업협동조합연합회
멘로이그라흐트(Menleugracht)법
직원조회사항학급일지기록훈련
서울국제만화애니메이션페스티벌
서울노동자문화예술단체협의회
세계혼합단체배드민턴선수권대회
시아본사아아서어어가아모니이이불
서울국제아동청소년공연예술제
부산아시아경기대회조직위원회


## Verb and Adjective statistics

In [125]:
def is_verb_or_adjective(pair):
    if ('VV' in pair) or ('VA' in pair):
        return True
    return False

v_counter = {pair:freq for pair, freq in wordtag_counter.items() if is_verb_or_adjective(pair)}

v_length_counter = {}
for pair, freq in v_counter.items():
    n = len(pair.split('/')[0])
    v_length_counter[n] = v_length_counter.get(n, 0) + freq

sum_ = sum(v_length_counter.values())
for n, freq in sorted(v_length_counter.items()):
    print('length = %d (%d, %.3f %s)' % (n, freq, 100*freq/sum_, '%'))

print('\n\nverb (length is 8)\n')
for pair, freq in v_counter.items():
    n = len(pair.split('/')[0])
    if n == 8:
        print(pair)

length = 1 (1169712, 51.371 %)
length = 2 (803850, 35.303 %)
length = 3 (262088, 11.510 %)
length = 4 (38221, 1.679 %)
length = 5 (3006, 0.132 %)
length = 6 (121, 0.005 %)
length = 7 (2, 0.000 %)
length = 8 (7, 0.000 %)


verb (length is 8)

조물락조물락거리/VV
오무락조무락거리/VV
수군덕수군덕거리/VV
허우적허우적거리/VV
바스락바스락거리/VV
으르렁으르렁거리/VV
엎치락뒤치락거리/VV


## tokentable line grep

Jupyter notebook 에서 query가 들어있는 단어를 읽을 수 있도록 likesearch를 만들어 두었습니다. 

In [9]:
def likesearch(query, topk=100):
    n_count = 0
    with open('../data/processed/lr/tokentable.txt', encoding='utf-8') as f:
        for row in f:
            if n_count >= topk: break
            if query in row:
                n_count += 1
                print(row.strip())

In [62]:
likesearch('정말', topk=30)

정말	정말/MAG	4039
정말로	정말로/MAG	514
"정말	"/SS+정말/MAG	204
정말,	정말/MAG+,/SP	123
정말이지	정말/NNG+이/VCP+지/EC	85
정말?	정말/MAG+?/SF	76
정말.	정말/MAG+./SF	47
정::말	정말/MAG	20
정말이지	정말/MAG+이/VCP+지/EC	20
정말요?	정말/MAG+요/JX+?/SF	16
정말로,	정말로/MAG+,/SP	15
'정말	'/SS+정말/MAG	15
"정말?"	"/SS+정말/MAG+?/SF+"/SS	12
"정말,	"/SS+정말/MAG+,/SP	11
"정말로	"/SS+정말로/MAG	10
"정말?	"/SS+정말/MAG+?/SF	10
"정말입니다.	"/SS+정말/NNG+이/VCP+ᄇ니다/EF+./SF	8
정말이야.	정말/NNG+이/VCP+야/EF+./SF	7
정말<phon>증말</phon>	정말/MAG	7
"정말이지	"/SS+정말/NNG+이/VCP+지/EC	7
정말은	정말/NNG+은/JX	6
정말이야,	정말/NNG+이/VCP+야/EC+,/SP	6
정말::	정말/MAG	6
정말."	정말/MAG+./SF+"/SS	6
정말이냐!	정말/NNG+이/VCP+냐/EF+!/SF	6
정말!"	정말/MAG+!/SF+"/SS	6
정말!	정말/MAG+!/SF	5
정말이에요.	정말/NNG+이/VCP+에요/EF+./SF	5
정말이지,	정말/NNG+이/VCP+지/EC+,/SP	5
"정말이에요."	"/SS+정말/NNG+이/VCP+에요/EF+./SF+"/SS	5
