세종말뭉치의 통계를 계산하기 위하여 만든 Jupyter notebook 파일입니다. 

In [1]:
from glob import glob
# Collect the paths of every morpheme-analysis text file; the two wildcard
# directory levels are left open so all sub-corpora are matched.
fnames = glob('../data/processed/cd1/02_말뭉치/현대/*/*/형태분석_말뭉치/*.txt')
# Bare last expression of the cell: the notebook displays the number of files.
len(fnames)

479

In [2]:
class Documents:
    """Re-iterable stream of sentences read from tab-separated corpus files.

    Every non-blank line of every file must hold exactly two tab-separated
    fields: the raw sentence and its tagged analysis. Both fields are split
    on whitespace before being yielded.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the UTF-8 encoded files to read, in iteration order.
    yield_text : bool
        Include the whitespace-split sentence field.
    yield_tag : bool
        Include the whitespace-split tag field.

    Yields
    ------
    ``(words, tags)`` when both flags are set, ``words`` or ``tags`` alone
    when only one flag is set, and nothing at all when both are False.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two fields.
    """

    def __init__(self, fnames, yield_text=True, yield_tag=True):
        self.fnames = fnames
        self.yield_text = yield_text
        self.yield_tag = yield_tag

    def __iter__(self):
        # Nothing was requested, so do not even open the files.
        if not (self.yield_text or self.yield_tag):
            return
        for fname in self.fnames:
            with open(fname, encoding='utf-8') as f:
                for lineno, row in enumerate(f, 1):
                    row = row.strip()
                    # Blank lines (e.g. a trailing newline at the end of a
                    # file) carry no sentence; skip them instead of failing
                    # on the two-field unpack below.
                    if not row:
                        continue
                    fields = row.split('\t')
                    if len(fields) != 2:
                        raise ValueError(
                            '%s:%d: expected "sentence<TAB>tags", got %r'
                            % (fname, lineno, row))
                    sent, tag = fields
                    if self.yield_text and self.yield_tag:
                        yield (sent.split(), tag.split())
                    elif self.yield_text:
                        yield sent.split()
                    else:
                        yield tag.split()
                            
# Preview: print the first four (words, tags) pairs of the corpus.
docs = Documents(fnames, yield_text=True, yield_tag=True)
for idx, pair in enumerate(docs):
    if idx >= 4:
        break
    words, tags = pair
    print(words, tags)

['뭐', '타고', '가?'] ['뭐/NP', '타/VV+고/EC', '가/VV+ㅏ/EF+?/SF']
['지하철.'] ['지하철/NNG+./SF']
['기차?'] ['기차/NNG+?/SF']
['아침에', '몇', '시에', '타고', '가는데?'] ['아침/NNG+에/JKB', '몇/MM', '시/NNB+에/JKB', '타/VV+고/EC', '가/VV+는데/EF+?/SF']


### number of sentences

In [7]:
# Sentence count of the spoken (구어) part of the corpus.
fnames = glob('../data/processed/cd1/02_말뭉치/현대/구어/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

docs = Documents(fnames, yield_text=True, yield_tag=False)
# Count by consuming the stream. Reading the last enumerate() index instead
# would raise NameError on an empty corpus, or silently reuse a value left
# behind by a previously executed cell.
n_sents = sum(1 for _ in docs)
print('num sents = %d' % n_sents)

num files = 200
num sents = 216723


In [8]:
# Sentence count of the written (문어) part of the corpus.
fnames = glob('../data/processed/cd1/02_말뭉치/현대/문어/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

docs = Documents(fnames, yield_text=True, yield_tag=False)
# Count by consuming the stream. Reading the last enumerate() index instead
# would raise NameError on an empty corpus, or silently reuse a value left
# behind by a previously executed cell.
n_sents = sum(1 for _ in docs)
print('num sents = %d' % n_sents)

num files = 279
num sents = 837843


### number of eojeol

In [15]:
fnames = glob('../data/processed/cd1/02_말뭉치/현대/*/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

from collections import Counter
# Frequency of every surface eojeol (whitespace-delimited token) in the corpus.
docs = Documents(fnames, yield_text=True, yield_tag=False)
counter = Counter(eojeol for sent in docs for eojeol in sent)

print('num eojeols = %d' % (sum(counter.values())))
print('unique num eojeols = %d' % (len(counter)))

print('Top 50 eojeols')

from pprint import pprint
# most_common(50) returns the same list, in the same order, as sorting all
# items by count in descending order and slicing the first 50, but it does
# not have to sort every unique eojeol to get there.
pprint(counter.most_common(50))

num eojeols = 10807777
unique num eojeols = 1560437
Top 50 eojeols
[('그', 81457),
 ('수', 68306),
 ('있다.', 56845),
 ('있는', 55802),
 ('이', 47680),
 ('한', 39459),
 ('것이다.', 37810),
 ('하는', 25381),
 ('것은', 24816),
 ('것이', 24642),
 ('그러나', 24511),
 ('할', 22909),
 ('나는', 22162),
 ('한다.', 21743),
 ('같은', 21198),
 ('있었다.', 20835),
 ('대한', 20529),
 ('또', 19782),
 ('그런', 19575),
 ('그리고', 18481),
 ('안', 17729),
 ('더', 17374),
 ('것을', 16830),
 ('우리', 16461),
 ('하고', 16312),
 ('그는', 15906),
 ('다른', 15238),
 ('때', 14866),
 ('없는', 14781),
 ('했다.', 14724),
 ('등', 14473),
 ('이런', 14375),
 ('두', 13645),
 ('내가', 13533),
 ('것', 13433),
 ('잘', 13230),
 ('다시', 13112),
 ('어떤', 12771),
 ('때문에', 12745),
 ('내', 12236),
 ('다', 11823),
 ('그의', 11120),
 ('것으로', 11046),
 ('없다.', 10544),
 ('이렇게', 10499),
 ('게', 10452),
 ('위해', 10374),
 ('가장', 10309),
 ('아니라', 10125),
 ('있을', 9922)]


### number of (word,tag) pairs

In [5]:
fnames = glob('../data/processed/cd1/02_말뭉치/현대/*/*/형태분석_말뭉치/*.txt')
print('num files = %d' % len(fnames))

import re
from collections import Counter

# Morphemes inside a tagged eojeol are joined with '+', e.g. '타/VV+고/EC'.
# Only a '+' that directly follows a '/TAG' is a separator: a plain
# split('+') would tear a literal plus-sign morpheme such as '+/SW' into
# '' and '/SW' and count both as bogus entries.
# NOTE(review): assumes tags are 2-3 uppercase ASCII letters, which holds for
# every tag visible in this notebook's output -- confirm against the tag set.
morph_sep = re.compile(r'(?<=/[A-Z]{2})\+|(?<=/[A-Z]{3})\+')

docs = Documents(fnames, yield_text=False, yield_tag=True)
counter_wordtag = Counter(
    w for sent in docs for eojeol in sent for w in morph_sep.split(eojeol))

print('num (word,tag) = %d' % (sum(counter_wordtag.values())))
print('unique (word,tag) = %d' % (len(counter_wordtag)))

print('Top 50 (word,tag)')

from pprint import pprint
# Same result and order as sorting all items by count (descending) and
# keeping the first 50.
pprint(counter_wordtag.most_common(50))

num files = 479
num (word,tag) = 24464414
unique (word,tag) = 233023
Top 50 (word,tag)
[('./SF', 844440),
 ('의/JKG', 520895),
 ('을/JKO', 518468),
 ('ᆫ/ETM', 504474),
 ('다/EF', 488114),
 ('하/XSV', 458052),
 ('이/VCP', 450704),
 (',/SP', 447595),
 ('에/JKB', 416730),
 ('이/JKS', 368869),
 ('는/ETM', 355747),
 ('고/EC', 334472),
 ('는/JX', 331517),
 ('를/JKO', 299948),
 ('었/EP', 262254),
 ('은/JX', 260232),
 ('가/JKS', 245540),
 ('았/EP', 219846),
 ('하/XSA', 213519),
 ('것/NNB', 213480),
 ('어/EC', 212189),
 ('아/EC', 197251),
 ('도/JX', 176239),
 ('ᆯ/ETM', 172974),
 ('들/XSN', 172217),
 ('"/SS', 168212),
 ('하/VV', 161492),
 ('으로/JKB', 155012),
 ('에서/JKB', 145061),
 ("'/SS", 130833),
 ('게/EC', 129173),
 ('있/VV', 129074),
 ('있/VX', 127462),
 ('적/XSN', 126644),
 ('로/JKB', 113025),
 ('되/XSV', 106876),
 ('은/ETM', 102896),
 ('지/EC', 99173),
 ('ᆫ다/EF', 97583),
 ('기/ETN', 96926),
 (')/SS', 96347),
 ('(/SS', 94872),
 ('수/NNB', 87306),
 ('되/VV', 85798),
 ('그/MM', 85468),
 ('하/VX', 84812),
 ('없/VA', 71620),
 ('?/

### write [token, wordtag, frequency] table

In [19]:
from collections import Counter
def to_eojeol_wordtag(sent, tags):
    """Pair the i-th item of ``sent`` with the i-th item of ``tags``.

    Returns a list of ``(word, tag)`` tuples; like ``zip``, the result is
    as long as the shorter of the two inputs.
    """
    return list(zip(sent, tags))
# Tally how often each (eojeol, tagged analysis) pair occurs in the corpus.
counter_pair = Counter()
docs = Documents(fnames, yield_text=True, yield_tag=True)
for sent, tags in docs:
    counter_pair.update(to_eojeol_wordtag(sent, tags))
print('unique (eojeol,word,tag) = %d' % len(counter_pair))


unique (eojeol,word,tag) = 1642217


In [22]:
# Dump the pair counts as "eojeol<TAB>wordtag<TAB>frequency" rows, most
# frequent pair first.
with open('../data/processed/lr/tokentable.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        '%s\t%s\t%d\n' % (eojeol, wordtag, freq)
        for (eojeol, wordtag), freq in counter_pair.most_common()
    )

## tokentable line grep

Jupyter notebook에서 query가 포함된 tokentable의 행(line)을 바로 찾아볼 수 있도록 likesearch 함수를 만들어 두었습니다. 

In [9]:
def likesearch(query, topk=100, path='../data/processed/lr/tokentable.txt'):
    """Print the first ``topk`` rows of a token table that contain ``query``.

    The match is a plain substring test against the whole row, so a hit may
    come from any column. Rows are printed in file order with surrounding
    whitespace stripped. Nothing is returned, which keeps a bare call from
    producing an extra notebook output value.

    Parameters
    ----------
    query : str
        Substring to look for.
    topk : int
        Maximum number of matching rows to print; nothing is printed when
        it is zero or negative.
    path : str
        UTF-8 text file to scan. Defaults to the token table location used
        by this notebook.
    """
    n_count = 0
    with open(path, encoding='utf-8') as f:
        for row in f:
            # Stop scanning once enough matches have been shown.
            if n_count >= topk:
                break
            if query in row:
                n_count += 1
                print(row.strip())

In [41]:
# Show at most 30 token-table rows that contain the substring '지난해'.
likesearch('지난해', topk=30)

지난해	지난해/NNG	1311
지난해	지나/VV+ᆫ/ETM+해/NNG	55
지난해의	지난해/NNG+의/JKG	54
지난해보다	지난해/NNG+보다/JKB	52
지난해부터	지난해/NNG+부터/JX	52
지난해에	지난해/NNG+에/JKB	50
"지난해	"/SS+지난해/NNG	48
지난해에는	지난해/NNG+에/JKB+는/JX	34
지난해와	지난해/NNG+와/JKB	22
지난해까지	지난해/NNG+까지/JX	16
지난해말	지난해/NNG+말/NNB	13
지난해에도	지난해/NNG+에/JKB+도/JX	10
지난해말의	지난해/NNG+말/NNB+의/JKG	9
지난해엔	지난해/NNG+에/JKB+ᆫ/JX	8
지난해는	지난해/NNG+는/JX	6
지난해	지난/MM+해/NNG	6
지난해와	지난해/NNG+와/JC	5
지난해부터는	지난해/NNG+부터/JX+는/JX	5
"지난해에는	"/SS+지난해/NNG+에/JKB+는/JX	4
지난해를	지난해/NNG+를/JKO	4
지난해만	지난해/NNG+만/JX	4
지난해에만	지난해/NNG+에/JKB+만/JX	4
지난해까지만	지난해/NNG+까지/JX+만/JX	3
지난해,	지난해/NNG+,/SP	3
지난해까지는	지난해/NNG+까지/JX+는/JX	3
지지난해	지지난해/NNG	3
지난해(10.3%)와	지난해/NNG+(/SS+10/SN+./SP+3/SN+%/SW+)/SS+와/JKB	2
지난해에도	지나/VV+ᆫ/ETM+해/NNG+에/JKB+도/JX	2
지난해까지	지나/VV+ᆫ/ETM+해/NNG+까지/JX	2
지난해에,	지난해/NNG+에/JKB+,/SP	2
