In [1]:
from config import root_folder
from config import corpus_folder
from config import model_folder
from config import soynlp_path

import sys
sys.path.append(soynlp_path)
from soynlp.hangle import normalize
from soynlp.word import WordExtractor
from soynlp.utils import DoublespaceLineCorpus
from soynlp.utils import get_process_memory
normalize('테스트입니다 abc, 아하하12.!@', english=True, number=True)

from glob import glob
corpus_fnames = glob('{}/*_text'.format(corpus_folder))

## Tokenizing and counting subwords

In [3]:
import pickle

for n_corpus, fname in enumerate(corpus_fnames):
    corpus_index = fname.split('/')[-1].split('_')[0]
    model_fname = '{}/{}_subword_statistics.pkl'.format(model_folder, corpus_index)
    corpus = DoublespaceLineCorpus(fname, iter_sent=False)
    
    L = {}
    DF = {}
    for n_doc, doc in enumerate(corpus):
        for word in doc.split():
            for e in range(1, max(8, len(word))+1):
                l = word[:e]
                L[l] = L.get(l, 0) + 1
        subwords = {word[:e] for word in set(doc.split()) for e in range(2, max(8, len(word))+1) if len(word) > 1}
        for subword in subwords:
            DF[subword] = DF.get(subword, 0) + 1
        if n_doc % 1000 == 999:
            print('\rscanning ... {} / {}, {} docs'.format(n_corpus, len(corpus_fnames), n_doc+1), flush=True, end='')
    
    cohesions = {}
    for l, count in L.items():
        n = len(l)
        if n < 2 or count < 10:
            continue
        cohesion = pow(count/L[l[0]], 1/(n-1))
        cohesions[l] = cohesion
    
    params = {
        'l_frequency':L,
        'l_document_frequency':DF,
        'l_cohesion': cohesions,
        'num_doc': (n_doc+1)
    }
    print('\rscanning was done {} / {}, used memory = {} Gb'.format(n_corpus+1, len(corpus_fnames), ' %.3f'%get_process_memory()))
    with open(model_fname, 'wb') as f:
        pickle.dump(params, f)
    
    del cohesions
    del DF
    del L

scanning was done 1 / 27, used memory =  5.461 Gb
scanning was done 2 / 27, used memory =  2.140 Gb
scanning was done 3 / 27, used memory =  7.476 Gb
scanning was done 4 / 27, used memory =  13.441 Gb
scanning was done 5 / 27, used memory =  9.492 Gb
scanning was done 6 / 27, used memory =  3.949 Gb
scanning was done 7 / 27, used memory =  3.882 Gb
scanning was done 8 / 27, used memory =  3.107 Gb
scanning was done 9 / 27, used memory =  3.177 Gb
scanning was done 10 / 27, used memory =  5.651 Gb
scanning was done 11 / 27, used memory =  1.259 Gb
scanning was done 12 / 27, used memory =  1.744 Gb
scanning was done 13 / 27, used memory =  2.250 Gb
scanning was done 14 / 27, used memory =  5.637 Gb
scanning was done 15 / 27, used memory =  4.343 Gb
scanning was done 16 / 27, used memory =  3.322 Gb
scanning was done 17 / 27, used memory =  1.851 Gb
scanning was done 18 / 27, used memory =  2.777 Gb
scanning was done 19 / 27, used memory =  3.195 Gb
scanning was done 20 / 27, used memory 

## Universial dictionary

subwords their frequency >= 100 for each category

In [5]:
import pickle

min_frequency = 100
universial_subwords = set()

for n_corpus, fname in enumerate(corpus_fnames):
    corpus_index = fname.split('/')[-1].split('_')[0]
    model_fname = '{}/{}_subword_statistics.pkl'.format(model_folder, corpus_index)
    with open(model_fname, 'rb') as f:        
        params = pickle.load(f)
        for subword, frequency in params['l_frequency'].items():
            if frequency < min_frequency or len(subword) < 2:
                continue
            universial_subwords.add(subword)
    print('cumulated {} corpus, {} subwords'.format(n_corpus+1, len(universial_subwords)))
print('done')

with open('{}/universial_subwords.txt'.format(model_folder), 'w', encoding='utf-8') as f:
    for subword in sorted(universial_subwords):
        f.write('{}\n'.format(subword))

cumulated 1 corpus, 262708 subwords
cumulated 2 corpus, 277148 subwords
cumulated 3 corpus, 401945 subwords
cumulated 4 corpus, 410275 subwords
cumulated 5 corpus, 410810 subwords
cumulated 6 corpus, 458762 subwords
cumulated 7 corpus, 466827 subwords
cumulated 8 corpus, 467290 subwords
cumulated 9 corpus, 469025 subwords
cumulated 10 corpus, 557032 subwords
cumulated 11 corpus, 557802 subwords
cumulated 12 corpus, 560095 subwords
cumulated 13 corpus, 563796 subwords
cumulated 14 corpus, 579662 subwords
cumulated 15 corpus, 611946 subwords
cumulated 16 corpus, 616908 subwords
cumulated 17 corpus, 616922 subwords
cumulated 18 corpus, 627818 subwords
cumulated 19 corpus, 631040 subwords
cumulated 20 corpus, 640659 subwords
cumulated 21 corpus, 647536 subwords
cumulated 22 corpus, 649109 subwords
cumulated 23 corpus, 653026 subwords
cumulated 24 corpus, 653206 subwords
cumulated 25 corpus, 656056 subwords
cumulated 26 corpus, 656758 subwords
cumulated 27 corpus, 667805 subwords
done


In [14]:
subword2index = {subword:index for index, subword in enumerate(sorted(universial_subwords))}

import numpy as np
n = len(subword2index)
m = 27
subword_slot = np.zeros((n, m), dtype=np.float16)

In [26]:
for corpus_index in range(m):
#     corpus_fname = '{}/corpus_norm/{}_corpus_text'.format(root_folder, corpus_index)
    model_fname = '{}/{}_subword_statistics.pkl'.format(model_folder, corpus_index)
    
    with open(model_fname, 'rb') as f:
        params = pickle.load(f)
        num_doc = params['num_doc']
        for subword, df in params['l_document_frequency'].items():
            if not (subword in subword2index):
                continue
            i = subword2index[subword]
            df = df / num_doc
            subword_slot[i,corpus_index] = df
        
        del params


import pickle

with open('{}/subword_df_slot.pkl'.format(model_folder), 'wb') as f:
    params = {
        'subword_slot': subword_slot,
        'subword2index': subword2index
    }
    pickle.dump(params, f)
print('done')

done


## No-car words

In [27]:
index2subword = [word for word,index in sorted(subword2index.items(),key=lambda x:x[1])]
subword_slot.shape

(667805, 27)

- 다른 categories에서 한번도 안나온 단어들 때문에 난리남. 이런 단어들 어떻게 처리할지 고민해야 함
- universial subwords로 min frequency > 100 을 잡긴 했는데, df를 살펴보면 min_df도 있어야 싶음. 
- 혹은 document clustering을 먼저 거쳐서 단어 추출을 해야 하나? 

In [58]:
subword2index['귀신오빠']

205484

In [59]:
subword_slot[205484]

array([  1.61528587e-05,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00], dtype=float16)

In [57]:
top5_diff[607967]

array([  2.01718750e+001,   2.03750000e+001,   2.05156250e+001,
         2.05156250e+001,   2.06406250e+001,   2.36392688e-311])

In [51]:
top5_diff = np.ndarray((n,6))
for i in range(n):
    tops = sorted(subword_slot[i,:], key=lambda x:-x)[:6]
    mean = subword_slot[i,:].mean()
    for j in range(5):
        top5_diff[i,j] = (tops[0] - tops[j+1]) / mean
    if i % 100 == 99:
        print('\rcompute topk_diff score {} / {} ... '.format(i+1, n), flush=True, end='')
print('done')

compute topk_diff score 667800 / 667805 ... done


In [60]:
for word, score in sorted(enumerate(top5_diff[:,0]), key=lambda x:-x[1])[50000:51000]:
    print(index2subword[word], '%.3f' % score)

지라치 15.914
치료로 15.914
케이티앤지 15.914
파사트는 15.914
포메탈 15.914
폭스바겐골프중고 15.914
120240 15.906
CHEMICON 15.906
E220은 15.906
SERISE모델320 15.906
강준형 15.906
검버섯을 15.906
골든1 15.906
도시관리계 15.906
도시관리계획 15.906
디바인 15.906
리밸런싱을 15.906
명파리 15.906
문배철강 15.906
미항중 15.906
사내강 15.906
심포니아 15.906
에뜨와르 15.906
윤희도 15.906
인테그레이 15.906
자동차밧데리암 15.906
자동차밧데리암전 15.906
자동차밧데리암전류 15.906
자동차밧데리암전류측 15.906
자동차밧데리암전류측정 15.906
제외주 15.906
포켓몬을 15.906
한전K 15.906
합성유케미컬네 15.906
합성유케미컬네비 15.906
합성유케미컬네비게 15.906
합성유케미컬네비게이 15.906
합성유케미컬네비게이션 15.906
홀슈타인 15.906
0Al 15.898
16N01 15.898
4GH 15.898
Benchm 15.898
Benchma 15.898
D92A 15.898
HWV 15.898
NSFP 15.898
Nicc 15.898
Sisley 15.898
TATO 15.898
가젤의 15.898
개봉일을 15.898
공명정대하게 15.898
뉴스포티지TL 15.898
뉴스포티지TLX 15.898
다은마을 15.898
만평입니다 15.898
모집내 15.898
백청강의 15.898
상장심 15.898
세우테크 15.898
연주 15.898
연준지수 15.898
오늘보다는 15.898
일본한테 15.898
칼다스 15.898
터보R 15.898
파핸 15.898
현대피 15.898
0승 15.891
4FGG400I 15.891
78포인트 15.891
A11 15.891
ECB도 15.891
ECB총재 15.891
NASDAQ 15.891
OCI 15.891
Rege

## Exploration

이 단어들이 잘 나오는건, 매우 많이 등장하는 단어여서 그럴 거라 예상

In [40]:
for word in ['극장', '상영', '엔진', '차량', '악장', '휴양지', '터미네이터', '구매', '리스', '영화', '로봇', '커피', '미국', '선수']:
    idx = subword2index.get(word, -1)
    if idx == -1: continue
    print('\n{}'.format(word), (100*subword_slot[idx,:]).max())
    print(100*subword_slot[idx,:])


극장 0.86035
[ 0.28588867  0.38232422  0.36401367  0.16540527  0.10107422  0.13793945
  0.06970215  0.10089111  0.22375488  0.44873047  0.86035156  0.0531311
  0.07519531  0.0836792   0.21899414  0.07293701  0.08251953  0.46655273
  0.05950928  0.07037354  0.06213379  0.68603516  0.07562256  0.12304688
  0.10223389  0.46923828  0.32446289]

상영 0.39355
[ 0.11785889  0.21777344  0.20275879  0.10693359  0.08209229  0.08862305
  0.06970215  0.06524658  0.12054443  0.20690918  0.29418945  0.03564453
  0.03308105  0.05554199  0.06137085  0.03894043  0.03039551  0.27954102
  0.04534912  0.05303955  0.04141235  0.15307617  0.11090088  0.09173584
  0.03405762  0.39355469  0.27783203]

엔진 36.562
[ 21.546875   33.75       21.4375     21.703125   22.8125     23.90625
  28.03125    28.859375   34.375      15.5859375  18.078125   30.375
  29.984375   36.15625    28.953125   29.984375   17.1875     24.1875
  29.65625    31.59375    33.5625     14.9375     36.5625     33.75       26.375
  19.15625    2

In [2]:
import argparse
from glob import glob

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus_directory', type=str, default='./', help='corpus directory')
    parser.add_argument('--model_directory', type=str, default='./', help='model directory')
    args = parser.parse_args()

    corpus_fnames = glob('{}/*_text'.format(args.corpus_directory))
    print(corpus_fnames)