In [3]:
import config
import navermovie_comments

## Max Score Tokenizer (Cohesion + NounExtractor + Actor name dictionary)

비지도학습 기반 단어 추출방법을 이용하는 토크나이저를 학습합니다.

토크나이저의 성능은 명사 인식 능력에 큰 영향을 받습니다. 명사는 한국어에서 가장 많이 이용되는 품사이며, 미등록단어가 자주 등장하는 품사이기도 합니다. 특히 word embedding 을 적용할 때에는 하나의 명사가 두 개 이상의 subwords 로 나뉘지 않는 것이 중요합니다.

그렇기 때문에 soynlp 의 WordExtractor 로부터 학습되는 cohesion score 와 NounExtractor 로부터 학습되는 noun score 를 조합하여 이용합니다. 여기에, 이 도메인에서 알려진 단어사전인 영화배우 이름을 추가하였습니다.

In [4]:
from soynlp.word import WordExtractor
from soynlp.noun import LRNounExtractor_v2
from navermovie_comments import load_movie_comments

# Load the raw (untokenized) large comment set.
idxs, texts, rates = load_movie_comments(large=True, tokenize=None)

# --- cohesion scores -------------------------------------------------------
word_extractor = WordExtractor()
word_extractor.train(texts)
# all_cohesion_scores() yields {str: (L-score, R-score)}; only the L-score is
# kept, giving {str: L-score}.
cohesions = {
    sub: lr_scores[0]
    for sub, lr_scores in word_extractor.all_cohesion_scores().items()
}

# --- noun scores -----------------------------------------------------------
# The actor-dictionary cell and the word_scores cell further down both read
# `nouns`; with this extraction disabled they fail with NameError on a fresh
# kernel, so it is kept enabled here.
# NOTE(review): this adds an extra training pass over the large corpus on top
# of the WordExtractor; make sure the machine has enough memory.
noun_extractor = LRNounExtractor_v2()
# train_extract() yields {str: NounScore}; only the `.score` field is kept,
# giving {str: noun-score}.
nouns = {
    sub: noun_score.score
    for sub, noun_score in noun_extractor.train_extract(
        texts, min_noun_frequency=5
    ).items()
}

training was done. used memory 6.847 Gb use memory 8.188 Gb
all cohesion probabilities was computed. # words = 801752


In [None]:
import pickle

# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load casting.pkl from a trusted copy of the dataset.
casting_path = '../lovit_textmining_dataset/navermovie_comments/data/casting.pkl'
with open(casting_path, 'rb') as f:
    castings = pickle.load(f)

# Collect every credited 'k_name' (presumably the Korean name -- confirm
# against the dataset) with its spaces removed, and give each a noun score of 1.0.
actor_names = dict.fromkeys(
    {
        credit['k_name'].replace(' ', '')
        for credits in castings.values()
        for credit in credits
    },
    1.0,
)

# Register the actor names as nouns. Requires `nouns` ({str: noun-score}) to
# already exist in the kernel; an existing entry with the same key is overwritten.
nouns.update(actor_names)

In [58]:
# Combined word score:
#   known noun         -> noun score + 0.3 * cohesion (cohesion counts as 0 if absent)
#   cohesion-only word -> 0.3 * cohesion
# The two groups have disjoint keys, so merging them never overwrites an entry.
word_scores = {
    **{
        sub: noun_score + 0.3 * cohesions.get(sub, 0)
        for sub, noun_score in nouns.items()
    },
    **{
        sub: 0.3 * cohesion
        for sub, cohesion in cohesions.items()
        if sub not in nouns
    },
}

학습된 토크나이저를 테스트합니다.

In [5]:
from soynlp.tokenizer import MaxScoreTokenizer

# Two scoring options for the tokenizer:
#   (a) noun score combined with cohesion (currently disabled)
# tokenizer = MaxScoreTokenizer(scores = word_scores)
#   (b) cohesion only (the one in use)
tokenizer = MaxScoreTokenizer(scores=cohesions)

# Unspaced sample sentences used to eyeball the segmentation quality.
tests = [
    '영화리뷰들로학습한모델입니다',
    '인셉션에는디카프리오가출연했습니다풀네임은레오나르도디카프리오입니다',
    '이동진은영화평론가입니다',
    '1점2점일점이점별점을줍니다',
    '영화관의종류에는imax3d4d포디4D등이있습니다'
]

# Print each sentence, then its token list, then a blank separator line.
for sentence in tests:
    print(sentence)
    tokens = tokenizer.tokenize(sentence)
    print(tokens, end='\n\n')

영화리뷰들로학습한모델입니다
['영화', '리뷰', '들로', '학습', '한', '모델', '입니다']

인셉션에는디카프리오가출연했습니다풀네임은레오나르도디카프리오입니다
['인셉션', '에는', '디카프리오', '가', '출연', '했', '습니다', '풀', '네임', '은', '레오나르도디카프리오', '입니다']

이동진은영화평론가입니다
['이동진은', '영화', '평론가', '입니다']

1점2점일점이점별점을줍니다
['1점', '2점', '일', '점이', '점', '별점', '을', '줍니다']

영화관의종류에는imax3d4d포디4D등이있습니다
['영화', '관의', '종류', '에는', 'imax', '3d', '4d', '포디', '4D', '등이', '있', '습니다']



In [7]:
# Tokenize every raw comment with the trained tokenizer and write one
# "<idx>\t<space-joined tokens>\t<rate>" line per comment.

dirname = '../lovit_textmining_dataset/navermovie_comments/data/'
suffix = 'soynlp_cohesion'

# The export is switched off with a literal `if False:`; change it to True to
# regenerate both the large and the small tokenized files.
if False:
    for large in (True, False):
        size = 'large' if large else 'small'
        path = '{}/data_{}_{}.txt'.format(dirname, size, suffix)
        idxs, texts, rates = load_movie_comments(large=large, tokenize=None)
        with open(path, 'w', encoding='utf-8') as f:
            for i, (idx, text, rate) in enumerate(zip(idxs, texts, rates)):
                # progress indicator, rewritten in place every 10,000 comments
                if i % 10000 == 0:
                    print('\rtokenizing {} sents'.format(i), end='')
                tokens = tokenizer.tokenize(text)
                f.write('{}\t{}\t{}\n'.format(idx, ' '.join(tokens), rate))
        # the trailing spaces blank out the rest of the progress line
        print('\rdone {}{}'.format(size, ' '*20))

done large                    
done small                    


### Word2Vec, Doc2Vec training

In [2]:
from navermovie_comments import get_movie_comments_path

# Location of the pre-tokenized *large* comment file; it is handed to the
# Word2Vec / Doc2Vec corpus wrappers in the large-dataset training cell.
# NOTE(review): the tokenization cell above writes files with the suffix
# 'soynlp_cohesion', while this requests tokenize='soynlp_unsup' -- confirm
# that get_movie_comments_path resolves to the intended file.
path = get_movie_comments_path(large=True, tokenize='soynlp_unsup')

In [35]:
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument

class Word2VecComments:
    """Re-iterable corpus that streams tokenized comments from a text file.

    Every line of the file is expected to hold exactly three tab-separated
    fields, ``<idx>\\t<text>\\t<rate>``, where ``text`` is already tokenized
    and space-separated. Each call to ``__iter__`` reopens the file, so the
    object can be iterated any number of times.

    Parameters
    ----------
    path : str
        Path of the tab-separated comment file.
    verbose : bool
        If True, print a progress line every 10,000 lines and a summary line
        at the end of each pass.

    Attributes
    ----------
    n_iter : int
        Number of passes over the file that ran to completion.
    """

    def __init__(self, path, verbose=False):
        self.path = path
        self.verbose = verbose
        self.n_iter = 0

    def __iter__(self):
        """Yield one tokenized item per line of the file."""
        # Bound before the loop so that the summary line below does not raise
        # UnboundLocalError when the file is empty.
        i = 0
        with open(self.path, encoding='utf-8') as f:
            for i, doc in enumerate(f):
                if self.verbose and (i % 10000 == 0):
                    print('\riter={}, sents={} ...'.format(self.n_iter, i), end='')
                yield self._tokenize(doc)
            if self.verbose:
                # `i` is the zero-based index of the last line read
                # (0 for an empty file).
                print('\riter={}, sents={} done'.format(self.n_iter, i))
            # Only reached when the pass ran to the end of the file.
            self.n_iter += 1

    def _tokenize(self, doc):
        """Return the whitespace-split tokens of the text field of ``doc``.

        Raises ValueError if the stripped line does not split into exactly
        three tab-separated fields.
        """
        idx, text, rate = doc.strip().split('\t')
        return text.split()

class Doc2VecComments(Word2VecComments):
    """Variant of Word2VecComments that yields TaggedDocument items."""

    def _tokenize(self, doc):
        """Turn one ``<idx>\\t<text>\\t<rate>`` line into a TaggedDocument.

        The words are the whitespace-split text field; the single tag is the
        comment index prefixed with '#'.
        """
        idx, text, _rate = doc.strip().split('\t')
        tag = '#%s' % idx
        return TaggedDocument(words=text.split(), tags=[tag])

## train with large size dataset

In [None]:
# Word2Vec on the large corpus. No hyperparameters are passed, so gensim's
# defaults apply; the corpus is handed straight to the constructor.
word2vec_corpus = Word2VecComments(path)
word2vec_model = Word2Vec(word2vec_corpus)

# Doc2Vec on the same file; every comment is tagged with '#<idx>'.
doc2vec_corpus = Doc2VecComments(path)
doc2vec_model = Doc2Vec(doc2vec_corpus)

In [32]:
import pickle

# Directory that receives the pickled models. From here on `dirname` points
# at models/, no longer at data/.
dirname = '../lovit_textmining_dataset/navermovie_comments/models/'

# Word2Vec model trained on the large corpus.
path = '{}/word2vec_large_soynlp_unsup_gensim3-6.pkl'.format(dirname)
with open(path, 'wb') as fout:
    pickle.dump(word2vec_model, fout)

# Doc2Vec model trained on the large corpus.
path = '{}/doc2vec_large_soynlp_unsup_gensim3-6.pkl'.format(dirname)
with open(path, 'wb') as fout:
    pickle.dump(doc2vec_model, fout)

## train with small size dataset

In [36]:
%%time

# Word2Vec on the small corpus tokenized with 'soynlp_unsup'; verbose=True
# prints one progress / summary line per pass over the file.
word2vec_corpus = Word2VecComments(
    get_movie_comments_path(large=False, tokenize='soynlp_unsup'),
    verbose=True,
)
word2vec_model = Word2Vec(word2vec_corpus)

# Pickle the trained model into the models directory (`dirname` is set by the
# earlier model-saving cell).
path = '{}/word2vec_small_soynlp_unsup_gensim3-6.pkl'.format(dirname)
with open(path, 'wb') as fout:
    pickle.dump(word2vec_model, fout)

iter=0, sents=294492 done
iter=1, sents=294492 done
iter=2, sents=294492 done
iter=3, sents=294492 done
iter=4, sents=294492 done
iter=5, sents=294492 done
CPU times: user 59.5 s, sys: 264 ms, total: 59.8 s
Wall time: 29.3 s


In [49]:
%%time

# Word2Vec on the small corpus tokenized with 'komoran'; verbose=True prints
# one progress / summary line per pass over the file.
word2vec_corpus = Word2VecComments(
    get_movie_comments_path(large=False, tokenize='komoran'),
    verbose=True,
)
word2vec_model = Word2Vec(word2vec_corpus)

# Pickle the trained model into the models directory (`dirname` is set by the
# earlier model-saving cell).
path = '{}/word2vec_small_komoran_gensim3-6.pkl'.format(dirname)
with open(path, 'wb') as fout:
    pickle.dump(word2vec_model, fout)

iter=0, sents=294492 done
iter=1, sents=294492 done
iter=2, sents=294492 done
iter=3, sents=294492 done
iter=4, sents=294492 done
iter=5, sents=294492 done
CPU times: user 1min 5s, sys: 396 ms, total: 1min 6s
Wall time: 32.8 s
