In [1]:
# Make the parent directory importable so that `import soynlp` can resolve a
# package located there (presumably a local checkout of the library - confirm).
# NOTE: append() places '../' at the END of sys.path, so a soynlp found in an
# earlier entry (e.g. an installed copy) is imported in preference to it.
import sys
sys.path.append('../')
import soynlp

## Build tokenizer

Vectorizer 가 이용할 토크나이저를 학습합니다. WordExtractor 로 단어별 cohesion score 를 계산한 뒤, 그 중 leftside cohesion 만을 점수로 사용하여 LTokenizer 를 만듭니다.

In [2]:
from soynlp import DoublespaceLineCorpus
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer
from soynlp.vectorizer import BaseVectorizer

# Placeholder: replace 'YOURS' with the path to your own corpus file.
corpus_path = 'YOURS'

# Open the corpus with iter_sent=True for the word-extraction pass
# (presumably this makes iteration yield sentences rather than documents - confirm).
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)

# Train the word extractor on the corpus.
word_extractor = WordExtractor()
word_extractor.train(corpus)

# Each value is a pair: (leftside cohesion, rightside cohesion).
cohesion_scores = word_extractor.all_cohesion_scores()

# Keep only the leftside cohesion (first element of each pair) as the
# per-word score handed to the tokenizer.
scores = dict(
    (term, pair[0]) for term, pair in cohesion_scores.items()
)
tokenizer = LTokenizer(scores=scores)

## Vectorizing

corpus 를 term frequency matrix 로 변환합니다.

In [None]:
# Turn off sentence-level iteration before vectorizing
# (presumably so that each iterated item is one document - confirm).
corpus.iter_sent = False

# Configure the vectorizer with the tokenizer built above. Filtering options:
# term-frequency bounds (min_tf / max_tf), document-frequency bounds
# (min_df / max_df), no stopword list, lowercasing on, progress output on.
vectorizer = BaseVectorizer(
    tokenizer=tokenizer,
    min_tf=0, max_tf=10000,
    min_df=0, max_df=1.0,
    stopwords=None,
    lowercase=True,
    verbose=True,
)

# Fit on the corpus and keep the resulting matrix in memory as `x`.
x = vectorizer.fit_transform(corpus)

## Save matrix market file without loading matrix on memory

대량의 문서에 대한 sparse matrix 를 메모리에 올리지 않고 곧바로 파일에 저장합니다. 

In [10]:
# Placeholder: replace 'YOURS' with the output path for the matrix file.
matrix_path = 'YOURS'

# Set explicitly here as well, because the tokenizer-building cell created
# the corpus with iter_sent=True.
corpus.iter_sent = False

# Fit on the corpus and write the result straight to `matrix_path`
# instead of returning a matrix.
vectorizer = BaseVectorizer(tokenizer=tokenizer, min_tf=1)
vectorizer.fit_to_file(corpus, matrix_path)

scanning was done                                        
8193 terms are recognized
writing to file was done. 100 docs                  


In [13]:
# Encode one raw sentence as a bag of words; the recorded output is a dict
# mapping integer term index -> count.
# NOTE(review): decoding that output in the following cell gives four terms and
# none for '전부다' - presumably terms outside the fitted vocabulary are dropped; confirm.
vectorizer.encode_a_doc_to_bow('오늘 뉴스는 이것이 전부다')

{3: 1, 258: 1, 428: 1, 1814: 1}

In [14]:
# Decode an index -> count bag of words (the literal is the output of the
# encode_a_doc_to_bow cell above); the recorded output is a dict mapping
# term string -> count.
vectorizer.decode_from_bow({3: 1, 258: 1, 428: 1, 1814: 1})

{'뉴스': 1, '는': 1, '오늘': 1, '이것이': 1}

In [15]:
# Encode one raw sentence as a list of integer term indices; unlike the
# bag-of-words form, the recorded output is a sequence rather than counts.
# NOTE(review): decoding that output in the following cell gives five terms and
# none for '심각합니다' - presumably out-of-vocabulary terms are skipped; confirm.
vectorizer.encode_a_doc_to_list('오늘의 뉴스는 매우 심각합니다')

[258, 4, 428, 3, 333]

In [16]:
# Decode a list of term indices (the literal is the output of the
# encode_a_doc_to_list cell above); the recorded output is a list of term
# strings in the same order as the input indices.
vectorizer.decode_from_list([258, 4, 428, 3, 333])

['오늘', '의', '뉴스', '는', '매우']