In [1]:
# Corpus file paths are provided by the project-local `config` module.
from config import raw_corpus_fname, tokenized_corpus_fname
print('raw_corpus_fname과 tokenized_corpus_fname의 타입은 str입니다. ')
print(type(raw_corpus_fname), type(tokenized_corpus_fname))

import sys
# Extend the import search path with the parent directory before importing
# soykeyword (presumably so the in-repo copy of the package is found — confirm).
sys.path.append('../')
import soykeyword

raw_corpus_fname과 tokenized_corpus_fname의 타입은 str입니다. 
<class 'str'> <class 'str'>


In [2]:
# Show only the file-name component (text after the last '/') of the tokenized corpus path.
tokenized_corpus_fname.rsplit('/', 1)[-1]

'2016-10-20_article_all_normed_nountokenized.txt'

In [3]:
class Corpus:
    """Re-iterable view of a UTF-8 text file holding one document per line.

    Every call to ``iter()`` reopens the file, so the same instance can be
    consumed several times (unlike a one-shot generator).

    Parameters
    ----------
    fname : str
        Path of the text file to read.
    """

    def __init__(self, fname):
        self.fname = fname
        # Cached number of lines; 0 means "not counted yet".
        self.length = 0

    def __iter__(self):
        """Yield each line of the file with surrounding whitespace stripped."""
        with open(self.fname, encoding='utf-8') as f:
            for doc in f:
                yield doc.strip()

    def __len__(self):
        """Return the number of lines in the file, counting them on first use.

        An empty file yields 0. Because 0 doubles as the "not counted yet"
        marker, an empty file is simply recounted on every call.
        """
        if self.length == 0:
            with open(self.fname, encoding='utf-8') as f:
                # Counting with sum() stays well-defined when the file has no
                # lines; a loop variable would be left unbound in that case.
                self.length = sum(1 for _ in f)
        return self.length

In [4]:
from soykeyword.proportion import CorpusbasedKeywordExtractor

# Configure the corpus-based extractor; each document is tokenized by
# splitting on whitespace.
corpusbased_extractor = CorpusbasedKeywordExtractor(
    min_tf=20,
    min_df=2,
    tokenize=lambda doc: doc.strip().split(),
    verbose=True,
)

# Train on the tokenized corpus, streamed line by line from disk.
corpusbased_extractor.train(Corpus(tokenized_corpus_fname))

training was done 34572 terms, 30091 docs, memory = 0.436 Gb


In [5]:
# Print the frequency the trained extractor reports for a few sample words.
for word in ('박근혜', '문재인', '최순실', '아이오아이', '트와이스', '군사', '외교'):
    print('%s %s' % (word, corpusbased_extractor.frequency(word)))

박근혜 1445
문재인 1010
최순실 1318
아이오아이 270
트와이스 655
군사 170
외교 881


In [6]:
# Ask the extractor for the document index of the word '아이오아이'
# (the recorded output of the next cell shows a list of integers).
documents = corpusbased_extractor.get_document_index('아이오아이')

In [7]:
# Preview only the first ten entries instead of dumping the whole list.
documents[:10]

[6884, 6897, 6956, 7338, 7345, 7582, 8011, 8053, 9180, 9228]

In [8]:
# Extract keywords for the aspect word '아이오아이', passing score and count
# thresholds, then display the first ten results.
keywords = corpusbased_extractor.extract_from_word('아이오아이', min_score=0.8, min_count=100)
keywords[:10]

[KeywordScore(word='아이오아이', frequency=270, score=1.0),
 KeywordScore(word='엠카운트다운', frequency=221, score=0.997897148491129),
 KeywordScore(word='펜타곤', frequency=104, score=0.9936420169665052),
 KeywordScore(word='잠깐', frequency=162, score=0.9931809154109712),
 KeywordScore(word='엠넷', frequency=125, score=0.9910325251765126),
 KeywordScore(word='걸크러쉬', frequency=111, score=0.9904705029926091),
 KeywordScore(word='타이틀곡', frequency=311, score=0.987384461584851),
 KeywordScore(word='코드', frequency=105, score=0.9871835929954923),
 KeywordScore(word='본명', frequency=105, score=0.9863934667369743),
 KeywordScore(word='엑스', frequency=101, score=0.9852544036088814)]

In [9]:
# Same extraction, but driven by the document list obtained earlier via
# get_document_index('아이오아이'); the recorded output is identical to the
# extract_from_word result for that word.
keywords = corpusbased_extractor.extract_from_docs(documents, min_score=0.8, min_count=100)
keywords[:10]

[KeywordScore(word='아이오아이', frequency=270, score=1.0),
 KeywordScore(word='엠카운트다운', frequency=221, score=0.997897148491129),
 KeywordScore(word='펜타곤', frequency=104, score=0.9936420169665052),
 KeywordScore(word='잠깐', frequency=162, score=0.9931809154109712),
 KeywordScore(word='엠넷', frequency=125, score=0.9910325251765126),
 KeywordScore(word='걸크러쉬', frequency=111, score=0.9904705029926091),
 KeywordScore(word='타이틀곡', frequency=311, score=0.987384461584851),
 KeywordScore(word='코드', frequency=105, score=0.9871835929954923),
 KeywordScore(word='본명', frequency=105, score=0.9863934667369743),
 KeywordScore(word='엑스', frequency=101, score=0.9852544036088814)]

In [10]:
# For each aspect word, print up to 48 keywords as a grid of 12 rows x 4 columns.
for word in ['박근혜', '문재인', '최순실', '아이오아이', '트와이스', '군사', '외교']:
    keywords = corpusbased_extractor.extract_from_word(word, min_score=0.8, min_count=150)[:48]
    print('Aspect word = %s' % word)
    # Walk the keyword list four at a time; rows past the end print as empty lines.
    for row_start in range(0, 48, 4):
        cells = []
        for keyword in keywords[row_start:row_start + 4]:
            label = '%s (%d, %.2f)' % (keyword.word, keyword.frequency, keyword.score)
            # Right-align every cell to a width of 17 characters.
            cells.append('%17s' % label)
        print('  -  '.join(cells))
    print('-' * 80)

Aspect word = 박근혜
 박근혜 (1445, 1.00)  -  수석비서관회의 (208, 1.00)  -    재단들 (152, 1.00)  -    연설문 (204, 0.99)
  누구라 (178, 0.99)  -   불법행위 (240, 0.99)  -     퇴임 (188, 0.98)  -     엄정 (388, 0.98)
 창조경제 (226, 0.98)  -    처벌받 (227, 0.98)  -     미르 (604, 0.98)  -  스포츠재단 (676, 0.97)
더블루케이 (194, 0.97)  -     최씨 (695, 0.97)  -    재단 (1690, 0.97)  -  자유학기제 (201, 0.97)
 비선실세 (219, 0.97)  -   최순실씨 (520, 0.97)  -   미르재단 (247, 0.96)  -    게이트 (303, 0.96)
 대통령 (5682, 0.96)  -     모녀 (223, 0.96)  -   행복교육 (227, 0.95)  -     실세 (309, 0.95)
   비선 (288, 0.95)  -   최순실 (1318, 0.95)  -    의혹 (3602, 0.95)  -     고양 (278, 0.95)
   국정 (185, 0.94)  -   청와대 (2112, 0.94)  -    지지층 (151, 0.94)  -    킨텍스 (332, 0.94)
   체육 (221, 0.94)  -     재계 (152, 0.93)  -     민생 (164, 0.93)  -  2002년 (186, 0.93)
   정권 (596, 0.93)  -     가중 (175, 0.93)  -     유용 (359, 0.93)  -    전경련 (348, 0.93)
   주재 (459, 0.93)  -    국민들 (441, 0.93)  -     백승 (216, 0.92)  -    갤러리 (271, 0.92)
  기업들 (808, 0.92)  -    지지율 (336, 0.92)  -     확산 (800, 

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a document-term count matrix from the tokenized corpus.
# NOTE(review): a float min_df is documented by scikit-learn as a proportion
# of documents rather than an absolute count — confirm 0.001 is intended.
vectorizer = CountVectorizer(min_df=0.001)
x = vectorizer.fit_transform(Corpus(tokenized_corpus_fname))

# The recorded output shows the shape as (30091, 9774).
print(x.shape)

(30091, 9774)


In [12]:
# term -> column index, exactly as learned by the vectorizer
word2index = vectorizer.vocabulary_
# column index -> term: the vocabulary terms ordered by their column index
index2word = sorted(word2index, key=word2index.get)

In [14]:
from soykeyword.proportion import MatrixbasedKeywordExtractor

# Matrix-based extractor trained directly on the count matrix `x`.
# No index2word is supplied here, so terms are addressed by integer index
# (see the recorded outputs of the following cells).
matrixbased_extractor = MatrixbasedKeywordExtractor(min_tf=20, 
                                                    min_df=2,
                                                    verbose=True)
matrixbased_extractor.train(x)

MatrixbasedKeywordExtractor trained


In [15]:
# Query by integer term index. The index is looked up from the vectorizer's
# vocabulary rather than hard-coded, so the cell keeps working if the
# vocabulary changes; int() normalizes a possible NumPy integer to a plain int.
aspect_index = int(word2index['아이오아이'])
keywords = matrixbased_extractor.extract_from_word(aspect_index, min_score=0.8, min_count=100)
keywords[:10]

[KeywordScore(word=5537, frequency=270, score=1.0),
 KeywordScore(word=5880, frequency=221, score=0.9978307775631691),
 KeywordScore(word=8976, frequency=104, score=0.9934422266805437),
 KeywordScore(word=7126, frequency=162, score=0.9929667382454291),
 KeywordScore(word=5879, frequency=125, score=0.9907514986652862),
 KeywordScore(word=1103, frequency=111, score=0.99017203825805),
 KeywordScore(word=8721, frequency=311, score=0.9869906112674688),
 KeywordScore(word=8651, frequency=105, score=0.9867835556082788),
 KeywordScore(word=4035, frequency=105, score=0.98596911773225),
 KeywordScore(word=5869, frequency=101, score=0.9847950780631249)]

In [16]:
# The extractor was trained without index2word, so keyword.word is an integer
# term index here; translate it back to the term through index2word.
for keyword in keywords[:10]:
    print('word=%s, frequency=%d, score=%.3f' % (index2word[keyword.word], keyword.frequency, keyword.score))

word=아이오아이, frequency=270, score=1.000
word=엠카운트다운, frequency=221, score=0.998
word=펜타곤, frequency=104, score=0.993
word=잠깐, frequency=162, score=0.993
word=엠넷, frequency=125, score=0.991
word=걸크러쉬, frequency=111, score=0.990
word=타이틀곡, frequency=311, score=0.987
word=코드, frequency=105, score=0.987
word=본명, frequency=105, score=0.986
word=엑스, frequency=101, score=0.985


In [18]:
# Deliberate demonstration: querying with a str while the extractor was
# trained without index2word raises; the message is caught and printed
# (see the recorded output below this cell).
try:
    keywords = matrixbased_extractor.extract_from_word('아이오아이', min_score=0.8, min_count=100)
except Exception as e:
    print(e)

If you want to insert str word, you should trained index2word first


In [19]:
# Build a second extractor and train it together with index2word so that
# terms can be queried by their string form.
matrixbased_extractor_w_indexer = MatrixbasedKeywordExtractor(min_tf=20,
                                                              min_df=2,
                                                              verbose=True)
# Train the instance that was just created; previously it was left untrained
# and unused while the earlier extractor was trained a second time.
matrixbased_extractor_w_indexer.train(x, index2word)

# The cells below query `matrixbased_extractor`; bind that name to the
# indexer-aware instance so they use the freshly trained extractor.
matrixbased_extractor = matrixbased_extractor_w_indexer

MatrixbasedKeywordExtractor trained


In [20]:
# Integer indices are still accepted after training with index2word. The
# index is looked up from the vocabulary instead of being hard-coded; int()
# normalizes a possible NumPy integer to a plain int.
# NOTE(review): the recorded output reports frequency=0 for every keyword
# once index2word is supplied, while the same query returned non-zero
# frequencies before — verify whether the extractor reports frequency
# correctly in this mode.
aspect_index = int(word2index['아이오아이'])
keywords = matrixbased_extractor.extract_from_word(aspect_index, min_score=0.8, min_count=100)
keywords[:10]

[KeywordScore(word='아이오아이', frequency=0, score=1.0),
 KeywordScore(word='엠카운트다운', frequency=0, score=0.9978307775631691),
 KeywordScore(word='펜타곤', frequency=0, score=0.9934422266805437),
 KeywordScore(word='잠깐', frequency=0, score=0.9929667382454291),
 KeywordScore(word='엠넷', frequency=0, score=0.9907514986652862),
 KeywordScore(word='걸크러쉬', frequency=0, score=0.99017203825805),
 KeywordScore(word='타이틀곡', frequency=0, score=0.9869906112674688),
 KeywordScore(word='코드', frequency=0, score=0.9867835556082788),
 KeywordScore(word='본명', frequency=0, score=0.98596911773225),
 KeywordScore(word='엑스', frequency=0, score=0.9847950780631249)]

In [21]:
# With index2word supplied at training time, the same query can be issued
# with the term's string form; the recorded output lists the same words and
# scores as the integer-index query.
keywords = matrixbased_extractor.extract_from_word('아이오아이', min_score=0.8, min_count=100)
keywords[:10]

[KeywordScore(word='아이오아이', frequency=0, score=1.0),
 KeywordScore(word='엠카운트다운', frequency=0, score=0.9978307775631691),
 KeywordScore(word='펜타곤', frequency=0, score=0.9934422266805437),
 KeywordScore(word='잠깐', frequency=0, score=0.9929667382454291),
 KeywordScore(word='엠넷', frequency=0, score=0.9907514986652862),
 KeywordScore(word='걸크러쉬', frequency=0, score=0.99017203825805),
 KeywordScore(word='타이틀곡', frequency=0, score=0.9869906112674688),
 KeywordScore(word='코드', frequency=0, score=0.9867835556082788),
 KeywordScore(word='본명', frequency=0, score=0.98596911773225),
 KeywordScore(word='엑스', frequency=0, score=0.9847950780631249)]