### 创建语料库

In [23]:
# Logging setup. Uncomment the basicConfig line below to turn on INFO-level log output.
import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [1]:
import os
import tempfile

# gettempdir() only returns the path of the system temp directory: nothing is
# created here, and files saved into it are not removed automatically.
TEMP_FOLDER = tempfile.gettempdir()
print('文件目录 "%s" 将被用于保存临时文件和语料库。' % (TEMP_FOLDER,))

文件目录 "C:\Users\86132\AppData\Local\Temp" 将被用于保存临时文件和语料库。


In [1]:
from gensim  import corpora

In [2]:
# Print the built-in API documentation of the Dictionary class (exploration aid only).
help(corpora.Dictionary)

Help on class Dictionary in module gensim.corpora.dictionary:

class Dictionary(gensim.utils.SaveLoad, collections.abc.Mapping)
 |  Dictionary(documents=None, prune_at=2000000)
 |  
 |  Dictionary encapsulates the mapping between normalized words and their integer ids.
 |  
 |  Notable instance attributes:
 |  
 |  Attributes
 |  ----------
 |  token2id : dict of (str, int)
 |      token -> tokenId.
 |  id2token : dict of (int, str)
 |      Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed).
 |  cfs : dict of (int, int)
 |      Collection frequencies: token_id -> how many instances of this token are contained in the documents.
 |  dfs : dict of (int, int)
 |      Document frequencies: token_id -> how many documents contain this token.
 |  num_docs : int
 |      Number of documents processed.
 |  num_pos : int
 |      Total number of corpus positions (number of processed words).
 |  num_nnz : int
 |      Total number of non-zeroes in the

In [3]:
# Nine short document titles that serve as the raw toy corpus.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [4]:
from collections import defaultdict
from pprint import pprint

# Lower-case every document, split it on whitespace and drop the stop words.
stopword = set('for a of the and to in'.split())
texts = []
for document in documents:
    texts.append([word for word in document.lower().split() if word not in stopword])

# Tally how often each remaining token occurs across all documents.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Discard tokens that occur only once in the whole collection.
texts = [
    [token for token in text if frequency[token] >= 2]
    for text in texts
]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [7]:
dictionary = corpora.Dictionary(texts)    # build the dictionary: every distinct token in `texts` gets an integer id
dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict'))     # persist the dictionary under TEMP_FOLDER
print(dictionary)
print(dictionary.token2id)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [9]:
# Convert a new document into a bag-of-words vector of (token_id, count) pairs
new_doc = "Human computer interaction human"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(0, 1), (1, 2)]


#### 使用dictionary.doc2bow()方法将文档中的单词与词典（dictionary）中的单词进行匹配，将单词转化为词典中的整数id，并统计出现次数，返回一个稀疏向量：如“human”的ID为1，在文本中出现了2次，则结果为：(1, 2)
#### 其效果类似于sklearn 中的CountVectorizer类中的transform()

In [11]:
# Convert every tokenized document into a sparse bag-of-words vector
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'deerwester.mm'), corpus)    # write the corpus to disk
for c in corpus:
    print(c)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


In [12]:
# smart_open streams very large files for reading/writing with an open()-like interface.
# NOTE(review): newer smart_open releases appear to deprecate `smart_open.smart_open`
# in favour of `smart_open.open` - confirm against the installed version before switching.
from smart_open import smart_open


class Mycorpus(object):
    """Stream a text file as bag-of-words vectors, one document per line.

    The file is read lazily while iterating, so the whole corpus never has to
    be held in memory at once.

    Parameters
    ----------
    path : str, optional
        File to read; each line is lower-cased and split on whitespace to form
        one document. Defaults to ``'test.txt'``, the path that was previously
        hard-coded, so ``Mycorpus()`` keeps working unchanged.

    Notes
    -----
    Uses the notebook-level ``dictionary`` object to map tokens to ids.
    """

    def __init__(self, path='test.txt'):
        self.path = path

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for every line of the file."""
        # The context manager closes the file handle as soon as iteration ends
        # (or the generator is closed), instead of leaving it to the garbage collector.
        with smart_open(self.path, 'rb') as lines:
            for line in lines:
                yield dictionary.doc2bow(line.lower().split())

In [None]:
# Creating the corpus object does not read the file; documents are only streamed
# when the object is iterated. (The assignment used to be commented out and
# misspelled, so the print below raised NameError.)
corpus_memory_friendly = Mycorpus()
print(corpus_memory_friendly)

# 使用语料库

In [14]:
dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'deerwester.dict'))   # reload the dictionary saved earlier
corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))    # open the corpus file written by MmCorpus.serialize

In [16]:
from gensim import models

In [18]:
# Build a TF-IDF model from the bag-of-words corpus
tfidf = models.TfidfModel(corpus)

In [21]:
doc_bow = [(0, 1), (1, 1)]
# Index the model with a bag-of-words vector to obtain its TF-IDF weights.
# The model is bound to `tfidf`; the previously used name `tf` is not defined
# anywhere in this notebook and fails on a fresh kernel.
print(tfidf[doc_bow])

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [23]:
# Apply the TF-IDF model to every document of the corpus and print the result.
# `tfidf` is the model variable defined in this notebook (`tf` was undefined).
for doc in tfidf[corpus]:
    print(doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]
