In [3]:
from gensim import corpora


# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
# remove stop words and words that appear only once
stoplist = set('for a of the and to in'.split())
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [5]:
import pprint
pprint.pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


In [6]:
dictionary.save('./tmp/test.dict')

In [9]:
mydic=corpora.Dictionary.load('./tmp/test.dict')
print(mydic)

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [10]:
from smart_open import open  # for transparently opening remote files


class MyCorpus:
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield mydic.doc2bow(line.lower().split())

In [11]:
corpus_memory_friendly = MyCorpus()
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


In [12]:
new_doc = "Human computer interaction"
new_vec = mydic.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


In [14]:
corpora.MmCorpus.serialize('./tmp/test.mm', corpus_memory_friendly)

In [16]:
corpus = corpora.MmCorpus('./tmp/test.mm')
print(list(corpus))

[[(0, 1.0), (1, 1.0), (2, 1.0)], [(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)], [(2, 1.0), (5, 1.0), (7, 1.0), (8, 1.0)], [(1, 1.0), (5, 2.0), (8, 1.0)], [(3, 1.0), (6, 1.0), (7, 1.0)], [(9, 1.0)], [(9, 1.0), (10, 1.0)], [(9, 1.0), (10, 1.0), (11, 1.0)], [(4, 1.0), (10, 1.0), (11, 1.0)]]


In [17]:
for doc in corpus:
    print(doc)

[(0, 1.0), (1, 1.0), (2, 1.0)]
[(0, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0)]
[(2, 1.0), (5, 1.0), (7, 1.0), (8, 1.0)]
[(1, 1.0), (5, 2.0), (8, 1.0)]
[(3, 1.0), (6, 1.0), (7, 1.0)]
[(9, 1.0)]
[(9, 1.0), (10, 1.0)]
[(9, 1.0), (10, 1.0), (11, 1.0)]
[(4, 1.0), (10, 1.0), (11, 1.0)]


In [19]:
import gensim
import numpy as np
# numpy_matrix = np.random.randint(10, size=[5, 2])  # random matrix as an example
# corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=12)
numpy_matrix

array([[1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 1., 2., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 1., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1.]], dtype=float32)

In [20]:
from gensim import models

tfidf_model = models.TfidfModel(corpus)  # step 1 -- initialize a model

In [21]:
doc_bow = [(0, 1), (1, 1)]
print(tfidf_model[doc_bow])  # step 2 -- use the model to transform vectors

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [22]:
corpus_tfidf = tfidf_model[corpus]
for doc in corpus_tfidf:
    print(doc)
#不建议

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [24]:
import os
import tempfile
import numpy as np
import gensim
import pandas as pd
import matplotlib.pyplot as plt

with tempfile.NamedTemporaryFile(prefix='model-',suffix='.tfidf',delete=False) as test:
    tfidf_model.save('test.name')
    
loaded_tfidf_model = models.TfidfModel.load('test.name')

# os.unlink('tmp.name')

corpus_tfidf = loaded_tfidf_model[corpus]
# for doc in corpus_tfidf:
#     print(doc)
numpy_matrix = gensim.matutils.corpus2dense(corpus_tfidf, num_terms=12)
df_numpy_matrix = pd.DataFrame(numpy_matrix)
df_numpy_matrix.T
# plt.plot(df_numpy_matrix.T)
# plt.xticks(range(9))
# plt.show()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.57735,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.444246,0.0,0.0,0.444246,0.444246,0.324487,0.444246,0.324487,0.0,0.0,0.0,0.0
2,0.0,0.0,0.571006,0.0,0.0,0.417076,0.0,0.417076,0.571006,0.0,0.0,0.0
3,0.0,0.491826,0.0,0.0,0.0,0.718481,0.0,0.0,0.491826,0.0,0.0,0.0
4,0.0,0.0,0.0,0.628258,0.0,0.0,0.628258,0.458894,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.707107,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.508043,0.508043,0.695546
8,0.0,0.0,0.0,0.0,0.628258,0.0,0.0,0.0,0.0,0.0,0.458894,0.628258


In [25]:
from gensim import similarities
index = similarities.MatrixSimilarity(tfidf_model[corpus])  # transform corpus to LSI space and index it

In [26]:
index.save('./tmp/test.index')
index = similarities.MatrixSimilarity.load('./tmp/test.index')

In [27]:
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_tfidf = tfidf_model[vec_bow]  # convert the query to LSI space
print(vec_tfidf)

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [40]:
sims = index[vec_tfidf]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

[(0, 0.81649655), (1, 0.31412902), (2, 0.0), (3, 0.3477732), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [43]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])
df_sims = pd.DataFrame(sims)
df_sims.iloc[:,:]

TypeError: bad operand type for unary -: 'tuple'