-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
ldamodel.py
244 lines (207 loc) · 11 KB
/
ldamodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`~gensim.models.ldamodel.LdaModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api import LdaTransformer
>>>
>>> # Reduce each document to 2 dimensions (topics) using the sklearn interface.
>>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1)
>>> docvecs = model.fit_transform(common_corpus)
"""
import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
from gensim import matutils
class LdaTransformer(TransformerMixin, BaseEstimator):
"""Base LDA module, wraps :class:`~gensim.models.ldamodel.LdaModel`.
The inner workings of this class depends heavily on `Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_ and
`David M. Blei, Andrew Y. Ng, Michael I. Jordan: "Latent Dirichlet Allocation"
<http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf>`_.
"""
def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32):
"""
Parameters
----------
num_topics : int, optional
The number of requested latent topics to be extracted from the training corpus.
id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
Mapping from integer ID to words in the corpus. Used to determine vocabulary size and logging.
chunksize : int, optional
Number of documents in batch.
passes : int, optional
Number of passes through the corpus during training.
update_every : int, optional
Number of documents to be iterated through for each update.
Set to 0 for batch learning, > 1 for online iterative learning.
alpha : {np.ndarray, str}, optional
Can be set to an 1D array of length equal to the number of expected topics that expresses
our a-priori belief for the each topics' probability.
Alternatively default prior selecting strategies can be employed by supplying a string:
* 'asymmetric': Uses a fixed normalized assymetric prior of `1.0 / topicno`.
* 'default': Learns an assymetric prior from the corpus.
eta : {float, np.array, str}, optional
A-priori belief on word probability, this can be:
* scalar for a symmetric prior over topic/word probability,
* vector of length num_words to denote an asymmetric user defined probability for each word,
* matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
* the string 'auto' to learn the asymmetric prior from the data.
decay : float, optional
A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
when each new document is examined. Corresponds to Kappa from
`Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
offset : float, optional
Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
eval_every : int, optional
Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
iterations : int, optional
Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
gamma_threshold : float, optional
Minimum change in the value of the gamma parameters to continue iterating.
minimum_probability : float, optional
Topics with a probability lower than this threshold will be filtered out.
random_state : {np.random.RandomState, int}, optional
Either a randomState object or a seed to generate one. Useful for reproducibility.
scorer : str, optional
Method to compute a score reflecting how well the model has fit the input corpus, allowed values are:
* 'perplexity': Perplexity of language model
* 'mass_u': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute a topics coherence.
dtype : {numpy.float16, numpy.float32, numpy.float64}, optional
Data-type to use during calculations inside model. All inputs are also converted.
Notes
-----
Configure `passes` and `update_every` params to choose the mode among:
* online (single-pass): update_every != None and passes == 1
* online (multi-pass): update_every != None and passes > 1
* batch: update_every == None
By default, 'online (single-pass)' mode is used for training the LDA model.
"""
self.gensim_model = None
self.num_topics = num_topics
self.id2word = id2word
self.chunksize = chunksize
self.passes = passes
self.update_every = update_every
self.alpha = alpha
self.eta = eta
self.decay = decay
self.offset = offset
self.eval_every = eval_every
self.iterations = iterations
self.gamma_threshold = gamma_threshold
self.minimum_probability = minimum_probability
self.random_state = random_state
self.scorer = scorer
self.dtype = dtype
def fit(self, X, y=None):
"""Fit the model according to the given training data.
Parameters
----------
X : {iterable of iterable of (int, int), scipy.sparse matrix}
A collection of documents in BOW format used for training the model.
Returns
-------
:class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
The trained model.
"""
if sparse.issparse(X):
corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
else:
corpus = X
self.gensim_model = models.LdaModel(
corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations,
gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability,
random_state=self.random_state, dtype=self.dtype
)
return self
def transform(self, docs):
"""Infer the topic distribution for `docs`.
Parameters
----------
docs : {iterable of list of (int, number), list of (int, number)}
Document or sequence of documents in BoW format.
Returns
-------
numpy.ndarray of shape [`len(docs)`, `num_topics`]
The topic distribution for each input document.
"""
if self.gensim_model is None:
raise NotFittedError(
"This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
)
# The input as array of array
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn
# but we should go back to sparse representation in the future
distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(distribution), (len(docs), self.num_topics))
def partial_fit(self, X):
"""Train model over a potentially incomplete set of documents.
Uses the parameters set in the constructor.
This method can be used in two ways:
* On an unfitted model in which case the model is initialized and trained on `X`.
* On an already fitted model in which case the model is **updated** by `X`.
Parameters
----------
X : {iterable of iterable of (int, int), scipy.sparse matrix}
A collection of documents in BOW format used for training the model.
Returns
-------
:class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
The trained model.
"""
if sparse.issparse(X):
X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
if self.gensim_model is None:
self.gensim_model = models.LdaModel(
num_topics=self.num_topics, id2word=self.id2word,
chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold,
minimum_probability=self.minimum_probability, random_state=self.random_state,
dtype=self.dtype
)
self.gensim_model.update(corpus=X)
return self
def score(self, X, y=None):
"""Compute score reflecting how well the model has fitted for the input data.
The scoring method is set using the `scorer` argument in :meth:`~gensim.sklearn_api.ldamodel.LdaTransformer`.
Higher score is better.
Parameters
----------
X : iterable of list of (int, number)
Sequence of documents in BOW format.
Returns
-------
float
The score computed based on the selected method.
"""
if self.scorer == 'perplexity':
corpus_words = sum(cnt for document in X for _, cnt in document)
subsample_ratio = 1.0
perwordbound = \
self.gensim_model.bound(X, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
return -1 * np.exp2(-perwordbound) # returning (-1*perplexity) to select model with minimum value
elif self.scorer == 'u_mass':
goodcm = models.CoherenceModel(model=self.gensim_model, corpus=X, coherence=self.scorer, topn=3)
return goodcm.get_coherence()
else:
raise ValueError("Invalid value {} supplied for `scorer` param".format(self.scorer))