-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
ldaseqmodel.py
146 lines (126 loc) · 6.61 KB
/
ldaseqmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`~gensim.models.ldaseqmodel.LdaSeqModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
>>>
>>> # Create a sequential LDA transformer to extract 2 topics from the common corpus.
>>> # Divide the work into 3 unequal time slices.
>>> model = LdaSeqTransformer(id2word=common_dictionary, num_topics=2, time_slice=[3, 4, 2], initialize='gensim')
>>>
>>> # Each document almost entirely belongs to one of the two topics.
>>> transformed_corpus = model.fit_transform(common_corpus)
"""
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
class LdaSeqTransformer(TransformerMixin, BaseEstimator):
    """Base Sequential LDA module, wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel` model.

    For more information take a look at `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
    <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.

    """
    def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None,
                 lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None,
                 lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
        """
        Parameters
        ----------
        time_slice : list of int, optional
            Number of documents in each time-slice.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from an ID to the word it represents in the vocabulary.
        alphas : float, optional
            The prior probability of each topic.
        num_topics : int, optional
            Number of latent topics to be discovered in the corpus.
        initialize : {'gensim', 'own', 'ldamodel'}, optional
            Controls the initialization of the DTM model. Supports three different modes:

            * 'gensim': Uses gensim's own LDA initialization.
            * 'own': Uses your own initialization matrix of an LDA model that has been previously trained.
            * 'ldamodel': Use a previously trained LDA model, passing it through the `lda_model` argument.
        sstats : numpy.ndarray of shape [vocab_len, `num_topics`], optional
            If `initialize` is set to 'own' this will be used to initialize the DTM model.
        lda_model : :class:`~gensim.models.ldamodel.LdaModel`, optional
            If `initialize` is set to 'ldamodel' this object will be used to create the `sstats`
            initialization matrix.
        obs_variance : float, optional
            Observed variance used to approximate the true and forward variance as shown in
            `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
            <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.
        chain_variance : float, optional
            Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
        passes : int, optional
            Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`.
        random_state : {numpy.random.RandomState, int}, optional
            Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
        lda_inference_max_iter : int, optional
            Maximum number of iterations in the inference step of the LDA training.
        em_min_iter : int, optional
            Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
        em_max_iter : int, optional
            Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
        chunksize : int, optional
            Number of documents in the corpus to be processed in a chunk.

        """
        # Trained wrapped model; stays None until `fit` is called (sklearn convention).
        self.gensim_model = None
        self.time_slice = time_slice
        self.id2word = id2word
        self.alphas = alphas
        self.num_topics = num_topics
        self.initialize = initialize
        self.sstats = sstats
        self.lda_model = lda_model
        self.obs_variance = obs_variance
        self.chain_variance = chain_variance
        self.passes = passes
        self.random_state = random_state
        self.lda_inference_max_iter = lda_inference_max_iter
        self.em_min_iter = em_min_iter
        self.em_max_iter = em_max_iter
        self.chunksize = chunksize

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.
        y : object, optional
            Ignored. Present only for scikit-learn API compatibility.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer`
            The trained model.

        """
        self.gensim_model = models.LdaSeqModel(
            corpus=X, time_slice=self.time_slice, id2word=self.id2word,
            alphas=self.alphas, num_topics=self.num_topics, initialize=self.initialize, sstats=self.sstats,
            lda_model=self.lda_model, obs_variance=self.obs_variance, chain_variance=self.chain_variance,
            passes=self.passes, random_state=self.random_state, lda_inference_max_iter=self.lda_inference_max_iter,
            em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize
        )
        return self

    def transform(self, docs):
        """Infer the topic distribution for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format to be transformed.

        Returns
        -------
        numpy.ndarray of shape [`len(docs)`, `num_topics`]
            The topic representation of each document.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the model has not been fitted yet (i.e. `fit` was never called).

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # If a single document (a list of (id, count) tuples) was passed, wrap it
        # so the comprehension below always iterates over a corpus of documents.
        # NOTE(review): this probe indexes docs[0], so it assumes `docs` is a
        # non-empty sequence (a generator or empty input would fail) — confirm
        # against callers before generalizing.
        if isinstance(docs[0], tuple):
            docs = [docs]
        proportions = [self.gensim_model[doc] for doc in docs]
        return np.reshape(np.array(proportions), (len(docs), self.num_topics))