-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
/
rpmodel.py
98 lines (76 loc) · 3.19 KB
/
rpmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`~gensim.models.rpmodel.RpModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.sklearn_api.rpmodel import RpTransformer
>>> from gensim.test.utils import common_dictionary, common_corpus
>>>
>>> # Initialize and fit the model.
>>> model = RpTransformer(id2word=common_dictionary).fit(common_corpus)
>>>
>>> # Use the trained model to transform a document.
>>> result = model.transform(common_corpus[3])
"""
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
from gensim import matutils
class RpTransformer(TransformerMixin, BaseEstimator):
"""Base Word2Vec module, wraps :class:`~gensim.models.rpmodel.RpModel`.
For more information please have a look to `Random projection <https://en.wikipedia.org/wiki/Random_projection>`_.
"""
def __init__(self, id2word=None, num_topics=300):
"""
Parameters
----------
id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
Mapping `token_id` -> `token`, will be determined from corpus if `id2word == None`.
num_topics : int, optional
Number of dimensions.
"""
self.gensim_model = None
self.id2word = id2word
self.num_topics = num_topics
def fit(self, X, y=None):
"""Fit the model according to the given training data.
Parameters
----------
X : iterable of list of (int, number)
Input corpus in BOW format.
Returns
-------
:class:`~gensim.sklearn_api.rpmodel.RpTransformer`
The trained model.
"""
self.gensim_model = models.RpModel(corpus=X, id2word=self.id2word, num_topics=self.num_topics)
return self
def transform(self, docs):
"""Find the Random Projection factors for `docs`.
Parameters
----------
docs : {iterable of iterable of (int, int), list of (int, number)}
Document or documents to be transformed in BOW format.
Returns
-------
numpy.ndarray of shape [`len(docs)`, `num_topics`]
RP representation for each input document.
"""
if self.gensim_model is None:
raise NotFittedError(
"This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
)
# The input as array of array
if isinstance(docs[0], tuple):
docs = [docs]
# returning dense representation for compatibility with sklearn
# but we should go back to sparse representation in the future
presentation = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
return np.reshape(np.array(presentation), (len(docs), self.num_topics))