gensim/sklearn_api/lsimodel.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Scikit learn interface for :class:`gensim.models.lsimodel.LsiModel`.

Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.

Examples
--------
Integrate with sklearn Pipelines:

.. sourcecode:: pycon

    >>> import numpy as np
    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn import linear_model
    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.sklearn_api import LsiTransformer
    >>>
    >>> # Create stages for our pipeline (including gensim and sklearn models alike).
    >>> model = LsiTransformer(num_topics=15, id2word=common_dictionary)
    >>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
    >>> pipe = Pipeline([('features', model,), ('classifier', clf)])
    >>>
    >>> # Create some random binary labels for our documents.
    >>> labels = np.random.choice([0, 1], len(common_corpus))
    >>>
    >>> # How well does our pipeline perform on the training set?
    >>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels)

"""
import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models
from gensim import matutils


class LsiTransformer(TransformerMixin, BaseEstimator):
    """Base LSI module, wraps :class:`~gensim.models.lsimodel.LsiModel`.

    For more information please take a look at `Latent semantic analysis
    <https://en.wikipedia.org/wiki/Latent_semantic_analysis>`_.

    """
    def __init__(self, num_topics=200, id2word=None, chunksize=20000,
                 decay=1.0, onepass=True, power_iters=2, extra_samples=100):
        """

        Parameters
        ----------
        num_topics : int, optional
            Number of requested factors (latent dimensions).
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            ID to word mapping, optional.
        chunksize :  int, optional
            Number of documents to be used in each training chunk.
        decay : float, optional
            Weight of existing observations relatively to new ones.
        onepass : bool, optional
            Whether the one-pass algorithm should be used for training, pass `False` to force a
            multi-pass stochastic algorithm.
        power_iters: int, optional
            Number of power iteration steps to be used.
            Increasing the number of power iterations improves accuracy, but lowers performance.
        extra_samples : int, optional
            Extra samples to be used besides the rank `k`. Can improve accuracy.

        """
        # The wrapped gensim model; stays `None` until `fit` or `partial_fit` is called.
        self.gensim_model = None
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        self.decay = decay
        self.onepass = onepass
        self.extra_samples = extra_samples
        self.power_iters = power_iters

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Any previously trained model held by this transformer is replaced.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training. A sparse matrix is read
            with one document per row.
        y : object, optional
            Ignored; accepted only so the signature matches the scikit-learn `fit(X, y)` convention.

        Returns
        -------
        :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        else:
            corpus = X

        self.gensim_model = models.LsiModel(
            corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize,
            decay=self.decay, onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples
        )
        return self

    def transform(self, docs):
        """Computes the latent factors for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), list of (int, number), scipy.sparse matrix}
            Document or collection of documents in BOW format to be transformed. A sparse matrix is
            read with one document per row, the same orientation used by `fit` and `partial_fit`.

        Returns
        -------
        numpy.ndarray of shape [`len(docs)`, `num_topics`]
            Topic distribution matrix. An empty collection yields an array with zero rows.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If neither `fit` nor `partial_fit` has been called yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        if sparse.issparse(docs):
            # Sparse matrices support neither `len()` nor per-document BOW iteration, so convert
            # them the same way `fit` does. Each document is materialised into a list because the
            # corpus wrapper may hand out one-shot iterators.
            docs = [list(doc) for doc in matutils.Sparse2Corpus(sparse=docs, documents_columns=False)]

        # A single document arrives as a flat list of (id, value) tuples: wrap it into a one-item
        # collection. The length guard keeps an empty input from failing on `docs[0]`.
        if len(docs) > 0 and isinstance(docs[0], tuple):
            docs = [docs]
        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
        return np.reshape(np.array(distribution), (len(docs), self.num_topics))

    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        This method can be used in two ways:
            1. On an unfitted model in which case the model is initialized and trained on `X`.
            2. On an already fitted model in which case the model is **further** trained on `X`.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            Stream of document vectors or sparse matrix of shape: [`num_documents`, `num_terms`]
            (one document per row, as in `fit`).

        Returns
        -------
        :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

        if self.gensim_model is None:
            # First call: create an untrained model (no corpus passed), then feed it `X` below.
            self.gensim_model = models.LsiModel(
                num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, decay=self.decay,
                onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples
            )

        self.gensim_model.add_documents(corpus=X)
        return self