#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module allows simple Bag of Words (BoW) represented corpus to be transformed into log entropy space.
It implements Log Entropy Model that produces entropy-weighted logarithmic term frequency representation.
Empirical study by Lee et al. 2015 [1]_ suggests log entropy-weighted model yields better results among other forms of
representation.
References
----------
.. [1] Lee et al. 2005. An Empirical Evaluation of Models of Text Document Similarity.
https://escholarship.org/uc/item/48g155nq
"""
import logging
import math
from gensim import interfaces, matutils, utils
logger = logging.getLogger(__name__)
class LogEntropyModel(interfaces.TransformationABC):
r"""Objects of this class realize the transformation between word-document co-occurrence matrix (int)
into a locally/globally weighted matrix (positive floats).
This is done by a log entropy normalization, optionally normalizing the resulting documents to unit length.
The following formulas explain how o compute the log entropy weight for term :math:`i` in document :math:`j`:
.. math::
local\_weight_{i,j} = log(frequency_{i,j} + 1)
P_{i,j} = \frac{frequency_{i,j}}{\sum_j frequency_{i,j}}
global\_weight_i = 1 + \frac{\sum_j P_{i,j} * log(P_{i,j})}{log(number\_of\_documents + 1)}
final\_weight_{i,j} = local\_weight_{i,j} * global\_weight_i
Examples
--------
.. sourcecode:: pycon
>>> from gensim.models import LogEntropyModel
>>> from gensim.test.utils import common_texts
>>> from gensim.corpora import Dictionary
>>>
>>> dct = Dictionary(common_texts) # fit dictionary
>>> corpus = [dct.doc2bow(row) for row in common_texts] # convert to BoW format
>>> model = LogEntropyModel(corpus) # fit model
>>> vector = model[corpus[1]] # apply model to document
"""
    def __init__(self, corpus, normalize=True):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus in BoW format.
        normalize : bool, optional
            If True, the resulting log entropy weighted vector will be normalized to length of 1.0;
            if False, do nothing.

        """
        self.normalize = normalize
        self.n_docs = 0
        self.n_words = 0
        self.entr = {}

        if corpus is not None:
            self.initialize(corpus)
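    # Sketch of what `normalize=True` does downstream: matutils.unitvec scales a
    # sparse vector to unit L2 length, e.g. (hypothetical values)
    #   unitvec([(0, 3.0), (1, 4.0)]) -> [(0, 0.6), (1, 0.8)]   # L2 norm was 5.0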
    def __str__(self):
        return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words)
    def initialize(self, corpus):
        """Calculate the global weighting for all terms in a given corpus and transform the simple
        count representation into the log entropy normalized space.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.

        """
        logger.info("calculating counts")
        glob_freq = {}
        glob_num_words, doc_no = 0, -1
        for doc_no, bow in enumerate(corpus):
            if doc_no % 10000 == 0:
                logger.info("PROGRESS: processing document #%i", doc_no)
            glob_num_words += len(bow)
            for term_id, term_count in bow:
                glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count

        # keep some stats about the training corpus
        self.n_docs = doc_no + 1
        self.n_words = glob_num_words

        # and finally compute the global weights
        logger.info(
            "calculating global log entropy weights for %i documents and %i features (%i matrix non-zeros)",
            self.n_docs, len(glob_freq), self.n_words
        )
        logger.debug('iterating over corpus')

        # initialize doc_no2 index in case corpus is empty
        doc_no2 = 0
        for doc_no2, bow in enumerate(corpus):
            for key, freq in bow:
                p = (float(freq) / glob_freq[key]) * math.log(float(freq) / glob_freq[key])
                self.entr[key] = self.entr.get(key, 0.0) + p
        if doc_no2 != doc_no:
            raise ValueError("LogEntropyModel doesn't support generators as training data")

        logger.debug('iterating over keys')
        for key in self.entr:
            self.entr[key] = 1 + self.entr[key] / math.log(self.n_docs + 1)
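    # Note: the two enumerate() passes above mean `corpus` must be re-iterable;
    # a one-shot generator comes back empty on the second pass, which is what
    # the doc_no2 != doc_no check catches. A minimal usage sketch with
    # hypothetical in-memory BoW data (term ids 0-2):
    #
    #   corpus = [[(0, 1), (1, 2)], [(0, 1), (2, 1)]]
    #   model = LogEntropyModel(corpus, normalize=False)
    #   model.entr  # maps each term id to its global weight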
    def __getitem__(self, bow):
        """Get log entropy representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : list of (int, int)
            Document in BoW format.

        Returns
        -------
        list of (int, float)
            Log-entropy vector for passed `bow`.

        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [
            (term_id, math.log(tf + 1) * self.entr.get(term_id))
            for term_id, tf in bow
            if term_id in self.entr
        ]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
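# A minimal end-to-end sketch, assuming gensim is installed; `common_texts` is
# the small demo corpus bundled with gensim's test utilities:
if __name__ == '__main__':
    from gensim.corpora import Dictionary
    from gensim.test.utils import common_texts

    dct = Dictionary(common_texts)
    bow_corpus = [dct.doc2bow(doc) for doc in common_texts]
    model = LogEntropyModel(bow_corpus)  # global weights are computed on init
    # transform one document; term ids unseen during training are dropped
    print(model[bow_corpus[1]])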