gensim/models/doc2vec_corpusfile.pyx

#!/usr/bin/env cython
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
#
# Copyright (C) 2018 Dmitry Persiyanov <dmitry.persiyanov@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Optimized cython functions for file-based training :class:`~gensim.models.doc2vec.Doc2Vec` model."""

import cython
import numpy as np
cimport numpy as np

from libcpp.string cimport string
from libcpp.vector cimport vector

from libc.string cimport memset, memcpy

# scipy <= 0.15
try:
    from scipy.linalg.blas import fblas
except ImportError:
    # in scipy > 0.15, fblas function has been removed
    import scipy.linalg.blas as fblas

from gensim.models.doc2vec_inner cimport (
    fast_document_dbow_hs,
    fast_document_dbow_neg,
    fast_document_dm_hs,
    fast_document_dm_neg,
    fast_document_dmc_hs,
    fast_document_dmc_neg,
    init_d2v_config,
    Doc2VecConfig
)

from gensim.models.word2vec_inner cimport random_int32, sscal, REAL_t, our_saxpy

from gensim.models.word2vec_corpusfile cimport (
    VocabItem,
    CythonVocab,
    CythonLineSentence,
    get_alpha,
    get_next_alpha,
    cvocab_t
)

DEF MAX_DOCUMENT_LEN = 10000

cdef int ONE = 1
cdef REAL_t ONEF = <REAL_t>1.0


cdef void prepare_c_structures_for_batch(
        vector[string] &doc_words, int sample, int hs, int window, long long *total_words,
        int *effective_words, unsigned long long *next_random, cvocab_t *vocab,
        np.uint32_t *indexes, int *codelens, np.uint8_t **codes, np.uint32_t **points,
        np.uint32_t *reduced_windows, int *document_len, int train_words,
        int docvecs_count, int doc_tag,
    ) nogil:
    cdef VocabItem predict_word
    cdef string token
    cdef int i = 0

    total_words[0] += doc_words.size()

    for token in doc_words:
        if vocab[0].find(token) == vocab[0].end():  # shrink document to leave out word
            continue  # leaving i unchanged

        predict_word = vocab[0][token]
        if sample and predict_word.sample_int < random_int32(next_random):
            continue
        indexes[i] = predict_word.index
        if hs:
            codelens[i] = predict_word.code_len
            codes[i] = predict_word.code
            points[i] = predict_word.point

        effective_words[0] += 1
        i += 1
        if i == MAX_DOCUMENT_LEN:
            break  # TODO: log warning, tally overflow?
    document_len[0] = i

    if train_words and reduced_windows != NULL:
        for i in range(document_len[0]):
            reduced_windows[i] = random_int32(next_random) % window

    if doc_tag < docvecs_count:
        effective_words[0] += 1


def d2v_train_epoch_dbow(
        model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
        _expected_words, work, neu1, docvecs_count, word_vectors=None, words_lockf=None,
        train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
        doctag_vectors=None, doctags_lockf=None,
    ):
    """Train distributed bag of words model ("PV-DBOW") by training on a corpus file.

    Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train`.

    Parameters
    ----------
    model : :class:`~gensim.models.doc2vec.Doc2Vec`
        The FastText model instance to train.
    corpus_file : str
        Path to corpus file.
    _cur_epoch : int
        Current epoch number. Used for calculating and decaying learning rate.
    work : np.ndarray
        Private working memory for each worker.
    neu1 : np.ndarray
        Private working memory for each worker.
    train_words : bool, optional
        Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** `learn_words`
        and `train_words` are set to True.
    learn_doctags : bool, optional
        Whether the tag vectors should be updated.
    learn_words : bool, optional
        Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**
        `learn_words` and `train_words` are set to True.
    learn_hidden : bool, optional
        Whether or not the weights of the hidden layer will be updated.
    word_vectors : numpy.ndarray, optional
        The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.
    words_lockf : numpy.ndarray, optional
        EXPERIMENTAL. A learning lock factor for each word-vector, value 0.0 completely blocks updates, a value
        of 1.0 allows normal updates to word-vectors.
    doctag_vectors : numpy.ndarray, optional
        Vector representations of the tags. If None, these will be retrieved from the model.
    doctags_lockf : numpy.ndarray, optional
        EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors.

    Returns
    -------
    int
        Number of words in the input document that were actually used for training.

    """
    cdef Doc2VecConfig c

    cdef int cur_epoch = _cur_epoch
    cdef int num_epochs = model.epochs
    cdef long long expected_examples = (-1 if _expected_examples is None else _expected_examples)
    cdef long long expected_words = (-1 if _expected_words is None else _expected_words)
    cdef REAL_t start_alpha = model.alpha
    cdef REAL_t end_alpha = model.min_alpha
    cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)

    cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
    cdef CythonVocab vocab = _cython_vocab

    cdef int i, j, document_len
    cdef int effective_words = 0
    cdef long long total_documents = 0
    cdef long long total_effective_words = 0, total_words = 0
    cdef int sent_idx, idx_start, idx_end

    cdef vector[string] doc_words
    cdef long long _doc_tag = start_doctag

    init_d2v_config(
        &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=train_words,
        work=work, neu1=neu1, word_vectors=word_vectors, words_lockf=words_lockf,
        doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf, docvecs_count=docvecs_count)

    # release GIL & train on the full corpus, document by document
    with nogil:
        input_stream.reset()
        while not (input_stream.is_eof() or total_words > expected_words / c.workers):
            effective_words = 0

            doc_words = input_stream.read_sentence()

            if doc_words.empty():
                continue

            prepare_c_structures_for_batch(
                doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
                &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens,  c.codes, c.points,
                c.reduced_windows, &document_len, c.train_words, c.docvecs_count, _doc_tag)

            for i in range(document_len):
                if c.train_words:  # simultaneous skip-gram wordvec-training
                    j = i - c.window + c.reduced_windows[i]
                    if j < 0:
                        j = 0
                    k = i + c.window + 1 - c.reduced_windows[i]
                    if k > document_len:
                        k = document_len
                    for j in range(j, k):
                        if j == i:
                            continue
                        if c.hs:
                            # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
                            fast_document_dbow_hs(
                                c.points[i], c.codes[i], c.codelens[i], c.word_vectors, c.syn1, c.layer1_size,
                                c.indexes[j], c.alpha, c.work, c.learn_words, c.learn_hidden, c.words_lockf,
                                c.words_lockf_len)

                        if c.negative:
                            # we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
                            c.next_random = fast_document_dbow_neg(
                                c.negative, c.cum_table, c.cum_table_len, c.word_vectors, c.syn1neg,
                                c.layer1_size, c.indexes[i], c.indexes[j], c.alpha, c.work,
                                c.next_random, c.learn_words, c.learn_hidden, c.words_lockf, c.words_lockf_len)

                # docvec-training
                if _doc_tag < c.docvecs_count:
                    if c.hs:
                        fast_document_dbow_hs(
                            c.points[i], c.codes[i], c.codelens[i], c.doctag_vectors, c.syn1, c.layer1_size,
                            _doc_tag, c.alpha, c.work, c.learn_doctags, c.learn_hidden, c.doctags_lockf,
                            c.doctags_lockf_len)

                    if c.negative:
                        c.next_random = fast_document_dbow_neg(
                            c.negative, c.cum_table, c.cum_table_len, c.doctag_vectors, c.syn1neg,
                            c.layer1_size, c.indexes[i], _doc_tag, c.alpha, c.work, c.next_random,
                            c.learn_doctags, c.learn_hidden, c.doctags_lockf, c.doctags_lockf_len)

            total_documents += 1
            total_effective_words += effective_words
            _doc_tag += 1

            c.alpha = get_next_alpha(
                start_alpha, end_alpha, total_documents, total_words,
                expected_examples, expected_words, cur_epoch, num_epochs)

    return total_documents, total_effective_words, total_words


def d2v_train_epoch_dm(
        model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
        _expected_words, work, neu1, docvecs_count, word_vectors=None, words_lockf=None,
        learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None, doctags_lockf=None,
    ):
    """Train distributed memory model ("PV-DM") by training on a corpus file.
    This method implements the DM model with a projection (input) layer that is either the sum or mean of the context
    vectors, depending on the model's `dm_mean` configuration field.

    Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train`.

    Parameters
    ----------
    model : :class:`~gensim.models.doc2vec.Doc2Vec`
        The FastText model instance to train.
    corpus_file : str
        Path to corpus file.
    _cur_epoch : int
        Current epoch number. Used for calculating and decaying learning rate.
    work : np.ndarray
        Private working memory for each worker.
    neu1 : np.ndarray
        Private working memory for each worker.
    learn_doctags : bool, optional
        Whether the tag vectors should be updated.
    learn_words : bool, optional
        Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**
        `learn_words` and `train_words` are set to True.
    learn_hidden : bool, optional
        Whether or not the weights of the hidden layer will be updated.
    word_vectors : numpy.ndarray, optional
        The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.
    words_lockf : numpy.ndarray, optional
        EXPERIMENTAL. A learning lock factor for each word-vector, value 0.0 completely blocks updates, a value
        of 1.0 allows normal updates to word-vectors.
    doctag_vectors : numpy.ndarray, optional
        Vector representations of the tags. If None, these will be retrieved from the model.
    doctags_lockf : numpy.ndarray, optional
        EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors.

    Returns
    -------
    int
        Number of words in the input document that were actually used for training.

    """
    cdef Doc2VecConfig c

    cdef int cur_epoch = _cur_epoch
    cdef int num_epochs = model.epochs
    cdef long long expected_examples = (-1 if _expected_examples is None else _expected_examples)
    cdef long long expected_words = (-1 if _expected_words is None else _expected_words)
    cdef REAL_t start_alpha = model.alpha
    cdef REAL_t end_alpha = model.min_alpha
    cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)

    cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
    cdef CythonVocab vocab = _cython_vocab

    cdef int i, j, k, m, document_len
    cdef int effective_words = 0
    cdef long long total_documents = 0
    cdef long long total_effective_words = 0, total_words = 0
    cdef int sent_idx, idx_start, idx_end
    cdef REAL_t count, inv_count = 1.0

    cdef vector[string] doc_words
    cdef long long _doc_tag = start_doctag

    init_d2v_config(
        &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False,
        work=work, neu1=neu1, word_vectors=word_vectors, words_lockf=words_lockf,
        doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf, docvecs_count=docvecs_count)

    # release GIL & train on the full corpus, document by document
    with nogil:
        input_stream.reset()
        while not (input_stream.is_eof() or total_words > expected_words / c.workers):
            effective_words = 0

            doc_words = input_stream.read_sentence()

            if doc_words.empty():
                continue

            prepare_c_structures_for_batch(
                doc_words, c.sample, c.hs, c.window, &total_words, &effective_words, &c.next_random,
                vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes, c.points, c.reduced_windows,
                &document_len, c.train_words, c.docvecs_count, _doc_tag)

            for i in range(document_len):
                j = i - c.window + c.reduced_windows[i]
                if j < 0:
                    j = 0
                k = i + c.window + 1 - c.reduced_windows[i]
                if k > document_len:
                    k = document_len

                # compose l1 (in _neu1) & clear _work
                memset(c.neu1, 0, c.layer1_size * cython.sizeof(REAL_t))
                count = <REAL_t>0.0
                for m in range(j, k):
                    if m == i:
                        continue
                    else:
                        count += ONEF
                        our_saxpy(&c.layer1_size, &ONEF, &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE, c.neu1, &ONE)

                if _doc_tag < c.docvecs_count:
                    count += ONEF
                    our_saxpy(&c.layer1_size, &ONEF, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE, c.neu1, &ONE)
                if count > (<REAL_t>0.5):
                    inv_count = ONEF/count
                if c.cbow_mean:
                    sscal(&c.layer1_size, &inv_count, c.neu1, &ONE)  # (does this need BLAS-variants like saxpy?)
                memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t))  # work to accumulate l1 error
                if c.hs:
                    fast_document_dm_hs(
                        c.points[i], c.codes[i], c.codelens[i], c.neu1,
                        c.syn1, c.alpha, c.work, c.layer1_size, c.learn_hidden)

                if c.negative:
                    c.next_random = fast_document_dm_neg(
                        c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1,
                        c.syn1neg, c.indexes[i], c.alpha, c.work, c.layer1_size, c.learn_hidden)

                if not c.cbow_mean:
                    sscal(&c.layer1_size, &inv_count, c.work, &ONE)  # (does this need BLAS-variants like saxpy?)
                # apply accumulated error in work
                if c.learn_doctags and _doc_tag < c.docvecs_count:
                    our_saxpy(
                        &c.layer1_size, &c.doctags_lockf[_doc_tag % c.doctags_lockf_len], c.work,
                        &ONE, &c.doctag_vectors[_doc_tag * c.layer1_size], &ONE)
                if c.learn_words:
                    for m in range(j, k):
                        if m == i:
                            continue
                        else:
                             our_saxpy(
                                &c.layer1_size, &c.words_lockf[c.indexes[m] % c.words_lockf_len], c.work, &ONE,
                                &c.word_vectors[c.indexes[m] * c.layer1_size], &ONE)

            total_documents += 1
            total_effective_words += effective_words
            _doc_tag += 1

            c.alpha = get_next_alpha(
                start_alpha, end_alpha, total_documents, total_words, expected_examples,
                expected_words, cur_epoch, num_epochs)

    return total_documents, total_effective_words, total_words


def d2v_train_epoch_dm_concat(
        model, corpus_file, offset, start_doctag, _cython_vocab, _cur_epoch, _expected_examples,
        _expected_words, work, neu1, docvecs_count, word_vectors=None, words_lockf=None,
        learn_doctags=True, learn_words=True, learn_hidden=True, doctag_vectors=None,
        doctags_lockf=None,
    ):
    """Train distributed memory model ("PV-DM") by training on a corpus file, using a concatenation of the context
     window word vectors (rather than a sum or average).
     This might be slower since the input at each batch will be significantly larger.

    Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train`.

    Parameters
    ----------
    model : :class:`~gensim.models.doc2vec.Doc2Vec`
        The FastText model instance to train.
    corpus_file : str
        Path to corpus file.
    _cur_epoch : int
        Current epoch number. Used for calculating and decaying learning rate.
    work : np.ndarray
        Private working memory for each worker.
    neu1 : np.ndarray
        Private working memory for each worker.
    learn_doctags : bool, optional
        Whether the tag vectors should be updated.
    learn_words : bool, optional
        Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**
        `learn_words` and `train_words` are set to True.
    learn_hidden : bool, optional
        Whether or not the weights of the hidden layer will be updated.
    word_vectors : numpy.ndarray, optional
        The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.
    words_lockf : numpy.ndarray, optional
        EXPERIMENTAL. A learning lock factor for each word-vector, value 0.0 completely blocks updates, a value
        of 1.0 allows normal updates to word-vectors.
    doctag_vectors : numpy.ndarray, optional
        Vector representations of the tags. If None, these will be retrieved from the model.
    doctags_lockf : numpy.ndarray, optional
        EXPERIMENTAL. The lock factors for each tag, same as `words_lockf`, but for document-vectors.

    Returns
    -------
    int
        Number of words in the input document that were actually used for training.

    """
    cdef Doc2VecConfig c

    cdef int cur_epoch = _cur_epoch
    cdef int num_epochs = model.epochs
    cdef long long expected_examples = (-1 if _expected_examples is None else _expected_examples)
    cdef long long expected_words = (-1 if _expected_words is None else _expected_words)
    cdef REAL_t start_alpha = model.alpha
    cdef REAL_t end_alpha = model.min_alpha
    cdef REAL_t _alpha = get_alpha(model.alpha, end_alpha, cur_epoch, num_epochs)

    cdef CythonLineSentence input_stream = CythonLineSentence(corpus_file, offset)
    cdef CythonVocab vocab = _cython_vocab

    cdef int i, j, k, m, n, document_len
    cdef int effective_words = 0
    cdef long long total_documents = 0
    cdef long long total_effective_words = 0, total_words = 0
    cdef int sent_idx, idx_start, idx_end

    cdef vector[string] doc_words
    cdef long long _doc_tag = start_doctag

    init_d2v_config(
        &c, model, _alpha, learn_doctags, learn_words, learn_hidden, train_words=False,
        work=work, neu1=neu1, word_vectors=word_vectors, words_lockf=words_lockf,
        doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf, docvecs_count=docvecs_count)

    # release GIL & train on the full corpus, document by document
    with nogil:
        input_stream.reset()
        while not (input_stream.is_eof() or total_words > expected_words / c.workers):
            effective_words = 0

            doc_words = input_stream.read_sentence()

            # FIXME? These next 2 lines look fishy to me (gojomo). First, skipping to
            # 'total_documents' (end) seems it'd do nothing useful. Second, assigning
            # into what is typically a count (`doctag_len`) from a boolean test is
            # sketchy, even if in the current limitations of this mode (corpus_file)
            # only '1' is a workable value. But, this code seems to pass at least
            # one real has-some-function test (test_dmc_hs_fromfile), and this mode
            # is rarely used, & I haven't written this code & would prefer to see the
            # whole duplicate-logic of corpus_file mode removed in favor of an approach
            # with less duplication. So I'm not sure anything is broken & it's far from
            # a near-term priority - thus leaving this note.
            _doc_tag = total_documents
            c.doctag_len = _doc_tag < c.docvecs_count

             # skip doc either empty or without expected number of tags
            if doc_words.empty() or c.expected_doctag_len != c.doctag_len:
                continue

            prepare_c_structures_for_batch(
                doc_words, c.sample, c.hs, c.window, &total_words, &effective_words,
                &c.next_random, vocab.get_vocab_ptr(), c.indexes, c.codelens, c.codes,
                c.points, NULL, &document_len, c.train_words, c.docvecs_count, _doc_tag)

            for i in range(document_len):
                j = i - c.window      # negative OK: will pad with null word
                k = i + c.window + 1  # past document end OK: will pad with null word

                # compose l1 & clear work
                if _doc_tag < c.docvecs_count:
                    # doc vector(s)
                    memcpy(&c.neu1[0], &c.doctag_vectors[_doc_tag * c.vector_size], c.vector_size * cython.sizeof(REAL_t))
                n = 0
                for m in range(j, k):
                    # word vectors in window
                    if m == i:
                        continue
                    if m < 0 or m >= document_len:
                        c.window_indexes[n] = c.null_word_index
                    else:
                        c.window_indexes[n] = c.indexes[m]
                    n += 1
                for m in range(2 * c.window):
                    memcpy(
                        &c.neu1[(c.doctag_len + m) * c.vector_size], &c.word_vectors[c.window_indexes[m] * c.vector_size],
                        c.vector_size * cython.sizeof(REAL_t))
                memset(c.work, 0, c.layer1_size * cython.sizeof(REAL_t))  # work to accumulate l1 error

                if c.hs:
                    fast_document_dmc_hs(
                        c.points[i], c.codes[i], c.codelens[i], c.neu1, c.syn1,
                        c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)

                if c.negative:
                    c.next_random = fast_document_dmc_neg(
                        c.negative, c.cum_table, c.cum_table_len, c.next_random, c.neu1, c.syn1neg,
                        c.indexes[i], c.alpha, c.work, c.layer1_size, c.vector_size, c.learn_hidden)

                if c.learn_doctags and _doc_tag < c.docvecs_count:
                    our_saxpy(
                        &c.vector_size, &c.doctags_lockf[_doc_tag % c.doctags_lockf_len], &c.work[m * c.vector_size],
                        &ONE, &c.doctag_vectors[_doc_tag * c.vector_size], &ONE)
                if c.learn_words:
                    for m in range(2 * c.window):
                        our_saxpy(
                            &c.vector_size, &c.words_lockf[c.window_indexes[m] % c.words_lockf_len], &c.work[(c.doctag_len + m) * c.vector_size],
                            &ONE, &c.word_vectors[c.window_indexes[m] * c.vector_size], &ONE)

            total_documents += 1
            total_effective_words += effective_words
            _doc_tag += 1

            c.alpha = get_next_alpha(
                start_alpha, end_alpha, total_documents, total_words, expected_examples,
                expected_words, cur_epoch, num_epochs)

    return total_documents, total_effective_words, total_words


CORPUSFILE_VERSION = 1