Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new ValueError in place of assertion error for no model data provided in lsi model #3271

Merged
merged 12 commits into from
Mar 22, 2022
21 changes: 19 additions & 2 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import logging
import sys
import time
import types

import numpy as np
import scipy.linalg
Expand Down Expand Up @@ -482,14 +483,29 @@ def add_documents(self, corpus, chunksize=None, decay=None):
If the distributed mode is on, each chunk is sent to a different worker/computer.

"""
def is_empty(corpus):
    """Is the corpus (an iterable or a scipy.sparse array) empty?

    Parameters
    ----------
    corpus : iterable or scipy.sparse matrix
        The documents about to be added to the model.

    Returns
    -------
    bool
        True only when the corpus is known to contain no documents.
        Generators, and inputs whose emptiness cannot be determined,
        are reported as non-empty (False).

    """
    if scipy.sparse.issparse(corpus):
        # by convention, scipy.sparse documents are columns
        return corpus.shape[1] == 0
    if isinstance(corpus, types.GeneratorType):
        # don't try to guess emptiness of generators, may lose elements irretrievably
        return False
    try:
        # Probe for a first document (list, numpy array etc); the value itself is not needed.
        next(iter(corpus))
    except StopIteration:
        return True
    except Exception:
        # Deliberate best-effort: if the probe itself fails (e.g. the input is
        # not iterable), do not claim the corpus is empty.
        return False
    return False  # first document exists => not empty

mark-todd marked this conversation as resolved.
Show resolved Hide resolved
logger.info("updating model with new documents")

# get computation parameters; if not specified, use the ones from constructor
if chunksize is None:
chunksize = self.chunksize
if decay is None:
decay = self.decay

if is_empty(corpus):
logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?')
if not scipy.sparse.issparse(corpus):
if not self.onepass:
# we are allowed multiple passes over the input => use a faster, randomized two-pass algo
Expand Down Expand Up @@ -590,7 +606,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
Latent representation of corpus in BoW format if `bow` is corpus.

"""
assert self.projection.u is not None, "decomposition not initialized yet"
if self.projection.u is None:
raise ValueError('No training data provided - LSI model not initialized yet')

# if the input vector is in fact a corpus, return a transformed corpus as a result
is_corpus, bow = utils.is_corpus(bow)
Expand Down