Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added new ValueError in place of assertion error for no model data provided in lsi model #3271

Merged
merged 12 commits into from
Mar 22, 2022
7 changes: 5 additions & 2 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@

from gensim import interfaces, matutils, utils
from gensim.models import basemodel
piskvorky marked this conversation as resolved.
Show resolved Hide resolved
from gensim.utils import is_empty
mark-todd marked this conversation as resolved.
Show resolved Hide resolved

piskvorky marked this conversation as resolved.
Show resolved Hide resolved
logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -489,7 +490,8 @@ def add_documents(self, corpus, chunksize=None, decay=None):
chunksize = self.chunksize
if decay is None:
decay = self.decay

if is_empty(corpus):
logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?')
if not scipy.sparse.issparse(corpus):
if not self.onepass:
# we are allowed multiple passes over the input => use a faster, randomized two-pass algo
Expand Down Expand Up @@ -590,7 +592,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
Latent representation of corpus in BoW format if `bow` is corpus.

"""
assert self.projection.u is not None, "decomposition not initialized yet"
if self.projection.u is None:
raise ValueError('No training data provided - LSI model not initialized yet')

# if the input vector is in fact a corpus, return a transformed corpus as a result
is_corpus, bow = utils.is_corpus(bow)
Expand Down
15 changes: 15 additions & 0 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from copy import deepcopy
from datetime import datetime
import platform
import types

import numpy as np
import scipy.sparse
Expand Down Expand Up @@ -2084,3 +2085,17 @@ def effective_n_jobs(n_jobs):
elif n_jobs < 0:
n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
return n_jobs

def is_empty(corpus):
    """Is the corpus (an iterable or a scipy.sparse array) empty?

    Parameters
    ----------
    corpus : iterable or scipy.sparse matrix
        The corpus to check.

    Returns
    -------
    bool
        True only if the corpus can be shown to be empty without losing data.
        False if the corpus is non-empty, or if its emptiness cannot be
        determined safely (one-shot iterators, objects that fail on iteration).

    """
    if scipy.sparse.issparse(corpus):
        return corpus.shape[1] == 0  # by convention, scipy.sparse documents are columns
    if isinstance(corpus, types.GeneratorType):
        return False  # don't try to guess emptiness of generators, may lose elements irretrievably
    try:
        iterator = iter(corpus)
        if iterator is corpus:
            # `corpus` is itself a one-shot iterator (e.g. `iter(list)`, `map`, an open file):
            # probing it with next() would irretrievably consume its first document.
            return False
        next(iterator)  # list, numpy array etc: iter() gave a fresh iterator, safe to probe
    except StopIteration:
        return True
    except Exception:
        # Deliberately best-effort: this is only a sanity check, so never fail on exotic input.
        return False
    return False  # first document exists => not empty