Skip to content

Commit

Permalink
Added new ValueError in place of assertion error for no model data pr…
Browse files Browse the repository at this point in the history
…ovided in lsi model (#3271)

* Added new ValueError in place of assertion error for no model data provided in lsi model

Added warning to lsi model for initialising a model with no data

* Update lsimodel.py

* Update gensim/models/lsimodel.py

* Added better empty corpus testing

* Moved is_empty function to utils

* Update gensim/models/lsimodel.py

Added space

Co-authored-by: Radim Řehůřek <me@radimrehurek.com>

* Update gensim/models/lsimodel.py

Added import space

Co-authored-by: Radim Řehůřek <me@radimrehurek.com>

* Update gensim/utils.py

Added space after False

Co-authored-by: Radim Řehůřek <me@radimrehurek.com>

* Moved import

* Update utils.py

* fix flake8 problem

Co-authored-by: Radim Řehůřek <me@radimrehurek.com>
Co-authored-by: Michael Penkov <m@penkov.dev>
  • Loading branch information
3 people committed Mar 22, 2022
1 parent c19f223 commit a4808c1
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
7 changes: 5 additions & 2 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@

from gensim import interfaces, matutils, utils
from gensim.models import basemodel
from gensim.utils import is_empty

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -489,7 +490,8 @@ def add_documents(self, corpus, chunksize=None, decay=None):
chunksize = self.chunksize
if decay is None:
decay = self.decay

if is_empty(corpus):
logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?')
if not scipy.sparse.issparse(corpus):
if not self.onepass:
# we are allowed multiple passes over the input => use a faster, randomized two-pass algo
Expand Down Expand Up @@ -590,7 +592,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
Latent representation of corpus in BoW format if `bow` is corpus.
"""
assert self.projection.u is not None, "decomposition not initialized yet"
if self.projection.u is None:
raise ValueError('No training data provided - LSI model not initialized yet')

# if the input vector is in fact a corpus, return a transformed corpus as a result
is_corpus, bow = utils.is_corpus(bow)
Expand Down
17 changes: 17 additions & 0 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from copy import deepcopy
from datetime import datetime
import platform
import types

import numpy as np
import scipy.sparse
Expand Down Expand Up @@ -2084,3 +2085,19 @@ def effective_n_jobs(n_jobs):
elif n_jobs < 0:
n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
return n_jobs


def is_empty(corpus):
    """Is the corpus (an iterable or a scipy.sparse array) empty?

    The check is best-effort and never consumes data that cannot be re-read:
    when emptiness cannot be determined safely, the corpus is reported as
    non-empty.

    Parameters
    ----------
    corpus : iterable or scipy.sparse matrix
        Corpus to inspect. For sparse input, documents are columns.

    Returns
    -------
    bool
        True only if the corpus is known to contain no documents. False if it
        contains at least one document, or if emptiness cannot be determined
        (generators and other one-shot iterators, non-iterable input, or
        iteration raising an error).

    """
    if scipy.sparse.issparse(corpus):
        return corpus.shape[1] == 0  # by convention, scipy.sparse documents are columns
    if isinstance(corpus, types.GeneratorType):
        return False  # don't try to guess emptiness of generators, may lose elements irretrievably
    try:
        doc_iter = iter(corpus)
        if doc_iter is corpus:
            # `corpus` is itself a one-shot iterator (map, zip, iter(list), open file, ...):
            # peeking at its first document would drop that document for the caller,
            # so treat it the same way as a generator.
            return False
        # re-iterable container (list, numpy array, streamed corpus object etc)
        next(doc_iter)
    except StopIteration:
        return True
    except Exception:
        # deliberate best-effort: anything we cannot iterate is not reported as empty
        return False
    return False  # first document exists => not empty

0 comments on commit a4808c1

Please sign in to comment.