piskvorky · mpenkov · Mar 22, 2022 · Nov 26, 2021 · Feb 21, 2022 · Feb 21, 2022
diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
@@ -70,6 +70,7 @@
 
 from gensim import interfaces, matutils, utils
 from gensim.models import basemodel
+from gensim.utils import is_empty
 
 logger = logging.getLogger(__name__)
 
@@ -489,7 +490,8 @@ def add_documents(self, corpus, chunksize=None, decay=None):
             chunksize = self.chunksize
         if decay is None:
             decay = self.decay
-
+        if is_empty(corpus):
+            logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?')
         if not scipy.sparse.issparse(corpus):
             if not self.onepass:
                 # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
@@ -590,7 +592,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
             Latent representation of corpus in BoW format if `bow` is corpus.
 
         """
-        assert self.projection.u is not None, "decomposition not initialized yet"
+        if self.projection.u is None:
+            raise ValueError('No training data provided - LSI model not initialized yet')
 
         # if the input vector is in fact a corpus, return a transformed corpus as a result
         is_corpus, bow = utils.is_corpus(bow)

diff --git a/gensim/utils.py b/gensim/utils.py
@@ -30,6 +30,7 @@
 from copy import deepcopy
 from datetime import datetime
 import platform
+import types
 
 import numpy as np
 import scipy.sparse
@@ -2084,3 +2085,17 @@ def effective_n_jobs(n_jobs):
     elif n_jobs < 0:
         n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
     return n_jobs
+
+def is_empty(corpus):
+    """Is the corpus (an iterable or a scipy.sparse array) empty? One-shot iterators always report False."""
+    if scipy.sparse.issparse(corpus):
+        return corpus.shape[1] == 0  # by convention, scipy.sparse documents are columns
+    try:
+        if isinstance(corpus, types.GeneratorType) or iter(corpus) is corpus:
+            return False  # generator / one-shot iterator: peeking would lose its first document irretrievably
+        next(iter(corpus))  # re-iterable container (list, numpy array etc): safe to peek
+        return False  # first document exists => not empty
+    except StopIteration:
+        return True
+    except Exception:
+        return False  # iteration itself failed => emptiness unknown, don't report empty