diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml
index df5f8f5f8e..ebe2201a6d 100644
--- a/.github/workflows/build-wheels.yml
+++ b/.github/workflows/build-wheels.yml
@@ -79,13 +79,13 @@ jobs:
           travis-os-name: osx
           manylinux-version: 1
           python-version: 3.8
-          build-depends: numpy==1.21.0
+          build-depends: numpy==1.17.3
         - os: macos-latest
           travis-os-name: osx
           manylinux-version: 1
           python-version: 3.9
-          build-depends: numpy==1.21.0
+          build-depends: numpy==1.19.3
         - os: windows-latest
           manylinux-version: 2010
@@ -114,7 +114,7 @@ jobs:
       PLAT: x86_64
       UNICODE_WIDTH: 32
       MB_PYTHON_VERSION: ${{ matrix.python-version }}  # MB_PYTHON_VERSION is needed by Multibuild
-      TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest mock cython nmslib pyemd testfixtures scikit-learn pyemd
+      TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest pytest-cov mock cython nmslib pyemd testfixtures scikit-learn pyemd
       DOCKER_TEST_IMAGE: multibuild/xenial_x86_64
       TRAVIS_OS_NAME: ${{ matrix.travis-os-name }}
       SKIP_NETWORK_TESTS: 1
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 41a608ef90..3bdcda0bd2 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -20,6 +20,9 @@ jobs:
           - {name: Linux, python: 3.6, os: ubuntu-20.04, tox: 'py36-linux'}
           - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'py37-linux'}
           - {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux'}
+          - {name: Windows, python: 3.6, os: windows-2019, tox: 'py36-win'}
+          - {name: Windows, python: 3.7, os: windows-2019, tox: 'py37-win'}
+          - {name: Windows, python: 3.8, os: windows-2019, tox: 'py38-win'}

     env:
       TOX_PARALLEL_NO_SPINNER: 1

@@ -38,24 +41,35 @@ jobs:
       # https://www.scala-sbt.org/1.x/docs/Installing-sbt-on-Linux.html
       #
       - name: Update sbt
+        if: matrix.os == 'ubuntu-20.04'
         run: |
           echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
           echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list
           curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add
           sudo apt-get update -y
           sudo apt-get install -y sbt
-      - name: Install tox, gdb
+      - name: Install tox
+        run: pip install tox
+      - name: Install GDB & enable core dumps
+        if: matrix.os == 'ubuntu-20.04'
         run: |
-          pip install tox
           sudo apt-get update -y
           sudo apt-get install -y gdb
-      - name: Enable core dumps
-        run: ulimit -c unlimited -S  # enable core dumps
+          ulimit -c unlimited -S  # enable core dumps
       - name: Run tox tests
         run: tox -e ${{ matrix.tox }}

+      - name: Upload coverage to Codecov
+        if: matrix.os == 'ubuntu-20.04' && matrix.python == '3.8'
+        uses: codecov/codecov-action@v2
+        with:
+          fail_ci_if_error: true
+          files: ./coverage.xml
+          verbose: true
+
+
       - name: Collect corefile
-        if: ${{ failure() }}
+        if: ${{ failure() && matrix.os == 'ubuntu-20.04' }}
         run: |
           pwd
           COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1)
-          if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi
+          if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index a0cd690a6a..f1c9f05e99 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,13 +20,6 @@ env:
     - MB_ML_VER=2014
    - SKIP_NETWORK_TESTS=1
    - DOCKER_TEST_IMAGE=multibuild/xenial_arm64v8
-    #
-    # Build wheels with the oldest possible numpy version to avoid
-    # the problem encountered by:
-    #
-    # https://github.com/RaRe-Technologies/gensim/issues/3085
-    #
-    - BUILD_DEPENDS="oldest-supported-numpy scipy==1.7.0"
    #
    # The contents of this file mirror the linux_testenv list
    # in gensim's setup.py. I can't think of a better way to include
    # them here.
    #
    - TEST_DEPENDS="pytest mock cython nmslib pyemd testfixtures Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 scikit-learn"

matrix:
+  #
+  # See .github/workflows/build-wheels.yml for a discussion of why we
+  # handle numpy versions explicitly.
+  #
   - os: linux
     env:
       - MB_PYTHON_VERSION=3.6
       #
       # scipy 1.7.0 wheels not available for Py3.6, so we have to build using
       # an older version.
       #
-      - BUILD_DEPENDS="oldest-supported-numpy scipy==1.5.3"
+      - BUILD_DEPENDS="numpy==1.19.2 scipy==1.5.3"
   - os: linux
     env:
       - MB_PYTHON_VERSION=3.7
+      - BUILD_DEPENDS="numpy==1.19.2 scipy==1.7.0"
   - os: linux
     env:
       - MB_PYTHON_VERSION=3.8
+      - BUILD_DEPENDS="numpy==1.19.2 scipy==1.7.0"
   - os: linux
     env:
       - MB_PYTHON_VERSION=3.9
+      #
+      # oldest-supported-numpy does not seem to handle this particular case
+      # (aarch64, Py3.9) explicitly, but I've double-checked that wheels for
+      # this numpy release are available via PyPI.
+      #
+      - BUILD_DEPENDS="numpy==1.19.3 scipy==1.7.0"

before_install:
    - source multibuild/common_utils.sh
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 54c461e7ca..c1afd04e54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,16 @@ Changes

 ## Unreleased

-[#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec)
+* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola)
+* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv)
+* [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse)
+* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec)
+* [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv)
+* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky)
+
+## 4.1.2, 2021-09-17
+
+This is a bugfix release that addresses leftover compatibility issues with older versions of numpy and macOS.

 ## 4.1.1, 2021-09-14

diff --git a/README.md b/README.md
index f61cd390e4..f1cb9f3ddd 100644
--- a/README.md
+++ b/README.md
@@ -176,4 +176,3 @@ BibTeX entry:
 [OpenBLAS]: http://xianyi.github.io/OpenBLAS/
 [source tar.gz]: http://pypi.python.org/pypi/gensim
 [documentation]: http://radimrehurek.com/gensim/install.html
-
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000..3cbff53d79
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,17 @@
+# Security Policy
+
+## Supported Versions
+
+The following versions of Gensim are currently
+supported with security updates.
+
+| Version | Supported          |
+| ------- | ------------------ |
+| 4.x     | :white_check_mark: |
+| < 4.0   | :x:                |
+
+## Reporting a Vulnerability
+
+Open a ticket and add the "security" label to it.
+Describe the vulnerability in general terms.
+We'll reach out to you for specifics.
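Of the Unreleased entries above, the LsiModel `random_seed` addition ([#3194]) is the one with directly observable behaviour: two models built from the same corpus and the same seed now come out identical. A quick reproducibility check, sketched against the `common_corpus`/`common_dictionary` fixtures that ship in `gensim.test.utils`:

    from gensim.models import LsiModel
    from gensim.test.utils import common_corpus, common_dictionary

    # Same corpus + same seed should yield the same left singular vectors.
    m1 = LsiModel(common_corpus, id2word=common_dictionary, num_topics=2, random_seed=42)
    m2 = LsiModel(common_corpus, id2word=common_dictionary, num_topics=2, random_seed=42)
    assert (m1.projection.u == m2.projection.u).all()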
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 8e8102fa12..0000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-pool:
-  vmImage: 'vs2017-win2016'
-
-strategy:
-  matrix:
-    py36:
-      python.version: '3.6'
-      TOXENV: "py36-win"
-    py37:
-      python.version: '3.7'
-      TOXENV: "py37-win"
-    py38:
-      python.version: '3.8'
-      TOXENV: "py38-win"
-    py39:
-      python.version: '3.9'
-      TOXENV: "py39-win"
-
-steps:
-- task: UsePythonVersion@0
-  inputs:
-    versionSpec: '$(python.version)'
-  displayName: 'Use Python $(python.version)'
-
-- script: |
-    python -m pip install --upgrade pip
-    python -m pip install tox
-  displayName: 'Install tox'
-
-- script: |
-    tox
-  displayName: 'Testing'
diff --git a/continuous_integration/BucketLifecycleConfiguration.json b/continuous_integration/BucketLifecycleConfiguration.json
new file mode 100644
index 0000000000..1512b59b5c
--- /dev/null
+++ b/continuous_integration/BucketLifecycleConfiguration.json
@@ -0,0 +1,10 @@
+{
+    "Rules": [
+        {
+            "Expiration": {"Days": 30},
+            "Filter": {"Prefix": ""},
+            "ID": "Delete all files older than 30 days to save storage costs",
+            "Status": "Enabled"
+        }
+    ]
+}
diff --git a/continuous_integration/BucketLifecycleConfiguration.txt b/continuous_integration/BucketLifecycleConfiguration.txt
new file mode 100644
index 0000000000..7392c06393
--- /dev/null
+++ b/continuous_integration/BucketLifecycleConfiguration.txt
@@ -0,0 +1,15 @@
+JSON files can't have comments, so this file is here to explain the rules in BucketLifecycleConfiguration.json.
+
+Our CI puts wheels in a publicly readable, privately writable S3 bucket (s3://gensim-wheels).
+These wheels can be for gensim releases, in which case we fetch them and push them to PyPI when making a release.
+Once the wheels are on PyPI, we don't need to keep our own copy.
+
+These wheels can also be development wheels: we currently build wheels on every push to develop.
+These can be helpful when tracking down a problem, but they can also build up quickly, consume storage space and contribute to AWS costs.
+
+So, we delete all files in the gensim-wheels bucket every 30 days.
+We rarely need to access wheels that are more than a month old, anyway.
+
+If you modify the JSON configuration, then you can update it using the command:
+
+    aws --profile smart_open s3api put-bucket-lifecycle-configuration --bucket gensim-wheels --lifecycle-configuration file://continuous_integration/BucketLifecycleConfiguration.json
diff --git a/docs/src/conf.py b/docs/src/conf.py
index 1ce41292ea..669a56a20a 100644
--- a/docs/src/conf.py
+++ b/docs/src/conf.py
@@ -63,7 +63,7 @@
 # The short X.Y version.
 version = '4.1'
 # The full version, including alpha/beta/rc tags.
-release = '4.1.1'
+release = '4.1.3.dev0'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/gensim/__init__.py b/gensim/__init__.py
index e1c230be65..c97e0f74ae 100644
--- a/gensim/__init__.py
+++ b/gensim/__init__.py
@@ -4,7 +4,7 @@

 """

-__version__ = '4.1.1'
+__version__ = '4.1.3.dev0'

 import logging
diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
index d954061caf..3bfa65942e 100644
--- a/gensim/corpora/dictionary.py
+++ b/gensim/corpora/dictionary.py
@@ -143,7 +143,9 @@ def __len__(self):

     def __str__(self):
         some_keys = list(itertools.islice(self.token2id.keys(), 5))
-        return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')
+        return "%s<%i unique tokens: %s%s>" % (
+            self.__class__.__name__, len(self), some_keys, '...' if len(self) > 5 else ''
+        )

     @staticmethod
     def from_documents(documents):
diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py
index d76c622c95..24aca6cb65 100644
--- a/gensim/examples/dmlcz/dmlcorpus.py
+++ b/gensim/examples/dmlcz/dmlcorpus.py
@@ -59,8 +59,9 @@ def addSource(self, source):
         self.sources[sourceId] = source

     def __str__(self):
-        return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" %
-                (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs)))
+        return "%s<id=%s, sources=[%s], acceptLangs=[%s]>" % (
+            self.__class__.__name__, self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs)
+        )
 # endclass DmlConfig
diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py
index 838c7634e3..75893c5ac0 100755
--- a/gensim/models/atmodel.py
+++ b/gensim/models/atmodel.py
@@ -331,8 +331,8 @@ def __str__(self):
             String representation of current instance.

         """
-        return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \
-            (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)
+        return "%s<num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s>" % \
+            (self.__class__.__name__, self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)

     def init_empty_corpus(self):
         """Initialize an empty corpus.
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index c4b28316b7..c1ff25d994 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -130,7 +130,7 @@ def __str__(self):
             Human readable representation of the object's state (words and tags).

         """
-        return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags)
+        return '%s<%s, %s>' % (self.__class__.__name__, self.words, self.tags)


 @dataclass
@@ -494,7 +494,7 @@ def train(

         """
         if corpus_file is None and corpus_iterable is None:
-            raise TypeError("Either one of corpus_file or documents value must be provided")
+            raise TypeError("Either one of corpus_file or corpus_iterable value must be provided")

         if corpus_file is not None and corpus_iterable is not None:
             raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time")
@@ -713,7 +713,7 @@ def __str__(self):
             segments.append('s%g' % self.sample)
         if self.workers > 1:
             segments.append('t%d' % self.workers)
-        return '%s(%s)' % (self.__class__.__name__, ','.join(segments))
+        return '%s<%s>' % (self.__class__.__name__, ','.join(segments))

     def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
         """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool.
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index a94bc17f27..6d992d9b94 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -38,8 +38,8 @@
     >>> print(len(common_texts))
     9
     >>> model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
-    >>> model.build_vocab(sentences=common_texts)
-    >>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train
+    >>> model.build_vocab(corpus_iterable=common_texts)
+    >>> model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10)  # train

 Once you have a model, you can access its keyed vectors via the `model.wv` attributes.
 The keyed vectors instance is quite powerful: it can perform a wide range of NLP tasks.
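The doc-comment fix above ([#3227]) matters because in gensim 4.x `build_vocab` and `train` accept `corpus_iterable`, not `sentences`, so copying the old example verbatim would raise a TypeError. A minimal end-to-end sketch with the corrected argument names, using the bundled `common_texts`:

    from gensim.models import FastText
    from gensim.test.utils import common_texts

    model = FastText(vector_size=4, window=3, min_count=1)
    model.build_vocab(corpus_iterable=common_texts)
    model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10)
    print(model.wv.most_similar('computer', topn=3))  # keyed vectors work as usual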
@@ -108,9 +108,9 @@
     >>>
     >>>
     >>> model4 = FastText(vector_size=4, window=3, min_count=1)
-    >>> model4.build_vocab(sentences=MyIter())
+    >>> model4.build_vocab(corpus_iterable=MyIter())
     >>> total_examples = model4.corpus_count
-    >>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
+    >>> model4.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=5)

 Persist a model to disk with:
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index b5debb21c1..f56adb0b14 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -1804,7 +1804,7 @@ def __lt__(self, other):  # used for sorting in a priority queue

     def __str__(self):
         vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')]
-        return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))
+        return "%s<%s>" % (self.__class__.__name__, ', '.join(vals))


 # compatibility alias, allowing older pickle-based `.save()`s to load
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index 6691ddcc31..10a0c60134 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -615,8 +615,8 @@ def __str__(self):
             Human readable representation of the most important model parameters.

         """
-        return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
-            self.num_terms, self.num_topics, self.decay, self.chunksize
+        return "%s<num_terms=%s, num_topics=%s, decay=%s, chunksize=%s>" % (
+            self.__class__.__name__, self.num_terms, self.num_topics, self.decay, self.chunksize
         )

     def sync_state(self, current_Elogbeta=None):
diff --git a/gensim/models/logentropy_model.py b/gensim/models/logentropy_model.py
index a79c685660..16fbace8d2 100644
--- a/gensim/models/logentropy_model.py
+++ b/gensim/models/logentropy_model.py
@@ -76,7 +76,7 @@ def __init__(self, corpus, normalize=True):
         self.initialize(corpus)

     def __str__(self):
-        return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words)
+        return "%s<n_docs=%s, n_words=%s>" % (self.__class__.__name__, self.n_docs, self.n_words)

     def initialize(self, corpus):
         """Calculates the global weighting for all terms in a given corpus and transforms the simple
diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
index 06055722e1..6a407e860e 100644
--- a/gensim/models/lsimodel.py
+++ b/gensim/models/lsimodel.py
@@ -23,7 +23,7 @@
 * distributed computing for very large corpora, making use of a cluster of machines

-Wall-clock `performance on the English Wikipedia `_
+Wall-clock `performance on the English Wikipedia `_
 (2G corpus positions, 3.2M documents, 100K features, 0.5G non-zero entries in the final TF-IDF matrix),
 requesting the top 400 LSI factors:
@@ -162,8 +162,11 @@ class Projection(utils.SaveLoad):
         via :meth:`~gensim.models.lsimodel.Projection.merge`. This is how incremental training actually happens.

     """
-    def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
-                 extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
+
+    def __init__(
+            self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
+            extra_dims=P2_EXTRA_DIMS, dtype=np.float64, random_seed=None,
+    ):
         """Construct the (U, S) projection from a corpus.

         Parameters
@@ -183,11 +186,15 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER
             Extra samples to be used besides the rank `k`. Tune to improve accuracy.
         dtype : numpy.dtype, optional
             Enforces a type for elements of the decomposed matrix.
+        random_seed: {None, int}, optional
+            Random seed used to initialize a local instance of the
+            ``numpy.random.RandomState`` pseudo-random number generator.

         """
         self.m, self.k = m, k
         self.power_iters = power_iters
         self.extra_dims = extra_dims
+        self.random_seed = random_seed
         if docs is not None:
             # base case decomposition: given a job `docs`, compute its decomposition,
             # *in-core*.
@@ -195,7 +202,7 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER
                 u, s = stochastic_svd(
                     docs, k, chunksize=sys.maxsize,
                     num_terms=m, power_iters=self.power_iters,
-                    extra_dims=self.extra_dims, dtype=dtype)
+                    extra_dims=self.extra_dims, dtype=dtype, random_seed=self.random_seed)
             else:
                 try:
                     import sparsesvd
@@ -223,7 +230,10 @@ def empty_like(self):
             An empty copy (without corpus) of the current projection.

         """
-        return Projection(self.m, self.k, power_iters=self.power_iters, extra_dims=self.extra_dims)
+        return Projection(
+            self.m, self.k, power_iters=self.power_iters,
+            extra_dims=self.extra_dims, random_seed=self.random_seed,
+        )

     def merge(self, other, decay=1.0):
         """Merge current :class:`~gensim.models.lsimodel.Projection` instance with another.
@@ -354,9 +364,9 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel):

     def __init__(
             self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
-            decay=1.0, distributed=False, onepass=True,
-            power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64
-    ):
+            decay=1.0, distributed=False, onepass=True, power_iters=P2_EXTRA_ITERS,
+            extra_samples=P2_EXTRA_DIMS, dtype=np.float64, random_seed=None,
+    ):
         """Build an LSI model.

         Parameters
@@ -383,6 +393,9 @@ def __init__(
             Extra samples to be used besides the rank `k`. Can improve accuracy.
         dtype : type, optional
             Enforces a type for elements of the decomposed matrix.
+        random_seed: {None, int}, optional
+            Random seed used to initialize a local instance of the
+            ``numpy.random.RandomState`` pseudo-random number generator.
""" self.id2word = id2word @@ -396,6 +409,7 @@ def __init__( self.onepass = onepass self.extra_samples, self.power_iters = extra_samples, power_iters self.dtype = dtype + self.random_seed = random_seed if corpus is None and self.id2word is None: raise ValueError( @@ -411,7 +425,8 @@ def __init__( self.docs_processed = 0 self.projection = Projection( - self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype + self.num_terms, self.num_topics, power_iters=self.power_iters, + extra_dims=self.extra_samples, dtype=dtype, random_seed=self.random_seed ) self.numworkers = 1 @@ -478,11 +493,15 @@ def add_documents(self, corpus, chunksize=None, decay=None): if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo - update = Projection(self.num_terms, self.num_topics, None, dtype=self.dtype) + update = Projection( + self.num_terms, self.num_topics, None, + dtype=self.dtype, random_seed=self.random_seed, + ) update.u, update.s = stochastic_svd( corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, - extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype + extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype, + random_seed=self.random_seed, ) self.projection.merge(update, decay=decay) self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0 @@ -499,7 +518,9 @@ def add_documents(self, corpus, chunksize=None, decay=None): # definitely avoid materializing it as a dense matrix! logger.debug("converting corpus to csc format") job = matutils.corpus2csc( - chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz, dtype=self.dtype) + chunk, num_docs=len(chunk), num_terms=self.num_terms, + num_nnz=nnz, dtype=self.dtype, + ) del chunk doc_no += job.shape[1] if self.dispatcher: @@ -513,7 +534,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): # serial version, there is only one "worker" (myself) => process the job directly update = Projection( self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, - power_iters=self.power_iters, dtype=self.dtype + power_iters=self.power_iters, dtype=self.dtype, random_seed=self.random_seed, ) del job self.projection.merge(update, decay=decay) @@ -530,7 +551,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): assert not self.dispatcher, "must be in serial mode to receive jobs" update = Projection( self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, - power_iters=self.power_iters, dtype=self.dtype + power_iters=self.power_iters, dtype=self.dtype, ) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents", corpus.shape[1]) @@ -545,8 +566,8 @@ def __str__(self): A human readable string of the current objects parameters. 
""" - return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( - self.num_terms, self.num_topics, self.decay, self.chunksize + return "%s" % ( + self.__class__.__name__, self.num_terms, self.num_topics, self.decay, self.chunksize ) def __getitem__(self, bow, scaled=False, chunksize=512): @@ -731,7 +752,7 @@ def print_debug(self, num_topics=5, num_words=10): print_debug( self.id2word, self.projection.u, self.projection.s, range(min(num_topics, len(self.projection.u.T))), - num_words=num_words + num_words=num_words, ) def save(self, fname, *args, **kwargs): @@ -864,8 +885,10 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): logger.info('topic #%s(%.3f): %s, ..., %s', topic, s[topic], ', '.join(pos), ', '.join(neg)) -def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, - power_iters=0, dtype=np.float64, eps=1e-6): +def stochastic_svd( + corpus, rank, num_terms, chunksize=20000, extra_dims=None, + power_iters=0, dtype=np.float64, eps=1e-6, random_seed=None, +): """Run truncated Singular Value Decomposition (SVD) on a sparse input. Parameters @@ -888,6 +911,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, Enforces a type for elements of the decomposed matrix. eps: float, optional Percentage of the spectrum's energy to be discarded. + random_seed: {None, int}, optional + Random seed used to initialize the pseudo-random number generator, + a local instance of numpy.random.RandomState instance. + Notes ----- @@ -924,13 +951,16 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, # and more memory friendly than processing all documents at once) y = np.zeros(dtype=dtype, shape=(num_terms, samples)) logger.info("1st phase: constructing %s action matrix", str(y.shape)) + random_state = np.random.RandomState(random_seed) if scipy.sparse.issparse(corpus): m, n = corpus.shape assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms) - o = np.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix - sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, - corpus.data, o.ravel(), y.ravel()) # y = corpus * o + o = random_state.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix + sparsetools.csc_matvecs( + m, n, samples, corpus.indptr, corpus.indices, + corpus.data, o.ravel(), y.ravel(), + ) # y = corpus * o del o # unlike np, scipy.sparse `astype()` copies everything, even if there is no change to dtype! 
@@ -960,10 +990,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
             assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
             num_docs += n
             logger.debug("multiplying chunk * gauss")
-            o = np.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
+            o = random_state.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
             sparsetools.csc_matvecs(
                 m, n, samples, chunk.indptr, chunk.indices,  # y = y + chunk * o
-                chunk.data, o.ravel(), y.ravel()
+                chunk.data, o.ravel(), y.ravel(),
             )
             del chunk, o
     y = [y]
diff --git a/gensim/models/normmodel.py b/gensim/models/normmodel.py
index 3292f6514e..62cbfc8fef 100644
--- a/gensim/models/normmodel.py
+++ b/gensim/models/normmodel.py
@@ -41,7 +41,9 @@ def __init__(self, corpus=None, norm='l2'):
             pass

     def __str__(self):
-        return "NormModel(num_docs=%s, num_nnz=%s, norm=%s)" % (self.num_docs, self.num_nnz, self.norm)
+        return "%s<num_docs=%s, num_nnz=%s, norm=%s>" % (
+            self.__class__.__name__, self.num_docs, self.num_nnz, self.norm
+        )

     def calc_norm(self, corpus):
         """Calculate the norm by calling :func:`~gensim.matutils.unitvec` with the norm parameter.
diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py
index 1b2bf9fbb2..cbdaf4cb55 100644
--- a/gensim/models/rpmodel.py
+++ b/gensim/models/rpmodel.py
@@ -70,7 +70,7 @@ def __init__(self, corpus, id2word=None, num_topics=300):
         self.add_lifecycle_event("created", msg=f"created {self}")

     def __str__(self):
-        return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)
+        return "%s<num_terms=%s, num_topics=%s>" % (self.__class__.__name__, self.num_terms, self.num_topics)

     def initialize(self, corpus):
         """Initialize the random projection matrix.
diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py
index 4152f3eb3d..cf2c3d3e1a 100644
--- a/gensim/models/tfidfmodel.py
+++ b/gensim/models/tfidfmodel.py
@@ -6,10 +6,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

 """This module implements functionality related to the `Term Frequency - Inverse Document Frequency
-<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>` vector space bag-of-words models.
-
-For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes),
-see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/
+<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ class of bag-of-words vector space models.

 """
@@ -347,11 +344,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden

             See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.

-        See Also
-        --------
-        ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme.
-        resolve_weights : Function that also uses the SMART scheme.
-
         References
         ----------
         .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length
@@ -435,7 +427,7 @@ def load(cls, *args, **kwargs):
         return model

     def __str__(self):
-        return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)
+        return "%s<num_docs=%s, num_nnz=%s>" % (self.__class__.__name__, self.num_docs, self.num_nnz)

     def initialize(self, corpus):
         """Compute inverse document weights, which will be used to modify term frequencies for documents.
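The `random_state.normal(...)` calls in the two `stochastic_svd` hunks above are the heart of the reproducibility change: every Gaussian draw now comes from one `numpy.random.RandomState` seeded with `random_seed`, instead of the process-global `np.random`. The pattern in isolation, as a minimal sketch:

    import numpy as np

    def draw_gaussian(n, samples, random_seed=None):
        # A local RandomState isolates the draw from any global np.random state.
        random_state = np.random.RandomState(random_seed)
        return random_state.normal(0.0, 1.0, (n, samples))

    assert (draw_gaussian(4, 2, random_seed=0) == draw_gaussian(4, 2, random_seed=0)).all()

Passing `random_seed=None` keeps the old behaviour: the RandomState is seeded from the OS, and each call draws a fresh matrix.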
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 356f711408..d7df12e283 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -200,6 +200,7 @@

 from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector
 from gensim import utils, matutils
+from smart_open.compression import get_supported_extensions

 logger = logging.getLogger(__name__)

@@ -833,11 +834,11 @@ def make_cum_table(self, domain=2**31 - 1):
         train_words_pow = 0.0
         for word_index in range(vocab_size):
             count = self.wv.get_vecattr(word_index, 'count')
-            train_words_pow += count**self.ns_exponent
+            train_words_pow += count**float(self.ns_exponent)
         cumulative = 0.0
         for word_index in range(vocab_size):
             count = self.wv.get_vecattr(word_index, 'count')
-            cumulative += count**self.ns_exponent
+            cumulative += count**float(self.ns_exponent)
             self.cum_table[word_index] = round(cumulative / train_words_pow * domain)
         if len(self.cum_table) > 0:
             assert self.cum_table[-1] == domain
@@ -1502,6 +1503,14 @@ def _check_corpus_sanity(self, corpus_iterable=None, corpus_file=None, passes=1)
             raise TypeError(
                 f"Using a generator as corpus_iterable can't support {passes} passes. Try a re-iterable sequence.")

+        if corpus_iterable is None:
+            _, corpus_ext = os.path.splitext(corpus_file)
+            if corpus_ext.lower() in get_supported_extensions():
+                raise TypeError(
+                    f"Training from compressed files is not supported with the `corpus_file` argument. "
+                    f"Please decompress {corpus_file} or use `corpus_iterable` instead."
+                )
+
     def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None, **kwargs):
         """Checks whether the training parameters make sense.

@@ -1882,7 +1891,7 @@ def __str__(self):
         and learning rate.

         """
-        return "%s(vocab=%s, vector_size=%s, alpha=%s)" % (
+        return "%s<vocab=%s, vector_size=%s, alpha=%s>" % (
             self.__class__.__name__, len(self.wv.index_to_key), self.wv.vector_size, self.alpha,
         )

diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index db66db67e0..ee73328ff1 100644
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -147,7 +147,7 @@ def __getstate__(self):
         return result

     def __str__(self):
-        return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname())
+        return "%s<%i documents in %s>" % (self.cls.__name__, len(self), self.fullname())

     def get_index(self):
         """Load & get index.
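The `_check_corpus_sanity` guard added above fails fast when `corpus_file` points at a compressed file, since the optimized `corpus_file` training path reads the file as plain text and would otherwise train on compressed bytes. A usage sketch mirroring the new unit test (the empty temp file is just a stand-in):

    import tempfile
    from gensim.models import Word2Vec

    with tempfile.NamedTemporaryFile(suffix='.bz2') as fp:
        try:
            Word2Vec(corpus_file=fp.name)
        except TypeError as err:
            print(err)  # suggests decompressing the file or using corpus_iterable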
@@ -359,8 +359,8 @@ def __len__(self):
         return len(self.fresh_docs) + sum(len(shard) for shard in self.shards)

     def __str__(self):
-        return "Similarity index with %i documents in %i shards (stored under %s)" % (
-            len(self), len(self.shards), self.output_prefix
+        return "%s<%i documents in %i shards stored under %s>" % (
+            self.__class__.__name__, len(self), len(self.shards), self.output_prefix
         )

     def add_documents(self, corpus):
diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py
index d2a3f6728f..f97801ca66 100644
--- a/gensim/similarities/termsim.py
+++ b/gensim/similarities/termsim.py
@@ -61,7 +61,7 @@ def most_similar(self, term, topn=10):

     def __str__(self):
         members = ', '.join('%s=%s' % pair for pair in vars(self).items())
-        return '%s(%s)' % (self.__class__.__name__, members)
+        return '%s<%s>' % (self.__class__.__name__, members)


 class UniformTermSimilarityIndex(TermSimilarityIndex):
diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
index 6c09ea2d1f..431b07c0ce 100644
--- a/gensim/test/test_corpora.py
+++ b/gensim/test/test_corpora.py
@@ -27,7 +27,7 @@
 from gensim.test.utils import datapath, get_tmpfile, common_corpus


-AZURE = bool(os.environ.get('PIPELINE_WORKSPACE'))
+GITHUB_ACTIONS_WINDOWS = os.environ.get('RUNNER_OS') == 'Windows'


 class DummyTransformer:
@@ -62,7 +62,7 @@ def tearDown(self):
         except OSError:
             pass

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_load(self):
         fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
         corpus = self.corpus_class(fname)
@@ -71,7 +71,7 @@ def test_load(self):
         # the deerwester corpus always has nine documents
         self.assertEqual(len(docs), 9)

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_len(self):
         fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
         corpus = self.corpus_class(fname)
@@ -87,7 +87,7 @@ def test_len(self):

         self.assertEqual(len(corpus), 9)

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_empty_input(self):
         tmpf = get_tmpfile('gensim_corpus.tst')
         with open(tmpf, 'w') as f:
@@ -102,7 +102,7 @@ def test_empty_input(self):
         docs = list(corpus)
         self.assertEqual(len(docs), 0)

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_save(self):
         corpus = self.TEST_CORPUS
         tmpf = get_tmpfile('gensim_corpus.tst')
@@ -114,7 +114,7 @@ def test_save(self):
         corpus2 = list(self.corpus_class(tmpf))
         self.assertEqual(corpus, corpus2)

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_serialize(self):
         corpus = self.TEST_CORPUS
         tmpf = get_tmpfile('gensim_corpus.tst')
@@ -136,7 +136,7 @@ def test_serialize(self):
         idx = [1, 3, 5, 7]
         self.assertEqual(corpus[idx], corpus2[idx])

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_serialize_compressed(self):
         corpus = self.TEST_CORPUS
         tmpf = get_tmpfile('gensim_corpus.tst')
@@ -154,7 +154,7 @@ def test_serialize_compressed(self):
         for i in range(len(corpus)):
             self.assertEqual(corpus[i], corpus2[i])

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_switch_id2word(self):
         fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
         corpus = self.corpus_class(fname)
@@ -172,7 +172,7 @@ def test_switch_id2word(self):
         testdoc2 = set((to_unicode(corpus.id2word[x]), y) for x, y in firstdoc2)
         self.assertEqual(testdoc2, {('computer', 1), ('human', 1), ('interface', 1)})

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_indexing(self):
         fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
         corpus = self.corpus_class(fname)
@@ -245,7 +245,7 @@ def test_closed_file_object(self):
         self.assertEqual(f, 0)
         self.assertEqual(s, 0)

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_load(self):
         self.assertEqual(self.corpus.num_docs, 9)
         self.assertEqual(self.corpus.num_terms, 12)
diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index c8b7516c99..a7e1fa58df 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -720,6 +720,15 @@ def test_train_warning(self, loglines):
     def test_load_on_class_error(self):
         """Test if exception is raised when loading doc2vec model on instance"""
         self.assertRaises(AttributeError, load_on_instance)
+
+    def test_negative_ns_exp(self):
+        """The model should accept a negative ns_exponent as a valid value."""
+        model = doc2vec.Doc2Vec(sentences, ns_exponent=-1, min_count=1, workers=1)
+        tmpf = get_tmpfile('d2v_negative_exp.tst')
+        model.save(tmpf)
+        loaded_model = doc2vec.Doc2Vec.load(tmpf)
+        loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1)
+        assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent

 # endclass TestDoc2VecModel

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index efc6a3ca8e..a557368faa 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -762,6 +762,15 @@ def test_vectors_for_all_without_inference(self):
         predicted = vectors_for_all['responding']
         assert np.allclose(expected, predicted)

+    def test_negative_ns_exp(self):
+        """The model should accept a negative ns_exponent as a valid value."""
+        model = FT_gensim(sentences, ns_exponent=-1, min_count=1, workers=1)
+        tmpf = get_tmpfile('fasttext_negative_exp.tst')
+        model.save(tmpf)
+        loaded_model = FT_gensim.load(tmpf)
+        loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1)
+        assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent
+

 @pytest.mark.parametrize('shrink_windows', [True, False])
 def test_cbow_hs_training(shrink_windows):
diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py
index b809b39754..297006b75f 100644
--- a/gensim/test/test_ldamodel.py
+++ b/gensim/test/test_ldamodel.py
@@ -24,7 +24,7 @@
 from gensim.test import basetmtests
 from gensim.test.utils import datapath, get_tmpfile, common_texts

-AZURE = bool(os.environ.get('PIPELINE_WORKSPACE'))
+GITHUB_ACTIONS_WINDOWS = os.environ.get('RUNNER_OS') == 'Windows'

 dictionary = Dictionary(common_texts)
 corpus = [dictionary.doc2bow(text) for text in common_texts]
@@ -232,7 +232,7 @@ def test_get_topic_terms(self):
             self.assertTrue(isinstance(k, numbers.Integral))
             self.assertTrue(np.issubdtype(v, np.floating))

-    @unittest.skipIf(AZURE, 'see ')
+    @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ')
     def test_get_document_topics(self):

         model = self.class_(
diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py
index b21fdc6063..bd98ca10d9 100644
--- a/gensim/test/test_translation_matrix.py
+++ b/gensim/test/test_translation_matrix.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # encoding: utf-8
+import sys
 from collections import namedtuple
 import unittest
 import logging
@@ -60,6 +61,10 @@ def test_translate_nn(self):
         for idx, item in enumerate(self.test_word_pairs):
             self.assertTrue(item[1] in translated_words[item[0]])

+    @unittest.skipIf(
+        (sys.version_info.major == 3) and (sys.version_info.minor == 9) and (sys.platform == 'darwin'),
+        'flaky test, may be related to '
+    )
     def test_translate_gc(self):
         # Test globally corrected neighbour retrieval method
         model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs)
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 43505b0be2..79974f97b7 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -13,6 +13,7 @@
 import os
 import bz2
 import sys
+import tempfile

 import numpy as np

@@ -1040,6 +1041,13 @@ def test_load_on_class_error(self):
         """Test if exception is raised when loading word2vec model on instance"""
         self.assertRaises(AttributeError, load_on_instance)

+    def test_file_should_not_be_compressed(self):
+        """The model should refuse to train from a compressed corpus_file."""
+        with tempfile.NamedTemporaryFile(suffix=".bz2") as fp:
+            self.assertRaises(TypeError, word2vec.Word2Vec, None, fp.name)
+
     def test_reset_from(self):
         """Test if reset_from() uses pre-built structures from other model"""
         model = word2vec.Word2Vec(sentences, min_count=1)
@@ -1054,6 +1062,15 @@ def test_compute_training_loss(self):
         training_loss_val = model.get_latest_training_loss()
         self.assertTrue(training_loss_val > 0.0)

+    def test_negative_ns_exp(self):
+        """The model should accept a negative ns_exponent as a valid value."""
+        model = word2vec.Word2Vec(sentences, ns_exponent=-1, min_count=1, workers=1)
+        tmpf = get_tmpfile('w2v_negative_exp.tst')
+        model.save(tmpf)
+        loaded_model = word2vec.Word2Vec.load(tmpf)
+        loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1)
+        assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent
+

 # endclass TestWord2VecModel

diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py
index 83cbdc6471..67b0208f5a 100644
--- a/gensim/topic_coherence/text_analysis.py
+++ b/gensim/topic_coherence/text_analysis.py
@@ -436,7 +436,7 @@ def __init__(self, processes, *args, **kwargs):
         self.batch_size = kwargs.get('batch_size', 64)

     def __str__(self):
-        return "%s(processes=%s, batch_size=%s)" % (
+        return "%s<processes=%s, batch_size=%s>" % (
             self.__class__.__name__, self.processes, self.batch_size)

     def accumulate(self, texts, window_size):
diff --git a/gensim/utils.py b/gensim/utils.py
index 30b6d85f58..d4fc6a71dc 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -827,7 +827,7 @@ def __init__(self, num_terms):
         self.num_terms = num_terms

     def __str__(self):
-        return "FakeDict(num_terms=%s)" % self.num_terms
+        return "%s<num_terms=%s>" % (self.__class__.__name__, self.num_terms)

     def __getitem__(self, val):
         if 0 <= val < self.num_terms:
diff --git a/setup.py b/setup.py
index 1b1b0eaac7..ccb1142fb6 100644
--- a/setup.py
+++ b/setup.py
@@ -268,6 +268,7 @@ def run(self):
 # packages included for build-testing everywhere
 core_testenv = [
     'pytest',
+    'pytest-cov',
     # 'pytest-rerunfailures',  # disabled 2020-08-28 for 
     'mock',
     'cython',
@@ -338,7 +339,7 @@ def run(self):

 setup(
     name='gensim',
-    version='4.1.1',
+    version='4.1.3.dev0',
     description='Python framework for fast Vector Space Modelling',
     long_description=LONG_DESCRIPTION,

diff --git a/tox.ini b/tox.ini
index 12811b8ba5..058b37d9f5 100644
--- a/tox.ini
+++ b/tox.ini
@@ -24,8 +24,26 @@ ignore = E203,  # space before :
 exclude = .venv, .git, .tox, dist, doc, build, gensim/models/deprecated

+[coverage:run]
+source=gensim
+
+[coverage:report]
+omit =
+    gensim/test/*
+    */__init__.py
+
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    def __str__
+    raise AssertionError
+    raise NotImplementedError
+    if __name__ == .__main__.:
+
+ignore_errors = True
+
 [pytest]
-addopts = -rfxEXs --durations=20 --showlocals
+addopts = -rfxEXs --durations=20 --showlocals --cov=gensim/ --cov-report=xml

 [testenv]
@@ -46,7 +64,7 @@ setenv =
     MALLET_HOME={env:MALLET_HOME:}
     SKIP_NETWORK_TESTS={env:SKIP_NETWORK_TESTS:}
     BOTO_CONFIG={env:BOTO_CONFIG:}
-    PIPELINE_WORKSPACE={env:PIPELINE_WORKSPACE:}
+    RUNNER_OS={env:RUNNER_OS:}
     PYTHONHASHSEED=1
     TOX_PARALLEL_NO_SPINNER=1