Commit

Merge branch 'develop' into sparse2corpus-getitem
mpenkov committed Dec 4, 2021
2 parents 5c70ccf + fa2d1b1 commit c46241b
Showing 36 changed files with 261 additions and 129 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build-wheels.yml
Original file line number Diff line number Diff line change
@@ -79,13 +79,13 @@ jobs:
travis-os-name: osx
manylinux-version: 1
python-version: 3.8
build-depends: numpy==1.21.0
build-depends: numpy==1.17.3

- os: macos-latest
travis-os-name: osx
manylinux-version: 1
python-version: 3.9
build-depends: numpy==1.21.0
build-depends: numpy==1.19.3

- os: windows-latest
manylinux-version: 2010
@@ -114,7 +114,7 @@ jobs:
PLAT: x86_64
UNICODE_WIDTH: 32
MB_PYTHON_VERSION: ${{ matrix.python-version }} # MB_PYTHON_VERSION is needed by Multibuild
TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest mock cython nmslib pyemd testfixtures scikit-learn pyemd
TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest pytest-cov mock cython nmslib pyemd testfixtures scikit-learn pyemd
DOCKER_TEST_IMAGE: multibuild/xenial_x86_64
TRAVIS_OS_NAME: ${{ matrix.travis-os-name }}
SKIP_NETWORK_TESTS: 1
26 changes: 20 additions & 6 deletions .github/workflows/tests.yml
@@ -20,6 +20,9 @@ jobs:
- {name: Linux, python: 3.6, os: ubuntu-20.04, tox: 'py36-linux'}
- {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'py37-linux'}
- {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux'}
- {name: Windows, python: 3.6, os: windows-2019, tox: 'py36-win'}
- {name: Windows, python: 3.7, os: windows-2019, tox: 'py37-win'}
- {name: Windows, python: 3.8, os: windows-2019, tox: 'py38-win'}
env:
TOX_PARALLEL_NO_SPINNER: 1

@@ -38,24 +41,35 @@ jobs:
# https://www.scala-sbt.org/1.x/docs/Installing-sbt-on-Linux.html
#
- name: Update sbt
if: matrix.os == 'ubuntu-20.04'
run: |
echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add
sudo apt-get update -y
sudo apt-get install -y sbt
- name: Install tox, gdb
- name: Install tox
run: pip install tox
- name: Install GDB & enable core dumps
if: matrix.os == 'ubuntu-20.04'
run: |
pip install tox
sudo apt-get update -y
sudo apt-get install -y gdb
- name: Enable core dumps
run: ulimit -c unlimited -S # enable core dumps
ulimit -c unlimited -S # enable core dumps
- name: Run tox tests
run: tox -e ${{ matrix.tox }}
- name: Upload coverage to Codecov
if: matrix.os == 'ubuntu-20.04' && matrix.python == '3.8'
uses: codecov/codecov-action@v2
with:
fail_ci_if_error: true
files: ./coverage.xml
verbose: true


- name: Collect corefile
if: ${{ failure() }}
if: ${{ failure() }} && matrix.os == 'ubuntu-20.04'
run: |
pwd
COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1)
if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi
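The guarded one-liner above is dense; unpacked into multiple lines (same commands, shown this way purely for readability; `continuous_integration/debug.gdb` is the repo's gdb batch script) it reads:

```shell
# Find at most one core file in the working directory.
COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1)
if [[ -f "$COREFILE" ]]; then
    # Ask gdb which executable produced the core ("Core was generated by `...`"),
    # stripping backticks and taking the fifth space-separated field.
    EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5)
    file "$COREFILE"
    # Dump a backtrace using the repo's gdb script.
    gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch
fi
```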
21 changes: 13 additions & 8 deletions .travis.yml
@@ -20,13 +20,6 @@ env:
- MB_ML_VER=2014
- SKIP_NETWORK_TESTS=1
- DOCKER_TEST_IMAGE=multibuild/xenial_arm64v8
#
# Build wheels with the oldest possible numpy version to avoid
# the problem encountered by:
#
# https://github.com/RaRe-Technologies/gensim/issues/3085
#
- BUILD_DEPENDS="oldest-supported-numpy scipy==1.7.0"
#
# The contents of this file mirror the linux_testenv list
# in gensim's setup.py. I can't think of a better way to include
@@ -36,23 +29,35 @@ env:
- TEST_DEPENDS="pytest mock cython nmslib pyemd testfixtures Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 scikit-learn"

matrix:
#
# See .github/workflows/build-wheels.yml for a discussion of why we
# handle numpy versions explicitly.
#
- os: linux
env:
- MB_PYTHON_VERSION=3.6
#
# scipy 1.7.0 wheels not available for Py3.6, so we have to build using
# an older version.
#
- BUILD_DEPENDS="oldest-supported-numpy scipy==1.5.3"
- BUILD_DEPENDS="numpy==1.19.2 scipy==1.5.3"
- os: linux
env:
- MB_PYTHON_VERSION=3.7
- BUILD_DEPENDS="numpy==1.19.2 scipy==1.7.0"
- os: linux
env:
- MB_PYTHON_VERSION=3.8
- BUILD_DEPENDS="numpy==1.19.2 scipy==1.7.0"
- os: linux
env:
- MB_PYTHON_VERSION=3.9
#
# oldest-supported-numpy does not seem to handle this particular case
# (aarch64, Py3.9) explicitly, but I've double-checked that wheels for
# this numpy release are available via PyPI.
#
- BUILD_DEPENDS="numpy==1.19.3 scipy==1.7.0"

before_install:
- source multibuild/common_utils.sh
11 changes: 10 additions & 1 deletion CHANGELOG.md
@@ -3,7 +3,16 @@ Changes

## Unreleased

[#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec)
* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola)
* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv)
* [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse)
* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec)
* [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv)
* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky)
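The `Sparse2Corpus.__getitem__` entry above (#3247) amounts to type-based dispatch on the index. A minimal toy sketch of the pattern (a stand-in class, not gensim's actual implementation, which wraps a scipy sparse matrix):

```python
class ToyCorpus:
    """Toy stand-in illustrating __getitem__ dispatch on int, slice, list and Ellipsis."""

    def __init__(self, docs):
        self.docs = list(docs)

    def __getitem__(self, key):
        if isinstance(key, int):
            return self.docs[key]                       # single document
        if key is Ellipsis:
            return ToyCorpus(self.docs)                 # the whole corpus
        if isinstance(key, slice):
            return ToyCorpus(self.docs[key])            # a sub-corpus
        if isinstance(key, (list, tuple)):
            return ToyCorpus([self.docs[i] for i in key])
        raise TypeError("unsupported index type: %r" % type(key))


corpus = ToyCorpus([[(0, 1.0)], [(1, 2.0)], [(2, 3.0)]])
print(corpus[0])            # [(0, 1.0)]
print(corpus[1:].docs)      # [[(1, 2.0)], [(2, 3.0)]]
print(corpus[[0, 2]].docs)  # [[(0, 1.0)], [(2, 3.0)]]
```

An integer index returns one document; every other supported index returns a new corpus object, mirroring the PR's behaviour.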

## 4.1.2, 2021-09-17

This is a bugfix release that addresses leftover compatibility issues with older versions of numpy and macOS.

## 4.1.1, 2021-09-14

1 change: 0 additions & 1 deletion README.md
@@ -176,4 +176,3 @@ BibTeX entry:
[OpenBLAS]: http://xianyi.github.io/OpenBLAS/
[source tar.gz]: http://pypi.python.org/pypi/gensim
[documentation]: http://radimrehurek.com/gensim/install.html

17 changes: 17 additions & 0 deletions SECURITY.md
@@ -0,0 +1,17 @@
# Security Policy

## Supported Versions

The following versions of gensim are currently supported with security updates.

| Version | Supported |
| ------- | ------------------ |
| 4.x     | :white_check_mark: |
| < 4.0 | :x: |

## Reporting a Vulnerability

Open a ticket and add the "security" label to it.
Describe the vulnerability in general.
We'll reach out to you for specifics.
32 changes: 0 additions & 32 deletions azure-pipelines.yml

This file was deleted.

10 changes: 10 additions & 0 deletions continuous_integration/BucketLifecycleConfiguration.json
@@ -0,0 +1,10 @@
{
"Rules": [
{
"Expiration": {"Days": 30},
"Filter": {"Prefix": ""},
"ID": "Delete all files older than 30 days to save storage costs",
"Status": "Enabled"
}
]
}
15 changes: 15 additions & 0 deletions continuous_integration/BucketLifecycleConfiguration.txt
@@ -0,0 +1,15 @@
JSON files can't have comments, so this file is here to explain the rules in BucketLifecycleConfiguration.json.

Our CI puts wheels in a publicly readable, privately writable S3 bucket (s3://gensim-wheels).
These wheels can be for gensim releases, in which case we fetch them and push them to PyPI when making a release.
Once the wheels are on PyPI, we don't need to keep our own copy.

These wheels can also be development wheels: we currently build wheels on every push to develop.
These can be helpful when tracking down a problem, but they can also build up quickly, consume storage space and contribute to AWS costs.

So, we delete all files in the gensim-wheels bucket every 30 days.
We rarely need to access wheels that are several months old, anyway.

If you modify the JSON configuration, then you can update it using the command:

aws --profile smart_open s3api put-bucket-lifecycle-configuration --bucket gensim-wheels --lifecycle-configuration file://continuous_integration/BucketLifecycleConfiguration.json
2 changes: 1 addition & 1 deletion docs/src/conf.py
@@ -63,7 +63,7 @@
# The short X.Y version.
version = '4.1'
# The full version, including alpha/beta/rc tags.
release = '4.1.1'
release = '4.1.3.dev0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
2 changes: 1 addition & 1 deletion gensim/__init__.py
@@ -4,7 +4,7 @@
"""

__version__ = '4.1.1'
__version__ = '4.1.3.dev0'

import logging

4 changes: 3 additions & 1 deletion gensim/corpora/dictionary.py
@@ -143,7 +143,9 @@ def __len__(self):

def __str__(self):
some_keys = list(itertools.islice(self.token2id.keys(), 5))
return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')
return "%s<%i unique tokens: %s%s>" % (
self.__class__.__name__, len(self), some_keys, '...' if len(self) > 5 else ''
)
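The delimiter change here follows the new repo-wide convention (#3251): instance parameters in `__str__` are wrapped in `ClassName<...>` rather than `ClassName(...)`. A minimal sketch of the pattern with a toy stand-in class (not gensim's full `Dictionary`):

```python
class Dictionary:
    """Toy stand-in demonstrating the ClassName<params> __str__ convention."""

    def __init__(self, token2id):
        self.token2id = token2id  # token -> integer id mapping

    def __str__(self):
        # Show at most the first 5 tokens, with an ellipsis if there are more.
        some_keys = list(self.token2id)[:5]
        return "%s<%i unique tokens: %s%s>" % (
            self.__class__.__name__, len(self.token2id), some_keys,
            '...' if len(self.token2id) > 5 else ''
        )


d = Dictionary({'human': 0, 'computer': 1})
print(d)  # Dictionary<2 unique tokens: ['human', 'computer']>
```

Using `self.__class__.__name__` instead of a hard-coded name means subclasses report themselves correctly for free.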

@staticmethod
def from_documents(documents):
5 changes: 3 additions & 2 deletions gensim/examples/dmlcz/dmlcorpus.py
@@ -59,8 +59,9 @@ def addSource(self, source):
self.sources[sourceId] = source

def __str__(self):
return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" %
(self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs)))
return "%s<id=%s, sources=[%s], acceptLangs=[%s]>" % (
self.__class__.__name__, self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs)
)
# endclass DmlConfig


4 changes: 2 additions & 2 deletions gensim/models/atmodel.py
@@ -331,8 +331,8 @@ def __str__(self):
String representation of current instance.
"""
return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \
(self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)
return "%s<num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s>" % \
(self.__class__.__name__, self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)

def init_empty_corpus(self):
"""Initialize an empty corpus.
6 changes: 3 additions & 3 deletions gensim/models/doc2vec.py
@@ -130,7 +130,7 @@ def __str__(self):
Human readable representation of the object's state (words and tags).
"""
return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags)
return '%s<%s, %s>' % (self.__class__.__name__, self.words, self.tags)


@dataclass
@@ -494,7 +494,7 @@ def train(
"""
if corpus_file is None and corpus_iterable is None:
raise TypeError("Either one of corpus_file or documents value must be provided")
raise TypeError("Either one of corpus_file or corpus_iterable value must be provided")

if corpus_file is not None and corpus_iterable is not None:
raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time")
@@ -713,7 +713,7 @@ def __str__(self):
segments.append('s%g' % self.sample)
if self.workers > 1:
segments.append('t%d' % self.workers)
return '%s(%s)' % (self.__class__.__name__, ','.join(segments))
return '%s<%s>' % (self.__class__.__name__, ','.join(segments))

def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
"""Store the input-hidden weight matrix in the same format used by the original C word2vec-tool.
8 changes: 4 additions & 4 deletions gensim/models/fasttext.py
@@ -38,8 +38,8 @@
>>> print(len(common_texts))
9
>>> model = FastText(vector_size=4, window=3, min_count=1) # instantiate
>>> model.build_vocab(sentences=common_texts)
>>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10) # train
>>> model.build_vocab(corpus_iterable=common_texts)
>>> model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10) # train
Once you have a model, you can access its keyed vectors via the `model.wv` attributes.
The keyed vectors instance is quite powerful: it can perform a wide range of NLP tasks.
@@ -108,9 +108,9 @@
>>>
>>>
>>> model4 = FastText(vector_size=4, window=3, min_count=1)
>>> model4.build_vocab(sentences=MyIter())
>>> model4.build_vocab(corpus_iterable=MyIter())
>>> total_examples = model4.corpus_count
>>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
>>> model4.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=5)
Persist a model to disk with:
2 changes: 1 addition & 1 deletion gensim/models/keyedvectors.py
@@ -1804,7 +1804,7 @@ def __lt__(self, other): # used for sorting in a priority queue

def __str__(self):
vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')]
return "%s(%s)" % (self.__class__.__name__, ', '.join(vals))
return "%s<%s>" % (self.__class__.__name__, ', '.join(vals))


# compatibility alias, allowing older pickle-based `.save()`s to load
4 changes: 2 additions & 2 deletions gensim/models/ldamodel.py
@@ -615,8 +615,8 @@ def __str__(self):
Human readable representation of the most important model parameters.
"""
return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
self.num_terms, self.num_topics, self.decay, self.chunksize
return "%s<num_terms=%s, num_topics=%s, decay=%s, chunksize=%s>" % (
self.__class__.__name__, self.num_terms, self.num_topics, self.decay, self.chunksize
)

def sync_state(self, current_Elogbeta=None):
2 changes: 1 addition & 1 deletion gensim/models/logentropy_model.py
@@ -76,7 +76,7 @@ def __init__(self, corpus, normalize=True):
self.initialize(corpus)

def __str__(self):
return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words)
return "%s<n_docs=%s, n_words=%s>" % (self.__class__.__name__, self.n_docs, self.n_words)

def initialize(self, corpus):
"""Calculates the global weighting for all terms in a given corpus and transforms the simple