From 10f90444832ff0ae1d57f73a2074c884124c50c6 Mon Sep 17 00:00:00 2001
From: Yusuke Matsui
Date: Fri, 31 Aug 2018 11:11:08 +0900
Subject: [PATCH 1/4] bug of print functions. some docs update

---
 README.md                | 21 ++++++++++---------
 docs/source/tips.rst     | 44 +++++++++++++++++++++++++++++++---------
 docs/source/tutorial.rst |  7 +++++--
 rii/rii.py               |  5 ++++-
 4 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 8a59084..8c0101b 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@


-Reconfigurable Inverted Index (Rii): fast and memory efficient approximate nearest neighbor search method
+Reconfigurable Inverted Index (Rii): IVFPQ-based fast and memory efficient approximate nearest neighbor search method with a subset-search functionality.

 Reference:

@@ -19,9 +19,9 @@ Reference:
 ![](http://yusukematsui.me/project/rii/img/teaser1.png)  |  ![](http://yusukematsui.me/project/rii/img/teaser2.png)
 :---:|:---:
 The search can be operated for a subset of a database.  |  Rii remains fast even after many new items are added.

-- Fast and memory efficient ANN. Can handle billion-scale data on memory at once. The search is less than 10 ms.
-- Can run the search over a **subset** of the whole database
-- Remain fast even after a large number of vectors are newly added (i.e., the data structure can be **reconfigured**)
+- Fast and memory efficient ANN. Rii enables you to run billion-scale search in less than 10 ms.
+- You can run the search over a **subset** of the whole database
+- Rii remains fast even after many vectors are newly added (i.e., the data structure can be **reconfigured**)

 ## Installing

@@ -62,12 +62,12 @@ e.add_configure(vecs=X)
 ids, dists = e.query(q=q, topk=3)
 print(ids, dists) # e.g., [7484 8173 1556] [15.06257439 15.38533878 16.16935158]
 ```
-Note that, if you want, you can construct a codec at the same time as the instantiation of the Rii class
+Note that you can construct a PQ codec and instantiate the Rii class at the same time if you want.
 ```python
 e = rii.Rii(fine_quantizer=nanopq.PQ(M=32).fit(vecs=Xt))
 e.add_configure(vecs=X)
 ```
-Furthermore, you can even construct the class and add the vectors in one line
+Furthermore, you can even write them in one line by chaining the function calls.
 ```python
 e = rii.Rii(fine_quantizer=nanopq.PQ(M=32).fit(vecs=Xt)).add_configure(vecs=X)
 ```
@@ -109,7 +109,7 @@ with open('rii.pkl', 'rb') as f:
     e_dumped = pickle.load(f) # e_dumped is identical to e
 ```

-### Utils
+### Util functions
 ```python
 # Print the current parameters
 e.print_params()
@@ -117,15 +117,16 @@ e.print_params()
 # Delete all PQ-codes and posting lists. fine_quantizer is kept.
 e.clear()

+# You can switch the verbose flag
+e.verbose = False
+
 # You can merge two Rii instances if they have the same fine_quantizer
 e1 = rii.Rii(fine_quantizer=codec)
 e2 = rii.Rii(fine_quantizer=codec)
 e1.add_reconfigure(vecs=X1)
 e2.add_reconfigure(vecs=X2)
-e1.merge(e2)  # e1 will have (PQ-codes of) both X1 and X2
+e1.merge(e2)  # Now e1 contains both X1 and X2

-# You can switch the verbose flag
-e.verbose = False
 ```

 ## [Examples](./examples)

diff --git a/docs/source/tips.rst b/docs/source/tips.rst
index b80635c..effa271 100644
--- a/docs/source/tips.rst
+++ b/docs/source/tips.rst
@@ -39,28 +39,32 @@ Some useful tips for tuning of search parameters:

 .. _sequential_add:

-Initializing a Rii class by adding vectors sequentially
+Adding vectors sequentially
 --------------------------------------------------------

-For the first data addition, one might want to add vectors one by one.
+You might want to add vectors one by one. There are two ways to achieve that. The first option is simply calling :func:`rii.Rii.add_configure` every time.

 .. code-block:: python

+    # Suppose X is a set of vectors (np.ndarray with the shape (N, D))
     e = rii.Rii(fine_quantizer=codec)
     for x in X:
         e.add_configure(vecs=x.reshape(1, -1))  # Don't forget reshaping (D, ) to (1, D)

-This works perfectly. But this would take time if you would like to add many vectors
-by this way.
-It is because the reconfigure function is called (i.e., posting lists are computed from
-scrath) whenever each vector ``x`` is added.
+This works perfectly.
+But this would take time if you would like to add many vectors in this way.
+It is because the :func:`rii.Rii.reconfigure` function is called
+(inside :func:`rii.Rii.add_configure`) whenever a new vector ``x`` is added.
+The reconfiguration step creates posting lists from scratch,
+which does not need to be run for every addition.

-Alternatively, you can call :func:`add` for each ``x`` without updating
+
+Alternatively, you can call :func:`rii.Rii.add` for each ``x`` without updating
 the posting lists, and run
-:func:`reconfigure` finally.
+:func:`rii.Rii.reconfigure` finally.

 .. code-block:: python

@@ -69,8 +73,28 @@ the posting lists, and run
         e.add(vecs=x.reshape(1, -1))  # Don't forget reshaping (D, ) to (1, D)
     e.reconfigure()

-This is much faster. The final result of both ways are same.
-But you must call :func:`rii.Rii.reconfigure` in the final step to create posting lists.
+This is much faster. The final results from both ways are identical.
+Please remember that you must call :func:`rii.Rii.reconfigure` in the final step to create posting lists.
+
+Note that, if you receive your data in batches, they can be handled in the same manner:
+
+.. code-block:: python
+
+    # X1 is a set of vectors (batch). Xs is a set of batches.
+    # You might receive Xs as a generator/iterator
+    # because the whole Xs is too large to read on memory at once
+    Xs = [X1, X2, X3]
+
+    # Running "add_configure" every time
+    e1 = rii.Rii(fine_quantizer=codec)
+    for X in Xs:
+        e1.add_configure(vecs=X)
+
+    # Or, you can run "add" for each batch, and finally run "reconfigure"
+    e2 = rii.Rii(fine_quantizer=codec)
+    for X in Xs:
+        e2.add(vecs=X)
+    e2.reconfigure()


diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index 0e0efd5..58d2fee 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -72,6 +72,10 @@ Compared to PQ, OPQ is little bit slower for encoding/searching but slightly mor
     # Prepare a PQ/OPQ codec with M=32 sub spaces
     codec = nanopq.PQ(M=32, Ks=256, verbose=True).fit(vecs=Xt)  # Trained using Xt

+Here, ``M`` is a parameter to control the runtime, accuracy, and memory consumption.
+Each input vector is later divided into ``M`` parts (hence ``D`` must be divisible by ``M``).
+With a larger ``M`` value, the search becomes more accurate but slower, with a larger memory footprint.
+Another parameter, ``Ks``, can be 256 for usual cases.
 See `the tutorial of nanopq `_ for more details about the parameter selection of the codec.

 Note that you can use ``X`` or the part of ``X`` for training if you
@@ -122,8 +126,7 @@ Inside this function, :func:`rii.Rii.add` and :func:`rii.Rii.reconfigure` are ca
 Make sure that you must call :func:`rii.Rii.add_configure` (not :func:`rii.Rii.add`) for the first data addition.
 It is because you need to create coarse centers (posting lists).
-Note that, if you would like to add vectors sequentially
-when constructing the class, please refer this; :ref:`sequential_add`
+Note that, if you would like to add vectors sequentially, please refer to :ref:`sequential_add`.

 .. hint::

diff --git a/rii/rii.py b/rii/rii.py
index 875c433..7e802e0 100644
--- a/rii/rii.py
+++ b/rii/rii.py
@@ -338,7 +338,10 @@ def print_params(self):
         print("nlist:", self.nlist)
         print("L0:", self.L0)
         print("cordwords.shape:", self.codewords.shape)
-        print("coarse_centers.shape:", self.coarse_centers.shape)
+        if self.nlist == 0:
+            print("coarse_centers.shape:", None)
+        else:
+            print("coarse_centers.shape:", self.coarse_centers.shape)
         if self.codes is None:
             print("codes.shape:", None)

From beca908ea0c43815cdf29b894f6d5e7b627a41ff Mon Sep 17 00:00:00 2001
From: Yusuke Matsui
Date: Fri, 31 Aug 2018 11:13:33 +0900
Subject: [PATCH 2/4] updated changelog

---
 docs/source/changelog.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index b5ca2e5..b19a87a 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -6,6 +6,7 @@ v0.2.2 (August XX, 2018)
 - `#14 `_ Build on Mac with clang (without OpenMP)
 - `#16 `_ SIMD implementation for squared L2 distance (SSE, AVX, and AVX512)
 - `#18 `_ Implemented a merge function
+- `#20 `_ Bug fix

 v0.2.1 (August 24, 2018)
 ----------------------------

From 347a310df169a2415f6c76c08d8d27777c7c0903 Mon Sep 17 00:00:00 2001
From: Yusuke Matsui
Date: Fri, 31 Aug 2018 11:27:27 +0900
Subject: [PATCH 3/4] v0.2.2

---
 docs/source/changelog.rst | 2 +-
 setup.py                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index b19a87a..53229ea 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -1,7 +1,7 @@
 Changelog
 =============

-v0.2.2 (August XX, 2018)
+v0.2.2 (August 31, 2018)
 ----------------------------
 - `#14 `_ Build on Mac with clang (without OpenMP)
 - `#16 `_ SIMD implementation for squared L2 distance (SSE, AVX, and AVX512)

diff --git a/setup.py b/setup.py
index 5f33112..f77c3b4 100644
--- a/setup.py
+++ b/setup.py
@@ -105,7 +105,7 @@ def build_extensions(self):

 setup(
     name='rii',
-    version='0.2.1',
+    version='0.2.2',
     description='Fast and memory-efficient ANN with a subset-search functionality',
     long_description=readme,
     long_description_content_type='text/markdown',

From 00b1e8ae600c1bda8e6dd394066de0c8d3aff729 Mon Sep 17 00:00:00 2001
From: Yusuke Matsui
Date: Fri, 31 Aug 2018 11:35:04 +0900
Subject: [PATCH 4/4] pypi version badge

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 8c0101b..fc31f8c 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@

 [![Build Status](https://travis-ci.org/matsui528/rii.svg?branch=master)](https://travis-ci.org/matsui528/rii)
 [![Documentation Status](https://readthedocs.org/projects/rii/badge/?version=latest)](https://rii.readthedocs.io/en/latest/?badge=latest)
+[![PyPI version](https://badge.fury.io/py/rii.svg)](https://badge.fury.io/py/rii)