From ddea06f2766637f872a776eb987a30b7da3cf977 Mon Sep 17 00:00:00 2001 From: Dmitry Parfenchik Date: Sun, 5 Apr 2020 21:30:18 +0200 Subject: [PATCH 1/2] [L2Space] Perf improvement for dimension not of factor 4 and 16 Currently SIMD (SSE or AVX) is used for the cases when dimension is multiple of 4 or 16, when dimension size is not strictly equal to multiple of 4 or 16 a slower non-vectorized method is used. To improve performance for these cases new methods are added: `L2SqrSIMD(4|16)ExtResidual` - relies on existing `L2SqrSIMD(4|16)Ext` to compute up to *4 and *16 dimensions and finishes residual computation by method `L2Sqr`. Performance improvement compared to baseline is x3-4 times depending on dimension. Benchmark results: Run on (4 X 3300 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 256 KiB (x2) L3 Unified 4096 KiB (x1) Load Average: 2.18, 2.35, 3.88 ----------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------- TstDim65 14.7 ns 14.7 ns 20 * 47128209 RefDim65 50.2 ns 50.1 ns 20 * 10373751 TstDim101 24.7 ns 24.7 ns 20 * 28064436 RefDim101 90.4 ns 90.2 ns 20 * 7592191 TstDim129 31.4 ns 31.3 ns 20 * 22397921 RefDim129 125 ns 124 ns 20 * 5548862 TstDim257 59.3 ns 59.2 ns 20 * 10856753 RefDim257 266 ns 266 ns 20 * 2630926 --- hnswlib/space_l2.h | 79 +++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index 4d3ac69a..bc00af72 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -4,16 +4,19 @@ namespace hnswlib { static float - L2Sqr(const void *pVect1, const void *pVect2, const void *qty_ptr) { - //return *((float *)pVect2); + L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + float *pVect1 = (float *) pVect1v; + float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); + float res = 0; - for (unsigned i = 
0; i < qty; i++) { - float t = ((float *) pVect1)[i] - ((float *) pVect2)[i]; + for (size_t i = 0; i < qty; i++) { + float t = *pVect1 - *pVect2; + pVect1++; + pVect2++; res += t * t; } return (res); - } #if defined(USE_AVX) @@ -49,10 +52,8 @@ namespace hnswlib { } _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - - return (res); -} + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + } #elif defined(USE_SSE) @@ -62,12 +63,9 @@ namespace hnswlib { float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); float PORTABLE_ALIGN32 TmpRes[8]; - // size_t qty4 = qty >> 2; size_t qty16 = qty >> 4; const float *pEnd1 = pVect1 + (qty16 << 4); - // const float* pEnd2 = pVect1 + (qty4 << 2); - // const float* pEnd3 = pVect1 + qty; __m128 diff, v1, v2; __m128 sum = _mm_set1_ps(0); @@ -102,10 +100,24 @@ namespace hnswlib { diff = _mm_sub_ps(v1, v2); sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } + _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } +#endif - return (res); +#if defined(USE_SSE) || defined(USE_AVX) + static float + L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty16 = qty >> 4 << 4; + float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16); + float *pVect1 = (float *) pVect1v + qty16; + float *pVect2 = (float *) pVect2v + qty16; + + size_t qty_left = qty - qty16; + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + return (res + res_tail); } #endif @@ -119,10 +131,9 @@ namespace hnswlib { size_t qty = *((size_t *) qty_ptr); - // size_t qty4 = qty >> 2; - size_t qty16 = qty >> 2; + size_t qty4 = qty >> 2; - const float *pEnd1 = pVect1 + (qty16 << 2); + const float *pEnd1 = pVect1 + (qty4 << 2); __m128 diff, v1, v2; __m128 sum 
= _mm_set1_ps(0); @@ -136,9 +147,22 @@ namespace hnswlib { sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } - return (res); + static float + L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty4 = qty >> 2 << 2; + + float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4); + size_t qty_left = qty - qty4; + + float *pVect1 = (float *) pVect1v + qty4; + float *pVect2 = (float *) pVect2v + qty4; + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + + return (res + res_tail); } #endif @@ -151,13 +175,14 @@ namespace hnswlib { L2Space(size_t dim) { fstdistfunc_ = L2Sqr; #if defined(USE_SSE) || defined(USE_AVX) - if (dim % 4 == 0) - fstdistfunc_ = L2SqrSIMD4Ext; if (dim % 16 == 0) fstdistfunc_ = L2SqrSIMD16Ext; - /*else{ - throw runtime_error("Data type not supported!"); - }*/ + else if (dim % 4 == 0) + fstdistfunc_ = L2SqrSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = L2SqrSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = L2SqrSIMD4ExtResiduals; #endif dim_ = dim; data_size_ = dim * sizeof(float); @@ -185,10 +210,6 @@ namespace hnswlib { int res = 0; unsigned char *a = (unsigned char *) pVect1; unsigned char *b = (unsigned char *) pVect2; - /*for (int i = 0; i < qty; i++) { - int t = int((a)[i]) - int((b)[i]); - res += t*t; - }*/ qty = qty >> 2; for (size_t i = 0; i < qty; i++) { @@ -241,4 +262,4 @@ namespace hnswlib { }; -} +} \ No newline at end of file From 30ac4c574df9b699c656d7a92276e312ea356f6a Mon Sep 17 00:00:00 2001 From: Dmitry Parfenchik Date: Sun, 19 Apr 2020 09:05:14 +0200 Subject: [PATCH 2/2] [InnerProductSpace] Perf improvement for dimension not of factor 4 and 16 Currently SIMD (SSE or AVX) is used for the cases when dimension is multiple of 4 or 16, when dimension size is not strictly equal to multiple of 4 or 16 a 
slower non-vectorized method is used. To improve performance for these cases new methods are added: `InnerProductSIMD(4|16)ExtResidual` - relies on existing `InnerProductSIMD(4|16)Ext` to compute up to *4 and *16 dimensions and finishes residual computation by non-vectorized method `InnerProduct`. Performance improvement compared to baseline is x3-4 times depending on dimension. Benchmark results: Run on (4 X 3300 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 256 KiB (x2) L3 Unified 4096 KiB (x1) Load Average: 2.10, 2.25, 2.46 ---------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------- TstDim65 14.0 ns 14.0 ns 20 * 48676012 RefDim65 50.3 ns 50.2 ns 20 * 12907985 TstDim101 23.8 ns 23.8 ns 20 * 27976276 RefDim101 91.4 ns 91.3 ns 20 * 7364003 TstDim129 30.0 ns 30.0 ns 20 * 23413955 RefDim129 123 ns 123 ns 20 * 5656383 TstDim257 57.8 ns 57.7 ns 20 * 11263073 RefDim257 268 ns 267 ns 20 * 2617478 --- hnswlib/space_ip.h | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/hnswlib/space_ip.h b/hnswlib/space_ip.h index e9467473..d0497ff7 100644 --- a/hnswlib/space_ip.h +++ b/hnswlib/space_ip.h @@ -211,6 +211,36 @@ namespace hnswlib { #endif +#if defined(USE_SSE) || defined(USE_AVX) + static float + InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty16 = qty >> 4 << 4; + float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16); + float *pVect1 = (float *) pVect1v + qty16; + float *pVect2 = (float *) pVect2v + qty16; + + size_t qty_left = qty - qty16; + float res_tail = InnerProduct(pVect1, pVect2, &qty_left); + return res + res_tail - 1.0f; + } + + static float + InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t 
qty4 = qty >> 2 << 2; + + float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4); + size_t qty_left = qty - qty4; + + float *pVect1 = (float *) pVect1v + qty4; + float *pVect2 = (float *) pVect2v + qty4; + float res_tail = InnerProduct(pVect1, pVect2, &qty_left); + + return res + res_tail - 1.0f; + } +#endif + class InnerProductSpace : public SpaceInterface { DISTFUNC fstdistfunc_; @@ -220,11 +250,15 @@ namespace hnswlib { InnerProductSpace(size_t dim) { fstdistfunc_ = InnerProduct; #if defined(USE_AVX) || defined(USE_SSE) - if (dim % 4 == 0) - fstdistfunc_ = InnerProductSIMD4Ext; if (dim % 16 == 0) fstdistfunc_ = InnerProductSIMD16Ext; -#endif + else if (dim % 4 == 0) + fstdistfunc_ = InnerProductSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = InnerProductSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = InnerProductSIMD4ExtResiduals; + #endif dim_ = dim; data_size_ = dim * sizeof(float); }