From ddea06f2766637f872a776eb987a30b7da3cf977 Mon Sep 17 00:00:00 2001 From: Dmitry Parfenchik Date: Sun, 5 Apr 2020 21:30:18 +0200 Subject: [PATCH 1/2] [L2Space] Perf improvement for dimension not of factor 4 and 16 Currently SIMD (SSE or AVX) is used for the cases when dimension is multiple of 4 or 16, when dimension size is not strictly equal to multiple of 4 or 16 a slower non-vectorized method is used. To improve performance for these cases new methods are added: `L2SqrSIMD(4|16)ExtResidual` - relies on existing `L2SqrSIMD(4|16)Ext` to compute up to *4 and *16 dimensions and finishes residual computation by method `L2Sqr`. Performance improvement compared to baseline is x3-4 times depending on dimension. Benchmark results: Run on (4 X 3300 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 256 KiB (x2) L3 Unified 4096 KiB (x1) Load Average: 2.18, 2.35, 3.88 ----------------------------------------------------------- Benchmark Time CPU Iterations ----------------------------------------------------------- TstDim65 14.7 ns 14.7 ns 20 * 47128209 RefDim65 50.2 ns 50.1 ns 20 * 10373751 TstDim101 24.7 ns 24.7 ns 20 * 28064436 RefDim101 90.4 ns 90.2 ns 20 * 7592191 TstDim129 31.4 ns 31.3 ns 20 * 22397921 RefDim129 125 ns 124 ns 20 * 5548862 TstDim257 59.3 ns 59.2 ns 20 * 10856753 RefDim257 266 ns 266 ns 20 * 2630926 --- hnswlib/space_l2.h | 79 +++++++++++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index 4d3ac69a..bc00af72 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -4,16 +4,19 @@ namespace hnswlib { static float - L2Sqr(const void *pVect1, const void *pVect2, const void *qty_ptr) { - //return *((float *)pVect2); + L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + float *pVect1 = (float *) pVect1v; + float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); + float res = 0; - for (unsigned i = 
0; i < qty; i++) { - float t = ((float *) pVect1)[i] - ((float *) pVect2)[i]; + for (size_t i = 0; i < qty; i++) { + float t = *pVect1 - *pVect2; + pVect1++; + pVect2++; res += t * t; } return (res); - } #if defined(USE_AVX) @@ -49,10 +52,8 @@ namespace hnswlib { } _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - - return (res); -} + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + } #elif defined(USE_SSE) @@ -62,12 +63,9 @@ namespace hnswlib { float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); float PORTABLE_ALIGN32 TmpRes[8]; - // size_t qty4 = qty >> 2; size_t qty16 = qty >> 4; const float *pEnd1 = pVect1 + (qty16 << 4); - // const float* pEnd2 = pVect1 + (qty4 << 2); - // const float* pEnd3 = pVect1 + qty; __m128 diff, v1, v2; __m128 sum = _mm_set1_ps(0); @@ -102,10 +100,24 @@ namespace hnswlib { diff = _mm_sub_ps(v1, v2); sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } + _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } +#endif - return (res); +#if defined(USE_SSE) || defined(USE_AVX) + static float + L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty16 = qty >> 4 << 4; + float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16); + float *pVect1 = (float *) pVect1v + qty16; + float *pVect2 = (float *) pVect2v + qty16; + + size_t qty_left = qty - qty16; + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + return (res + res_tail); } #endif @@ -119,10 +131,9 @@ namespace hnswlib { size_t qty = *((size_t *) qty_ptr); - // size_t qty4 = qty >> 2; - size_t qty16 = qty >> 2; + size_t qty4 = qty >> 2; - const float *pEnd1 = pVect1 + (qty16 << 2); + const float *pEnd1 = pVect1 + (qty4 << 2); __m128 diff, v1, v2; __m128 sum 
= _mm_set1_ps(0); @@ -136,9 +147,22 @@ namespace hnswlib { sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } - return (res); + static float + L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty4 = qty >> 2 << 2; + + float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4); + size_t qty_left = qty - qty4; + + float *pVect1 = (float *) pVect1v + qty4; + float *pVect2 = (float *) pVect2v + qty4; + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + + return (res + res_tail); } #endif @@ -151,13 +175,14 @@ namespace hnswlib { L2Space(size_t dim) { fstdistfunc_ = L2Sqr; #if defined(USE_SSE) || defined(USE_AVX) - if (dim % 4 == 0) - fstdistfunc_ = L2SqrSIMD4Ext; if (dim % 16 == 0) fstdistfunc_ = L2SqrSIMD16Ext; - /*else{ - throw runtime_error("Data type not supported!"); - }*/ + else if (dim % 4 == 0) + fstdistfunc_ = L2SqrSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = L2SqrSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = L2SqrSIMD4ExtResiduals; #endif dim_ = dim; data_size_ = dim * sizeof(float); @@ -185,10 +210,6 @@ namespace hnswlib { int res = 0; unsigned char *a = (unsigned char *) pVect1; unsigned char *b = (unsigned char *) pVect2; - /*for (int i = 0; i < qty; i++) { - int t = int((a)[i]) - int((b)[i]); - res += t*t; - }*/ qty = qty >> 2; for (size_t i = 0; i < qty; i++) { @@ -241,4 +262,4 @@ namespace hnswlib { }; -} +} \ No newline at end of file From 30ac4c574df9b699c656d7a92276e312ea356f6a Mon Sep 17 00:00:00 2001 From: Dmitry Parfenchik Date: Sun, 19 Apr 2020 09:05:14 +0200 Subject: [PATCH 2/2] [InnerProductSpace] Perf improvement for dimension not of factor 4 and 16 Currently SIMD (SSE or AVX) is used for the cases when dimension is multiple of 4 or 16, when dimension size is not strictly equal to multiple of 4 or 16 a 
slower non-vectorized method is used. To improve performance for these cases new methods are added: `InnerProductSIMD(4|16)ExtResidual` - relies on existing `InnerProductSIMD(4|16)Ext` to compute up to *4 and *16 dimensions and finishes residual computation by non-vectorized method `InnerProduct`. Performance improvement compared to baseline is x3-4 times depending on dimension. Benchmark results: Run on (4 X 3300 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 256 KiB (x2) L3 Unified 4096 KiB (x1) Load Average: 2.10, 2.25, 2.46 ---------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------- TstDim65 14.0 ns 14.0 ns 20 * 48676012 RefDim65 50.3 ns 50.2 ns 20 * 12907985 TstDim101 23.8 ns 23.8 ns 20 * 27976276 RefDim101 91.4 ns 91.3 ns 20 * 7364003 TstDim129 30.0 ns 30.0 ns 20 * 23413955 RefDim129 123 ns 123 ns 20 * 5656383 TstDim257 57.8 ns 57.7 ns 20 * 11263073 RefDim257 268 ns 267 ns 20 * 2617478 --- hnswlib/space_ip.h | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/hnswlib/space_ip.h b/hnswlib/space_ip.h index e9467473..d0497ff7 100644 --- a/hnswlib/space_ip.h +++ b/hnswlib/space_ip.h @@ -211,6 +211,36 @@ namespace hnswlib { #endif +#if defined(USE_SSE) || defined(USE_AVX) + static float + InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty16 = qty >> 4 << 4; + float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16); + float *pVect1 = (float *) pVect1v + qty16; + float *pVect2 = (float *) pVect2v + qty16; + + size_t qty_left = qty - qty16; + float res_tail = InnerProduct(pVect1, pVect2, &qty_left); + return res + res_tail - 1.0f; + } + + static float + InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t 
qty4 = qty >> 2 << 2; + + float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4); + size_t qty_left = qty - qty4; + + float *pVect1 = (float *) pVect1v + qty4; + float *pVect2 = (float *) pVect2v + qty4; + float res_tail = InnerProduct(pVect1, pVect2, &qty_left); + + return res + res_tail - 1.0f; + } +#endif + class InnerProductSpace : public SpaceInterface { DISTFUNC fstdistfunc_; @@ -220,11 +250,15 @@ namespace hnswlib { InnerProductSpace(size_t dim) { fstdistfunc_ = InnerProduct; #if defined(USE_AVX) || defined(USE_SSE) - if (dim % 4 == 0) - fstdistfunc_ = InnerProductSIMD4Ext; if (dim % 16 == 0) fstdistfunc_ = InnerProductSIMD16Ext; -#endif + else if (dim % 4 == 0) + fstdistfunc_ = InnerProductSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = InnerProductSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = InnerProductSIMD4ExtResiduals; + #endif dim_ = dim; data_size_ = dim * sizeof(float); }