From b5c35000552008b6fe8f2168dc06f87b60ae315e Mon Sep 17 00:00:00 2001
From: Dave Watson
Date: Mon, 28 Jan 2019 09:36:25 -0800
Subject: [PATCH] decompress_generic: Add fastpath for small offsets

For small offsets of size 1, 2, 4 and 8, we can set a single uint64_t,
and then use it to do a memset() variation. In particular, this makes
the somewhat-common RLE (offset 1) about 2-4x faster than the previous
implementation - we not only avoid the load blocked by store, but also
avoid the loads entirely.
---
 lib/lz4.c | 78 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 19 deletions(-)

diff --git a/lib/lz4.c b/lib/lz4.c
index 45a36bbe2..ccc697e58 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -297,6 +297,30 @@ void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
     do { memcpy(d,s,8); d+=8; s+=8; } while (d
[... diff content lost in extraction: text between a '<' and a later '>' was stripped here ...]
 16) LZ4_wildCopy(op+8, match+8, cpy);
+ LZ4_memcpy_using_offset(op, match, cpy, offset);
 } else {
 LZ4_wildCopy32(op, match, cpy);
 }