Skip to content

Commit

Permalink
Use the optimization level of O2 for the decompression functions on p…
Browse files Browse the repository at this point in the history
…pc64le with gcc, to avoid harmful unrolling and SIMDization with O3
  • Loading branch information
odaira committed Oct 13, 2017
1 parent 34da12c commit 73bcf90
Showing 1 changed file with 34 additions and 1 deletion.
35 changes: 34 additions & 1 deletion lib/lz4.c
Expand Up @@ -117,6 +117,28 @@
# endif /* _MSC_VER */
#endif /* LZ4_FORCE_INLINE */

/* LZ4_FORCE_O2_GCC_PPC64LE and LZ4_FORCE_O2_INLINE_GCC_PPC64LE
* Gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy,
* together with a simple 8-byte copy loop as a fall-back path.
* However, this optimization hurts the decompression speed by >30%,
* because the execution does not go to the optimized loop
* for typical compressible data, and all of the preamble checks
* before going to the fall-back path become useless overhead.
* This optimization happens only with the -O3 flag, and -O2 generates
* a simple 8-byte copy loop.
* With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy
* functions are annotated with __attribute__((optimize("O2"))),
* and also LZ4_wildCopy is forcibly inlined, so that the O2 attribute
* of LZ4_wildCopy does not affect the compression speed.
*/
/* Select the O2-forcing annotations.
 * Only genuine GCC implements __attribute__((optimize(...))).
 * clang also defines __GNUC__ but does not support this attribute and
 * emits an "unknown attribute" warning for it, so clang is excluded
 * explicitly and falls back to the plain definitions below.
 *
 * LZ4_FORCE_O2_GCC_PPC64LE        : prefix for a function definition;
 *                                   expands to nothing on other targets.
 * LZ4_FORCE_O2_INLINE_GCC_PPC64LE : same, plus LZ4_FORCE_INLINE;
 *                                   expands to plain `static` on other targets.
 */
#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
#  define LZ4_FORCE_O2_GCC_PPC64LE __attribute__((optimize("O2")))
#  define LZ4_FORCE_O2_INLINE_GCC_PPC64LE __attribute__((optimize("O2"))) LZ4_FORCE_INLINE
#else
#  define LZ4_FORCE_O2_GCC_PPC64LE
#  define LZ4_FORCE_O2_INLINE_GCC_PPC64LE static
#endif

#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)
# define expect(expr,value) (__builtin_expect ((expr),(value)) )
#else
Expand Down Expand Up @@ -253,7 +275,8 @@ static void LZ4_copy8(void* dst, const void* src)
}

/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */
static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
LZ4_FORCE_O2_INLINE_GCC_PPC64LE
void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
{
BYTE* d = (BYTE*)dstPtr;
const BYTE* s = (const BYTE*)srcPtr;
Expand Down Expand Up @@ -1112,6 +1135,7 @@ int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
* Note that it is important for performance that this function really get inlined,
* in order to remove useless branches during compilation optimization.
*/
LZ4_FORCE_O2_GCC_PPC64LE
LZ4_FORCE_INLINE int LZ4_decompress_generic(
const char* const src,
char* const dst,
Expand Down Expand Up @@ -1272,16 +1296,19 @@ LZ4_FORCE_INLINE int LZ4_decompress_generic(
}


/* LZ4_decompress_safe() :
 * Thin wrapper around LZ4_decompress_generic().
 * Forwards `source`, `dest`, `compressedSize` and `maxDecompressedSize`,
 * selecting the endOnInputSize / full / noDict variant, with a target
 * output size of 0, `dest` as the prefix base pointer, and no external
 * dictionary (NULL, 0).
 * @return : whatever LZ4_decompress_generic() returns, unchanged.
 */
LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
{
    int const result = LZ4_decompress_generic(
                            source, dest,
                            compressedSize, maxDecompressedSize,
                            endOnInputSize, full, 0,
                            noDict, (BYTE*)dest,
                            NULL, 0);
    return result;
}

/* LZ4_decompress_safe_partial() :
 * Thin wrapper around LZ4_decompress_generic().
 * Same call as the full safe variant, except that it selects `partial`
 * mode and forwards `targetOutputSize` as the target output size.
 * Uses endOnInputSize / noDict, `dest` as the prefix base pointer, and
 * no external dictionary (NULL, 0).
 * @return : whatever LZ4_decompress_generic() returns, unchanged.
 */
LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
{
    int const result = LZ4_decompress_generic(
                            source, dest,
                            compressedSize, maxDecompressedSize,
                            endOnInputSize, partial, targetOutputSize,
                            noDict, (BYTE*)dest,
                            NULL, 0);
    return result;
}

LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
{
return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB);
Expand Down Expand Up @@ -1327,6 +1354,7 @@ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dicti
If it's not possible, save the relevant part of decoded data into a safe buffer,
and indicate where it stands using LZ4_setStreamDecode()
*/
LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
{
LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse;
Expand All @@ -1353,6 +1381,7 @@ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const ch
return result;
}

LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
{
LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse;
Expand Down Expand Up @@ -1387,6 +1416,7 @@ Advanced decoding functions :
the dictionary must be explicitly provided within parameters
*/

LZ4_FORCE_O2_GCC_PPC64LE
LZ4_FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize)
{
if (dictSize==0)
Expand All @@ -1399,17 +1429,20 @@ LZ4_FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char*
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
}

/* LZ4_decompress_safe_usingDict() :
 * Thin wrapper around LZ4_decompress_usingDict_generic().
 * Forwards all arguments unchanged and sets the `safe` flag to 1.
 * The caller supplies the dictionary through `dictStart` / `dictSize`.
 * @return : whatever LZ4_decompress_usingDict_generic() returns, unchanged.
 */
LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
{
    int const result = LZ4_decompress_usingDict_generic(
                            source, dest,
                            compressedSize, maxOutputSize,
                            1 /* safe */,
                            dictStart, dictSize);
    return result;
}

/* LZ4_decompress_fast_usingDict() :
 * Thin wrapper around LZ4_decompress_usingDict_generic().
 * Sets the `safe` flag to 0, passes 0 as the compressed size, and
 * forwards `originalSize` in the output-size position.
 * The caller supplies the dictionary through `dictStart` / `dictSize`.
 * @return : whatever LZ4_decompress_usingDict_generic() returns, unchanged.
 */
LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
{
    int const result = LZ4_decompress_usingDict_generic(
                            source, dest,
                            0 /* compressedSize */, originalSize,
                            0 /* safe */,
                            dictStart, dictSize);
    return result;
}

/* debug function */
LZ4_FORCE_O2_GCC_PPC64LE
int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
{
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
Expand Down

0 comments on commit 73bcf90

Please sign in to comment.