Skip to content

Commit

Permalink
More neon/sse optimizations; neon dispatch added for arm32
Browse files Browse the repository at this point in the history
  • Loading branch information
okuhara committed Nov 14, 2020
1 parent 3486e6c commit 343493d
Show file tree
Hide file tree
Showing 24 changed files with 1,497 additions and 694 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ Loop optimization and flip using carry propagation. One time execution but affec

## 3. eval.c
Eval feature calculation using SSE2 / AVX2 (now in eval_sse.c) improves midgame by 15-30% and endgame by 8-12%.
Restoring eval from backup instead of rewinding.
eval_open (one time execution) is also optimized.

## 4. hash.c
Expand Down
4 changes: 2 additions & 2 deletions src/Android.mk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := aEdax # should be renamed to lib..aEdax..so afterwards
LOCAL_CFLAGS += -DUNICODE
LOCAL_SRC_FILES := all.c
# LOCAL_ARM_NEON := true
LOCAL_SRC_FILES := all.c board_sse.c.neon eval_sse.c.neon flip_neon_bitscan.c.neon android/cpu-features.c
LOCAL_ARM_NEON := false
# cmd-strip :=
include $(BUILD_EXECUTABLE)
39 changes: 18 additions & 21 deletions src/bit.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,7 @@
#include "util.h"

/** coordinate to bit table converter */
const unsigned long long X_TO_BIT[] = {
0x0000000000000001ULL, 0x0000000000000002ULL, 0x0000000000000004ULL, 0x0000000000000008ULL,
0x0000000000000010ULL, 0x0000000000000020ULL, 0x0000000000000040ULL, 0x0000000000000080ULL,
0x0000000000000100ULL, 0x0000000000000200ULL, 0x0000000000000400ULL, 0x0000000000000800ULL,
0x0000000000001000ULL, 0x0000000000002000ULL, 0x0000000000004000ULL, 0x0000000000008000ULL,
0x0000000000010000ULL, 0x0000000000020000ULL, 0x0000000000040000ULL, 0x0000000000080000ULL,
0x0000000000100000ULL, 0x0000000000200000ULL, 0x0000000000400000ULL, 0x0000000000800000ULL,
0x0000000001000000ULL, 0x0000000002000000ULL, 0x0000000004000000ULL, 0x0000000008000000ULL,
0x0000000010000000ULL, 0x0000000020000000ULL, 0x0000000040000000ULL, 0x0000000080000000ULL,
0x0000000100000000ULL, 0x0000000200000000ULL, 0x0000000400000000ULL, 0x0000000800000000ULL,
0x0000001000000000ULL, 0x0000002000000000ULL, 0x0000004000000000ULL, 0x0000008000000000ULL,
0x0000010000000000ULL, 0x0000020000000000ULL, 0x0000040000000000ULL, 0x0000080000000000ULL,
0x0000100000000000ULL, 0x0000200000000000ULL, 0x0000400000000000ULL, 0x0000800000000000ULL,
0x0001000000000000ULL, 0x0002000000000000ULL, 0x0004000000000000ULL, 0x0008000000000000ULL,
0x0010000000000000ULL, 0x0020000000000000ULL, 0x0040000000000000ULL, 0x0080000000000000ULL,
0x0100000000000000ULL, 0x0200000000000000ULL, 0x0400000000000000ULL, 0x0800000000000000ULL,
0x1000000000000000ULL, 0x2000000000000000ULL, 0x4000000000000000ULL, 0x8000000000000000ULL,
0, 0 // <- hack for passing move & nomove
};
unsigned long long X_TO_BIT[66];

/** Conversion array: neighbour bits */
const unsigned long long NEIGHBOUR[] = {
Expand Down Expand Up @@ -167,14 +149,25 @@ static int bit_count_32(unsigned int b)
*/
void bit_init(void)
{
// NOTE(review): "#ifndef POPCOUNT" appears twice in this span (here and before the
// PopCnt16 loop) with a single "#endif" visible — presumably one of them is stale;
// confirm against the real file before building.
#ifndef POPCOUNT
unsigned int i;
unsigned long long ll;

// Fill X_TO_BIT at run time: entry i receives the value 1 shifted left i times.
ll = 1;
for (i = 0; i < 66; ++i) { // X_TO_BIT[64] = X_TO_BIT[65] = 0 for passing move & nomove
X_TO_BIT[i] = ll;
// once the single set bit has been shifted out (assuming a 64-bit unsigned long long)
// ll stays 0, which is what gives the two trailing zero entries
ll <<= 1;
}

// Without POPCOUNT, fill PopCnt16[i] with bit_count_32(i) for every 16-bit value i.
#ifndef POPCOUNT
for (i = 0; i < (1 << 16); ++i)
PopCnt16[i] = bit_count_32(i);
#endif
// init_mmx() is called only for the GAS/MSVC x86 MMX builds in which hasSSE2 is not a
// compile-time macro (presumably runtime CPU-feature setup — confirm in init_mmx).
#if (defined(USE_GAS_MMX) || defined(USE_MSVC_X86)) && !defined(hasSSE2)
init_mmx();
#endif
// init_neon() is called only on Android builds where neither hasNeon nor hasSSE2 is a
// compile-time macro (presumably runtime CPU-feature setup — confirm in init_neon).
#if defined(ANDROID) && !defined(hasNeon) && !defined(hasSSE2)
init_neon();
#endif
}

/**
Expand Down Expand Up @@ -450,15 +443,19 @@ unsigned long long vertical_mirror(unsigned long long b)
*/
unsigned int horizontal_mirror_32(unsigned int b)
{
	// Mirror each byte of b: the bit order is reversed inside every byte while the
	// bytes themselves stay where they are.
#ifndef __ARM_ACLE
	unsigned int up, down;

	// exchange neighbouring bits
	down = (b >> 1) & 0x55555555U;
	up = (b & 0x55555555U) << 1;
	b = down | up;

	// exchange neighbouring bit pairs
	down = (b >> 2) & 0x33333333U;
	up = (b & 0x33333333U) << 2;
	b = down | up;

	// exchange the two nibbles of every byte
	down = (b >> 4) & 0x0F0F0F0FU;
	up = (b & 0x0F0F0F0FU) << 4;
	return down | up;
#else
	// reverse all 32 bits, then swap the bytes back into their original order
	return __rev(__rbit(b));
#endif
}

unsigned long long horizontal_mirror(unsigned long long b)
{
#ifdef HAS_CPU_64
#if defined(HAS_CPU_64) && !defined(__ARM_ACLE)
b = ((b >> 1) & 0x5555555555555555ULL) | ((b & 0x5555555555555555ULL) << 1);
b = ((b >> 2) & 0x3333333333333333ULL) | ((b & 0x3333333333333333ULL) << 2);
b = ((b >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((b & 0x0F0F0F0F0F0F0F0FULL) << 4);
Expand Down
20 changes: 14 additions & 6 deletions src/bit.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,15 @@ unsigned int horizontal_mirror_32(unsigned int b);
unsigned long long horizontal_mirror(unsigned long long);
int get_rand_bit(unsigned long long, struct Random*);

extern const unsigned long long X_TO_BIT[];
extern unsigned long long X_TO_BIT[];
extern const unsigned long long NEIGHBOUR[];

/** Return a bitboard with bit x set. */
#ifdef __aarch64__ // 1% slower on Sandy Bridge
#define x_to_bit(x) (1ULL << (x))
#else
#define x_to_bit(x) X_TO_BIT[x]

//#define x_to_bit(x) (1ULL << (x)) // 1% slower on Sandy Bridge
#endif

/** Loop over each bit set. */
#if (defined(__GNUC__) && __GNUC__ >= 4) || __has_builtin(__builtin_ctzll)
Expand Down Expand Up @@ -67,7 +71,7 @@ extern const unsigned long long X_TO_BIT[];
#endif

// popcount
#if !defined(POPCOUNT) && (defined(__ARM_NEON__) || defined(_M_ARM) || defined(_M_ARM64))
#if !defined(POPCOUNT) && defined(hasNeon)
#define POPCOUNT 1
#endif

Expand Down Expand Up @@ -117,6 +121,10 @@ extern const unsigned long long X_TO_BIT[];
#endif
#endif

#if defined(ANDROID) && ((defined(__arm__) && !defined(hasNeon)) || (defined(__i386__) && !defined(hasSSE2)))
extern bool hasSSE2;
#endif

typedef union {
unsigned long long ull[2];
#if defined(hasSSE2) || defined(USE_MSVC_X86)
Expand Down Expand Up @@ -152,7 +160,7 @@ typedef union {
#endif

// X64 compatibility sims for X86
#if !defined(__x86_64__) && !defined(_M_X64)
#ifndef HAS_CPU_64
#if defined(hasSSE2) || defined(USE_MSVC_X86)
static inline __m128i _mm_cvtsi64_si128(const unsigned long long x) {
return _mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(x >> 32));
Expand All @@ -171,6 +179,6 @@ static inline unsigned long long _mm_cvtsi128_si64(__m128i x) {
| (unsigned int) _mm_cvtsi128_si32(x);
}
#endif
#endif
#endif // !HAS_CPU_64

#endif // EDAX_BIT_H
18 changes: 17 additions & 1 deletion src/bit_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@
#define hasMMX 1
#endif

#if defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
#define hasNeon 1
#ifndef __ARM_NEON__
#define __ARM_NEON__ 1
#endif
#endif
#ifdef __ARM_NEON__
#include "arm_neon.h"
#endif

#ifdef _MSC_VER
#include <intrin.h>
#ifdef _M_IX86
Expand Down Expand Up @@ -58,7 +68,7 @@ static inline unsigned char mirror_byte(unsigned int b) { return ((((b * 0x20080
#elif defined(_MSC_VER)
#define rotl8(x,y) _rotl8((x),(y))
#else // may not compile into 8-bit rotate
#define rotl8(x,y) ((unsigned char)(((x)<<(y))|((unsigned)(x)>>(8-(y)))))
#define rotl8(x,y) ((unsigned char)(((x)<<(y))|((unsigned char)(x)>>(8-(y)))))
#endif

// bswap
Expand Down Expand Up @@ -137,6 +147,12 @@ static inline int _tzcnt_u64(unsigned long long x) {
#define lzcnt_u64(x) _CountLeadingZeros64(x)

#elif defined(_MSC_VER)
/**
 * Count the leading zero bits of a 32-bit value (MSVC fallback built on
 * _BitScanReverse).
 *
 * @param n value to scan.
 * @return number of zero bits above the highest set bit; 32 when n is 0.
 */
static inline int lzcnt_u32(unsigned int n) {
	unsigned long i;	// _BitScanReverse stores the bit index through an unsigned long *
	if (!_BitScanReverse(&i, n))
		i = 32 ^ 31;	// no bit set: chosen so that the XOR below yields 32
	return (int) (i ^ 31);	// for i in 0..31, i ^ 31 == 31 - i
}
#ifdef _M_X64
static inline int lzcnt_u64(unsigned long long n) {
unsigned long i;
Expand Down
Loading

0 comments on commit 343493d

Please sign in to comment.