From 343493d47e00b1d1321bd1cfca2d7450cbe1efe2 Mon Sep 17 00:00:00 2001 From: okuhara Date: Tue, 22 Sep 2020 09:55:53 +0900 Subject: [PATCH] More neon/sse optimizations; neon dispatch added for arm32 --- README.md | 1 + src/Android.mk | 4 +- src/bit.c | 39 ++-- src/bit.h | 20 +- src/bit_intrinsics.h | 18 +- src/board.c | 116 +++++------ src/board.h | 16 +- src/board_sse.c | 237 ++++++++++++++++++++--- src/const.h | 3 - src/count_last_flip_neon.c | 212 ++++++++++++++++++++ src/endgame.c | 10 +- src/endgame_neon.c | 386 +++++++++++++++++++++++++++++++++++++ src/endgame_sse.c | 204 ++++++++------------ src/eval.c | 13 +- src/eval.h | 7 +- src/eval_sse.c | 10 +- src/flip_bitscan.c | 379 ++++++++++++++++++------------------ src/flip_neon_bitscan.c | 93 ++++----- src/flip_neon_lzcnt.c | 10 +- src/flip_neon_ppfill.c | 249 +++++++++--------------- src/flip_neon_rbit.c | 121 ++++++++++++ src/flip_sse.c | 24 +-- src/search.h | 4 +- src/settings.h | 15 +- 24 files changed, 1497 insertions(+), 694 deletions(-) create mode 100644 src/count_last_flip_neon.c create mode 100644 src/endgame_neon.c create mode 100644 src/flip_neon_rbit.c diff --git a/README.md b/README.md index 4a2a9e2..ae2fa4c 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ Loop optimization and flip using carry propagation. One time execution but affec ## 3. eval.c Eval feature calculation using SSE2 / AVX2 (now in eval_sse.c) improves midgame by 15-30% and endgame by 8-12%. +Restoring eval from backup instead of rewinding. eval_open (one time execution) is also optimized. ## 4. 
hash.c diff --git a/src/Android.mk b/src/Android.mk index 098254e..4a54afd 100644 --- a/src/Android.mk +++ b/src/Android.mk @@ -2,7 +2,7 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE := aEdax # should be renamed to lib..aEdax..so afterwords LOCAL_CFLAGS += -DUNICODE -LOCAL_SRC_FILES := all.c -# LOCAL_ARM_NEON := true +LOCAL_SRC_FILES := all.c board_sse.c.neon eval_sse.c.neon flip_neon_bitscan.c.neon android/cpu-features.c +LOCAL_ARM_NEON := false # cmd-strip := include $(BUILD_EXECUTABLE) diff --git a/src/bit.c b/src/bit.c index 67eb49f..8c06bba 100644 --- a/src/bit.c +++ b/src/bit.c @@ -15,25 +15,7 @@ #include "util.h" /** coordinate to bit table converter */ -const unsigned long long X_TO_BIT[] = { - 0x0000000000000001ULL, 0x0000000000000002ULL, 0x0000000000000004ULL, 0x0000000000000008ULL, - 0x0000000000000010ULL, 0x0000000000000020ULL, 0x0000000000000040ULL, 0x0000000000000080ULL, - 0x0000000000000100ULL, 0x0000000000000200ULL, 0x0000000000000400ULL, 0x0000000000000800ULL, - 0x0000000000001000ULL, 0x0000000000002000ULL, 0x0000000000004000ULL, 0x0000000000008000ULL, - 0x0000000000010000ULL, 0x0000000000020000ULL, 0x0000000000040000ULL, 0x0000000000080000ULL, - 0x0000000000100000ULL, 0x0000000000200000ULL, 0x0000000000400000ULL, 0x0000000000800000ULL, - 0x0000000001000000ULL, 0x0000000002000000ULL, 0x0000000004000000ULL, 0x0000000008000000ULL, - 0x0000000010000000ULL, 0x0000000020000000ULL, 0x0000000040000000ULL, 0x0000000080000000ULL, - 0x0000000100000000ULL, 0x0000000200000000ULL, 0x0000000400000000ULL, 0x0000000800000000ULL, - 0x0000001000000000ULL, 0x0000002000000000ULL, 0x0000004000000000ULL, 0x0000008000000000ULL, - 0x0000010000000000ULL, 0x0000020000000000ULL, 0x0000040000000000ULL, 0x0000080000000000ULL, - 0x0000100000000000ULL, 0x0000200000000000ULL, 0x0000400000000000ULL, 0x0000800000000000ULL, - 0x0001000000000000ULL, 0x0002000000000000ULL, 0x0004000000000000ULL, 0x0008000000000000ULL, - 0x0010000000000000ULL, 
0x0020000000000000ULL, 0x0040000000000000ULL, 0x0080000000000000ULL, - 0x0100000000000000ULL, 0x0200000000000000ULL, 0x0400000000000000ULL, 0x0800000000000000ULL, - 0x1000000000000000ULL, 0x2000000000000000ULL, 0x4000000000000000ULL, 0x8000000000000000ULL, - 0, 0 // <- hack for passing move & nomove -}; +unsigned long long X_TO_BIT[66]; /** Conversion array: neighbour bits */ const unsigned long long NEIGHBOUR[] = { @@ -167,14 +149,25 @@ static int bit_count_32(unsigned int b) */ void bit_init(void) { -#ifndef POPCOUNT unsigned int i; + unsigned long long ll; + + ll = 1; + for (i = 0; i < 66; ++i) { // X_TO_BIT[64] = X_TO_BIT[65] = 0 for passing move & nomove + X_TO_BIT[i] = ll; + ll <<= 1; + } + +#ifndef POPCOUNT for (i = 0; i < (1 << 16); ++i) PopCnt16[i] = bit_count_32(i); #endif #if (defined(USE_GAS_MMX) || defined(USE_MSVC_X86)) && !defined(hasSSE2) init_mmx(); #endif +#if defined(ANDROID) && !defined(hasNeon) && !defined(hasSSE2) + init_neon(); +#endif } /** @@ -450,15 +443,19 @@ unsigned long long vertical_mirror(unsigned long long b) */ unsigned int horizontal_mirror_32(unsigned int b) { +#ifdef __ARM_ACLE + return __rev(__rbit(b)); +#else b = ((b >> 1) & 0x55555555U) + 2 * (b & 0x55555555U); b = ((b >> 2) & 0x33333333U) + 4 * (b & 0x33333333U); b = ((b >> 4) & 0x0F0F0F0FU) + 16 * (b & 0x0F0F0F0FU); return b; +#endif } unsigned long long horizontal_mirror(unsigned long long b) { -#ifdef HAS_CPU_64 +#if defined(HAS_CPU_64) && !defined(__ARM_ACLE) b = ((b >> 1) & 0x5555555555555555ULL) | ((b & 0x5555555555555555ULL) << 1); b = ((b >> 2) & 0x3333333333333333ULL) | ((b & 0x3333333333333333ULL) << 2); b = ((b >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((b & 0x0F0F0F0F0F0F0F0FULL) << 4); diff --git a/src/bit.h b/src/bit.h index 9996e3b..d6a1b6e 100644 --- a/src/bit.h +++ b/src/bit.h @@ -30,11 +30,15 @@ unsigned int horizontal_mirror_32(unsigned int b); unsigned long long horizontal_mirror(unsigned long long); int get_rand_bit(unsigned long long, struct Random*); -extern 
const unsigned long long X_TO_BIT[]; +extern unsigned long long X_TO_BIT[]; +extern const unsigned long long NEIGHBOUR[]; + /** Return a bitboard with bit x set. */ +#ifdef __aarch64__ // 1% slower on Sandy Bridge +#define x_to_bit(x) (1ULL << (x)) +#else #define x_to_bit(x) X_TO_BIT[x] - -//#define x_to_bit(x) (1ULL << (x)) // 1% slower on Sandy Bridge +#endif /** Loop over each bit set. */ #if (defined(__GNUC__) && __GNUC__ >= 4) || __has_builtin(__builtin_ctzll) @@ -67,7 +71,7 @@ extern const unsigned long long X_TO_BIT[]; #endif // popcount -#if !defined(POPCOUNT) && (defined(__ARM_NEON__) || defined(_M_ARM) || defined(_M_ARM64)) +#if !defined(POPCOUNT) && defined(hasNeon) #define POPCOUNT 1 #endif @@ -117,6 +121,10 @@ extern const unsigned long long X_TO_BIT[]; #endif #endif +#if defined(ANDROID) && ((defined(__arm__) && !defined(hasNeon)) || (defined(__i386__) && !defined(hasSSE2))) +extern bool hasSSE2; +#endif + typedef union { unsigned long long ull[2]; #if defined(hasSSE2) || defined(USE_MSVC_X86) @@ -152,7 +160,7 @@ typedef union { #endif // X64 compatibility sims for X86 -#if !defined(__x86_64__) && !defined(_M_X64) +#ifndef HAS_CPU_64 #if defined(hasSSE2) || defined(USE_MSVC_X86) static inline __m128i _mm_cvtsi64_si128(const unsigned long long x) { return _mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(x >> 32)); @@ -171,6 +179,6 @@ static inline unsigned long long _mm_cvtsi128_si64(__m128i x) { | (unsigned int) _mm_cvtsi128_si32(x); } #endif -#endif +#endif // !HAS_CPU_64 #endif // EDAX_BIT_H diff --git a/src/bit_intrinsics.h b/src/bit_intrinsics.h index 5b70668..f3ef2a5 100644 --- a/src/bit_intrinsics.h +++ b/src/bit_intrinsics.h @@ -24,6 +24,16 @@ #define hasMMX 1 #endif +#if defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64) + #define hasNeon 1 + #ifndef __ARM_NEON__ + #define __ARM_NEON__ 1 + #endif +#endif +#ifdef __ARM_NEON__ +#include "arm_neon.h" +#endif + #ifdef _MSC_VER #include #ifdef _M_IX86 @@ -58,7 +68,7 @@ static 
inline unsigned char mirror_byte(unsigned int b) { return ((((b * 0x20080 #elif defined(_MSC_VER) #define rotl8(x,y) _rotl8((x),(y)) #else // may not compile into 8-bit rotate - #define rotl8(x,y) ((unsigned char)(((x)<<(y))|((unsigned)(x)>>(8-(y))))) + #define rotl8(x,y) ((unsigned char)(((x)<<(y))|((unsigned char)(x)>>(8-(y))))) #endif // bswap @@ -137,6 +147,12 @@ static inline int _tzcnt_u64(unsigned long long x) { #define lzcnt_u64(x) _CountLeadingZeros64(x) #elif defined(_MSC_VER) + static inline int lzcnt_u32(unsigned int n) { + unsigned int i; + if (!_BitScanReverse(&i, n)) + i = 32 ^ 31; + return i ^ 31; + } #ifdef _M_X64 static inline int lzcnt_u64(unsigned long long n) { unsigned long i; diff --git a/src/board.c b/src/board.c index 3c03359..59af98f 100644 --- a/src/board.c +++ b/src/board.c @@ -34,7 +34,12 @@ #elif MOVE_GENERATOR == MOVE_GENERATOR_SSE #include "flip_sse.c" #elif MOVE_GENERATOR == MOVE_GENERATOR_BITSCAN - #include "flip_bitscan.c" + #ifdef hasNeon + #define flip_neon flip + #include "flip_neon_bitscan.c" + #else + #include "flip_bitscan.c" + #endif #elif MOVE_GENERATOR == MOVE_GENERATOR_ROXANE #include "flip_roxane.c" #elif MOVE_GENERATOR == MOVE_GENERATOR_32 @@ -46,7 +51,11 @@ #elif MOVE_GENERATOR == MOVE_GENERATOR_AVX512 #include "flip_avx512cd.c" #elif MOVE_GENERATOR == MOVE_GENERATOR_NEON - #include "flip_neon_lzcnt.c" + #ifdef __aarch64__ + #include "flip_neon_rbit.c" + #else + #include "flip_neon_lzcnt.c" + #endif #else // MOVE_GENERATOR == MOVE_GENERATOR_KINDERGARTEN #include "flip_kindergarten.c" #endif @@ -56,45 +65,12 @@ unsigned char edge_stability[256 * 256]; /** conversion from an 8-bit line to the A1-A8 line */ -const unsigned long long A1_A8[256] = { - 0x0000000000000000, 0x0000000000000001, 0x0000000000000100, 0x0000000000000101, 0x0000000000010000, 0x0000000000010001, 0x0000000000010100, 0x0000000000010101, - 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, 0x0000000001000101, 0x0000000001010000, 
0x0000000001010001, 0x0000000001010100, 0x0000000001010101, - 0x0000000100000000, 0x0000000100000001, 0x0000000100000100, 0x0000000100000101, 0x0000000100010000, 0x0000000100010001, 0x0000000100010100, 0x0000000100010101, - 0x0000000101000000, 0x0000000101000001, 0x0000000101000100, 0x0000000101000101, 0x0000000101010000, 0x0000000101010001, 0x0000000101010100, 0x0000000101010101, - 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000000101, 0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010000010101, - 0x0000010001000000, 0x0000010001000001, 0x0000010001000100, 0x0000010001000101, 0x0000010001010000, 0x0000010001010001, 0x0000010001010100, 0x0000010001010101, - 0x0000010100000000, 0x0000010100000001, 0x0000010100000100, 0x0000010100000101, 0x0000010100010000, 0x0000010100010001, 0x0000010100010100, 0x0000010100010101, - 0x0000010101000000, 0x0000010101000001, 0x0000010101000100, 0x0000010101000101, 0x0000010101010000, 0x0000010101010001, 0x0000010101010100, 0x0000010101010101, - 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x0001000000000101, 0x0001000000010000, 0x0001000000010001, 0x0001000000010100, 0x0001000000010101, - 0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101, 0x0001000001010000, 0x0001000001010001, 0x0001000001010100, 0x0001000001010101, - 0x0001000100000000, 0x0001000100000001, 0x0001000100000100, 0x0001000100000101, 0x0001000100010000, 0x0001000100010001, 0x0001000100010100, 0x0001000100010101, - 0x0001000101000000, 0x0001000101000001, 0x0001000101000100, 0x0001000101000101, 0x0001000101010000, 0x0001000101010001, 0x0001000101010100, 0x0001000101010101, - 0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x0001010000000101, 0x0001010000010000, 0x0001010000010001, 0x0001010000010100, 0x0001010000010101, - 0x0001010001000000, 0x0001010001000001, 0x0001010001000100, 0x0001010001000101, 0x0001010001010000, 0x0001010001010001, 0x0001010001010100, 
0x0001010001010101, - 0x0001010100000000, 0x0001010100000001, 0x0001010100000100, 0x0001010100000101, 0x0001010100010000, 0x0001010100010001, 0x0001010100010100, 0x0001010100010101, - 0x0001010101000000, 0x0001010101000001, 0x0001010101000100, 0x0001010101000101, 0x0001010101010000, 0x0001010101010001, 0x0001010101010100, 0x0001010101010101, - 0x0100000000000000, 0x0100000000000001, 0x0100000000000100, 0x0100000000000101, 0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000000010101, - 0x0100000001000000, 0x0100000001000001, 0x0100000001000100, 0x0100000001000101, 0x0100000001010000, 0x0100000001010001, 0x0100000001010100, 0x0100000001010101, - 0x0100000100000000, 0x0100000100000001, 0x0100000100000100, 0x0100000100000101, 0x0100000100010000, 0x0100000100010001, 0x0100000100010100, 0x0100000100010101, - 0x0100000101000000, 0x0100000101000001, 0x0100000101000100, 0x0100000101000101, 0x0100000101010000, 0x0100000101010001, 0x0100000101010100, 0x0100000101010101, - 0x0100010000000000, 0x0100010000000001, 0x0100010000000100, 0x0100010000000101, 0x0100010000010000, 0x0100010000010001, 0x0100010000010100, 0x0100010000010101, - 0x0100010001000000, 0x0100010001000001, 0x0100010001000100, 0x0100010001000101, 0x0100010001010000, 0x0100010001010001, 0x0100010001010100, 0x0100010001010101, - 0x0100010100000000, 0x0100010100000001, 0x0100010100000100, 0x0100010100000101, 0x0100010100010000, 0x0100010100010001, 0x0100010100010100, 0x0100010100010101, - 0x0100010101000000, 0x0100010101000001, 0x0100010101000100, 0x0100010101000101, 0x0100010101010000, 0x0100010101010001, 0x0100010101010100, 0x0100010101010101, - 0x0101000000000000, 0x0101000000000001, 0x0101000000000100, 0x0101000000000101, 0x0101000000010000, 0x0101000000010001, 0x0101000000010100, 0x0101000000010101, - 0x0101000001000000, 0x0101000001000001, 0x0101000001000100, 0x0101000001000101, 0x0101000001010000, 0x0101000001010001, 0x0101000001010100, 0x0101000001010101, - 0x0101000100000000, 
0x0101000100000001, 0x0101000100000100, 0x0101000100000101, 0x0101000100010000, 0x0101000100010001, 0x0101000100010100, 0x0101000100010101, - 0x0101000101000000, 0x0101000101000001, 0x0101000101000100, 0x0101000101000101, 0x0101000101010000, 0x0101000101010001, 0x0101000101010100, 0x0101000101010101, - 0x0101010000000000, 0x0101010000000001, 0x0101010000000100, 0x0101010000000101, 0x0101010000010000, 0x0101010000010001, 0x0101010000010100, 0x0101010000010101, - 0x0101010001000000, 0x0101010001000001, 0x0101010001000100, 0x0101010001000101, 0x0101010001010000, 0x0101010001010001, 0x0101010001010100, 0x0101010001010101, - 0x0101010100000000, 0x0101010100000001, 0x0101010100000100, 0x0101010100000101, 0x0101010100010000, 0x0101010100010001, 0x0101010100010100, 0x0101010100010101, - 0x0101010101000000, 0x0101010101000001, 0x0101010101000100, 0x0101010101000101, 0x0101010101010000, 0x0101010101010001, 0x0101010101010100, 0x0101010101010101, -}; +unsigned long long A1_A8[256]; #if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) #include "board_mmx.c" #endif -#if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(hasSSE2) +#if !defined(ANDROID) && (defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(hasSSE2) || defined(hasNeon)) #include "board_sse.c" #endif @@ -280,7 +256,7 @@ bool board_equal(const Board *b1, const Board *b2) return (b1->player == b2->player && b1->opponent == b2->opponent); } -#ifndef hasSSE2 // SSE version in board_sse.c +#if !defined(hasSSE2) && !defined(hasNeon) // SSE version in board_sse.c /** * @brief symetric board * @@ -462,7 +438,7 @@ void board_pass(Board *board) board_check(board); } -#if !(defined(hasSSE2) && ((MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE))) // SSE version in endgame_sse.c +#if (MOVE_GENERATOR != MOVE_GENERATOR_AVX) && (MOVE_GENERATOR != MOVE_GENERATOR_SSE) && (MOVE_GENERATOR != MOVE_GENERATOR_NEON) // SSE version in board_sse.c /** * @brief Compute a board resulting of a move 
played on a previous board. * @@ -503,7 +479,7 @@ unsigned long long board_pass_next(const Board *board, const int x, Board *next) } #endif -#if !defined(__x86_64__) && !defined(_M_X64) && !defined(__AVX2__) // sse version in board_sse.c +#if !defined(hasSSE2) && !defined(hasNeon) // sse version in board_sse.c /** * @brief Get a part of the moves. * @@ -587,12 +563,14 @@ unsigned long long get_moves(const unsigned long long P, const unsigned long lon { unsigned long long moves, OM; - #if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) + #if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(ANDROID) if (hasSSE2) return get_moves_sse(P, O); + #if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) else if (hasMMX) return get_moves_mmx(P, O); #endif + #endif OM = O & 0x7e7e7e7e7e7e7e7e; moves = ( get_some_moves(P, OM, 1) // horizontal @@ -602,7 +580,7 @@ unsigned long long get_moves(const unsigned long long P, const unsigned long lon return moves & ~(P|O); // mask with empties } -#endif +#endif // hasSSE2/hasNeon /** * @brief Get legal moves on a 6x6 board. @@ -627,7 +605,7 @@ unsigned long long get_moves_6x6(const unsigned long long P, const unsigned long */ bool can_move(const unsigned long long P, const unsigned long long O) { -#if defined(__x86_64__) || defined(_M_X64) || defined(hasMMX) +#if defined(hasMMX) || defined(hasNeon) return get_moves(P, O) != 0; #else @@ -787,11 +765,12 @@ static int find_edge_stable(const int old_P, const int old_O, int stable) } /** - * @brief Initialize the edge stability tables. + * @brief Initialize the edge stability and A1_A8 tables. 
*/ void edge_stability_init(void) { int P, O, PO, rPO; + unsigned long long Q; // long long t = cpu_clock(); for (PO = 0; PO < 256 * 256; ++PO) { @@ -808,6 +787,12 @@ void edge_stability_init(void) } } // printf("edge_stability_init: %d\n", (int)(cpu_clock() - t)); + + Q = 0; + for (P = 0; P < 256; ++P) { + A1_A8[P] = Q; + Q = ((Q | ~0x0101010101010101) + 1) & 0x0101010101010101; + } } #ifdef HAS_CPU_64 @@ -818,7 +803,25 @@ void edge_stability_init(void) #define packH1H8(X) (((((unsigned int)((X) >> 32) & 0x80808080) + (((unsigned int)(X) & 0x80808080) >> 4)) * 0x00204081) >> 24) #endif -#if !defined(__x86_64__) && !defined(_M_X64) +#ifndef HAS_CPU_64 +/** + * @brief Get stable edge. + * + * @param P bitboard with player's discs. + * @param O bitboard with opponent's discs. + * @return a bitboard with (some of) player's stable discs. + * + */ +unsigned long long get_stable_edge(const unsigned long long P, const unsigned long long O) +{ // compute the exact stable edges (from precomputed tables) + return edge_stability[((unsigned int) P & 0xff) * 256 + ((unsigned int) O & 0xff)] + | (unsigned long long) edge_stability[(unsigned int) (P >> 56) * 256 + (unsigned int) (O >> 56)] << 56 + | A1_A8[edge_stability[packA1A8(P) * 256 + packA1A8(O)]] + | A1_A8[edge_stability[packH1H8(P) * 256 + packH1H8(O)]] << 7; +} +#endif + +#if !defined(HAS_CPU_64) && !(defined(ANDROID) && (defined(hasNeon) || defined(hasSSE2))) /** * @brief Get full lines. * @@ -918,22 +921,6 @@ static unsigned long long get_full_lines_v(unsigned long long full) return full; } -/** - * @brief Get stable edge. - * - * @param P bitboard with player's discs. - * @param O bitboard with opponent's discs. - * @return a bitboard with (some of) player's stable discs. 
- * - */ -static unsigned long long get_stable_edge(const unsigned long long P, const unsigned long long O) -{ // compute the exact stable edges (from precomputed tables) - return edge_stability[((unsigned int) P & 0xff) * 256 + ((unsigned int) O & 0xff)] - | (unsigned long long) edge_stability[(unsigned int) (P >> 56) * 256 + (unsigned int) (O >> 56)] << 56 - | A1_A8[edge_stability[packA1A8(P) * 256 + packA1A8(O)]] - | A1_A8[edge_stability[packH1H8(P) * 256 + packH1H8(O)]] << 7; -} - /** * @brief Estimate the stability. * @@ -948,7 +935,10 @@ int get_stability(const unsigned long long P, const unsigned long long O) unsigned long long P_central, disc, full_h, full_v, full_d7, full_d9; unsigned long long stable_h, stable_v, stable_d7, stable_d9, stable, old_stable; -#if (defined(USE_GAS_MMX) && !(defined(__clang__) && (__clang__major__ < 3))) || defined(USE_MSVC_X86) +#ifdef ANDROID + if (hasSSE2) + return get_stability_sse(P, O); +#elif (defined(USE_GAS_MMX) && !(defined(__clang__) && (__clang__major__ < 3))) || defined(USE_MSVC_X86) if (hasMMX) return get_stability_mmx(P, O); #endif @@ -982,7 +972,7 @@ int get_stability(const unsigned long long P, const unsigned long long O) return bit_count(stable); } -#endif // __x86_64__ +#endif // HAS_CPU_64/ANDROID /** * @brief Estimate the stability of edges. 
diff --git a/src/board.h b/src/board.h index 87b0154..9e218a9 100644 --- a/src/board.h +++ b/src/board.h @@ -65,6 +65,7 @@ int get_mobility(const unsigned long long, const unsigned long long); int get_weighted_mobility(const unsigned long long, const unsigned long long); int get_potential_mobility(const unsigned long long, const unsigned long long); void edge_stability_init(void); +unsigned long long get_stable_edge(const unsigned long long, const unsigned long long); int get_stability(const unsigned long long, const unsigned long long); int get_edge_stability(const unsigned long long, const unsigned long long); int get_corner_stability(const unsigned long long); @@ -75,7 +76,13 @@ unsigned long long get_moves_mmx(unsigned long long, unsigned long long); unsigned long long get_moves_sse(unsigned long long, unsigned long long); int get_stability_mmx(unsigned long long, unsigned long long); int get_potential_mobility_mmx(unsigned long long, unsigned long long); + +#elif defined(ANDROID) && !defined(hasNeon) && !defined(hasSSE2) +void init_neon (void); +unsigned long long get_moves_sse(unsigned long long, unsigned long long); +int get_stability_sse(const unsigned long long P, const unsigned long long O); #endif + #if defined(USE_GAS_MMX) && defined(__3dNOW__) unsigned long long board_get_hash_code_mmx(const unsigned char *p); #elif defined(USE_GAS_MMX) || defined(USE_MSVC_X86) @@ -83,9 +90,9 @@ unsigned long long board_get_hash_code_sse(const unsigned char *p); #endif extern unsigned char edge_stability[256 * 256]; -extern const unsigned long long A1_A8[256]; +extern unsigned long long A1_A8[256]; -#if ((LAST_FLIP_COUNTER == COUNT_LAST_FLIP_PLAIN) || (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_SSE) || (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_BMI2)) +#if (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_PLAIN) || (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_SSE) || (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_BMI2) extern int last_flip(int pos, unsigned long long P); #else extern int 
(*count_last_flip[BOARD_SIZE + 1])(const unsigned long long); @@ -103,6 +110,11 @@ extern const unsigned long long A1_A8[256]; #define mm_Flip(OP,x) mm_flip[x](OP) #define board_flip(board,x) ((unsigned long long) _mm_cvtsi128_si64(mm_flip[x](_mm_loadu_si128((__m128i *) (board))))) +#elif MOVE_GENERATOR == MOVE_GENERATOR_NEON + extern uint64x2_t mm_Flip(uint64x2_t OP, int pos); + #define Flip(x,P,O) vgetq_lane_u64(mm_Flip(vcombine_u64(vcreate_u64(P), vcreate_u64(O)), (x)), 0) + #define board_flip(board,x) vgetq_lane_u64(mm_Flip(vld1q_u64((uint64_t *) board), (x)), 0) + #elif MOVE_GENERATOR == MOVE_GENERATOR_32 extern unsigned long long (*flip[BOARD_SIZE + 2])(unsigned int, unsigned int, unsigned int, unsigned int); #define Flip(x,P,O) flip[x]((unsigned int)(P), (unsigned int)((P) >> 32), (unsigned int)(O), (unsigned int)((O) >> 32)) diff --git a/src/board_sse.c b/src/board_sse.c index cc59f9c..7a5b464 100644 --- a/src/board_sse.c +++ b/src/board_sse.c @@ -12,6 +12,33 @@ #include "hash.h" #include "board.h" +#if defined(ANDROID) && !defined(hasNeon) && !defined(hasSSE2) +#include "android/cpu-features.h" + +bool hasSSE2 = false; + +void init_neon (void) +{ +#ifdef __arm__ + if (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) { + #if (MOVE_GENERATOR == MOVE_GENERATOR_BITSCAN) + extern unsigned long long (*flip_neon[66])(const unsigned long long, const unsigned long long); + memcpy(flip, flip_neon, sizeof(flip_neon)); + #endif + hasSSE2 = true; + } +#else // android x86 w/o SSE2 - uncommon and not tested + int cpuid_edx, cpuid_ecx; + __asm__ ( + "movl $1, %%eax\n\t" + "cpuid" + : "=d" (cpuid_edx), "=c" (cpuid_ecx) :: "%eax", "%ebx" ); + if ((cpuid_edx & 0x04000000u) != 0) + hasSSE2 = true; +#endif +} +#endif + /** * @brief SSE2 translation of board_symetry * @@ -77,7 +104,42 @@ void board_symetry(const Board *board, const int s, Board *sym) board_check(sym); } -#if (MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE) +#elif 
defined(hasNeon) + +void board_symetry(const Board *board, const int s, Board *sym) +{ + uint64x2_t bb = vld1q_u64((uint64_t *) board); + uint64x2_t tt; + + if (s & 1) { // horizontal_mirror +#ifdef HAS_CPU_64 + bb = vreinterpretq_u64_u8(vrbitq_u8(vreinterpretq_u8_u64(bb))); +#else + bb = vbslq_u64(vdupq_n_u64(0x5555555555555555), vshrq_n_u64(bb, 1), vshlq_n_u64(bb, 1)); + bb = vbslq_u64(vdupq_n_u64(0x3333333333333333), vshrq_n_u64(bb, 2), vshlq_n_u64(bb, 2)); + bb = vreinterpretq_u64_u8(vsliq_n_u8(vshrq_n_u8(vreinterpretq_u8_u64(bb), 4), vreinterpretq_u8_u64(bb), 4)); +#endif + } + + if (s & 2) { // vertical_mirror + bb = vreinterpretq_u64_u8(vrev64q_u8(vreinterpretq_u8_u64(bb))); + } + + if (s & 4) { // transpose + tt = vandq_u64(veorq_u64(bb, vshrq_n_u64(bb, 7)), vdupq_n_u64(0x00AA00AA00AA00AA)); + bb = veorq_u64(veorq_u64(bb, tt), vshlq_n_u64(tt, 7)); + tt = vandq_u64(veorq_u64(bb, vshrq_n_u64(bb, 14)), vdupq_n_u64(0x0000CCCC0000CCCC)); + bb = veorq_u64(veorq_u64(bb, tt), vshlq_n_u64(tt, 14)); + tt = vandq_u64(veorq_u64(bb, vshrq_n_u64(bb, 28)), vdupq_n_u64(0x00000000F0F0F0F0)); + bb = veorq_u64(veorq_u64(bb, tt), vshlq_n_u64(tt, 28)); + } + + vst1q_u64((uint64_t *) sym, bb); + board_check(sym); +} + +#endif // hasSSE2/Neon + /** * @brief Compute a board resulting of a move played on a previous board. * @@ -86,6 +148,8 @@ void board_symetry(const Board *board, const int s, Board *sym) * @param next resulting board. * @return flipped discs. 
*/ +#if (MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE) + unsigned long long board_next(const Board *board, const int x, Board *next) { __m128i OP = _mm_loadu_si128((__m128i *) board); @@ -97,6 +161,22 @@ unsigned long long board_next(const Board *board, const int x, Board *next) return _mm_cvtsi128_si64(flipped); } +#elif MOVE_GENERATOR == MOVE_GENERATOR_NEON + +unsigned long long board_next(const Board *board, const int x, Board *next) +{ + uint64x2_t OP = vld1q_u64((uint64_t *) board); + uint64x2_t flipped = mm_Flip(OP, x); + + OP = veorq_u64(OP, flipped); + vst1_u64((uint64_t *) &next->player, vget_high_u64(OP)); + vst1_u64((uint64_t *) &next->opponent, vorr_u64(vget_low_u64(OP), vcreate_u64(X_TO_BIT[x]))); + + return vgetq_lane_u64(flipped, 0); +} + +#endif + /** * @brief Compute a board resulting of an opponent move played on a previous board. * @@ -107,6 +187,8 @@ unsigned long long board_next(const Board *board, const int x, Board *next) * @param next resulting board. * @return flipped discs. 
*/ +#if (MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE) + unsigned long long board_pass_next(const Board *board, const int x, Board *next) { __m128i PO = _mm_shuffle_epi32(_mm_loadu_si128((__m128i *) board), 0x4e); @@ -117,9 +199,23 @@ unsigned long long board_pass_next(const Board *board, const int x, Board *next) return _mm_cvtsi128_si64(flipped); } -#endif -#endif // hasSSE2 +#elif MOVE_GENERATOR == MOVE_GENERATOR_NEON + +unsigned long long board_pass_next(const Board *board, const int x, Board *next) +{ + uint64x2_t OP = vld1q_u64((uint64_t *) board); + uint64x2_t PO = vextq_u64(OP, OP, 1); + uint64x2_t flipped = mm_Flip(PO, x); + + PO = veorq_u64(PO, flipped); + vst1_u64((uint64_t *) &next->player, vget_high_u64(PO)); + vst1_u64((uint64_t *) &next->opponent, vorr_u64(vget_low_u64(PO), vcreate_u64(X_TO_BIT[x]))); + + return vgetq_lane_u64(flipped, 0); +} + +#endif /** * @brief X64 optimized get_moves @@ -191,7 +287,7 @@ unsigned long long get_moves(const unsigned long long P, const unsigned long lon return moves & ~(P|O); // mask with empties } -#elif 0 // 4 CPU +#elif defined(__aarch64__) || defined(_M_ARM64) // 4 CPU unsigned long long get_moves(const unsigned long long P, const unsigned long long O) { @@ -215,18 +311,68 @@ unsigned long long get_moves(const unsigned long long P, const unsigned long lon return moves & ~(P|O); // mask with empties } -#else // __x86_64__ +#elif defined(__ARM_NEON__) // 3 Neon, 1 CPU(32) + +#ifdef hasNeon +#define get_moves_sse get_moves // no dispatch +#endif + +unsigned long long get_moves_sse(unsigned long long P, unsigned long long O) +{ + unsigned int mO, movesL, movesH, flip1, pre1; + uint64x1_t rP, rO; + uint64x2_t PP, OO, MM, flip, pre; + + /* vertical_mirror in PP[1], OO[1] */ mO = (unsigned int) O & 0x7e7e7e7e; + rP = vreinterpret_u64_u8(vrev64_u8(vcreate_u8(P))); flip1 = mO & ((unsigned int) P << 1); + PP = vcombine_u64(vcreate_u64(P), rP); flip1 |= mO & (flip1 << 1); + pre1 = mO & (mO 
<< 1); + rO = vreinterpret_u64_u8(vrev64_u8(vcreate_u8(O))); flip1 |= pre1 & (flip1 << 2); + OO = vcombine_u64(vcreate_u64(O), rO); flip1 |= pre1 & (flip1 << 2); + movesL = flip1 << 1; + + flip = vandq_u64(OO, vshlq_n_u64(PP, 8)); flip1 = mO & ((unsigned int) P >> 1); + flip = vorrq_u64(flip, vandq_u64(OO, vshlq_n_u64(flip, 8))); flip1 |= mO & (flip1 >> 1); + pre = vandq_u64(OO, vshlq_n_u64(OO, 8)); pre1 >>= 1; + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 16))); flip1 |= pre1 & (flip1 >> 2); + MM = vshlq_n_u64(flip, 8); movesL |= flip1 >> 1; + + OO = vandq_u64(OO, vdupq_n_u64(0x7e7e7e7e7e7e7e7e)); mO = (unsigned int) (O >> 32) & 0x7e7e7e7e; + flip = vandq_u64(OO, vshlq_n_u64(PP, 7)); flip1 = mO & ((unsigned int) (P >> 32) << 1); + flip = vorrq_u64(flip, vandq_u64(OO, vshlq_n_u64(flip, 7))); flip1 |= mO & (flip1 << 1); + pre = vandq_u64(OO, vshlq_n_u64(OO, 7)); pre1 = mO & (mO << 1); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 14))); flip1 |= pre1 & (flip1 << 2); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 14))); flip1 |= pre1 & (flip1 << 2); + MM = vorrq_u64(MM, vshlq_n_u64(flip, 7)); movesH = flip1 << 1; + + flip = vandq_u64(OO, vshlq_n_u64(PP, 9)); flip1 = mO & ((unsigned int) (P >> 32) >> 1); + flip = vorrq_u64(flip, vandq_u64(OO, vshlq_n_u64(flip, 9))); flip1 |= mO & (flip1 >> 1); + pre = vandq_u64(OO, vshlq_n_u64(OO, 9)); pre1 >>= 1; + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + flip = vorrq_u64(flip, vandq_u64(pre, vshlq_n_u64(flip, 18))); flip1 |= pre1 & (flip1 >> 2); + MM = vorrq_u64(MM, vshlq_n_u64(flip, 9)); movesH |= flip1 >> 1; + + movesL |= vgetq_lane_u32(MM, 0) | __rev(vgetq_lane_u32(MM, 3)); + movesH |= vgetq_lane_u32(MM, 1) | __rev(vgetq_lane_u32(MM, 2)); + return (movesL | ((unsigned long long) movesH << 32)) & ~(P|O); // mask with empties +} + +#else // 
AVX/x86_64/arm /** - * @brief SSE optimized get_moves for x86 (3 SSE, 1 CPU) + * @brief SSE optimized get_moves for x86 - 3 SSE, 1 CPU(32) * */ -#if defined(hasSSE2) || defined(USE_MSVC_X86) +#if defined(hasSSE2) || defined(USE_MSVC_X86) || defined(ANDROID) + +#ifdef hasSSE2 +#define get_moves_sse get_moves // no dispatch +#endif unsigned long long get_moves_sse(unsigned long long P, unsigned long long O) { unsigned int mO, movesL, movesH, flip1, pre1; __m128i OP, rOP, PP, OO, MM, flip, pre; - const __m128i mask7e = _mm_set1_epi8(0x7e); // vertical_mirror in PP[1], OO[1] OP = _mm_unpacklo_epi64(_mm_cvtsi64_si128(P), _mm_cvtsi64_si128(O)); mO = (unsigned int) O & 0x7e7e7e7eU; @@ -245,7 +391,7 @@ unsigned long long get_moves_sse(unsigned long long P, unsigned long long O) flip = _mm_or_si128(flip, _mm_and_si128(pre, _mm_slli_epi64(flip, 16))); flip1 |= pre1 & (flip1 >> 2); MM = _mm_slli_epi64(flip, 8); movesL |= flip1 >> 1; - OO = _mm_and_si128(OO, mask7e); mO = (unsigned int) (O >> 32) & 0x7e7e7e7eU; + OO = _mm_and_si128(OO, _mm_set1_epi8(0x7e)); mO = (unsigned int) (O >> 32) & 0x7e7e7e7eU; flip = _mm_and_si128(OO, _mm_slli_epi64(PP, 7)); flip1 = mO & ((unsigned int) (P >> 32) << 1); flip = _mm_or_si128(flip, _mm_and_si128(OO, _mm_slli_epi64(flip, 7))); flip1 |= mO & (flip1 << 1); pre = _mm_and_si128(OO, _mm_slli_epi64(OO, 7)); pre1 = mO & (mO << 1); @@ -382,7 +528,6 @@ unsigned long long get_moves_sse(unsigned long long P, unsigned long long O) #endif // hasSSE2 #endif // x86 -#if defined(__x86_64__) || defined(_M_X64) /** * @brief SSE optimized get_stable_edge * @@ -391,7 +536,21 @@ unsigned long long get_moves_sse(unsigned long long P, unsigned long long O) * @return a bitboard with (some of) player's stable discs. 
* */ -static unsigned long long get_stable_edge(const unsigned long long P, const unsigned long long O) +#if defined(__aarch64__) || defined(_M_ARM64) +unsigned long long get_stable_edge(unsigned long long P, unsigned long long O) +{ // compute the exact stable edges (from precomputed tables) + const int16x8_t shiftv = { 0, 1, 2, 3, 4, 5, 6, 7 }; // 16-bit lane i is shifted left by i + uint8x16_t PO = vzipq_u8(vreinterpretq_u8_u64(vdupq_n_u64(O)), vreinterpretq_u8_u64(vdupq_n_u64(P))).val[0]; // interleaved bytes O0,P0,O1,P1,...,O7,P7 + uint16x8_t a1a8 = vshlq_u16(vreinterpretq_u16_u8(vandq_u8(PO, vdupq_n_u8(1))), shiftv); // keep bit 0 of every byte, then shift lane i left by i + uint16x8_t h1h8 = vshlq_u16(vreinterpretq_u16_u8(vshrq_n_u8(PO, 7)), shiftv); // keep bit 7 of every byte (as bit 0), then shift lane i left by i + return edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 0)] // lane 0: first byte of O and of P + | (unsigned long long) edge_stability[vgetq_lane_u16(vreinterpretq_u16_u8(PO), 7)] << 56 // lane 7: last byte of O and of P + | A1_A8[edge_stability[vaddvq_u16(a1a8)]] // lanes summed into a single 16-bit table index + | A1_A8[edge_stability[vaddvq_u16(h1h8)]] << 7; +} + +#elif defined(__x86_64__) || defined(_M_X64) +unsigned long long get_stable_edge(const unsigned long long P, const unsigned long long O) { // compute the exact stable edges (from precomputed tables) unsigned int a1a8po, h1h8po; @@ -407,14 +566,16 @@ static unsigned long long get_stable_edge(const unsigned long long P, const unsi a1a8po = _mm_movemask_epi8(_mm_slli_epi64(PO, 7)); h1h8po = _mm_movemask_epi8(PO); #if 0 // def __BMI2__ // pdep is slow on AMD - stable_edge |= _pdep_u64(edge_stability[a1a8po], 0x0101010101010101ULL) - | _pdep_u64(edge_stability[h1h8po], 0x8080808080808080ULL); + stable_edge |= _pdep_u64(edge_stability[a1a8po], 0x0101010101010101) + | _pdep_u64(edge_stability[h1h8po], 0x8080808080808080); #else stable_edge |= A1_A8[edge_stability[a1a8po]] | (A1_A8[edge_stability[h1h8po]] << 7); #endif return stable_edge; } +#endif // __aarch64__/__x86_64__/_M_X64 +#if defined(HAS_CPU_64) || defined(ANDROID) /** * @brief X64 optimized get_stability * @@ -431,11 +592,11 @@ static unsigned long long get_stable_edge(const unsigned long long P, const unsi int get_stability(const unsigned
long long P, const unsigned long long O) { unsigned long long disc = (P | O); - unsigned long long P_central = (P & 0x007e7e7e7e7e7e00ULL); + unsigned long long P_central = (P & 0x007e7e7e7e7e7e00); unsigned long long l8, stable; __m128i l81, l79, v2_stable, v2_old_stable, v2_P_central; __m256i lr79, v4_disc, v4_stable, v4_full; - const __m128i kff = _mm_set1_epi64x(0xffffffffffffffff); + const __m128i kff = _mm_set1_epi64x(0xffffffffffffffff);; const __m256i shift1897 = _mm256_set_epi64x(7, 9, 8, 1); #if 0 // PCMPEQQ static const V4DI m791 = {{ 0x0402010000804020, 0x2040800000010204, 0x0804020180402010, 0x1020408001020408 }}; // V8SI @@ -513,36 +674,56 @@ int get_stability(const unsigned long long P, const unsigned long long O) return bit_count(_mm_cvtsi128_si64(v2_stable)); } -#else +#else // __AVX2__ -int get_stability(const unsigned long long P, const unsigned long long O) +#if defined(hasNeon) || defined(hasSSE2) +#define get_stability_sse get_stability // no dispatch +#endif + +int get_stability_sse(const unsigned long long P, const unsigned long long O) { unsigned long long disc = (P | O); - unsigned long long P_central = (P & 0x007e7e7e7e7e7e00ULL); + unsigned long long P_central = (P & 0x007e7e7e7e7e7e00); unsigned long long l8, full_h, full_v, full_d7, full_d9, stable; unsigned long long stable_h, stable_v, stable_d7, stable_d9, old_stable; -#if 1 // 1 CPU, 3 SSE +#ifdef __ARM_NEON__ + uint8x8_t l01; + uint64x2_t l79, r79; + const uint64x2_t e790 = vdupq_n_u64(0x007e7e7e7e7e7e00); + const uint64x2_t e791 = vdupq_n_u64(0x00003f3f3f3f3f3f); + const uint64x2_t e792 = vdupq_n_u64(0x0f0f0f0ff0f0f0f0); + + l01 = vcreate_u8(disc); l79 = r79 = vreinterpretq_u64_u8(vcombine_u8(l01, vrev64_u8(l01))); + l01 = vceq_u8(l01, vdup_n_u8(0xff)); l79 = vandq_u64(l79, vornq_u64(vshrq_n_u64(l79, 9), e790)); + full_h = vget_lane_u64(vreinterpret_u64_u8(l01), 0); + r79 = vandq_u64(r79, vornq_u64(vshlq_n_u64(r79, 9), e790)); + l8 = disc; l79 = vbicq_u64(l79, vbicq_u64(e791, 
vshrq_n_u64(l79, 18))); // De Morgan + l8 &= (l8 >> 8) | (l8 << 56); r79 = vbicq_u64(r79, vshlq_n_u64(vbicq_u64(e791, r79), 18)); + l8 &= (l8 >> 16) | (l8 << 48); l79 = vandq_u64(vandq_u64(l79, r79), vorrq_u64(e792, vsliq_n_u64(vshrq_n_u64(l79, 36), r79, 36))); + l8 &= (l8 >> 32) | (l8 << 32); full_d9 = vgetq_lane_u64(l79, 0); + full_v = l8; full_d7 = vertical_mirror(vgetq_lane_u64(l79, 1)); + +#elif 1 // 1 CPU, 3 SSE __m128i l01, l79, r79; // full lines const __m128i kff = _mm_set1_epi64x(0xffffffffffffffff); const __m128i edge = _mm_set1_epi64x(0xff818181818181ff); - const __m128i e791 = _mm_set1_epi64x(0xffffc0c0c0c0c0c0); - const __m128i e792 = _mm_set1_epi64x(0x030303030303ffff); - const __m128i e793 = _mm_set1_epi64x(0x0f0f0f0ff0f0f0f0); + const __m128i e791 = _mm_set1_epi64x(0x00003f3f3f3f3f3f); + const __m128i e792 = _mm_set1_epi64x(0x0f0f0f0ff0f0f0f0); l01 = l79 = _mm_cvtsi64_si128(disc); r79 = _mm_cvtsi64_si128(vertical_mirror(disc)); l01 = _mm_cmpeq_epi8(kff, l01); l79 = r79 = _mm_unpacklo_epi64(l79, r79); full_h = _mm_cvtsi128_si64(l01); l79 = _mm_and_si128(l79, _mm_or_si128(edge, _mm_srli_epi64(l79, 9))); r79 = _mm_and_si128(r79, _mm_or_si128(edge, _mm_slli_epi64(r79, 9))); - l8 = disc; l79 = _mm_and_si128(l79, _mm_or_si128(e791, _mm_srli_epi64(l79, 18))); - l8 &= (l8 >> 8) | (l8 << 56); r79 = _mm_and_si128(r79, _mm_or_si128(e792, _mm_slli_epi64(r79, 18))); - l8 &= (l8 >> 16) | (l8 << 48); l79 = _mm_and_si128(_mm_and_si128(l79, r79), _mm_or_si128(e793, _mm_or_si128(_mm_srli_epi64(l79, 36), _mm_slli_epi64(r79, 36)))); + l8 = disc; l79 = _mm_andnot_si128(_mm_andnot_si128(_mm_srli_epi64(l79, 18), e791), l79); // De Morgan + l8 &= (l8 >> 8) | (l8 << 56); r79 = _mm_andnot_si128(_mm_slli_epi64(_mm_andnot_si128(r79, e791), 18), r79); + l8 &= (l8 >> 16) | (l8 << 48); l79 = _mm_and_si128(_mm_and_si128(l79, r79), _mm_or_si128(e792, _mm_or_si128(_mm_srli_epi64(l79, 36), _mm_slli_epi64(r79, 36)))); l8 &= (l8 >> 32) | (l8 << 32); full_d9 = _mm_cvtsi128_si64(l79); 
full_v = l8; full_d7 = vertical_mirror(_mm_cvtsi128_si64(_mm_unpackhi_epi64(l79, l79))); #else // 4 CPU unsigned long long l1, l7, l9, r7, r9; // full lines - static const unsigned long long edge = 0xff818181818181ffULL; - static const unsigned long long k01 = 0x0101010101010101ULL; + static const unsigned long long edge = 0xff818181818181ff; + static const unsigned long long k01 = 0x0101010101010101; static const unsigned long long e7[] = { 0xffff030303030303, 0xc0c0c0c0c0c0ffff, 0xffffffff0f0f0f0f, 0xf0f0f0f0ffffffff }; static const unsigned long long e9[] = { 0xffffc0c0c0c0c0c0, 0x030303030303ffff, 0x0f0f0f0ff0f0f0f0 }; @@ -582,7 +763,7 @@ int get_stability(const unsigned long long P, const unsigned long long O) } #endif // __AVX2__ -#endif // __x86_64__ +#endif // HAS_CPU_64/ANDROID /** * @brief SSE translation of board_get_hash_code. diff --git a/src/const.h b/src/const.h index ac88661..a53099e 100644 --- a/src/const.h +++ b/src/const.h @@ -45,9 +45,6 @@ enum { OFF_SIDE }; -extern const unsigned long long X_TO_BIT[]; -extern const unsigned long long NEIGHBOUR[]; - /** infinite score: a huge value unreachable as a score and fitting in a char */ #define SCORE_INF 127 diff --git a/src/count_last_flip_neon.c b/src/count_last_flip_neon.c new file mode 100644 index 0000000..7b8ee22 --- /dev/null +++ b/src/count_last_flip_neon.c @@ -0,0 +1,212 @@ +/** + * @file count_last_flip_neon.c + * + * + * A function is provided to count the number of flipped discs of the last move. + * + * The basic principle is to read into an array a precomputed result. Doing + * this is easy for a single line ; as we can use arrays of the form: + * - COUNT_FLIP[square where we play][8-bits disc pattern]. + * The problem is thus to convert any line of a 64-bits disc pattern into an + * 8-bits disc pattern. A fast way to do this is to select the right line, + * with a bit-mask, to gather the masked-bits into a continuous set by the + * neon vaddvq_u16 instruction.
+ * Once we get our 8-bits disc patterns, we directly get the number of + * flipped discs from the precomputed array, and add them from each flipping + * lines. + * For optimization purpose, the value returned is twice the number of flipped + * disc, to facilitate the computation of disc difference. + * + * @date 1998 - 2020 + * @author Richard Delorme + * @author Toshihiko Okuhara + * @version 4.4 + * + */ + +#include + +/** precomputed count flip array */ +const unsigned char COUNT_FLIP[8][256] = { + { + 0, 0, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 12, 12, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 10, 10, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + 8, 8, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, 6, 6, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0, 2, 2, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 10, 10, 10, 10, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + 8, 8, 8, 8, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 
+ 6, 6, 6, 6, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, + }, + { + 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 8, 10, 8, 8, 8, 10, 8, 8, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 6, 8, 6, 6, 6, 8, 6, 6, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + 4, 6, 4, 4, 4, 6, 4, 4, 0, 2, 0, 0, 0, 2, 0, 0, 2, 4, 2, 2, 2, 4, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, + }, + { + 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 6, 10, 8, 8, 6, 6, 6, 6, 6, 10, 8, 8, 6, 6, 6, 6, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 4, 8, 6, 6, 4, 4, 4, 4, 4, 8, 6, 6, 4, 4, 4, 4, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + 2, 6, 4, 4, 2, 2, 2, 2, 2, 6, 4, 4, 2, 2, 2, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 4, 2, 2, 0, 0, 0, 0, + }, + { + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 
4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 4, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 6, 6, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 10, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 12, 10, 10, 8, 8, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, +}; + +/* bit masks for diagonal lines (interleaved) */ +const uint64x2_t mask_dvhd[64][2] = { + {{ 0x000000000000ff01, 0x0000000000000000 }, { 0x0801040102010101, 0x8001400120011001 }}, + {{ 0x000000000001ff02, 0x0000000000000000 }, { 0x1002080204020202, 0x0002800240022002 }}, + {{ 0x000000010002ff04, 0x0000000000000000 }, { 0x2004100408040404, 0x0004000480044004 }}, + {{ 0x000100020004ff08, 0x0000000000000000 }, { 0x4008200810080808, 0x0008000800088008 }}, + {{ 0x000200040008ff10, 0x0000000000000001 }, { 0x8010401020101010, 0x0010001000100010 }}, + {{ 0x000400080010ff20, 0x0000000000010002 }, { 0x0020802040202020, 0x0020002000200020 }}, + {{ 0x000800100020ff40, 0x0000000100020004 }, { 0x0040004080404040, 0x0040004000400040 }}, + {{ 0x001000200040ff80, 0x0001000200040008 }, { 0x0080008000808080, 0x0080008000800080 }}, + {{ 0x00000000ff010002, 0x0000000000000000 }, { 0x0401020101010001, 0x4001200110010801 }}, + {{ 0x00000001ff020004, 0x0000000000000000 }, { 0x0802040202020102, 0x8002400220021002 }}, + {{ 0x00010002ff040008, 0x0000000000000000 }, { 0x1004080404040204, 0x0004800440042004 }}, + {{ 
0x00020004ff080010, 0x0000000000000001 }, { 0x2008100808080408, 0x0008000880084008 }}, + {{ 0x00040008ff100020, 0x0000000000010002 }, { 0x4010201010100810, 0x0010001000108010 }}, + {{ 0x00080010ff200040, 0x0000000100020004 }, { 0x8020402020201020, 0x0020002000200020 }}, + {{ 0x00100020ff400080, 0x0001000200040008 }, { 0x0040804040402040, 0x0040004000400040 }}, + {{ 0x00200040ff800000, 0x0002000400080010 }, { 0x0080008080804080, 0x0080008000800080 }}, + {{ 0x0000ff0100020004, 0x0000000000000000 }, { 0x0201010100010001, 0x2001100108010401 }}, + {{ 0x0001ff0200040008, 0x0000000000000000 }, { 0x0402020201020002, 0x4002200210020802 }}, + {{ 0x0002ff0400080010, 0x0000000000000001 }, { 0x0804040402040104, 0x8004400420041004 }}, + {{ 0x0004ff0800100020, 0x0000000000010002 }, { 0x1008080804080208, 0x0008800840082008 }}, + {{ 0x0008ff1000200040, 0x0000000100020004 }, { 0x2010101008100410, 0x0010001080104010 }}, + {{ 0x0010ff2000400080, 0x0001000200040008 }, { 0x4020202010200820, 0x0020002000208020 }}, + {{ 0x0020ff4000800000, 0x0002000400080010 }, { 0x8040404020401040, 0x0040004000400040 }}, + {{ 0x0040ff8000000000, 0x0004000800100020 }, { 0x0080808040802080, 0x0080008000800080 }}, + {{ 0xff01000200040008, 0x0000000000000000 }, { 0x0101000100010001, 0x1001080104010201 }}, + {{ 0xff02000400080010, 0x0000000000000001 }, { 0x0202010200020002, 0x2002100208020402 }}, + {{ 0xff04000800100020, 0x0000000000010002 }, { 0x0404020401040004, 0x4004200410040804 }}, + {{ 0xff08001000200040, 0x0000000100020004 }, { 0x0808040802080108, 0x8008400820081008 }}, + {{ 0xff10002000400080, 0x0001000200040008 }, { 0x1010081004100210, 0x0010801040102010 }}, + {{ 0xff20004000800000, 0x0002000400080010 }, { 0x2020102008200420, 0x0020002080204020 }}, + {{ 0xff40008000000000, 0x0004000800100020 }, { 0x4040204010400840, 0x0040004000408040 }}, + {{ 0xff80000000000000, 0x0008001000200040 }, { 0x8080408020801080, 0x0080008000800080 }}, + {{ 0x0002000400080010, 0x000000000000ff01 }, { 0x0001000100010001, 
0x0801040102010101 }}, + {{ 0x0004000800100020, 0x000000000001ff02 }, { 0x0102000200020002, 0x1002080204020202 }}, + {{ 0x0008001000200040, 0x000000010002ff04 }, { 0x0204010400040004, 0x2004100408040404 }}, + {{ 0x0010002000400080, 0x000100020004ff08 }, { 0x0408020801080008, 0x4008200810080808 }}, + {{ 0x0020004000800000, 0x000200040008ff10 }, { 0x0810041002100110, 0x8010401020101010 }}, + {{ 0x0040008000000000, 0x000400080010ff20 }, { 0x1020082004200220, 0x0020802040202020 }}, + {{ 0x0080000000000000, 0x000800100020ff40 }, { 0x2040104008400440, 0x0040004080404040 }}, + {{ 0x0000000000000000, 0x001000200040ff80 }, { 0x4080208010800880, 0x0080008000808080 }}, + {{ 0x0004000800100020, 0x00000000ff010002 }, { 0x0001000100010001, 0x0401020101010001 }}, + {{ 0x0008001000200040, 0x00000001ff020004 }, { 0x0002000200020002, 0x0802040202020102 }}, + {{ 0x0010002000400080, 0x00010002ff040008 }, { 0x0104000400040004, 0x1004080404040204 }}, + {{ 0x0020004000800000, 0x00020004ff080010 }, { 0x0208010800080008, 0x2008100808080408 }}, + {{ 0x0040008000000000, 0x00040008ff100020 }, { 0x0410021001100010, 0x4010201010100810 }}, + {{ 0x0080000000000000, 0x00080010ff200040 }, { 0x0820042002200120, 0x8020402020201020 }}, + {{ 0x0000000000000000, 0x00100020ff400080 }, { 0x1040084004400240, 0x0040804040402040 }}, + {{ 0x0000000000000000, 0x00200040ff800000 }, { 0x2080108008800480, 0x0080008080804080 }}, + {{ 0x0008001000200040, 0x0000ff0100020004 }, { 0x0001000100010001, 0x0201010100010001 }}, + {{ 0x0010002000400080, 0x0001ff0200040008 }, { 0x0002000200020002, 0x0402020201020002 }}, + {{ 0x0020004000800000, 0x0002ff0400080010 }, { 0x0004000400040004, 0x0804040402040104 }}, + {{ 0x0040008000000000, 0x0004ff0800100020 }, { 0x0108000800080008, 0x1008080804080208 }}, + {{ 0x0080000000000000, 0x0008ff1000200040 }, { 0x0210011000100010, 0x2010101008100410 }}, + {{ 0x0000000000000000, 0x0010ff2000400080 }, { 0x0420022001200020, 0x4020202010200820 }}, + {{ 0x0000000000000000, 0x0020ff4000800000 
}, { 0x0840044002400140, 0x8040404020401040 }}, + {{ 0x0000000000000000, 0x0040ff8000000000 }, { 0x1080088004800280, 0x0080808040802080 }}, + {{ 0x0010002000400080, 0xff01000200040008 }, { 0x0001000100010001, 0x0101000100010001 }}, + {{ 0x0020004000800000, 0xff02000400080010 }, { 0x0002000200020002, 0x0202010200020002 }}, + {{ 0x0040008000000000, 0xff04000800100020 }, { 0x0004000400040004, 0x0404020401040004 }}, + {{ 0x0080000000000000, 0xff08001000200040 }, { 0x0008000800080008, 0x0808040802080108 }}, + {{ 0x0000000000000000, 0xff10002000400080 }, { 0x0110001000100010, 0x1010081004100210 }}, + {{ 0x0000000000000000, 0xff20004000800000 }, { 0x0220012000200020, 0x2020102008200420 }}, + {{ 0x0000000000000000, 0xff40008000000000 }, { 0x0440024001400040, 0x4040204010400840 }}, + {{ 0x0000000000000000, 0xff80000000000000 }, { 0x0880048002800180, 0x8080408020801080 }} +}; + +/** + * Count last flipped discs when playing on the last empty. + * + * @param pos the last empty square. + * @param P player's disc pattern. + * @return flipped disc count. 
+ */ + +#ifndef HAS_CPU_64 +#define vaddvq_u16(x) vget_lane_u64(vpaddl_u32(vpaddl_u16(vadd_u16(vget_high_u16(x), vget_low_u16(x)))), 0) +#endif + +int last_flip(int pos, unsigned long long P) +{ + unsigned int n_flips, t; + const unsigned char *COUNT_FLIP_X = COUNT_FLIP[pos & 7]; /* table row selected by the low 3 bits of pos */ + const unsigned char *COUNT_FLIP_Y = COUNT_FLIP[pos >> 3]; /* table row selected by the remaining high bits of pos */ + uint8x16_t PP; + const uint8x16_t dmask = { 1, 1, 2, 2, 4, 4, 8, 8, 16, 16, 32, 32, 64, 64, 128, 128 }; /* byte pair i contributes bit i */ + + PP = vreinterpretq_u8_u64(vdupq_n_u64(P)); + PP = vzipq_u8(PP, PP).val[0]; /* every byte of P duplicated: P0,P0,P1,P1,...,P7,P7 */ + t = vaddvq_u16(vreinterpretq_u16_u64(vandq_u64(vreinterpretq_u64_u8(PP), mask_dvhd[pos][0]))); /* sum of the masked 16-bit lanes: two 8-bit line patterns, one per byte of t */ + n_flips = COUNT_FLIP_X[t >> 8]; + n_flips += COUNT_FLIP_X[(unsigned char) t]; + t = vaddvq_u16(vreinterpretq_u16_u8(vandq_u8(vtstq_u8(PP, vreinterpretq_u8_u64(mask_dvhd[pos][1])), dmask))); /* vtstq_u8 yields 0xff where a masked bit is set; dmask then maps byte pair i to bit i */ + n_flips += COUNT_FLIP_Y[t >> 8]; + n_flips += COUNT_FLIP_Y[(unsigned char) t]; + + return n_flips; /* COUNT_FLIP entries are pre-doubled, so this is twice the flipped disc count */ +} + diff --git a/src/endgame.c b/src/endgame.c index a093097..ac5f95e 100644 --- a/src/endgame.c +++ b/src/endgame.c @@ -21,7 +21,11 @@ #if LAST_FLIP_COUNTER == COUNT_LAST_FLIP_CARRY #include "count_last_flip_carry_64.c" #elif LAST_FLIP_COUNTER == COUNT_LAST_FLIP_SSE - #include "count_last_flip_sse.c" + #ifdef hasSSE2 + #include "count_last_flip_sse.c" + #else + #include "count_last_flip_neon.c" + #endif #elif LAST_FLIP_COUNTER == COUNT_LAST_FLIP_BITSCAN #include "count_last_flip_bitscan.c" #elif LAST_FLIP_COUNTER == COUNT_LAST_FLIP_PLAIN @@ -36,6 +40,8 @@ #if ((MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE)) && (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_SSE) #include "endgame_sse.c" // vectorcall version +#elif (MOVE_GENERATOR == MOVE_GENERATOR_NEON) && (LAST_FLIP_COUNTER == COUNT_LAST_FLIP_SSE) + #include "endgame_neon.c" #endif /** @@ -122,7 +128,7 @@ int board_score_1(const Board *board, const int beta, const int x) return score; } -#if !(((MOVE_GENERATOR == MOVE_GENERATOR_AVX) || (MOVE_GENERATOR == MOVE_GENERATOR_SSE)) &&
(LAST_FLIP_COUNTER == COUNT_LAST_FLIP_SSE)) +#if ((MOVE_GENERATOR != MOVE_GENERATOR_AVX) && (MOVE_GENERATOR != MOVE_GENERATOR_SSE) && (MOVE_GENERATOR != MOVE_GENERATOR_NEON)) || (LAST_FLIP_COUNTER != COUNT_LAST_FLIP_SSE) /** * @brief Get the final score. * diff --git a/src/endgame_neon.c b/src/endgame_neon.c new file mode 100644 index 0000000..bdd5a6c --- /dev/null +++ b/src/endgame_neon.c @@ -0,0 +1,386 @@ +/** + * @file endgame_neon.c + * + * + * Arm Neon optimized version of endgame.c for the last four empties. + * + * Bitboard and empty list are kept in Neon registers. + * + * @date 1998 - 2020 + * @author Richard Delorme + * @author Toshihiko Okuhara + * @version 4.4 + * + */ + +#include "bit_intrinsics.h" +#include "settings.h" +#include "search.h" +#include + +#define TESTZ_FLIP(X) (!vgetq_lane_u64((X), 0)) + +#ifndef HAS_CPU_64 +#define vaddv_u8(x) vget_lane_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(x))), 0) +#define vaddvq_u16(x) vget_lane_u64(vpaddl_u32(vpaddl_u16(vadd_u16(vget_high_u16(x), vget_low_u16(x)))), 0) +#endif + +// in count_last_flip_neon.c +extern const unsigned char COUNT_FLIP[8][256]; +extern const uint64x2_t mask_dvhd[64][2]; + +/** + * @brief Compute a board resulting of a move played on a previous board. + * + * @param OP board to play the move on (lane 0: side to move, lane 1: the other side). + * @param x move to play. + * @param flipped discs flipped by the move; XORed into both lanes (presumably the same pattern in each lane - confirm against mm_Flip). + * @return resulting board with the two lanes swapped and square x added to the mover's discs (now lane 1). + */ +static inline uint64x2_t board_next_neon(uint64x2_t OP, int x, uint64x2_t flipped) +{ + OP = veorq_u64(OP, flipped); + return vcombine_u64(vget_high_u64(OP), vorr_u64(vget_low_u64(OP), vcreate_u64(X_TO_BIT[x]))); +} + +/** + * @brief Get the final score. + * + * Get the final score, when no move can be made. + * + * @param OP Board. + * @param n_empties Number of empty squares remaining on the board. + * @return The final score, as a disc difference.
+ */ +static int board_solve_neon(uint64x2_t OP, int n_empties) +{ + int score = vaddv_u8(vcnt_u8(vreinterpret_u8_u64(vget_low_u64(OP)))) * 2 - SCORE_MAX; // final score if the opponent wins (the empties are credited to the winner) + int diff = score + n_empties; // = n_discs_p - (64 - n_empties - n_discs_p) + + SEARCH_STATS(++statistics.n_search_solve); + + if (diff >= 0) + score = diff; + if (diff > 0) + score += n_empties; + return score; +} + +/** + * @brief Get the final score. + * + * Get the final score, when 1 empty square remains. + * The following code has been adapted from Zebra by Gunnar Andersson. + * + * @param OP Board to evaluate. + * @param beta Beta bound. + * @param pos Last empty square to play. + * @return The final opponent score, as a disc difference. + */ +static int board_score_sse_1(uint64x2_t OP, const int beta, const int pos) +{ + int score, score2; + unsigned int n_flips, t0, t1, m; + const unsigned char *COUNT_FLIP_X = COUNT_FLIP[pos & 7]; + const unsigned char *COUNT_FLIP_Y = COUNT_FLIP[pos >> 3]; + uint8x16_t PP = vzipq_u8(vreinterpretq_u8_u64(OP), vreinterpretq_u8_u64(OP)).val[0]; + const uint8x16_t dmask = { 1, 1, 2, 2, 4, 4, 8, 8, 16, 16, 32, 32, 64, 64, 128, 128 }; + static const unsigned short o_mask[64] = { + 0xff01, 0x7f03, 0x3f07, 0x1f0f, 0x0f1f, 0x073f, 0x037f, 0x01ff, + 0xfe03, 0xff07, 0x7f0f, 0x3f1f, 0x1f3f, 0x0f7f, 0x07ff, 0x03fe, + 0xfc07, 0xfe0f, 0xff1f, 0x7f3f, 0x3f7f, 0x1fff, 0x0ffe, 0x07fc, + 0xf80f, 0xfc1f, 0xfe3f, 0xff7f, 0x7fff, 0x3ffe, 0x1ffc, 0x0ff8, + 0xf01f, 0xf83f, 0xfc7f, 0xfeff, 0xfffe, 0x7ffc, 0x3ff8, 0x1ff0, + 0xe03f, 0xf07f, 0xf8ff, 0xfcfe, 0xfefc, 0xfff8, 0x7ff0, 0x3fe0, + 0xc07f, 0xe0ff, 0xf0fe, 0xf8fc, 0xfcf8, 0xfef0, 0xffe0, 0x7fc0, + 0x80ff, 0xc0fe, 0xe0fc, 0xf0f8, 0xf8f0, 0xfce0, 0xfec0, 0xff80 + }; + + score = SCORE_MAX - 2 - 2 * vaddv_u8(vcnt_u8(vreinterpret_u8_u64(vget_low_u64(OP)))); // 2 * bit_count(O) - SCORE_MAX + + // n_flips = last_flip(pos, P); + t0 = vaddvq_u16(vreinterpretq_u16_u64(vandq_u64(vreinterpretq_u64_u8(PP), 
mask_dvhd[pos][0]))); + n_flips = COUNT_FLIP_X[t0 >> 8]; + n_flips += COUNT_FLIP_X[(unsigned char) t0]; + t1 = vaddvq_u16(vreinterpretq_u16_u8(vandq_u8(vtstq_u8(PP, vreinterpretq_u8_u64(mask_dvhd[pos][1])), dmask))); + n_flips += COUNT_FLIP_Y[t1 >> 8]; + n_flips += COUNT_FLIP_Y[(unsigned char) t1]; + score -= n_flips; + + if (n_flips == 0) { + score2 = score + 2; // score if the last empty goes to the player instead + if (score >= 0) + score = score2; + + if (score < beta) { // lazy cut-off: only try the other side's move when the score is still below beta + // n_flips = last_flip(pos, O); + m = o_mask[pos]; // valid diagonal bits + n_flips = COUNT_FLIP_X[(t0 >> 8) ^ 0xff]; + n_flips += COUNT_FLIP_X[(unsigned char) (t0 ^ m)]; + n_flips += COUNT_FLIP_Y[(t1 ^ m) >> 8]; + n_flips += COUNT_FLIP_Y[(unsigned char) ~t1]; + + if (n_flips != 0) + score = score2 + n_flips; + } + } + + return score; +} + +/** + * @brief Get the final score. + * + * Get the final score, when 2 empty squares remain. + * + * @param OP The board to evaluate. + * @param alpha Alpha bound. + * @param n_nodes Node counter. + * @param empties Packed empty square coordinates (byte lanes 0 and 1 are read). + * @return The final score, as a disc difference.
+ */ +static int board_solve_neon_2(uint64x2_t OP, int alpha, volatile unsigned long long *n_nodes, uint8x8_t empties) +{ + uint64x2_t flipped, PO; + int score, bestscore, nodes; + int x1 = vget_lane_u8(empties, 1); + int x2 = vget_lane_u8(empties, 0); + unsigned long long bb; + // const int beta = alpha + 1; + + SEARCH_STATS(++statistics.n_board_solve_2); + + bb = vgetq_lane_u64(OP, 1); // opponent's discs (lane 1) + if ((NEIGHBOUR[x1] & bb) && !TESTZ_FLIP(flipped = mm_Flip(OP, x1))) { + bestscore = board_score_sse_1(board_next_neon(OP, x1, flipped), alpha + 1, x2); + nodes = 2; + + if ((bestscore <= alpha) && (NEIGHBOUR[x2] & bb) && !TESTZ_FLIP(flipped = mm_Flip(OP, x2))) { + score = board_score_sse_1(board_next_neon(OP, x2, flipped), alpha + 1, x1); + if (score > bestscore) bestscore = score; + nodes = 3; + } + + } else if ((NEIGHBOUR[x2] & bb) && !TESTZ_FLIP(flipped = mm_Flip(OP, x2))) { + bestscore = board_score_sse_1(board_next_neon(OP, x2, flipped), alpha + 1, x1); + nodes = 2; + + } else { // pass: the player has no move, try the opponent's moves + bb = vgetq_lane_u64(OP, 0); // player's discs (lane 0) + PO = vextq_u64(OP, OP, 1); // same board with the two lanes swapped + if ((NEIGHBOUR[x1] & bb) && !TESTZ_FLIP(flipped = mm_Flip(PO, x1))) { + bestscore = -board_score_sse_1(board_next_neon(PO, x1, flipped), -alpha, x2); + nodes = 2; + + if ((bestscore > alpha) && (NEIGHBOUR[x2] & bb) && !TESTZ_FLIP(flipped = mm_Flip(PO, x2))) { + score = -board_score_sse_1(board_next_neon(PO, x2, flipped), -alpha, x1); + if (score < bestscore) bestscore = score; + nodes = 3; + } + + } else if ((NEIGHBOUR[x2] & bb) && !TESTZ_FLIP(flipped = mm_Flip(PO, x2))) { + bestscore = -board_score_sse_1(board_next_neon(PO, x2, flipped), -alpha, x1); + nodes = 2; + + } else { // gameover: neither side can move + bestscore = board_solve_neon(OP, 2); + nodes = 1; + } + } + + SEARCH_UPDATE_2EMPTIES_NODES(*n_nodes += nodes;) + assert(SCORE_MIN <= bestscore && bestscore <= SCORE_MAX); + assert((bestscore & 1) == 0); + return bestscore; +} + +/** + * @brief Get the final score. + * + * Get the final score, when 3 empty squares remain.
+ * + * @param OP The board to evaluate. + * @param alpha Alpha bound. + * @param n_nodes Node counter. + * @param empties Packed empty square coordinates + * (byte lanes 0, 1 and 2 are read). + * @return The final score, as a disc difference. + */ +static int search_solve_sse_3(uint64x2_t OP, int alpha, volatile unsigned long long *n_nodes, uint8x8_t empties) +{ + uint64x2_t flipped, PO; + int score, bestscore, x; + unsigned long long bb; + // const int beta = alpha + 1; + + SEARCH_STATS(++statistics.n_search_solve_3); + SEARCH_UPDATE_INTERNAL_NODES(*n_nodes); + + // best move alphabeta search + bestscore = -SCORE_INF; + bb = vgetq_lane_u64(OP, 1); // opponent's discs (lane 1) + x = vget_lane_u8(empties, 2); + if ((NEIGHBOUR[x] & bb) && !TESTZ_FLIP(flipped = mm_Flip(OP, x))) { + bestscore = -board_solve_neon_2(board_next_neon(OP, x, flipped), -(alpha + 1), n_nodes, empties); + if (bestscore > alpha) return bestscore; + } + + x = vget_lane_u8(empties, 1); + if ((NEIGHBOUR[x] & bb) && !TESTZ_FLIP(flipped = mm_Flip(OP, x))) { + score = -board_solve_neon_2(board_next_neon(OP, x, flipped), -(alpha + 1), n_nodes, vuzp_u8(empties, empties).val[0]); + if (score > alpha) return score; + else if (score > bestscore) bestscore = score; + } + + x = vget_lane_u8(empties, 0); + if ((NEIGHBOUR[x] & bb) && !TESTZ_FLIP(flipped = mm_Flip(OP, x))) { + score = -board_solve_neon_2(board_next_neon(OP, x, flipped), -(alpha + 1), n_nodes, vext_u8(empties, empties, 1)); + if (score > bestscore) bestscore = score; + } + + else if (bestscore == -SCORE_INF) { // pass ? 
+ // best move alphabeta search + bestscore = SCORE_INF; + bb = vgetq_lane_u64(OP, 0); // player's discs (lane 0) + PO = vextq_u64(OP, OP, 1); + x = vget_lane_u8(empties, 2); + if ((NEIGHBOUR[x] & bb) && !TESTZ_FLIP(flipped = mm_Flip(PO, x))) { + bestscore = board_solve_neon_2(board_next_neon(PO, x, flipped), alpha, n_nodes, empties); + if (bestscore <= alpha) return bestscore; + } + + x = vget_lane_u8(empties, 1); + if ((NEIGHBOUR[x] & bb) && !TESTZ_FLIP(flipped = mm_Flip(PO, x))) { + score = board_solve_neon_2(board_next_neon(PO, x, flipped), alpha, n_nodes, vuzp_u8(empties, empties).val[0]); + if (score <= alpha) return score; + else if (score < bestscore) bestscore = score; + } + + x = vget_lane_u8(empties, 0); + if ((NEIGHBOUR[x] & bb) && !TESTZ_FLIP(flipped = mm_Flip(PO, x))) { + score = board_solve_neon_2(board_next_neon(PO, x, flipped), alpha, n_nodes, vext_u8(empties, empties, 1)); + if (score < bestscore) bestscore = score; + } + + else if (bestscore == SCORE_INF) // gameover: neither side can move + bestscore = board_solve_neon(OP, 3); + } + + assert(SCORE_MIN <= bestscore && bestscore <= SCORE_MAX); + return bestscore; +} + +/** + * @brief Get the final score. + * + * Get the final score, when 4 empty squares remain. + * + * @param search Search position. + * @param alpha Upper score value. + * @return The final score, as a disc difference.
+ */ + +int search_solve_4(Search *search, const int alpha) +{ + uint64x2_t OP, flipped; + uint8x16_t empties_series; // B15:4th, B11:3rd, B7:2nd, B3:1st, lower 3 bytes for 3 empties + uint32x4_t shuf; + int x1, x2, x3, x4, paritysort, score, bestscore; + unsigned long long opp; + // const int beta = alpha + 1; + static const unsigned char parity_case[64] = { /* x4x3x2x1 = */ + /*0000*/ 0, /*0001*/ 0, /*0010*/ 1, /*0011*/ 9, /*0100*/ 2, /*0101*/ 10, /*0110*/ 11, /*0111*/ 3, + /*0002*/ 0, /*0003*/ 0, /*0012*/ 0, /*0013*/ 0, /*0102*/ 4, /*0103*/ 4, /*0112*/ 5, /*0113*/ 5, + /*0020*/ 1, /*0021*/ 0, /*0030*/ 1, /*0031*/ 0, /*0120*/ 6, /*0121*/ 7, /*0130*/ 6, /*0131*/ 7, + /*0022*/ 9, /*0023*/ 0, /*0032*/ 0, /*0033*/ 9, /*0122*/ 8, /*0123*/ 0, /*0132*/ 0, /*0133*/ 8, + /*0200*/ 2, /*0201*/ 4, /*0210*/ 6, /*0211*/ 8, /*0300*/ 2, /*0301*/ 4, /*0310*/ 6, /*0311*/ 8, + /*0202*/ 10, /*0203*/ 4, /*0212*/ 7, /*0213*/ 0, /*0302*/ 4, /*0303*/ 10, /*0312*/ 0, /*0313*/ 7, + /*0220*/ 11, /*0221*/ 5, /*0230*/ 6, /*0231*/ 0, /*0320*/ 6, /*0321*/ 0, /*0330*/ 11, /*0331*/ 5, + /*0222*/ 3, /*0223*/ 5, /*0232*/ 7, /*0233*/ 8, /*0322*/ 8, /*0323*/ 7, /*0332*/ 5, /*0333*/ 3 + }; + static const uint32x4_t shuf_mask[] = { + { 0x03020100, 0x02030100, 0x01030200, 0x00030201 }, // 0: 1(x1) 3(x2 x3 x4), 1(x1) 1(x2) 2(x3 x4), 1 1 1 1, 4 + { 0x03020100, 0x02030100, 0x01020300, 0x00020301 }, // 1: 1(x2) 3(x1 x3 x4) + { 0x03010200, 0x02010300, 0x01030200, 0x00010302 }, // 2: 1(x3) 3(x1 x2 x4) + { 0x03000201, 0x02000301, 0x01000302, 0x00030201 }, // 3: 1(x4) 3(x1 x2 x3) + { 0x03010200, 0x01030200, 0x02030100, 0x00030201 }, // 4: 1(x1) 1(x3) 2(x2 x4) x4x1x2x3-x2x1x3x4-x3x1x2x4-x1x3x2x4 + { 0x03000201, 0x00030201, 0x02030100, 0x01030200 }, // 5: 1(x1) 1(x4) 2(x2 x3) x3x1x2x4-x2x1x3x4-x4x1x2x3-x1x4x2x3 + { 0x02010300, 0x01020300, 0x03020100, 0x00030201 }, // 6: 1(x2) 1(x3) 2(x1 x4) x4x1x2x3-x1x2x3x4-x3x2x1x4-x2x3x1x4 + { 0x02000301, 0x00020301, 0x03020100, 0x01030200 }, // 7: 1(x2) 1(x4) 2(x1 x3) 
x3x1x2x4-x1x2x3x4-x4x2x1x3-x2x4x1x3 + { 0x01000302, 0x00010302, 0x03020100, 0x02030100 }, // 8: 1(x3) 1(x4) 2(x1 x2) x2x1x3x4-x1x2x3x4-x4x3x1x2-x3x4x1x2 + { 0x03020100, 0x02030100, 0x01000302, 0x00010302 }, // 9: 2(x1 x2) 2(x3 x4) x4x3x1x2-x3x4x1x2-x2x1x3x4-x1x2x3x4 + { 0x03010200, 0x02000301, 0x01030200, 0x00020301 }, // 10: 2(x1 x3) 2(x2 x4) x4x2x1x3-x3x1x2x4-x2x4x1x3-x1x3x2x4 + { 0x03000201, 0x02010300, 0x01020300, 0x00030201 } // 11: 2(x1 x4) 2(x2 x3) x4x1x2x3-x3x2x1x4-x2x3x1x4-x1x4x2x3 + }; + + SEARCH_STATS(++statistics.n_search_solve_4); + SEARCH_UPDATE_INTERNAL_NODES(search->n_nodes); + + // stability cutoff + if (search_SC_NWS(search, alpha, &score)) return score; + + OP = vld1q_u64((uint64_t *) &search->board); + x1 = search->empties[NOMOVE].next; + x2 = search->empties[x1].next; + x3 = search->empties[x2].next; + x4 = search->empties[x3].next; + + // parity based move sorting. + // The following hole sizes are possible: + // 4 - 1 3 - 2 2 - 1 1 2 - 1 1 1 1 + // Only the 1 1 2 case needs move sorting on this ply. 
+ empties_series = vreinterpretq_u8_u32(vdupq_n_u32((x1 << 24) | (x2 << 16) | (x3 << 8) | x4)); + paritysort = parity_case[((x3 ^ x4) & 0x24) + (((x2 ^ x4) & 0x24) >> 1) + (((x1 ^ x4) & 0x24) >> 2)]; + shuf = shuf_mask[paritysort]; +#ifdef HAS_CPU_64 + empties_series = vqtbl1q_u8(empties_series, vreinterpretq_u8_u32(shuf)); +#else + empties_series = vcombine_u8(vtbl1_u8(vget_low_u8(empties_series), vget_low_u8(vreinterpretq_u8_u32(shuf))), + vtbl1_u8(vget_low_u8(empties_series), vget_high_u8(vreinterpretq_u8_u32(shuf)))); +#endif + + // best move alphabeta search + bestscore = -SCORE_INF; + opp = vgetq_lane_u64(OP, 1); + x1 = vgetq_lane_u8(empties_series, 3); + if ((NEIGHBOUR[x1] & opp) && !TESTZ_FLIP(flipped = mm_Flip(OP, x1))) { + bestscore = -search_solve_sse_3(board_next_neon(OP, x1, flipped), -(alpha + 1), &search->n_nodes, vget_low_u8(empties_series)); + if (bestscore > alpha) return bestscore; + } + + empties_series = vextq_u8(empties_series, empties_series, 4); + x2 = vgetq_lane_u8(empties_series, 3); + if ((NEIGHBOUR[x2] & opp) && !TESTZ_FLIP(flipped = mm_Flip(OP, x2))) { + score = -search_solve_sse_3(board_next_neon(OP, x2, flipped), -(alpha + 1), &search->n_nodes, vget_low_u8(empties_series)); + if (score > alpha) return score; + else if (score > bestscore) bestscore = score; + } + + empties_series = vextq_u8(empties_series, empties_series, 4); + x3 = vgetq_lane_u8(empties_series, 3); + if ((NEIGHBOUR[x3] & opp) && !TESTZ_FLIP(flipped = mm_Flip(OP, x3))) { + score = -search_solve_sse_3(board_next_neon(OP, x3, flipped), -(alpha + 1), &search->n_nodes, vget_low_u8(empties_series)); + if (score > alpha) return score; + else if (score > bestscore) bestscore = score; + } + + empties_series = vextq_u8(empties_series, empties_series, 4); + x4 = vgetq_lane_u8(empties_series, 3); + if ((NEIGHBOUR[x4] & opp) && !TESTZ_FLIP(flipped = mm_Flip(OP, x4))) { + score = -search_solve_sse_3(board_next_neon(OP, x4, flipped), -(alpha + 1), &search->n_nodes, 
vget_low_u8(empties_series)); + if (score > bestscore) bestscore = score; + } + + else if (bestscore == -SCORE_INF) { // no move + if (can_move(opp, vgetq_lane_u64(OP, 0))) { // pass + search_pass_endgame(search); + bestscore = -search_solve_4(search, -(alpha + 1)); + search_pass_endgame(search); + } else { // gameover + bestscore = board_solve_neon(OP, 4); + } + } + + assert(SCORE_MIN <= bestscore && bestscore <= SCORE_MAX); + return bestscore; +} diff --git a/src/endgame_sse.c b/src/endgame_sse.c index f175445..03f73a0 100644 --- a/src/endgame_sse.c +++ b/src/endgame_sse.c @@ -96,34 +96,27 @@ static int vectorcall board_solve_sse(__m128i OP, int n_empties) */ static int vectorcall board_score_sse_1(__m128i OP, const int beta, const int pos) { - int score, score2; unsigned char n_flips; - unsigned long long P; unsigned int t; + unsigned long long P = _mm_cvtsi128_si64(OP); + int score = SCORE_MAX - 2 - 2 * bit_count(P); // 2 * bit_count(O) - SCORE_MAX + int score2; const unsigned char *COUNT_FLIP_X = COUNT_FLIP[pos & 7]; const unsigned char *COUNT_FLIP_Y = COUNT_FLIP[pos >> 3]; -#ifdef AVXLASTFLIP - __m256i MP, MO; -#else - __m128i PP, OO; -#endif __m128i II; - P = _mm_cvtsi128_si64(OP); - score = SCORE_MAX - 2 - 2 * bit_count(P); // 2 * bit_count(O) - SCORE_MAX - // n_flips = last_flip(pos, P); #ifdef AVXLASTFLIP + __m256i MP = _mm256_and_si256(_mm256_broadcastq_epi64(OP), mask_dvhd[pos].v4); n_flips = COUNT_FLIP_X[(unsigned char) (P >> (pos & 0x38))]; - MP = _mm256_and_si256(_mm256_broadcastq_epi64(OP), mask_dvhd[pos].v4); t = _mm256_movemask_epi8(_mm256_sub_epi8(_mm256_setzero_si256(), MP)); n_flips += COUNT_FLIP_Y[(unsigned char) t]; t >>= 16; #else - PP = _mm_shuffle_epi32(OP, DUPLO); + __m128i PP = _mm_shuffle_epi32(OP, DUPLO); II = _mm_sad_epu8(_mm_and_si128(PP, mask_dvhd[pos].v2[0]), _mm_setzero_si128()); - n_flips = COUNT_FLIP_X[_mm_cvtsi128_si32(II)]; - n_flips += COUNT_FLIP_X[_mm_extract_epi16(II, 4)]; + n_flips = COUNT_FLIP_X[_mm_extract_epi16(II, 4)]; 
+ n_flips += COUNT_FLIP_X[_mm_cvtsi128_si32(II)]; t = _mm_movemask_epi8(_mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(PP, mask_dvhd[pos].v2[1]))); #endif n_flips += COUNT_FLIP_Y[t >> 8]; @@ -138,16 +131,16 @@ static int vectorcall board_score_sse_1(__m128i OP, const int beta, const int po if (score < beta) { // lazy cut-off // n_flips = last_flip(pos, EXTRACT_O(OP)); #ifdef AVXLASTFLIP - MO = _mm256_and_si256(_mm256_permute4x64_epi64(_mm256_castsi128_si256(OP), 0x55), mask_dvhd[pos].v4); - II = _mm_sad_epu8(_mm256_castsi256_si128(MO), _mm_setzero_si128()); - t = _mm_movemask_epi8(_mm_sub_epi8(_mm_setzero_si128(), _mm256_extracti128_si256(MO, 1))); + MP = _mm256_and_si256(_mm256_permute4x64_epi64(_mm256_castsi128_si256(OP), 0x55), mask_dvhd[pos].v4); + II = _mm_sad_epu8(_mm256_castsi256_si128(MP), _mm_setzero_si128()); + t = _mm_movemask_epi8(_mm_sub_epi8(_mm_setzero_si128(), _mm256_extracti128_si256(MP, 1))); #else - OO = _mm_shuffle_epi32(OP, DUPHI); - II = _mm_sad_epu8(_mm_and_si128(OO, mask_dvhd[pos].v2[0]), _mm_setzero_si128()); - t = _mm_movemask_epi8(_mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(OO, mask_dvhd[pos].v2[1]))); + PP = _mm_shuffle_epi32(OP, DUPHI); + II = _mm_sad_epu8(_mm_and_si128(PP, mask_dvhd[pos].v2[0]), _mm_setzero_si128()); + t = _mm_movemask_epi8(_mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(PP, mask_dvhd[pos].v2[1]))); #endif - n_flips = COUNT_FLIP_X[_mm_cvtsi128_si32(II)]; - n_flips += COUNT_FLIP_X[_mm_extract_epi16(II, 4)]; + n_flips = COUNT_FLIP_X[_mm_extract_epi16(II, 4)]; + n_flips += COUNT_FLIP_X[_mm_cvtsi128_si32(II)]; n_flips += COUNT_FLIP_Y[t >> 8]; n_flips += COUNT_FLIP_Y[(unsigned char) t]; @@ -237,7 +230,7 @@ static int vectorcall board_solve_sse_2(__m128i OP, int alpha, volatile unsigned * @param n_nodes Node counter. * @return The final score, as a disc difference. 
*/ -static int vectorcall search_solve_sse_3(__m128i OP, int alpha, unsigned int sort3, volatile unsigned long long *n_nodes, __m128i empties) +static int vectorcall search_solve_sse_3(__m128i OP, int alpha, int sort3, volatile unsigned long long *n_nodes, __m128i empties) { __m128i flipped, PO; int score, bestscore, x; @@ -250,7 +243,7 @@ static int vectorcall search_solve_sse_3(__m128i OP, int alpha, unsigned int sor empties = _mm_cvtepu8_epi16(empties); // to ease shuffle // parity based move sorting if (sort3 & 0x03) { -#ifndef __AVX__ +#if !(defined(__SSSE3__) || defined(__AVX__)) if (sort3 & 0x01) empties = _mm_shufflelo_epi16(empties, 0xd8); // case 1(x2) 2(x1 x3) else @@ -322,50 +315,59 @@ static int vectorcall search_solve_sse_3(__m128i OP, int alpha, unsigned int sor * @return The final score, as a disc difference. */ -typedef union { - unsigned int ui[4]; - __m128i v4; -} V4SI; - int search_solve_4(Search *search, const int alpha) { __m128i OP, flipped; __m128i empties_series; // B15:4th, B11:3rd, B7:2nd, B3:1st, lower 3 bytes for 3 empties - int x1, x2, x3, x4, q1, q2, q3; - int score, bestscore; - unsigned int parity; + int x1, x2, x3, x4, paritysort, score, bestscore; unsigned long long opp; // const int beta = alpha + 1; -#ifdef __AVX__ - static const V4SI shuf_x1_1_2[2] = { // case 1(x1) 1(x4) 2(x2 x3), case 1(x1) 1(x3) 2(x2 x4) - {{ 0x03000201, 0x00030201, 0x02030100, 0x01030200 }}, // x3x1x2x4-x2x1x3x4-x4x1x2x3-x1x4x2x3 - {{ 0x03010200, 0x01030200, 0x02030100, 0x00030201 }}}; // x4x1x2x3-x2x1x3x4-x3x1x2x4-x1x3x2x4 - static const V4SI shuf_x2_1_2[2] = { // case 1(x2) 1(x4) 2(x1 x3), case 1(x2) 1(x3) 2(x1 x4) - {{ 0x02000301, 0x00020301, 0x03020100, 0x01030200 }}, // x3x1x2x4-x1x2x3x4-x4x2x1x3-x2x4x1x3 - {{ 0x02010300, 0x01020300, 0x03020100, 0x00030201 }}}; // x4x1x2x3-x1x2x3x4-x3x2x1x4-x2x3x1x4 - static const V4SI shuf_1_3[2][2] = { - {{{ 0x03020100, 0x02030100, 0x01030200, 0x00030201 }}, // case 1(x1) 3(x2 x3 x4), case 1 1 1 1 - {{ 0x03020100, 
0x02030100, 0x01020300, 0x00020301 }}}, // case 1(x2) 3(x1 x3 x4) - {{{ 0x03010200, 0x02010300, 0x01030200, 0x00010302 }}, // case 1(x3) 3(x1 x2 x4) - {{ 0x03000201, 0x02000301, 0x01000302, 0x00030201 }}}}; // case 1(x4) 3(x1 x2 x3) - static const V4SI shuf_2_2[2][2] = { - {{{ 0x03000201, 0x02010300, 0x01020300, 0x00030201 }}, // case 2(x1 x4) 2(x2 x3) - {{ 0x03010200, 0x02000301, 0x01030200, 0x00020301 }}}, // case 2(x1 x3) 2(x2 x4) - {{{ 0x03020100, 0x02030100, 0x01000302, 0x00010302 }}, // case 2(x1 x2) 2(x3 x4) - {{ 0x03020100, 0x02030100, 0x01030200, 0x00030201 }}}}; // case 4 + static const unsigned char parity_case[64] = { /* x4x3x2x1 = */ + /*0000*/ 0, /*0001*/ 0, /*0010*/ 1, /*0011*/ 9, /*0100*/ 2, /*0101*/ 10, /*0110*/ 11, /*0111*/ 3, + /*0002*/ 0, /*0003*/ 0, /*0012*/ 0, /*0013*/ 0, /*0102*/ 4, /*0103*/ 4, /*0112*/ 5, /*0113*/ 5, + /*0020*/ 1, /*0021*/ 0, /*0030*/ 1, /*0031*/ 0, /*0120*/ 6, /*0121*/ 7, /*0130*/ 6, /*0131*/ 7, + /*0022*/ 9, /*0023*/ 0, /*0032*/ 0, /*0033*/ 9, /*0122*/ 8, /*0123*/ 0, /*0132*/ 0, /*0133*/ 8, + /*0200*/ 2, /*0201*/ 4, /*0210*/ 6, /*0211*/ 8, /*0300*/ 2, /*0301*/ 4, /*0310*/ 6, /*0311*/ 8, + /*0202*/ 10, /*0203*/ 4, /*0212*/ 7, /*0213*/ 0, /*0302*/ 4, /*0303*/ 10, /*0312*/ 0, /*0313*/ 7, + /*0220*/ 11, /*0221*/ 5, /*0230*/ 6, /*0231*/ 0, /*0320*/ 6, /*0321*/ 0, /*0330*/ 11, /*0331*/ 5, + /*0222*/ 3, /*0223*/ 5, /*0232*/ 7, /*0233*/ 8, /*0322*/ 8, /*0323*/ 7, /*0332*/ 5, /*0333*/ 3 + }; +#if defined(__SSSE3__) || defined(__AVX__) + union V4SI { + unsigned int ui[4]; + __m128i v4; + }; + static const union V4SI shuf_mask[] = { // make search order identical to 4.4.0 + {{ 0x03020100, 0x02030100, 0x01030200, 0x00030201 }}, // 0: 1(x1) 3(x2 x3 x4), 1(x1) 1(x2) 2(x3 x4), 1 1 1 1, 4 + {{ 0x03020100, 0x02030100, 0x01020300, 0x00020301 }}, // 1: 1(x2) 3(x1 x3 x4) + {{ 0x03010200, 0x02010300, 0x01030200, 0x00010302 }}, // 2: 1(x3) 3(x1 x2 x4) + {{ 0x03000201, 0x02000301, 0x01000302, 0x00030201 }}, // 3: 1(x4) 3(x1 x2 x3) + {{ 
0x03010200, 0x01030200, 0x02030100, 0x00030201 }}, // 4: 1(x1) 1(x3) 2(x2 x4) + {{ 0x03000201, 0x00030201, 0x02030100, 0x01030200 }}, // 5: 1(x1) 1(x4) 2(x2 x3) + {{ 0x02010300, 0x01020300, 0x03020100, 0x00030201 }}, // 6: 1(x2) 1(x3) 2(x1 x4) + {{ 0x02000301, 0x00020301, 0x03020100, 0x01030200 }}, // 7: 1(x2) 1(x4) 2(x1 x3) + {{ 0x01000302, 0x00010302, 0x03020100, 0x02030100 }}, // 8: 1(x3) 1(x4) 2(x1 x2) + {{ 0x03020100, 0x02030100, 0x01000302, 0x00010302 }}, // 9: 2(x1 x2) 2(x3 x4) + {{ 0x03010200, 0x02000301, 0x01030200, 0x00020301 }}, // 10: 2(x1 x3) 2(x2 x4) + {{ 0x03000201, 0x02010300, 0x01020300, 0x00030201 }} // 11: 2(x1 x4) 2(x2 x3) + }; enum { sort3 = 0 }; // sort is done on 4 empties #else - unsigned int sort3; // for move sorting on 3 empties - static const short sort3_1_3[2][2] = - {{ 0x0000, // case 1(x1) 3(x2 x3 x4) // x4x1x2x3-x3x1x2x4-x2x1x3x4-x1x2x3x4 - 0x1100 }, // case 1(x2) 3(x1 x3 x4) // x4x2x1x3-x3x2x1x4-x2x1x3x4-x1x2x3x4 - { 0x2011, // case 1(x3) 3(x1 x2 x4) // x4x3x1x2-x3x1x2x4-x2x3x1x4-x1x3x2x4 - 0x0222 }}; // case 1(x4) 3(x1 x2 x3) // x4x1x2x3-x3x4x1x2-x2x4x1x3-x1x4x2x3 - static const short sort3_2_2[2][2] = - {{ 0x0112, // case 2(x1 x4) 2(x2 x3) // x4x1x2x3-x3x2x1x4-x2x3x1x4-x1x4x2x3 - 0x1021 }, // case 2(x1 x3) 2(x2 x4) // x4x2x1x3-x3x1x2x4-x2x4x1x3-x1x3x2x4 - { 0x2200, // case 2(x1 x2) 2(x3 x4) // x4x3x1x2-x3x4x1x2-x2x1x3x4-x1x2x3x4 - 0x0000 }}; // case 4 // x4x1x2x3-x3x1x2x4-x2x1x3x4-x1x2x3x4 + int sort3; // for move sorting on 3 empties + static const short sort3_shuf[] = { + 0x0000, // 0: 1(x1) 3(x2 x3 x4), 1(x1) 1(x2) 2(x3 x4), 1 1 1 1, 4 + 0x1100, // 1: 1(x2) 3(x1 x3 x4) x4x2x1x3-x3x2x1x4-x2x1x3x4-x1x2x3x4 + 0x2011, // 2: 1(x3) 3(x1 x2 x4) x4x3x1x2-x3x1x2x4-x2x3x1x4-x1x3x2x4 + 0x0222, // 3: 1(x4) 3(x1 x2 x3) x4x1x2x3-x3x4x1x2-x2x4x1x3-x1x4x2x3 + 0x0001, // 4: 1(x1) 1(x3) 2(x2 x4) x4x1x2x3-x2x1x3x4-x3x1x2x4-x1x3x2x4 + 0x0002, // 5: 1(x1) 1(x4) 2(x2 x3) x3x1x2x4-x2x1x3x4-x4x1x2x3-x1x4x2x3 + 0x0011, // 6: 1(x2) 1(x3) 2(x1 x4) 
x4x1x2x3-x1x2x3x4-x3x2x1x4-x2x3x1x4 + 0x0012, // 7: 1(x2) 1(x4) 2(x1 x3) x3x1x2x4-x1x2x3x4-x4x2x1x3-x2x4x1x3 + 0x0022, // 8: 1(x3) 1(x4) 2(x1 x2) x2x1x3x4-x1x2x3x4-x4x3x1x2-x3x4x1x2 + 0x2200, // 9: 2(x1 x2) 2(x3 x4) x4x3x1x2-x3x4x1x2-x2x1x3x4-x1x2x3x4 + 0x1021, // 10: 2(x1 x3) 2(x2 x4) x4x2x1x3-x3x1x2x4-x2x4x1x3-x1x3x2x4 + 0x0112 // 11: 2(x1 x4) 2(x2 x3) x4x1x2x3-x3x2x1x4-x2x3x1x4-x1x4x2x3 + }; #endif SEARCH_STATS(++statistics.n_search_solve_4); @@ -384,35 +386,10 @@ int search_solve_4(Search *search, const int alpha) // The following hole sizes are possible: // 4 - 1 3 - 2 2 - 1 1 2 - 1 1 1 1 // Only the 1 1 2 case needs move sorting on this ply. - parity = search->eval.parity; - q1 = QUADRANT_ID[x1]; - q2 = QUADRANT_ID[x2]; - q3 = QUADRANT_ID[x3]; -#ifdef __AVX__ + paritysort = parity_case[((x3 ^ x4) & 0x24) + ((((x2 ^ x4) & 0x24) * 2 + ((x1 ^ x4) & 0x24)) >> 2)]; +#if defined(__SSSE3__) || defined(__AVX__) empties_series = _mm_cvtsi32_si128((x1 << 24) | (x2 << 16) | (x3 << 8) | x4); - if (parity & q1) { - if (parity & q2) { - if (parity & q3) { // case 1 3, case 1 1 1 1 - empties_series = _mm_shuffle_epi8(empties_series, shuf_1_3[q1 == q2][q1 == q3].v4); - } else { // case 1(x1) 1(x2) 2(x3 x4) // x4x1x2x3-x3x1x2x4-x2x1x3x4-x1x2x3x4 - empties_series = _mm_shuffle_epi8(empties_series, - _mm_set_epi32(0x00030201, 0x01030200, 0x02030100, 0x03020100)); - } - } else { // case 1(x1) 1(x3) 2(x2 x4), case 1(x1) 1(x4) 2(x2 x3) - empties_series = _mm_shuffle_epi8(empties_series, shuf_x1_1_2[(parity & q3) != 0].v4); - } - } else { - if (parity & q2) { // case 1(x2) 1(x3) 2(x1 x4), case 1(x2) 1(x4) 2(x1 x3) - empties_series = _mm_shuffle_epi8(empties_series, shuf_x2_1_2[(parity & q3) != 0].v4); - } else { - if (parity & q3) { // case 1(x3) 1(x4) 2(x1 x2) // x2x1x3x4-x1x2x3x4-x4x3x1x2-x3x4x1x2 - empties_series = _mm_shuffle_epi8(empties_series, - _mm_set_epi32(0x02030100, 0x03020100, 0x00010302, 0x01000302)); - } else { // case 2 2, case 4 - empties_series = 
_mm_shuffle_epi8(empties_series, shuf_2_2[q1 == q2][q1 == q3].v4); - } - } - } + empties_series = _mm_shuffle_epi8(empties_series, shuf_mask[paritysort].v4); #else // SSE empties_series = _mm_cvtsi32_si128((x3 << 16) | x4); @@ -420,40 +397,25 @@ int search_solve_4(Search *search, const int alpha) empties_series = _mm_insert_epi16(empties_series, x1, 3); empties_series = _mm_packus_epi16(_mm_unpacklo_epi64(empties_series, _mm_shufflelo_epi16(empties_series, 0xb4)), _mm_unpacklo_epi64(_mm_shufflelo_epi16(empties_series, 0x78), _mm_shufflelo_epi16(empties_series, 0x39))); - // x4x1x2x3-x3x1x2x4-x2x1x3x4-x1x2x3x4 - if (parity & q1) { - if (parity & q2) { - sort3 = 0; // case 1(x1) 1(x2) 2(x3 x4) - if (parity & q3) { // case 1 3, case 1 1 1 1 - sort3 = sort3_1_3[q1 == q2][q1 == q3]; - } - } else { - if (parity & q3) { // case 1(x1) 1(x3) 2(x2 x4) - empties_series = _mm_shuffle_epi32(empties_series, 0xd8); // x4...x2...x3...x1... - sort3 = 0x0001; // ..-x1x3x2x4 - } else { // case 1(x1) 1(x4) 2(x2 x3) - empties_series = _mm_shuffle_epi32(empties_series, 0x9c); // x3...x2...x4...x1... - sort3 = 0x0002; // ..-x1x4x2x3 - } - } - } else { - if (parity & q2) { - if (parity & q3) { // case 1(x2) 1(x3) 2(x1 x4) - empties_series = _mm_shuffle_epi32(empties_series, 0xc9); // x4...x1...x3...x2... - sort3 = 0x0011; // ..-x3x2x1x4-x2x3x1x4 - } else { // case 1(x2) 1(x4) 2(x1 x3) - empties_series = _mm_shuffle_epi32(empties_series, 0x8d); // x3...x1...x4...x2... - sort3 = 0x0012; // ..-x4x2x1x3-x2x4x1x3 - } - } else { - if (parity & q3) { // case 1(x3) 1(x4) 2(x1 x2) - empties_series = _mm_shuffle_epi32(empties_series, 0x4e); // x2...x1...x4...x3... - sort3 = 0x0022; // ..-x4x3x1x2-x3x4x1x2 - } else { // case 2 2, case 4 - sort3 = sort3_2_2[q1 == q2][q1 == q3]; - } - } + // x4x1x2x3-x3x1x2x4-x2x1x3x4-x1x2x3x4 + switch (paritysort) { + case 4: // case 1(x1) 1(x3) 2(x2 x4) + empties_series = _mm_shuffle_epi32(empties_series, 0xd8); // x4...x2...x3...x1... 
+ break; + case 5: // case 1(x1) 1(x4) 2(x2 x3) + empties_series = _mm_shuffle_epi32(empties_series, 0x9c); // x3...x2...x4...x1... + break; + case 6: // case 1(x2) 1(x3) 2(x1 x4) + empties_series = _mm_shuffle_epi32(empties_series, 0xc9); // x4...x1...x3...x2... + break; + case 7: // case 1(x2) 1(x4) 2(x1 x3) + empties_series = _mm_shuffle_epi32(empties_series, 0x8d); // x3...x1...x4...x2... + break; + case 8: // case 1(x3) 1(x4) 2(x1 x2) + empties_series = _mm_shuffle_epi32(empties_series, 0x4e); // x2...x1...x4...x3... + break; } + sort3 = sort3_shuf[paritysort]; #endif // best move alphabeta search diff --git a/src/eval.c b/src/eval.c index 614642e..9812611 100644 --- a/src/eval.c +++ b/src/eval.c @@ -170,7 +170,7 @@ static const CoordinateToFeature EVAL_X2F[] = { }; #endif -#if defined(VECTOR_EVAL_UPDATE) || defined(hasSSE2) || defined(hasNeon) || defined(USE_GAS_MMX) || defined(USE_MSVC_X86) +#if defined(VECTOR_EVAL_UPDATE) || defined(hasSSE2) || defined(hasNeon) || defined(ANDROID) || defined(USE_GAS_MMX) || defined(USE_MSVC_X86) const EVAL_FEATURE_V EVAL_FEATURE[65] = { {{ // a1 @@ -735,7 +735,10 @@ void eval_close(void) EVAL_WEIGHT = NULL; } -#if defined(hasSSE2) || defined(hasNeon) || defined(USE_GAS_MMX) || defined(USE_MSVC_X86) +#ifdef ANDROID +extern void eval_update_sse_0(Eval *eval_out, const Eval *eval_in, const Move *move); +extern void eval_update_sse_1(Eval *eval_out, const Eval *eval_in, const Move *move); +#elif defined(hasSSE2) || defined(hasNeon) || defined(USE_GAS_MMX) || defined(USE_MSVC_X86) #include "eval_sse.c" #endif @@ -900,7 +903,7 @@ void eval_update(Eval *eval, const Move *move) { assert(move->flipped); -#if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) +#if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(ANDROID) if (hasSSE2) { if (eval->n_empties & 1) eval_update_sse_1(eval, eval, move); @@ -917,7 +920,7 @@ void eval_update(Eval *eval, const Move *move) void eval_update_leaf(Eval *eval_out, const Eval *eval_in, const Move 
*move) { -#if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) +#if defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(ANDROID) if (hasSSE2) { if (eval_in->n_empties & 1) eval_update_sse_1(eval_out, eval_in, move); @@ -933,7 +936,7 @@ void eval_update_leaf(Eval *eval_out, const Eval *eval_in, const Move *move) eval_update_0(eval_out, move); } -#endif // hasSSE2 +#endif // !defined(hasSSE2) && !defined(hasNeon) /** * @brief Update/Restore the features after a passing move. diff --git a/src/eval.h b/src/eval.h index 160bb1a..3d8e012 100644 --- a/src/eval.h +++ b/src/eval.h @@ -13,11 +13,6 @@ #include "bit.h" -#if defined(__ARM_NEON__) || defined(_M_ARM) || defined(_M_ARM64) -#define hasNeon -#include "arm_neon.h" -#endif - /** number of features */ enum { EVAL_N_FEATURE = 47 }; @@ -27,7 +22,7 @@ enum { EVAL_N_FEATURE = 47 }; */ typedef union { unsigned short us[48]; -#ifdef hasNeon +#ifdef __ARM_NEON__ int16x8_t v8[6]; #elif defined(hasSSE2) || defined(USE_MSVC_X86) __m128i v8[6]; diff --git a/src/eval_sse.c b/src/eval_sse.c index e2aeab4..c300527 100644 --- a/src/eval_sse.c +++ b/src/eval_sse.c @@ -10,21 +10,21 @@ #include -#include "bit.h" +#include "bit_intrinsics.h" #include "board.h" #include "move.h" #include "eval.h" -#if defined(__ARM_NEON__) || defined(_M_ARM) || defined(_M_ARM64) +#ifdef __ARM_NEON__ #define __m128i int16x8_t #define _mm_add_epi16 vaddq_s16 #define _mm_sub_epi16 vsubq_s16 #define _mm_slli_epi16 vshlq_n_s16 #endif -#if defined(hasSSE2) || defined(hasNeon) || defined(USE_MSVC_X86) +#if defined(hasSSE2) || defined(__ARM_NEON__) || defined(USE_MSVC_X86) -static void eval_update_sse_0(Eval *eval_out, const Eval *eval_in, const Move *move) +void eval_update_sse_0(Eval *eval_out, const Eval *eval_in, const Move *move) { int x = move->x; unsigned long long f = move->flipped; @@ -82,7 +82,7 @@ static void eval_update_sse_0(Eval *eval_out, const Eval *eval_in, const Move *m * @param eval Evaluation function. * @param move Move. 
*/ -static void eval_update_sse_1(Eval *eval_out, const Eval *eval_in, const Move *move) +void eval_update_sse_1(Eval *eval_out, const Eval *eval_in, const Move *move) { int x = move->x; unsigned long long f = move->flipped; diff --git a/src/flip_bitscan.c b/src/flip_bitscan.c index e4ec65d..26827ef 100644 --- a/src/flip_bitscan.c +++ b/src/flip_bitscan.c @@ -46,6 +46,9 @@ #include "bit_intrinsics.h" +#define LODWORD(l) ((unsigned int)(l)) +#define HIDWORD(l) ((unsigned int)((l)>>32)) + /** rotated outflank array (indexed with inner 6 bits) */ static const unsigned char OUTFLANK_2[64] = { // ...ahgfe 0x00, 0x10, 0x00, 0x00, 0x01, 0x11, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x02, 0x12, 0x00, 0x00, @@ -252,8 +255,8 @@ static unsigned long long flip_C1(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x0404040404040400) + 1) & P & 0x0404040404040400; flipped = OutflankToFlipmask(outflank_v) & 0x0404040404040400; - outflank_d = ((P & 0x0000804020110a04) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]... - outflank_d = OUTFLANK_2[((O & 0x0000004020100a04) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_2[(((HIDWORD(O) & 0x00000040) + (LODWORD(O) & 0x20100a04)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000804020110a04) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]... flipped |= FLIPPED_2_H[outflank_d] & 0x0000004020100a04; // A3C1H6 outflank_h = OUTFLANK_2[(O >> 1) & 0x3f] & rotl8(P, 4); @@ -277,8 +280,8 @@ static unsigned long long flip_D1(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x0808080808080800) + 1) & P & 0x0808080808080800; flipped = OutflankToFlipmask(outflank_v) & 0x0808080808080800; - outflank_d = ((P & 0x0000008041221408) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... 
- outflank_d = OUTFLANK_3[((O & 0x0000000040221408) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[((LODWORD(O) & 0x40221408) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000008041221408) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0000000040221408; // A4D1H5 outflank_h = OUTFLANK_3[(O >> 1) & 0x3f] & rotl8(P, 3); @@ -302,8 +305,8 @@ static unsigned long long flip_E1(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x1010101010101000) + 1) & P & 0x1010101010101000; flipped = OutflankToFlipmask(outflank_v) & 0x1010101010101000; - outflank_d = ((P & 0x0000000182442810) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0000000002442810) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[((LODWORD(O) & 0x02442810) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000000182442810) * 0x0101010101010101) >> 54; // hgfed[cbahg]... flipped |= FLIPPED_4_H[outflank_d] & 0x0000000002442810; // A5E1H4 outflank_h = OUTFLANK_4[(O >> 1) & 0x3f] & rotl8(P, 2); @@ -327,8 +330,8 @@ static unsigned long long flip_F1(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x2020202020202000) + 1) & P & 0x2020202020202000; flipped = OutflankToFlipmask(outflank_v) & 0x2020202020202000; - outflank_d = ((P & 0x0000010204885020) * 0x0101010101010101) >> 55; // hgfe[dcbah]... - outflank_d = OUTFLANK_5[((O & 0x0000000204085020) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_5[(((HIDWORD(O) & 0x00000002) + (LODWORD(O) & 0x04085020)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000010204885020) * 0x0101010101010101) >> 55; // hgfe[dcbah]... 
flipped |= FLIPPED_5_H[outflank_d] & 0x0000000204085020; // A6F1H3 outflank_h = OUTFLANK_5[(O >> 1) & 0x3f] & rotl8(P, 1); @@ -448,8 +451,8 @@ static unsigned long long flip_C2(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x0404040404040000) + 1) & P & 0x0404040404040000; flipped = OutflankToFlipmask(outflank_v) & 0x0404040404040000; - outflank_d = ((P & 0x00804020110a0400) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]... - outflank_d = OUTFLANK_2[((O & 0x00004020100a0400) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_2[(((HIDWORD(O) & 0x00004020) + (LODWORD(O) & 0x100a0400)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x00804020110a0400) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]... flipped |= FLIPPED_2_H[outflank_d] & 0x00004020100a0400; // A4C2H7 outflank_h = OUTFLANK_2[(O >> 9) & 0x3f] & rotl8(P >> 8, 4); @@ -473,8 +476,8 @@ static unsigned long long flip_D2(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x0808080808080000) + 1) & P & 0x0808080808080000; flipped = OutflankToFlipmask(outflank_v) & 0x0808080808080000; - outflank_d = ((P & 0x0000804122140800) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0000004022140800) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 8) & 0x40221408) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000804122140800) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0000004022140800; // A5D2H6 outflank_h = OUTFLANK_3[(O >> 9) & 0x3f] & rotl8(P >> 8, 3); @@ -498,8 +501,8 @@ static unsigned long long flip_E2(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x1010101010100000) + 1) & P & 0x1010101010100000; flipped = OutflankToFlipmask(outflank_v) & 0x1010101010100000; - outflank_d = ((P & 0x0000018244281000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... 
- outflank_d = OUTFLANK_4[((O & 0x0000000244281000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 8) & 0x02442810) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000018244281000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... flipped |= FLIPPED_4_H[outflank_d] & 0x0000000244281000; // A6E2H5 outflank_h = OUTFLANK_4[(O >> 9) & 0x3f] & rotl8(P >> 8, 2); @@ -523,8 +526,8 @@ static unsigned long long flip_F2(const unsigned long long P, const unsigned lon outflank_v = ((O | ~0x2020202020200000) + 1) & P & 0x2020202020200000; flipped = OutflankToFlipmask(outflank_v) & 0x2020202020200000; - outflank_d = ((P & 0x0001020488502000) * 0x0101010101010101) >> 55; // hgfe[dcbah]... - outflank_d = OUTFLANK_5[((O & 0x0000020408502000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_5[(((HIDWORD(O) & 0x00000204) + (LODWORD(O) & 0x08502000)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0001020488502000) * 0x0101010101010101) >> 55; // hgfe[dcbah]... 
flipped |= FLIPPED_5_H[outflank_d] & 0x0000020408502000; // A7F2H4 outflank_h = OUTFLANK_5[(O >> 9) & 0x3f] & rotl8(P >> 8, 1); @@ -593,12 +596,12 @@ static unsigned long long flip_A3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_a1a3f8, outflank_a8a3c1; unsigned long long flipped; - outflank_a1a3f8 = ((P & 0x2010080402010101) * 0x8000000002020202) >> 59; // 18765 - outflank_a1a3f8 = OUTFLANK_2[((O & 0x0010080402010100) * 0x0102040404040404) >> 57] & outflank_a1a3f8; + outflank_a1a3f8 = OUTFLANK_2[((O & 0x0010080402010100) * 0x0102040404040404) >> 57]; + outflank_a1a3f8 &= ((P & 0x2010080402010101) * 0x8000000002020202) >> 59; // 18765 flipped = FLIPPED_2_V[outflank_a1a3f8] & 0x0010080402010100; - outflank_a8a3c1 = ((P & 0x0101010101010204) * 0x0200000080402010) >> 59; // 56781 - outflank_a8a3c1 = OUTFLANK_5[((O & 0x0001010101010200) * 0x2020201008040201) >> 57] & outflank_a8a3c1; + outflank_a8a3c1 = OUTFLANK_5[((O & 0x0001010101010200) * 0x2020201008040201) >> 57]; + outflank_a8a3c1 &= ((P & 0x0101010101010204) * 0x0200000080402010) >> 59; // 56781 flipped |= vertical_mirror(FLIPPED_5_V[outflank_a8a3c1]) & 0x0001010101010200; outflank_h = ((O & 0x007e0000) + 0x00020000) & P; @@ -619,12 +622,12 @@ static unsigned long long flip_B3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_b1b3g8, outflank_b8b3d1; unsigned long long flipped; - outflank_b1b3g8 = ((P & 0x4020100804020202) * 0x4000000001010101) >> 59; // 18765 - outflank_b1b3g8 = OUTFLANK_2[((O & 0x0020100804020200) * 0x0081020202020202) >> 57] & outflank_b1b3g8; + outflank_b1b3g8 = OUTFLANK_2[((O & 0x0020100804020200) * 0x0081020202020202) >> 57]; + outflank_b1b3g8 &= ((P & 0x4020100804020202) * 0x4000000001010101) >> 59; // 18765 flipped = FLIPPED_2_V[outflank_b1b3g8] & 0x0020100804020200; - outflank_b8b3d1 = ((P & 0x0202020202020408) * 0x0100000040201008) >> 59; // 56781 - outflank_b8b3d1 = OUTFLANK_5[((O & 0x0002020202020400) * 
0x0010100804020100) >> 57] & outflank_b8b3d1; + outflank_b8b3d1 = OUTFLANK_5[((O & 0x0002020202020400) * 0x0010100804020100) >> 57]; + outflank_b8b3d1 &= ((P & 0x0202020202020408) * 0x0100000040201008) >> 59; // 56781 flipped |= vertical_mirror(FLIPPED_5_V[outflank_b8b3d1]) & 0x0002020202020400; outflank_h = ((O & 0x007c0000) + 0x00040000) & P; @@ -645,8 +648,8 @@ static unsigned long long flip_C3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d9; unsigned long long flipped; - outflank_v = ((P & 0x0404040404040404) * 0x2000000002040810) >> 59; // 18765 - outflank_v = OUTFLANK_2[((O & 0x0004040404040400) * 0x0040810204081020) >> 57] & outflank_v; + outflank_v = OUTFLANK_2[((O & 0x0004040404040400) * 0x0040810204081020) >> 57]; + outflank_v &= ((P & 0x0404040404040404) * 0x2000000002040810) >> 59; // 18765 flipped = FLIPPED_2_V[outflank_v] & 0x0004040404040400; outflank_h = OUTFLANK_2[(O >> 17) & 0x3f] & rotl8(P >> 16, 4); @@ -654,8 +657,8 @@ static unsigned long long flip_C3(const unsigned long long P, const unsigned lon flipped |= (((P >> 7) | (P << 7)) & 0x000000002000800) & O; - outflank_d9 = ((P & 0x8040201008040201) * 0x0101010101010101) >> 56; // (h8) - outflank_d9 = OUTFLANK_2[((O & 0x0040201008040200) * 0x0101010101010101) >> 57] & rotl8(outflank_d9, 4); + outflank_d9 = OUTFLANK_2[(((HIDWORD(O) & 0x00402010) + (LODWORD(O) & 0x08040200)) * 0x01010101) >> 25]; + outflank_d9 &= rotl8((((HIDWORD(P) & 0x80402010) + (LODWORD(P) & 0x08040201)) * 0x01010101) >> 24, 4); // (h8) flipped |= FLIPPED_2_H[outflank_d9] & 0x0040201008040200; return flipped; @@ -673,15 +676,15 @@ static unsigned long long flip_D3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d; unsigned long long flipped; - outflank_v = ((P & 0x0808080808080808) * 0x1020408001020408) >> 59; // 18765 - outflank_v = OUTFLANK_2[((O & 0x0008080808080800) * 0x0020408102040810) >> 57] & outflank_v; + outflank_v = 
OUTFLANK_2[((O & 0x0008080808080800) * 0x0020408102040810) >> 57]; + outflank_v &= ((P & 0x0808080808080808) * 0x1020408001020408) >> 59; // 18765 flipped = FLIPPED_2_V[outflank_v] & 0x0008080808080800; outflank_h = OUTFLANK_3[(O >> 17) & 0x3f] & rotl8(P >> 16, 3); flipped |= (unsigned char) FLIPPED_3_H[outflank_h] << 16; - outflank_d = ((P & 0x0080412214080000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0000402214080000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 16) & 0x40221408) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0080412214080000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0000402214080000; // A6D3H7 flipped |= (((P << 7) & 0x0000000000001000) | ((P << 9) & 0x000000000000400)) & O; @@ -701,15 +704,15 @@ static unsigned long long flip_E3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d; unsigned long long flipped; - outflank_v = ((P & 0x1010101010101010) * 0x0810204000810204) >> 59; // 18765 - outflank_v = OUTFLANK_2[((O & 0x0010101010101000) * 0x0010204081020408) >> 57] & outflank_v; + outflank_v = OUTFLANK_2[((O & 0x0010101010101000) * 0x0010204081020408) >> 57]; + outflank_v &= ((P & 0x1010101010101010) * 0x0810204000810204) >> 59; // 18765 flipped = FLIPPED_2_V[outflank_v] & 0x0010101010101000; outflank_h = OUTFLANK_4[(O >> 17) & 0x3f] & rotl8(P >> 16, 2); flipped |= (unsigned char) FLIPPED_4_H[outflank_h] << 16; - outflank_d = ((P & 0x0001824428100000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0000024428100000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 16) & 0x02442810) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0001824428100000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... 
flipped |= FLIPPED_4_H[outflank_d] & 0x0000024428100000; // A7E3H6 flipped |= (((P << 7) & 0x0000000000002000) | ((P << 9) & 0x000000000000800)) & O; @@ -729,15 +732,15 @@ static unsigned long long flip_F3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d7; unsigned long long flipped; - outflank_v = ((P & 0x2020202020202020) * 0x0408102000408102) >> 59; // 18765 - outflank_v = OUTFLANK_2[((O & 0x0020202020202000) * 0x0008102040810204) >> 57] & outflank_v; + outflank_v = OUTFLANK_2[((O & 0x0020202020202000) * 0x0008102040810204) >> 57]; + outflank_v &= ((P & 0x2020202020202020) * 0x0408102000408102) >> 59; // 18765 flipped = FLIPPED_2_V[outflank_v] & 0x0020202020202000; outflank_h = OUTFLANK_5[(O >> 17) & 0x3f] & rotl8(P >> 16, 1); flipped |= (unsigned char) FLIPPED_5_H[outflank_h] << 16; - outflank_d7 = ((P & 0x0102040810204080) * 0x0010000010101010) >> 59; // dcbah - outflank_d7 = OUTFLANK_5[((O & 0x0002040810204000) * 0x0101010101010101) >> 57] & outflank_d7; + outflank_d7 = OUTFLANK_5[(((HIDWORD(O) & 0x00020408) + (LODWORD(O) & 0x10204000)) * 0x01010101) >> 25]; + outflank_d7 &= ((P & 0x0102040810204080) * 0x0010000010101010) >> 59; // dcbah flipped |= FLIPPED_5_H[outflank_d7] & 0x0002040810204000; flipped |= (((P >> 9) | (P << 9)) & 0x0000000040001000) & O; @@ -757,15 +760,15 @@ static unsigned long long flip_G3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_e1g3g8, outflank_b8g3g1; unsigned long long flipped; - outflank_e1g3g8 = ((P & 0x4040404040402010) * 0x0800000000204081) >> 59; // 18765 - outflank_e1g3g8 = OUTFLANK_2[((O & 0x0040404040402000) * 0x0010101020408102) >> 57] & outflank_e1g3g8; + outflank_e1g3g8 = OUTFLANK_2[((O & 0x0040404040402000) * 0x0010101020408102) >> 57]; + outflank_e1g3g8 &= ((P & 0x4040404040402010) * 0x0800000000204081) >> 59; // 18765 flipped = FLIPPED_2_V[outflank_e1g3g8] & 0x0040404040402000; - outflank_b8g3g1 = ((P & 0x0204081020404040) * 
0x0020000008080808) >> 59; // 43218 - outflank_b8g3g1 = OUTFLANK_5[((O & 0x0004081020404000) * 0x0402010101010101) >> 58] & outflank_b8g3g1; + outflank_b8g3g1 = OUTFLANK_5[((O & 0x0004081020404000) * 0x0402010101010101) >> 58]; + outflank_b8g3g1 &= ((P & 0x0204081020404040) * 0x0020000008080808) >> 59; // 43218 flipped |= vertical_mirror(FLIPPED_5_V[outflank_b8g3g1]) & 0x0004081020404000; - outflank_h = outflank_right_H(((unsigned int) O >> 17) << 27) & (unsigned int)(P << 10); + outflank_h = outflank_right_H(((unsigned int) O >> 17) << 27) & (unsigned int) (P << 10); flipped |= (outflank_h * (unsigned int) -2) >> 10; return flipped; @@ -783,15 +786,15 @@ static unsigned long long flip_H3(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_f1h3h8, outflank_c8h3h1; unsigned long long flipped; - outflank_f1h3h8 = ((P & 0x8080808080804020) * 0x0008080810204081) >> 56; // (h8) - outflank_f1h3h8 = OUTFLANK_2[((O & 0x0080808080804000) * 0x0008080810204081) >> 57] & rotl8(outflank_f1h3h8, 4); + outflank_f1h3h8 = OUTFLANK_2[((O & 0x0080808080804000) * 0x0008080810204081) >> 57]; + outflank_f1h3h8 &= rotl8(((P & 0x8080808080804020) * 0x0008080810204081) >> 56, 4); // (h8) flipped = FLIPPED_2_V[outflank_f1h3h8] & 0x0080808080804000; - outflank_c8h3h1 = ((P & 0x0408102040808080) * 0x0010000004040404) >> 59; // 43218 - outflank_c8h3h1 = OUTFLANK_5[((O & 0x0008102040808000) * 0x0000804040404040) >> 57] & outflank_c8h3h1; + outflank_c8h3h1 = OUTFLANK_5[((O & 0x0008102040808000) * 0x0000804040404040) >> 57]; + outflank_c8h3h1 &= ((P & 0x0408102040808080) * 0x0010000004040404) >> 59; // 43218 flipped |= vertical_mirror(FLIPPED_5_V[outflank_c8h3h1]) & 0x0008102040808000; - outflank_h = outflank_right_H(((unsigned int) O >> 17) << 26) & (unsigned int)(P << 9); + outflank_h = outflank_right_H(((unsigned int) O >> 17) << 26) & (unsigned int) (P << 9); flipped |= (outflank_h * (unsigned int) -2) >> 9; return flipped; @@ -809,12 +812,12 @@ static unsigned 
long long flip_A4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_a1a4e8, outflank_a8a4d1; unsigned long long flipped; - outflank_a1a4e8 = ((P & 0x1008040201010101) * 0x4080000000020202) >> 59; // 21876 - outflank_a1a4e8 = OUTFLANK_3[((O & 0x0008040201010100) * 0x0102040808080808) >> 57] & outflank_a1a4e8; + outflank_a1a4e8 = OUTFLANK_3[((O & 0x0008040201010100) * 0x0102040808080808) >> 57]; + outflank_a1a4e8 &= ((P & 0x1008040201010101) * 0x4080000000020202) >> 59; // 21876 flipped = FLIPPED_3_V[outflank_a1a4e8] & 0x0008040201010100; - outflank_a8a4d1 = ((P & 0x0101010101020408) * 0x0202000000804020) >> 59; // 67812 - outflank_a8a4d1 = OUTFLANK_4[((O & 0x0001010101020400) * 0x1010101008040201) >> 57] & outflank_a8a4d1; + outflank_a8a4d1 = OUTFLANK_4[((O & 0x0001010101020400) * 0x1010101008040201) >> 57]; + outflank_a8a4d1 &= ((P & 0x0101010101020408) * 0x0202000000804020) >> 59; // 67812 flipped |= vertical_mirror(FLIPPED_4_V[outflank_a8a4d1]) & 0x0001010101020400; outflank_h = ((unsigned int) O + 0x02000000) & P; @@ -835,12 +838,12 @@ static unsigned long long flip_B4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_b1b4f8, outflank_b8b4e1; unsigned long long flipped; - outflank_b1b4f8 = ((P & 0x2010080402020202) * 0x2040000000010101) >> 59; // 21876 - outflank_b1b4f8 = OUTFLANK_3[((O & 0x0010080402020200) * 0x0081020404040404) >> 57] & outflank_b1b4f8; + outflank_b1b4f8 = OUTFLANK_3[((O & 0x0010080402020200) * 0x0081020404040404) >> 57]; + outflank_b1b4f8 &= ((P & 0x2010080402020202) * 0x2040000000010101) >> 59; // 21876 flipped = FLIPPED_3_V[outflank_b1b4f8] & 0x0010080402020200; - outflank_b8b4e1 = ((P & 0x0202020202040810) * 0x0101000000402010) >> 59; // 67812 - outflank_b8b4e1 = OUTFLANK_4[((O & 0x0002020202040800) * 0x1010101008040201) >> 58] & outflank_b8b4e1; + outflank_b8b4e1 = OUTFLANK_4[((O & 0x0002020202040800) * 0x1010101008040201) >> 58]; + outflank_b8b4e1 &= ((P & 0x0202020202040810) 
* 0x0101000000402010) >> 59; // 67812 flipped |= vertical_mirror(FLIPPED_4_V[outflank_b8b4e1]) & 0x0002020202040800; outflank_h = ((unsigned int) O + 0x04000000) & P; @@ -861,12 +864,12 @@ static unsigned long long flip_C4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_c1c4g8, outflank_c8c4f1; unsigned long long flipped; - outflank_c1c4g8 = ((P & 0x4020100804040404) * 0x0040810202020202) >> 56; // (g8) - outflank_c1c4g8 = OUTFLANK_3[((O & 0x0020100804040400) * 0x0040810202020202) >> 57] & rotl8(outflank_c1c4g8, 3); + outflank_c1c4g8 = OUTFLANK_3[((O & 0x0020100804040400) * 0x0040810202020202) >> 57]; + outflank_c1c4g8 &= rotl8(((P & 0x4020100804040404) * 0x0040810202020202) >> 56, 3); // (g8) flipped = FLIPPED_3_V[outflank_c1c4g8] & 0x0020100804040400; - outflank_c8c4f1 = ((P & 0x0404040404081020) * 0x0080800000201008) >> 59; // 67812 - outflank_c8c4f1 = OUTFLANK_4[((O & 0x0004040404081000) * 0x0404040402010080) >> 57] & outflank_c8c4f1; + outflank_c8c4f1 = OUTFLANK_4[((O & 0x0004040404081000) * 0x0404040402010080) >> 57]; + outflank_c8c4f1 &= ((P & 0x0404040404081020) * 0x0080800000201008) >> 59; // 67812 flipped |= vertical_mirror(FLIPPED_4_V[outflank_c8c4f1]) & 0x0004040404081000; outflank_h = OUTFLANK_2[(O >> 25) & 0x3f] & rotl8(P >> 24, 4); @@ -889,19 +892,19 @@ static unsigned long long flip_D4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d7, outflank_d9; unsigned long long flipped; - outflank_v = ((P & 0x0808080808080808) * 0x0810000000010204) >> 59; // 21876 - outflank_v = OUTFLANK_3[((O & 0x0008080808080800) * 0x0020408102040810) >> 57] & outflank_v; + outflank_v = OUTFLANK_3[((O & 0x0008080808080800) * 0x0020408102040810) >> 57]; + outflank_v &= ((P & 0x0808080808080808) * 0x0810000000010204) >> 59; // 21876 flipped = FLIPPED_3_V[outflank_v] & 0x0008080808080800; outflank_h = OUTFLANK_3[(O >> 25) & 0x3f] & rotl8(P >> 24, 3); flipped |= (unsigned char) FLIPPED_3_H[outflank_h] 
<< 24; - outflank_d7 = ((P & 0x0001020408102040) * 0x0040400000404000) >> 59; // ba0gf - outflank_d7 = OUTFLANK_3[((O & 0x0000020408102000) * 0x0101010101010101) >> 57] & outflank_d7; + outflank_d7 = OUTFLANK_3[(((HIDWORD(O) & 0x00000204) + (LODWORD(O) & 0x08102000)) * 0x01010101) >> 25]; + outflank_d7 &= ((P & 0x0001020408102040) * 0x0040400000404000) >> 59; // ba0gf flipped |= FLIPPED_3_H[outflank_d7] & 0x0000020408102000; - outflank_d9 = ((P & 0x8040201008040201) * 0x0101010101010101) >> 56; // (h8) - outflank_d9 = OUTFLANK_3[((O & 0x0040201008040200) * 0x0101010101010101) >> 57] & rotl8(outflank_d9, 3); + outflank_d9 = OUTFLANK_3[(((HIDWORD(O) & 0x00402010) + (LODWORD(O) & 0x08040200)) * 0x01010101) >> 25]; + outflank_d9 &= rotl8((((HIDWORD(P) & 0x80402010) + (LODWORD(P) & 0x08040201)) * 0x01010101) >> 24, 3); // (h8) flipped |= FLIPPED_3_H[outflank_d9] & 0x0040201008040200; return flipped; @@ -919,19 +922,19 @@ static unsigned long long flip_E4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d7, outflank_d9; unsigned long long flipped; - outflank_v = ((P & 0x1010101010101010) * 0x0408000000008102) >> 59; // 21876 - outflank_v = OUTFLANK_3[((O & 0x0010101010101000) * 0x0010204081020408) >> 57] & outflank_v; + outflank_v = OUTFLANK_3[((O & 0x0010101010101000) * 0x0010204081020408) >> 57]; + outflank_v &= ((P & 0x1010101010101010) * 0x0408000000008102) >> 59; // 21876 flipped = FLIPPED_3_V[outflank_v] & 0x0010101010101000; outflank_h = OUTFLANK_4[(O >> 25) & 0x3f] & rotl8(P >> 24, 2); flipped |= (unsigned char) FLIPPED_4_H[outflank_h] << 24; - outflank_d7 = ((P & 0x0102040810204080) * 0x0020200000202020) >> 59; // cbahg - outflank_d7 = OUTFLANK_4[((O & 0x0002040810204000) * 0x0101010101010101) >> 57] & outflank_d7; + outflank_d7 = OUTFLANK_4[(((HIDWORD(O) & 0x00020408) + (LODWORD(O) & 0x10204000)) * 0x01010101) >> 25]; + outflank_d7 &= ((P & 0x0102040810204080) * 0x0020200000202020) >> 59; // cbahg flipped |= 
FLIPPED_4_H[outflank_d7] & 0x0002040810204000; - outflank_d9 = ((P & 0x0080402010080402) * 0x0404000000040404) >> 56; // cbahg - outflank_d9 = OUTFLANK_4[((O & 0x0000402010080400) * 0x0101010101010101) >> 57] & outflank_d9; + outflank_d9 = OUTFLANK_4[(((HIDWORD(O) & 0x00004020) + (LODWORD(O) & 0x10080400)) * 0x01010101) >> 25]; + outflank_d9 &= ((P & 0x0080402010080402) * 0x0404000000040404) >> 56; // cbahg flipped |= FLIPPED_4_H[outflank_d9] & 0x0000402010080400; return flipped; @@ -949,12 +952,12 @@ static unsigned long long flip_F4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_c1f4f8, outflank_b8f4f1; unsigned long long flipped; - outflank_c1f4f8 = ((P & 0x2020202020100804) * 0x1010000000004081) >> 59; // 21876 - outflank_c1f4f8 = OUTFLANK_3[((O & 0x0020202020100800) * 0x0040404040810204) >> 57] & outflank_c1f4f8; + outflank_c1f4f8 = OUTFLANK_3[((O & 0x0020202020100800) * 0x0040404040810204) >> 57]; + outflank_c1f4f8 &= ((P & 0x2020202020100804) * 0x1010000000004081) >> 59; // 21876 flipped = FLIPPED_3_V[outflank_c1f4f8] & 0x0020202020100800; - outflank_b8f4f1 = ((P & 0x0204081020202020) * 0x0080400000101010) >> 59; // 67812 - outflank_b8f4f1 = OUTFLANK_4[((O & 0x0004081020202000) * 0x0804020101010101) >> 58] & outflank_b8f4f1; + outflank_b8f4f1 = OUTFLANK_4[((O & 0x0004081020202000) * 0x0804020101010101) >> 58]; + outflank_b8f4f1 &= ((P & 0x0204081020202020) * 0x0080400000101010) >> 59; // 67812 flipped |= vertical_mirror(FLIPPED_4_V[outflank_b8f4f1]) & 0x0004081020202000; outflank_h = OUTFLANK_5[(O >> 25) & 0x3f] & rotl8(P >> 24, 1); @@ -977,15 +980,15 @@ static unsigned long long flip_G4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_d1g4g8, outflank_c8g4g1; unsigned long long flipped; - outflank_d1g4g8 = ((P & 0x4040404040201008) * 0x0020202020408102) >> 56; // (g8) - outflank_d1g4g8 = OUTFLANK_3[((O & 0x0040404040201000) * 0x0020202020408102) >> 57] & rotl8(outflank_d1g4g8, 3); + 
outflank_d1g4g8 = OUTFLANK_3[((O & 0x0040404040201000) * 0x0020202020408102) >> 57]; + outflank_d1g4g8 &= rotl8(((P & 0x4040404040201008) * 0x0020202020408102) >> 56, 3); // (g8) flipped = FLIPPED_3_V[outflank_d1g4g8] & 0x0040404040201000; - outflank_c8g4g1 = ((P & 0x0408102040404040) * 0x0040200000080808) >> 59; // 67812 - outflank_c8g4g1 = OUTFLANK_4[((O & 0x0008102040404000) * 0x0001008040404040) >> 57] & outflank_c8g4g1; + outflank_c8g4g1 = OUTFLANK_4[((O & 0x0008102040404000) * 0x0001008040404040) >> 57]; + outflank_c8g4g1 &= ((P & 0x0408102040404040) * 0x0040200000080808) >> 59; // 67812 flipped |= vertical_mirror(FLIPPED_4_V[outflank_c8g4g1]) & 0x0008102040404000; - outflank_h = outflank_right_H(((unsigned int) O >> 25) << 27) & (unsigned int)(P << 2); + outflank_h = outflank_right_H(((unsigned int) O >> 25) << 27) & (unsigned int) (P << 2); flipped |= (outflank_h * (unsigned int) -2) >> 2; return flipped; @@ -1003,15 +1006,15 @@ static unsigned long long flip_H4(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_e1h4h8, outflank_d8h4h1; unsigned long long flipped; - outflank_e1h4h8 = ((P & 0x8080808080402010) * 0x0010101010204081) >> 56; // (h8) - outflank_e1h4h8 = OUTFLANK_3[((O & 0x0080808080402000) * 0x0010101010204081) >> 57] & rotl8(outflank_e1h4h8, 3); + outflank_e1h4h8 = OUTFLANK_3[((O & 0x0080808080402000) * 0x0010101010204081) >> 57]; + outflank_e1h4h8 &= rotl8(((P & 0x8080808080402010) * 0x0010101010204081) >> 56, 3); // (h8) flipped = FLIPPED_3_V[outflank_e1h4h8] & 0x0080808080402000; - outflank_d8h4h1 = ((P & 0x0810204080808080) * 0x0020100000040404) >> 59; // 67812 - outflank_d8h4h1 = OUTFLANK_4[((O & 0x0010204080808000) * 0x0000804020202020) >> 57] & outflank_d8h4h1; + outflank_d8h4h1 = OUTFLANK_4[((O & 0x0010204080808000) * 0x0000804020202020) >> 57]; + outflank_d8h4h1 &= ((P & 0x0810204080808080) * 0x0020100000040404) >> 59; // 67812 flipped |= vertical_mirror(FLIPPED_4_V[outflank_d8h4h1]) & 0x0010204080808000; 
- outflank_h = outflank_right_H(((unsigned int) O >> 25) << 26) & (unsigned int)(P << 1); + outflank_h = outflank_right_H(((unsigned int) O >> 25) << 26) & (unsigned int) (P << 1); flipped |= (outflank_h * (unsigned int) -2) >> 1; return flipped; @@ -1029,12 +1032,12 @@ static unsigned long long flip_A5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_a1a5d8, outflank_a8a5e1; unsigned long long flipped; - outflank_a1a5d8 = ((P & 0x0804020101010101) * 0x2040800000000202) >> 59; // 32187 - outflank_a1a5d8 = OUTFLANK_4[((O & 0x0004020101010100) * 0x0102040810101010) >> 57] & outflank_a1a5d8; + outflank_a1a5d8 = OUTFLANK_4[((O & 0x0004020101010100) * 0x0102040810101010) >> 57]; + outflank_a1a5d8 &= ((P & 0x0804020101010101) * 0x2040800000000202) >> 59; // 32187 flipped = FLIPPED_4_V[outflank_a1a5d8] & 0x0004020101010100; - outflank_a8a5e1 = ((P & 0x0101010102040810) * 0x0202020000008040) >> 59; // 78123 - outflank_a8a5e1 = OUTFLANK_3[((O & 0x0001010102040800) * 0x0808080808040201) >> 57] & outflank_a8a5e1; + outflank_a8a5e1 = OUTFLANK_3[((O & 0x0001010102040800) * 0x0808080808040201) >> 57]; + outflank_a8a5e1 &= ((P & 0x0101010102040810) * 0x0202020000008040) >> 59; // 78123 flipped |= vertical_mirror(FLIPPED_3_V[outflank_a8a5e1]) & 0x0001010102040800; outflank_h = ((unsigned int) (O >> 8) + 0x02000000) & (unsigned int) (P >> 8); @@ -1055,12 +1058,12 @@ static unsigned long long flip_B5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_b1b5e8, outflank_b8b5f1; unsigned long long flipped; - outflank_b1b5e8 = ((P & 0x1008040202020202) * 0x1020400000000101) >> 59; // 32187 - outflank_b1b5e8 = OUTFLANK_4[((O & 0x0008040202020200) * 0x0081020408080808) >> 57] & outflank_b1b5e8; + outflank_b1b5e8 = OUTFLANK_4[((O & 0x0008040202020200) * 0x0081020408080808) >> 57]; + outflank_b1b5e8 &= ((P & 0x1008040202020202) * 0x1020400000000101) >> 59; // 32187 flipped = FLIPPED_4_V[outflank_b1b5e8] & 0x0008040202020200; - 
outflank_b8b5f1 = ((P & 0x0202020204081020) * 0x0101010000004020) >> 59; // 78123 - outflank_b8b5f1 = OUTFLANK_3[((O & 0x0002020204081000) * 0x0808080808040201) >> 58] & outflank_b8b5f1; + outflank_b8b5f1 = OUTFLANK_3[((O & 0x0002020204081000) * 0x0808080808040201) >> 58]; + outflank_b8b5f1 &= ((P & 0x0202020204081020) * 0x0101010000004020) >> 59; // 78123 flipped |= vertical_mirror(FLIPPED_3_V[outflank_b8b5f1]) & 0x0002020204081000; outflank_h = ((unsigned int) (O >> 8) + 0x04000000) & (unsigned int) (P >> 8); @@ -1081,12 +1084,12 @@ static unsigned long long flip_C5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_c1c5f8, outflank_c8c5g1; unsigned long long flipped; - outflank_c1c5f8 = ((P & 0x2010080404040404) * 0x0040810204040404) >> 56; // (f8) - outflank_c1c5f8 = OUTFLANK_4[((O & 0x0010080404040400) * 0x0040810204040404) >> 57] & rotl8(outflank_c1c5f8, 2); + outflank_c1c5f8 = OUTFLANK_4[((O & 0x0010080404040400) * 0x0040810204040404) >> 57]; + outflank_c1c5f8 &= rotl8(((P & 0x2010080404040404) * 0x0040810204040404) >> 56, 2); // (f8) flipped = FLIPPED_4_V[outflank_c1c5f8] & 0x0010080404040400; - outflank_c8c5g1 = ((P & 0x0404040408102040) * 0x0080808000002010) >> 59; // 78123 - outflank_c8c5g1 = OUTFLANK_3[((O & 0x0004040408102000) * 0x0002020202010080) >> 57] & outflank_c8c5g1; + outflank_c8c5g1 = OUTFLANK_3[((O & 0x0004040408102000) * 0x0002020202010080) >> 57]; + outflank_c8c5g1 &= ((P & 0x0404040408102040) * 0x0080808000002010) >> 59; // 78123 flipped |= vertical_mirror(FLIPPED_3_V[outflank_c8c5g1]) & 0x0004040408102000; outflank_h = OUTFLANK_2[(O >> 33) & 0x3f] & rotl8(P >> 32, 4); @@ -1109,19 +1112,19 @@ static unsigned long long flip_D5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d7, outflank_d9; unsigned long long flipped; - outflank_v = ((P & 0x0808080808080808) * 0x0408100000000102) >> 59; // 32187 - outflank_v = OUTFLANK_4[((O & 0x0008080808080800) * 
0x0020408102040810) >> 57] & outflank_v; + outflank_v = OUTFLANK_4[((O & 0x0008080808080800) * 0x0020408102040810) >> 57]; + outflank_v &= ((P & 0x0808080808080808) * 0x0408100000000102) >> 59; // 32187 flipped = FLIPPED_4_V[outflank_v] & 0x0008080808080800; outflank_h = OUTFLANK_3[(O >> 33) & 0x3f] & rotl8(P >> 32, 3); flipped |= (unsigned long long)(unsigned char) FLIPPED_3_H[outflank_h] << 32; - outflank_d7 = ((P & 0x0102040810204080) * 0x0040404000004040) >> 59; // bahgf - outflank_d7 = OUTFLANK_3[((O & 0x0002040810204000) * 0x0101010101010101) >> 57] & outflank_d7; + outflank_d7 = OUTFLANK_3[(((HIDWORD(O) & 0x00020408) + (LODWORD(O) & 0x10204000)) * 0x01010101) >> 25]; + outflank_d7 &= ((P & 0x0102040810204080) * 0x0040404000004040) >> 59; // bahgf flipped |= FLIPPED_3_H[outflank_d7] & 0x0002040810204000; - outflank_d9 = ((P & 0x4020100804020100) * 0x0101010101010101) >> 56; // (g8) - outflank_d9 = OUTFLANK_3[((O & 0x0020100804020000) * 0x0101010101010101) >> 57] & rotl8(outflank_d9, 3); + outflank_d9 = OUTFLANK_3[(((HIDWORD(O) & 0x00201008) + (LODWORD(O) & 0x04020000)) * 0x01010101) >> 25]; + outflank_d9 &= rotl8((((HIDWORD(P) & 0x40201008) + (LODWORD(P) & 0x04020100)) * 0x01010101) >> 24, 3); // (g8) flipped |= FLIPPED_3_H[outflank_d9] & 0x0020100804020000; return flipped; @@ -1139,19 +1142,19 @@ static unsigned long long flip_E5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d7, outflank_d9; unsigned long long flipped; - outflank_v = ((P & 0x1010101010101010) * 0x0204080000000081) >> 59; // 32187 - outflank_v = OUTFLANK_4[((O & 0x0010101010101000) * 0x0010204081020408) >> 57] & outflank_v; + outflank_v = OUTFLANK_4[((O & 0x0010101010101000) * 0x0010204081020408) >> 57]; + outflank_v &= ((P & 0x1010101010101010) * 0x0204080000000081) >> 59; // 32187 flipped = FLIPPED_4_V[outflank_v] & 0x0010101010101000; outflank_h = OUTFLANK_4[(O >> 33) & 0x3f] & rotl8(P >> 32, 2); flipped |= (unsigned long long)(unsigned char) 
FLIPPED_4_H[outflank_h] << 32; - outflank_d7 = ((P & 0x0204081020408000) * 0x0000202000002020) >> 59; // cb0hg - outflank_d7 = OUTFLANK_4[((O & 0x0004081020400000) * 0x0101010101010101) >> 57] & outflank_d7; + outflank_d7 = OUTFLANK_4[(((HIDWORD(O) & 0x00040810) + (LODWORD(O) & 0x20400000)) * 0x01010101) >> 25]; + outflank_d7 &= ((P & 0x0204081020408000) * 0x0000202000002020) >> 59; // cb0hg flipped |= FLIPPED_4_H[outflank_d7] & 0x0004081020400000; - outflank_d9 = ((P & 0x8040201008040201) * 0x0101010101010101) >> 56; // (h8) - outflank_d9 = OUTFLANK_4[((O & 0x0040201008040200) * 0x0101010101010101) >> 57] & rotl8(outflank_d9, 2); + outflank_d9 = OUTFLANK_4[(((HIDWORD(O) & 0x00402010) + (LODWORD(O) & 0x08040200)) * 0x01010101) >> 25]; + outflank_d9 &= rotl8((((HIDWORD(P) & 0x80402010) + (LODWORD(P) & 0x08040201)) * 0x01010101) >> 24, 2); // (h8) flipped |= FLIPPED_4_H[outflank_d9] & 0x0040201008040200; return flipped; @@ -1169,12 +1172,12 @@ static unsigned long long flip_F5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_b1f5f8, outflank_c8f5f1; unsigned long long flipped; - outflank_b1f5f8 = ((P & 0x2020202010080402) * 0x0080808080810204) >> 56; // (f8) - outflank_b1f5f8 = OUTFLANK_4[((O & 0x0020202010080400) * 0x0080808080810204) >> 57] & rotl8(outflank_b1f5f8, 2); + outflank_b1f5f8 = OUTFLANK_4[((O & 0x0020202010080400) * 0x0080808080810204) >> 57]; + outflank_b1f5f8 &= rotl8(((P & 0x2020202010080402) * 0x0080808080810204) >> 56, 2); // (f8) flipped = FLIPPED_4_V[outflank_b1f5f8] & 0x0020202010080400; - outflank_c8f5f1 = ((P & 0x0408102020202020) * 0x0100804000001010) >> 59; // 78123 - outflank_c8f5f1 = OUTFLANK_3[((O & 0x0008102020202000) * 0x0002010080404040) >> 57] & outflank_c8f5f1; + outflank_c8f5f1 = OUTFLANK_3[((O & 0x0008102020202000) * 0x0002010080404040) >> 57]; + outflank_c8f5f1 &= ((P & 0x0408102020202020) * 0x0100804000001010) >> 59; // 78123 flipped |= vertical_mirror(FLIPPED_3_V[outflank_c8f5f1]) & 
0x0008102020202000; outflank_h = OUTFLANK_5[(O >> 33) & 0x3f] & rotl8(P >> 32, 1); @@ -1197,16 +1200,16 @@ static unsigned long long flip_G5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_c1g5g8, outflank_d8g5g1; unsigned long long flipped; - outflank_c1g5g8 = ((P & 0x4040404020100804) * 0x0040404040408102) >> 56; // (g8) - outflank_c1g5g8 = OUTFLANK_4[((O & 0x0040404020100800) * 0x0040404040408102) >> 57] & rotl8(outflank_c1g5g8, 2); + outflank_c1g5g8 = OUTFLANK_4[((O & 0x0040404020100800) * 0x0040404040408102) >> 57]; + outflank_c1g5g8 &= rotl8(((P & 0x4040404020100804) * 0x0040404040408102) >> 56, 2); // (g8) flipped = FLIPPED_4_V[outflank_c1g5g8] & 0x0040404020100800; - outflank_d8g5g1 = ((P & 0x0810204040404040) * 0x0080402000000808) >> 59; // 78123 - outflank_d8g5g1 = OUTFLANK_3[((O & 0x0010204040404000) * 0x0001008040202020) >> 57] & outflank_d8g5g1; + outflank_d8g5g1 = OUTFLANK_3[((O & 0x0010204040404000) * 0x0001008040202020) >> 57]; + outflank_d8g5g1 &= ((P & 0x0810204040404040) * 0x0080402000000808) >> 59; // 78123 flipped |= vertical_mirror(FLIPPED_3_V[outflank_d8g5g1]) & 0x0010204040404000; - outflank_h = outflank_right_H((unsigned int)(O >> 33) << 27) & (unsigned int)(P >> 6); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 6; + outflank_h = outflank_right_H((unsigned int) (O >> 33) << 27) & (unsigned int) (P >> 6); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 6; return flipped; } @@ -1223,16 +1226,16 @@ static unsigned long long flip_H5(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_d1h5h8, outflank_e8h5h1; unsigned long long flipped; - outflank_d1h5h8 = ((P & 0x8080808040201008) * 0x0020202020204081) >> 56; // (h8) - outflank_d1h5h8 = OUTFLANK_4[((O & 0x0080808040201000) * 0x0020202020204081) >> 57] & rotl8(outflank_d1h5h8, 2); + outflank_d1h5h8 = OUTFLANK_4[((O & 0x0080808040201000) * 0x0020202020204081) >> 57]; + outflank_d1h5h8 &= 
rotl8(((P & 0x8080808040201008) * 0x0020202020204081) >> 56, 2); // (h8) flipped = FLIPPED_4_V[outflank_d1h5h8] & 0x0080808040201000; - outflank_e8h5h1 = ((P & 0x1020408080808080) * 0x0040201000000404) >> 59; // 78123 - outflank_e8h5h1 = OUTFLANK_3[((O & 0x0020408080808000) * 0x0000804020101010) >> 57] & outflank_e8h5h1; + outflank_e8h5h1 = OUTFLANK_3[((O & 0x0020408080808000) * 0x0000804020101010) >> 57]; + outflank_e8h5h1 &= ((P & 0x1020408080808080) * 0x0040201000000404) >> 59; // 78123 flipped |= vertical_mirror(FLIPPED_3_V[outflank_e8h5h1]) & 0x0020408080808000; - outflank_h = outflank_right_H((unsigned int)(O >> 33) << 26) & (unsigned int)(P >> 7); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 7; + outflank_h = outflank_right_H((unsigned int) (O >> 33) << 26) & (unsigned int) (P >> 7); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 7; return flipped; } @@ -1249,12 +1252,12 @@ static unsigned long long flip_A6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_a1a6c8, outflank_a8a6f1; unsigned long long flipped; - outflank_a1a6c8 = ((P & 0x0402010101010101) * 0x1020408000000002) >> 59; // 43218 - outflank_a1a6c8 = OUTFLANK_5[((O & 0x0002010101010100) * 0x0102040810202020) >> 57] & outflank_a1a6c8; + outflank_a1a6c8 = OUTFLANK_5[((O & 0x0002010101010100) * 0x0102040810202020) >> 57]; + outflank_a1a6c8 &= ((P & 0x0402010101010101) * 0x1020408000000002) >> 59; // 43218 flipped = FLIPPED_5_V[outflank_a1a6c8] & 0x0002010101010100; - outflank_a8a6f1 = ((P & 0x0101010204081020) * 0x0202020200000080) >> 59; // 81234 - outflank_a8a6f1 = OUTFLANK_2[((O & 0x0001010204081000) * 0x0404040404040201) >> 57] & outflank_a8a6f1; + outflank_a8a6f1 = OUTFLANK_2[((O & 0x0001010204081000) * 0x0404040404040201) >> 57]; + outflank_a8a6f1 &= ((P & 0x0101010204081020) * 0x0202020200000080) >> 59; // 81234 flipped |= vertical_mirror(FLIPPED_2_V[outflank_a8a6f1]) & 0x0001010204081000; outflank_h = ((unsigned int) 
(O >> 16) + 0x02000000) & (unsigned int) (P >> 16); @@ -1275,12 +1278,12 @@ static unsigned long long flip_B6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_b1b6d8, outflank_b8b6g1; unsigned long long flipped; - outflank_b1b6d8 = ((P & 0x0804020202020202) * 0x0810204000000001) >> 59; // 43218 - outflank_b1b6d8 = OUTFLANK_5[((O & 0x0004020202020200) * 0x0081020408101010) >> 57] & outflank_b1b6d8; + outflank_b1b6d8 = OUTFLANK_5[((O & 0x0004020202020200) * 0x0081020408101010) >> 57]; + outflank_b1b6d8 &= ((P & 0x0804020202020202) * 0x0810204000000001) >> 59; // 43218 flipped = FLIPPED_5_V[outflank_b1b6d8] & 0x0004020202020200; - outflank_b8b6g1 = ((P & 0x0202020408102040) * 0x0101010100000040) >> 59; // 81234 - outflank_b8b6g1 = OUTFLANK_2[((O & 0x0002020408102000) * 0x0404040404040201) >> 58] & outflank_b8b6g1; + outflank_b8b6g1 = OUTFLANK_2[((O & 0x0002020408102000) * 0x0404040404040201) >> 58]; + outflank_b8b6g1 &= ((P & 0x0202020408102040) * 0x0101010100000040) >> 59; // 81234 flipped |= vertical_mirror(FLIPPED_2_V[outflank_b8b6g1]) & 0x0002020408102000; outflank_h = ((unsigned int) (O >> 16) + 0x04000000) & (unsigned int) (P >> 16); @@ -1301,15 +1304,15 @@ static unsigned long long flip_C6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d7; unsigned long long flipped; - outflank_v = ((P & 0x0404040404040404) * 0x0408102000000002) >> 59; // 43218 - outflank_v = OUTFLANK_5[((O & 0x0004040404040400) * 0x0040810204081020) >> 57] & outflank_v; + outflank_v = OUTFLANK_5[((O & 0x0004040404040400) * 0x0040810204081020) >> 57]; + outflank_v &= ((P & 0x0404040404040404) * 0x0408102000000002) >> 59; // 43218 flipped = FLIPPED_5_V[outflank_v] & 0x0004040404040400; outflank_h = OUTFLANK_2[(O >> 41) & 0x3f] & rotl8(P >> 40, 4); flipped |= (unsigned long long)(unsigned char) FLIPPED_2_H[outflank_h] << 40; - outflank_d7 = ((P & 0x0102040810204080) * 0x0080808080000080) >> 59; // ahgfe - outflank_d7 = 
OUTFLANK_2[((O & 0x0002040810204000) * 0x0101010101010101) >> 57] & outflank_d7; + outflank_d7 = OUTFLANK_2[(((HIDWORD(O) & 0x00020408) + (LODWORD(O) & 0x10204000)) * 0x01010101) >> 25]; + outflank_d7 &= ((P & 0x0102040810204080) * 0x0080808080000080) >> 59; // ahgfe flipped |= FLIPPED_2_H[outflank_d7] & 0x0002040810204000; flipped |= ((P >> 9) | (P << 9)) & 0x0008000200000000 & O; @@ -1329,15 +1332,15 @@ static unsigned long long flip_D6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d; unsigned long long flipped; - outflank_v = ((P & 0x0808080808080808) * 0x0204081020408001) >> 59; // 43218 - outflank_v = OUTFLANK_5[((O & 0x0008080808080800) * 0x0020408102040810) >> 57] & outflank_v; + outflank_v = OUTFLANK_5[((O & 0x0008080808080800) * 0x0020408102040810) >> 57]; + outflank_v &= ((P & 0x0808080808080808) * 0x0204081020408001) >> 59; // 43218 flipped = FLIPPED_5_V[outflank_v] & 0x0008080808080800; outflank_h = OUTFLANK_3[(O >> 41) & 0x3f] & rotl8(P >> 40, 3); flipped |= (unsigned long long)(unsigned char) FLIPPED_3_H[outflank_h] << 40; - outflank_d = ((P & 0x0000081422418000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0000081422400000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 16) & 0x08142240) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000081422418000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... 
flipped |= FLIPPED_3_H[outflank_d] & 0x0000081422400000; // A3D6H2 flipped |= (((P >> 9) & 0x0010000000000000) | ((P >> 7) & 0x0004000000000000)) & O; @@ -1357,15 +1360,15 @@ static unsigned long long flip_E6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d; unsigned long long flipped; - outflank_v = ((P & 0x1010101010101010) * 0x0010204081020408) >> 56; // (e8) - outflank_v = OUTFLANK_5[((O & 0x0010101010101000) * 0x0010204081020408) >> 57] & rotl8(outflank_v, 1); + outflank_v = OUTFLANK_5[((O & 0x0010101010101000) * 0x0010204081020408) >> 57]; + outflank_v &= rotl8(((P & 0x1010101010101010) * 0x0010204081020408) >> 56, 1); // (e8) flipped = FLIPPED_5_V[outflank_v] & 0x0010101010101000; outflank_h = OUTFLANK_4[(O >> 41) & 0x3f] & rotl8(P >> 40, 2); flipped |= (unsigned long long)(unsigned char) FLIPPED_4_H[outflank_h] << 40; - outflank_d = ((P & 0x0000102844820100) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0000102844020000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 16) & 0x10284402) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000102844820100) * 0x0101010101010101) >> 54; // hgfed[cbahg]... 
flipped |= FLIPPED_4_H[outflank_d] & 0x0000102844020000; // A2E6H3 flipped |= (((P >> 9) & 0x0020000000000000) | ((P >> 7) & 0x0008000000000000)) & O; @@ -1385,8 +1388,8 @@ static unsigned long long flip_F6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_v, outflank_d9; unsigned long long flipped; - outflank_v = ((P & 0x2020202020202020) * 0x0008102040810204) >> 56; // (f8) - outflank_v = OUTFLANK_5[((O & 0x0020202020202000) * 0x0008102040810204) >> 57] & rotl8(outflank_v, 1); + outflank_v = OUTFLANK_5[((O & 0x0020202020202000) * 0x0008102040810204) >> 57]; + outflank_v &= rotl8(((P & 0x2020202020202020) * 0x0008102040810204) >> 56, 1); // (f8) flipped = FLIPPED_5_V[outflank_v] & 0x0020202020202000; outflank_h = OUTFLANK_5[(O >> 41) & 0x3f] & rotl8(P >> 40, 1); @@ -1394,8 +1397,8 @@ static unsigned long long flip_F6(const unsigned long long P, const unsigned lon flipped |= ((P >> 7) | (P << 7)) & 0x0010004000000000 & O; - outflank_d9 = ((P & 0x8040201008040201) * 0x0101010101010101) >> 56; // (h8) - outflank_d9 = OUTFLANK_5[((O & 0x0040201008040200) * 0x0101010101010101) >> 57] & rotl8(outflank_d9, 1); + outflank_d9 = OUTFLANK_5[(((HIDWORD(O) & 0x00402010) + (LODWORD(O) & 0x08040200)) * 0x01010101) >> 25]; + outflank_d9 &= rotl8((((HIDWORD(P) & 0x80402010) + (LODWORD(P) & 0x08040201)) * 0x01010101) >> 24, 1); // (h8) flipped |= FLIPPED_5_H[outflank_d9] & 0x0040201008040200; return flipped; @@ -1413,16 +1416,16 @@ static unsigned long long flip_G6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_b1g6g8, outflank_e8g6g1; unsigned long long flipped; - outflank_b1g6g8 = ((P & 0x4040402010080402) * 0x0080808080808102) >> 56; // (g8) - outflank_b1g6g8 = OUTFLANK_5[((O & 0x0040402010080400) * 0x0080808080808102) >> 57] & rotl8(outflank_b1g6g8, 1); + outflank_b1g6g8 = OUTFLANK_5[((O & 0x0040402010080400) * 0x0080808080808102) >> 57]; + outflank_b1g6g8 &= rotl8(((P & 0x4040402010080402) * 0x0080808080808102) 
>> 56, 1); // (g8) flipped = FLIPPED_5_V[outflank_b1g6g8] & 0x0040402010080400; - outflank_e8g6g1 = ((P & 0x1020404040404040) * 0x0100804020000008) >> 59; // 81234 - outflank_e8g6g1 = OUTFLANK_2[((O & 0x0020404040404000) * 0x0001008040201010) >> 57] & outflank_e8g6g1; + outflank_e8g6g1 = OUTFLANK_2[((O & 0x0020404040404000) * 0x0001008040201010) >> 57]; + outflank_e8g6g1 &= ((P & 0x1020404040404040) * 0x0100804020000008) >> 59; // 81234 flipped |= vertical_mirror(FLIPPED_2_V[outflank_e8g6g1]) & 0x0020404040404000; - outflank_h = outflank_right_H((unsigned int)(O >> 41) << 27) & (unsigned int)(P >> 14); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 14; + outflank_h = outflank_right_H((unsigned int) (O >> 41) << 27) & (unsigned int) (P >> 14); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 14; return flipped; } @@ -1439,16 +1442,16 @@ static unsigned long long flip_H6(const unsigned long long P, const unsigned lon unsigned int outflank_h, outflank_c1h6h8, outflank_f8h6h1; unsigned long long flipped; - outflank_c1h6h8 = ((P & 0x8080804020100804) * 0x0040404040404081) >> 56; // (h8) - outflank_c1h6h8 = OUTFLANK_5[((O & 0x0080804020100800) * 0x0040404040404081) >> 57] & rotl8(outflank_c1h6h8, 1); + outflank_c1h6h8 = OUTFLANK_5[((O & 0x0080804020100800) * 0x0040404040404081) >> 57]; + outflank_c1h6h8 &= rotl8(((P & 0x8080804020100804) * 0x0040404040404081) >> 56, 1); // (h8) flipped = FLIPPED_5_V[outflank_c1h6h8] & 0x0080804020100800; - outflank_f8h6h1 = ((P & 0x2040808080808080) * 0x0080402010000004) >> 59; // 81234 - outflank_f8h6h1 = OUTFLANK_2[((O & 0x0040808080808000) * 0x0000804020100808) >> 57] & outflank_f8h6h1; + outflank_f8h6h1 = OUTFLANK_2[((O & 0x0040808080808000) * 0x0000804020100808) >> 57]; + outflank_f8h6h1 &= ((P & 0x2040808080808080) * 0x0080402010000004) >> 59; // 81234 flipped |= vertical_mirror(FLIPPED_2_V[outflank_f8h6h1]) & 0x0040808080808000; - outflank_h = outflank_right_H((unsigned int)(O >> 41) << 
26) & (unsigned int)(P >> 15); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 15; + outflank_h = outflank_right_H((unsigned int) (O >> 41) << 26) & (unsigned int) (P >> 15); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 15; return flipped; } @@ -1516,8 +1519,8 @@ static unsigned long long flip_C7(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0000040404040404) & P; flipped = (outflank_v * -2) & 0x0000040404040404; - outflank_d = ((P & 0x00040a1120408000) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]... - outflank_d = OUTFLANK_2[((O & 0x00040a1020400000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_2[(((HIDWORD(O) & 0x00040a10) + (LODWORD(O) & 0x20400000)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x00040a1120408000) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]... flipped |= FLIPPED_2_H[outflank_d] & 0x00040a1020400000; // A5C7H2 outflank_h = OUTFLANK_2[(O >> 49) & 0x3f] & rotl8(P >> 48, 4); @@ -1541,8 +1544,8 @@ static unsigned long long flip_D7(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0000080808080808) & P; flipped = (outflank_v * -2) & 0x0000080808080808; - outflank_d = ((P & 0x0008142241800000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0008142240000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 24) & 0x08142240) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0008142241800000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... 
flipped |= FLIPPED_3_H[outflank_d] & 0x0008142240000000; // A4D7H3 outflank_h = OUTFLANK_3[(O >> 49) & 0x3f] & rotl8(P >> 48, 3); @@ -1566,8 +1569,8 @@ static unsigned long long flip_E7(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0000101010101010) & P; flipped = (outflank_v * -2) & 0x0000101010101010; - outflank_d = ((P & 0x0010284482010000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0010284402000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 24) & 0x10284402) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0010284482010000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... flipped |= FLIPPED_4_H[outflank_d] & 0x0010284402000000; // A3E7H4 outflank_h = OUTFLANK_4[(O >> 49) & 0x3f] & rotl8(P >> 48, 2); @@ -1591,8 +1594,8 @@ static unsigned long long flip_F7(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0000202020202020) & P; flipped = (outflank_v * -2) & 0x0000202020202020; - outflank_d = ((P & 0x0020508804020100) * 0x0101010101010101) >> 55; // hgfe[dcbah]... - outflank_d = OUTFLANK_5[((O & 0x0020500804020000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_5[(((HIDWORD(O) & 0x00205008) + (LODWORD(O) & 0x04020000)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0020508804020100) * 0x0101010101010101) >> 55; // hgfe[dcbah]... 
flipped |= FLIPPED_5_H[outflank_d] & 0x0020500804020000; // A2F7H5 outflank_h = OUTFLANK_5[(O >> 49) & 0x3f] & rotl8(P >> 48, 1); @@ -1619,8 +1622,8 @@ static unsigned long long flip_G7(const unsigned long long P, const unsigned lon outflank_d9 = outflank_right(O, 0x0000201008040201) & P; flipped |= (outflank_d9 * -2) & 0x0000201008040201; - outflank_h = outflank_right_H((unsigned int)(O >> 49) << 27) & (unsigned int)(P >> 22); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 22; + outflank_h = outflank_right_H((unsigned int) (O >> 49) << 27) & (unsigned int) (P >> 22); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 22; return flipped; } @@ -1643,8 +1646,8 @@ static unsigned long long flip_H7(const unsigned long long P, const unsigned lon outflank_d9 = outflank_right(O, 0x0000402010080402) & P; flipped |= (outflank_d9 * -2) & 0x0000402010080402; - outflank_h = outflank_right_H((unsigned int)(O >> 49) << 26) & (unsigned int)(P >> 23); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 23; + outflank_h = outflank_right_H((unsigned int) (O >> 49) << 26) & (unsigned int) (P >> 23); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 23; return flipped; } @@ -1710,8 +1713,8 @@ static unsigned long long flip_C8(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0004040404040404) & P; flipped = (outflank_v * -2) & 0x0004040404040404; - outflank_d = ((P & 0x040a112040800000) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]d0ba... - outflank_d = OUTFLANK_2[((O & 0x040a102040000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_2[(((HIDWORD(O) & 0x040a1020) + (LODWORD(O) & 0x40000000)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x040a112040800000) * 0x0101010101010101) >> 52; // hgfedcb[ahgfe]d0ba... 
flipped |= FLIPPED_2_H[outflank_d] & 0x040a102040000000; // A6C8H3 outflank_h = OUTFLANK_2[(O >> 57) & 0x3f] & rotl8(P >> 56, 4); @@ -1735,8 +1738,8 @@ static unsigned long long flip_D8(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0008080808080808) & P; flipped = (outflank_v * -2) & 0x0008080808080808; - outflank_d = ((P & 0x0814224180000000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]e0cba... - outflank_d = OUTFLANK_3[((O & 0x0814224000000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[((HIDWORD(O) & 0x08142240) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0814224180000000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]e0cba... flipped |= FLIPPED_3_H[outflank_d] & 0x0814224000000000; // A5D8H4 outflank_h = OUTFLANK_3[(O >> 57) & 0x3f] & rotl8(P >> 56, 3); @@ -1760,8 +1763,8 @@ static unsigned long long flip_E8(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0010101010101010) & P; flipped = (outflank_v * -2) & 0x0010101010101010; - outflank_d = ((P & 0x1028448201000000) * 0x0101010101010101) >> 54; // hgfed[cbahg]f0dcba... - outflank_d = OUTFLANK_4[((O & 0x1028440200000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[((HIDWORD(O) & 0x10284402) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x1028448201000000) * 0x0101010101010101) >> 54; // hgfed[cbahg]f0dcba... flipped |= FLIPPED_4_H[outflank_d] & 0x1028440200000000; // A4E8H5 outflank_h = OUTFLANK_4[(O >> 57) & 0x3f] & rotl8(P >> 56, 2); @@ -1785,8 +1788,8 @@ static unsigned long long flip_F8(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0020202020202020) & P; flipped = (outflank_v * -2) & 0x0020202020202020; - outflank_d = ((P & 0x2050880402010000) * 0x0101010101010101) >> 55; // hgfe[dcbah]g0edcba... 
- outflank_d = OUTFLANK_5[((O & 0x2050080402000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_5[(((HIDWORD(O) & 0x20500804) + (LODWORD(O) & 0x02000000)) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x2050880402010000) * 0x0101010101010101) >> 55; // hgfe[dcbah]g0edcba... flipped |= FLIPPED_5_H[outflank_d] & 0x2050080402000000; // A3F8H6 outflank_h = OUTFLANK_5[(O >> 57) & 0x3f] & rotl8(P >> 56, 1); @@ -1813,8 +1816,8 @@ static unsigned long long flip_G8(const unsigned long long P, const unsigned lon outflank_d9 = outflank_right(O, 0x0020100804020100) & P; flipped |= (outflank_d9 * -2) & 0x0020100804020100; - outflank_h = outflank_right_H((unsigned int)(O >> 57) << 27) & (unsigned int)(P >> 30); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 30; + outflank_h = outflank_right_H((unsigned int) (O >> 57) << 27) & (unsigned int) (P >> 30); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 30; return flipped; } @@ -1837,8 +1840,8 @@ static unsigned long long flip_H8(const unsigned long long P, const unsigned lon outflank_d9 = outflank_right(O, 0x0040201008040201) & P; flipped |= (outflank_d9 * -2) & 0x0040201008040201; - outflank_h = outflank_right_H((unsigned int)(O >> 57) << 26) & (unsigned int)(P >> 31); - flipped |= (unsigned long long)(outflank_h * (unsigned int) -2) << 31; + outflank_h = outflank_right_H((unsigned int) (O >> 57) << 26) & (unsigned int) (P >> 31); + flipped |= (unsigned long long) (outflank_h * (unsigned int) -2) << 31; return flipped; } diff --git a/src/flip_neon_bitscan.c b/src/flip_neon_bitscan.c index 27ece87..4311103 100644 --- a/src/flip_neon_bitscan.c +++ b/src/flip_neon_bitscan.c @@ -34,9 +34,8 @@ * returned to generate moves. * * If the OUTFLANK search is in LSB to MSB direction, carry propagation - * can be used to determine contiguous opponent discs. 
- * If the OUTFLANK search is in MSB to LSB direction, lzcnt64 is used if - * available, or __builtin_bswap is used to use carry propagation backwards. + * (with Neon if appropriate) can be used to determine contiguous opponent discs. + * If the OUTFLANK search is in MSB to LSB direction, lzcnt64 is used. * * @date 1998 - 2020 * @author Richard Delorme @@ -44,9 +43,11 @@ * @version 4.4 */ -#include "arm_neon.h" #include "bit_intrinsics.h" +// included from board.c or linked in Android Arm32 dispatch build +#if defined(flip_neon) || (defined(ANDROID) && defined(__arm__) && !defined(hasNeon)) + /** rotated outflank array (indexed with inner 6 bits) */ static const unsigned char OUTFLANK_3[64] = { // ...bahgf 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x11, 0x09, 0x00, 0x00, 0x00, 0x00, @@ -198,8 +199,8 @@ static unsigned long long flip_D1(const unsigned long long P, const unsigned lon outflank_v = (outflank_v & -outflank_v) & 0x0808080808080800 & P; flipped = OutflankToFlipmask(outflank_v) & 0x0808080808080800; - outflank_d = ((P & 0x0000008041221408) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0000000040221408) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) O & 0x40221408) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000008041221408) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0000000040221408; // A4D1H5 outflank_h = OUTFLANK_3[(O >> 1) & 0x3f] & rotl8(P, 3); @@ -224,8 +225,8 @@ static unsigned long long flip_E1(const unsigned long long P, const unsigned lon outflank_v = (outflank_v & -outflank_v) & 0x1010101010101000 & P; flipped = OutflankToFlipmask(outflank_v) & 0x1010101010101000; - outflank_d = ((P & 0x0000000182442810) * 0x0101010101010101) >> 54; // hgfed[cbahg]... 
- outflank_d = OUTFLANK_4[((O & 0x0000000002442810) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) O & 0x02442810) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000000182442810) * 0x0101010101010101) >> 54; // hgfed[cbahg]... flipped |= FLIPPED_4_H[outflank_d] & 0x0000000002442810; // A5E1H4 outflank_h = OUTFLANK_4[(O >> 1) & 0x3f] & rotl8(P, 2); @@ -398,8 +399,8 @@ static unsigned long long flip_D2(const unsigned long long P, const unsigned lon outflank_v = (outflank_v & -outflank_v) & 0x0808080808080000 & P; flipped = OutflankToFlipmask(outflank_v) & 0x0808080808080000; - outflank_d = ((P & 0x0000804122140800) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0000004022140800) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 8) & 0x40221408) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000804122140800) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0000004022140800; // A5D2H6 outflank_h = OUTFLANK_3[(O >> 9) & 0x3f] & rotl8(P >> 8, 3); @@ -424,8 +425,8 @@ static unsigned long long flip_E2(const unsigned long long P, const unsigned lon outflank_v = (outflank_v & -outflank_v) & 0x1010101010100000 & P; flipped = OutflankToFlipmask(outflank_v) & 0x1010101010100000; - outflank_d = ((P & 0x0000018244281000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0000000244281000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 8) & 0x02442810) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000018244281000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... 
flipped |= FLIPPED_4_H[outflank_d] & 0x0000000244281000; // A6E2H5 outflank_h = OUTFLANK_4[(O >> 9) & 0x3f] & rotl8(P >> 8, 2); @@ -614,8 +615,8 @@ static unsigned long long flip_D3(const unsigned long long P, const unsigned lon outflank_h = OUTFLANK_3[(O >> 17) & 0x3f] & rotl8(P >> 16, 3); flipped |= (unsigned char) FLIPPED_3_H[outflank_h] << 16; - outflank_d = ((P & 0x0080412214080000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0000402214080000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 16) & 0x40221408) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0080412214080000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0000402214080000; // A6D3H7 flipped |= (((P << 7) & 0x0000000000001000) | ((P << 8) & 0x000000000000800) | ((P << 9) & 0x000000000000400)) & O; @@ -642,8 +643,8 @@ static unsigned long long flip_E3(const unsigned long long P, const unsigned lon outflank_h = OUTFLANK_4[(O >> 17) & 0x3f] & rotl8(P >> 16, 2); flipped |= (unsigned char) FLIPPED_4_H[outflank_h] << 16; - outflank_d = ((P & 0x0001824428100000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0000024428100000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 16) & 0x02442810) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0001824428100000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... 
flipped |= FLIPPED_4_H[outflank_d] & 0x0000024428100000; // A7E3H6 flipped |= (((P << 7) & 0x0000000000002000) | ((P << 8) & 0x000000000001000) | ((P << 9) & 0x000000000000800)) & O; @@ -819,7 +820,6 @@ static unsigned long long flip_C4(const unsigned long long P, const unsigned lon uint32x4_t PH = vsetq_lane_u32(vgetq_lane_u32(PP, 1), PP, 2); // HHHL uint32x4_t OH = vsetq_lane_u32(vgetq_lane_u32(OO, 1), OO, 2); uint32x4_t outflankL, outflankH, flippedL4, flippedH; - uint32x2_t flippedL2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x03000000, 0x00040404, 0x00020100, 0x00081020 }; @@ -830,13 +830,12 @@ static unsigned long long flip_C4(const unsigned long long P, const unsigned lon outflankL = vshlq_u32(msb, vnegq_s32(vreinterpretq_s32_u32(vclzq_u32(vbicq_u32(maskL, vtrnq_u32(OO, OO).val[0]))))); outflankL = vandq_u32(outflankL, vtrnq_u32(PP, PP).val[0]); flippedL4 = vandq_u32(maskL, vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(vaddq_u32(outflankL, outflankL))))); - flippedL2 = vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)); outflankH = vbicq_u32(maskH, OH); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), PH); flippedH = vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedLH = vtrn_u32(flippedL2, vget_high_u32(flippedH)); + flippedLH = vtrn_u32(vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)), vget_high_u32(flippedH)); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedH))); return vget_lane_u64(flipped, 0); @@ -856,7 +855,6 @@ static unsigned long long flip_D4(const unsigned long long P, const unsigned lon uint32x4_t PH = vsetq_lane_u32(vgetq_lane_u32(PP, 1), PP, 2); // HHHL uint32x4_t OH = vsetq_lane_u32(vgetq_lane_u32(OO, 1), OO, 2); uint32x4_t outflankL, outflankH, flippedL4, flippedH; - uint32x2_t flippedL2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x07000000, 0x00080808, 0x00040201, 
0x00102040 }; @@ -867,13 +865,12 @@ static unsigned long long flip_D4(const unsigned long long P, const unsigned lon outflankL = vshlq_u32(msb, vnegq_s32(vreinterpretq_s32_u32(vclzq_u32(vbicq_u32(maskL, vtrnq_u32(OO, OO).val[0]))))); outflankL = vandq_u32(outflankL, vtrnq_u32(PP, PP).val[0]); flippedL4 = vandq_u32(maskL, vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(vaddq_u32(outflankL, outflankL))))); - flippedL2 = vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)); outflankH = vbicq_u32(maskH, OH); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), PH); flippedH = vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedLH = vtrn_u32(flippedL2, vget_high_u32(flippedH)); + flippedLH = vtrn_u32(vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)), vget_high_u32(flippedH)); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedH))); return vget_lane_u64(flipped, 0); @@ -893,7 +890,6 @@ static unsigned long long flip_E4(const unsigned long long P, const unsigned lon uint32x4_t PH = vsetq_lane_u32(vgetq_lane_u32(PP, 1), PP, 2); // HHHL uint32x4_t OH = vsetq_lane_u32(vgetq_lane_u32(OO, 1), OO, 2); uint32x4_t outflankL, outflankH, flippedL4, flippedH; - uint32x2_t flippedL2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x0f000000, 0x00101010, 0x00080402, 0x00204080 }; @@ -904,13 +900,12 @@ static unsigned long long flip_E4(const unsigned long long P, const unsigned lon outflankL = vshlq_u32(msb, vnegq_s32(vreinterpretq_s32_u32(vclzq_u32(vbicq_u32(maskL, vtrnq_u32(OO, OO).val[0]))))); outflankL = vandq_u32(outflankL, vtrnq_u32(PP, PP).val[0]); flippedL4 = vandq_u32(maskL, vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(vaddq_u32(outflankL, outflankL))))); - flippedL2 = vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)); outflankH = vbicq_u32(maskH, OH); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), PH); flippedH = 
vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedLH = vtrn_u32(flippedL2, vget_high_u32(flippedH)); + flippedLH = vtrn_u32(vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)), vget_high_u32(flippedH)); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedH))); return vget_lane_u64(flipped, 0); @@ -930,7 +925,6 @@ static unsigned long long flip_F4(const unsigned long long P, const unsigned lon uint32x4_t PH = vsetq_lane_u32(vgetq_lane_u32(PP, 1), PP, 2); // HHHL uint32x4_t OH = vsetq_lane_u32(vgetq_lane_u32(OO, 1), OO, 2); uint32x4_t outflankL, outflankH, flippedL4, flippedH; - uint32x2_t flippedL2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x1f000000, 0x00202020, 0x00100804, 0x00408000 }; @@ -941,13 +935,12 @@ static unsigned long long flip_F4(const unsigned long long P, const unsigned lon outflankL = vshlq_u32(msb, vnegq_s32(vreinterpretq_s32_u32(vclzq_u32(vbicq_u32(maskL, vtrnq_u32(OO, OO).val[0]))))); outflankL = vandq_u32(outflankL, vtrnq_u32(PP, PP).val[0]); flippedL4 = vandq_u32(maskL, vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(vaddq_u32(outflankL, outflankL))))); - flippedL2 = vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)); outflankH = vbicq_u32(maskH, OH); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), PH); flippedH = vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedLH = vtrn_u32(flippedL2, vget_high_u32(flippedH)); + flippedLH = vtrn_u32(vorr_u32(vget_low_u32(flippedL4), vget_high_u32(flippedL4)), vget_high_u32(flippedH)); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedH))); return vget_lane_u64(flipped, 0); @@ -1097,7 +1090,6 @@ static unsigned long long flip_C5(const unsigned long long P, const unsigned lon uint32x4_t PL = vsetq_lane_u32(vgetq_lane_u32(PP, 0), PP, 3); // LLHL uint32x4_t OL = vsetq_lane_u32(vgetq_lane_u32(OO, 0), OO, 3); uint32x4_t 
outflankL, outflankH, flippedH4, flippedL; - uint32x2_t flippedH2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x04040404, 0x00000003, 0x02010000, 0x08102040 }; @@ -1111,9 +1103,8 @@ static unsigned long long flip_C5(const unsigned long long P, const unsigned lon outflankH = vbicq_u32(maskH, vtrnq_u32(OO, OO).val[1]); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), vtrnq_u32(PP, PP).val[1]); flippedH4 = vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedH2 = vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4)); - flippedLH = vtrn_u32(vget_high_u32(flippedL), flippedH2); + flippedLH = vtrn_u32(vget_high_u32(flippedL), vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4))); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedL))); return vget_lane_u64(flipped, 0); @@ -1133,7 +1124,6 @@ static unsigned long long flip_D5(const unsigned long long P, const unsigned lon uint32x4_t PL = vsetq_lane_u32(vgetq_lane_u32(PP, 0), PP, 3); // LLHL uint32x4_t OL = vsetq_lane_u32(vgetq_lane_u32(OO, 0), OO, 3); uint32x4_t outflankL, outflankH, flippedH4, flippedL; - uint32x2_t flippedH2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x08080808, 0x00000007, 0x04020100, 0x10204080 }; @@ -1147,9 +1137,8 @@ static unsigned long long flip_D5(const unsigned long long P, const unsigned lon outflankH = vbicq_u32(maskH, vtrnq_u32(OO, OO).val[1]); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), vtrnq_u32(PP, PP).val[1]); flippedH4 = vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedH2 = vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4)); - flippedLH = vtrn_u32(vget_high_u32(flippedL), flippedH2); + flippedLH = vtrn_u32(vget_high_u32(flippedL), vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4))); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedL))); 
return vget_lane_u64(flipped, 0); @@ -1169,7 +1158,6 @@ static unsigned long long flip_E5(const unsigned long long P, const unsigned lon uint32x4_t PL = vsetq_lane_u32(vgetq_lane_u32(PP, 0), PP, 3); // LLHL uint32x4_t OL = vsetq_lane_u32(vgetq_lane_u32(OO, 0), OO, 3); uint32x4_t outflankL, outflankH, flippedH4, flippedL; - uint32x2_t flippedH2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x10101010, 0x0000000f, 0x08040201, 0x20408000 }; @@ -1183,9 +1171,8 @@ static unsigned long long flip_E5(const unsigned long long P, const unsigned lon outflankH = vbicq_u32(maskH, vtrnq_u32(OO, OO).val[1]); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), vtrnq_u32(PP, PP).val[1]); flippedH4 = vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedH2 = vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4)); - flippedLH = vtrn_u32(vget_high_u32(flippedL), flippedH2); + flippedLH = vtrn_u32(vget_high_u32(flippedL), vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4))); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedL))); return vget_lane_u64(flipped, 0); @@ -1205,7 +1192,6 @@ static unsigned long long flip_F5(const unsigned long long P, const unsigned lon uint32x4_t PL = vsetq_lane_u32(vgetq_lane_u32(PP, 0), PP, 3); // LLHL uint32x4_t OL = vsetq_lane_u32(vgetq_lane_u32(OO, 0), OO, 3); uint32x4_t outflankL, outflankH, flippedH4, flippedL; - uint32x2_t flippedH2; uint32x2x2_t flippedLH; uint64x1_t flipped; const uint32x4_t maskL = { 0x20202020, 0x0000001f, 0x10080402, 0x40800000 }; @@ -1219,9 +1205,8 @@ static unsigned long long flip_F5(const unsigned long long P, const unsigned lon outflankH = vbicq_u32(maskH, vtrnq_u32(OO, OO).val[1]); outflankH = vandq_u32(vbicq_u32(outflankH, vsubq_u32(outflankH, one)), vtrnq_u32(PP, PP).val[1]); flippedH4 = vandq_u32(maskH, vqsubq_u32(outflankH, one)); - flippedH2 = vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4)); - 
flippedLH = vtrn_u32(vget_high_u32(flippedL), flippedH2); + flippedLH = vtrn_u32(vget_high_u32(flippedL), vorr_u32(vget_low_u32(flippedH4), vget_high_u32(flippedH4))); flipped = vreinterpret_u64_u32(vorr_u32(vorr_u32(flippedLH.val[0], flippedLH.val[1]), vget_low_u32(flippedL))); return vget_lane_u64(flipped, 0); @@ -1380,8 +1365,8 @@ static unsigned long long flip_D6(const unsigned long long P, const unsigned lon outflank_h = OUTFLANK_3[(O >> 41) & 0x3f] & rotl8(P >> 40, 3); flipped |= (unsigned long long)(unsigned char) FLIPPED_3_H[outflank_h] << 40; - outflank_d = ((P & 0x0000081422418000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0000081422400000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 16) & 0x08142240) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000081422418000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0000081422400000; // A3D6H2 flipped |= (((P >> 9) & 0x0010000000000000) | ((P >> 8) & 0x0008000000000000) | ((P >> 7) & 0x0004000000000000)) & O; @@ -1407,8 +1392,8 @@ static unsigned long long flip_E6(const unsigned long long P, const unsigned lon outflank_h = OUTFLANK_4[(O >> 41) & 0x3f] & rotl8(P >> 40, 2); flipped |= (unsigned long long)(unsigned char) FLIPPED_4_H[outflank_h] << 40; - outflank_d = ((P & 0x0000102844820100) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0000102844020000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 16) & 0x10284402) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0000102844820100) * 0x0101010101010101) >> 54; // hgfed[cbahg]... 
flipped |= FLIPPED_4_H[outflank_d] & 0x0000102844020000; // A2E6H3 flipped |= (((P >> 9) & 0x0020000000000000) | ((P >> 8) & 0x0010000000000000) | ((P >> 7) & 0x0008000000000000)) & O; @@ -1584,8 +1569,8 @@ static unsigned long long flip_D7(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0000080808080808) & P; flipped = (outflank_v * -2) & 0x0000080808080808; - outflank_d = ((P & 0x0008142241800000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... - outflank_d = OUTFLANK_3[((O & 0x0008142240000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 24) & 0x08142240) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0008142241800000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]... flipped |= FLIPPED_3_H[outflank_d] & 0x0008142240000000; // A4D7H3 outflank_h = OUTFLANK_3[(O >> 49) & 0x3f] & rotl8(P >> 48, 3); @@ -1609,8 +1594,8 @@ static unsigned long long flip_E7(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0000101010101010) & P; flipped = (outflank_v * -2) & 0x0000101010101010; - outflank_d = ((P & 0x0010284482010000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... - outflank_d = OUTFLANK_4[((O & 0x0010284402000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 24) & 0x10284402) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0010284482010000) * 0x0101010101010101) >> 54; // hgfed[cbahg]... flipped |= FLIPPED_4_H[outflank_d] & 0x0010284402000000; // A3E7H4 outflank_h = OUTFLANK_4[(O >> 49) & 0x3f] & rotl8(P >> 48, 2); @@ -1779,8 +1764,8 @@ static unsigned long long flip_D8(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0008080808080808) & P; flipped = (outflank_v * -2) & 0x0008080808080808; - outflank_d = ((P & 0x0814224180000000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]e0cba... 
- outflank_d = OUTFLANK_3[((O & 0x0814224000000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_3[(((unsigned int) (O >> 32) & 0x08142240) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x0814224180000000) * 0x0101010101010101) >> 53; // hgfedc[bahgf]e0cba... flipped |= FLIPPED_3_H[outflank_d] & 0x0814224000000000; // A5D8H4 outflank_h = OUTFLANK_3[(O >> 57) & 0x3f] & rotl8(P >> 56, 3); @@ -1804,8 +1789,8 @@ static unsigned long long flip_E8(const unsigned long long P, const unsigned lon outflank_v = outflank_right(O, 0x0010101010101010) & P; flipped = (outflank_v * -2) & 0x0010101010101010; - outflank_d = ((P & 0x1028448201000000) * 0x0101010101010101) >> 54; // hgfed[cbahg]f0dcba... - outflank_d = OUTFLANK_4[((O & 0x1028440200000000) * 0x0101010101010101) >> 57] & outflank_d; + outflank_d = OUTFLANK_4[(((unsigned int) (O >> 32) & 0x10284402) * 0x01010101) >> 25]; + outflank_d &= ((P & 0x1028448201000000) * 0x0101010101010101) >> 54; // hgfed[cbahg]f0dcba... flipped |= FLIPPED_4_H[outflank_d] & 0x1028440200000000; // A4E8H5 outflank_h = OUTFLANK_4[(O >> 57) & 0x3f] & rotl8(P >> 56, 2); @@ -1902,9 +1887,8 @@ static unsigned long long flip_pass(const unsigned long long P, const unsigned l return 0; } - /** Array of functions to compute flipped discs */ -unsigned long long (*flip[])(const unsigned long long, const unsigned long long) = { +unsigned long long (*flip_neon[])(const unsigned long long, const unsigned long long) = { flip_A1, flip_B1, flip_C1, flip_D1, flip_E1, flip_F1, flip_G1, flip_H1, flip_A2, flip_B2, flip_C2, flip_D2, flip_E2, flip_F2, flip_G2, flip_H2, flip_A3, flip_B3, flip_C3, flip_D3, flip_E3, flip_F3, flip_G3, flip_H3, @@ -1916,3 +1900,4 @@ unsigned long long (*flip[])(const unsigned long long, const unsigned long long) flip_pass, flip_pass }; +#endif diff --git a/src/flip_neon_lzcnt.c b/src/flip_neon_lzcnt.c index f303754..96fe54d 100644 --- a/src/flip_neon_lzcnt.c +++ b/src/flip_neon_lzcnt.c @@ -92,19 +92,19 @@ static const 
uint64x2_t lrmask_v4[66][4] = { * @return flipped disc pattern. */ -#ifndef __aarch64__ +#ifndef HAS_CPU_64 #define vceqzq_u32(x) vmvnq_u32(vtstq_u32((x), (x))) #define vnegq_s64(x) vsubq_s64(vdupq_n_s64(0), (x)) #endif -unsigned long long Flip(int pos, unsigned long long P, unsigned long long O) +uint64x2_t mm_Flip(uint64x2_t OP, int pos) { uint64x2_t flip, oflank0, mask0; uint64x2_t oflank1, mask1; int32x4_t clz0; int32x4_t clz1; uint32x4_t msb0; uint32x4_t msb1; const uint64x2_t one = vdupq_n_u64(1); - uint64x2_t PP = vdupq_n_u64(P); - uint64x2_t OO = vdupq_n_u64(O); + uint64x2_t PP = vdupq_lane_u64(vget_low_u64(OP), 0); + uint64x2_t OO = vdupq_lane_u64(vget_high_u64(OP), 0); mask0 = lrmask_v4[pos][2]; mask1 = lrmask_v4[pos][3]; // isolate non-opponent MS1B @@ -129,5 +129,5 @@ unsigned long long Flip(int pos, unsigned long long P, unsigned long long O) oflank0 = vqsubq_u64(oflank0, one); oflank1 = vqsubq_u64(oflank1, one); flip = vbslq_u64(mask1, oflank1, vbslq_u64(mask0, oflank0, flip)); - return vget_lane_u64(vorr_u64(vget_low_u64(flip), vget_high_u64(flip)), 0); + return vorrq_u64(flip, vextq_u64(flip, flip, 1)); } diff --git a/src/flip_neon_ppfill.c b/src/flip_neon_ppfill.c index 534d9cb..cd74513 100644 --- a/src/flip_neon_ppfill.c +++ b/src/flip_neon_ppfill.c @@ -15,142 +15,73 @@ #include "arm_neon.h" -static const uint64x2_t lmask_v4[66][2] = { - {{ 0x00000000000000fe, 0x0101010101010100 }, { 0x8040201008040200, 0x0000000000000000 }}, - {{ 0x00000000000000fc, 0x0202020202020200 }, { 0x0080402010080400, 0x0000000000000100 }}, - {{ 0x00000000000000f8, 0x0404040404040400 }, { 0x0000804020100800, 0x0000000000010200 }}, - {{ 0x00000000000000f0, 0x0808080808080800 }, { 0x0000008040201000, 0x0000000001020400 }}, - {{ 0x00000000000000e0, 0x1010101010101000 }, { 0x0000000080402000, 0x0000000102040800 }}, - {{ 0x00000000000000c0, 0x2020202020202000 }, { 0x0000000000804000, 0x0000010204081000 }}, - {{ 0x0000000000000080, 0x4040404040404000 }, { 0x0000000000008000, 
0x0001020408102000 }}, - {{ 0x0000000000000000, 0x8080808080808000 }, { 0x0000000000000000, 0x0102040810204000 }}, - {{ 0x000000000000fe00, 0x0101010101010000 }, { 0x4020100804020000, 0x0000000000000000 }}, - {{ 0x000000000000fc00, 0x0202020202020000 }, { 0x8040201008040000, 0x0000000000010000 }}, - {{ 0x000000000000f800, 0x0404040404040000 }, { 0x0080402010080000, 0x0000000001020000 }}, - {{ 0x000000000000f000, 0x0808080808080000 }, { 0x0000804020100000, 0x0000000102040000 }}, - {{ 0x000000000000e000, 0x1010101010100000 }, { 0x0000008040200000, 0x0000010204080000 }}, - {{ 0x000000000000c000, 0x2020202020200000 }, { 0x0000000080400000, 0x0001020408100000 }}, - {{ 0x0000000000008000, 0x4040404040400000 }, { 0x0000000000800000, 0x0102040810200000 }}, - {{ 0x0000000000000000, 0x8080808080800000 }, { 0x0000000000000000, 0x0204081020400000 }}, - {{ 0x0000000000fe0000, 0x0101010101000000 }, { 0x2010080402000000, 0x0000000000000000 }}, - {{ 0x0000000000fc0000, 0x0202020202000000 }, { 0x4020100804000000, 0x0000000001000000 }}, - {{ 0x0000000000f80000, 0x0404040404000000 }, { 0x8040201008000000, 0x0000000102000000 }}, - {{ 0x0000000000f00000, 0x0808080808000000 }, { 0x0080402010000000, 0x0000010204000000 }}, - {{ 0x0000000000e00000, 0x1010101010000000 }, { 0x0000804020000000, 0x0001020408000000 }}, - {{ 0x0000000000c00000, 0x2020202020000000 }, { 0x0000008040000000, 0x0102040810000000 }}, - {{ 0x0000000000800000, 0x4040404040000000 }, { 0x0000000080000000, 0x0204081020000000 }}, - {{ 0x0000000000000000, 0x8080808080000000 }, { 0x0000000000000000, 0x0408102040000000 }}, - {{ 0x00000000fe000000, 0x0101010100000000 }, { 0x1008040200000000, 0x0000000000000000 }}, - {{ 0x00000000fc000000, 0x0202020200000000 }, { 0x2010080400000000, 0x0000000100000000 }}, - {{ 0x00000000f8000000, 0x0404040400000000 }, { 0x4020100800000000, 0x0000010200000000 }}, - {{ 0x00000000f0000000, 0x0808080800000000 }, { 0x8040201000000000, 0x0001020400000000 }}, - {{ 0x00000000e0000000, 0x1010101000000000 
}, { 0x0080402000000000, 0x0102040800000000 }}, - {{ 0x00000000c0000000, 0x2020202000000000 }, { 0x0000804000000000, 0x0204081000000000 }}, - {{ 0x0000000080000000, 0x4040404000000000 }, { 0x0000008000000000, 0x0408102000000000 }}, - {{ 0x0000000000000000, 0x8080808000000000 }, { 0x0000000000000000, 0x0810204000000000 }}, - {{ 0x000000fe00000000, 0x0101010000000000 }, { 0x0804020000000000, 0x0000000000000000 }}, - {{ 0x000000fc00000000, 0x0202020000000000 }, { 0x1008040000000000, 0x0000010000000000 }}, - {{ 0x000000f800000000, 0x0404040000000000 }, { 0x2010080000000000, 0x0001020000000000 }}, - {{ 0x000000f000000000, 0x0808080000000000 }, { 0x4020100000000000, 0x0102040000000000 }}, - {{ 0x000000e000000000, 0x1010100000000000 }, { 0x8040200000000000, 0x0204080000000000 }}, - {{ 0x000000c000000000, 0x2020200000000000 }, { 0x0080400000000000, 0x0408100000000000 }}, - {{ 0x0000008000000000, 0x4040400000000000 }, { 0x0000800000000000, 0x0810200000000000 }}, - {{ 0x0000000000000000, 0x8080800000000000 }, { 0x0000000000000000, 0x1020400000000000 }}, - {{ 0x0000fe0000000000, 0x0101000000000000 }, { 0x0402000000000000, 0x0000000000000000 }}, - {{ 0x0000fc0000000000, 0x0202000000000000 }, { 0x0804000000000000, 0x0001000000000000 }}, - {{ 0x0000f80000000000, 0x0404000000000000 }, { 0x1008000000000000, 0x0102000000000000 }}, - {{ 0x0000f00000000000, 0x0808000000000000 }, { 0x2010000000000000, 0x0204000000000000 }}, - {{ 0x0000e00000000000, 0x1010000000000000 }, { 0x4020000000000000, 0x0408000000000000 }}, - {{ 0x0000c00000000000, 0x2020000000000000 }, { 0x8040000000000000, 0x0810000000000000 }}, - {{ 0x0000800000000000, 0x4040000000000000 }, { 0x0080000000000000, 0x1020000000000000 }}, - {{ 0x0000000000000000, 0x8080000000000000 }, { 0x0000000000000000, 0x2040000000000000 }}, - {{ 0x00fe000000000000, 0x0100000000000000 }, { 0x0200000000000000, 0x0000000000000000 }}, - {{ 0x00fc000000000000, 0x0200000000000000 }, { 0x0400000000000000, 0x0100000000000000 }}, - {{ 
0x00f8000000000000, 0x0400000000000000 }, { 0x0800000000000000, 0x0200000000000000 }}, - {{ 0x00f0000000000000, 0x0800000000000000 }, { 0x1000000000000000, 0x0400000000000000 }}, - {{ 0x00e0000000000000, 0x1000000000000000 }, { 0x2000000000000000, 0x0800000000000000 }}, - {{ 0x00c0000000000000, 0x2000000000000000 }, { 0x4000000000000000, 0x1000000000000000 }}, - {{ 0x0080000000000000, 0x4000000000000000 }, { 0x8000000000000000, 0x2000000000000000 }}, - {{ 0x0000000000000000, 0x8000000000000000 }, { 0x0000000000000000, 0x4000000000000000 }}, - {{ 0xfe00000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0xfc00000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0xf800000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0xf000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0xe000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0xc000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x8000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, // pass - {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }} -}; - -static const uint64x2_t rmask_v4[66][2] = { - {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x0000000000000001, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x0000000000000003, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x0000000000000007, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x000000000000000f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 
0x000000000000001f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x000000000000003f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x000000000000007f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000000000001 }, { 0x0000000000000000, 0x0000000000000002 }}, - {{ 0x0000000000000100, 0x0000000000000002 }, { 0x0000000000000001, 0x0000000000000004 }}, - {{ 0x0000000000000300, 0x0000000000000004 }, { 0x0000000000000002, 0x0000000000000008 }}, - {{ 0x0000000000000700, 0x0000000000000008 }, { 0x0000000000000004, 0x0000000000000010 }}, - {{ 0x0000000000000f00, 0x0000000000000010 }, { 0x0000000000000008, 0x0000000000000020 }}, - {{ 0x0000000000001f00, 0x0000000000000020 }, { 0x0000000000000010, 0x0000000000000040 }}, - {{ 0x0000000000003f00, 0x0000000000000040 }, { 0x0000000000000020, 0x0000000000000080 }}, - {{ 0x0000000000007f00, 0x0000000000000080 }, { 0x0000000000000040, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000000000101 }, { 0x0000000000000000, 0x0000000000000204 }}, - {{ 0x0000000000010000, 0x0000000000000202 }, { 0x0000000000000100, 0x0000000000000408 }}, - {{ 0x0000000000030000, 0x0000000000000404 }, { 0x0000000000000201, 0x0000000000000810 }}, - {{ 0x0000000000070000, 0x0000000000000808 }, { 0x0000000000000402, 0x0000000000001020 }}, - {{ 0x00000000000f0000, 0x0000000000001010 }, { 0x0000000000000804, 0x0000000000002040 }}, - {{ 0x00000000001f0000, 0x0000000000002020 }, { 0x0000000000001008, 0x0000000000004080 }}, - {{ 0x00000000003f0000, 0x0000000000004040 }, { 0x0000000000002010, 0x0000000000008000 }}, - {{ 0x00000000007f0000, 0x0000000000008080 }, { 0x0000000000004020, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000000010101 }, { 0x0000000000000000, 0x0000000000020408 }}, - {{ 0x0000000001000000, 0x0000000000020202 }, { 0x0000000000010000, 0x0000000000040810 }}, - {{ 0x0000000003000000, 0x0000000000040404 }, { 0x0000000000020100, 
0x0000000000081020 }}, - {{ 0x0000000007000000, 0x0000000000080808 }, { 0x0000000000040201, 0x0000000000102040 }}, - {{ 0x000000000f000000, 0x0000000000101010 }, { 0x0000000000080402, 0x0000000000204080 }}, - {{ 0x000000001f000000, 0x0000000000202020 }, { 0x0000000000100804, 0x0000000000408000 }}, - {{ 0x000000003f000000, 0x0000000000404040 }, { 0x0000000000201008, 0x0000000000800000 }}, - {{ 0x000000007f000000, 0x0000000000808080 }, { 0x0000000000402010, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000001010101 }, { 0x0000000000000000, 0x0000000002040810 }}, - {{ 0x0000000100000000, 0x0000000002020202 }, { 0x0000000001000000, 0x0000000004081020 }}, - {{ 0x0000000300000000, 0x0000000004040404 }, { 0x0000000002010000, 0x0000000008102040 }}, - {{ 0x0000000700000000, 0x0000000008080808 }, { 0x0000000004020100, 0x0000000010204080 }}, - {{ 0x0000000f00000000, 0x0000000010101010 }, { 0x0000000008040201, 0x0000000020408000 }}, - {{ 0x0000001f00000000, 0x0000000020202020 }, { 0x0000000010080402, 0x0000000040800000 }}, - {{ 0x0000003f00000000, 0x0000000040404040 }, { 0x0000000020100804, 0x0000000080000000 }}, - {{ 0x0000007f00000000, 0x0000000080808080 }, { 0x0000000040201008, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000101010101 }, { 0x0000000000000000, 0x0000000204081020 }}, - {{ 0x0000010000000000, 0x0000000202020202 }, { 0x0000000100000000, 0x0000000408102040 }}, - {{ 0x0000030000000000, 0x0000000404040404 }, { 0x0000000201000000, 0x0000000810204080 }}, - {{ 0x0000070000000000, 0x0000000808080808 }, { 0x0000000402010000, 0x0000001020408000 }}, - {{ 0x00000f0000000000, 0x0000001010101010 }, { 0x0000000804020100, 0x0000002040800000 }}, - {{ 0x00001f0000000000, 0x0000002020202020 }, { 0x0000001008040201, 0x0000004080000000 }}, - {{ 0x00003f0000000000, 0x0000004040404040 }, { 0x0000002010080402, 0x0000008000000000 }}, - {{ 0x00007f0000000000, 0x0000008080808080 }, { 0x0000004020100804, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000010101010101 
}, { 0x0000000000000000, 0x0000020408102040 }}, - {{ 0x0001000000000000, 0x0000020202020202 }, { 0x0000010000000000, 0x0000040810204080 }}, - {{ 0x0003000000000000, 0x0000040404040404 }, { 0x0000020100000000, 0x0000081020408000 }}, - {{ 0x0007000000000000, 0x0000080808080808 }, { 0x0000040201000000, 0x0000102040800000 }}, - {{ 0x000f000000000000, 0x0000101010101010 }, { 0x0000080402010000, 0x0000204080000000 }}, - {{ 0x001f000000000000, 0x0000202020202020 }, { 0x0000100804020100, 0x0000408000000000 }}, - {{ 0x003f000000000000, 0x0000404040404040 }, { 0x0000201008040201, 0x0000800000000000 }}, - {{ 0x007f000000000000, 0x0000808080808080 }, { 0x0000402010080402, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0001010101010101 }, { 0x0000000000000000, 0x0002040810204080 }}, - {{ 0x0100000000000000, 0x0002020202020202 }, { 0x0001000000000000, 0x0004081020408000 }}, - {{ 0x0300000000000000, 0x0004040404040404 }, { 0x0002010000000000, 0x0008102040800000 }}, - {{ 0x0700000000000000, 0x0008080808080808 }, { 0x0004020100000000, 0x0010204080000000 }}, - {{ 0x0f00000000000000, 0x0010101010101010 }, { 0x0008040201000000, 0x0020408000000000 }}, - {{ 0x1f00000000000000, 0x0020202020202020 }, { 0x0010080402010000, 0x0040800000000000 }}, - {{ 0x3f00000000000000, 0x0040404040404040 }, { 0x0020100804020100, 0x0080000000000000 }}, - {{ 0x7f00000000000000, 0x0080808080808080 }, { 0x0040201008040201, 0x0000000000000000 }}, - {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, // pass - {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }} +static const uint64x2_t lrmask_v4[66][4] = { + {{ 0x00000000000000fe, 0x0101010101010100 }, { 0x8040201008040200, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000fc, 0x0202020202020200 }, { 0x0080402010080400, 0x0000000000000100 }, { 0x0000000000000001, 0x0000000000000000 }, { 
0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000f8, 0x0404040404040400 }, { 0x0000804020100800, 0x0000000000010200 }, { 0x0000000000000003, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000f0, 0x0808080808080800 }, { 0x0000008040201000, 0x0000000001020400 }, { 0x0000000000000007, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000e0, 0x1010101010101000 }, { 0x0000000080402000, 0x0000000102040800 }, { 0x000000000000000f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000c0, 0x2020202020202000 }, { 0x0000000000804000, 0x0000010204081000 }, { 0x000000000000001f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x0000000000000080, 0x4040404040404000 }, { 0x0000000000008000, 0x0001020408102000 }, { 0x000000000000003f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x0000000000000000, 0x8080808080808000 }, { 0x0000000000000000, 0x0102040810204000 }, { 0x000000000000007f, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x000000000000fe00, 0x0101010101010000 }, { 0x4020100804020000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000001 }, { 0x0000000000000000, 0x0000000000000002 }}, + {{ 0x000000000000fc00, 0x0202020202020000 }, { 0x8040201008040000, 0x0000000000010000 }, { 0x0000000000000100, 0x0000000000000002 }, { 0x0000000000000001, 0x0000000000000004 }}, + {{ 0x000000000000f800, 0x0404040404040000 }, { 0x0080402010080000, 0x0000000001020000 }, { 0x0000000000000300, 0x0000000000000004 }, { 0x0000000000000002, 0x0000000000000008 }}, + {{ 0x000000000000f000, 0x0808080808080000 }, { 0x0000804020100000, 0x0000000102040000 }, { 0x0000000000000700, 0x0000000000000008 }, { 0x0000000000000004, 0x0000000000000010 }}, + {{ 0x000000000000e000, 0x1010101010100000 }, { 0x0000008040200000, 0x0000010204080000 }, { 0x0000000000000f00, 0x0000000000000010 }, { 0x0000000000000008, 
0x0000000000000020 }}, + {{ 0x000000000000c000, 0x2020202020200000 }, { 0x0000000080400000, 0x0001020408100000 }, { 0x0000000000001f00, 0x0000000000000020 }, { 0x0000000000000010, 0x0000000000000040 }}, + {{ 0x0000000000008000, 0x4040404040400000 }, { 0x0000000000800000, 0x0102040810200000 }, { 0x0000000000003f00, 0x0000000000000040 }, { 0x0000000000000020, 0x0000000000000080 }}, + {{ 0x0000000000000000, 0x8080808080800000 }, { 0x0000000000000000, 0x0204081020400000 }, { 0x0000000000007f00, 0x0000000000000080 }, { 0x0000000000000040, 0x0000000000000000 }}, + {{ 0x0000000000fe0000, 0x0101010101000000 }, { 0x2010080402000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000101 }, { 0x0000000000000000, 0x0000000000000204 }}, + {{ 0x0000000000fc0000, 0x0202020202000000 }, { 0x4020100804000000, 0x0000000001000000 }, { 0x0000000000010000, 0x0000000000000202 }, { 0x0000000000000100, 0x0000000000000408 }}, + {{ 0x0000000000f80000, 0x0404040404000000 }, { 0x8040201008000000, 0x0000000102000000 }, { 0x0000000000030000, 0x0000000000000404 }, { 0x0000000000000201, 0x0000000000000810 }}, + {{ 0x0000000000f00000, 0x0808080808000000 }, { 0x0080402010000000, 0x0000010204000000 }, { 0x0000000000070000, 0x0000000000000808 }, { 0x0000000000000402, 0x0000000000001020 }}, + {{ 0x0000000000e00000, 0x1010101010000000 }, { 0x0000804020000000, 0x0001020408000000 }, { 0x00000000000f0000, 0x0000000000001010 }, { 0x0000000000000804, 0x0000000000002040 }}, + {{ 0x0000000000c00000, 0x2020202020000000 }, { 0x0000008040000000, 0x0102040810000000 }, { 0x00000000001f0000, 0x0000000000002020 }, { 0x0000000000001008, 0x0000000000004080 }}, + {{ 0x0000000000800000, 0x4040404040000000 }, { 0x0000000080000000, 0x0204081020000000 }, { 0x00000000003f0000, 0x0000000000004040 }, { 0x0000000000002010, 0x0000000000008000 }}, + {{ 0x0000000000000000, 0x8080808080000000 }, { 0x0000000000000000, 0x0408102040000000 }, { 0x00000000007f0000, 0x0000000000008080 }, { 0x0000000000004020, 0x0000000000000000 
}}, + {{ 0x00000000fe000000, 0x0101010100000000 }, { 0x1008040200000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000010101 }, { 0x0000000000000000, 0x0000000000020408 }}, + {{ 0x00000000fc000000, 0x0202020200000000 }, { 0x2010080400000000, 0x0000000100000000 }, { 0x0000000001000000, 0x0000000000020202 }, { 0x0000000000010000, 0x0000000000040810 }}, + {{ 0x00000000f8000000, 0x0404040400000000 }, { 0x4020100800000000, 0x0000010200000000 }, { 0x0000000003000000, 0x0000000000040404 }, { 0x0000000000020100, 0x0000000000081020 }}, + {{ 0x00000000f0000000, 0x0808080800000000 }, { 0x8040201000000000, 0x0001020400000000 }, { 0x0000000007000000, 0x0000000000080808 }, { 0x0000000000040201, 0x0000000000102040 }}, + {{ 0x00000000e0000000, 0x1010101000000000 }, { 0x0080402000000000, 0x0102040800000000 }, { 0x000000000f000000, 0x0000000000101010 }, { 0x0000000000080402, 0x0000000000204080 }}, + {{ 0x00000000c0000000, 0x2020202000000000 }, { 0x0000804000000000, 0x0204081000000000 }, { 0x000000001f000000, 0x0000000000202020 }, { 0x0000000000100804, 0x0000000000408000 }}, + {{ 0x0000000080000000, 0x4040404000000000 }, { 0x0000008000000000, 0x0408102000000000 }, { 0x000000003f000000, 0x0000000000404040 }, { 0x0000000000201008, 0x0000000000800000 }}, + {{ 0x0000000000000000, 0x8080808000000000 }, { 0x0000000000000000, 0x0810204000000000 }, { 0x000000007f000000, 0x0000000000808080 }, { 0x0000000000402010, 0x0000000000000000 }}, + {{ 0x000000fe00000000, 0x0101010000000000 }, { 0x0804020000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000001010101 }, { 0x0000000000000000, 0x0000000002040810 }}, + {{ 0x000000fc00000000, 0x0202020000000000 }, { 0x1008040000000000, 0x0000010000000000 }, { 0x0000000100000000, 0x0000000002020202 }, { 0x0000000001000000, 0x0000000004081020 }}, + {{ 0x000000f800000000, 0x0404040000000000 }, { 0x2010080000000000, 0x0001020000000000 }, { 0x0000000300000000, 0x0000000004040404 }, { 0x0000000002010000, 0x0000000008102040 }}, + {{ 
0x000000f000000000, 0x0808080000000000 }, { 0x4020100000000000, 0x0102040000000000 }, { 0x0000000700000000, 0x0000000008080808 }, { 0x0000000004020100, 0x0000000010204080 }}, + {{ 0x000000e000000000, 0x1010100000000000 }, { 0x8040200000000000, 0x0204080000000000 }, { 0x0000000f00000000, 0x0000000010101010 }, { 0x0000000008040201, 0x0000000020408000 }}, + {{ 0x000000c000000000, 0x2020200000000000 }, { 0x0080400000000000, 0x0408100000000000 }, { 0x0000001f00000000, 0x0000000020202020 }, { 0x0000000010080402, 0x0000000040800000 }}, + {{ 0x0000008000000000, 0x4040400000000000 }, { 0x0000800000000000, 0x0810200000000000 }, { 0x0000003f00000000, 0x0000000040404040 }, { 0x0000000020100804, 0x0000000080000000 }}, + {{ 0x0000000000000000, 0x8080800000000000 }, { 0x0000000000000000, 0x1020400000000000 }, { 0x0000007f00000000, 0x0000000080808080 }, { 0x0000000040201008, 0x0000000000000000 }}, + {{ 0x0000fe0000000000, 0x0101000000000000 }, { 0x0402000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000101010101 }, { 0x0000000000000000, 0x0000000204081020 }}, + {{ 0x0000fc0000000000, 0x0202000000000000 }, { 0x0804000000000000, 0x0001000000000000 }, { 0x0000010000000000, 0x0000000202020202 }, { 0x0000000100000000, 0x0000000408102040 }}, + {{ 0x0000f80000000000, 0x0404000000000000 }, { 0x1008000000000000, 0x0102000000000000 }, { 0x0000030000000000, 0x0000000404040404 }, { 0x0000000201000000, 0x0000000810204080 }}, + {{ 0x0000f00000000000, 0x0808000000000000 }, { 0x2010000000000000, 0x0204000000000000 }, { 0x0000070000000000, 0x0000000808080808 }, { 0x0000000402010000, 0x0000001020408000 }}, + {{ 0x0000e00000000000, 0x1010000000000000 }, { 0x4020000000000000, 0x0408000000000000 }, { 0x00000f0000000000, 0x0000001010101010 }, { 0x0000000804020100, 0x0000002040800000 }}, + {{ 0x0000c00000000000, 0x2020000000000000 }, { 0x8040000000000000, 0x0810000000000000 }, { 0x00001f0000000000, 0x0000002020202020 }, { 0x0000001008040201, 0x0000004080000000 }}, + {{ 0x0000800000000000, 
0x4040000000000000 }, { 0x0080000000000000, 0x1020000000000000 }, { 0x00003f0000000000, 0x0000004040404040 }, { 0x0000002010080402, 0x0000008000000000 }}, + {{ 0x0000000000000000, 0x8080000000000000 }, { 0x0000000000000000, 0x2040000000000000 }, { 0x00007f0000000000, 0x0000008080808080 }, { 0x0000004020100804, 0x0000000000000000 }}, + {{ 0x00fe000000000000, 0x0100000000000000 }, { 0x0200000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000010101010101 }, { 0x0000000000000000, 0x0000020408102040 }}, + {{ 0x00fc000000000000, 0x0200000000000000 }, { 0x0400000000000000, 0x0100000000000000 }, { 0x0001000000000000, 0x0000020202020202 }, { 0x0000010000000000, 0x0000040810204080 }}, + {{ 0x00f8000000000000, 0x0400000000000000 }, { 0x0800000000000000, 0x0200000000000000 }, { 0x0003000000000000, 0x0000040404040404 }, { 0x0000020100000000, 0x0000081020408000 }}, + {{ 0x00f0000000000000, 0x0800000000000000 }, { 0x1000000000000000, 0x0400000000000000 }, { 0x0007000000000000, 0x0000080808080808 }, { 0x0000040201000000, 0x0000102040800000 }}, + {{ 0x00e0000000000000, 0x1000000000000000 }, { 0x2000000000000000, 0x0800000000000000 }, { 0x000f000000000000, 0x0000101010101010 }, { 0x0000080402010000, 0x0000204080000000 }}, + {{ 0x00c0000000000000, 0x2000000000000000 }, { 0x4000000000000000, 0x1000000000000000 }, { 0x001f000000000000, 0x0000202020202020 }, { 0x0000100804020100, 0x0000408000000000 }}, + {{ 0x0080000000000000, 0x4000000000000000 }, { 0x8000000000000000, 0x2000000000000000 }, { 0x003f000000000000, 0x0000404040404040 }, { 0x0000201008040201, 0x0000800000000000 }}, + {{ 0x0000000000000000, 0x8000000000000000 }, { 0x0000000000000000, 0x4000000000000000 }, { 0x007f000000000000, 0x0000808080808080 }, { 0x0000402010080402, 0x0000000000000000 }}, + {{ 0xfe00000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0001010101010101 }, { 0x0000000000000000, 0x0002040810204080 }}, + {{ 0xfc00000000000000, 0x0000000000000000 
}, { 0x0000000000000000, 0x0000000000000000 }, { 0x0100000000000000, 0x0002020202020202 }, { 0x0001000000000000, 0x0004081020408000 }}, + {{ 0xf800000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0300000000000000, 0x0004040404040404 }, { 0x0002010000000000, 0x0008102040800000 }}, + {{ 0xf000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0700000000000000, 0x0008080808080808 }, { 0x0004020100000000, 0x0010204080000000 }}, + {{ 0xe000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0f00000000000000, 0x0010101010101010 }, { 0x0008040201000000, 0x0020408000000000 }}, + {{ 0xc000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x1f00000000000000, 0x0020202020202020 }, { 0x0010080402010000, 0x0040800000000000 }}, + {{ 0x8000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x3f00000000000000, 0x0040404040404040 }, { 0x0020100804020100, 0x0080000000000000 }}, + {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x7f00000000000000, 0x0080808080808080 }, { 0x0040201008040201, 0x0000000000000000 }}, + {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, // pass + {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }} }; /** @@ -162,40 +93,36 @@ static const uint64x2_t rmask_v4[66][2] = { * @return flipped disc pattern. 
*/ -unsigned long long Flip(int pos, unsigned long long P, unsigned long long O) +uint64x2_t mm_Flip(uint64x2_t OP, int pos) { - uint64x2_t flip0, oflank0, eraser0, mask0; uint64x2_t flip1, oflank1, eraser1, mask1; - const int64x2_t lshift18 = { 1, 8 }; const int64x2_t lshift79 = { 7, 9 }; - int64x2_t rshift18 = { -1, -8 }; int64x2_t rshift79 = { -7, -9 }; - const uint64x2_t minusone = vdupq_n_u64(-1); - uint64x2_t PP = vdupq_n_u64(P); - uint64x2_t OO = vdupq_n_u64(O); + uint64x2_t flip, oflank0, eraser0, mask0; uint64x2_t oflank1, eraser1, mask1; + const int64x2_t lshift18 = { 1, 8 }; const int64x2_t lshift79 = { 9, 7 }; + int64x2_t rshift18 = { -1, -8 }; int64x2_t rshift79 = { -9, -7 }; + const uint64x2_t one = vdupq_n_u64(1); + uint64x2_t PP = vdupq_lane_u64(vget_low_u64(OP), 0); + uint64x2_t OO = vdupq_lane_u64(vget_high_u64(OP), 0); - mask0 = rmask_v4[pos][0]; mask1 = rmask_v4[pos][1]; + mask0 = lrmask_v4[pos][2]; mask1 = lrmask_v4[pos][3]; // isolate non-opponent MS1B by clearing lower bits eraser0 = vbicq_u64(mask0, OO); eraser1 = vbicq_u64(mask1, OO); // clear valid bits only using variable shift - oflank0 = vshlq_s64(vandq_u64(PP, mask0), lshift18); oflank1 = vshlq_s64(vandq_u64(PP, mask1), lshift79); - eraser0 = vorrq_u64(eraser0, vshlq_s64(eraser0, rshift18)); eraser1 = vorrq_u64(eraser1, vshlq_s64(eraser1, rshift79)); + oflank0 = vshlq_u64(vandq_u64(PP, mask0), lshift18); oflank1 = vshlq_u64(vandq_u64(PP, mask1), lshift79); + eraser0 = vorrq_u64(eraser0, vshlq_u64(eraser0, rshift18)); eraser1 = vorrq_u64(eraser1, vshlq_u64(eraser1, rshift79)); rshift18 = vaddq_s64(rshift18, rshift18); rshift79 = vaddq_s64(rshift79, rshift79); - eraser0 = vorrq_u64(eraser0, vshlq_s64(eraser0, rshift18)); eraser1 = vorrq_u64(eraser1, vshlq_s64(eraser1, rshift79)); - eraser0 = vorrq_u64(eraser0, vshlq_s64(eraser0, rshift18)); eraser1 = vorrq_u64(eraser1, vshlq_s64(eraser1, rshift79)); + eraser0 = vorrq_u64(eraser0, vshlq_u64(eraser0, rshift18)); eraser1 =
vorrq_u64(eraser1, vshlq_u64(eraser1, rshift79)); + eraser0 = vorrq_u64(eraser0, vshlq_u64(eraser0, rshift18)); eraser1 = vorrq_u64(eraser1, vshlq_u64(eraser1, rshift79)); oflank0 = vbicq_u64(oflank0, eraser0); oflank1 = vbicq_u64(oflank1, eraser1); // set mask bits higher than oflank - flip0 = vbicq_u64(mask0, vandq_u64(oflank0, minusone)); flip1 = vbicq_u64(mask1, vandq_u64(oflank1, minusone)); + flip = vbicq_u64(mask0, vsubq_u64(oflank0, one)); flip = vorrq_u64(flip, vbicq_u64(mask1, vsubq_u64(oflank1, one))); - mask0 = lmask_v4[pos][0]; mask1 = lmask_v4[pos][1]; - // look for non-opponent LS1B - oflank0 = vbicq_u64(mask0, OO); oflank1 = vbicq_u64(mask1, OO); - oflank0 = vbicq_u64(oflank0, vaddq_u64(oflank0, minusone)); oflank1 = vbicq_u64(oflank1, vaddq_u64(oflank1, minusone)); - oflank0 = vandq_u64(oflank0, PP); oflank1 = vandq_u64(oflank1, PP); - // set all bits lower than oflank - oflank0 = vaddq_u64(oflank0, minusone); oflank1 = vaddq_u64(oflank1, minusone); - // sign bit becomes 1 only if oflank was 0, if so add back 1 - oflank0 = vaddq_u64(oflank0, vshrq_n_u64(oflank0, 63)); oflank1 = vaddq_u64(oflank1, vshrq_n_u64(oflank1, 63)); - flip0 = vorrq_u64(flip0, vandq_u64(oflank0, mask0)); flip1 = vorrq_u64(flip1, vandq_u64(oflank1, mask1)); + mask0 = lrmask_v4[pos][0]; mask1 = lrmask_v4[pos][1]; + // get outflank with carry-propagation + oflank0 = vaddq_u64(vornq_u64(OO, mask0), one); oflank1 = vaddq_u64(vornq_u64(OO, mask1), one); + oflank0 = vandq_u64(vandq_u64(PP, mask0), oflank0); oflank1 = vandq_u64(vandq_u64(PP, mask1), oflank1); + // set all bits lower than oflank, using saturation if oflank = 0 + oflank0 = vqsubq_u64(oflank0, one); oflank1 = vqsubq_u64(oflank1, one); + flip = vbslq_u64(mask1, oflank1, vbslq_u64(mask0, oflank0, flip)); - flip0 = vorrq_u64(flip0, flip1); - return vget_lane_u64(vorr_u64(vget_low_u64(flip0), vget_high_u64(flip0)), 0); + return vorrq_u64(flip, vextq_u64(flip, flip, 1)); } diff --git a/src/flip_neon_rbit.c
b/src/flip_neon_rbit.c new file mode 100644 index 0000000..a7f075f --- /dev/null +++ b/src/flip_neon_rbit.c @@ -0,0 +1,121 @@ +/** + * @file flip_neon_rbit.c + * + * This module deals with flipping discs. + * + * For LSB to MSB directions, carry propagation can be used to determine + * contiguous opponent discs. + * For MSB to LSB directions, rev64/rbit is used to use carry propagation. + * + * @date 2020 + * @author Toshihiko Okuhara + * @version 4.4 + */ + +#include "arm_neon.h" + +static const uint64x2_t lrmask_v4[66][4] = { + {{ 0x00000000000000fe, 0x0101010101010100 }, { 0x8040201008040200, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000fc, 0x0202020202020200 }, { 0x0080402010080400, 0x0000000000000100 }, { 0x8000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000f8, 0x0404040404040400 }, { 0x0000804020100800, 0x0000000000010200 }, { 0xc000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000f0, 0x0808080808080800 }, { 0x0000008040201000, 0x0000000001020400 }, { 0xe000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000e0, 0x1010101010101000 }, { 0x0000000080402000, 0x0000000102040800 }, { 0xf000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x00000000000000c0, 0x2020202020202000 }, { 0x0000000000804000, 0x0000010204081000 }, { 0xf800000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x0000000000000080, 0x4040404040404000 }, { 0x0000000000008000, 0x0001020408102000 }, { 0xfc00000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x0000000000000000, 0x8080808080808000 }, { 0x0000000000000000, 0x0102040810204000 }, { 0xfe00000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, + {{ 0x000000000000fe00, 
0x0101010101010000 }, { 0x4020100804020000, 0x0000000000000000 }, { 0x0000000000000000, 0x8000000000000000 }, { 0x0000000000000000, 0x4000000000000000 }}, + {{ 0x000000000000fc00, 0x0202020202020000 }, { 0x8040201008040000, 0x0000000000010000 }, { 0x0080000000000000, 0x4000000000000000 }, { 0x8000000000000000, 0x2000000000000000 }}, + {{ 0x000000000000f800, 0x0404040404040000 }, { 0x0080402010080000, 0x0000000001020000 }, { 0x00c0000000000000, 0x2000000000000000 }, { 0x4000000000000000, 0x1000000000000000 }}, + {{ 0x000000000000f000, 0x0808080808080000 }, { 0x0000804020100000, 0x0000000102040000 }, { 0x00e0000000000000, 0x1000000000000000 }, { 0x2000000000000000, 0x0800000000000000 }}, + {{ 0x000000000000e000, 0x1010101010100000 }, { 0x0000008040200000, 0x0000010204080000 }, { 0x00f0000000000000, 0x0800000000000000 }, { 0x1000000000000000, 0x0400000000000000 }}, + {{ 0x000000000000c000, 0x2020202020200000 }, { 0x0000000080400000, 0x0001020408100000 }, { 0x00f8000000000000, 0x0400000000000000 }, { 0x0800000000000000, 0x0200000000000000 }}, + {{ 0x0000000000008000, 0x4040404040400000 }, { 0x0000000000800000, 0x0102040810200000 }, { 0x00fc000000000000, 0x0200000000000000 }, { 0x0400000000000000, 0x0100000000000000 }}, + {{ 0x0000000000000000, 0x8080808080800000 }, { 0x0000000000000000, 0x0204081020400000 }, { 0x00fe000000000000, 0x0100000000000000 }, { 0x0200000000000000, 0x0000000000000000 }}, + {{ 0x0000000000fe0000, 0x0101010101000000 }, { 0x2010080402000000, 0x0000000000000000 }, { 0x0000000000000000, 0x8080000000000000 }, { 0x0000000000000000, 0x2040000000000000 }}, + {{ 0x0000000000fc0000, 0x0202020202000000 }, { 0x4020100804000000, 0x0000000001000000 }, { 0x0000800000000000, 0x4040000000000000 }, { 0x0080000000000000, 0x1020000000000000 }}, + {{ 0x0000000000f80000, 0x0404040404000000 }, { 0x8040201008000000, 0x0000000102000000 }, { 0x0000c00000000000, 0x2020000000000000 }, { 0x8040000000000000, 0x0810000000000000 }}, + {{ 0x0000000000f00000, 0x0808080808000000 
}, { 0x0080402010000000, 0x0000010204000000 }, { 0x0000e00000000000, 0x1010000000000000 }, { 0x4020000000000000, 0x0408000000000000 }}, + {{ 0x0000000000e00000, 0x1010101010000000 }, { 0x0000804020000000, 0x0001020408000000 }, { 0x0000f00000000000, 0x0808000000000000 }, { 0x2010000000000000, 0x0204000000000000 }}, + {{ 0x0000000000c00000, 0x2020202020000000 }, { 0x0000008040000000, 0x0102040810000000 }, { 0x0000f80000000000, 0x0404000000000000 }, { 0x1008000000000000, 0x0102000000000000 }}, + {{ 0x0000000000800000, 0x4040404040000000 }, { 0x0000000080000000, 0x0204081020000000 }, { 0x0000fc0000000000, 0x0202000000000000 }, { 0x0804000000000000, 0x0001000000000000 }}, + {{ 0x0000000000000000, 0x8080808080000000 }, { 0x0000000000000000, 0x0408102040000000 }, { 0x0000fe0000000000, 0x0101000000000000 }, { 0x0402000000000000, 0x0000000000000000 }}, + {{ 0x00000000fe000000, 0x0101010100000000 }, { 0x1008040200000000, 0x0000000000000000 }, { 0x0000000000000000, 0x8080800000000000 }, { 0x0000000000000000, 0x1020400000000000 }}, + {{ 0x00000000fc000000, 0x0202020200000000 }, { 0x2010080400000000, 0x0000000100000000 }, { 0x0000008000000000, 0x4040400000000000 }, { 0x0000800000000000, 0x0810200000000000 }}, + {{ 0x00000000f8000000, 0x0404040400000000 }, { 0x4020100800000000, 0x0000010200000000 }, { 0x000000c000000000, 0x2020200000000000 }, { 0x0080400000000000, 0x0408100000000000 }}, + {{ 0x00000000f0000000, 0x0808080800000000 }, { 0x8040201000000000, 0x0001020400000000 }, { 0x000000e000000000, 0x1010100000000000 }, { 0x8040200000000000, 0x0204080000000000 }}, + {{ 0x00000000e0000000, 0x1010101000000000 }, { 0x0080402000000000, 0x0102040800000000 }, { 0x000000f000000000, 0x0808080000000000 }, { 0x4020100000000000, 0x0102040000000000 }}, + {{ 0x00000000c0000000, 0x2020202000000000 }, { 0x0000804000000000, 0x0204081000000000 }, { 0x000000f800000000, 0x0404040000000000 }, { 0x2010080000000000, 0x0001020000000000 }}, + {{ 0x0000000080000000, 0x4040404000000000 }, { 
0x0000008000000000, 0x0408102000000000 }, { 0x000000fc00000000, 0x0202020000000000 }, { 0x1008040000000000, 0x0000010000000000 }}, + {{ 0x0000000000000000, 0x8080808000000000 }, { 0x0000000000000000, 0x0810204000000000 }, { 0x000000fe00000000, 0x0101010000000000 }, { 0x0804020000000000, 0x0000000000000000 }}, + {{ 0x000000fe00000000, 0x0101010000000000 }, { 0x0804020000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x8080808000000000 }, { 0x0000000000000000, 0x0810204000000000 }}, + {{ 0x000000fc00000000, 0x0202020000000000 }, { 0x1008040000000000, 0x0000010000000000 }, { 0x0000000080000000, 0x4040404000000000 }, { 0x0000008000000000, 0x0408102000000000 }}, + {{ 0x000000f800000000, 0x0404040000000000 }, { 0x2010080000000000, 0x0001020000000000 }, { 0x00000000c0000000, 0x2020202000000000 }, { 0x0000804000000000, 0x0204081000000000 }}, + {{ 0x000000f000000000, 0x0808080000000000 }, { 0x4020100000000000, 0x0102040000000000 }, { 0x00000000e0000000, 0x1010101000000000 }, { 0x0080402000000000, 0x0102040800000000 }}, + {{ 0x000000e000000000, 0x1010100000000000 }, { 0x8040200000000000, 0x0204080000000000 }, { 0x00000000f0000000, 0x0808080800000000 }, { 0x8040201000000000, 0x0001020400000000 }}, + {{ 0x000000c000000000, 0x2020200000000000 }, { 0x0080400000000000, 0x0408100000000000 }, { 0x00000000f8000000, 0x0404040400000000 }, { 0x4020100800000000, 0x0000010200000000 }}, + {{ 0x0000008000000000, 0x4040400000000000 }, { 0x0000800000000000, 0x0810200000000000 }, { 0x00000000fc000000, 0x0202020200000000 }, { 0x2010080400000000, 0x0000000100000000 }}, + {{ 0x0000000000000000, 0x8080800000000000 }, { 0x0000000000000000, 0x1020400000000000 }, { 0x00000000fe000000, 0x0101010100000000 }, { 0x1008040200000000, 0x0000000000000000 }}, + {{ 0x0000fe0000000000, 0x0101000000000000 }, { 0x0402000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x8080808080000000 }, { 0x0000000000000000, 0x0408102040000000 }}, + {{ 0x0000fc0000000000, 0x0202000000000000 }, { 0x0804000000000000, 
0x0001000000000000 }, { 0x0000000000800000, 0x4040404040000000 }, { 0x0000000080000000, 0x0204081020000000 }}, + {{ 0x0000f80000000000, 0x0404000000000000 }, { 0x1008000000000000, 0x0102000000000000 }, { 0x0000000000c00000, 0x2020202020000000 }, { 0x0000008040000000, 0x0102040810000000 }}, + {{ 0x0000f00000000000, 0x0808000000000000 }, { 0x2010000000000000, 0x0204000000000000 }, { 0x0000000000e00000, 0x1010101010000000 }, { 0x0000804020000000, 0x0001020408000000 }}, + {{ 0x0000e00000000000, 0x1010000000000000 }, { 0x4020000000000000, 0x0408000000000000 }, { 0x0000000000f00000, 0x0808080808000000 }, { 0x0080402010000000, 0x0000010204000000 }}, + {{ 0x0000c00000000000, 0x2020000000000000 }, { 0x8040000000000000, 0x0810000000000000 }, { 0x0000000000f80000, 0x0404040404000000 }, { 0x8040201008000000, 0x0000000102000000 }}, + {{ 0x0000800000000000, 0x4040000000000000 }, { 0x0080000000000000, 0x1020000000000000 }, { 0x0000000000fc0000, 0x0202020202000000 }, { 0x4020100804000000, 0x0000000001000000 }}, + {{ 0x0000000000000000, 0x8080000000000000 }, { 0x0000000000000000, 0x2040000000000000 }, { 0x0000000000fe0000, 0x0101010101000000 }, { 0x2010080402000000, 0x0000000000000000 }}, + {{ 0x00fe000000000000, 0x0100000000000000 }, { 0x0200000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x8080808080800000 }, { 0x0000000000000000, 0x0204081020400000 }}, + {{ 0x00fc000000000000, 0x0200000000000000 }, { 0x0400000000000000, 0x0100000000000000 }, { 0x0000000000008000, 0x4040404040400000 }, { 0x0000000000800000, 0x0102040810200000 }}, + {{ 0x00f8000000000000, 0x0400000000000000 }, { 0x0800000000000000, 0x0200000000000000 }, { 0x000000000000c000, 0x2020202020200000 }, { 0x0000000080400000, 0x0001020408100000 }}, + {{ 0x00f0000000000000, 0x0800000000000000 }, { 0x1000000000000000, 0x0400000000000000 }, { 0x000000000000e000, 0x1010101010100000 }, { 0x0000008040200000, 0x0000010204080000 }}, + {{ 0x00e0000000000000, 0x1000000000000000 }, { 0x2000000000000000, 0x0800000000000000 
}, { 0x000000000000f000, 0x0808080808080000 }, { 0x0000804020100000, 0x0000000102040000 }}, + {{ 0x00c0000000000000, 0x2000000000000000 }, { 0x4000000000000000, 0x1000000000000000 }, { 0x000000000000f800, 0x0404040404040000 }, { 0x0080402010080000, 0x0000000001020000 }}, + {{ 0x0080000000000000, 0x4000000000000000 }, { 0x8000000000000000, 0x2000000000000000 }, { 0x000000000000fc00, 0x0202020202020000 }, { 0x8040201008040000, 0x0000000000010000 }}, + {{ 0x0000000000000000, 0x8000000000000000 }, { 0x0000000000000000, 0x4000000000000000 }, { 0x000000000000fe00, 0x0101010101010000 }, { 0x4020100804020000, 0x0000000000000000 }}, + {{ 0xfe00000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x8080808080808000 }, { 0x0000000000000000, 0x0102040810204000 }}, + {{ 0xfc00000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000080, 0x4040404040404000 }, { 0x0000000000008000, 0x0001020408102000 }}, + {{ 0xf800000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x00000000000000c0, 0x2020202020202000 }, { 0x0000000000804000, 0x0000010204081000 }}, + {{ 0xf000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x00000000000000e0, 0x1010101010101000 }, { 0x0000000080402000, 0x0000000102040800 }}, + {{ 0xe000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x00000000000000f0, 0x0808080808080800 }, { 0x0000008040201000, 0x0000000001020400 }}, + {{ 0xc000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x00000000000000f8, 0x0404040404040400 }, { 0x0000804020100800, 0x0000000000010200 }}, + {{ 0x8000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x00000000000000fc, 0x0202020202020200 }, { 0x0080402010080400, 0x0000000000000100 }}, + {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 
0x00000000000000fe, 0x0101010101010100 }, { 0x8040201008040200, 0x0000000000000000 }}, + {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }}, // pass + {{ 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }, { 0x0000000000000000, 0x0000000000000000 }} +}; + +/** + * Compute flipped discs when playing on square pos. + * + * @param OP disc patterns: player's (P) in the low 64-bit lane, + * opponent's (O) in the high 64-bit lane. + * @param pos player's move. + * @return flipped disc pattern (same value in both 64-bit lanes). + */ + +uint64x2_t mm_Flip(uint64x2_t OP, int pos) +{ + uint64x2_t flip, oflank0, mask0, oflank1, mask1; + const uint64x2_t one = vdupq_n_u64(1); + uint64x2_t rOP = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(OP)))); /* bit-reverse each 64-bit lane */ + uint64x2_t PP = vdupq_lane_u64(vget_low_u64(OP), 0); uint64x2_t rPP = vdupq_lane_u64(vget_low_u64(rOP), 0); + uint64x2_t OO = vdupq_lane_u64(vget_high_u64(OP), 0); uint64x2_t rOO = vdupq_lane_u64(vget_high_u64(rOP), 0); + + mask0 = lrmask_v4[pos][2]; mask1 = lrmask_v4[pos][3]; + // get outflank with carry-propagation (on the bit-reversed boards) + oflank0 = vaddq_u64(vornq_u64(rOO, mask0), one); oflank1 = vaddq_u64(vornq_u64(rOO, mask1), one); + oflank0 = vandq_u64(vandq_u64(rPP, mask0), oflank0); oflank1 = vandq_u64(vandq_u64(rPP, mask1), oflank1); + // set all bits lower than oflank, using saturation if oflank = 0 + oflank0 = vqsubq_u64(oflank0, one); oflank1 = vqsubq_u64(oflank1, one); + flip = vbslq_u64(mask1, oflank1, vandq_u64(mask0, oflank0)); + flip = vreinterpretq_u64_u8(vrev64q_u8(vrbitq_u8(vreinterpretq_u8_u64(flip)))); /* undo the bit reversal */ + + mask0 = lrmask_v4[pos][0]; mask1 = lrmask_v4[pos][1]; + // get outflank with carry-propagation (on the unreversed boards) + oflank0 = vaddq_u64(vornq_u64(OO, mask0), one); oflank1 = vaddq_u64(vornq_u64(OO, mask1), one); + oflank0 = vandq_u64(vandq_u64(PP, mask0), oflank0); oflank1 = vandq_u64(vandq_u64(PP, 
mask1), oflank1); + // set all bits lower than oflank, using saturation if oflank = 0 + oflank0 = vqsubq_u64(oflank0, one); oflank1 = vqsubq_u64(oflank1, one); + flip = vbslq_u64(mask1, oflank1, vbslq_u64(mask0, oflank0, flip)); + + return vorrq_u64(flip, vextq_u64(flip, flip, 1)); /* OR the two lanes together */ +} diff --git a/src/flip_sse.c b/src/flip_sse.c index 08a1260..af2cbf7 100644 --- a/src/flip_sse.c +++ b/src/flip_sse.c @@ -1403,7 +1403,7 @@ static __m128i vectorcall flip_C6(const __m128i OP) */ static __m128i vectorcall flip_D6(const __m128i OP) { - __m128i flipped, flipped_h_c7e7, index_d; + __m128i flipped, flipped_c7e7, index_d; unsigned int outflank_h, outflank_v, outflank_d, index_v; const __m128i mask = _mm_set_epi64x(0x0000081422418000, 0x0808080808080808); // A3D6H2 @@ -1417,11 +1417,11 @@ static __m128i vectorcall flip_D6(const __m128i OP) outflank_h = OUTFLANK_3[(_mm_extract_epi16(OP, 6) >> 9) & 0x3f] & rotl8(_mm_extract_epi16(OP, 2) >> 8, 3); - flipped_h_c7e7 = _mm_unpacklo_epi64(_mm_srli_epi64(OP, 7), _mm_srli_epi64(OP, 9)); - flipped_h_c7e7 = _mm_and_si128(flipped_h_c7e7, _mm_shuffle_epi32(OP, DUPHI)); - flipped_h_c7e7 = _mm_insert_epi16(flipped_h_c7e7, FLIPPED_3_H[outflank_h], 2); - flipped_h_c7e7 = _mm_and_si128(flipped_h_c7e7, _mm_set_epi64x(0x0010000000000000, 0x0004ff0000000000)); - flipped = _mm_or_si128(flipped, flipped_h_c7e7); + flipped_c7e7 = _mm_shuffle_epi32(OP, 0xf5); + flipped_c7e7 = _mm_and_si128(flipped_c7e7, _mm_set_epi32(0x00100000, 0x00040000, 0x20000000, 0x02000000)); + flipped_c7e7 = _mm_min_epi16(flipped_c7e7, _mm_shuffle_epi32(flipped_c7e7, SWAP64)); + flipped = _mm_or_si128(flipped, _mm_unpacklo_epi16( + _mm_slli_epi64(_mm_loadl_epi64((__m128i *) &FLIPPED_3_H[outflank_h]), 56), flipped_c7e7)); return _mm_or_si128(flipped, _mm_shuffle_epi32(flipped, SWAP64)); } @@ -1435,7 +1435,7 @@ static __m128i vectorcall flip_D6(const __m128i OP) */ static __m128i vectorcall flip_E6(const __m128i OP) { - __m128i flipped, flipped_h_d7f7, index_d; + __m128i 
flipped, flipped_d7f7, index_d; unsigned int outflank_h, outflank_v, outflank_d, index_v; const __m128i mask = _mm_set_epi64x(0x0000102844820100, 0x1010101010101010); // A2E6H3 @@ -1449,11 +1449,11 @@ static __m128i vectorcall flip_E6(const __m128i OP) outflank_h = OUTFLANK_4[(_mm_extract_epi16(OP, 6) >> 9) & 0x3f] & rotl8(_mm_extract_epi16(OP, 2) >> 8, 2); - flipped_h_d7f7 = _mm_unpacklo_epi64(_mm_srli_epi64(OP, 7), _mm_srli_epi64(OP, 9)); - flipped_h_d7f7 = _mm_and_si128(flipped_h_d7f7, _mm_shuffle_epi32(OP, DUPHI)); - flipped_h_d7f7 = _mm_insert_epi16(flipped_h_d7f7, FLIPPED_4_H[outflank_h], 2); - flipped_h_d7f7 = _mm_and_si128(flipped_h_d7f7, _mm_set_epi64x(0x0020000000000000, 0x0008ff0000000000)); - flipped = _mm_or_si128(flipped, flipped_h_d7f7); + flipped_d7f7 = _mm_shuffle_epi32(OP, 0xf5); + flipped_d7f7 = _mm_and_si128(flipped_d7f7, _mm_set_epi32(0x00200000, 0x00080000, 0x40000000, 0x04000000)); + flipped_d7f7 = _mm_min_epi16(flipped_d7f7, _mm_shuffle_epi32(flipped_d7f7, SWAP64)); + flipped = _mm_or_si128(flipped, _mm_unpacklo_epi16( + _mm_slli_epi64(_mm_loadl_epi64((__m128i *) &FLIPPED_4_H[outflank_h]), 56), flipped_d7f7)); return _mm_or_si128(flipped, _mm_shuffle_epi32(flipped, SWAP64)); } diff --git a/src/search.h b/src/search.h index 8fecc10..25d4277 100644 --- a/src/search.h +++ b/src/search.h @@ -219,11 +219,11 @@ int search_get_pv_cost(Search*); void show_current_move(FILE *f, Search*, const Move*, const int, const int, const bool); int search_bound(const Search*, int); -#if defined(hasSSE2) || defined(hasNeon) || defined(USE_GAS_MMX) || defined(USE_MSVC_X86) +#if defined(hasSSE2) || defined(hasNeon) || defined(USE_GAS_MMX) || defined(USE_MSVC_X86) || defined(ANDROID) #ifdef __AVX2__ #define mm_malloc(s) _mm_malloc((s), 32) #define mm_free(p) _mm_free(p) - #elif defined(hasSSE2) && !defined(__ANDROID__) + #elif defined(hasSSE2) && !defined(ANDROID) #define mm_malloc(s) _mm_malloc((s), 16) #define mm_free(p) _mm_free(p) #elif defined(_MSC_VER) diff 
--git a/src/settings.h b/src/settings.h index c472655..3bc0d16 100644 --- a/src/settings.h +++ b/src/settings.h @@ -17,14 +17,13 @@ #define MOVE_GENERATOR_CARRY 1 // 32.6Mnps #define MOVE_GENERATOR_KINDERGARTEN 2 // 31.1Mnps #define MOVE_GENERATOR_SSE 3 // 34.4Mnps // best for generic X64 -#define MOVE_GENERATOR_BITSCAN 4 // 32.7Mnps // best for AMD K10/FX +#define MOVE_GENERATOR_BITSCAN 4 // 32.7Mnps // best for AMD K10/FX // 7.21Mnps (neon_bitscan) #define MOVE_GENERATOR_ROXANE 5 // 29.0Mnps #define MOVE_GENERATOR_32 6 // 31.3Mnps // best for 32bit X86 #define MOVE_GENERATOR_SSE_BSWAP 7 // 30.6Mnps #define MOVE_GENERATOR_AVX 8 // 34.7Mnps // best for modern X64 #define MOVE_GENERATOR_AVX512 9 -#define MOVE_GENERATOR_NEON 10 // neon_lzcnt (6.51Mnps), neon_ppfill (5.55Mnps) -#define MOVE_GENERATOR_NEON_BITSCAN 11 // neon_bitscan (6.43Mnps) +#define MOVE_GENERATOR_NEON 10 // 6.71Mnps (neon_rbit), 6.51Mnps (neon_lzcnt), 6.17Mnps (neon_ppfill) #define COUNT_LAST_FLIP_CARRY 1 // 33.8Mnps #define COUNT_LAST_FLIP_KINDERGARTEN 2 // 33.5Mnps @@ -40,18 +39,20 @@ #define MOVE_GENERATOR MOVE_GENERATOR_AVX512 #elif defined(__AVX2__) #define MOVE_GENERATOR MOVE_GENERATOR_AVX - #elif defined(hasSSE2) + #elif defined(__SSE2__) || defined(_M_X64) || defined(hasSSE2) #define MOVE_GENERATOR MOVE_GENERATOR_SSE - #elif defined(__aarch64__) + #elif defined(__aarch64__) || defined(_M_ARM64) #define MOVE_GENERATOR MOVE_GENERATOR_NEON + #elif defined(__arm__) || defined(_M_ARM) + #define MOVE_GENERATOR MOVE_GENERATOR_BITSCAN #else #define MOVE_GENERATOR MOVE_GENERATOR_32 #endif #endif #ifndef LAST_FLIP_COUNTER - #ifdef hasSSE2 + #if defined(__SSE2__) || defined(_M_X64) || defined(hasSSE2) || defined(__aarch64__) || defined(_M_ARM64) #define LAST_FLIP_COUNTER COUNT_LAST_FLIP_SSE - #elif defined(__aarch64__) + #elif defined(__arm__) || defined(_M_ARM) #define LAST_FLIP_COUNTER COUNT_LAST_FLIP_BITSCAN #else #define LAST_FLIP_COUNTER COUNT_LAST_FLIP_32