From 1735f3d68abc34d74c41af31d97e3091ad9d2979 Mon Sep 17 00:00:00 2001 From: Manodeep Sinha Date: Mon, 17 Jul 2023 21:18:58 +1000 Subject: [PATCH] Fix compile on Apple M1/M2 (#295) * Cherry-picked: Separated out compile flags that are not universally present * Disabling -flto since that seems to be causing install issues * Fixed whitespace * Added the enum and the runtime dispatch for ARM64 * updated Changelog [ci skip] * Removing the double-check safeguard for compile options --- CHANGES.rst | 4 +++ common.mk | 78 +++++++++++++++++++++++++++++++++++++++++--- utils/cpu_features.c | 16 ++++++++- utils/cpu_features.h | 11 +++++++ 4 files changed, 104 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index b807b6a7..acc81d81 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,6 +14,10 @@ New features 2.5.1 (upcoming) ================ +Enhancements +------------ +- Corrfunc now compiles and runs on Apple M1/M2 cpus (using the `FALLBACK` kernels) [#295] + Changes ------- - Python >= 3.7 and numpy >= 1.16 are required for python extensions [#291] diff --git a/common.mk b/common.mk index b01db9f2..455edda8 100644 --- a/common.mk +++ b/common.mk @@ -226,7 +226,7 @@ ifeq ($(DO_CHECKS), 1) # Normally, one would use -xHost with icc instead of -march=native # But -xHost does not allow you to later turn off just AVX-512, # which we may decide is necessary if the GAS bug is present - CFLAGS += -march=native + ### Now we check and add -march=native separately at the end of common.mk (MS: 4th May 2023) ifeq (USE_OMP,$(findstring USE_OMP,$(OPT))) CFLAGS += -qopenmp CLINK += -qopenmp @@ -330,8 +330,8 @@ ifeq ($(DO_CHECKS), 1) endif # USE_OMP endif # CC is clang - CFLAGS += -funroll-loops - CFLAGS += -march=native -fno-strict-aliasing + CFLAGS += -funroll-loops -fno-strict-aliasing + # CFLAGS += -march=native ### Now we check and add -march=native separately at the end of common.mk (MS: 4th May 2023) CFLAGS += -Wformat=2 -Wpacked -Wnested-externs -Wpointer-arith -Wredundant-decls -Wfloat-equal -Wcast-qual CFLAGS += -Wcast-align -Wmissing-declarations -Wmissing-prototypes -Wnested-externs -Wstrict-prototypes #-D_POSIX_C_SOURCE=2 -Wpadded -Wconversion CFLAGS += -Wno-unused-local-typedefs ## to suppress the unused typedef warning for the compile time assert for sizeof(struct config_options) @@ -374,6 +374,76 @@ ifeq ($(DO_CHECKS), 1) endif endif + # Check if using clang on Apple with M1/M1 Max/M2 etc + # if so, remove -march=native from CFLAGS and + # then add -mcpu=apple-m1 -mtune=apple-m1 +# ARCH := $(shell uname -m) +# $(info ARCH is $(ARCH)) +# ifeq ($(ARCH), arm64) +# ifeq ($(UNAME), Darwin) +# ifeq (clang,$(findstring clang,$(CC))) +# CFLAGS := $(filter-out -march=native, $(CFLAGS)) +# CFLAGS += -mtune=apple-m1 -mcpu=apple-m1 +# endif +# endif +# endif + #### UPDATE: This doesn't work in some cases because the parent + #### process might be running under ROSETTA emulation and in that + #### case arch would be reported as x86_64 (rather than arm64). For example, + #### the following lines work fine when invoked as "make" on Apple M2 + #### Macbook Air, but when I try to install through python ("python -m pip install -e .") + #### then I get an error that "-march=native" is not supported. This happens because + #### anaconda python is installed as "x86_64" (as seen with file `which python`). Therefore, + #### a better strategy is to test for the relevant compiler flags and then add them + #### Specifically, this would be for -march=native, -mcpu=apple-m1 -mtune=apple-m1 + #### and -flto (though -flto needs to be specified to both compile and link lines) + #### MS: 3rd May 2023 + ### For reasons unknown to me, the addition to CFLAGS does not work correctly if I + ### change this variable name "opt" to match the remaining names of "copt". Works fine + ### for 'clang' on OSX but not for 'gcc'. Adds the -march=native but somehow that + ### extra flag is removed when testing the -mcpu/-mtune compiler options. For the sake + ### of my sanity, I have accepted that this is how it shall work! Hopefully, in the future, + ### someone will figure out/explain *why* this behaviour is expected. It + ### seems more like a gcc bug to me where gcc is updating CFLAGS based on + ### the options on the last compile call (since cland does not show + ### this behaviour) - MS: 3rd May, 2023 + + ## TLDR: Leave this variable as "opt" while the remaining are set to "copt". Otherwise, + ## the correct flags may not be picked up when using gcc on a ARM64 OSX machine + opt := -march=native + COMPILE_OPT_SUPPORTED := $(shell $(CC) $(opt) -dM -E - < /dev/null 2>&1 1>/dev/null) + ifndef COMPILE_OPT_SUPPORTED + CFLAGS += $(opt) + else + CFLAGS := $(filter-out $(opt), $(CFLAGS)) + endif + + copt := -mcpu=apple-m1 + COMPILE_OPT_SUPPORTED := $(shell $(CC) $(copt) -dM -E - < /dev/null 2>&1 1>/dev/null) + ifndef COMPILE_OPT_SUPPORTED + CFLAGS += $(copt) + else + CFLAGS := $(filter-out $(copt), $(CFLAGS)) + endif + + copt := -mtune=apple-m1 + COMPILE_OPT_SUPPORTED := $(shell $(CC) $(copt) -dM -E - < /dev/null 2>&1 1>/dev/null) + ifndef COMPILE_OPT_SUPPORTED + CFLAGS += $(copt) + else + CFLAGS := $(filter-out $(copt), $(CFLAGS)) + endif + + # copt := -flto + # COMPILE_OPT_SUPPORTED := $(shell $(CC) $(copt) -dM -E - < /dev/null 2>&1 1>/dev/null) + # ifndef COMPILE_OPT_SUPPORTED + # CFLAGS += $(copt) + # CLINK += $(copt) + # else + # CFLAGS := $(filter-out $(copt), $(CFLAGS)) + # CLINK := $(filter-out $(copt), $(CLINK)) + # endif + # All of the python/numpy checks follow export PYTHON_CHECKED ?= 0 export NUMPY_CHECKED ?= 0 @@ -430,7 +500,7 @@ ifeq ($(DO_CHECKS), 1) endif ifneq ($(COMPILE_PYTHON_EXT), 0) - PYTHON_INCL := $(shell $(PYTHON) -c "from __future__ import print_function; import sysconfig; flags = set(['-I' + sysconfig.get_path('include'),'-I' + sysconfig.get_path('platinclude')]); print(' '.join(flags));") + PYTHON_INCL := $(shell $(PYTHON) -c "from __future__ import print_function; import sysconfig; flags = set(['-I' + sysconfig.get_path('include'),'-I' + sysconfig.get_path('platinclude')]); print(' '.join(flags));") PYTHON_INCL:=$(patsubst -I%,-isystem%, $(PYTHON_INCL)) # NUMPY is available -> next step should not fail diff --git a/utils/cpu_features.c b/utils/cpu_features.c index cf3d2938..63a01242 100644 --- a/utils/cpu_features.c +++ b/utils/cpu_features.c @@ -21,8 +21,13 @@ int runtime_instrset_detect(void) static int iset = -1; // remember value for next call if (iset >= 0) { return iset; // called before - } + } iset = FALLBACK; // default value + +#ifdef __ARM_NEON + return iset; /* return FALLBACK as the ISA on ARM64 */ +#endif + int abcd[4] = {0,0,0,0}; // cpuid results cpuid(abcd, 0); // call cpuid function 0 if (abcd[0] == 0) return iset; // no further cpuid function supported @@ -149,6 +154,15 @@ int get_max_usable_isa(void) fprintf(stderr, "[Warning] The CPU supports SSE but the compiler does not. Can you try another compiler?\n"); #endif // fall through +// This will need to be uncommented when the ARM64 kernels are added in +// case ARM64: +// #ifdef __ARM_NEON +// iset = ARM64; +// break; +// #else +// fprintf(stderr, "[Warning] The CPU supports NEON but the compiler does not. Can you try another compiler?\n"); +// #endif + // fall through case FALLBACK: default: iset = FALLBACK; diff --git a/utils/cpu_features.h b/utils/cpu_features.h index b85d0f04..24fd90e3 100644 --- a/utils/cpu_features.h +++ b/utils/cpu_features.h @@ -29,12 +29,17 @@ typedef enum { AVX=7, /* 256bit vector width */ AVX2=8, /* AVX2 (integer operations)*/ AVX512F=9,/* AVX 512 Foundation */ + ARM64=10, /* ARM64 on Apple M1/M2 */ NUM_ISA /*NUM_ISA will be the next integer after the last declared enum. AVX512F:=9 (so, NUM_ISA==10)*/ } isa; //name for instruction sets -> corresponds to the return values for functions in cpu_features.c static inline void cpuid (int output[4], int functionnumber) { +#if defined (__ARM_NEON) + (void) output; + (void) functionnumber; +#else #if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax int a, b, c, d; @@ -57,11 +62,16 @@ static inline void cpuid (int output[4], int functionnumber) { mov [esi+12], edx } +#endif #endif } // Define interface to xgetbv instruction static inline int64_t xgetbv (int ctr) { +#if defined(__ARM_NEON) + (void) ctr; + return (int64_t) FALLBACK; /* use FALLBACK kernels until the ARM64 kernels are added to the pair-counters */ +#else #if (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) //Intel compiler supporting _xgetbv intrinsic return _xgetbv(ctr); // intrinsic function for XGETBV #elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax @@ -80,6 +90,7 @@ static inline int64_t xgetbv (int ctr) { } return a | (((uint64_t) d) << 32); +#endif #endif }