From 9e8a7a8cc2ca26647c7a14efac78b865f7299690 Mon Sep 17 00:00:00 2001
From: Sayed Adel
Date: Sun, 20 Aug 2023 03:26:00 +0400
Subject: [PATCH] SIMD: Refactor partial load workaround for Clang

Clang exhibits aggressive optimization behavior when the `-ftrapping-math` flag is not fully supported, starting at the -O1 optimization level. When partially loading a vector register for operations that require filling the remaining lanes with specific values (e.g., divide operations needing non-zero integers to prevent an FP divide-by-zero exception), Clang's optimizer recognizes that the full register is unnecessary for the store operation. Consequently, it optimizes out the fill step involving non-zero integers for the remaining elements.

As a solution, we apply the `volatile` keyword to the returned register, followed by a symmetric operation such as `or`, to inform the compiler that the full vector is needed.

This refactor moves the workaround from the source files into the universal intrinsic headers, which also guarantees that it is applied by all kernels. Furthermore, the workaround is disabled when the `-ftrapping-math` flag is fully supported by the Clang compiler.

This patch also enables the `-ftrapping-math` flag for clang-cl and suppresses floating-point exception warnings.
---
 meson.build | 32 ++++-
 numpy/core/meson.build | 8 +-
 numpy/core/src/common/simd/avx2/memory.h | 97 +++++++++++----
 numpy/core/src/common/simd/avx512/memory.h | 77 ++++++++++--
 numpy/core/src/common/simd/neon/memory.h | 52 +++++++--
 numpy/core/src/common/simd/simd.h | 42 +++++--
 numpy/core/src/common/simd/sse/memory.h | 110 +++++++-----------
 numpy/core/src/common/simd/vec/memory.h | 51 ++++++--
 .../src/umath/loops_arithm_fp.dispatch.c.src | 101 +---------------
 .../src/umath/loops_unary_fp.dispatch.c.src | 64 ----------
 10 files changed, 332 insertions(+), 302 deletions(-)

diff --git a/meson.build b/meson.build index 0469f7f4590b..9ba86f393d6d 100644 --- a/meson.build +++ b/meson.build @@ -55,11 +55,33 @@ add_project_arguments( # # Clang defaults to a non-strict floating error point model, but we need strict # behavior. `-ftrapping-math` is equivalent to `-ffp-exception-behavior=strict`. -# Note that this is only supported on macOS arm64 as of XCode 14.3 -if cc.get_id() == 'clang' - add_project_arguments( - cc.get_supported_arguments('-ftrapping-math'), language: ['c', 'cpp'], - ) +# This flag is also required to prevent the activation of SIMD partial load workarounds. +# For further clarification, refer to gh-24461. +cc_id = cc.get_id() +if cc_id.startswith('clang') + # Determine the compiler flags for trapping math exceptions. + trapping_math = { + 'clang-cl': '/clang:-ftrapping-math', + }.get(cc_id, '-ftrapping-math') + # Check if the compiler supports the trapping math flag. + if cc.has_argument(trapping_math) + # TODO: Consider upgrading the vendored Meson to 1.3.0 to support the parameter `werror` + # Detect whether the compiler actually supports strict handling of floating-point exceptions + # by treating warnings as errors. + if cc.compiles('int main() { return 0; }', args: [trapping_math, '-Werror']) + trapping_math = [trapping_math, '-DNPY_HAVE_CLANG_FPSTRICT'] + else + # Suppress warnings about unsupported floating-point optimization. + trapping_math = [trapping_math, '-Wno-unsupported-floating-point-opt'] + # Inform the user about the workaround. + message( + 'NumPy is being built against a version of Clang that does not strictly enforce ' + + 'floating-point exception handling. 
Workarounds will be used, which may impact performance.\n' + + 'Consider upgrading Clang to the latest version.' + ) + endif + add_project_arguments(trapping_math, language: ['c', 'cpp']) + endif endif subdir('meson_cpu') diff --git a/numpy/core/meson.build b/numpy/core/meson.build index ccc060aacb96..d32bb7406d33 100644 --- a/numpy/core/meson.build +++ b/numpy/core/meson.build @@ -838,9 +838,7 @@ foreach gen_mtargets : [ [ 'loops_exponent_log.dispatch.h', src_file.process('src/umath/loops_exponent_log.dispatch.c.src'), - # Enabling SIMD on clang-cl raises spurious FP exceptions - # TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp - compiler_id == 'clang-cl' ? [] : [ + [ AVX512_SKX, AVX512F, [AVX2, FMA3] ] ], @@ -884,9 +882,7 @@ foreach gen_mtargets : [ [ 'loops_trigonometric.dispatch.h', src_file.process('src/umath/loops_trigonometric.dispatch.c.src'), - # Enabling SIMD on clang-cl raises spurious FP exceptions - # TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos - compiler_id == 'clang-cl' ? [] : [ + [ AVX512F, [AVX2, FMA3], VSX4, VSX3, VSX2, NEON_VFPV4, diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h index 993d3ba0d06c..f18636538174 100644 --- a/numpy/core/src/common/simd/avx2/memory.h +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -196,7 +196,12 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane); __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); __m256i payload = _mm256_maskload_epi32((const int*)ptr, mask); - return _mm256_blendv_epi8(vfill, payload, mask); + __m256i ret = _mm256_blendv_epi8(vfill, payload, mask); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) @@ -205,7 +210,12 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane); __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); - return _mm256_maskload_epi32((const int*)ptr, mask); + __m256i ret = _mm256_maskload_epi32((const int*)ptr, mask); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } //// 64 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) @@ -216,7 +226,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); - return _mm256_blendv_epi8(vfill, payload, mask); + __m256i ret = _mm256_blendv_epi8(vfill, payload, mask); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) @@ -225,7 +240,12 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) const __m256i steps = npyv_set_s64(0, 1, 2, 3); __m256i vnlane = npyv_setall_s64(nlane > 4 ? 
4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_maskload_epi64((const long long*)ptr, mask); + __m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } //// 64-bit nlane @@ -241,7 +261,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); - return _mm256_blendv_epi8(vfill, payload, mask); + __m256i ret = _mm256_blendv_epi8(vfill, payload, mask); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) @@ -251,19 +276,29 @@ NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { assert(nlane > 0); - npy_int64 m = -((npy_int64)(nlane > 1)); + npy_int64 m = -((npy_int64)(nlane > 1)); __m256i mask = npyv_set_s64(-1, -1, m, m); - return _mm256_maskload_epi64((const long long*)ptr, mask); + __m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill_lo, npy_int64 fill_hi) { const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi); - npy_int64 m = -((npy_int64)(nlane > 1)); - __m256i mask = npyv_set_s64(-1, -1, m, m); + npy_int64 m = -((npy_int64)(nlane > 1)); + __m256i mask = npyv_set_s64(-1, -1, m, m); __m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask); - return _mm256_blendv_epi8(vfill, payload, mask); + __m256i ret =_mm256_blendv_epi8(vfill, payload, mask); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } /********************************* * Non-contiguous partial load @@ -277,9 +312,14 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ const __m256i vfill = _mm256_set1_epi32(fill); const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps); - __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane); - __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); - return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4); + __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi32(vnlane, steps); + __m256i ret = _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 @@ -293,9 +333,14 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ const __m256i vfill = npyv_setall_s64(fill); const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); const __m256i steps = npyv_set_s64(0, 1, 2, 3); - __m256i vnlane = npyv_setall_s64(nlane > 4 ? 
4 : (int)nlane); - __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + __m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s64 @@ -313,9 +358,14 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, ); const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride); const __m256i steps = npyv_set_s64(0, 1, 2, 3); - __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); - __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); - return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4); + __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane); + __m256i mask = _mm256_cmpgt_epi64(vnlane, steps); + __m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) @@ -323,7 +373,7 @@ NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride //// 128-bit load over 64-bit stride NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, - npy_int64 fill_lo, npy_int64 fill_hi) + npy_int64 fill_lo, npy_int64 fill_hi) { assert(nlane > 0); __m256i a = npyv_loadl_s64(ptr); @@ -336,7 +386,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, __m128i fill = _mm_set_epi64x(fill_hi, fill_lo); #endif __m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill; - return _mm256_inserti128_si256(a, b, 1); + __m256i ret = _mm256_inserti128_si256(a, b, 1); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m256i workaround = ret; + ret = _mm256_or_si256(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h index fdf96a92c583..e981ef8f6dd1 100644 --- a/numpy/core/src/common/simd/avx512/memory.h +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -248,14 +248,24 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n assert(nlane > 0); const __m512i vfill = _mm512_set1_epi32(fill); const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1; - return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr); + __m512i ret = _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { assert(nlane > 0); const __mmask16 mask = nlane > 15 ? 
-1 : (1 << nlane) - 1; - return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr); + __m512i ret = _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } //// 64 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) @@ -263,14 +273,24 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n assert(nlane > 0); const __m512i vfill = npyv_setall_s64(fill); const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; - return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); + __m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { assert(nlane > 0); const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; - return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); + __m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } //// 64-bit nlane @@ -280,7 +300,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, assert(nlane > 0); const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; - return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); + __m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) @@ -293,14 +318,24 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, assert(nlane > 0); const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; - return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); + __m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { assert(nlane > 0); const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; - return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); + __m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } /********************************* * Non-contiguous partial load @@ -317,7 +352,12 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride)); const __m512i vfill = _mm512_set1_epi32(fill); const __mmask16 mask = nlane > 15 ? 
-1 : (1 << nlane) - 1; - return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4); + __m512i ret = _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 @@ -334,7 +374,12 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ ); const __m512i vfill = npyv_setall_s64(fill); const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1; - return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); + __m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s64 @@ -352,7 +397,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, ); const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo); const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1; - return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4); + __m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) @@ -369,7 +419,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, ); const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1; const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo); - return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); + __m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8); +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m512i workaround = ret; + ret = _mm512_or_si512(workaround, ret); +#endif + return ret; } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h index 6163440c39cd..2dc21e5a4305 100644 --- a/numpy/core/src/common/simd/neon/memory.h +++ b/numpy/core/src/common/simd/neon/memory.h @@ -187,19 +187,28 @@ NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) { assert(nlane > 0); + npyv_s32 a; switch(nlane) { case 1: - return vld1q_lane_s32((const int32_t*)ptr, vdupq_n_s32(fill), 0); + a = vld1q_lane_s32((const int32_t*)ptr, vdupq_n_s32(fill), 0); + break; case 2: - return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(fill)); + a = vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(fill)); + break; case 3: - return vcombine_s32( + a = vcombine_s32( vld1_s32((const int32_t*)ptr), vld1_lane_s32((const int32_t*)ptr + 2, vdup_n_s32(fill), 0) ); + break; default: return npyv_load_s32(ptr); } +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = vorrq_s32(workaround, a); +#endif + return a; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) @@ -209,7 +218,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 
*ptr, npy_uintp nlane, n { assert(nlane > 0); if (nlane == 1) { - return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill)); + npyv_s64 a = vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill)); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s64 workaround = a; + a = vorrq_s64(workaround, a); + #endif + return a; } return npyv_load_s64(ptr); } @@ -224,7 +238,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, assert(nlane > 0); if (nlane == 1) { const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; - return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + npyv_s32 a = vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = vorrq_s32(workaround, a); + #endif + return a; } return npyv_load_s32(ptr); } @@ -256,10 +275,15 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ vfill = vld1q_lane_s32((const int32_t*)ptr + stride, vfill, 1); case 1: vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0); - return vfill; + break; default: return npyv_loadn_s32(ptr, stride); } +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = vfill; + vfill = vorrq_s32(workaround, vfill); +#endif + return vfill; } NPY_FINLINE npyv_s32 npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) @@ -270,7 +294,7 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ { assert(nlane > 0); if (nlane == 1) { - return vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill)); + return npyv_load_till_s64(ptr, 1, fill); } return npyv_loadn_s64(ptr, stride); } @@ -285,7 +309,12 @@ NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, assert(nlane > 0); if (nlane == 1) { const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; - return vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + npyv_s32 a = vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = vorrq_s32(workaround, a); + #endif + return a; } return npyv_loadn2_s32(ptr, stride); } @@ -293,7 +322,12 @@ NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride { assert(nlane > 0); if (nlane == 1) { - return vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0)); + npyv_s32 a = vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0)); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = vorrq_s32(workaround, a); + #endif + return a; } return npyv_loadn2_s32(ptr, stride); }
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h index 8c9b14251aa0..a5b1fa0c5521 100644 --- a/numpy/core/src/common/simd/simd.h +++ b/numpy/core/src/common/simd/simd.h @@ -18,18 +18,23 @@ #ifdef __cplusplus extern "C" { #endif - -// lane type by intrin suffix -typedef npy_uint8 npyv_lanetype_u8; -typedef npy_int8 npyv_lanetype_s8; -typedef npy_uint16 npyv_lanetype_u16; -typedef npy_int16 npyv_lanetype_s16; -typedef npy_uint32 npyv_lanetype_u32; -typedef npy_int32 npyv_lanetype_s32; -typedef npy_uint64 npyv_lanetype_u64; -typedef npy_int64 npyv_lanetype_s64; -typedef float npyv_lanetype_f32; -typedef double npyv_lanetype_f64; +/* + * Clang exhibits aggressive optimization behavior, present at -O1 or greater, + * when the `-ftrapping-math` flag isn't fully supported. When partially loading a + * vector register for an operation that requires filling the remaining lanes + * with a certain value (e.g. a divide operation needs to fill the remaining lanes + * with non-zero integers to avoid an FP divide-by-zero exception), + * the Clang optimizer notices that the entire register is not needed for the store + * and optimizes out the fill of non-zero integers for the remaining + * elements. As a workaround we mark the returned register with `volatile`, + * followed by a symmetric operation such as `or`, + * to convince the compiler that the entire vector is needed. + */ +#if defined(__clang__) && !defined(NPY_HAVE_CLANG_FPSTRICT) + #define NPY_SIMD_GUARD_PARTIAL_LOAD 1 +#else + #define NPY_SIMD_GUARD_PARTIAL_LOAD 0 +#endif #if defined(_MSC_VER) && defined(_M_IX86) /* @@ -50,6 +55,19 @@ typedef double npyv_lanetype_f64; #undef _mm256_set_epi64x #undef _mm_set_epi64x #endif + +// lane type by intrin suffix +typedef npy_uint8 npyv_lanetype_u8; +typedef npy_int8 npyv_lanetype_s8; +typedef npy_uint16 npyv_lanetype_u16; +typedef npy_int16 npyv_lanetype_s16; +typedef npy_uint32 npyv_lanetype_u32; +typedef npy_int32 npyv_lanetype_s32; +typedef npy_uint64 npyv_lanetype_u64; +typedef npy_int64 npyv_lanetype_s64; +typedef float npyv_lanetype_f32; +typedef double npyv_lanetype_f64; + #if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128) #include "avx512/avx512.h" #elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h index 4c8e86a6f9da..90c01ffefedb 100644 --- a/numpy/core/src/common/simd/sse/memory.h +++ b/numpy/core/src/common/simd/sse/memory.h @@ -178,62 +178,53 @@ NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) /********************************* * Partial Load *********************************/ -#if defined(__clang__) && __clang_major__ > 7 - /** - * Clang >=8 perform aggressive optimization that tends to - * zero the bits of upper half part of vectors even - * when we try to fill it up with certain scalars, - * which my lead to zero division errors. 
- */ - #define NPYV__CLANG_ZEROUPPER -#endif //// 32 NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) { assert(nlane > 0); -#ifdef NPYV__CLANG_ZEROUPPER - if (nlane > 3) { - return npyv_load_s32(ptr); - } - npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill}; - for (npy_uint64 i = 0; i < nlane; ++i) { - data[i] = ptr[i]; - } - return npyv_loada_s32(data); -#else #ifndef NPY_HAVE_SSE41 const short *wptr = (const short*)ptr; #endif const __m128i vfill = npyv_setall_s32(fill); __m128i a; switch(nlane) { - case 2: - return _mm_castpd_si128( - _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) - ); + case 2: + a = _mm_castpd_si128( + _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) + ); + break; #ifdef NPY_HAVE_SSE41 case 1: - return _mm_insert_epi32(vfill, ptr[0], 0); + a = _mm_insert_epi32(vfill, ptr[0], 0); + break; case 3: a = _mm_loadl_epi64((const __m128i*)ptr); a = _mm_insert_epi32(a, ptr[2], 2); a = _mm_insert_epi32(a, fill, 3); - return a; + break; #else case 1: a = _mm_insert_epi16(vfill, wptr[0], 0); - return _mm_insert_epi16(a, wptr[1], 1); + a = _mm_insert_epi16(a, wptr[1], 1); + break; case 3: a = _mm_loadl_epi64((const __m128i*)ptr); a = _mm_unpacklo_epi64(a, vfill); a = _mm_insert_epi16(a, wptr[4], 4); a = _mm_insert_epi16(a, wptr[5], 5); - return a; + break; #endif // NPY_HAVE_SSE41 default: return npyv_load_s32(ptr); - } -#endif + } + #if NPY_SIMD_GUARD_PARTIAL_LOAD + // We use a variable marked 'volatile' to convince the compiler that + // the entire vector is needed. + volatile __m128i workaround = a; + // avoid optimizing it out + a = _mm_or_si128(workaround, a); + #endif + return a; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) @@ -260,22 +251,17 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); -#ifdef NPYV__CLANG_ZEROUPPER - if (nlane <= 2) { - npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill}; - for (npy_uint64 i = 0; i < nlane; ++i) { - data[i] = ptr[i]; - } - return npyv_loada_s64(data); - } -#else if (nlane == 1) { const __m128i vfill = npyv_setall_s64(fill); - return _mm_castpd_si128( + npyv_s64 a = _mm_castpd_si128( _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) ); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m128i workaround = a; + a = _mm_or_si128(workaround, a); + #endif + return a; } -#endif return npyv_load_s64(ptr); } // fill zero to rest lanes @@ -295,9 +281,14 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, assert(nlane > 0); if (nlane == 1) { const __m128i vfill = npyv_set_s32(fill_lo, fill_hi, fill_lo, fill_hi); - return _mm_castpd_si128( + __m128i a = _mm_castpd_si128( _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) ); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m128i workaround = a; + a = _mm_or_si128(workaround, a); + #endif + return a; } return npyv_load_s32(ptr); } @@ -321,16 +312,6 @@ NPY_FINLINE npyv_s32 npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) { assert(nlane > 0); -#ifdef NPYV__CLANG_ZEROUPPER - if (nlane > 3) { - return npyv_loadn_s32(ptr, stride); - } - npy_int32 NPY_DECL_ALIGNED(16) data[4] = {fill, fill, fill, fill}; - for (npy_uint64 i = 0; i < nlane; ++i) { - data[i] = ptr[stride*i]; - } - return npyv_loada_s32(data); -#else __m128i vfill = npyv_setall_s32(fill); 
#ifndef NPY_HAVE_SSE41 const short *wptr = (const short*)ptr; @@ -360,8 +341,11 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ default: return npyv_loadn_s32(ptr, stride); } // switch - return vfill; +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m128i workaround = vfill; + vfill = _mm_or_si128(workaround, vfill); #endif + return vfill; } // fill zero to rest lanes NPY_FINLINE npyv_s32 @@ -402,22 +386,9 @@ NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); -#ifdef NPYV__CLANG_ZEROUPPER - if (nlane <= 2) { - npy_int64 NPY_DECL_ALIGNED(16) data[2] = {fill, fill}; - for (npy_uint64 i = 0; i < nlane; ++i) { - data[i] = ptr[i*stride]; - } - return npyv_loada_s64(data); - } -#else if (nlane == 1) { - const __m128i vfill = npyv_setall_s64(fill); - return _mm_castpd_si128( - _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) - ); + return npyv_load_till_s64(ptr, 1, fill); } -#endif return npyv_loadn_s64(ptr, stride); } // fill zero to rest lanes @@ -437,9 +408,14 @@ NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, assert(nlane > 0); if (nlane == 1) { const __m128i vfill = npyv_set_s32(0, 0, fill_lo, fill_hi); - return _mm_castpd_si128( + __m128i a = _mm_castpd_si128( _mm_loadl_pd(_mm_castsi128_pd(vfill), (double*)ptr) ); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile __m128i workaround = a; + a = _mm_or_si128(workaround, a); + #endif + return a; } return npyv_loadn2_s32(ptr, stride); } diff --git a/numpy/core/src/common/simd/vec/memory.h b/numpy/core/src/common/simd/vec/memory.h index 4545e53e9094..dbcdc16da395 100644 --- a/numpy/core/src/common/simd/vec/memory.h +++ b/numpy/core/src/common/simd/vec/memory.h @@ -210,24 +210,33 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n const npyv_u32 vlane = npyv_setall_u32(blane); const npyv_b32 mask = vec_cmpgt(vlane, steps); npyv_s32 a = vec_load_len(ptr, blane*4-1); - return vec_sel(vfill, a, mask); + a = vec_sel(vfill, a, mask); #else + npyv_s32 a; switch(nlane) { case 1: - return vec_insert(ptr[0], vfill, 0); + a = vec_insert(ptr[0], vfill, 0); + break; case 2: - return (npyv_s32)vec_insert( + a = (npyv_s32)vec_insert( *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0 ); + break; case 3: vfill = vec_insert(ptr[2], vfill, 2); - return (npyv_s32)vec_insert( + a = (npyv_s32)vec_insert( *npyv__ptr2u64(ptr), (npyv_u64)vfill, 0 ); + break; default: return npyv_load_s32(ptr); } #endif +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = vec_or(workaround, a); +#endif + return a; } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) @@ -244,7 +253,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n { assert(nlane > 0); if (nlane == 1) { - return npyv_set_s64(ptr[0], fill); + npyv_s64 r = npyv_set_s64(ptr[0], fill); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s64 workaround = r; + r = vec_or(workaround, r); + #endif + return r; } return npyv_load_s64(ptr); } @@ -264,7 +278,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, { assert(nlane > 0); if (nlane == 1) { - return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + npyv_s32 r = npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = r; + r = vec_or(workaround, r); + #endif + return r; } return npyv_load_s32(ptr); } @@ -299,6 
+318,10 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_ default: return npyv_loadn_s32(ptr, stride); } // switch +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = vfill; + vfill = vec_or(workaround, vfill); +#endif return vfill; } // fill zero to rest lanes @@ -311,7 +334,7 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_ { assert(nlane > 0); if (nlane == 1) { - return npyv_set_s64(*ptr, fill); + return npyv_load_till_s64(ptr, nlane, fill); } return npyv_loadn_s64(ptr, stride); } @@ -325,7 +348,12 @@ NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, { assert(nlane > 0); if (nlane == 1) { - return npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + npyv_s32 r = npyv_set_s32(ptr[0], ptr[1], fill_lo, fill_hi); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = r; + r = vec_or(workaround, r); + #endif + return r; } return npyv_loadn2_s32(ptr, stride); } @@ -333,7 +361,12 @@ NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride { assert(nlane > 0); if (nlane == 1) { - return (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0); + npyv_s32 r = (npyv_s32)npyv_set_s64(*(npy_int64*)ptr, 0); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = r; + r = vec_or(workaround, r); + #endif + return r; } return npyv_loadn2_s32(ptr, stride); } diff --git a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src index 7ba3981e8119..c8bcedb6bbdc 100644 --- a/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_arithm_fp.dispatch.c.src @@ -31,59 +31,6 @@ /******************************************************************************** ** Defining ufunc inner functions ********************************************************************************/ - -/* - * clang has a bug that's present at -O1 or greater. When partially loading a - * vector register for a divide operation, the remaining elements are set - * to 1 to avoid divide-by-zero. The partial load is paired with a partial - * store after the divide operation. clang notices that the entire register - * is not needed for the store and optimizes out the fill of 1 to the remaining - * elements. This causes either a divide-by-zero or 0/0 with invalid exception - * that we were trying to avoid by filling. - * - * Using a dummy variable marked 'volatile' convinces clang not to ignore - * the explicit fill of remaining elements. If `-ftrapping-math` is - * supported, then it'll also avoid the bug. `-ftrapping-math` is supported - * on Apple clang v12+ for x86_64. It is not currently supported for arm64. - * `-ftrapping-math` is set by default of Numpy builds in - * numpy/distutils/ccompiler.py. 
- * - * Note: Apple clang and clang upstream have different versions that overlap - */ -#if defined(__clang__) - #if defined(__apple_build_version__) - // Apple Clang - #if __apple_build_version__ < 12000000 - // Apple Clang before v12 - #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 - #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - // Apple Clang after v12, targeting i386 or x86_64 - #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0 - #else - // Apple Clang after v12, not targeting i386 or x86_64 - #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 - #endif - #else - // Clang, not Apple Clang - #if __clang_major__ < 10 - // Clang before v10 - #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 - #elif defined(_MSC_VER) - // clang-cl has the same bug - #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 - #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - // Clang v10+, targeting i386 or x86_64 - #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0 - #else - // Clang v10+, not targeting i386 or x86_64 - #define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 1 - #endif - #endif -#else -// Not a Clang compiler -#define WORKAROUND_CLANG_PARTIAL_LOAD_BUG 0 -#endif - /**begin repeat * Float types * #type = npy_float, npy_double# @@ -148,12 +95,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_store_@sfx@((@type@*)dst, r0); npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - const int vstop = hstep - 1; - #else - const int vstop = 0; - #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - for (; len > vstop; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { + for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { #if @is_div@ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); @@ -164,15 +106,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); npyv_store_till_@sfx@((@type@*)dst, len, r); } - #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - // last partial iteration for divide and working around clang partial load bug - if(len > 0){ - npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); - volatile npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); - npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); - npyv_store_till_@sfx@((@type@*)dst, len, r); - } - #endif // #if @is_div@ && WORKAROUND_CLANG_PARTIAL_LOAD_BUG } else if (ssrc0 == 0 && ssrc1 == sizeof(@type@) && sdst == ssrc1) { npyv_@sfx@ a = npyv_setall_@sfx@(*((@type@*)src0)); @@ -184,12 +117,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_store_@sfx@((@type@*)dst, r0); npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - const int vstop = hstep - 1; - #else - const int vstop = 0; - #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - for (; len > vstop; len -= hstep, src1 += vstep, dst += vstep) { + for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) { #if @is_div@ || @is_mul@ npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); #else @@ -198,14 +126,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); npyv_store_till_@sfx@((@type@*)dst, len, r); } - #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - // last partial iteration for multiply / divide and working around clang partial load bug - if(len > 0){ - volatile 
npyv_@sfx@ b = npyv_load_till_@sfx@((const @type@*)src1, len, 1.0@c@); - npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); - npyv_store_till_@sfx@((@type@*)dst, len, r); - } - #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG } else if (ssrc1 == 0 && ssrc0 == sizeof(@type@) && sdst == ssrc0) { npyv_@sfx@ b = npyv_setall_@sfx@(*((@type@*)src1)); @@ -217,12 +137,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_store_@sfx@((@type@*)dst, r0); npyv_store_@sfx@((@type@*)(dst + vstep), r1); } - #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - const int vstop = hstep - 1; - #else - const int vstop = 0; - #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - for (; len > vstop; len -= hstep, src0 += vstep, dst += vstep) { + for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) { #if @is_div@ || @is_mul@ npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); #else @@ -231,14 +146,6 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@) npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); npyv_store_till_@sfx@((@type@*)dst, len, r); } - #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG - // last partial iteration for multiply / divide and working around clang partial load bug - if(len > 0){ - volatile npyv_@sfx@ a = npyv_load_till_@sfx@((const @type@*)src0, len, 1.0@c@); - npyv_@sfx@ r = npyv_@intrin@_@sfx@(a, b); - npyv_store_till_@sfx@((@type@*)dst, len, r); - } - #endif // #if (@is_div@ || @is_mul@) && WORKAROUND_CLANG_PARTIAL_LOAD_BUG } else { goto loop_scalar; } @@ -279,8 +186,6 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_@kind@_indexed) /**end repeat1**/ /**end repeat**/ -#undef WORKAROUND_CLANG_PARTIAL_LOAD_BUG - //############################################################################### //## Complex Single/Double precision //############################################################################### diff --git a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src index c4e7b8929f1d..f6404f6f7d68 100644 --- a/numpy/core/src/umath/loops_unary_fp.dispatch.c.src +++ b/numpy/core/src/umath/loops_unary_fp.dispatch.c.src @@ -93,58 +93,6 @@ NPY_FINLINE double c_square_f64(double a) #define CONTIG 0 #define NCONTIG 1 -/* - * clang has a bug that's present at -O1 or greater. When partially loading a - * vector register for a reciprocal operation, the remaining elements are set - * to 1 to avoid divide-by-zero. The partial load is paired with a partial - * store after the reciprocal operation. clang notices that the entire register - * is not needed for the store and optimizes out the fill of 1 to the remaining - * elements. This causes either a divide-by-zero or 0/0 with invalid exception - * that we were trying to avoid by filling. - * - * Using a dummy variable marked 'volatile' convinces clang not to ignore - * the explicit fill of remaining elements. If `-ftrapping-math` is - * supported, then it'll also avoid the bug. `-ftrapping-math` is supported - * on Apple clang v12+ for x86_64. It is not currently supported for arm64. - * `-ftrapping-math` is set by default of Numpy builds in - * numpy/distutils/ccompiler.py. 
- * - * Note: Apple clang and clang upstream have different versions that overlap - */ -#if defined(__clang__) - #if defined(__apple_build_version__) - // Apple Clang - #if __apple_build_version__ < 12000000 - // Apple Clang before v12 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - // Apple Clang after v12, targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 - #else - // Apple Clang after v12, not targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #endif - #else - // Clang, not Apple Clang - #if __clang_major__ < 10 - // Clang before v10 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #elif defined(_MSC_VER) - // clang-cl has the same bug - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #elif defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - // Clang v10+, targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 0 - #else - // Clang v10+, not targeting i386 or x86_64 - #define WORKAROUND_CLANG_RECIPROCAL_BUG 1 - #endif - #endif -#else -// Not a Clang compiler -#define WORKAROUND_CLANG_RECIPROCAL_BUG 0 -#endif - /**begin repeat * #TYPE = FLOAT, DOUBLE# * #sfx = f32, f64# @@ -155,7 +103,6 @@ NPY_FINLINE double c_square_f64(double a) * #kind = rint, floor, ceil, trunc, sqrt, absolute, square, reciprocal# * #intr = rint, floor, ceil, trunc, sqrt, abs, square, recip# * #repl_0w1 = 0*7, 1# - * #RECIP_WORKAROUND = 0*7, WORKAROUND_CLANG_RECIPROCAL_BUG# */ /**begin repeat2 * #STYPE = CONTIG, NCONTIG, CONTIG, NCONTIG# @@ -228,15 +175,6 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ npyv_@sfx@ v_src0 = npyv_loadn_tillz_@sfx@(src, ssrc, len); #endif #endif - #if @RECIP_WORKAROUND@ - /* - * Workaround clang bug. We use a dummy variable marked 'volatile' - * to convince clang that the entire vector is needed. We only - * want to do this for the last iteration / partial load-store of - * the loop since 'volatile' forces a refresh of the contents. - */ - volatile npyv_@sfx@ unused_but_workaround_bug = v_src0; - #endif // @RECIP_WORKAROUND@ npyv_@sfx@ v_unary0 = npyv_@intr@_@sfx@(v_src0); #if @DTYPE@ == CONTIG npyv_store_till_@sfx@(dst, len, v_unary0); @@ -252,8 +190,6 @@ static void simd_@TYPE@_@kind@_@STYPE@_@DTYPE@ #endif // @VCHK@ /**end repeat**/ -#undef WORKAROUND_CLANG_RECIPROCAL_BUG - /******************************************************************************** ** Defining ufunc inner functions ********************************************************************************/
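
Every NPY_SIMD_GUARD_PARTIAL_LOAD branch introduced above follows the same shape. Below is a minimal, self-contained sketch of the pattern in plain SSE2; it is illustrative only and not part of the patch. `partial_load_till_s32` and `GUARD_PARTIAL_LOAD` are hypothetical stand-ins for the `npyv_load_till_*` family and the NPY_SIMD_GUARD_PARTIAL_LOAD macro, and `NPY_HAVE_CLANG_FPSTRICT` is the define produced by the meson.build probe when strict `-ftrapping-math` support is confirmed.

    #include <emmintrin.h> /* SSE2 */
    #include <assert.h>

    /* Mirrors the simd.h logic: guard only on Clang builds where strict
     * `-ftrapping-math` support could not be confirmed. */
    #if defined(__clang__) && !defined(NPY_HAVE_CLANG_FPSTRICT)
        #define GUARD_PARTIAL_LOAD 1
    #else
        #define GUARD_PARTIAL_LOAD 0
    #endif

    /* Load `nlane` lanes from `ptr` and fill the remaining lanes with `fill`.
     * A divide kernel passes a non-zero `fill` so the unused lanes of a later
     * full-width division cannot raise an FP divide-by-zero exception. */
    static __m128i partial_load_till_s32(const int *ptr, unsigned nlane, int fill)
    {
        assert(nlane > 0 && nlane <= 4);
        int lanes[4] = {fill, fill, fill, fill};
        for (unsigned i = 0; i < nlane; ++i) {
            lanes[i] = ptr[i];
        }
        __m128i ret = _mm_loadu_si128((const __m128i*)lanes);
    #if GUARD_PARTIAL_LOAD
        /* Route the result through a `volatile` copy so the optimizer cannot
         * prove the filled lanes dead, then fold it back with a symmetric
         * no-op (`x | x == x`) so the guarded value is actually used. */
        volatile __m128i workaround = ret;
        ret = _mm_or_si128(workaround, ret);
    #endif
        return ret;
    }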
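
`or` suits the fold-back step because it is symmetric and idempotent (`x | x == x` for every lane), so the guard can never change the loaded value and costs roughly one extra register copy plus one bitwise op per partial load. On builds where the `-Werror` probe in meson.build defines NPY_HAVE_CLANG_FPSTRICT, NPY_SIMD_GUARD_PARTIAL_LOAD evaluates to 0 and the guard compiles away entirely.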