/* ./neon.h: npyv_* SIMD wrappers over ARM NEON (AArch64) intrinsics */
// vector lane types
typedef uint8x16_t  npyv_u8;  typedef int8x16_t   npyv_s8;
typedef uint16x8_t  npyv_u16; typedef int16x8_t   npyv_s16;
typedef uint32x4_t  npyv_u32; typedef int32x4_t   npyv_s32;
typedef uint64x2_t  npyv_u64; typedef int64x2_t   npyv_s64;
typedef float32x4_t npyv_f32; typedef float64x2_t npyv_f64;
// boolean (mask) types
typedef uint8x16_t npyv_b8; typedef uint16x8_t npyv_b16; typedef uint32x4_t npyv_b32; typedef uint64x2_t npyv_b64;
// two- and three-register tuple types
typedef uint8x16x2_t  npyv_u8x2;  typedef int8x16x2_t   npyv_s8x2;
typedef uint16x8x2_t  npyv_u16x2; typedef int16x8x2_t   npyv_s16x2;
typedef uint32x4x2_t  npyv_u32x2; typedef int32x4x2_t   npyv_s32x2;
typedef uint64x2x2_t  npyv_u64x2; typedef int64x2x2_t   npyv_s64x2;
typedef float32x4x2_t npyv_f32x2; typedef float64x2x2_t npyv_f64x2;
typedef uint8x16x3_t  npyv_u8x3;  typedef int8x16x3_t   npyv_s8x3;
typedef uint16x8x3_t  npyv_u16x3; typedef int16x8x3_t   npyv_s16x3;
typedef uint32x4x3_t  npyv_u32x3; typedef int32x4x3_t   npyv_s32x3;
typedef uint64x2x3_t  npyv_u64x3; typedef int64x2x3_t   npyv_s64x3;
typedef float32x4x3_t npyv_f32x3; typedef float64x2x3_t npyv_f64x3;

/* ./memory.h */
// npyv_load/loada/loads (unaligned, aligned, stream) all map to plain vld1q on NEON;
// npyv_loadl fills the low 64 bits and zeroes the high half.
// npyv_store/storea/stores map to vst1q; storel/storeh write the low/high 64-bit half.
NPY_FINLINE npyv_u8 npyv_load_u8(const npyv_lanetype_u8 *ptr) { return vld1q_u8((const uint8_t*)ptr); }
NPY_FINLINE npyv_u8 npyv_loada_u8(const npyv_lanetype_u8 *ptr) { return vld1q_u8((const uint8_t*)ptr); }
NPY_FINLINE npyv_u8 npyv_loads_u8(const npyv_lanetype_u8 *ptr) { return vld1q_u8((const uint8_t*)ptr); }
NPY_FINLINE npyv_u8 npyv_loadl_u8(const npyv_lanetype_u8 *ptr) { return vcombine_u8(vld1_u8((const uint8_t*)ptr), vdup_n_u8(0)); }
NPY_FINLINE void npyv_store_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) { vst1q_u8((uint8_t*)ptr, vec); }
NPY_FINLINE void npyv_storea_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) { vst1q_u8((uint8_t*)ptr, vec); }
NPY_FINLINE void npyv_stores_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) { vst1q_u8((uint8_t*)ptr, vec); }
NPY_FINLINE void npyv_storel_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) { vst1_u8((uint8_t*)ptr, vget_low_u8(vec)); }
NPY_FINLINE void npyv_storeh_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) { vst1_u8((uint8_t*)ptr, vget_high_u8(vec)); }
NPY_FINLINE npyv_s8 npyv_load_s8(const npyv_lanetype_s8 *ptr) { return vld1q_s8((const int8_t*)ptr); }
NPY_FINLINE npyv_s8 npyv_loada_s8(const npyv_lanetype_s8 *ptr) { return vld1q_s8((const int8_t*)ptr); }
NPY_FINLINE npyv_s8 npyv_loads_s8(const npyv_lanetype_s8 *ptr) { return vld1q_s8((const int8_t*)ptr); }
NPY_FINLINE npyv_s8 npyv_loadl_s8(const npyv_lanetype_s8 *ptr) { return vcombine_s8(vld1_s8((const int8_t*)ptr), vdup_n_s8(0)); }
NPY_FINLINE void npyv_store_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) { vst1q_s8((int8_t*)ptr, vec); }
NPY_FINLINE void npyv_storea_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) { vst1q_s8((int8_t*)ptr, vec); }
NPY_FINLINE void npyv_stores_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) { vst1q_s8((int8_t*)ptr, vec); }
NPY_FINLINE void npyv_storel_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) { vst1_s8((int8_t*)ptr, vget_low_s8(vec)); }
NPY_FINLINE void npyv_storeh_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) { vst1_s8((int8_t*)ptr, vget_high_s8(vec)); }
NPY_FINLINE npyv_u16 npyv_load_u16(const npyv_lanetype_u16 *ptr) { return vld1q_u16((const uint16_t*)ptr); }
NPY_FINLINE npyv_u16 npyv_loada_u16(const npyv_lanetype_u16 *ptr) { return vld1q_u16((const uint16_t*)ptr); }
NPY_FINLINE npyv_u16 npyv_loads_u16(const npyv_lanetype_u16 *ptr) { return vld1q_u16((const uint16_t*)ptr); }
NPY_FINLINE npyv_u16
npyv_loadl_u16(const npyv_lanetype_u16 *ptr) { return vcombine_u16( vld1_u16((const uint16_t*)ptr), vdup_n_u16(0) ); } NPY_FINLINE void npyv_store_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) { vst1q_u16((uint16_t*)ptr, vec); } NPY_FINLINE void npyv_storea_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) { vst1q_u16((uint16_t*)ptr, vec); } NPY_FINLINE void npyv_stores_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) { vst1q_u16((uint16_t*)ptr, vec); } NPY_FINLINE void npyv_storel_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) { vst1_u16((uint16_t*)ptr, vget_low_u16(vec)); } NPY_FINLINE void npyv_storeh_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) { vst1_u16((uint16_t*)ptr, vget_high_u16(vec)); } NPY_FINLINE npyv_s16 npyv_load_s16(const npyv_lanetype_s16 *ptr) { return vld1q_s16((const int16_t*)ptr); } NPY_FINLINE npyv_s16 npyv_loada_s16(const npyv_lanetype_s16 *ptr) { return vld1q_s16((const int16_t*)ptr); } NPY_FINLINE npyv_s16 npyv_loads_s16(const npyv_lanetype_s16 *ptr) { return vld1q_s16((const int16_t*)ptr); } NPY_FINLINE npyv_s16 npyv_loadl_s16(const npyv_lanetype_s16 *ptr) { return vcombine_s16( vld1_s16((const int16_t*)ptr), vdup_n_s16(0) ); } NPY_FINLINE void npyv_store_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) { vst1q_s16((int16_t*)ptr, vec); } NPY_FINLINE void npyv_storea_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) { vst1q_s16((int16_t*)ptr, vec); } NPY_FINLINE void npyv_stores_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) { vst1q_s16((int16_t*)ptr, vec); } NPY_FINLINE void npyv_storel_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) { vst1_s16((int16_t*)ptr, vget_low_s16(vec)); } NPY_FINLINE void npyv_storeh_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) { vst1_s16((int16_t*)ptr, vget_high_s16(vec)); } NPY_FINLINE npyv_u32 npyv_load_u32(const npyv_lanetype_u32 *ptr) { return vld1q_u32((const uint32_t*)ptr); } NPY_FINLINE npyv_u32 npyv_loada_u32(const npyv_lanetype_u32 *ptr) { return vld1q_u32((const uint32_t*)ptr); } NPY_FINLINE npyv_u32 npyv_loads_u32(const npyv_lanetype_u32 *ptr) { return vld1q_u32((const uint32_t*)ptr); } NPY_FINLINE npyv_u32 npyv_loadl_u32(const npyv_lanetype_u32 *ptr) { return vcombine_u32( vld1_u32((const uint32_t*)ptr), vdup_n_u32(0) ); } NPY_FINLINE void npyv_store_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) { vst1q_u32((uint32_t*)ptr, vec); } NPY_FINLINE void npyv_storea_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) { vst1q_u32((uint32_t*)ptr, vec); } NPY_FINLINE void npyv_stores_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) { vst1q_u32((uint32_t*)ptr, vec); } NPY_FINLINE void npyv_storel_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) { vst1_u32((uint32_t*)ptr, vget_low_u32(vec)); } NPY_FINLINE void npyv_storeh_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) { vst1_u32((uint32_t*)ptr, vget_high_u32(vec)); } NPY_FINLINE npyv_s32 npyv_load_s32(const npyv_lanetype_s32 *ptr) { return vld1q_s32((const int32_t*)ptr); } NPY_FINLINE npyv_s32 npyv_loada_s32(const npyv_lanetype_s32 *ptr) { return vld1q_s32((const int32_t*)ptr); } NPY_FINLINE npyv_s32 npyv_loads_s32(const npyv_lanetype_s32 *ptr) { return vld1q_s32((const int32_t*)ptr); } NPY_FINLINE npyv_s32 npyv_loadl_s32(const npyv_lanetype_s32 *ptr) { return vcombine_s32( vld1_s32((const int32_t*)ptr), vdup_n_s32(0) ); } NPY_FINLINE void npyv_store_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) { vst1q_s32((int32_t*)ptr, vec); } NPY_FINLINE void npyv_storea_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) { vst1q_s32((int32_t*)ptr, vec); } NPY_FINLINE void npyv_stores_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) { vst1q_s32((int32_t*)ptr, vec); } NPY_FINLINE void 
npyv_storel_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) { vst1_s32((int32_t*)ptr, vget_low_s32(vec)); } NPY_FINLINE void npyv_storeh_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) { vst1_s32((int32_t*)ptr, vget_high_s32(vec)); } NPY_FINLINE npyv_u64 npyv_load_u64(const npyv_lanetype_u64 *ptr) { return vld1q_u64((const uint64_t*)ptr); } NPY_FINLINE npyv_u64 npyv_loada_u64(const npyv_lanetype_u64 *ptr) { return vld1q_u64((const uint64_t*)ptr); } NPY_FINLINE npyv_u64 npyv_loads_u64(const npyv_lanetype_u64 *ptr) { return vld1q_u64((const uint64_t*)ptr); } NPY_FINLINE npyv_u64 npyv_loadl_u64(const npyv_lanetype_u64 *ptr) { return vcombine_u64( vld1_u64((const uint64_t*)ptr), vdup_n_u64(0) ); } NPY_FINLINE void npyv_store_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) { vst1q_u64((uint64_t*)ptr, vec); } NPY_FINLINE void npyv_storea_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) { vst1q_u64((uint64_t*)ptr, vec); } NPY_FINLINE void npyv_stores_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) { vst1q_u64((uint64_t*)ptr, vec); } NPY_FINLINE void npyv_storel_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) { vst1_u64((uint64_t*)ptr, vget_low_u64(vec)); } NPY_FINLINE void npyv_storeh_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) { vst1_u64((uint64_t*)ptr, vget_high_u64(vec)); } NPY_FINLINE npyv_s64 npyv_load_s64(const npyv_lanetype_s64 *ptr) { return vld1q_s64((const int64_t*)ptr); } NPY_FINLINE npyv_s64 npyv_loada_s64(const npyv_lanetype_s64 *ptr) { return vld1q_s64((const int64_t*)ptr); } NPY_FINLINE npyv_s64 npyv_loads_s64(const npyv_lanetype_s64 *ptr) { return vld1q_s64((const int64_t*)ptr); } NPY_FINLINE npyv_s64 npyv_loadl_s64(const npyv_lanetype_s64 *ptr) { return vcombine_s64( vld1_s64((const int64_t*)ptr), vdup_n_s64(0) ); } NPY_FINLINE void npyv_store_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) { vst1q_s64((int64_t*)ptr, vec); } NPY_FINLINE void npyv_storea_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) { vst1q_s64((int64_t*)ptr, vec); } NPY_FINLINE void npyv_stores_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) { vst1q_s64((int64_t*)ptr, vec); } NPY_FINLINE void npyv_storel_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) { vst1_s64((int64_t*)ptr, vget_low_s64(vec)); } NPY_FINLINE void npyv_storeh_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) { vst1_s64((int64_t*)ptr, vget_high_s64(vec)); } NPY_FINLINE npyv_f32 npyv_load_f32(const npyv_lanetype_f32 *ptr) { return vld1q_f32((const float*)ptr); } NPY_FINLINE npyv_f32 npyv_loada_f32(const npyv_lanetype_f32 *ptr) { return vld1q_f32((const float*)ptr); } NPY_FINLINE npyv_f32 npyv_loads_f32(const npyv_lanetype_f32 *ptr) { return vld1q_f32((const float*)ptr); } NPY_FINLINE npyv_f32 npyv_loadl_f32(const npyv_lanetype_f32 *ptr) { return vcombine_f32( vld1_f32((const float*)ptr), vdup_n_f32(0) ); } NPY_FINLINE void npyv_store_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) { vst1q_f32((float*)ptr, vec); } NPY_FINLINE void npyv_storea_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) { vst1q_f32((float*)ptr, vec); } NPY_FINLINE void npyv_stores_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) { vst1q_f32((float*)ptr, vec); } NPY_FINLINE void npyv_storel_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) { vst1_f32((float*)ptr, vget_low_f32(vec)); } NPY_FINLINE void npyv_storeh_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) { vst1_f32((float*)ptr, vget_high_f32(vec)); } NPY_FINLINE npyv_f64 npyv_load_f64(const npyv_lanetype_f64 *ptr) { return vld1q_f64((const double*)ptr); } NPY_FINLINE npyv_f64 npyv_loada_f64(const npyv_lanetype_f64 *ptr) { return vld1q_f64((const double*)ptr); } NPY_FINLINE npyv_f64 npyv_loads_f64(const npyv_lanetype_f64 
*ptr) { return vld1q_f64((const double*)ptr); } NPY_FINLINE npyv_f64 npyv_loadl_f64(const npyv_lanetype_f64 *ptr) { return vcombine_f64( vld1_f64((const double*)ptr), vdup_n_f64(0) ); } NPY_FINLINE void npyv_store_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) { vst1q_f64((double*)ptr, vec); } NPY_FINLINE void npyv_storea_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) { vst1q_f64((double*)ptr, vec); } NPY_FINLINE void npyv_stores_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) { vst1q_f64((double*)ptr, vec); } NPY_FINLINE void npyv_storel_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) { vst1_f64((double*)ptr, vget_low_f64(vec)); } NPY_FINLINE void npyv_storeh_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) { vst1_f64((double*)ptr, vget_high_f64(vec)); } NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) { int32x4_t a = vdupq_n_s32(0); a = vld1q_lane_s32((const int32_t*)ptr, a, 0); a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1); a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2); a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3); return a; } NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) { return vreinterpretq_u32_s32( npyv_loadn_s32((const npy_int32*)ptr, stride) ); } NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) { return vreinterpretq_f32_s32( npyv_loadn_s32((const npy_int32*)ptr, stride) ); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) { return vcombine_s64( vld1_s64((const int64_t*)ptr), vld1_s64((const int64_t*)ptr + stride) ); } NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) { return vreinterpretq_u64_s64( npyv_loadn_s64((const npy_int64*)ptr, stride) ); } NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) { return vreinterpretq_f64_s64( npyv_loadn_s64((const npy_int64*)ptr, stride) ); } NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) { return vcombine_u32( vld1_u32((const uint32_t*)ptr), vld1_u32((const uint32_t*)ptr + stride) ); } NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) { return vreinterpretq_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) { return vreinterpretq_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) { (void)stride; return npyv_load_u64(ptr); } NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) { (void)stride; return npyv_load_s64(ptr); } NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) { (void)stride; return npyv_load_f64(ptr); } NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) { vst1q_lane_s32((int32_t*)ptr, a, 0); vst1q_lane_s32((int32_t*)ptr + stride, a, 1); vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); } NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) { npyv_storen_s32((npy_int32*)ptr, stride, vreinterpretq_s32_u32(a)); } NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) { npyv_storen_s32((npy_int32*)ptr, stride, vreinterpretq_s32_f32(a)); } NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { vst1q_lane_s64((int64_t*)ptr, a, 0); vst1q_lane_s64((int64_t*)ptr + stride, a, 1); } NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) { 
npyv_storen_s64((npy_int64*)ptr, stride, vreinterpretq_s64_u64(a)); } NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) { npyv_storen_s64((npy_int64*)ptr, stride, vreinterpretq_s64_f64(a)); } NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) { vst1q_lane_u64((uint64_t*)ptr, vreinterpretq_u64_u32(a), 0); vst1q_lane_u64((uint64_t*)(ptr + stride), vreinterpretq_u64_u32(a), 1); } NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) { npyv_storen2_u32((npy_uint32*)ptr, stride, vreinterpretq_u32_s32(a)); } NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) { npyv_storen2_u32((npy_uint32*)ptr, stride, vreinterpretq_u32_f32(a)); } NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) { (void)stride; npyv_store_u64(ptr, a); } NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) { (void)stride; npyv_store_s64(ptr, a); } NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) { (void)stride; npyv_store_f64(ptr, a); } NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) { assert(nlane > 0); npyv_s32 a; switch(nlane) { case 1: a = vld1q_lane_s32((const int32_t*)ptr, vdupq_n_s32(fill), 0); break; case 2: a = vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(fill)); break; case 3: a = vcombine_s32( vld1_s32((const int32_t*)ptr), vld1_lane_s32((const int32_t*)ptr + 2, vdup_n_s32(fill), 0) ); break; default: return npyv_load_s32(ptr); } return a; } NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { return npyv_load_till_s32(ptr, nlane, 0); } NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); if (nlane == 1) { npyv_s64 a = vcombine_s64(vld1_s64((const int64_t*)ptr), vdup_n_s64(fill)); return a; } return npyv_load_s64(ptr); } NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { return npyv_load_till_s64(ptr, nlane, 0); } NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill_lo, npy_int32 fill_hi) { assert(nlane > 0); if (nlane == 1) { const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; npyv_s32 a = vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); return a; } return npyv_load_s32(ptr); } NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { return vreinterpretq_s32_s64(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); } NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill_lo, npy_int64 fill_hi) { (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { (void)nlane; return npyv_load_s64(ptr); } NPY_FINLINE npyv_s32 npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) { assert(nlane > 0); int32x4_t vfill = vdupq_n_s32(fill); switch(nlane) { case 3: vfill = vld1q_lane_s32((const int32_t*)ptr + stride*2, vfill, 2); case 2: vfill = vld1q_lane_s32((const int32_t*)ptr + stride, vfill, 1); case 1: vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0); break; default: return npyv_loadn_s32(ptr, stride); } return vfill; } NPY_FINLINE npyv_s32 npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s32(ptr, stride, nlane, 0); } NPY_FINLINE npyv_s64 
npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) { assert(nlane > 0); if (nlane == 1) { return npyv_load_till_s64(ptr, 1, fill); } return npyv_loadn_s64(ptr, stride); } NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill_lo, npy_int32 fill_hi) { assert(nlane > 0); if (nlane == 1) { const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; npyv_s32 a = vcombine_s32(vld1_s32((const int32_t*)ptr), vld1_s32(fill)); return a; } return npyv_loadn2_s32(ptr, stride); } NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) { assert(nlane > 0); if (nlane == 1) { npyv_s32 a = vcombine_s32(vld1_s32((const int32_t*)ptr), vdup_n_s32(0)); return a; } return npyv_loadn2_s32(ptr, stride); } NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill_lo, npy_int64 fill_hi) { assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); switch(nlane) { case 1: vst1q_lane_s32((int32_t*)ptr, a, 0); break; case 2: vst1_s32((int32_t*)ptr, vget_low_s32(a)); break; case 3: vst1_s32((int32_t*)ptr, vget_low_s32(a)); vst1q_lane_s32((int32_t*)ptr + 2, a, 2); break; default: npyv_store_s32(ptr, a); } } NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); if (nlane == 1) { vst1q_lane_s64((int64_t*)ptr, a, 0); return; } npyv_store_s64(ptr, a); } NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); if (nlane == 1) { vst1q_lane_s64((int64_t*)ptr, vreinterpretq_s64_s32(a), 0); return; } npyv_store_s32(ptr, a); } NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); (void)nlane; npyv_store_s64(ptr, a); } NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); vst1q_lane_s32((int32_t*)ptr, a, 0); switch(nlane) { case 1: return; case 2: vst1q_lane_s32((int32_t*)ptr + stride, a, 1); return; case 3: vst1q_lane_s32((int32_t*)ptr + stride, a, 1); vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); return; default: vst1q_lane_s32((int32_t*)ptr + stride, a, 1); vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); } } NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); if (nlane == 1) { vst1q_lane_s64((int64_t*)ptr, a, 0); return; } npyv_storen_s64(ptr, stride, a); } NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) { assert(nlane > 0); vst1q_lane_s64((int64_t*)ptr, vreinterpretq_s64_s32(a), 0); if (nlane > 1) { vst1q_lane_s64((int64_t*)(ptr + stride), vreinterpretq_s64_s32(a), 1); } } NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) { assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } # 506 "./memory.h" NPY_FINLINE 
npyv_u32 npyv_load_till_u32 (const npyv_lanetype_u32 *ptr, npy_uintp nlane, npyv_lanetype_u32 fill) { union { npyv_lanetype_u32 from_u32; npyv_lanetype_s32 to_s32; } pun; pun.from_u32 = fill; return vreinterpretq_u32_s32(npyv_load_till_s32( (const npyv_lanetype_s32 *)ptr, nlane, pun.to_s32 )); } NPY_FINLINE npyv_u32 npyv_loadn_till_u32 (const npyv_lanetype_u32 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_u32 fill) { union { npyv_lanetype_u32 from_u32; npyv_lanetype_s32 to_s32; } pun; pun.from_u32 = fill; return vreinterpretq_u32_s32(npyv_loadn_till_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane, pun.to_s32 )); } NPY_FINLINE npyv_u32 npyv_load_tillz_u32 (const npyv_lanetype_u32 *ptr, npy_uintp nlane) { return vreinterpretq_u32_s32(npyv_load_tillz_s32( (const npyv_lanetype_s32 *)ptr, nlane )); } NPY_FINLINE npyv_u32 npyv_loadn_tillz_u32 (const npyv_lanetype_u32 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_u32_s32(npyv_loadn_tillz_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store_till_u32 (npyv_lanetype_u32 *ptr, npy_uintp nlane, npyv_u32 a) { npyv_store_till_s32( (npyv_lanetype_s32 *)ptr, nlane, vreinterpretq_s32_u32(a) ); } NPY_FINLINE void npyv_storen_till_u32 (npyv_lanetype_u32 *ptr, npy_intp stride, npy_uintp nlane, npyv_u32 a) { npyv_storen_till_s32( (npyv_lanetype_s32 *)ptr, stride, nlane, vreinterpretq_s32_u32(a) ); } NPY_FINLINE npyv_f32 npyv_load_till_f32 (const npyv_lanetype_f32 *ptr, npy_uintp nlane, npyv_lanetype_f32 fill) { union { npyv_lanetype_f32 from_f32; npyv_lanetype_s32 to_s32; } pun; pun.from_f32 = fill; return vreinterpretq_f32_s32(npyv_load_till_s32( (const npyv_lanetype_s32 *)ptr, nlane, pun.to_s32 )); } NPY_FINLINE npyv_f32 npyv_loadn_till_f32 (const npyv_lanetype_f32 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_f32 fill) { union { npyv_lanetype_f32 from_f32; npyv_lanetype_s32 to_s32; } pun; pun.from_f32 = fill; return vreinterpretq_f32_s32(npyv_loadn_till_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane, pun.to_s32 )); } NPY_FINLINE npyv_f32 npyv_load_tillz_f32 (const npyv_lanetype_f32 *ptr, npy_uintp nlane) { return vreinterpretq_f32_s32(npyv_load_tillz_s32( (const npyv_lanetype_s32 *)ptr, nlane )); } NPY_FINLINE npyv_f32 npyv_loadn_tillz_f32 (const npyv_lanetype_f32 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_f32_s32(npyv_loadn_tillz_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store_till_f32 (npyv_lanetype_f32 *ptr, npy_uintp nlane, npyv_f32 a) { npyv_store_till_s32( (npyv_lanetype_s32 *)ptr, nlane, vreinterpretq_s32_f32(a) ); } NPY_FINLINE void npyv_storen_till_f32 (npyv_lanetype_f32 *ptr, npy_intp stride, npy_uintp nlane, npyv_f32 a) { npyv_storen_till_s32( (npyv_lanetype_s32 *)ptr, stride, nlane, vreinterpretq_s32_f32(a) ); } NPY_FINLINE npyv_u64 npyv_load_till_u64 (const npyv_lanetype_u64 *ptr, npy_uintp nlane, npyv_lanetype_u64 fill) { union { npyv_lanetype_u64 from_u64; npyv_lanetype_s64 to_s64; } pun; pun.from_u64 = fill; return vreinterpretq_u64_s64(npyv_load_till_s64( (const npyv_lanetype_s64 *)ptr, nlane, pun.to_s64 )); } NPY_FINLINE npyv_u64 npyv_loadn_till_u64 (const npyv_lanetype_u64 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_u64 fill) { union { npyv_lanetype_u64 from_u64; npyv_lanetype_s64 to_s64; } pun; pun.from_u64 = fill; return vreinterpretq_u64_s64(npyv_loadn_till_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane, pun.to_s64 )); } NPY_FINLINE npyv_u64 npyv_load_tillz_u64 (const npyv_lanetype_u64 *ptr, 
npy_uintp nlane) { return vreinterpretq_u64_s64(npyv_load_tillz_s64( (const npyv_lanetype_s64 *)ptr, nlane )); } NPY_FINLINE npyv_u64 npyv_loadn_tillz_u64 (const npyv_lanetype_u64 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_u64_s64(npyv_loadn_tillz_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store_till_u64 (npyv_lanetype_u64 *ptr, npy_uintp nlane, npyv_u64 a) { npyv_store_till_s64( (npyv_lanetype_s64 *)ptr, nlane, vreinterpretq_s64_u64(a) ); } NPY_FINLINE void npyv_storen_till_u64 (npyv_lanetype_u64 *ptr, npy_intp stride, npy_uintp nlane, npyv_u64 a) { npyv_storen_till_s64( (npyv_lanetype_s64 *)ptr, stride, nlane, vreinterpretq_s64_u64(a) ); } NPY_FINLINE npyv_f64 npyv_load_till_f64 (const npyv_lanetype_f64 *ptr, npy_uintp nlane, npyv_lanetype_f64 fill) { union { npyv_lanetype_f64 from_f64; npyv_lanetype_s64 to_s64; } pun; pun.from_f64 = fill; return vreinterpretq_f64_s64(npyv_load_till_s64( (const npyv_lanetype_s64 *)ptr, nlane, pun.to_s64 )); } NPY_FINLINE npyv_f64 npyv_loadn_till_f64 (const npyv_lanetype_f64 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_f64 fill) { union { npyv_lanetype_f64 from_f64; npyv_lanetype_s64 to_s64; } pun; pun.from_f64 = fill; return vreinterpretq_f64_s64(npyv_loadn_till_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane, pun.to_s64 )); } NPY_FINLINE npyv_f64 npyv_load_tillz_f64 (const npyv_lanetype_f64 *ptr, npy_uintp nlane) { return vreinterpretq_f64_s64(npyv_load_tillz_s64( (const npyv_lanetype_s64 *)ptr, nlane )); } NPY_FINLINE npyv_f64 npyv_loadn_tillz_f64 (const npyv_lanetype_f64 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_f64_s64(npyv_loadn_tillz_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store_till_f64 (npyv_lanetype_f64 *ptr, npy_uintp nlane, npyv_f64 a) { npyv_store_till_s64( (npyv_lanetype_s64 *)ptr, nlane, vreinterpretq_s64_f64(a) ); } NPY_FINLINE void npyv_storen_till_f64 (npyv_lanetype_f64 *ptr, npy_intp stride, npy_uintp nlane, npyv_f64 a) { npyv_storen_till_s64( (npyv_lanetype_s64 *)ptr, stride, nlane, vreinterpretq_s64_f64(a) ); } # 579 "./memory.h" NPY_FINLINE npyv_u32 npyv_load2_till_u32 (const npyv_lanetype_u32 *ptr, npy_uintp nlane, npyv_lanetype_u32 fill_lo, npyv_lanetype_u32 fill_hi) { union pun { npyv_lanetype_u32 from_u32; npyv_lanetype_s32 to_s32; }; union pun pun_lo; union pun pun_hi; pun_lo.from_u32 = fill_lo; pun_hi.from_u32 = fill_hi; return vreinterpretq_u32_s32(npyv_load2_till_s32( (const npyv_lanetype_s32 *)ptr, nlane, pun_lo.to_s32, pun_hi.to_s32 )); } NPY_FINLINE npyv_u32 npyv_loadn2_till_u32 (const npyv_lanetype_u32 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_u32 fill_lo, npyv_lanetype_u32 fill_hi) { union pun { npyv_lanetype_u32 from_u32; npyv_lanetype_s32 to_s32; }; union pun pun_lo; union pun pun_hi; pun_lo.from_u32 = fill_lo; pun_hi.from_u32 = fill_hi; return vreinterpretq_u32_s32(npyv_loadn2_till_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane, pun_lo.to_s32, pun_hi.to_s32 )); } NPY_FINLINE npyv_u32 npyv_load2_tillz_u32 (const npyv_lanetype_u32 *ptr, npy_uintp nlane) { return vreinterpretq_u32_s32(npyv_load2_tillz_s32( (const npyv_lanetype_s32 *)ptr, nlane )); } NPY_FINLINE npyv_u32 npyv_loadn2_tillz_u32 (const npyv_lanetype_u32 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_u32_s32(npyv_loadn2_tillz_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store2_till_u32 (npyv_lanetype_u32 *ptr, npy_uintp nlane, npyv_u32 a) { npyv_store2_till_s32( 
(npyv_lanetype_s32 *)ptr, nlane, vreinterpretq_s32_u32(a) ); } NPY_FINLINE void npyv_storen2_till_u32 (npyv_lanetype_u32 *ptr, npy_intp stride, npy_uintp nlane, npyv_u32 a) { npyv_storen2_till_s32( (npyv_lanetype_s32 *)ptr, stride, nlane, vreinterpretq_s32_u32(a) ); } NPY_FINLINE npyv_f32 npyv_load2_till_f32 (const npyv_lanetype_f32 *ptr, npy_uintp nlane, npyv_lanetype_f32 fill_lo, npyv_lanetype_f32 fill_hi) { union pun { npyv_lanetype_f32 from_f32; npyv_lanetype_s32 to_s32; }; union pun pun_lo; union pun pun_hi; pun_lo.from_f32 = fill_lo; pun_hi.from_f32 = fill_hi; return vreinterpretq_f32_s32(npyv_load2_till_s32( (const npyv_lanetype_s32 *)ptr, nlane, pun_lo.to_s32, pun_hi.to_s32 )); } NPY_FINLINE npyv_f32 npyv_loadn2_till_f32 (const npyv_lanetype_f32 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_f32 fill_lo, npyv_lanetype_f32 fill_hi) { union pun { npyv_lanetype_f32 from_f32; npyv_lanetype_s32 to_s32; }; union pun pun_lo; union pun pun_hi; pun_lo.from_f32 = fill_lo; pun_hi.from_f32 = fill_hi; return vreinterpretq_f32_s32(npyv_loadn2_till_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane, pun_lo.to_s32, pun_hi.to_s32 )); } NPY_FINLINE npyv_f32 npyv_load2_tillz_f32 (const npyv_lanetype_f32 *ptr, npy_uintp nlane) { return vreinterpretq_f32_s32(npyv_load2_tillz_s32( (const npyv_lanetype_s32 *)ptr, nlane )); } NPY_FINLINE npyv_f32 npyv_loadn2_tillz_f32 (const npyv_lanetype_f32 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_f32_s32(npyv_loadn2_tillz_s32( (const npyv_lanetype_s32 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store2_till_f32 (npyv_lanetype_f32 *ptr, npy_uintp nlane, npyv_f32 a) { npyv_store2_till_s32( (npyv_lanetype_s32 *)ptr, nlane, vreinterpretq_s32_f32(a) ); } NPY_FINLINE void npyv_storen2_till_f32 (npyv_lanetype_f32 *ptr, npy_intp stride, npy_uintp nlane, npyv_f32 a) { npyv_storen2_till_s32( (npyv_lanetype_s32 *)ptr, stride, nlane, vreinterpretq_s32_f32(a) ); } NPY_FINLINE npyv_u64 npyv_load2_till_u64 (const npyv_lanetype_u64 *ptr, npy_uintp nlane, npyv_lanetype_u64 fill_lo, npyv_lanetype_u64 fill_hi) { union pun { npyv_lanetype_u64 from_u64; npyv_lanetype_s64 to_s64; }; union pun pun_lo; union pun pun_hi; pun_lo.from_u64 = fill_lo; pun_hi.from_u64 = fill_hi; return vreinterpretq_u64_s64(npyv_load2_till_s64( (const npyv_lanetype_s64 *)ptr, nlane, pun_lo.to_s64, pun_hi.to_s64 )); } NPY_FINLINE npyv_u64 npyv_loadn2_till_u64 (const npyv_lanetype_u64 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_u64 fill_lo, npyv_lanetype_u64 fill_hi) { union pun { npyv_lanetype_u64 from_u64; npyv_lanetype_s64 to_s64; }; union pun pun_lo; union pun pun_hi; pun_lo.from_u64 = fill_lo; pun_hi.from_u64 = fill_hi; return vreinterpretq_u64_s64(npyv_loadn2_till_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane, pun_lo.to_s64, pun_hi.to_s64 )); } NPY_FINLINE npyv_u64 npyv_load2_tillz_u64 (const npyv_lanetype_u64 *ptr, npy_uintp nlane) { return vreinterpretq_u64_s64(npyv_load2_tillz_s64( (const npyv_lanetype_s64 *)ptr, nlane )); } NPY_FINLINE npyv_u64 npyv_loadn2_tillz_u64 (const npyv_lanetype_u64 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_u64_s64(npyv_loadn2_tillz_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store2_till_u64 (npyv_lanetype_u64 *ptr, npy_uintp nlane, npyv_u64 a) { npyv_store2_till_s64( (npyv_lanetype_s64 *)ptr, nlane, vreinterpretq_s64_u64(a) ); } NPY_FINLINE void npyv_storen2_till_u64 (npyv_lanetype_u64 *ptr, npy_intp stride, npy_uintp nlane, npyv_u64 a) { npyv_storen2_till_s64( (npyv_lanetype_s64 
*)ptr, stride, nlane, vreinterpretq_s64_u64(a) ); } NPY_FINLINE npyv_f64 npyv_load2_till_f64 (const npyv_lanetype_f64 *ptr, npy_uintp nlane, npyv_lanetype_f64 fill_lo, npyv_lanetype_f64 fill_hi) { union pun { npyv_lanetype_f64 from_f64; npyv_lanetype_s64 to_s64; }; union pun pun_lo; union pun pun_hi; pun_lo.from_f64 = fill_lo; pun_hi.from_f64 = fill_hi; return vreinterpretq_f64_s64(npyv_load2_till_s64( (const npyv_lanetype_s64 *)ptr, nlane, pun_lo.to_s64, pun_hi.to_s64 )); } NPY_FINLINE npyv_f64 npyv_loadn2_till_f64 (const npyv_lanetype_f64 *ptr, npy_intp stride, npy_uintp nlane, npyv_lanetype_f64 fill_lo, npyv_lanetype_f64 fill_hi) { union pun { npyv_lanetype_f64 from_f64; npyv_lanetype_s64 to_s64; }; union pun pun_lo; union pun pun_hi; pun_lo.from_f64 = fill_lo; pun_hi.from_f64 = fill_hi; return vreinterpretq_f64_s64(npyv_loadn2_till_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane, pun_lo.to_s64, pun_hi.to_s64 )); } NPY_FINLINE npyv_f64 npyv_load2_tillz_f64 (const npyv_lanetype_f64 *ptr, npy_uintp nlane) { return vreinterpretq_f64_s64(npyv_load2_tillz_s64( (const npyv_lanetype_s64 *)ptr, nlane )); } NPY_FINLINE npyv_f64 npyv_loadn2_tillz_f64 (const npyv_lanetype_f64 *ptr, npy_intp stride, npy_uintp nlane) { return vreinterpretq_f64_s64(npyv_loadn2_tillz_s64( (const npyv_lanetype_s64 *)ptr, stride, nlane )); } NPY_FINLINE void npyv_store2_till_f64 (npyv_lanetype_f64 *ptr, npy_uintp nlane, npyv_f64 a) { npyv_store2_till_s64( (npyv_lanetype_s64 *)ptr, nlane, vreinterpretq_s64_f64(a) ); } NPY_FINLINE void npyv_storen2_till_f64 (npyv_lanetype_f64 *ptr, npy_intp stride, npy_uintp nlane, npyv_f64 a) { npyv_storen2_till_s64( (npyv_lanetype_s64 *)ptr, stride, nlane, vreinterpretq_s64_f64(a) ); } # 602 "./memory.h" NPY_FINLINE npyv_u8x2 npyv_load_u8x2( const npyv_lanetype_u8 *ptr ) { return vld2q_u8((const uint8_t*)ptr); } NPY_FINLINE void npyv_store_u8x2( npyv_lanetype_u8 *ptr, npyv_u8x2 v ) { vst2q_u8((uint8_t*)ptr, v); } NPY_FINLINE npyv_s8x2 npyv_load_s8x2( const npyv_lanetype_s8 *ptr ) { return vld2q_s8((const int8_t*)ptr); } NPY_FINLINE void npyv_store_s8x2( npyv_lanetype_s8 *ptr, npyv_s8x2 v ) { vst2q_s8((int8_t*)ptr, v); } NPY_FINLINE npyv_u16x2 npyv_load_u16x2( const npyv_lanetype_u16 *ptr ) { return vld2q_u16((const uint16_t*)ptr); } NPY_FINLINE void npyv_store_u16x2( npyv_lanetype_u16 *ptr, npyv_u16x2 v ) { vst2q_u16((uint16_t*)ptr, v); } NPY_FINLINE npyv_s16x2 npyv_load_s16x2( const npyv_lanetype_s16 *ptr ) { return vld2q_s16((const int16_t*)ptr); } NPY_FINLINE void npyv_store_s16x2( npyv_lanetype_s16 *ptr, npyv_s16x2 v ) { vst2q_s16((int16_t*)ptr, v); } NPY_FINLINE npyv_u32x2 npyv_load_u32x2( const npyv_lanetype_u32 *ptr ) { return vld2q_u32((const uint32_t*)ptr); } NPY_FINLINE void npyv_store_u32x2( npyv_lanetype_u32 *ptr, npyv_u32x2 v ) { vst2q_u32((uint32_t*)ptr, v); } NPY_FINLINE npyv_s32x2 npyv_load_s32x2( const npyv_lanetype_s32 *ptr ) { return vld2q_s32((const int32_t*)ptr); } NPY_FINLINE void npyv_store_s32x2( npyv_lanetype_s32 *ptr, npyv_s32x2 v ) { vst2q_s32((int32_t*)ptr, v); } NPY_FINLINE npyv_f32x2 npyv_load_f32x2( const npyv_lanetype_f32 *ptr ) { return vld2q_f32((const float*)ptr); } NPY_FINLINE void npyv_store_f32x2( npyv_lanetype_f32 *ptr, npyv_f32x2 v ) { vst2q_f32((float*)ptr, v); } NPY_FINLINE npyv_f64x2 npyv_load_f64x2( const npyv_lanetype_f64 *ptr ) { return vld2q_f64((const double*)ptr); } NPY_FINLINE void npyv_store_f64x2( npyv_lanetype_f64 *ptr, npyv_f64x2 v ) { vst2q_f64((double*)ptr, v); } NPY_FINLINE npyv_u64x2 npyv_load_u64x2( const npyv_lanetype_u64 
*ptr ) { return vld2q_u64((const uint64_t*)ptr); } NPY_FINLINE void npyv_store_u64x2( npyv_lanetype_u64 *ptr, npyv_u64x2 v ) { vst2q_u64((uint64_t*)ptr, v); } NPY_FINLINE npyv_s64x2 npyv_load_s64x2( const npyv_lanetype_s64 *ptr ) { return vld2q_s64((const int64_t*)ptr); } NPY_FINLINE void npyv_store_s64x2( npyv_lanetype_s64 *ptr, npyv_s64x2 v ) { vst2q_s64((int64_t*)ptr, v); } # 642 "./memory.h" NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) { const unsigned i0 = vgetq_lane_u32(idx, 0); const unsigned i1 = vgetq_lane_u32(idx, 1); const unsigned i2 = vgetq_lane_u32(idx, 2); const unsigned i3 = vgetq_lane_u32(idx, 3); uint32x2_t low = vcreate_u32(table[i0]); low = vld1_lane_u32((const uint32_t*)table + i1, low, 1); uint32x2_t high = vcreate_u32(table[i2]); high = vld1_lane_u32((const uint32_t*)table + i3, high, 1); return vcombine_u32(low, high); } NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) { return vreinterpretq_s32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) { return vreinterpretq_f32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) { const unsigned i0 = vgetq_lane_u32(vreinterpretq_u32_u64(idx), 0); const unsigned i1 = vgetq_lane_u32(vreinterpretq_u32_u64(idx), 2); return vcombine_u64( vld1_u64((const uint64_t*)table + i0), vld1_u64((const uint64_t*)table + i1) ); } NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) { return vreinterpretq_s64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) { return vreinterpretq_f64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } # 77 "./neon.h" 2 # 1 "./misc.h" 1 # 78 "./neon.h" 2 # 1 "./reorder.h" 1 # 64 "./reorder.h" NPY_FINLINE npyv_u8x2 npyv_combine_u8(npyv_u8 a, npyv_u8 b) { npyv_u8x2 r; r.val[0] = NPY_CAT(npyv_combinel_, u8)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, u8)(a, b); return r; } NPY_FINLINE npyv_s8x2 npyv_combine_s8(npyv_s8 a, npyv_s8 b) { npyv_s8x2 r; r.val[0] = NPY_CAT(npyv_combinel_, s8)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, s8)(a, b); return r; } NPY_FINLINE npyv_u16x2 npyv_combine_u16(npyv_u16 a, npyv_u16 b) { npyv_u16x2 r; r.val[0] = NPY_CAT(npyv_combinel_, u16)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, u16)(a, b); return r; } NPY_FINLINE npyv_s16x2 npyv_combine_s16(npyv_s16 a, npyv_s16 b) { npyv_s16x2 r; r.val[0] = NPY_CAT(npyv_combinel_, s16)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, s16)(a, b); return r; } NPY_FINLINE npyv_u32x2 npyv_combine_u32(npyv_u32 a, npyv_u32 b) { npyv_u32x2 r; r.val[0] = NPY_CAT(npyv_combinel_, u32)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, u32)(a, b); return r; } NPY_FINLINE npyv_s32x2 npyv_combine_s32(npyv_s32 a, npyv_s32 b) { npyv_s32x2 r; r.val[0] = NPY_CAT(npyv_combinel_, s32)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, s32)(a, b); return r; } NPY_FINLINE npyv_u64x2 npyv_combine_u64(npyv_u64 a, npyv_u64 b) { npyv_u64x2 r; r.val[0] = NPY_CAT(npyv_combinel_, u64)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, u64)(a, b); return r; } NPY_FINLINE npyv_s64x2 npyv_combine_s64(npyv_s64 a, npyv_s64 b) { npyv_s64x2 r; r.val[0] = NPY_CAT(npyv_combinel_, s64)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, s64)(a, b); return r; } NPY_FINLINE npyv_f32x2 npyv_combine_f32(npyv_f32 a, npyv_f32 b) { npyv_f32x2 r; r.val[0] = NPY_CAT(npyv_combinel_, f32)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, f32)(a, b); 
return r; } NPY_FINLINE npyv_f64x2 npyv_combine_f64(npyv_f64 a, npyv_f64 b) { npyv_f64x2 r; r.val[0] = NPY_CAT(npyv_combinel_, f64)(a, b); r.val[1] = NPY_CAT(npyv_combineh_, f64)(a, b); return r; } # 102 "./reorder.h" NPY_FINLINE npyv_u8x2 npyv_zip_u8(npyv_u8 a, npyv_u8 b) { npyv_u8x2 r; r.val[0] = vzip1q_u8(a, b); r.val[1] = vzip2q_u8(a, b); return r; } NPY_FINLINE npyv_u8x2 npyv_unzip_u8(npyv_u8 a, npyv_u8 b) { npyv_u8x2 r; r.val[0] = vuzp1q_u8(a, b); r.val[1] = vuzp2q_u8(a, b); return r; } NPY_FINLINE npyv_s8x2 npyv_zip_s8(npyv_s8 a, npyv_s8 b) { npyv_s8x2 r; r.val[0] = vzip1q_s8(a, b); r.val[1] = vzip2q_s8(a, b); return r; } NPY_FINLINE npyv_s8x2 npyv_unzip_s8(npyv_s8 a, npyv_s8 b) { npyv_s8x2 r; r.val[0] = vuzp1q_s8(a, b); r.val[1] = vuzp2q_s8(a, b); return r; } NPY_FINLINE npyv_u16x2 npyv_zip_u16(npyv_u16 a, npyv_u16 b) { npyv_u16x2 r; r.val[0] = vzip1q_u16(a, b); r.val[1] = vzip2q_u16(a, b); return r; } NPY_FINLINE npyv_u16x2 npyv_unzip_u16(npyv_u16 a, npyv_u16 b) { npyv_u16x2 r; r.val[0] = vuzp1q_u16(a, b); r.val[1] = vuzp2q_u16(a, b); return r; } NPY_FINLINE npyv_s16x2 npyv_zip_s16(npyv_s16 a, npyv_s16 b) { npyv_s16x2 r; r.val[0] = vzip1q_s16(a, b); r.val[1] = vzip2q_s16(a, b); return r; } NPY_FINLINE npyv_s16x2 npyv_unzip_s16(npyv_s16 a, npyv_s16 b) { npyv_s16x2 r; r.val[0] = vuzp1q_s16(a, b); r.val[1] = vuzp2q_s16(a, b); return r; } NPY_FINLINE npyv_u32x2 npyv_zip_u32(npyv_u32 a, npyv_u32 b) { npyv_u32x2 r; r.val[0] = vzip1q_u32(a, b); r.val[1] = vzip2q_u32(a, b); return r; } NPY_FINLINE npyv_u32x2 npyv_unzip_u32(npyv_u32 a, npyv_u32 b) { npyv_u32x2 r; r.val[0] = vuzp1q_u32(a, b); r.val[1] = vuzp2q_u32(a, b); return r; } NPY_FINLINE npyv_s32x2 npyv_zip_s32(npyv_s32 a, npyv_s32 b) { npyv_s32x2 r; r.val[0] = vzip1q_s32(a, b); r.val[1] = vzip2q_s32(a, b); return r; } NPY_FINLINE npyv_s32x2 npyv_unzip_s32(npyv_s32 a, npyv_s32 b) { npyv_s32x2 r; r.val[0] = vuzp1q_s32(a, b); r.val[1] = vuzp2q_s32(a, b); return r; } NPY_FINLINE npyv_f32x2 npyv_zip_f32(npyv_f32 a, npyv_f32 b) { npyv_f32x2 r; r.val[0] = vzip1q_f32(a, b); r.val[1] = vzip2q_f32(a, b); return r; } NPY_FINLINE npyv_f32x2 npyv_unzip_f32(npyv_f32 a, npyv_f32 b) { npyv_f32x2 r; r.val[0] = vuzp1q_f32(a, b); r.val[1] = vuzp2q_f32(a, b); return r; } # 79 "./neon.h" 2 # 1 "./operators.h" 1 # 242 "./operators.h" NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) { # 258 "./operators.h" return vceqq_f32(a, a); } NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) { return vceqq_f64(a, a); } # 283 "./operators.h" NPY_FINLINE bool npyv_any_b8(npyv_b8 a) { return vmaxvq_u8(a) != 0; } NPY_FINLINE bool npyv_all_b8(npyv_b8 a) { return vminvq_u8(a) != 0; } NPY_FINLINE bool npyv_any_b16(npyv_b16 a) { return vmaxvq_u16(a) != 0; } NPY_FINLINE bool npyv_all_b16(npyv_b16 a) { return vminvq_u16(a) != 0; } NPY_FINLINE bool npyv_any_b32(npyv_b32 a) { return vmaxvq_u32(a) != 0; } NPY_FINLINE bool npyv_all_b32(npyv_b32 a) { return vminvq_u32(a) != 0; } NPY_FINLINE bool npyv_any_u8(npyv_u8 a) { return npyv_any_b8(a); } NPY_FINLINE bool npyv_all_u8(npyv_u8 a) { return npyv_all_b8(a); } NPY_FINLINE bool npyv_any_s8(npyv_s8 a) { return npyv_any_b8(vreinterpretq_u8_s8(a)); } NPY_FINLINE bool npyv_all_s8(npyv_s8 a) { return npyv_all_b8(vreinterpretq_u8_s8(a)); } NPY_FINLINE bool npyv_any_u16(npyv_u16 a) { return npyv_any_b16(a); } NPY_FINLINE bool npyv_all_u16(npyv_u16 a) { return npyv_all_b16(a); } NPY_FINLINE bool npyv_any_s16(npyv_s16 a) { return npyv_any_b16(vreinterpretq_u16_s16(a)); } NPY_FINLINE bool npyv_all_s16(npyv_s16 a) { return 
npyv_all_b16(vreinterpretq_u16_s16(a)); }
NPY_FINLINE bool npyv_any_u32(npyv_u32 a) { return npyv_any_b32(a); }
NPY_FINLINE bool npyv_all_u32(npyv_u32 a) { return npyv_all_b32(a); }
NPY_FINLINE bool npyv_any_s32(npyv_s32 a) { return npyv_any_b32(vreinterpretq_u32_s32(a)); }
NPY_FINLINE bool npyv_all_s32(npyv_s32 a) { return npyv_all_b32(vreinterpretq_u32_s32(a)); }
NPY_FINLINE bool npyv_any_b64(npyv_b64 a) { return vmaxvq_u32(vreinterpretq_u32_u64(a)) != 0; }
NPY_FINLINE bool npyv_all_b64(npyv_b64 a) { return vminvq_u32(vreinterpretq_u32_u64(a)) != 0; }
NPY_FINLINE bool npyv_all_u64(npyv_u64 a)
{
    uint32x4_t a32 = vreinterpretq_u32_u64(a);
    a32 = vorrq_u32(a32, vrev64q_u32(a32));
    return vminvq_u32(a32) != 0;
}
NPY_FINLINE bool npyv_any_s64(npyv_s64 a) { return npyv_any_b64(vreinterpretq_u64_s64(a)); }
NPY_FINLINE bool npyv_all_s64(npyv_s64 a) { return npyv_all_u64(vreinterpretq_u64_s64(a)); }
NPY_FINLINE bool npyv_any_f32(npyv_f32 a) { return !npyv_all_b32(vceqq_f32(a, vdupq_n_f32(0.0f))); }
NPY_FINLINE bool npyv_all_f32(npyv_f32 a) { return !npyv_any_b32(vceqq_f32(a, vdupq_n_f32(0.0f))); }
NPY_FINLINE bool npyv_any_f64(npyv_f64 a) { return !npyv_all_b64(vceqq_f64(a, vdupq_n_f64(0.0))); }
NPY_FINLINE bool npyv_all_f64(npyv_f64 a) { return !npyv_any_b64(vceqq_f64(a, vdupq_n_f64(0.0))); }

/* ./conversion.h */
NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a)
{
    const npyv_u8 scale = ((uint8x16_t){NPYV__SET_FILL_16(uint8_t, 0, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128)});
    npyv_u8 seq_scale = vandq_u8(a, scale);
    const npyv_u8 byteOrder = {0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15};
    npyv_u8 v0 = vqtbl1q_u8(seq_scale, byteOrder);
    return vaddlvq_u16(vreinterpretq_u16_u8(v0));
}
NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a)
{
    const npyv_u16 scale = ((uint16x8_t){NPYV__SET_FILL_8(uint16_t, 0, 1, 2, 4, 8, 16, 32, 64, 128)});
    npyv_u16 seq_scale = vandq_u16(a, scale);
    return vaddvq_u16(seq_scale);
}
NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a)
{
    const npyv_u32 scale = ((uint32x4_t){NPYV__SET_FILL_4(uint32_t, 0, 1, 2, 4, 8)});
    npyv_u32 seq_scale = vandq_u32(a, scale);
    return vaddvq_u32(seq_scale);
}
NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a)
{
    uint64_t lo = vgetq_lane_u64(a, 0);
    uint64_t hi = vgetq_lane_u64(a, 1);
    return ((hi & 0x2) | (lo & 0x1));
}
NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data)
{
    npyv_u16x2 r;
    r.val[0] = vmovl_u8(vget_low_u8(data));
    r.val[1] = vmovl_u8(vget_high_u8(data));
    return r;
}
NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data)
{
    npyv_u32x2 r;
    r.val[0] = vmovl_u16(vget_low_u16(data));
    r.val[1] = vmovl_u16(vget_high_u16(data));
    return r;
}
NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b)
{ return vuzp1q_u8((uint8x16_t)a, (uint8x16_t)b); }
NPY_FINLINE npyv_b8 npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d)
{
    npyv_b16 ab = vuzp1q_u16((uint16x8_t)a, (uint16x8_t)b);
    npyv_b16 cd = vuzp1q_u16((uint16x8_t)c, (uint16x8_t)d);
    return npyv_pack_b8_b16(ab, cd);
}
NPY_FINLINE npyv_b8 npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d,
                                     npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h)
{
    npyv_b32 ab = vuzp1q_u32((uint32x4_t)a, (uint32x4_t)b);
    npyv_b32 cd = vuzp1q_u32((uint32x4_t)c, (uint32x4_t)d);
    npyv_b32 ef = vuzp1q_u32((uint32x4_t)e, (uint32x4_t)f);
    npyv_u32 gh = vuzp1q_u32((uint32x4_t)g, (uint32x4_t)h);
    return npyv_pack_b8_b32(ab, cd, ef, gh);
}
NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b)
{
    npyv_s64 lo = vcvtnq_s64_f64(a), hi = vcvtnq_s64_f64(b);
    return vcombine_s32(vmovn_s64(lo), vmovn_s64(hi));
}
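/*
 * Added commentary (not part of the original header): the npyv_tobits_b*
 * helpers above emulate an SSE-style "movemask": each lane of the boolean
 * vector contributes one bit of the returned integer (bit i == lane i).
 * A minimal usage sketch, with a hypothetical caller:
 *
 *     npyv_b32   gt   = vcgtq_f32(x, vdupq_n_f32(0.0f)); // lanes are all-ones/all-zeros
 *     npy_uint64 bits = npyv_tobits_b32(gt);             // 4-bit lane mask, bit i = lane i
 */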
/* ./arithmetic.h */
NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor)
{
    const uint8x8_t mulc_lo = vget_low_u8(divisor.val[0]);
    uint16x8_t mull_lo = vmull_u8(vget_low_u8(a), mulc_lo);
    uint16x8_t mull_hi = vmull_high_u8(a, divisor.val[0]);
    uint8x16_t mulhi = vuzp2q_u8(vreinterpretq_u8_u16(mull_lo), vreinterpretq_u8_u16(mull_hi));
    uint8x16_t q = vsubq_u8(a, mulhi);
    q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[1]));
    q = vaddq_u8(mulhi, q);
    q = vshlq_u8(q, vreinterpretq_s8_u8(divisor.val[2]));
    return q;
}
NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor)
{
    const int8x8_t mulc_lo = vget_low_s8(divisor.val[0]);
    int16x8_t mull_lo = vmull_s8(vget_low_s8(a), mulc_lo);
    int16x8_t mull_hi = vmull_high_s8(a, divisor.val[0]);
    int8x16_t mulhi = vuzp2q_s8(vreinterpretq_s8_s16(mull_lo), vreinterpretq_s8_s16(mull_hi));
    int8x16_t q = vshlq_s8(vaddq_s8(a, mulhi), divisor.val[1]);
    q = vsubq_s8(q, vshrq_n_s8(a, 7));
    q = vsubq_s8(veorq_s8(q, divisor.val[2]), divisor.val[2]);
    return q;
}
NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor)
{
    const uint16x4_t mulc_lo = vget_low_u16(divisor.val[0]);
    uint32x4_t mull_lo = vmull_u16(vget_low_u16(a), mulc_lo);
    uint32x4_t mull_hi = vmull_high_u16(a, divisor.val[0]);
    uint16x8_t mulhi = vuzp2q_u16(vreinterpretq_u16_u32(mull_lo), vreinterpretq_u16_u32(mull_hi));
    uint16x8_t q = vsubq_u16(a, mulhi);
    q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[1]));
    q = vaddq_u16(mulhi, q);
    q = vshlq_u16(q, vreinterpretq_s16_u16(divisor.val[2]));
    return q;
}
NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor)
{
    const int16x4_t mulc_lo = vget_low_s16(divisor.val[0]);
    int32x4_t mull_lo = vmull_s16(vget_low_s16(a), mulc_lo);
    int32x4_t mull_hi = vmull_high_s16(a, divisor.val[0]);
    int16x8_t mulhi = vuzp2q_s16(vreinterpretq_s16_s32(mull_lo), vreinterpretq_s16_s32(mull_hi));
    int16x8_t q = vshlq_s16(vaddq_s16(a, mulhi), divisor.val[1]);
    q = vsubq_s16(q, vshrq_n_s16(a, 15));
    q = vsubq_s16(veorq_s16(q, divisor.val[2]), divisor.val[2]);
    return q;
}
NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor)
{
    const uint32x2_t mulc_lo = vget_low_u32(divisor.val[0]);
    uint64x2_t mull_lo = vmull_u32(vget_low_u32(a), mulc_lo);
    uint64x2_t mull_hi = vmull_high_u32(a, divisor.val[0]);
    uint32x4_t mulhi = vuzp2q_u32(vreinterpretq_u32_u64(mull_lo), vreinterpretq_u32_u64(mull_hi));
    uint32x4_t q = vsubq_u32(a, mulhi);
    q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[1]));
    q = vaddq_u32(mulhi, q);
    q = vshlq_u32(q, vreinterpretq_s32_u32(divisor.val[2]));
    return q;
}
NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor)
{
    const int32x2_t mulc_lo = vget_low_s32(divisor.val[0]);
    int64x2_t mull_lo = vmull_s32(vget_low_s32(a), mulc_lo);
    int64x2_t mull_hi = vmull_high_s32(a, divisor.val[0]);
    int32x4_t mulhi = vuzp2q_s32(vreinterpretq_s32_s64(mull_lo), vreinterpretq_s32_s64(mull_hi));
    int32x4_t q = vshlq_s32(vaddq_s32(a, mulhi), divisor.val[1]);
    q = vsubq_s32(q, vshrq_n_s32(a, 31));
    q = vsubq_s32(veorq_s32(q, divisor.val[2]), divisor.val[2]);
    return q;
}
NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor)
{
    const uint64_t d = vgetq_lane_u64(divisor.val[0], 0);
    return ((uint64x2_t){NPYV__SET_FILL_2(uint64_t, 0, vgetq_lane_u64(a, 0) / d, vgetq_lane_u64(a, 1) / d)});
}
NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor)
{
    const int64_t d = vgetq_lane_s64(divisor.val[0], 0);
    return ((int64x2_t){NPYV__SET_FILL_2(int64_t, 0, vgetq_lane_s64(a, 0) / d, vgetq_lane_s64(a, 1) / d)});
}
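/*
 * Added commentary (not part of the original header): npyv_divc_* divide each
 * lane by a runtime-constant divisor via the multiply-by-reciprocal trick:
 * divisor.val[0] holds a precomputed "magic" multiplier, and val[1]/val[2] hold
 * shift (and, for signed types, sign-correction) constants. The three-vector
 * divisor is expected to be prepared once outside this excerpt; NumPy's
 * universal intrinsics provide a separate npyv_divisor_* helper for that, which
 * is not included in this dump. Sketch of the intended pattern, assuming such
 * a helper exists:
 *
 *     npyv_u32x3 d = npyv_divisor_u32(7);   // precompute once per divisor
 *     npyv_u32   q = npyv_divc_u32(v, d);   // q[i] = v[i] / 7 for every lane
 */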
// multiply-accumulate family: muladd = a*b + c, mulsub = a*b - c,
// nmuladd = -(a*b) + c, nmulsub = -(a*b) - c
NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { return vmlaq_f32(c, a, b); }
NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { return vmlaq_f32(vnegq_f32(c), a, b); }
NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { return vmlsq_f32(c, a, b); }
NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) { return vmlsq_f32(vnegq_f32(c), a, b); }
// multiply, then alternately subtract/add c: even lanes a*b - c, odd lanes a*b + c
NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c)
{
    const npyv_f32 msign = ((float32x4_t){NPYV__SET_FILL_4(float, 0, -0.0f, 0.0f, -0.0f, 0.0f)});
    return npyv_muladd_f32(a, b, vreinterpretq_f32_u8(veorq_u8(vreinterpretq_u8_f32(msign), vreinterpretq_u8_f32(c))));
}
NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmaq_f64(c, a, b); }
NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmaq_f64(vnegq_f64(c), a, b); }
NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmsq_f64(c, a, b); }
NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) { return vfmsq_f64(vnegq_f64(c), a, b); }
NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c)
{
    const npyv_f64 msign = ((float64x2_t){NPYV__SET_FILL_2(double, 0, -0.0, 0.0)});
    return npyv_muladd_f64(a, b, vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(msign), vreinterpretq_u8_f64(c))));
}

/* ./math.h */
NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) { return vmulq_f32(a, a); }
NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) { return vmulq_f64(a, a); }
NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) { const npyv_f32 one = vdupq_n_f32(1.0f); return vdivq_f32(one, a); }
NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) { const npyv_f64 one = vdupq_n_f64(1.0); return vdivq_f64(one, a); }
// max/min that keep the non-NaN operand when one input is NaN
NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b)
{
    npyv_u32 nn_a = vceqq_f32(a, a);
    npyv_u32 nn_b = vceqq_f32(b, b);
    return vmaxq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a));
}
NPY_FINLINE npyv_u64 npyv_max_u64(npyv_u64 a, npyv_u64 b) { return vbslq_u64(vcgtq_u64(a, b), a, b); }
NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) { return vbslq_s64(vcgtq_s64(a, b), a, b); }
NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b)
{
    npyv_u32 nn_a = vceqq_f32(a, a);
    npyv_u32 nn_b = vceqq_f32(b, b);
    return vminq_f32(vbslq_f32(nn_a, a, b), vbslq_f32(nn_b, b, a));
}
NPY_FINLINE npyv_u64 npyv_min_u64(npyv_u64 a, npyv_u64 b) { return vbslq_u64(vcgtq_u64(b, a), a, b); }
NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) { return vbslq_s64(vcgtq_s64(b, a), a, b); }
NPY_FINLINE npy_uint64 npyv_reduce_max_u64(npyv_u64 a)
{
    npy_uint64 al = (npy_uint64)vget_low_u64(a);
    npy_uint64 ah = (npy_uint64)vget_high_u64(a);
    return al > ah ? al : ah;
}
NPY_FINLINE npy_int64 npyv_reduce_max_s64(npyv_s64 a)
{
    npy_int64 al = (npy_int64)vget_low_s64(a);
    npy_int64 ah = (npy_int64)vget_high_s64(a);
    return al > ah ? al : ah;
}
NPY_FINLINE npy_uint64 npyv_reduce_min_u64(npyv_u64 a)
{
    npy_uint64 al = (npy_uint64)vget_low_u64(a);
    npy_uint64 ah = (npy_uint64)vget_high_u64(a);
    return al < ah ? al : ah;
}
NPY_FINLINE npy_int64 npyv_reduce_min_s64(npyv_s64 a)
{
    npy_int64 al = (npy_int64)vget_low_s64(a);
    npy_int64 ah = (npy_int64)vget_high_s64(a);
    return al < ah ? al : ah;
}
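/*
 * Added commentary (not part of the original header): vmaxq_f32/vminq_f32
 * return NaN whenever either operand is NaN, so npyv_maxp_f32/npyv_minp_f32
 * first substitute the other operand for any NaN lane (using the
 * vceqq_f32(x, x) "not-NaN" masks) before taking the max/min. AArch64 NEON has
 * no 64-bit integer min/max instructions and no 64-bit horizontal reductions,
 * hence the compare-and-select implementations and the scalar two-lane
 * reductions above.
 */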
NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a)
{
    const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
    const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
    const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0);
    const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f);
    npyv_u32 nnan_mask = vceqq_f32(a, a);
    npyv_f32 abs_x = vabsq_f32(vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a))));
    npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
    round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
    npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
    mask = vandq_u32(mask, nnan_mask);
    return vbslq_f32(mask, round, a);
}
NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a)
{
    const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
    const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
    const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
    const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0);
    const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f);
    npyv_u32 nnan_mask = vceqq_f32(a, a);
    npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
    npyv_f32 abs_x = vabsq_f32(x);
    npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
    round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
    npyv_f32 ceil = vaddq_f32(round, vreinterpretq_f32_u32(vandq_u32(vcltq_f32(round, x), one)));
    ceil = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(ceil), sign_mask));
    npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
    mask = vandq_u32(mask, nnan_mask);
    return vbslq_f32(mask, ceil, a);
}
NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a)
{
    const npyv_s32 max_int = vdupq_n_s32(0x7fffffff);
    const npyv_u32 exp_mask = vdupq_n_u32(0xff000000);
    const npyv_s32 szero = vreinterpretq_s32_f32(vdupq_n_f32(-0.0f));
    const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_s32(szero));
    npyv_u32 nfinite_mask = vshlq_n_u32(vreinterpretq_u32_f32(a), 1);
    nfinite_mask = vandq_u32(nfinite_mask, exp_mask);
    nfinite_mask = vceqq_u32(nfinite_mask, exp_mask);
    npyv_f32 x = vreinterpretq_f32_u32(veorq_u32(nfinite_mask, vreinterpretq_u32_f32(a)));
    npyv_s32 trunci = vcvtq_s32_f32(x);
    npyv_f32 trunc = vcvtq_f32_s32(trunci);
    trunc = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(trunc), sign_mask));
    npyv_u32 overflow_mask = vorrq_u32(vceqq_s32(trunci, szero), vceqq_s32(trunci, max_int));
    return vbslq_f32(vorrq_u32(nfinite_mask, overflow_mask), a, trunc);
}
NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a)
{
    const npyv_u32 one = vreinterpretq_u32_f32(vdupq_n_f32(1.0f));
    const npyv_u32 szero = vreinterpretq_u32_f32(vdupq_n_f32(-0.0f));
    const npyv_u32 sign_mask = vandq_u32(vreinterpretq_u32_f32(a), szero);
    const npyv_f32 two_power_23 = vdupq_n_f32(8388608.0);
    const npyv_f32 two_power_23h = vdupq_n_f32(12582912.0f);
    npyv_u32 nnan_mask = vceqq_f32(a, a);
    npyv_f32 x = vreinterpretq_f32_u32(vandq_u32(nnan_mask, vreinterpretq_u32_f32(a)));
    npyv_f32 abs_x = vabsq_f32(x);
    npyv_f32 round = vsubq_f32(vaddq_f32(two_power_23h, abs_x), two_power_23h);
    round = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(round), sign_mask));
    npyv_f32 floor = vsubq_f32(round, vreinterpretq_f32_u32(vandq_u32(vcgtq_f32(round, x), one)));
    floor = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(floor), sign_mask));
    npyv_u32 mask = vcleq_f32(abs_x, two_power_23);
    mask = vandq_u32(mask, nnan_mask);
    return vbslq_f32(mask, floor, a);
}
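/*
 * Added commentary (not part of the original header): the rounding helpers
 * above use the classic 2^23 trick. For |x| < 2^23 every float rounded to an
 * integer is exactly representable, so computing (1.5*2^23 + |x|) - 1.5*2^23
 * (12582912.0f is 1.5*2^23) leaves |x| rounded to an integer in the current FP
 * rounding mode (round-to-nearest-even by default). The original sign is then
 * re-attached via sign_mask so negative values and -0.0 keep their sign,
 * ceil/floor adjust the result by +/-1 where the rounded value undershoots or
 * overshoots x, and lanes that are NaN or already >= 2^23 (hence already
 * integral) are passed through unchanged by the final vbslq_f32 select.
 */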
/* end of included SIMD sub-headers; control returns to ./neon.h */
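/*
 * Added usage sketch (not part of the original header; the loop structure and
 * the name example_fma_f32 are illustrative only): a typical contiguous kernel
 * built from the wrappers above, computing y[i] = a[i] * b[i] + c[i] with the
 * tail handled by the partial load/store helpers.
 *
 *     void example_fma_f32(float *y, const float *a, const float *b,
 *                          const float *c, npy_intp len)
 *     {
 *         npy_intp i = 0;
 *         for (; i + 4 <= len; i += 4) {            // 4 = f32 lanes per 128-bit NEON register
 *             npyv_f32 r = npyv_muladd_f32(npyv_load_f32(a + i),
 *                                          npyv_load_f32(b + i),
 *                                          npyv_load_f32(c + i));
 *             npyv_store_f32(y + i, r);
 *         }
 *         if (i < len) {                            // tail: partial load with zero fill, partial store
 *             npy_uintp n = (npy_uintp)(len - i);
 *             npyv_f32 r = npyv_muladd_f32(npyv_load_tillz_f32(a + i, n),
 *                                          npyv_load_tillz_f32(b + i, n),
 *                                          npyv_load_tillz_f32(c + i, n));
 *             npyv_store_till_f32(y + i, n, r);
 *         }
 *     }
 */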