diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index d05f5cd9ba7f95..01c80bdf799a9b 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -195,10 +195,5 @@ TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4, "V4UiV2d", "nc", "si TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4iiC*", "n", "simd128") TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLiC*", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_store8_lane, "vSc*V16ScIi", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_store16_lane, "vs*V8sIi", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_store32_lane, "vi*V4iIi", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_store64_lane, "vLLi*V2LLiIi", "n", "simd128") - #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d4b2414cde4254..2819931664ba42 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17776,36 +17776,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load64_zero); return Builder.CreateCall(Callee, {Ptr}); } - case WebAssembly::BI__builtin_wasm_store8_lane: - case WebAssembly::BI__builtin_wasm_store16_lane: - case WebAssembly::BI__builtin_wasm_store32_lane: - case WebAssembly::BI__builtin_wasm_store64_lane: { - Value *Ptr = EmitScalarExpr(E->getArg(0)); - Value *Vec = EmitScalarExpr(E->getArg(1)); - Optional<llvm::APSInt> LaneIdxConst = - E->getArg(2)->getIntegerConstantExpr(getContext()); - assert(LaneIdxConst && "Constant arg isn't actually constant?"); - Value *LaneIdx = llvm::ConstantInt::get(getLLVMContext(), *LaneIdxConst); - unsigned IntNo; - switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_store8_lane: - IntNo = Intrinsic::wasm_store8_lane; - break; - case WebAssembly::BI__builtin_wasm_store16_lane: - IntNo = Intrinsic::wasm_store16_lane; - break; - case WebAssembly::BI__builtin_wasm_store32_lane: - IntNo = Intrinsic::wasm_store32_lane; - break; - case WebAssembly::BI__builtin_wasm_store64_lane: - IntNo = Intrinsic::wasm_store64_lane; - break; - default: - llvm_unreachable("unexpected builtin ID"); - } - Function *Callee = CGM.getIntrinsic(IntNo); - return Builder.CreateCall(Callee, {Ptr, Vec, LaneIdx}); - } case WebAssembly::BI__builtin_wasm_shuffle_i8x16: { Value *Ops[18]; size_t OpIdx = 0; diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index d52cbc0d6a12fa..fdb32bd1d059f0 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -223,18 +223,48 @@ static __inline__ void __DEFAULT_FN_ATTRS wasm_v128_store(void *__mem, ((struct __wasm_v128_store_struct *)__mem)->__v = __a; } -#define wasm_v128_store8_lane(__ptr, __vec, __i) \ (__builtin_wasm_store8_lane((signed char *)(__ptr), (__i8x16)(__vec), (__i))) +static __inline__ void __DEFAULT_FN_ATTRS wasm_v128_store8_lane(void *__mem, + v128_t __vec, + int __i) + __REQUIRE_CONSTANT(__i) { + struct __wasm_v128_store8_lane_struct { + int8_t __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __wasm_v128_store8_lane_struct *)__mem)->__v = ((__i8x16)__vec)[__i]; +} -#define wasm_v128_store16_lane(__ptr, __vec, __i) \ (__builtin_wasm_store16_lane((short *)(__ptr), (__i16x8)(__vec), (__i))) +static __inline__ void __DEFAULT_FN_ATTRS wasm_v128_store16_lane(void *__mem, + v128_t __vec, + int __i) + __REQUIRE_CONSTANT(__i) { + struct 
__wasm_v128_store16_lane_struct { + int16_t __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __wasm_v128_store16_lane_struct *)__mem)->__v = + ((__i16x8)__vec)[__i]; +} -#define wasm_v128_store32_lane(__ptr, __vec, __i) \ - (__builtin_wasm_store32_lane((int *)(__ptr), (__i32x4)(__vec), (__i))) +static __inline__ void __DEFAULT_FN_ATTRS wasm_v128_store32_lane(void *__mem, + v128_t __vec, + int __i) + __REQUIRE_CONSTANT(__i) { + struct __wasm_v128_store32_lane_struct { + int32_t __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __wasm_v128_store32_lane_struct *)__mem)->__v = + ((__i32x4)__vec)[__i]; +} -#define wasm_v128_store64_lane(__ptr, __vec, __i) \ - (__builtin_wasm_store64_lane((long long int *)(__ptr), (__i64x2)(__vec), \ - (__i))) +static __inline__ void __DEFAULT_FN_ATTRS wasm_v128_store64_lane(void *__mem, + v128_t __vec, + int __i) + __REQUIRE_CONSTANT(__i) { + struct __wasm_v128_store64_lane_struct { + int64_t __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __wasm_v128_store64_lane_struct *)__mem)->__v = + ((__i64x2)__vec)[__i]; +} static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_make(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3, int8_t __c4, diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 16b19ffb09baea..b7341b943ada55 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -284,34 +284,6 @@ f64x2 replace_lane_f64x2(f64x2 v, double x) { // WEBASSEMBLY-NEXT: ret } -void store8_lane(signed char *p, i8x16 v) { - __builtin_wasm_store8_lane(p, v, 0); - // WEBASSEMBLY: call void @llvm.wasm.store8.lane( - // WEBASSEMBLY-SAME: i8* %p, <16 x i8> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - -void store16_lane(short *p, i16x8 v) { - __builtin_wasm_store16_lane(p, v, 0); - // WEBASSEMBLY: call void @llvm.wasm.store16.lane( - // WEBASSEMBLY-SAME: i16* %p, <8 x i16> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - -void store32_lane(int *p, i32x4 v) { - __builtin_wasm_store32_lane(p, v, 0); - // WEBASSEMBLY: call void @llvm.wasm.store32.lane( - // WEBASSEMBLY-SAME: i32* %p, <4 x i32> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - -void store64_lane(long long *p, i64x2 v) { - __builtin_wasm_store64_lane(p, v, 0); - // WEBASSEMBLY: call void @llvm.wasm.store64.lane( - // WEBASSEMBLY-SAME: i64* %p, <2 x i64> %v, i32 0) - // WEBASSEMBLY-NEXT: ret -} - i8x16 add_sat_s_i8x16(i8x16 x, i8x16 y) { return __builtin_wasm_add_sat_s_i8x16(x, y); // WEBASSEMBLY: call <16 x i8> @llvm.sadd.sat.v16i8( diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c index 6ac78e1b66499e..7f279725192a40 100644 --- a/clang/test/Headers/wasm.c +++ b/clang/test/Headers/wasm.c @@ -219,7 +219,8 @@ void test_v128_store(void *mem, v128_t a) { // CHECK-LABEL: @test_v128_store8_lane( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <16 x i8> -// CHECK-NEXT: tail call void @llvm.wasm.store8.lane(i8* [[PTR:%.*]], <16 x i8> [[TMP0]], i32 15) +// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <16 x i8> [[TMP0]], i32 15 +// CHECK-NEXT: store i8 [[VECEXT_I]], i8* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // void test_v128_store8_lane(uint8_t *ptr, v128_t vec) { @@ -229,7 +230,8 @@ void test_v128_store8_lane(uint8_t *ptr, v128_t vec) { // CHECK-LABEL: @test_v128_store16_lane( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <8 x i16> -// CHECK-NEXT: tail call void @llvm.wasm.store16.lane(i16* [[PTR:%.*]], <8 x 
i16> [[TMP0]], i32 7) +// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 +// CHECK-NEXT: store i16 [[VECEXT_I]], i16* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // void test_v128_store16_lane(uint16_t *ptr, v128_t vec) { @@ -238,7 +240,8 @@ void test_v128_store16_lane(uint16_t *ptr, v128_t vec) { // CHECK-LABEL: @test_v128_store32_lane( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.wasm.store32.lane(i32* [[PTR:%.*]], <4 x i32> [[VEC:%.*]], i32 3) +// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <4 x i32> [[VEC:%.*]], i32 3 +// CHECK-NEXT: store i32 [[VECEXT_I]], i32* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // void test_v128_store32_lane(uint32_t *ptr, v128_t vec) { @@ -248,7 +251,8 @@ void test_v128_store32_lane(uint32_t *ptr, v128_t vec) { // CHECK-LABEL: @test_v128_store64_lane( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VEC:%.*]] to <2 x i64> -// CHECK-NEXT: tail call void @llvm.wasm.store64.lane(i64* [[PTR:%.*]], <2 x i64> [[TMP0]], i32 1) +// CHECK-NEXT: [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +// CHECK-NEXT: store i64 [[VECEXT_I]], i64* [[PTR:%.*]], align 1, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // void test_v128_store64_lane(uint64_t *ptr, v128_t vec) { @@ -1285,7 +1289,7 @@ v128_t test_v128_andnot(v128_t a, v128_t b) { // CHECK-LABEL: @test_v128_any_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR8:[0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1295,7 +1299,7 @@ bool test_v128_any_true(v128_t a) { // CHECK-LABEL: @test_v128_bitselect( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP0]] // v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) { @@ -1305,7 +1309,7 @@ v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) { // CHECK-LABEL: @test_i8x16_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1327,7 +1331,7 @@ v128_t test_i8x16_neg(v128_t a) { // CHECK-LABEL: @test_i8x16_all_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1338,7 +1342,7 @@ bool test_i8x16_all_true(v128_t a) { // CHECK-LABEL: @test_i8x16_bitmask( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to 
<16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: ret i32 [[TMP1]] // int32_t test_i8x16_bitmask(v128_t a) { @@ -1348,7 +1352,7 @@ int32_t test_i8x16_bitmask(v128_t a) { // CHECK-LABEL: @test_i8x16_popcnt( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.wasm.popcnt(<16 x i8> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.wasm.popcnt(<16 x i8> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1414,7 +1418,7 @@ v128_t test_i8x16_add(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1426,7 +1430,7 @@ v128_t test_i8x16_add_sat(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1450,7 +1454,7 @@ v128_t test_i8x16_sub(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1462,7 +1466,7 @@ v128_t test_i8x16_sub_sat(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1526,7 +1530,7 @@ v128_t test_u8x16_max(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] 
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1537,7 +1541,7 @@ v128_t test_u8x16_avgr(v128_t a, v128_t b) { // CHECK-LABEL: @test_i16x8_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1559,7 +1563,7 @@ v128_t test_i16x8_neg(v128_t a) { // CHECK-LABEL: @test_i16x8_all_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1570,7 +1574,7 @@ bool test_i16x8_all_true(v128_t a) { // CHECK-LABEL: @test_i16x8_bitmask( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: ret i32 [[TMP1]] // int32_t test_i16x8_bitmask(v128_t a) { @@ -1635,7 +1639,7 @@ v128_t test_i16x8_add(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1647,7 +1651,7 @@ v128_t test_i16x8_add_sat(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1671,7 +1675,7 @@ v128_t test_i16x8_sub(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1683,7 +1687,7 @@ v128_t 
test_i16x8_sub_sat(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1759,7 +1763,7 @@ v128_t test_u16x8_max(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -1769,7 +1773,7 @@ v128_t test_u16x8_avgr(v128_t a, v128_t b) { // CHECK-LABEL: @test_i32x4_abs( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP0]] // v128_t test_i32x4_abs(v128_t a) { @@ -1787,7 +1791,7 @@ v128_t test_i32x4_neg(v128_t a) { // CHECK-LABEL: @test_i32x4_all_true( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR6]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP0]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1797,7 +1801,7 @@ bool test_i32x4_all_true(v128_t a) { // CHECK-LABEL: @test_i32x4_bitmask( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR6]] // CHECK-NEXT: ret i32 [[TMP0]] // int32_t test_i32x4_bitmask(v128_t a) { @@ -1908,7 +1912,7 @@ v128_t test_u32x4_max(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) { @@ -1918,7 +1922,7 @@ v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) { // CHECK-LABEL: @test_i64x2_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -1940,7 +1944,7 @@ v128_t test_i64x2_neg(v128_t 
a) { // CHECK-LABEL: @test_i64x2_all_true( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0 // CHECK-NEXT: ret i1 [[TOBOOL_I]] // @@ -1951,7 +1955,7 @@ bool test_i64x2_all_true(v128_t a) { // CHECK-LABEL: @test_i64x2_bitmask( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64> -// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: ret i32 [[TMP1]] // int32_t test_i64x2_bitmask(v128_t a) { @@ -2039,7 +2043,7 @@ v128_t test_i64x2_mul(v128_t a, v128_t b) { // CHECK-LABEL: @test_f32x4_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2061,7 +2065,7 @@ v128_t test_f32x4_neg(v128_t a) { // CHECK-LABEL: @test_f32x4_sqrt( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2072,7 +2076,7 @@ v128_t test_f32x4_sqrt(v128_t a) { // CHECK-LABEL: @test_f32x4_ceil( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2083,7 +2087,7 @@ v128_t test_f32x4_ceil(v128_t a) { // CHECK-LABEL: @test_f32x4_floor( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2094,7 +2098,7 @@ v128_t test_f32x4_floor(v128_t a) { // CHECK-LABEL: @test_f32x4_trunc( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2105,7 +2109,7 @@ v128_t test_f32x4_trunc(v128_t a) { // CHECK-LABEL: @test_f32x4_nearest( // CHECK-NEXT: entry: 
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2165,7 +2169,7 @@ v128_t test_f32x4_div(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2177,7 +2181,7 @@ v128_t test_f32x4_min(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2189,7 +2193,7 @@ v128_t test_f32x4_max(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2201,7 +2205,7 @@ v128_t test_f32x4_pmin(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2212,7 +2216,7 @@ v128_t test_f32x4_pmax(v128_t a, v128_t b) { // CHECK-LABEL: @test_f64x2_abs( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2234,7 +2238,7 @@ v128_t test_f64x2_neg(v128_t a) { // CHECK-LABEL: @test_f64x2_sqrt( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: 
[[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2245,7 +2249,7 @@ v128_t test_f64x2_sqrt(v128_t a) { // CHECK-LABEL: @test_f64x2_ceil( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2256,7 +2260,7 @@ v128_t test_f64x2_ceil(v128_t a) { // CHECK-LABEL: @test_f64x2_floor( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2267,7 +2271,7 @@ v128_t test_f64x2_floor(v128_t a) { // CHECK-LABEL: @test_f64x2_trunc( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2278,7 +2282,7 @@ v128_t test_f64x2_trunc(v128_t a) { // CHECK-LABEL: @test_f64x2_nearest( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2338,7 +2342,7 @@ v128_t test_f64x2_div(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2350,7 +2354,7 @@ v128_t test_f64x2_min(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x 
i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2362,7 +2366,7 @@ v128_t test_f64x2_max(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2374,7 +2378,7 @@ v128_t test_f64x2_pmin(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2385,7 +2389,7 @@ v128_t test_f64x2_pmax(v128_t a, v128_t b) { // CHECK-LABEL: @test_i32x4_trunc_sat_f32x4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_i32x4_trunc_sat_f32x4(v128_t a) { @@ -2395,7 +2399,7 @@ v128_t test_i32x4_trunc_sat_f32x4(v128_t a) { // CHECK-LABEL: @test_u32x4_trunc_sat_f32x4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_u32x4_trunc_sat_f32x4(v128_t a) { @@ -2447,7 +2451,7 @@ v128_t test_f64x2_convert_low_u32x4(v128_t a) { // CHECK-LABEL: @test_i32x4_trunc_sat_f64x2_zero( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2458,7 +2462,7 @@ v128_t test_i32x4_trunc_sat_f64x2_zero(v128_t a) { // CHECK-LABEL: @test_u32x4_trunc_sat_f64x2_zero( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2542,7 +2546,7 @@ v128_t test_i64x2_shuffle(v128_t a, 
v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2554,7 +2558,7 @@ v128_t test_i8x16_swizzle(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2566,7 +2570,7 @@ v128_t test_i8x16_narrow_i16x8(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2576,7 +2580,7 @@ v128_t test_u8x16_narrow_i16x8(v128_t a, v128_t b) { // CHECK-LABEL: @test_i16x8_narrow_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR6]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2586,7 +2590,7 @@ v128_t test_i16x8_narrow_i32x4(v128_t a, v128_t b) { // CHECK-LABEL: @test_u16x8_narrow_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR6]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2733,7 +2737,7 @@ v128_t test_u64x2_extend_high_u32x4(v128_t a) { // CHECK-LABEL: @test_i16x8_extadd_pairwise_i8x16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2744,7 +2748,7 @@ v128_t test_i16x8_extadd_pairwise_i8x16(v128_t a) { // CHECK-LABEL: @test_u16x8_extadd_pairwise_u8x16( // 
CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP2]] // @@ -2755,7 +2759,7 @@ v128_t test_u16x8_extadd_pairwise_u8x16(v128_t a) { // CHECK-LABEL: @test_i32x4_extadd_pairwise_i16x8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) { @@ -2765,7 +2769,7 @@ v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) { // CHECK-LABEL: @test_u32x4_extadd_pairwise_u16x8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP1]] // v128_t test_u32x4_extadd_pairwise_u16x8(v128_t a) { @@ -2776,7 +2780,7 @@ v128_t test_u32x4_extadd_pairwise_u16x8(v128_t a) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2788,7 +2792,7 @@ v128_t test_i16x8_extmul_low_i8x16(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2800,7 +2804,7 @@ v128_t test_i16x8_extmul_high_i8x16(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2812,7 +2816,7 @@ v128_t test_u16x8_extmul_low_u8x16(v128_t a, v128_t b) { // CHECK-NEXT: entry: // 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // @@ -2824,7 +2828,7 @@ v128_t test_u16x8_extmul_high_u8x16(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i32x4_extmul_low_i16x8(v128_t a, v128_t b) { @@ -2835,7 +2839,7 @@ v128_t test_i32x4_extmul_low_i16x8(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_i32x4_extmul_high_i16x8(v128_t a, v128_t b) { @@ -2846,7 +2850,7 @@ v128_t test_i32x4_extmul_high_i16x8(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u32x4_extmul_low_u16x8(v128_t a, v128_t b) { @@ -2857,7 +2861,7 @@ v128_t test_u32x4_extmul_low_u16x8(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: ret <4 x i32> [[TMP2]] // v128_t test_u32x4_extmul_high_u16x8(v128_t a, v128_t b) { @@ -2866,7 +2870,7 @@ v128_t test_u32x4_extmul_high_u16x8(v128_t a, v128_t b) { // CHECK-LABEL: @test_i64x2_extmul_low_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR6]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2876,7 
+2880,7 @@ v128_t test_i64x2_extmul_low_i32x4(v128_t a, v128_t b) { // CHECK-LABEL: @test_i64x2_extmul_high_i32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR6]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2886,7 +2890,7 @@ v128_t test_i64x2_extmul_high_i32x4(v128_t a, v128_t b) { // CHECK-LABEL: @test_u64x2_extmul_low_u32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR6]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2896,7 +2900,7 @@ v128_t test_u64x2_extmul_low_u32x4(v128_t a, v128_t b) { // CHECK-LABEL: @test_u64x2_extmul_high_u32x4( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR6]] // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP1]] // @@ -2908,7 +2912,7 @@ v128_t test_u64x2_extmul_high_u32x4(v128_t a, v128_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16> -// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR8]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR6]] // CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP3]] // diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index d5398a9ce63311..99e7ecea593743 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -186,32 +186,6 @@ def int_wasm_load64_zero : Intrinsic<[llvm_v2i64_ty], [LLVMPointerType<llvm_i64_ty>], [IntrReadMem, IntrArgMemOnly], "", [SDNPMemOperand]>; -// These intrinsics do not mark their lane index arguments as immediate because -// that changes the corresponding SDNode from ISD::Constant to -// ISD::TargetConstant, which would require extra complications in the ISel -// tablegen patterns. TODO: Replace these intrinsic with normal ISel patterns -// once the load_lane instructions are merged to the proposal. 
-def int_wasm_store8_lane : - Intrinsic<[], - [LLVMPointerType<llvm_i8_ty>, llvm_v16i8_ty, llvm_i32_ty], - [IntrWriteMem, IntrArgMemOnly], - "", [SDNPMemOperand]>; -def int_wasm_store16_lane : - Intrinsic<[], - [LLVMPointerType<llvm_i16_ty>, llvm_v8i16_ty, llvm_i32_ty], - [IntrWriteMem, IntrArgMemOnly], - "", [SDNPMemOperand]>; -def int_wasm_store32_lane : - Intrinsic<[], - [LLVMPointerType<llvm_i32_ty>, llvm_v4i32_ty, llvm_i32_ty], - [IntrWriteMem, IntrArgMemOnly], - "", [SDNPMemOperand]>; -def int_wasm_store64_lane : - Intrinsic<[], - [LLVMPointerType<llvm_i64_ty>, llvm_v2i64_ty, llvm_i32_ty], - [IntrWriteMem, IntrArgMemOnly], - "", [SDNPMemOperand]>; - // TODO: Replace this intrinsic with normal ISel patterns once popcnt is merged // to the proposal. def int_wasm_popcnt : diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 19dffa85c62226..b8831bb7070b57 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -767,35 +767,6 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(1); Info.flags = MachineMemOperand::MOLoad; return true; - case Intrinsic::wasm_store8_lane: - case Intrinsic::wasm_store16_lane: - case Intrinsic::wasm_store32_lane: - case Intrinsic::wasm_store64_lane: { - MVT MemVT; - switch (Intrinsic) { - case Intrinsic::wasm_store8_lane: - MemVT = MVT::i8; - break; - case Intrinsic::wasm_store16_lane: - MemVT = MVT::i16; - break; - case Intrinsic::wasm_store32_lane: - MemVT = MVT::i32; - break; - case Intrinsic::wasm_store64_lane: - MemVT = MVT::i64; - break; - default: - llvm_unreachable("unexpected intrinsic"); - } - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MemVT; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = Align(1); - Info.flags = MachineMemOperand::MOStore; - return true; - } default: return false; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 97c3ece1c46f47..ce8146c4b4eb90 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -385,15 +385,13 @@ multiclass SIMDStoreLane<Vec vec, bits<32> simdop> { } // mayStore = 1, UseNamedOperandTable = 1 } -// TODO: Also support v4f32 and v2f64 once the instructions are merged -// to the proposal defm "" : SIMDStoreLane<I8x16, 0x58>; defm "" : SIMDStoreLane<I16x8, 0x59>; defm "" : SIMDStoreLane<I32x4, 0x5a>; defm "" : SIMDStoreLane<I64x2, 0x5b>; // Select stores with no constant offset. 
-multiclass StoreLanePatNoOffset<Vec vec, Intrinsic kind> { +multiclass StoreLanePatNoOffset<Vec vec, PatFrag kind> { def : Pat<(kind (i32 I32:$addr), (vec.vt V128:$vec), (i32 vec.lane_idx:$idx)), (!cast<NI>("STORE_LANE_"#vec#"_A32") 0, 0, imm:$idx, $addr, $vec)>, Requires<[HasAddr32]>; @@ -402,13 +400,26 @@ multiclass StoreLanePatNoOffset<Vec vec, Intrinsic kind> { Requires<[HasAddr64]>; } -defm : StoreLanePatNoOffset<I8x16, int_wasm_store8_lane>; -defm : StoreLanePatNoOffset<I16x8, int_wasm_store16_lane>; -defm : StoreLanePatNoOffset<I32x4, int_wasm_store32_lane>; -defm : StoreLanePatNoOffset<I64x2, int_wasm_store64_lane>; +def store8_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (truncstorei8 (i32 (vector_extract $vec, $idx)), $ptr)>; +def store16_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (truncstorei16 (i32 (vector_extract $vec, $idx)), $ptr)>; +def store32_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (store (i32 (vector_extract $vec, $idx)), $ptr)>; +def store64_lane : + PatFrag<(ops node:$ptr, node:$vec, node:$idx), + (store (i64 (vector_extract $vec, $idx)), $ptr)>; +// TODO: floating point lanes as well -// TODO: Also support the other store patterns for store_lane once the -// instructions are merged to the proposal. +let AddedComplexity = 1 in { +defm : StoreLanePatNoOffset<I8x16, store8_lane>; +defm : StoreLanePatNoOffset<I16x8, store16_lane>; +defm : StoreLanePatNoOffset<I32x4, store32_lane>; +defm : StoreLanePatNoOffset<I64x2, store64_lane>; +} //===----------------------------------------------------------------------===// // Constructing SIMD values diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll b/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll index f1b5158c199969..5dc022f2b12acc 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-pair.ll @@ -13,8 +13,7 @@ target triple = "wasm32-unknown-unknown" ; t8: ch = store<(store 8 into `i64* undef`, align 1)> t3:1, t24, undef:i32, undef:i32 ; t9: ch = WebAssemblyISD::RETURN t8 -; CHECK: i64x2.extract_lane -; CHECK-NEXT: i64.store +; CHECK: v128.store64_lane define void @build_pair_i32s() { entry: %0 = load <4 x i32>, <4 x i32>* undef, align 16 diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll index fdb2328e9d5f5f..6d92f9fde991a5 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll @@ -8,11 +8,6 @@ target triple = "wasm32-unknown-unknown" -declare void @llvm.wasm.store8.lane(i8*, <16 x i8>, i32) -declare void @llvm.wasm.store16.lane(i16*, <8 x i16>, i32) -declare void @llvm.wasm.store32.lane(i32*, <4 x i32>, i32) -declare void @llvm.wasm.store64.lane(i64*, <2 x i64>, i32) - ;===---------------------------------------------------------------------------- ; v128.load8_lane / v128.store8_lane ;===---------------------------------------------------------------------------- @@ -150,7 +145,8 @@ define void @store_lane_i8_no_offset(<16 x i8> %v, i8* %p) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.store8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return - tail call void @llvm.wasm.store8.lane(i8* %p, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* %p ret void } @@ -167,7 +163,8 @@ define void @store_lane_i8_with_folded_offset(<16 x i8> %v, i8* %p) { %q = ptrtoint i8* %p to i32 %r = add nuw i32 %q, 24 %s = inttoptr i32 %r to i8* - tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* %s ret void } @@ -182,7 +179,8 @@ define void @store_lane_i8_with_folded_gep_offset(<16 x i8> %v, i8* %p) { ; CHECK-NEXT: v128.store8_lane 0, 0 ; CHECK-NEXT: # 
fallthrough-return %s = getelementptr inbounds i8, i8* %p, i32 6 - tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* %s ret void } @@ -197,7 +195,8 @@ define void @store_lane_i8_with_unfolded_gep_negative_offset(<16 x i8> %v, i8* % ; CHECK-NEXT: v128.store8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds i8, i8* %p, i32 -6 - tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* %s ret void } @@ -214,7 +213,8 @@ define void @store_lane_i8_with_unfolded_offset(<16 x i8> %v, i8* %p) { %q = ptrtoint i8* %p to i32 %r = add nsw i32 %q, 24 %s = inttoptr i32 %r to i8* - tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* %s ret void } @@ -229,7 +229,8 @@ define void @store_lane_i8_with_unfolded_gep_offset(<16 x i8> %v, i8* %p) { ; CHECK-NEXT: v128.store8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr i8, i8* %p, i32 6 - tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* %s ret void } @@ -242,7 +243,8 @@ define void @store_lane_i8_to_numeric_address(<16 x i8> %v) { ; CHECK-NEXT: v128.store8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = inttoptr i32 42 to i8* - tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* %s ret void } @@ -254,7 +256,8 @@ define void @store_lane_i8_from_global_address(<16 x i8> %v) { ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: v128.store8_lane 0, 0 ; CHECK-NEXT: # fallthrough-return - tail call void @llvm.wasm.store8.lane(i8* @gv_i8, <16 x i8> %v, i32 0) + %x = extractelement <16 x i8> %v, i32 0 + store i8 %x, i8* @gv_i8 ret void } @@ -393,9 +396,10 @@ define void @store_lane_i16_no_offset(<8 x i16> %v, i16* %p) { ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 1 ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0 +; CHECK-NEXT: v128.store16_lane 0, 0 ; CHECK-NEXT: # fallthrough-return - tail call void @llvm.wasm.store16.lane(i16* %p, <8 x i16> %v, i32 0) + %x = extractelement <8 x i16> %v, i32 0 + store i16 %x, i16* %p ret void } @@ -407,12 +411,13 @@ define void @store_lane_i16_with_folded_offset(<8 x i16> %v, i16* %p) { ; CHECK-NEXT: i32.const 24 ; CHECK-NEXT: i32.add ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0 +; CHECK-NEXT: v128.store16_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %q = ptrtoint i16* %p to i32 %r = add nuw i32 %q, 24 %s = inttoptr i32 %r to i16* - tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0) + %x = extractelement <8 x i16> %v, i32 0 + store i16 %x, i16* %s ret void } @@ -424,10 +429,11 @@ define void @store_lane_i16_with_folded_gep_offset(<8 x i16> %v, i16* %p) { ; CHECK-NEXT: i32.const 12 ; CHECK-NEXT: i32.add ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0 +; CHECK-NEXT: v128.store16_lane 0, 0 ; CHECK-NEXT: # fallthrough-return %s = getelementptr inbounds i16, i16* %p, i32 6 - tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0) + %x = extractelement <8 x i16> %v, i32 0 + store i16 %x, i16* %s ret void } @@ -439,10 +445,11 @@ define void @store_lane_i16_with_unfolded_gep_negative_offset(<8 x i16> %v, i16* ; CHECK-NEXT: i32.const -12 ; CHECK-NEXT: i32.add ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: v128.store16_lane 
diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll
index fdb2328e9d5f5f..6d92f9fde991a5 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-load-lane-offset.ll
@@ -8,11 +8,6 @@ target triple = "wasm32-unknown-unknown"
 
-declare void @llvm.wasm.store8.lane(i8*, <16 x i8>, i32)
-declare void @llvm.wasm.store16.lane(i16*, <8 x i16>, i32)
-declare void @llvm.wasm.store32.lane(i32*, <4 x i32>, i32)
-declare void @llvm.wasm.store64.lane(i64*, <2 x i64>, i32)
-
 ;===----------------------------------------------------------------------------
 ; v128.load8_lane / v128.store8_lane
 ;===----------------------------------------------------------------------------
@@ -150,7 +145,8 @@ define void @store_lane_i8_no_offset(<16 x i8> %v, i8* %p) {
 ; CHECK-NEXT: local.get 0
 ; CHECK-NEXT: v128.store8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store8.lane(i8* %p, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %p
   ret void
 }
 
@@ -167,7 +163,8 @@ define void @store_lane_i8_with_folded_offset(<16 x i8> %v, i8* %p) {
   %q = ptrtoint i8* %p to i32
   %r = add nuw i32 %q, 24
   %s = inttoptr i32 %r to i8*
-  tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %s
   ret void
 }
 
@@ -182,7 +179,8 @@ define void @store_lane_i8_with_folded_gep_offset(<16 x i8> %v, i8* %p) {
 ; CHECK-NEXT: v128.store8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i8, i8* %p, i32 6
-  tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %s
   ret void
 }
 
@@ -197,7 +195,8 @@ define void @store_lane_i8_with_unfolded_gep_negative_offset(<16 x i8> %v, i8* %
 ; CHECK-NEXT: v128.store8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i8, i8* %p, i32 -6
-  tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %s
   ret void
 }
 
@@ -214,7 +213,8 @@ define void @store_lane_i8_with_unfolded_offset(<16 x i8> %v, i8* %p) {
   %q = ptrtoint i8* %p to i32
   %r = add nsw i32 %q, 24
   %s = inttoptr i32 %r to i8*
-  tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %s
   ret void
 }
 
@@ -229,7 +229,8 @@ define void @store_lane_i8_with_unfolded_gep_offset(<16 x i8> %v, i8* %p) {
 ; CHECK-NEXT: v128.store8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr i8, i8* %p, i32 6
-  tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %s
   ret void
 }
 
@@ -242,7 +243,8 @@ define void @store_lane_i8_to_numeric_address(<16 x i8> %v) {
 ; CHECK-NEXT: v128.store8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = inttoptr i32 42 to i8*
-  tail call void @llvm.wasm.store8.lane(i8* %s, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %s
   ret void
 }
 
@@ -254,7 +256,8 @@ define void @store_lane_i8_from_global_address(<16 x i8> %v) {
 ; CHECK-NEXT: local.get 0
 ; CHECK-NEXT: v128.store8_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store8.lane(i8* @gv_i8, <16 x i8> %v, i32 0)
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* @gv_i8
   ret void
 }
 
@@ -393,9 +396,10 @@ define void @store_lane_i16_no_offset(<8 x i16> %v, i16* %p) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: local.get 1
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store16.lane(i16* %p, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %p
   ret void
 }
 
@@ -407,12 +411,13 @@ define void @store_lane_i16_with_folded_offset(<8 x i16> %v, i16* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %q = ptrtoint i16* %p to i32
   %r = add nuw i32 %q, 24
   %s = inttoptr i32 %r to i16*
-  tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %s
   ret void
 }
 
@@ -424,10 +429,11 @@ define void @store_lane_i16_with_folded_gep_offset(<8 x i16> %v, i16* %p) {
 ; CHECK-NEXT: i32.const 12
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i16, i16* %p, i32 6
-  tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %s
   ret void
 }
 
@@ -439,10 +445,11 @@ define void @store_lane_i16_with_unfolded_gep_negative_offset(<8 x i16> %v, i16*
 ; CHECK-NEXT: i32.const -12
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i16, i16* %p, i32 -6
-  tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %s
   ret void
 }
 
@@ -454,12 +461,13 @@ define void @store_lane_i16_with_unfolded_offset(<8 x i16> %v, i16* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %q = ptrtoint i16* %p to i32
   %r = add nsw i32 %q, 24
   %s = inttoptr i32 %r to i16*
-  tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %s
   ret void
 }
 
@@ -471,10 +479,11 @@ define void @store_lane_i16_with_unfolded_gep_offset(<8 x i16> %v, i16* %p) {
 ; CHECK-NEXT: i32.const 12
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr i16, i16* %p, i32 6
-  tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %s
   ret void
 }
 
@@ -484,10 +493,11 @@ define void @store_lane_i16_to_numeric_address(<8 x i16> %v) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const 42
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = inttoptr i32 42 to i16*
-  tail call void @llvm.wasm.store16.lane(i16* %s, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %s
   ret void
 }
 
@@ -497,9 +507,10 @@ define void @store_lane_i16_from_global_address(<8 x i16> %v) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const gv_i16
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store16_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store16.lane(i16* @gv_i16, <8 x i16> %v, i32 0)
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* @gv_i16
   ret void
 }
 
@@ -638,9 +649,10 @@ define void @store_lane_i32_no_offset(<4 x i32> %v, i32* %p) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: local.get 1
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store32.lane(i32* %p, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %p
   ret void
 }
 
@@ -652,12 +664,13 @@ define void @store_lane_i32_with_folded_offset(<4 x i32> %v, i32* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %q = ptrtoint i32* %p to i32
   %r = add nuw i32 %q, 24
   %s = inttoptr i32 %r to i32*
-  tail call void @llvm.wasm.store32.lane(i32* %s, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %s
   ret void
 }
 
@@ -669,10 +682,11 @@ define void @store_lane_i32_with_folded_gep_offset(<4 x i32> %v, i32* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i32, i32* %p, i32 6
-  tail call void @llvm.wasm.store32.lane(i32* %s, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %s
   ret void
 }
 
@@ -684,10 +698,11 @@ define void @store_lane_i32_with_unfolded_gep_negative_offset(<4 x i32> %v, i32*
 ; CHECK-NEXT: i32.const -24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i32, i32* %p, i32 -6
-  tail call void @llvm.wasm.store32.lane(i32* %s, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %s
   ret void
 }
 
@@ -699,12 +714,13 @@ define void @store_lane_i32_with_unfolded_offset(<4 x i32> %v, i32* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %q = ptrtoint i32* %p to i32
   %r = add nsw i32 %q, 24
   %s = inttoptr i32 %r to i32*
-  tail call void @llvm.wasm.store32.lane(i32* %s, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %s
   ret void
 }
 
@@ -716,10 +732,11 @@ define void @store_lane_i32_with_unfolded_gep_offset(<4 x i32> %v, i32* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr i32, i32* %p, i32 6
-  tail call void @llvm.wasm.store32.lane(i32* %s, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %s
   ret void
 }
 
@@ -729,10 +746,11 @@ define void @store_lane_i32_to_numeric_address(<4 x i32> %v) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const 42
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = inttoptr i32 42 to i32*
-  tail call void @llvm.wasm.store32.lane(i32* %s, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %s
   ret void
 }
 
@@ -742,9 +760,10 @@ define void @store_lane_i32_from_global_address(<4 x i32> %v) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const gv_i32
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store32_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store32.lane(i32* @gv_i32, <4 x i32> %v, i32 0)
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* @gv_i32
   ret void
 }
 
@@ -883,9 +902,10 @@ define void @store_lane_i64_no_offset(<2 x i64> %v, i64* %p) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: local.get 1
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store64.lane(i64* %p, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %p
   ret void
 }
 
@@ -897,12 +917,13 @@ define void @store_lane_i64_with_folded_offset(<2 x i64> %v, i64* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %q = ptrtoint i64* %p to i32
   %r = add nuw i32 %q, 24
   %s = inttoptr i32 %r to i64*
-  tail call void @llvm.wasm.store64.lane(i64* %s, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %s
   ret void
 }
 
@@ -914,10 +935,11 @@ define void @store_lane_i64_with_folded_gep_offset(<2 x i64> %v, i64* %p) {
 ; CHECK-NEXT: i32.const 48
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i64, i64* %p, i32 6
-  tail call void @llvm.wasm.store64.lane(i64* %s, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %s
   ret void
 }
 
@@ -929,10 +951,11 @@ define void @store_lane_i64_with_unfolded_gep_negative_offset(<2 x i64> %v, i64*
 ; CHECK-NEXT: i32.const -48
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr inbounds i64, i64* %p, i32 -6
-  tail call void @llvm.wasm.store64.lane(i64* %s, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %s
   ret void
 }
 
@@ -944,12 +967,13 @@ define void @store_lane_i64_with_unfolded_offset(<2 x i64> %v, i64* %p) {
 ; CHECK-NEXT: i32.const 24
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %q = ptrtoint i64* %p to i32
   %r = add nsw i32 %q, 24
   %s = inttoptr i32 %r to i64*
-  tail call void @llvm.wasm.store64.lane(i64* %s, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %s
   ret void
 }
 
@@ -961,10 +985,11 @@ define void @store_lane_i64_with_unfolded_gep_offset(<2 x i64> %v, i64* %p) {
 ; CHECK-NEXT: i32.const 48
 ; CHECK-NEXT: i32.add
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = getelementptr i64, i64* %p, i32 6
-  tail call void @llvm.wasm.store64.lane(i64* %s, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %s
   ret void
 }
 
@@ -974,10 +999,11 @@ define void @store_lane_i64_to_numeric_address(<2 x i64> %v) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const 42
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
   %s = inttoptr i32 42 to i64*
-  tail call void @llvm.wasm.store64.lane(i64* %s, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %s
   ret void
 }
 
@@ -987,8 +1013,9 @@ define void @store_lane_i64_from_global_address(<2 x i64> %v) {
 ; CHECK-NEXT: # %bb.0:
 ; CHECK-NEXT: i32.const gv_i64
 ; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: v128.store64_lane 0, 0
 ; CHECK-NEXT: # fallthrough-return
-  tail call void @llvm.wasm.store64.lane(i64* @gv_i64, <2 x i64> %v, i32 0)
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* @gv_i64
   ret void
 }
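
Illustrative aside (not part of the patch): wasm memory operands encode
alignment as log2(bytes), and the printer emits a `:p2align=` attribute only
when the alignment is below the instruction's default (0 for store8_lane, 1
for store16_lane, 2 for store32_lane, 3 for store64_lane). The removed
intrinsic lowering carried no alignment information and behaved as align 1,
which is why its CHECK lines above had `0:p2align=0`; the plain stores carry
the IR `align` attribute instead, as the alignment tests below verify. A
sketch of the mapping (function name hypothetical):

  define void @underaligned(<2 x i64> %v, i64* %p) {
    %x = extractelement <2 x i64> %v, i32 0
    ; align 2 is 2^1, below the 8-byte default of a 64-bit store, so this
    ; should print as: v128.store64_lane 0:p2align=1, 0
    store i64 %x, i64* %p, align 2
    ret void
  }
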
diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
index 49f8e7b8cec8c2..06b5fd9cbb713b 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
@@ -161,6 +161,34 @@ define <16 x i8> @load_lane_i8_a2(i8* %p, <16 x i8> %v) {
   ret <16 x i8> %v1
 }
 
+; 1 is the default alignment for v128.store8_lane so no attribute is needed.
+define void @store_lane_i8_a1(<16 x i8> %v, i8* %p) {
+; CHECK-LABEL: store_lane_i8_a1:
+; CHECK: .functype store_lane_i8_a1 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store8_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %p, align 1
+  ret void
+}
+
+; 2 is greater than the default alignment so it is ignored.
+define void @store_lane_i8_a2(<16 x i8> %v, i8* %p) {
+; CHECK-LABEL: store_lane_i8_a2:
+; CHECK: .functype store_lane_i8_a2 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store8_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <16 x i8> %v, i32 0
+  store i8 %x, i8* %p, align 2
+  ret void
+}
+
 ; ==============================================================================
 ; 8 x i16
 ; ==============================================================================
@@ -462,6 +490,47 @@ define <8 x i16> @load_lane_i16_a4(i16* %p, <8 x i16> %v) {
   ret <8 x i16> %v1
 }
 
+define void @store_lane_i16_a1(<8 x i16> %v, i16* %p) {
+; CHECK-LABEL: store_lane_i16_a1:
+; CHECK: .functype store_lane_i16_a1 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store16_lane 0:p2align=0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %p, align 1
+  ret void
+}
+
+; 2 is the default alignment for v128.store16_lane so no attribute is needed.
+define void @store_lane_i16_a2(<8 x i16> %v, i16* %p) {
+; CHECK-LABEL: store_lane_i16_a2:
+; CHECK: .functype store_lane_i16_a2 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store16_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %p, align 2
+  ret void
+}
+
+; 4 is greater than the default alignment so it is ignored.
+define void @store_lane_i16_a4(<8 x i16> %v, i16* %p) {
+; CHECK-LABEL: store_lane_i16_a4:
+; CHECK: .functype store_lane_i16_a4 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store16_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <8 x i16> %v, i32 0
+  store i16 %x, i16* %p, align 4
+  ret void
+}
+
 ; ==============================================================================
 ; 4 x i32
 ; ==============================================================================
@@ -789,6 +858,60 @@ define <4 x i32> @load_lane_i32_a8(i32* %p, <4 x i32> %v) {
   ret <4 x i32> %v1
 }
 
+define void @store_lane_i32_a1(<4 x i32> %v, i32* %p) {
+; CHECK-LABEL: store_lane_i32_a1:
+; CHECK: .functype store_lane_i32_a1 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store32_lane 0:p2align=0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %p, align 1
+  ret void
+}
+
+define void @store_lane_i32_a2(<4 x i32> %v, i32* %p) {
+; CHECK-LABEL: store_lane_i32_a2:
+; CHECK: .functype store_lane_i32_a2 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store32_lane 0:p2align=1, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %p, align 2
+  ret void
+}
+
+; 4 is the default alignment for v128.store32_lane so no attribute is needed.
+define void @store_lane_i32_a4(<4 x i32> %v, i32* %p) {
+; CHECK-LABEL: store_lane_i32_a4:
+; CHECK: .functype store_lane_i32_a4 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store32_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %p, align 4
+  ret void
+}
+
+; 8 is greater than the default alignment so it is ignored.
+define void @store_lane_i32_a8(<4 x i32> %v, i32* %p) {
+; CHECK-LABEL: store_lane_i32_a8:
+; CHECK: .functype store_lane_i32_a8 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store32_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <4 x i32> %v, i32 0
+  store i32 %x, i32* %p, align 8
+  ret void
+}
+
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
@@ -1023,6 +1146,73 @@ define <2 x i64> @load_lane_i64_a16(i64* %p, <2 x i64> %v) {
   ret <2 x i64> %v1
 }
 
+define void @store_lane_i64_a1(<2 x i64> %v, i64* %p) {
+; CHECK-LABEL: store_lane_i64_a1:
+; CHECK: .functype store_lane_i64_a1 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store64_lane 0:p2align=0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %p, align 1
+  ret void
+}
+
+define void @store_lane_i64_a2(<2 x i64> %v, i64* %p) {
+; CHECK-LABEL: store_lane_i64_a2:
+; CHECK: .functype store_lane_i64_a2 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store64_lane 0:p2align=1, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %p, align 2
+  ret void
+}
+
+define void @store_lane_i64_a4(<2 x i64> %v, i64* %p) {
+; CHECK-LABEL: store_lane_i64_a4:
+; CHECK: .functype store_lane_i64_a4 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store64_lane 0:p2align=2, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %p, align 4
+  ret void
+}
+
+; 8 is the default alignment for v128.store64_lane so no attribute is needed.
+define void @store_lane_i64_a8(<2 x i64> %v, i64* %p) {
+; CHECK-LABEL: store_lane_i64_a8:
+; CHECK: .functype store_lane_i64_a8 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store64_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %p, align 8
+  ret void
+}
+
+; 16 is greater than the default alignment so it is ignored.
+define void @store_lane_i64_a16(<2 x i64> %v, i64* %p) {
+; CHECK-LABEL: store_lane_i64_a16:
+; CHECK: .functype store_lane_i64_a16 (v128, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.store64_lane 0, 0
+; CHECK-NEXT: # fallthrough-return
+  %x = extractelement <2 x i64> %v, i32 0
+  store i64 %x, i64* %p, align 16
+  ret void
+}
+
 ; ==============================================================================
 ; 4 x float
 ; ==============================================================================