diff --git a/clang/include/clang/Basic/BuiltinsSystemZ.def b/clang/include/clang/Basic/BuiltinsSystemZ.def
index 4cfc52ae42168..f0c0ebfa622a4 100644
--- a/clang/include/clang/Basic/BuiltinsSystemZ.def
+++ b/clang/include/clang/Basic/BuiltinsSystemZ.def
@@ -64,14 +64,14 @@ TARGET_BUILTIN(__builtin_s390_vupllh, "V4UiV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vupllf, "V2ULLiV4Ui", "nc", "vector")
 
 // Vector integer instructions (chapter 22 of the PoP)
-TARGET_BUILTIN(__builtin_s390_vaq, "V16UcV16UcV16Uc", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vacq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vaq, "SLLLiSLLLiSLLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vacq, "ULLLiULLLiULLLiULLLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vaccb, "V16UcV16UcV16Uc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vacch, "V8UsV8UsV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vaccf, "V4UiV4UiV4Ui", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vaccg, "V2ULLiV2ULLiV2ULLi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vaccq, "V16UcV16UcV16Uc", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vacccq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vaccq, "ULLLiULLLiULLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vacccq, "ULLLiULLLiULLLiULLLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vavgb, "V16ScV16ScV16Sc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vavgh, "V8SsV8SsV8Ss", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vavgf, "V4SiV4SiV4Si", "nc", "vector")
@@ -116,11 +116,11 @@ TARGET_BUILTIN(__builtin_s390_verllvg, "V2ULLiV2ULLiV2ULLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vgfmb, "V8UsV16UcV16Uc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vgfmh, "V4UiV8UsV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vgfmf, "V2ULLiV4UiV4Ui", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vgfmg, "V16UcV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmg, "ULLLiV2ULLiV2ULLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vgfmab, "V8UsV16UcV16UcV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vgfmah, "V4UiV8UsV8UsV4Ui", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vgfmaf, "V2ULLiV4UiV4UiV2ULLi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vgfmag, "V16UcV2ULLiV2ULLiV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vgfmag, "ULLLiV2ULLiV2ULLiULLLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vmahb, "V16ScV16ScV16ScV16Sc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vmahh, "V8SsV8SsV8SsV8Ss", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vmahf, "V4SiV4SiV4SiV4Si", "nc", "vector")
@@ -161,14 +161,14 @@ TARGET_BUILTIN(__builtin_s390_vpopctb, "V16UcV16Uc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vpopcth, "V8UsV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vpopctf, "V4UiV4Ui", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vpopctg, "V2ULLiV2ULLi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vsq, "V16UcV16UcV16Uc", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vsbcbiq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vsbiq, "V16UcV16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsq, "SLLLiSLLLiSLLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsbcbiq, "ULLLiULLLiULLLiULLLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsbiq, "ULLLiULLLiULLLiULLLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vscbib, "V16UcV16UcV16Uc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vscbih, "V8UsV8UsV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vscbif, "V4UiV4UiV4Ui", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vscbig, "V2ULLiV2ULLiV2ULLi", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vscbiq, "V16UcV16UcV16Uc", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vscbiq, "ULLLiULLLiULLLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vsl, "V16UcV16UcV16Uc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vslb, "V16UcV16UcV16Uc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vsldb, "V16UcV16UcV16UcIi", "nc", "vector")
@@ -180,8 +180,8 @@ TARGET_BUILTIN(__builtin_s390_vsumb, "V4UiV16UcV16Uc", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vsumh, "V4UiV8UsV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vsumgh, "V2ULLiV8UsV8Us", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vsumgf, "V2ULLiV4UiV4Ui", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vsumqf, "V16UcV4UiV4Ui", "nc", "vector")
-TARGET_BUILTIN(__builtin_s390_vsumqg, "V16UcV2ULLiV2ULLi", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumqf, "ULLLiV4UiV4Ui", "nc", "vector")
+TARGET_BUILTIN(__builtin_s390_vsumqg, "ULLLiV2ULLiV2ULLi", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vtm, "iV16UcV16Uc", "nc", "vector")
 
 // Vector string instructions (chapter 23 of the PoP)
@@ -256,7 +256,7 @@ TARGET_BUILTIN(__builtin_s390_vftcidb, "V2SLLiV2dIii*", "nc", "vector")
 TARGET_BUILTIN(__builtin_s390_vlrlr, "V16ScUivC*", "", "vector-enhancements-1")
 TARGET_BUILTIN(__builtin_s390_vstrlr, "vV16ScUiv*", "", "vector-enhancements-1")
 TARGET_BUILTIN(__builtin_s390_vbperm, "V2ULLiV16UcV16Uc", "nc", "vector-enhancements-1")
-TARGET_BUILTIN(__builtin_s390_vmslg, "V16UcV2ULLiV2ULLiV16UcIi", "nc", "vector-enhancements-1")
+TARGET_BUILTIN(__builtin_s390_vmslg, "ULLLiV2ULLiV2ULLiULLLiIi", "nc", "vector-enhancements-1")
 TARGET_BUILTIN(__builtin_s390_vfmaxdb, "V2dV2dV2dIi", "nc", "vector-enhancements-1")
 TARGET_BUILTIN(__builtin_s390_vfmindb, "V2dV2dV2dIi", "nc", "vector-enhancements-1")
 TARGET_BUILTIN(__builtin_s390_vfnmadb, "V2dV2dV2dV2d", "nc", "vector-enhancements-1")
diff --git a/clang/lib/Headers/vecintrin.h b/clang/lib/Headers/vecintrin.h
index ecfd6cd1a2f87..1f51e32c0d136 100644
--- a/clang/lib/Headers/vecintrin.h
+++ b/clang/lib/Headers/vecintrin.h
@@ -8359,7 +8359,7 @@ vec_min(__vector double __a, __vector double __b) {
 
 static inline __ATTRS_ai __vector unsigned char
 vec_add_u128(__vector unsigned char __a, __vector unsigned char __b) {
-  return __builtin_s390_vaq(__a, __b);
+  return (__vector unsigned char)((__int128)__a + (__int128)__b);
 }
 
 /*-- vec_addc ---------------------------------------------------------------*/
@@ -8388,7 +8388,8 @@ vec_addc(__vector unsigned long long __a, __vector unsigned long long __b) {
 
 static inline __ATTRS_ai __vector unsigned char
 vec_addc_u128(__vector unsigned char __a, __vector unsigned char __b) {
-  return __builtin_s390_vaccq(__a, __b);
+  return (__vector unsigned char)
+         __builtin_s390_vaccq((unsigned __int128)__a, (unsigned __int128)__b);
 }
 
 /*-- vec_adde_u128 ----------------------------------------------------------*/
@@ -8396,7 +8397,9 @@ vec_addc_u128(__vector unsigned char __a, __vector unsigned char __b) {
 static inline __ATTRS_ai __vector unsigned char
 vec_adde_u128(__vector unsigned char __a, __vector unsigned char __b,
               __vector unsigned char __c) {
-  return __builtin_s390_vacq(__a, __b, __c);
+  return (__vector unsigned char)
+         __builtin_s390_vacq((unsigned __int128)__a, (unsigned __int128)__b,
+                             (unsigned __int128)__c);
 }
 
 /*-- vec_addec_u128 ---------------------------------------------------------*/
@@ -8404,7 +8407,9 @@ vec_adde_u128(__vector unsigned char __a, __vector unsigned char __b,
 static inline __ATTRS_ai __vector unsigned char
 vec_addec_u128(__vector unsigned char __a, __vector unsigned char __b,
                __vector unsigned char __c) {
-  return __builtin_s390_vacccq(__a, __b, __c);
+  return (__vector unsigned char)
+         __builtin_s390_vacccq((unsigned __int128)__a, (unsigned __int128)__b,
+                               (unsigned __int128)__c);
 }
 
 /*-- vec_avg ----------------------------------------------------------------*/
@@ -8478,7 +8483,7 @@ vec_gfmsum(__vector unsigned int __a, __vector unsigned int __b) {
 
 static inline __ATTRS_o_ai __vector unsigned char
 vec_gfmsum_128(__vector unsigned long long __a,
               __vector unsigned long long __b) {
-  return __builtin_s390_vgfmg(__a, __b);
+  return (__vector unsigned char)__builtin_s390_vgfmg(__a, __b);
 }
 
 /*-- vec_gfmsum_accum -------------------------------------------------------*/
@@ -8507,7 +8512,8 @@ static inline __ATTRS_o_ai __vector unsigned char
 vec_gfmsum_accum_128(__vector unsigned long long __a,
                      __vector unsigned long long __b,
                      __vector unsigned char __c) {
-  return __builtin_s390_vgfmag(__a, __b, __c);
+  return (__vector unsigned char)
+         __builtin_s390_vgfmag(__a, __b, (unsigned __int128)__c);
 }
 
 /*-- vec_mladd --------------------------------------------------------------*/
@@ -8797,15 +8803,21 @@ vec_mulo(__vector unsigned int __a, __vector unsigned int __b) {
 /*-- vec_msum_u128 ----------------------------------------------------------*/
 
 #if __ARCH__ >= 12
+extern __ATTRS_o __vector unsigned char
+vec_msum_u128(__vector unsigned long long __a, __vector unsigned long long __b,
+              __vector unsigned char __c, int __d)
+  __constant_range(__d, 0, 15);
+
 #define vec_msum_u128(X, Y, Z, W) \
-  ((__vector unsigned char)__builtin_s390_vmslg((X), (Y), (Z), (W)));
+  ((__typeof__((vec_msum_u128)((X), (Y), (Z), (W)))) \
+   __builtin_s390_vmslg((X), (Y), (unsigned __int128)(Z), (W)))
 #endif
 
 /*-- vec_sub_u128 -----------------------------------------------------------*/
 
 static inline __ATTRS_ai __vector unsigned char
 vec_sub_u128(__vector unsigned char __a, __vector unsigned char __b) {
-  return __builtin_s390_vsq(__a, __b);
+  return (__vector unsigned char)((__int128)__a - (__int128)__b);
 }
 
 /*-- vec_subc ---------------------------------------------------------------*/
@@ -8834,7 +8846,8 @@ vec_subc(__vector unsigned long long __a, __vector unsigned long long __b) {
 
 static inline __ATTRS_ai __vector unsigned char
 vec_subc_u128(__vector unsigned char __a, __vector unsigned char __b) {
-  return __builtin_s390_vscbiq(__a, __b);
+  return (__vector unsigned char)
+         __builtin_s390_vscbiq((unsigned __int128)__a, (unsigned __int128)__b);
 }
 
 /*-- vec_sube_u128 ----------------------------------------------------------*/
@@ -8842,7 +8855,9 @@ vec_subc_u128(__vector unsigned char __a, __vector unsigned char __b) {
 static inline __ATTRS_ai __vector unsigned char
 vec_sube_u128(__vector unsigned char __a, __vector unsigned char __b,
               __vector unsigned char __c) {
-  return __builtin_s390_vsbiq(__a, __b, __c);
+  return (__vector unsigned char)
+         __builtin_s390_vsbiq((unsigned __int128)__a, (unsigned __int128)__b,
+                              (unsigned __int128)__c);
 }
 
 /*-- vec_subec_u128 ---------------------------------------------------------*/
@@ -8850,7 +8865,9 @@ vec_sube_u128(__vector unsigned char __a, __vector unsigned char __b,
 static inline __ATTRS_ai __vector unsigned char
 vec_subec_u128(__vector unsigned char __a, __vector unsigned char __b,
               __vector unsigned char __c) {
-  return __builtin_s390_vsbcbiq(__a, __b, __c);
+  return (__vector unsigned char)
+         __builtin_s390_vsbcbiq((unsigned __int128)__a, (unsigned __int128)__b,
+                                (unsigned __int128)__c);
 }
 
 /*-- vec_sum2 ---------------------------------------------------------------*/
@@ -8869,12 +8886,12 @@ vec_sum2(__vector unsigned int __a, __vector unsigned int __b) {
 
 static inline __ATTRS_o_ai __vector unsigned char
 vec_sum_u128(__vector unsigned int __a, __vector unsigned int __b) {
-  return __builtin_s390_vsumqf(__a, __b);
+  return (__vector unsigned char)__builtin_s390_vsumqf(__a, __b);
 }
 
 static inline __ATTRS_o_ai __vector unsigned char
 vec_sum_u128(__vector unsigned long long __a, __vector unsigned long long __b) {
-  return __builtin_s390_vsumqg(__a, __b);
+  return (__vector unsigned char)__builtin_s390_vsumqg(__a, __b);
 }
 
 /*-- vec_sum4 ---------------------------------------------------------------*/
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c
index cf8ee6f7d002b..312a9a156d21e 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-error2.c
@@ -1,11 +1,8 @@
 // REQUIRES: systemz-registered-target
 // RUN: %clang_cc1 -triple s390x-ibm-linux -S -emit-llvm %s -verify -o -
 
-typedef __attribute__((vector_size(16))) char v16i8;
-
-v16i8 f0(v16i8 a, v16i8 b) {
-  __builtin_tbegin ((void *)0);         // expected-error {{'__builtin_tbegin' needs target feature transactional-execution}}
-  v16i8 tmp = __builtin_s390_vaq(a, b); // expected-error {{'__builtin_s390_vaq' needs target feature vector}}
-  return tmp;
+__int128 f0(__int128 a, __int128 b) {
+  __builtin_tbegin ((void *)0);    // expected-error {{'__builtin_tbegin' needs target feature transactional-execution}}
+  return __builtin_s390_vaq(a, b); // expected-error {{'__builtin_s390_vaq' needs target feature vector}}
 }
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
index 877032a52a0ae..31b8cd11ea79f 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
@@ -21,6 +21,8 @@ volatile vec_ushort vus;
 volatile vec_uint vui;
 volatile vec_ulong vul;
 volatile vec_double vd;
+volatile signed __int128 si128;
+volatile unsigned __int128 ui128;
 
 volatile unsigned int len;
 volatile unsigned char amt;
@@ -111,14 +113,14 @@ void test_core(void) {
 }
 
 void test_integer(void) {
-  vuc = __builtin_s390_vaq(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vaq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  vuc = __builtin_s390_vacq(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vacq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  vuc = __builtin_s390_vaccq(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vaccq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  vuc = __builtin_s390_vacccq(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vacccq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  si128 = __builtin_s390_vaq(si128, si128);
+  // CHECK: call i128 @llvm.s390.vaq(i128 %{{.*}}, i128 %{{.*}})
+  ui128 = __builtin_s390_vacq(ui128, ui128, ui128);
+  // CHECK: call i128 @llvm.s390.vacq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
+  ui128 = __builtin_s390_vaccq(ui128, ui128);
+  // CHECK: call i128 @llvm.s390.vaccq(i128 %{{.*}}, i128 %{{.*}})
+  ui128 = __builtin_s390_vacccq(ui128, ui128, ui128);
+  // CHECK: call i128 @llvm.s390.vacccq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
 
   vuc = __builtin_s390_vaccb(vuc, vuc);
   // CHECK: call <16 x i8> @llvm.s390.vaccb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
@@ -209,8 +211,8 @@ void test_integer(void) {
   // CHECK: call <4 x i32> @llvm.s390.vgfmh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   vul = __builtin_s390_vgfmf(vui, vui);
   // CHECK: call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
-  vuc = __builtin_s390_vgfmg(vul, vul);
-  // CHECK: call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  ui128 = __builtin_s390_vgfmg(vul, vul);
+  // CHECK: call i128 @llvm.s390.vgfmg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
 
   vus = __builtin_s390_vgfmab(vuc, vuc, vus);
   // CHECK: call <8 x i16> @llvm.s390.vgfmab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i16> %{{.*}})
@@ -218,8 +220,8 @@ void test_integer(void) {
   // CHECK: call <4 x i32> @llvm.s390.vgfmah(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   vul = __builtin_s390_vgfmaf(vui, vui, vul);
   // CHECK: call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
-  vuc = __builtin_s390_vgfmag(vul, vul, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}})
+  ui128 = __builtin_s390_vgfmag(vul, vul, ui128);
+  // CHECK: call i128 @llvm.s390.vgfmag(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}})
 
   vsc = __builtin_s390_vmahb(vsc, vsc, vsc);
   // CHECK: call <16 x i8> @llvm.s390.vmahb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
@@ -308,14 +310,14 @@ void test_integer(void) {
   vul = __builtin_s390_vpopctg(vul);
   // CHECK: call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %{{.*}})
 
-  vuc = __builtin_s390_vsq(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vsq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  vuc = __builtin_s390_vsbiq(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  vuc = __builtin_s390_vscbiq(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  vuc = __builtin_s390_vsbcbiq(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  si128 = __builtin_s390_vsq(si128, si128);
+  // CHECK: call i128 @llvm.s390.vsq(i128 %{{.*}}, i128 %{{.*}})
+  ui128 = __builtin_s390_vsbiq(ui128, ui128, ui128);
+  // CHECK: call i128 @llvm.s390.vsbiq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
+  ui128 = __builtin_s390_vscbiq(ui128, ui128);
+  // CHECK: call i128 @llvm.s390.vscbiq(i128 %{{.*}}, i128 %{{.*}})
+  ui128 = __builtin_s390_vsbcbiq(ui128, ui128, ui128);
+  // CHECK: call i128 @llvm.s390.vsbcbiq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
 
   vuc = __builtin_s390_vscbib(vuc, vuc);
   // CHECK: call <16 x i8> @llvm.s390.vscbib(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
@@ -354,10 +356,10 @@ void test_integer(void) {
   // CHECK: call <2 x i64> @llvm.s390.vsumgh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   vul = __builtin_s390_vsumgf(vui, vui);
   // CHECK: call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
-  vuc = __builtin_s390_vsumqf(vui, vui);
-  // CHECK: call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
-  vuc = __builtin_s390_vsumqg(vul, vul);
-  // CHECK: call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  ui128 = __builtin_s390_vsumqf(vui, vui);
+  // CHECK: call i128 @llvm.s390.vsumqf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  ui128 = __builtin_s390_vsumqg(vul, vul);
+  // CHECK: call i128 @llvm.s390.vsumqg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
 
   len = __builtin_s390_vtm(vuc, vuc);
   // CHECK: call i32 @llvm.s390.vtm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
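[Reviewer note] A minimal usage sketch (not part of the patch) of the retyped builtins, assuming a vector-enabled target such as z13; the function name and the carry_out parameter are illustrative only:

// Hypothetical example: per the new BuiltinsSystemZ.def signatures, the
// 128-bit add family now takes and returns (unsigned) __int128 scalars
// instead of being laundered through __vector unsigned char.
unsigned __int128 add128_with_carry_out(unsigned __int128 a, unsigned __int128 b,
                                        unsigned __int128 *carry_out) {
  *carry_out = __builtin_s390_vaccq(a, b); // carry-out bit of the 128-bit add
  return a + b;                            // plain __int128 add, selected as VAQ
}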
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-vector2-error.c b/clang/test/CodeGen/SystemZ/builtins-systemz-vector2-error.c
index cd27ac79e15d4..af3b4f191879e 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-vector2-error.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-vector2-error.c
@@ -23,14 +23,15 @@ volatile vec_uint vui;
 volatile vec_ulong vul;
 volatile vec_double vd;
 volatile vec_float vf;
+volatile unsigned __int128 ui128;
 
 volatile unsigned int len;
 int cc;
 
 void test_integer(void) {
-  __builtin_s390_vmslg(vul, vul, vuc, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
-  __builtin_s390_vmslg(vul, vul, vuc, 16);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
-  __builtin_s390_vmslg(vul, vul, vuc, len); // expected-error {{must be a constant integer}}
+  __builtin_s390_vmslg(vul, vul, ui128, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  __builtin_s390_vmslg(vul, vul, ui128, 16);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  __builtin_s390_vmslg(vul, vul, ui128, len); // expected-error {{must be a constant integer}}
 }
 
 void test_float(void) {
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-vector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-vector2.c
index 5e287e28ed201..3761f252d724b 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-vector2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-vector2.c
@@ -23,6 +23,7 @@ volatile vec_uint vui;
 volatile vec_ulong vul;
 volatile vec_double vd;
 volatile vec_float vf;
+volatile unsigned __int128 ui128;
 
 volatile unsigned int len;
 const void * volatile cptr;
@@ -41,10 +42,10 @@ void test_core(void) {
 }
 
 void test_integer(void) {
-  vuc = __builtin_s390_vmslg(vul, vul, vuc, 0);
-  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
-  vuc = __builtin_s390_vmslg(vul, vul, vuc, 15);
-  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 15)
+  ui128 = __builtin_s390_vmslg(vul, vul, ui128, 0);
+  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 0)
+  ui128 = __builtin_s390_vmslg(vul, vul, ui128, 15);
+  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 15)
 }
 
 void test_float(void) {
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
index 0dc2fa7c66dd2..5b6973787d1d7 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c
@@ -3413,16 +3413,15 @@ void test_integer(void) {
   // CHECK-ASM: vaccg
 
   vuc = vec_add_u128(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vaq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK-ASM: vaq
   vuc = vec_addc_u128(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vaccq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vaccq(i128 %{{.*}}, i128 %{{.*}})
   // CHECK-ASM: vaccq
   vuc = vec_adde_u128(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vacq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vacq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
   // CHECK-ASM: vacq
   vuc = vec_addec_u128(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vacccq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vacccq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
   // CHECK-ASM: vacccq
 
   vsc = vec_avg(vsc, vsc);
@@ -3464,7 +3463,7 @@ void test_integer(void) {
   // CHECK: call <2 x i64> @llvm.s390.vgfmf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK-ASM: vgfmf
   vuc = vec_gfmsum_128(vul, vul);
-  // CHECK: call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vgfmg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK-ASM: vgfmg
 
   vus = vec_gfmsum_accum(vuc, vuc, vus);
@@ -3477,7 +3476,7 @@ void test_integer(void) {
   // CHECK: call <2 x i64> @llvm.s390.vgfmaf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK-ASM: vgfmaf
   vuc = vec_gfmsum_accum_128(vul, vul, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vgfmag(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}})
   // CHECK-ASM: vgfmag
 
   vsc = vec_mladd(vsc, vsc, vsc);
@@ -3633,16 +3632,15 @@ void test_integer(void) {
   // CHECK-ASM: vscbig
 
   vuc = vec_sub_u128(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vsq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK-ASM: vsq
   vuc = vec_subc_u128(vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vscbiq(i128 %{{.*}}, i128 %{{.*}})
   // CHECK-ASM: vscbiq
   vuc = vec_sube_u128(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vsbiq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
   // CHECK-ASM: vsbiq
   vuc = vec_subec_u128(vuc, vuc, vuc);
-  // CHECK: call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vsbcbiq(i128 %{{.*}}, i128 %{{.*}}, i128 %{{.*}})
   // CHECK-ASM: vsbcbiq
 
   vui = vec_sum4(vuc, vuc);
@@ -3658,10 +3656,10 @@ void test_integer(void) {
   // CHECK: call <2 x i64> @llvm.s390.vsumgf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK-ASM: vsumgf
   vuc = vec_sum_u128(vui, vui);
-  // CHECK: call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vsumqf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK-ASM: vsumqf
   vuc = vec_sum_u128(vul, vul);
-  // CHECK: call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK: call i128 @llvm.s390.vsumqg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK-ASM: vsumqg
 
   idx = vec_test_mask(vsc, vuc);
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c
index b7e0d77efca6d..127b0f67e85c9 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2-error.c
@@ -126,9 +126,12 @@ void test_integer(void) {
   // expected-note@vecintrin.h:* 13 {{candidate function not viable}}
   // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
 
-  vuc = vec_msum_u128(vul, vul, vuc, idx); // expected-error {{must be a constant integer}}
-  vuc = vec_msum_u128(vul, vul, vuc, -1);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
-  vuc = vec_msum_u128(vul, vul, vuc, 16);  // expected-error-re {{argument value {{.*}} is outside the valid range}}
+  vuc = vec_msum_u128(vul, vul, vuc, idx); // expected-error {{no matching function}} expected-error {{argument to '__builtin_s390_vmslg' must be a constant integer}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
+  vuc = vec_msum_u128(vul, vul, vuc, -1); // expected-error {{no matching function}} expected-error {{argument value -1 is outside the valid range [0, 15]}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
+  vuc = vec_msum_u128(vul, vul, vuc, 16); // expected-error {{no matching function}} expected-error {{argument value 16 is outside the valid range [0, 15]}}
+  // expected-note@vecintrin.h:* 1 {{must be a constant integer from 0 to 15}}
 }
 
 void test_float(void) {
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
index 416ca0ddd1b4f..874840b3e6bce 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector2.c
@@ -661,16 +661,16 @@ void test_integer(void) {
   // CHECK-ASM: vtm
 
   vuc = vec_msum_u128(vul, vul, vuc, 0);
-  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 0)
+  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 0)
   // CHECK-ASM: vmslg
   vuc = vec_msum_u128(vul, vul, vuc, 4);
-  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 4)
+  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 4)
   // CHECK-ASM: vmslg
   vuc = vec_msum_u128(vul, vul, vuc, 8);
-  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 8)
+  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 8)
   // CHECK-ASM: vmslg
   vuc = vec_msum_u128(vul, vul, vuc, 12);
-  // CHECK: call <16 x i8> @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <16 x i8> %{{.*}}, i32 12)
+  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 12)
   // CHECK-ASM: vmslg
 }
 
diff --git a/llvm/include/llvm/IR/IntrinsicsSystemZ.td b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
index a9d80ee5a5c7a..9f4b905fedc7c 100644
--- a/llvm/include/llvm/IR/IntrinsicsSystemZ.td
+++ b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
@@ -114,7 +114,7 @@ multiclass SystemZBinaryExtBHF<string name> {
 }
 
 multiclass SystemZBinaryExtBHFG<string name> : SystemZBinaryExtBHF<name> {
-  def g : SystemZBinaryConv<name#"g", llvm_v16i8_ty, llvm_v2i64_ty>;
+  def g : SystemZBinaryConv<name#"g", llvm_i128_ty, llvm_v2i64_ty>;
 }
 
 multiclass SystemZBinaryBHF<string name> {
@@ -147,7 +147,7 @@ multiclass SystemZTernaryExtBHF<string name> {
 }
 
 multiclass SystemZTernaryExtBHFG<string name> : SystemZTernaryExtBHF<name> {
-  def g : SystemZTernaryConv<name#"g", llvm_v16i8_ty, llvm_v2i64_ty>;
+  def g : SystemZTernaryConv<name#"g", llvm_i128_ty, llvm_v2i64_ty>;
 }
 
 multiclass SystemZTernaryBHF<string name> {
@@ -265,10 +265,10 @@ let TargetPrefix = "s390" in {
 
   defm int_s390_vacc : SystemZBinaryBHFG<"vacc">;
 
-  def int_s390_vaq    : SystemZBinary<"vaq", llvm_v16i8_ty>;
-  def int_s390_vacq   : SystemZTernary<"vacq", llvm_v16i8_ty>;
-  def int_s390_vaccq  : SystemZBinary<"vaccq", llvm_v16i8_ty>;
-  def int_s390_vacccq : SystemZTernary<"vacccq", llvm_v16i8_ty>;
+  def int_s390_vaq    : SystemZBinary<"vaq", llvm_i128_ty>;
+  def int_s390_vacq   : SystemZTernary<"vacq", llvm_i128_ty>;
+  def int_s390_vaccq  : SystemZBinary<"vaccq", llvm_i128_ty>;
+  def int_s390_vacccq : SystemZTernary<"vacccq", llvm_i128_ty>;
 
   defm int_s390_vavg  : SystemZBinaryBHFG<"vavg">;
   defm int_s390_vavgl : SystemZBinaryBHFG<"vavgl">;
@@ -308,10 +308,10 @@ let TargetPrefix = "s390" in {
 
   defm int_s390_vscbi : SystemZBinaryBHFG<"vscbi">;
 
-  def int_s390_vsq     : SystemZBinary<"vsq", llvm_v16i8_ty>;
-  def int_s390_vsbiq   : SystemZTernary<"vsbiq", llvm_v16i8_ty>;
-  def int_s390_vscbiq  : SystemZBinary<"vscbiq", llvm_v16i8_ty>;
-  def int_s390_vsbcbiq : SystemZTernary<"vsbcbiq", llvm_v16i8_ty>;
+  def int_s390_vsq     : SystemZBinary<"vsq", llvm_i128_ty>;
+  def int_s390_vsbiq   : SystemZTernary<"vsbiq", llvm_i128_ty>;
+  def int_s390_vscbiq  : SystemZBinary<"vscbiq", llvm_i128_ty>;
+  def int_s390_vsbcbiq : SystemZTernary<"vsbcbiq", llvm_i128_ty>;
 
   def int_s390_vsumb : SystemZBinaryConv<"vsumb", llvm_v4i32_ty, llvm_v16i8_ty>;
   def int_s390_vsumh : SystemZBinaryConv<"vsumh", llvm_v4i32_ty, llvm_v8i16_ty>;
@@ -321,9 +321,9 @@ let TargetPrefix = "s390" in {
   def int_s390_vsumgf : SystemZBinaryConv<"vsumgf", llvm_v2i64_ty,
                                           llvm_v4i32_ty>;
-  def int_s390_vsumqf : SystemZBinaryConv<"vsumqf", llvm_v16i8_ty,
+  def int_s390_vsumqf : SystemZBinaryConv<"vsumqf", llvm_i128_ty,
                                           llvm_v4i32_ty>;
-  def int_s390_vsumqg : SystemZBinaryConv<"vsumqg", llvm_v16i8_ty,
+  def int_s390_vsumqg : SystemZBinaryConv<"vsumqg", llvm_i128_ty,
                                           llvm_v2i64_ty>;
 
   def int_s390_vtm : SystemZBinaryConv<"vtm", llvm_i32_ty, llvm_v16i8_ty>;
@@ -370,8 +370,8 @@ let TargetPrefix = "s390" in {
                 llvm_v16i8_ty>;
 
   def int_s390_vmslg : ClangBuiltin<"__builtin_s390_vmslg">,
-                       Intrinsic<[llvm_v16i8_ty],
-                                 [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v16i8_ty,
+                       Intrinsic<[llvm_i128_ty],
+                                 [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i128_ty,
                                   llvm_i32_ty],
                                  [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
   def int_s390_vfmaxdb : Intrinsic<[llvm_v2f64_ty],
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 798e6a1d9525e..d4fc9d8a96db2 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1063,6 +1063,10 @@ def extloadi32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
   let IsLoad = true;
   let MemoryVT = i32;
 }
+def extloadi64 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
+  let IsLoad = true;
+  let MemoryVT = i64;
+}
 def extloadf16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
   let IsLoad = true;
   let MemoryVT = f16;
@@ -1092,6 +1096,10 @@ def sextloadi32 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
   let IsLoad = true;
   let MemoryVT = i32;
 }
+def sextloadi64 : PatFrag<(ops node:$ptr), (sextload node:$ptr)> {
+  let IsLoad = true;
+  let MemoryVT = i64;
+}
 
 def zextloadi1 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
   let IsLoad = true;
@@ -1109,6 +1117,10 @@ def zextloadi32 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
   let IsLoad = true;
   let MemoryVT = i32;
 }
+def zextloadi64 : PatFrag<(ops node:$ptr), (zextload node:$ptr)> {
+  let IsLoad = true;
+  let MemoryVT = i64;
+}
 
 def extloadvi1 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {
   let IsLoad = true;
@@ -1209,6 +1221,12 @@ def truncstorei32 : PatFrag<(ops node:$val, node:$ptr),
   let MemoryVT = i32;
   let IsTruncStore = true;
 }
+def truncstorei64 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr)> {
+  let IsStore = true;
+  let MemoryVT = i64;
+  let IsTruncStore = true;
+}
 def truncstoref16 : PatFrag<(ops node:$val, node:$ptr),
                             (truncstore node:$val, node:$ptr)> {
   let IsStore = true;
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 29b4a26736b25..136d3d2547219 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -102,9 +102,10 @@ def CC_SystemZ_ELF : CallingConv<[
   // A SwiftError is passed in callee-saved R9.
   CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R9D]>>>,
 
-  // Force long double values to the stack and pass i64 pointers to them.
-  CCIfType<[f128], CCPassIndirect<i64>>,
-  // Same for i128 values.  These are already split into two i64 here,
+  // Force i128 (if the type is legal) and long double values to the stack
+  // and pass i64 pointers to them.
+  CCIfType<[i128, f128], CCPassIndirect<i64>>,
+  // If i128 is not legal, such values are already split into two i64 here,
   // so we have to use a custom handler.
   CCIfType<[i64], CCCustom<"CC_SystemZ_I128Indirect">>,
 
@@ -240,9 +241,10 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
   // A SwiftError is passed in R0.
   CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R0D]>>>,
 
-  // First i128 values. These are already split into two i64 here,
-  // so we have to use a custom handler and assign into registers, if possible
-  // We need to deal with this first
+  // Force i128 values to the stack and pass i64 pointers to them.
+  CCIfType<[i128], CCPassIndirect<i64>>,
+  // If i128 is not legal, such values are already split into two i64 here,
+  // so we have to use a custom handler.
   CCIfType<[i64], CCCustom<"CC_SystemZ_I128Indirect">>,
   // The first 3 integer arguments are passed in registers R1D-R3D.
   // The rest will be passed in the user area. The address offset of the user
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 0d8d5451b8136..e5e1e91916f32 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -307,6 +307,8 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   void loadVectorConstant(const SystemZVectorConstantInfo &VCI,
                           SDNode *Node);
 
+  SDNode *loadPoolVectorConstant(APInt Val, EVT VT, SDLoc DL);
+
   // Try to use gather instruction Opcode to implement vector insertion N.
   bool tryGather(SDNode *N, unsigned Opcode);
 
@@ -784,6 +786,8 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
   case ISD::TRUNCATE: {
     if (RxSBG.Opcode == SystemZ::RNSBG)
       return false;
+    if (N.getOperand(0).getValueSizeInBits() > 64)
+      return false;
     uint64_t BitSize = N.getValueSizeInBits();
     uint64_t Mask = allOnes(BitSize);
     if (!refineRxSBGMask(RxSBG, Mask))
@@ -1183,6 +1187,35 @@ void SystemZDAGToDAGISel::loadVectorConstant(
   SelectCode(Op.getNode());
 }
 
+SDNode *SystemZDAGToDAGISel::loadPoolVectorConstant(APInt Val, EVT VT, SDLoc DL) {
+  SDNode *ResNode;
+  assert (VT.getSizeInBits() == 128);
+
+  SDValue CP = CurDAG->getTargetConstantPool(
+      ConstantInt::get(Type::getInt128Ty(*CurDAG->getContext()), Val),
+      TLI->getPointerTy(CurDAG->getDataLayout()));
+
+  EVT PtrVT = CP.getValueType();
+  SDValue Ops[] = {
+    SDValue(CurDAG->getMachineNode(SystemZ::LARL, DL, PtrVT, CP), 0),
+    CurDAG->getTargetConstant(0, DL, PtrVT),
+    CurDAG->getRegister(0, PtrVT),
+    CurDAG->getEntryNode()
+  };
+  ResNode = CurDAG->getMachineNode(SystemZ::VL, DL, VT, MVT::Other, Ops);
+
+  // Annotate ResNode with memory operand information so that MachineInstr
+  // queries work properly. This e.g. gives the register allocation the
+  // required information for rematerialization.
+  MachineFunction& MF = CurDAG->getMachineFunction();
+  MachineMemOperand *MemOp =
+      MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
+                              MachineMemOperand::MOLoad, 16, Align(8));
+
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp});
+  return ResNode;
+}
+
 bool SystemZDAGToDAGISel::tryGather(SDNode *N, unsigned Opcode) {
   SDValue ElemV = N->getOperand(2);
   auto *ElemN = dyn_cast<ConstantSDNode>(ElemV);
@@ -1582,6 +1615,27 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
       return;
     break;
 
+  case ISD::BSWAP:
+    if (Node->getValueType(0) == MVT::i128) {
+      SDLoc DL(Node);
+      SDValue Src = Node->getOperand(0);
+      Src = CurDAG->getNode(ISD::BITCAST, DL, MVT::v16i8, Src);
+
+      uint64_t Bytes[2] = { 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL };
+      SDNode *Mask = loadPoolVectorConstant(APInt(128, Bytes), MVT::v16i8, DL);
+      SDValue Ops[] = { Src, Src, SDValue(Mask, 0) };
+      SDValue Res = SDValue(CurDAG->getMachineNode(SystemZ::VPERM, DL,
+                                                   MVT::v16i8, Ops), 0);
+
+      Res = CurDAG->getNode(ISD::BITCAST, DL, MVT::i128, Res);
+      SDNode *ResNode = Res.getNode();
+      ReplaceNode(Node, ResNode);
+      SelectCode(Src.getNode());
+      SelectCode(ResNode);
+      return;
+    }
+    break;
+
   case ISD::Constant:
     // If this is a 64-bit constant that is out of the range of LLILF,
     // LLIHF and LGFI, split it into two 32-bit pieces.
@@ -1593,6 +1647,18 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
         return;
       }
     }
+    if (Node->getValueType(0) == MVT::i128) {
+      const APInt &Val = cast<ConstantSDNode>(Node)->getAPIntValue();
+      SystemZVectorConstantInfo VCI(Val);
+      if (VCI.isVectorConstantLegal(*Subtarget)) {
+        loadVectorConstant(VCI, Node);
+        return;
+      }
+      // If we can't materialize the constant we need to use a literal pool.
+      SDNode *ResNode = loadPoolVectorConstant(Val, MVT::i128, SDLoc(Node));
+      ReplaceNode(Node, ResNode);
+      return;
+    }
    break;
 
   case SystemZISD::SELECT_CCMASK: {
@@ -1603,6 +1669,7 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {
     if ((Op1.getOpcode() == ISD::LOAD && Op0.getOpcode() != ISD::LOAD) ||
         (Subtarget->hasLoadStoreOnCond2() &&
          Node->getValueType(0).isInteger() &&
+         Node->getValueType(0).getSizeInBits() <= 64 &&
         Op1.getOpcode() == ISD::Constant &&
         isInt<16>(cast<ConstantSDNode>(Op1)->getSExtValue()) &&
         !(Op0.getOpcode() == ISD::Constant &&
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index f79787d4baa4d..d82910a0b2177 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -112,6 +112,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
     addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
   }
+
+  if (Subtarget.hasVector())
+    addRegisterClass(MVT::i128, &SystemZ::VR128BitRegClass);
 }
 
 // Compute derived properties from the register classes
@@ -163,12 +166,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   // Expand BRCOND into a BR_CC (see above).
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
 
-  // Handle integer types.
+  // Handle integer types except i128.
   for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
        I <= MVT::LAST_INTEGER_VALUETYPE;
        ++I) {
     MVT VT = MVT::SimpleValueType(I);
-    if (isTypeLegal(VT)) {
+    if (isTypeLegal(VT) && VT != MVT::i128) {
       setOperationAction(ISD::ABS, VT, Legal);
 
       // Expand individual DIV and REMs into DIVREMs.
@@ -236,6 +239,45 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     }
   }
 
+  // Handle i128 if legal.
+  if (isTypeLegal(MVT::i128)) {
+    // No special instructions for these.
+    setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i128, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i128, Expand);
+    setOperationAction(ISD::ROTR, MVT::i128, Expand);
+    setOperationAction(ISD::ROTL, MVT::i128, Expand);
+    setOperationAction(ISD::MUL, MVT::i128, Expand);
+    setOperationAction(ISD::MULHS, MVT::i128, Expand);
+    setOperationAction(ISD::MULHU, MVT::i128, Expand);
+    setOperationAction(ISD::SDIV, MVT::i128, Expand);
+    setOperationAction(ISD::UDIV, MVT::i128, Expand);
+    setOperationAction(ISD::SREM, MVT::i128, Expand);
+    setOperationAction(ISD::UREM, MVT::i128, Expand);
+    setOperationAction(ISD::CTLZ, MVT::i128, Expand);
+    setOperationAction(ISD::CTTZ, MVT::i128, Expand);
+
+    // Support addition/subtraction with carry.
+    setOperationAction(ISD::UADDO, MVT::i128, Custom);
+    setOperationAction(ISD::USUBO, MVT::i128, Custom);
+    setOperationAction(ISD::UADDO_CARRY, MVT::i128, Custom);
+    setOperationAction(ISD::USUBO_CARRY, MVT::i128, Custom);
+
+    // Use VPOPCT and add up partial results.
+    setOperationAction(ISD::CTPOP, MVT::i128, Custom);
+
+    // We have to use libcalls for these.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i128, LibCall);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i128, LibCall);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i128, LibCall);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i128, LibCall);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, LibCall);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, LibCall);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, LibCall);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, LibCall);
+  }
+
   // Type legalization will convert 8- and 16-bit atomic operations into
   // forms that operate on i32s (but still keeping the original memory VT).
   // Lower them into full i32 operations.
@@ -251,7 +293,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
 
-  // Even though i128 is not a legal type, we still need to custom lower
+  // Whether or not i128 is a legal type, we need to custom lower
   // the atomic operations in order to exploit SystemZ instructions.
   setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
   setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
@@ -299,7 +341,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setLibcallName(RTLIB::SRA_I128, nullptr);
 
   // Handle bitcast from fp128 to i128.
-  setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+  if (!isTypeLegal(MVT::i128))
+    setOperationAction(ISD::BITCAST, MVT::i128, Custom);
 
   // We have native instructions for i8, i16 and i32 extensions, but not i1.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -1474,7 +1517,15 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
 static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
   SDLoc DL(In);
   SDValue Lo, Hi;
-  std::tie(Lo, Hi) = DAG.SplitScalar(In, DL, MVT::i64, MVT::i64);
+  if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128)) {
+    Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, In);
+    Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
+                     DAG.getNode(ISD::SRL, DL, MVT::i128, In,
+                                 DAG.getConstant(64, DL, MVT::i32)));
+  } else {
+    std::tie(Lo, Hi) = DAG.SplitScalar(In, DL, MVT::i64, MVT::i64);
+  }
+
   SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
                                     MVT::Untyped, Hi, Lo);
   return SDValue(Pair, 0);
@@ -1486,7 +1537,16 @@ static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
                                           DL, MVT::i64, In);
   SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
                                           DL, MVT::i64, In);
-  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
+
+  if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128)) {
+    Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, Lo);
+    Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, Hi);
+    Hi = DAG.getNode(ISD::SHL, DL, MVT::i128, Hi,
+                     DAG.getConstant(64, DL, MVT::i32));
+    return DAG.getNode(ISD::OR, DL, MVT::i128, Lo, Hi);
+  } else {
+    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
+  }
 }
 
 bool SystemZTargetLowering::splitValueIntoRegisterParts(
@@ -2084,7 +2144,7 @@ CanLowerReturn(CallingConv::ID CallConv,
     VerifyVectorTypes(Outs);
 
   // Special case that we cannot easily detect in RetCC_SystemZ since
-  // i128 is not a legal type.
+  // i128 may not be a legal type.
   for (auto &Out : Outs)
     if (Out.ArgVT == MVT::i128)
       return false;
@@ -2403,7 +2463,7 @@ static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
     return;
 
   auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
-  if (!ConstOp1)
+  if (!ConstOp1 || ConstOp1->getValueSizeInBits(0) > 64)
     return;
 
   int64_t Value = ConstOp1->getSExtValue();
@@ -2437,6 +2497,8 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
   // The load must be an extending one and the constant must be within the
   // range of the unextended value.
   auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
+  if (!ConstOp1 || ConstOp1->getValueSizeInBits(0) > 64)
+    return;
   uint64_t Value = ConstOp1->getZExtValue();
   uint64_t Mask = (1 << NumBits) - 1;
   if (Load->getExtensionType() == ISD::SEXTLOAD) {
@@ -2515,7 +2577,9 @@ static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
 
 // Return true if it is better to swap the operands of C.
 static bool shouldSwapCmpOperands(const Comparison &C) {
-  // Leave f128 comparisons alone, since they have no memory forms.
+  // Leave i128 and f128 comparisons alone, since they have no memory forms.
+  if (C.Op0.getValueType() == MVT::i128)
+    return false;
   if (C.Op0.getValueType() == MVT::f128)
     return false;
 
@@ -2652,6 +2716,7 @@ static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
   if (C.Op0.getOpcode() == ISD::TRUNCATE &&
       C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
      C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 &&
      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
     auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
     if (L->getMemoryVT().getStoreSizeInBits().getFixedValue() <=
@@ -2780,6 +2845,27 @@ static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
 // Update the arguments with the TM version if so.
 static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
                                    Comparison &C) {
+  // Use VECTOR TEST UNDER MASK for i128 operations.
+  if (C.Op0.getValueType() == MVT::i128) {
+    // We can use VTM for EQ/NE comparisons of x & y against 0.
+    if (C.Op0.getOpcode() == ISD::AND &&
+        (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+         C.CCMask == SystemZ::CCMASK_CMP_NE)) {
+      auto *Mask = dyn_cast<ConstantSDNode>(C.Op1);
+      if (Mask && Mask->getAPIntValue() == 0) {
+        C.Opcode = SystemZISD::VTM;
+        C.Op1 = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, C.Op0.getOperand(1));
+        C.Op0 = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, C.Op0.getOperand(0));
+        C.CCValid = SystemZ::CCMASK_VCMP;
+        if (C.CCMask == SystemZ::CCMASK_CMP_EQ)
+          C.CCMask = SystemZ::CCMASK_VCMP_ALL;
+        else
+          C.CCMask = SystemZ::CCMASK_VCMP_ALL ^ C.CCValid;
+      }
+    }
+    return;
+  }
+
   // Check that we have a comparison with a constant.
   auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
   if (!ConstOp1)
@@ -2866,6 +2952,51 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
   C.CCMask = NewCCMask;
 }
 
+// Implement i128 comparison in vector registers.
+static void adjustICmp128(SelectionDAG &DAG, const SDLoc &DL,
+                          Comparison &C) {
+  if (C.Opcode != SystemZISD::ICMP)
+    return;
+  if (C.Op0.getValueType() != MVT::i128)
+    return;
+
+  // (In-)Equality comparisons can be implemented via VCEQGS.
+  if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
+      C.CCMask == SystemZ::CCMASK_CMP_NE) {
+    C.Opcode = SystemZISD::VICMPES;
+    C.Op0 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, C.Op0);
+    C.Op1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, C.Op1);
+    C.CCValid = SystemZ::CCMASK_VCMP;
+    if (C.CCMask == SystemZ::CCMASK_CMP_EQ)
+      C.CCMask = SystemZ::CCMASK_VCMP_ALL;
+    else
+      C.CCMask = SystemZ::CCMASK_VCMP_ALL ^ C.CCValid;
+    return;
+  }
+
+  // Normalize other comparisons to GT.
+  bool Swap = false, Invert = false;
+  switch (C.CCMask) {
+    case SystemZ::CCMASK_CMP_GT: break;
+    case SystemZ::CCMASK_CMP_LT: Swap = true; break;
+    case SystemZ::CCMASK_CMP_LE: Invert = true; break;
+    case SystemZ::CCMASK_CMP_GE: Swap = Invert = true; break;
+    default: llvm_unreachable("Invalid integer condition!");
+  }
+  if (Swap)
+    std::swap(C.Op0, C.Op1);
+
+  if (C.ICmpType == SystemZICMP::UnsignedOnly)
+    C.Opcode = SystemZISD::UCMP128HI;
+  else
+    C.Opcode = SystemZISD::SCMP128HI;
+  C.CCValid = SystemZ::CCMASK_ANY;
+  C.CCMask = SystemZ::CCMASK_1;
+
+  if (Invert)
+    C.CCMask ^= C.CCValid;
+}
+
 // See whether the comparison argument contains a redundant AND
 // and remove it if so.  This sometimes happens due to the generic
 // BRCOND expansion.
@@ -2874,7 +3005,7 @@ static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
   if (C.Op0.getOpcode() != ISD::AND)
     return;
   auto *Mask = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
-  if (!Mask)
+  if (!Mask || Mask->getValueSizeInBits(0) > 64)
     return;
   KnownBits Known = DAG.computeKnownBits(C.Op0.getOperand(0));
   if ((~Known.Zero).getZExtValue() & ~Mask->getZExtValue())
@@ -2926,16 +3057,17 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
                          bool IsSignaling = false) {
   if (CmpOp1.getOpcode() == ISD::Constant) {
     assert(!Chain);
-    uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
     unsigned Opcode, CCValid;
     if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
         CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
         isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
-      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
+      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid,
+                             cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond);
     if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
         CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
         isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
-      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
+      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid,
+                             cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond);
   }
   Comparison C(CmpOp0, CmpOp1, Chain);
   C.CCMask = CCMaskForCondCode(Cond);
@@ -2980,6 +3112,7 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
   }
 
   adjustForTestUnderMask(DAG, DL, C);
+  adjustICmp128(DAG, DL, C);
   return C;
 }
 
@@ -3007,6 +3140,11 @@ static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
     return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
                        DAG.getTargetConstant(RegisterOnly, DL, MVT::i32));
   }
+  if (C.Opcode == SystemZISD::VICMPES) {
+    SDVTList VTs = DAG.getVTList(C.Op0.getValueType(), MVT::i32);
+    SDValue Val = DAG.getNode(C.Opcode, DL, VTs, C.Op0, C.Op1);
+    return SDValue(Val.getNode(), 1);
+  }
   if (C.Chain) {
     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
     return DAG.getNode(C.Opcode, DL, VTs, C.Chain, C.Op0, C.Op1);
@@ -3352,6 +3490,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
       C.CCMask != SystemZ::CCMASK_CMP_EQ &&
       C.CCMask != SystemZ::CCMASK_CMP_NE &&
       C.Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 &&
      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
     if (isAbsolute(C.Op0, TrueOp, FalseOp))
       return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
@@ -4135,6 +4274,29 @@ SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   SDLoc DL(N);
+
+  if (N->getValueType(0) == MVT::i128) {
+    unsigned BaseOp = 0;
+    unsigned FlagOp = 0;
+    switch (Op.getOpcode()) {
+    default: llvm_unreachable("Unknown instruction!");
+    case ISD::UADDO:
+      BaseOp = ISD::ADD;
+      FlagOp = SystemZISD::VACC;
+      break;
+    case ISD::USUBO:
+      BaseOp = ISD::SUB;
+      FlagOp = SystemZISD::VSCBI;
+      break;
+    }
+    SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS);
+    SDValue Flag = DAG.getNode(FlagOp, DL, MVT::i128, LHS, RHS);
+    Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag,
+                       DAG.getValueType(MVT::i1));
+    Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1));
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
+  }
+
   unsigned BaseOp = 0;
   unsigned CCValid = 0;
   unsigned CCMask = 0;
@@ -4200,6 +4362,30 @@ SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op,
   SDValue RHS = N->getOperand(1);
   SDValue Carry = Op.getOperand(2);
   SDLoc DL(N);
+
+  if (VT == MVT::i128) {
+    unsigned BaseOp = 0;
+    unsigned FlagOp = 0;
+    switch (Op.getOpcode()) {
+    default: llvm_unreachable("Unknown instruction!");
+    case ISD::UADDO_CARRY:
+      BaseOp = SystemZISD::VAC;
+      FlagOp = SystemZISD::VACCC;
+      break;
+    case ISD::USUBO_CARRY:
+      BaseOp = SystemZISD::VSBI;
+      FlagOp = SystemZISD::VSBCBI;
+      break;
+    }
+    Carry = DAG.getZExtOrTrunc(Carry, DL, MVT::i128);
+    SDValue Result = DAG.getNode(BaseOp, DL, MVT::i128, LHS, RHS, Carry);
+    SDValue Flag = DAG.getNode(FlagOp, DL, MVT::i128, LHS, RHS, Carry);
+    Flag = DAG.getNode(ISD::AssertZext, DL, MVT::i128, Flag,
+                       DAG.getValueType(MVT::i1));
+    Flag = DAG.getZExtOrTrunc(Flag, DL, N->getValueType(1));
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Flag);
+  }
+
   unsigned BaseOp = 0;
   unsigned CCValid = 0;
   unsigned CCMask = 0;
@@ -4245,6 +4431,15 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
   SDLoc DL(Op);
   Op = Op.getOperand(0);
 
+  if (VT.getScalarSizeInBits() == 128) {
+    Op = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op);
+    Op = DAG.getNode(ISD::CTPOP, DL, MVT::v2i64, Op);
+    SDValue Tmp = DAG.getSplatBuildVector(MVT::v2i64, DL,
+                                          DAG.getConstant(0, DL, MVT::i64));
+    Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
+    return Op;
+  }
+
   // Handle vector types via VPOPCT.
   if (VT.isVector()) {
     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
@@ -4338,6 +4533,12 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
 SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
                                                 SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
+  if (Node->getMemoryVT() == MVT::i128) {
+    // Use same code to handle both legal and non-legal i128 types.
+    SmallVector<SDValue, 2> Results;
+    LowerOperationWrapper(Node, Results, DAG);
+    return DAG.getMergeValues(Results, SDLoc(Op));
+  }
   return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
                         Node->getChain(), Node->getBasePtr(),
                         Node->getMemoryVT(), Node->getMemOperand());
@@ -4347,6 +4548,12 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
 SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
   auto *Node = cast<AtomicSDNode>(Op.getNode());
+  if (Node->getMemoryVT() == MVT::i128) {
+    // Use same code to handle both legal and non-legal i128 types.
+    SmallVector<SDValue, 2> Results;
+    LowerOperationWrapper(Node, Results, DAG);
+    return DAG.getMergeValues(Results, SDLoc(Op));
+  }
   SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
                                     Node->getBasePtr(), Node->getMemoryVT(),
                                     Node->getMemOperand());
@@ -4477,6 +4684,13 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
   MachineMemOperand *MMO = Node->getMemOperand();
   SDLoc DL(Node);
 
+  if (Node->getMemoryVT() == MVT::i128) {
+    // Use same code to handle both legal and non-legal i128 types.
+    SmallVector<SDValue, 2> Results;
+    LowerOperationWrapper(Node, Results, DAG);
+    return DAG.getMergeValues(Results, DL);
+  }
+
   // We have native support for 32-bit and 64-bit compare and swap, but we
   // still need to expand extracting the "success" result from the CC.
   EVT NarrowVT = Node->getMemoryVT();
@@ -4682,6 +4896,40 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     case Intrinsic::s390_vsumqg:
       return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(),
                          Op.getOperand(1), Op.getOperand(2));
+
+    case Intrinsic::s390_vaq:
+      return DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2));
+    case Intrinsic::s390_vaccb:
+    case Intrinsic::s390_vacch:
+    case Intrinsic::s390_vaccf:
+    case Intrinsic::s390_vaccg:
+    case Intrinsic::s390_vaccq:
+      return DAG.getNode(SystemZISD::VACC, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2));
+    case Intrinsic::s390_vacq:
+      return DAG.getNode(SystemZISD::VAC, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+    case Intrinsic::s390_vacccq:
+      return DAG.getNode(SystemZISD::VACCC, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case Intrinsic::s390_vsq:
+      return DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2));
+    case Intrinsic::s390_vscbib:
+    case Intrinsic::s390_vscbih:
+    case Intrinsic::s390_vscbif:
+    case Intrinsic::s390_vscbig:
+    case Intrinsic::s390_vscbiq:
+      return DAG.getNode(SystemZISD::VSCBI, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2));
+    case Intrinsic::s390_vsbiq:
+      return DAG.getNode(SystemZISD::VSBI, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+    case Intrinsic::s390_vsbcbiq:
+      return DAG.getNode(SystemZISD::VSBCBI, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   }
 
   return SDValue();
@@ -6140,6 +6388,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(VSRA_BY_SCALAR);
     OPCODE(VROTL_BY_SCALAR);
     OPCODE(VSUM);
+    OPCODE(VACC);
+    OPCODE(VSCBI);
+    OPCODE(VAC);
+    OPCODE(VSBI);
+    OPCODE(VACCC);
+    OPCODE(VSBCBI);
     OPCODE(VICMPE);
     OPCODE(VICMPH);
     OPCODE(VICMPHL);
@@ -6164,6 +6418,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(VROUND);
     OPCODE(STRICT_VROUND);
     OPCODE(VTM);
+    OPCODE(SCMP128HI);
+    OPCODE(UCMP128HI);
     OPCODE(VFAE_CC);
     OPCODE(VFAEZ_CC);
     OPCODE(VFEE_CC);
@@ -6474,6 +6730,71 @@ SDValue SystemZTargetLowering::combineLOAD(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT LdVT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // Replace an i128 load that is used solely to move its value into GPRs
+  // by separate loads of both halves.
+  if (LdVT == MVT::i128) {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    if (!LD->isSimple() || !ISD::isNormalLoad(LD))
+      return SDValue();
+
+    // Scan through all users.
+    SmallVector<std::pair<SDNode *, int>, 2> Users;
+    int UsedElements = 0;
+    for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+         UI != UIEnd; ++UI) {
+      // Skip the uses of the chain.
+      if (UI.getUse().getResNo() != 0)
+        continue;
+
+      // Verify every user is a TRUNCATE to i64 of the low or high half ...
+      SDNode *User = *UI;
+      int Index = 1;
+      if (User->getOpcode() == ISD::SRL &&
+          User->getOperand(1).getOpcode() == ISD::Constant &&
+          cast<ConstantSDNode>(User->getOperand(1))->getZExtValue() == 64 &&
+          User->hasOneUse()) {
+        User = *User->use_begin();
+        Index = 0;
+      }
+      if (User->getOpcode() != ISD::TRUNCATE ||
+          User->getValueType(0) != MVT::i64)
+        return SDValue();
+
+      // ... and no half is extracted twice.
+ if (UsedElements & (1 << Index)) + return SDValue(); + + UsedElements |= 1 << Index; + Users.push_back(std::make_pair(User, Index)); + } + + // Rewrite each extraction as an independent load. + SmallVector<SDValue, 2> ArgChains; + for (auto UserAndIndex : Users) { + SDNode *User = UserAndIndex.first; + unsigned Offset = User->getValueType(0).getStoreSize() * UserAndIndex.second; + SDValue Ptr = + DAG.getMemBasePlusOffset(LD->getBasePtr(), TypeSize::getFixed(Offset), DL); + SDValue EltLoad = + DAG.getLoad(User->getValueType(0), DL, LD->getChain(), Ptr, + LD->getPointerInfo().getWithOffset(Offset), + LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), + LD->getAAInfo()); + + DCI.CombineTo(User, EltLoad, true); + ArgChains.push_back(EltLoad.getValue(1)); + } + + // Collect all chains via TokenFactor. + SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + ArgChains); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); + DCI.AddToWorklist(Chain.getNode()); + return SDValue(N, 0); + } + if (LdVT.isVector() || LdVT.isInteger()) return SDValue(); // Transform a scalar load that is REPLICATEd as well as having other @@ -6497,7 +6818,6 @@ SDValue SystemZTargetLowering::combineLOAD( if (!Replicate || OtherUses.empty()) return SDValue(); - SDLoc DL(N); SDValue Extract0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT, Replicate, DAG.getConstant(0, DL, MVT::i32)); // Update uses of the loaded Value while preserving old chains. @@ -6514,7 +6834,7 @@ bool SystemZTargetLowering::canLoadStoreByteSwapped(EVT VT) const { if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) return true; if (Subtarget.hasVectorEnhancements2()) - if (VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) + if (VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::i128) return true; return false; } @@ -6552,6 +6872,33 @@ static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) { return true; } +static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) { + if (Val.getOpcode() != ISD::OR || !Val.getNode()->hasOneUse()) + return false; + + SDValue Op0 = Val.getOperand(0); + SDValue Op1 = Val.getOperand(1); + + if (Op0.getOpcode() == ISD::SHL) + std::swap(Op0, Op1); + if (Op1.getOpcode() != ISD::SHL || !Op1.getNode()->hasOneUse() || + Op1.getOperand(1).getOpcode() != ISD::Constant || + cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue() != 64) + return false; + Op1 = Op1.getOperand(0); + + if (Op0.getOpcode() != ISD::ZERO_EXTEND || !Op0.getNode()->hasOneUse() || + Op0.getOperand(0).getValueType() != MVT::i64) + return false; + if (Op1.getOpcode() != ISD::ANY_EXTEND || !Op1.getNode()->hasOneUse() || + Op1.getOperand(0).getValueType() != MVT::i64) + return false; + + LoPart = Op0.getOperand(0); + HiPart = Op1.getOperand(0); + return true; +} + SDValue SystemZTargetLowering::combineSTORE( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -6610,6 +6957,27 @@ SDValue SystemZTargetLowering::combineSTORE( } } + // Transform a store of an i128 moved from GPRs into two separate stores.
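// Illustrative only: isMovedFromParts above matches the DAG equivalent of
//   %ext   = zext i64 %lo to i128
//   %hi128 = shl i128 (any_extend i64 %hi), 64
//   %val   = or i128 %hi128, %ext
//   store i128 %val, ptr %p
// so the VLVGP-plus-VST round trip can become two 8-byte stores, the high
// part at offset 0 and the low part at offset 8 (big-endian).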
+ if (MemVT == MVT::i128 && SN->isSimple() && ISD::isNormalStore(SN)) { + SDValue LoPart, HiPart; + if (isMovedFromParts(Op1, LoPart, HiPart)) { + SDLoc DL(SN); + SDValue Chain0 = + DAG.getStore(SN->getChain(), DL, HiPart, SN->getBasePtr(), + SN->getPointerInfo(), SN->getOriginalAlign(), + SN->getMemOperand()->getFlags(), SN->getAAInfo()); + SDValue Chain1 = + DAG.getStore(SN->getChain(), DL, LoPart, + DAG.getObjectPtrOffset(DL, SN->getBasePtr(), + TypeSize::getFixed(8)), + SN->getPointerInfo().getWithOffset(8), + SN->getOriginalAlign(), + SN->getMemOperand()->getFlags(), SN->getAAInfo()); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain0, Chain1); + } + } + // Replicate a reg or immediate with VREP instead of scalar multiply or // immediate load. It seems best to do this during the first DAGCombine as // it is straight-forward to handle the zero-extend node in the initial @@ -7700,6 +8068,7 @@ static bool isSelectPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case SystemZ::Select32: case SystemZ::Select64: + case SystemZ::Select128: case SystemZ::SelectF32: case SystemZ::SelectF64: case SystemZ::SelectF128: @@ -7944,6 +8313,69 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, return JoinMBB; } +// Implement EmitInstrWithCustomInserter for pseudo [SU]Cmp128Hi instruction MI. +MachineBasicBlock * +SystemZTargetLowering::emitICmp128Hi(MachineInstr &MI, + MachineBasicBlock *MBB, + bool Unsigned) const { + MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Synthetic instruction to compare 128-bit values. + // Sets CC 1 if Op0 > Op1, sets a different CC otherwise. + Register Op0 = MI.getOperand(0).getReg(); + Register Op1 = MI.getOperand(1).getReg(); + + MachineBasicBlock *StartMBB = MBB; + MachineBasicBlock *JoinMBB = SystemZ::splitBlockAfter(MI, MBB); + MachineBasicBlock *HiEqMBB = SystemZ::emitBlockAfter(StartMBB); + + // StartMBB: + // + // Use VECTOR ELEMENT COMPARE [LOGICAL] to compare the high parts. + // Swap the inputs to get: + // CC 1 if high(Op0) > high(Op1) + // CC 2 if high(Op0) < high(Op1) + // CC 0 if high(Op0) == high(Op1) + // + // If CC != 0, we're done, so jump over the next instruction. + // + // VEC[L]G Op1, Op0 + // JNE JoinMBB + // # fallthrough to HiEqMBB + MBB = StartMBB; + int HiOpcode = Unsigned ? SystemZ::VECLG : SystemZ::VECG; + BuildMI(MBB, MI.getDebugLoc(), TII->get(HiOpcode)) + .addReg(Op1).addReg(Op0); + BuildMI(MBB, MI.getDebugLoc(), TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE).addMBB(JoinMBB); + MBB->addSuccessor(JoinMBB); + MBB->addSuccessor(HiEqMBB); + + // HiEqMBB: + // + // Otherwise, use VECTOR COMPARE HIGH LOGICAL. + // Since we already know the high parts are equal, the CC + // result will only depend on the low parts: + // CC 1 if low(Op0) > low(Op1) + // CC 3 if low(Op0) <= low(Op1) + // + // VCHLGS Tmp, Op0, Op1 + // # fallthrough to JoinMBB + MBB = HiEqMBB; + Register Temp = MRI.createVirtualRegister(&SystemZ::VR128BitRegClass); + BuildMI(MBB, MI.getDebugLoc(), TII->get(SystemZ::VCHLGS), Temp) + .addReg(Op0).addReg(Op1); + MBB->addSuccessor(JoinMBB); + + // Mark CC as live-in to JoinMBB. + JoinMBB->addLiveIn(SystemZ::CC); + + MI.eraseFromParent(); + return JoinMBB; +} + // Implement EmitInstrWithCustomInserter for subword pseudo ATOMIC_LOADW_* or // ATOMIC_SWAPW instruction MI.
BinOpcode is the instruction that performs // the binary operation elided by "*", or 0 for ATOMIC_SWAPW. Invert says @@ -8908,6 +9340,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( switch (MI.getOpcode()) { case SystemZ::Select32: case SystemZ::Select64: + case SystemZ::Select128: case SystemZ::SelectF32: case SystemZ::SelectF64: case SystemZ::SelectF128: @@ -8953,6 +9386,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::CondStoreF64Inv: return emitCondStore(MI, MBB, SystemZ::STD, 0, true); + case SystemZ::SCmp128Hi: + return emitICmp128Hi(MI, MBB, false); + case SystemZ::UCmp128Hi: + return emitICmp128Hi(MI, MBB, true); + case SystemZ::PAIR128: return emitPair128(MI, MBB); case SystemZ::AEXT128: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 1e2887cff8164..3e614a1186b26 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -227,6 +227,13 @@ enum NodeType : unsigned { // rightmost sub-element of the corresponding element of operand 1. VSUM, + // Compute carry/borrow indication for add/subtract. + VACC, VSCBI, + // Add/subtract with carry/borrow. + VAC, VSBI, + // Compute carry/borrow indication for add/subtract with carry/borrow. + VACCC, VSBCBI, + // Compare integer vector operands 0 and 1 to produce the usual 0/-1 // vector result. VICMPE is for equality, VICMPH for "signed greater than" // and VICMPHL for "unsigned greater than". @@ -265,6 +272,10 @@ enum NodeType : unsigned { // AND the two vector operands together and set CC based on the result. VTM, + // i128 high integer comparisons. + SCMP128HI, + UCMP128HI, + // String operations that set CC as a side-effect. VFAE_CC, VFAEZ_CC, @@ -433,6 +444,7 @@ class SystemZTargetLowering : public TargetLowering { return TargetLowering::getNumRegisters(Context, VT); } bool isCheapToSpeculateCtlz(Type *) const override { return true; } + bool isCheapToSpeculateCttz(Type *) const override { return true; } bool preferZeroCompareBranch() const override { return true; } bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override { ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1)); @@ -742,6 +754,8 @@ class SystemZTargetLowering : public TargetLowering { MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOpcode, unsigned STOCOpcode, bool Invert) const; + MachineBasicBlock *emitICmp128Hi(MachineInstr &MI, MachineBasicBlock *BB, + bool Unsigned) const; MachineBasicBlock *emitPair128(MachineInstr &MI, MachineBasicBlock *MBB) const; MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index 37d6945dc7a05..799b27d74414d 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -266,7 +266,7 @@ let Predicates = [FeatureVectorEnhancements2] in { def VLBRH : UnaryVRX<"vlbrh", 0xE606, z_loadbswap, v128h, 16, 1>; def VLBRF : UnaryVRX<"vlbrf", 0xE606, z_loadbswap, v128f, 16, 2>; def VLBRG : UnaryVRX<"vlbrg", 0xE606, z_loadbswap, v128g, 16, 3>; - def VLBRQ : UnaryVRX<"vlbrq", 0xE606, null_frag, v128q, 16, 4>; + def VLBRQ : UnaryVRX<"vlbrq", 0xE606, z_loadbswap, v128q, 16, 4>; // Load elements reversed.
def VLER : UnaryVRXGeneric<"vler", 0xE607>; @@ -307,7 +307,7 @@ let Predicates = [FeatureVectorEnhancements2] in { def VSTBRH : StoreVRX<"vstbrh", 0xE60E, z_storebswap, v128h, 16, 1>; def VSTBRF : StoreVRX<"vstbrf", 0xE60E, z_storebswap, v128f, 16, 2>; def VSTBRG : StoreVRX<"vstbrg", 0xE60E, z_storebswap, v128g, 16, 3>; - def VSTBRQ : StoreVRX<"vstbrq", 0xE60E, null_frag, v128q, 16, 4>; + def VSTBRQ : StoreVRX<"vstbrq", 0xE60E, z_storebswap, v128q, 16, 4>; // Store elements reversed. def VSTER : StoreVRXGeneric<"vster", 0xE60F>; @@ -478,26 +478,26 @@ let Predicates = [FeatureVector] in { def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>; def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>; def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>; - def VAQ : BinaryVRRc<"vaq", 0xE7F3, int_s390_vaq, v128q, v128q, 4>; + def VAQ : BinaryVRRc<"vaq", 0xE7F3, add, v128q, v128q, 4>; } let isCommutable = 1 in { // Add compute carry. def VACC : BinaryVRRcGeneric<"vacc", 0xE7F1>; - def VACCB : BinaryVRRc<"vaccb", 0xE7F1, int_s390_vaccb, v128b, v128b, 0>; - def VACCH : BinaryVRRc<"vacch", 0xE7F1, int_s390_vacch, v128h, v128h, 1>; - def VACCF : BinaryVRRc<"vaccf", 0xE7F1, int_s390_vaccf, v128f, v128f, 2>; - def VACCG : BinaryVRRc<"vaccg", 0xE7F1, int_s390_vaccg, v128g, v128g, 3>; - def VACCQ : BinaryVRRc<"vaccq", 0xE7F1, int_s390_vaccq, v128q, v128q, 4>; + def VACCB : BinaryVRRc<"vaccb", 0xE7F1, z_vacc, v128b, v128b, 0>; + def VACCH : BinaryVRRc<"vacch", 0xE7F1, z_vacc, v128h, v128h, 1>; + def VACCF : BinaryVRRc<"vaccf", 0xE7F1, z_vacc, v128f, v128f, 2>; + def VACCG : BinaryVRRc<"vaccg", 0xE7F1, z_vacc, v128g, v128g, 3>; + def VACCQ : BinaryVRRc<"vaccq", 0xE7F1, z_vacc, v128q, v128q, 4>; // Add with carry. def VAC : TernaryVRRdGeneric<"vac", 0xE7BB>; - def VACQ : TernaryVRRd<"vacq", 0xE7BB, int_s390_vacq, v128q, v128q, 4>; + def VACQ : TernaryVRRd<"vacq", 0xE7BB, z_vac, v128q, v128q, 4>; // Add with carry compute carry. def VACCC : TernaryVRRdGeneric<"vaccc", 0xE7B9>; - def VACCCQ : TernaryVRRd<"vacccq", 0xE7B9, int_s390_vacccq, v128q, v128q, 4>; - } + def VACCCQ : TernaryVRRd<"vacccq", 0xE7B9, z_vaccc, v128q, v128q, 4>; + } // And. let isCommutable = 1 in @@ -830,24 +830,23 @@ let Predicates = [FeatureVector] in { def VSH : BinaryVRRc<"vsh", 0xE7F7, sub, v128h, v128h, 1>; def VSF : BinaryVRRc<"vsf", 0xE7F7, sub, v128f, v128f, 2>; def VSG : BinaryVRRc<"vsg", 0xE7F7, sub, v128g, v128g, 3>; - def VSQ : BinaryVRRc<"vsq", 0xE7F7, int_s390_vsq, v128q, v128q, 4>; + def VSQ : BinaryVRRc<"vsq", 0xE7F7, sub, v128q, v128q, 4>; // Subtract compute borrow indication. def VSCBI : BinaryVRRcGeneric<"vscbi", 0xE7F5>; - def VSCBIB : BinaryVRRc<"vscbib", 0xE7F5, int_s390_vscbib, v128b, v128b, 0>; - def VSCBIH : BinaryVRRc<"vscbih", 0xE7F5, int_s390_vscbih, v128h, v128h, 1>; - def VSCBIF : BinaryVRRc<"vscbif", 0xE7F5, int_s390_vscbif, v128f, v128f, 2>; - def VSCBIG : BinaryVRRc<"vscbig", 0xE7F5, int_s390_vscbig, v128g, v128g, 3>; - def VSCBIQ : BinaryVRRc<"vscbiq", 0xE7F5, int_s390_vscbiq, v128q, v128q, 4>; + def VSCBIB : BinaryVRRc<"vscbib", 0xE7F5, z_vscbi, v128b, v128b, 0>; + def VSCBIH : BinaryVRRc<"vscbih", 0xE7F5, z_vscbi, v128h, v128h, 1>; + def VSCBIF : BinaryVRRc<"vscbif", 0xE7F5, z_vscbi, v128f, v128f, 2>; + def VSCBIG : BinaryVRRc<"vscbig", 0xE7F5, z_vscbi, v128g, v128g, 3>; + def VSCBIQ : BinaryVRRc<"vscbiq", 0xE7F5, z_vscbi, v128q, v128q, 4>; // Subtract with borrow indication. 
def VSBI : TernaryVRRdGeneric<"vsbi", 0xE7BF>; - def VSBIQ : TernaryVRRd<"vsbiq", 0xE7BF, int_s390_vsbiq, v128q, v128q, 4>; + def VSBIQ : TernaryVRRd<"vsbiq", 0xE7BF, z_vsbi, v128q, v128q, 4>; // Subtract with borrow compute borrow indication. def VSBCBI : TernaryVRRdGeneric<"vsbcbi", 0xE7BD>; - def VSBCBIQ : TernaryVRRd<"vsbcbiq", 0xE7BD, int_s390_vsbcbiq, - v128q, v128q, 4>; + def VSBCBIQ : TernaryVRRd<"vsbcbiq", 0xE7BD, z_vsbcbi, v128q, v128q, 4>; // Sum across doubleword. def VSUMG : BinaryVRRcGeneric<"vsumg", 0xE765>; @@ -866,34 +865,35 @@ } // Instantiate the bitwise ops for type TYPE. -multiclass BitwiseVectorOps<ValueType type> { +multiclass BitwiseVectorOps<ValueType type, SDPatternOperator not_op> { let Predicates = [FeatureVector] in { def : Pat<(type (and VR128:$x, VR128:$y)), (VN VR128:$x, VR128:$y)>; - def : Pat<(type (and VR128:$x, (z_vnot VR128:$y))), + def : Pat<(type (and VR128:$x, (not_op VR128:$y))), (VNC VR128:$x, VR128:$y)>; def : Pat<(type (or VR128:$x, VR128:$y)), (VO VR128:$x, VR128:$y)>; def : Pat<(type (xor VR128:$x, VR128:$y)), (VX VR128:$x, VR128:$y)>; def : Pat<(type (or (and VR128:$x, VR128:$z), - (and VR128:$y, (z_vnot VR128:$z)))), + (and VR128:$y, (not_op VR128:$z)))), (VSEL VR128:$x, VR128:$y, VR128:$z)>; - def : Pat<(type (z_vnot (or VR128:$x, VR128:$y))), + def : Pat<(type (not_op (or VR128:$x, VR128:$y))), (VNO VR128:$x, VR128:$y)>; - def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; + def : Pat<(type (not_op VR128:$x)), (VNO VR128:$x, VR128:$x)>; } let Predicates = [FeatureVectorEnhancements1] in { - def : Pat<(type (z_vnot (xor VR128:$x, VR128:$y))), + def : Pat<(type (not_op (xor VR128:$x, VR128:$y))), (VNX VR128:$x, VR128:$y)>; - def : Pat<(type (z_vnot (and VR128:$x, VR128:$y))), + def : Pat<(type (not_op (and VR128:$x, VR128:$y))), (VNN VR128:$x, VR128:$y)>; - def : Pat<(type (or VR128:$x, (z_vnot VR128:$y))), + def : Pat<(type (or VR128:$x, (not_op VR128:$y))), (VOC VR128:$x, VR128:$y)>; } } -defm : BitwiseVectorOps<v16i8>; -defm : BitwiseVectorOps<v8i16>; -defm : BitwiseVectorOps<v4i32>; -defm : BitwiseVectorOps<v2i64>; +defm : BitwiseVectorOps<v16i8, z_vnot>; +defm : BitwiseVectorOps<v8i16, z_vnot>; +defm : BitwiseVectorOps<v4i32, z_vnot>; +defm : BitwiseVectorOps<v2i64, z_vnot>; +defm : BitwiseVectorOps<i128, vnot>; // Instantiate additional patterns for absolute-related expressions on // type TYPE. LC is the negate instruction for TYPE and LP is the absolute @@ -962,6 +962,26 @@ defm : IntegerMinMaxVectorOps; defm : IntegerMinMaxVectorOps; defm : IntegerMinMaxVectorOps; +// Instantiate full-vector shifts.
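// The bit variants (VSL/VSRL/VSRA) and the byte variants (VSLB/VSRLB/VSRAB)
// both take their shift amount from a byte-splatted vector; as far as the
// patterns below rely on it, the bit forms only consume the low three bits
// of each byte and the byte forms the byte count, so a single VREPIB of the
// full amount can feed both instructions of a pair.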
+multiclass FullVectorShiftOps<SDPatternOperator shift, Instruction sbit, Instruction sbyte> { + let Predicates = [FeatureVector] in { + def : Pat<(shift (i128 VR128:$x), imm32nobytes:$amt), + (sbit VR128:$x, (VREPIB (UIMM8 imm:$amt)))>; + def : Pat<(shift (i128 VR128:$x), imm32nobits:$amt), + (sbyte VR128:$x, (VREPIB (UIMM8 imm:$amt)))>; + def : Pat<(shift (i128 VR128:$x), imm32:$amt), + (sbit (sbyte VR128:$x, (VREPIB (UIMM8 imm:$amt))), + (VREPIB (UIMM8 imm:$amt)))>; + def : Pat<(shift (i128 VR128:$x), GR32:$amt), + (sbit (sbyte VR128:$x, (VREPB (VLVGP32 GR32:$amt, GR32:$amt), 15)), + (VREPB (VLVGP32 GR32:$amt, GR32:$amt), 15))>; + } +} +defm : FullVectorShiftOps<vshiftop<shl>, VSL, VSLB>; +defm : FullVectorShiftOps<vshiftop<srl>, VSRL, VSRLB>; +defm : FullVectorShiftOps<vshiftop<sra>, VSRA, VSRAB>; + //===----------------------------------------------------------------------===// // Integer comparison //===----------------------------------------------------------------------===// @@ -1516,6 +1536,158 @@ let Predicates = [FeatureVector] in { } } +//===----------------------------------------------------------------------===// +// Support for 128-bit integer values in vector registers +//===----------------------------------------------------------------------===// + +// Loads and stores. +let Predicates = [FeatureVector] in { + def : Pat<(i128 (load bdxaddr12only:$addr)), + (VL bdxaddr12only:$addr)>; + def : Pat<(store (i128 VR128:$src), bdxaddr12only:$addr), + (VST VR128:$src, bdxaddr12only:$addr)>; +} + +// Full i128 move from GPR pair. +let Predicates = [FeatureVector] in + def : Pat<(i128 (or (zext GR64:$x), (shl (anyext GR64:$y), (i32 64)))), + (VLVGP GR64:$y, GR64:$x)>; + +// Any-extensions from GPR to i128. +let Predicates = [FeatureVector] in { + def : Pat<(i128 (anyext GR32:$x)), (VLVGP32 GR32:$x, GR32:$x)>; + def : Pat<(i128 (anyext GR64:$x)), (VLVGP GR64:$x, GR64:$x)>; +} + +// Any-extending loads into i128. +let Predicates = [FeatureVector] in { + def : Pat<(i128 (extloadi8 bdxaddr12only:$addr)), + (VLREPB bdxaddr12only:$addr)>; + def : Pat<(i128 (extloadi16 bdxaddr12only:$addr)), + (VLREPH bdxaddr12only:$addr)>; + def : Pat<(i128 (extloadi32 bdxaddr12only:$addr)), + (VLREPF bdxaddr12only:$addr)>; + def : Pat<(i128 (extloadi64 bdxaddr12only:$addr)), + (VLREPG bdxaddr12only:$addr)>; +} + +// Truncations from i128 to GPR. +let Predicates = [FeatureVector] in { + def : Pat<(i32 (trunc (i128 VR128:$vec))), + (EXTRACT_SUBREG (VLGVF VR128:$vec, zero_reg, 3), subreg_l32)>; + def : Pat<(i32 (trunc (srl (i128 VR128:$vec), (i32 32)))), + (EXTRACT_SUBREG (VLGVF VR128:$vec, zero_reg, 2), subreg_l32)>; + def : Pat<(i32 (trunc (srl (i128 VR128:$vec), (i32 64)))), + (EXTRACT_SUBREG (VLGVF VR128:$vec, zero_reg, 1), subreg_l32)>; + def : Pat<(i32 (trunc (srl (i128 VR128:$vec), (i32 96)))), + (EXTRACT_SUBREG (VLGVF VR128:$vec, zero_reg, 0), subreg_l32)>; + def : Pat<(i64 (trunc (i128 VR128:$vec))), + (VLGVG VR128:$vec, zero_reg, 1)>; + def : Pat<(i64 (trunc (srl (i128 VR128:$vec), (i32 64)))), + (VLGVG VR128:$vec, zero_reg, 0)>; +} + +// Truncating stores from i128.
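// Element indices in these patterns count from the big end of the register:
// index 15 for VSTEB (respectively 7, 3, 1) names the rightmost element,
// i.e. the low-order bits of the i128, and the srl forms store the
// correspondingly higher 32- or 64-bit slices.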
+let Predicates = [FeatureVector] in { + def : Pat<(truncstorei8 (i128 VR128:$x), bdxaddr12only:$addr), + (VSTEB VR128:$x, bdxaddr12only:$addr, 15)>; + def : Pat<(truncstorei16 (i128 VR128:$x), bdxaddr12only:$addr), + (VSTEH VR128:$x, bdxaddr12only:$addr, 7)>; + def : Pat<(truncstorei32 (i128 VR128:$x), bdxaddr12only:$addr), + (VSTEF VR128:$x, bdxaddr12only:$addr, 3)>; + def : Pat<(truncstorei32 (srl (i128 VR128:$x), (i32 32)), bdxaddr12only:$addr), + (VSTEF VR128:$x, bdxaddr12only:$addr, 2)>; + def : Pat<(truncstorei32 (srl (i128 VR128:$x), (i32 64)), bdxaddr12only:$addr), + (VSTEF VR128:$x, bdxaddr12only:$addr, 1)>; + def : Pat<(truncstorei32 (srl (i128 VR128:$x), (i32 96)), bdxaddr12only:$addr), + (VSTEF VR128:$x, bdxaddr12only:$addr, 0)>; + def : Pat<(truncstorei64 (i128 VR128:$x), bdxaddr12only:$addr), + (VSTEG VR128:$x, bdxaddr12only:$addr, 1)>; + def : Pat<(truncstorei64 (srl (i128 VR128:$x), (i32 64)), bdxaddr12only:$addr), + (VSTEG VR128:$x, bdxaddr12only:$addr, 0)>; +} + +// Zero-extensions from GPR to i128. +let Predicates = [FeatureVector] in { + def : Pat<(i128 (zext8 (anyext GR32:$x))), + (VLVGB (VGBM 0), GR32:$x, zero_reg, 15)>; + def : Pat<(i128 (zext16 (anyext GR32:$x))), + (VLVGH (VGBM 0), GR32:$x, zero_reg, 7)>; + def : Pat<(i128 (zext GR32:$x)), + (VLVGF (VGBM 0), GR32:$x, zero_reg, 3)>; + def : Pat<(i128 (zext GR64:$x)), + (VLVGG (VGBM 0), GR64:$x, zero_reg, 1)>; +} + +// Zero-extending loads into i128. +let Predicates = [FeatureVector] in { + def : Pat<(i128 (zextloadi8 bdxaddr12only:$addr)), + (VLEB (VGBM 0), bdxaddr12only:$addr, 15)>; + def : Pat<(i128 (zextloadi16 bdxaddr12only:$addr)), + (VLEH (VGBM 0), bdxaddr12only:$addr, 7)>; + def : Pat<(i128 (zextloadi32 bdxaddr12only:$addr)), + (VLEF (VGBM 0), bdxaddr12only:$addr, 3)>; + def : Pat<(i128 (zextloadi64 bdxaddr12only:$addr)), + (VLEG (VGBM 0), bdxaddr12only:$addr, 1)>; +} + +// In-register i128 sign-extensions. +let Predicates = [FeatureVector] in { + def : Pat<(i128 (sext_inreg VR128:$x, i8)), + (VSRAB (VREPB VR128:$x, 15), (VREPIB 120))>; + def : Pat<(i128 (sext_inreg VR128:$x, i16)), + (VSRAB (VREPH VR128:$x, 7), (VREPIB 112))>; + def : Pat<(i128 (sext_inreg VR128:$x, i32)), + (VSRAB (VREPF VR128:$x, 3), (VREPIB 96))>; + def : Pat<(i128 (sext_inreg VR128:$x, i64)), + (VSRAB (VREPG VR128:$x, 1), (VREPIB 64))>; +} + +// Sign-extensions from GPR to i128. +let Predicates = [FeatureVector] in { + def : Pat<(i128 (sext_inreg (anyext GR32:$x), i8)), + (VLVGP (SRAG (LGBR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GR32:$x, subreg_l32)), zero_reg, 63), + (LGBR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GR32:$x, subreg_l32)))>; + def : Pat<(i128 (sext_inreg (anyext GR32:$x), i16)), + (VLVGP (SRAG (LGHR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GR32:$x, subreg_l32)), zero_reg, 63), + (LGHR (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GR32:$x, subreg_l32)))>; + def : Pat<(i128 (sext GR32:$x)), + (VLVGP (SRAG (LGFR GR32:$x), zero_reg, 63), (LGFR GR32:$x))>; + def : Pat<(i128 (sext GR64:$x)), + (VLVGP (SRAG GR64:$x, zero_reg, 63), GR64:$x)>; +} + +// Sign-extending loads into i128. 
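// VLREP replicates the loaded scalar into every element, so one copy also
// lands in the leftmost position; the arithmetic full-vector shift right by
// 128 minus the source width (VREPIB 120/112/96/64) then smears its sign
// bit across the entire register.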
+let Predicates = [FeatureVector] in { + def : Pat<(i128 (sextloadi8 bdxaddr12only:$addr)), + (VSRAB (VLREPB bdxaddr12only:$addr), (VREPIB 120))>; + def : Pat<(i128 (sextloadi16 bdxaddr12only:$addr)), + (VSRAB (VLREPH bdxaddr12only:$addr), (VREPIB 112))>; + def : Pat<(i128 (sextloadi32 bdxaddr12only:$addr)), + (VSRAB (VLREPF bdxaddr12only:$addr), (VREPIB 96))>; + def : Pat<(i128 (sextloadi64 bdxaddr12only:$addr)), + (VSRAB (VLREPG bdxaddr12only:$addr), (VREPIB 64))>; +} + +// i128 comparison pseudo-instructions. +let Predicates = [FeatureVector], Defs = [CC], + usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def SCmp128Hi : Pseudo<(outs), (ins VR128:$src1, VR128:$src2), + [(set CC, (z_scmp128hi (i128 VR128:$src1), + (i128 VR128:$src2)))]>; + def UCmp128Hi : Pseudo<(outs), (ins VR128:$src1, VR128:$src2), + [(set CC, (z_ucmp128hi (i128 VR128:$src1), + (i128 VR128:$src2)))]>; +} + +// i128 select pseudo-instructions. +let Predicates = [FeatureVector] in + def Select128 : SelectWrapper<i128, VR128>; + //===----------------------------------------------------------------------===// // Conversions //===----------------------------------------------------------------------===// @@ -1523,6 +1695,7 @@ let Predicates = [FeatureVector] in { def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (i128 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; @@ -1530,6 +1703,7 @@ def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (i128 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; @@ -1537,6 +1711,7 @@ def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (i128 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; @@ -1544,6 +1719,7 @@ def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (i128 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; @@ -1551,6 +1727,7
@@ def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (i128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; @@ -1558,6 +1735,7 @@ def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (i128 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; @@ -1566,9 +1744,18 @@ def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (i128 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v16i8 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v8i16 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v4i32 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v2i64 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v4f32 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v2f64 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (f128 VR128:$src))), (i128 VR128:$src)>; + //===----------------------------------------------------------------------===// // Replicating scalars //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZOperands.td b/llvm/lib/Target/SystemZ/SystemZOperands.td index c92e0abe38ac9..0221e2c53f2f4 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperands.td +++ b/llvm/lib/Target/SystemZ/SystemZOperands.td @@ -314,70 +314,72 @@ def U48Imm : ImmediateAsmOperand<"U48Imm">; // Immediates for the lower and upper 16 bits of an i32, with the other // bits of the i32 being zero. defm imm32ll16 : Immediate<i32, [{ - return SystemZ::isImmLL(N->getZExtValue()); + return N->getAPIntValue().isIntN(32) && SystemZ::isImmLL(N->getZExtValue()); }], LL16, "U16Imm">; defm imm32lh16 : Immediate<i32, [{ - return SystemZ::isImmLH(N->getZExtValue()); + return N->getAPIntValue().isIntN(32) && SystemZ::isImmLH(N->getZExtValue()); }], LH16, "U16Imm">; // Immediates for the lower and upper 16 bits of an i32, with the other // bits of the i32 being one.
defm imm32ll16c : Immediate<i32, [{ - return SystemZ::isImmLL(uint32_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(32) && + SystemZ::isImmLL(uint32_t(~N->getZExtValue())); }], LL16, "U16Imm">; defm imm32lh16c : Immediate<i32, [{ - return SystemZ::isImmLH(uint32_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(32) && + SystemZ::isImmLH(uint32_t(~N->getZExtValue())); }], LH16, "U16Imm">; // Short immediates defm imm32zx1 : Immediate<i32, [{ - return isUInt<1>(N->getZExtValue()); + return N->getAPIntValue().isIntN(1); }], NOOP_SDNodeXForm, "U1Imm">; defm imm32zx2 : Immediate<i32, [{ - return isUInt<2>(N->getZExtValue()); + return N->getAPIntValue().isIntN(2); }], NOOP_SDNodeXForm, "U2Imm">; defm imm32zx3 : Immediate<i32, [{ - return isUInt<3>(N->getZExtValue()); + return N->getAPIntValue().isIntN(3); }], NOOP_SDNodeXForm, "U3Imm">; defm imm32zx4 : Immediate<i32, [{ - return isUInt<4>(N->getZExtValue()); + return N->getAPIntValue().isIntN(4); }], NOOP_SDNodeXForm, "U4Imm">; // Note: this enforces an even value during code generation only. // When used from the assembler, any 4-bit value is allowed. defm imm32zx4even : Immediate<i32, [{ - return isUInt<4>(N->getZExtValue()); + return N->getAPIntValue().isIntN(4); }], UIMM8EVEN, "U4Imm">; defm imm32sx8 : Immediate<i32, [{ - return isInt<8>(N->getSExtValue()); + return N->getAPIntValue().isSignedIntN(8); }], SIMM8, "S8Imm">; defm imm32zx8 : Immediate<i32, [{ - return isUInt<8>(N->getZExtValue()); + return N->getAPIntValue().isIntN(8); }], UIMM8, "U8Imm">; defm imm32zx8trunc : Immediate<i32, [{}], UIMM8, "U8Imm">; defm imm32zx12 : Immediate<i32, [{ - return isUInt<12>(N->getZExtValue()); + return N->getAPIntValue().isIntN(12); }], UIMM12, "U12Imm">; defm imm32sx16 : Immediate<i32, [{ - return isInt<16>(N->getSExtValue()); + return N->getAPIntValue().isSignedIntN(16); }], SIMM16, "S16Imm">; defm imm32sx16n : Immediate<i32, [{ - return isInt<16>(-N->getSExtValue()); + return (-N->getAPIntValue()).isSignedIntN(16); }], NEGSIMM16, "S16Imm">; defm imm32zx16 : Immediate<i32, [{ - return isUInt<16>(N->getZExtValue()); + return N->getAPIntValue().isIntN(16); }], UIMM16, "U16Imm">; defm imm32sx16trunc : Immediate<i32, [{}], SIMM16, "S16Imm">; @@ -390,7 +392,8 @@ defm simm32 : Immediate<i32, [{}], SIMM32, "S32Imm">; defm uimm32 : Immediate<i32, [{}], UIMM32, "U32Imm">; defm simm32n : Immediate<i32, [{ - return isInt<32>(-N->getSExtValue()); + auto SImm = N->getAPIntValue().trySExtValue(); + return SImm.has_value() && isInt<32>(-*SImm); }], NEGSIMM32, "S32Imm">; def imm32 : ImmLeaf<i32, [{}]>; @@ -402,107 +405,115 @@ def imm32 : ImmLeaf<i32, [{}]>; // Immediates for 16-bit chunks of an i64, with the other bits of the // i32 being zero. defm imm64ll16 : Immediate<i64, [{ - return SystemZ::isImmLL(N->getZExtValue()); + return N->getAPIntValue().isIntN(64) && SystemZ::isImmLL(N->getZExtValue()); }], LL16, "U16Imm">; defm imm64lh16 : Immediate<i64, [{ - return SystemZ::isImmLH(N->getZExtValue()); + return N->getAPIntValue().isIntN(64) && SystemZ::isImmLH(N->getZExtValue()); }], LH16, "U16Imm">; defm imm64hl16 : Immediate<i64, [{ - return SystemZ::isImmHL(N->getZExtValue()); + return N->getAPIntValue().isIntN(64) && SystemZ::isImmHL(N->getZExtValue()); }], HL16, "U16Imm">; defm imm64hh16 : Immediate<i64, [{ - return SystemZ::isImmHH(N->getZExtValue()); + return N->getAPIntValue().isIntN(64) && SystemZ::isImmHH(N->getZExtValue()); }], HH16, "U16Imm">; // Immediates for 16-bit chunks of an i64, with the other bits of the // i32 being one.
defm imm64ll16c : Immediate<i64, [{ - return SystemZ::isImmLL(uint64_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmLL(uint64_t(~N->getZExtValue())); }], LL16, "U16Imm">; defm imm64lh16c : Immediate<i64, [{ - return SystemZ::isImmLH(uint64_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmLH(uint64_t(~N->getZExtValue())); }], LH16, "U16Imm">; defm imm64hl16c : Immediate<i64, [{ - return SystemZ::isImmHL(uint64_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmHL(uint64_t(~N->getZExtValue())); }], HL16, "U16Imm">; defm imm64hh16c : Immediate<i64, [{ - return SystemZ::isImmHH(uint64_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmHH(uint64_t(~N->getZExtValue())); }], HH16, "U16Imm">; // Immediates for the lower and upper 32 bits of an i64, with the other // bits of the i32 being zero. defm imm64lf32 : Immediate<i64, [{ - return SystemZ::isImmLF(N->getZExtValue()); + return N->getAPIntValue().isIntN(64) && SystemZ::isImmLF(N->getZExtValue()); }], LF32, "U32Imm">; defm imm64hf32 : Immediate<i64, [{ - return SystemZ::isImmHF(N->getZExtValue()); + return N->getAPIntValue().isIntN(64) && SystemZ::isImmHF(N->getZExtValue()); }], HF32, "U32Imm">; // Immediates for the lower and upper 32 bits of an i64, with the other // bits of the i32 being one. defm imm64lf32c : Immediate<i64, [{ - return SystemZ::isImmLF(uint64_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmLF(uint64_t(~N->getZExtValue())); }], LF32, "U32Imm">; defm imm64hf32c : Immediate<i64, [{ - return SystemZ::isImmHF(uint64_t(~N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmHF(uint64_t(~N->getZExtValue())); }], HF32, "U32Imm">; // Negated immediates that fit LF32 or LH16. defm imm64lh16n : Immediate<i64, [{ - return SystemZ::isImmLH(uint64_t(-N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmLH(uint64_t(-N->getZExtValue())); }], NEGLH16, "U16Imm">; defm imm64lf32n : Immediate<i64, [{ - return SystemZ::isImmLF(uint64_t(-N->getZExtValue())); + return N->getAPIntValue().isIntN(64) && + SystemZ::isImmLF(uint64_t(-N->getZExtValue())); }], NEGLF32, "U32Imm">; // Short immediates.
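// The predicates below go through getAPIntValue() deliberately: with i128 a
// legal type, a ConstantSDNode may now carry a 128-bit value, and calling
// getZExtValue()/getSExtValue() on such a node would assert.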
defm imm64sx8 : Immediate<i64, [{ - return isInt<8>(N->getSExtValue()); + return N->getAPIntValue().isSignedIntN(8); }], SIMM8, "S8Imm">; defm imm64zx8 : Immediate<i64, [{ - return isUInt<8>(N->getSExtValue()); + return N->getAPIntValue().isIntN(8); }], UIMM8, "U8Imm">; defm imm64sx16 : Immediate<i64, [{ - return isInt<16>(N->getSExtValue()); + return N->getAPIntValue().isSignedIntN(16); }], SIMM16, "S16Imm">; defm imm64sx16n : Immediate<i64, [{ - return isInt<16>(-N->getSExtValue()); + return (-N->getAPIntValue()).isSignedIntN(16); }], NEGSIMM16, "S16Imm">; defm imm64zx16 : Immediate<i64, [{ - return isUInt<16>(N->getZExtValue()); + return N->getAPIntValue().isIntN(16); }], UIMM16, "U16Imm">; defm imm64sx32 : Immediate<i64, [{ - return isInt<32>(N->getSExtValue()); + return N->getAPIntValue().isSignedIntN(32); }], SIMM32, "S32Imm">; defm imm64sx32n : Immediate<i64, [{ - return isInt<32>(-N->getSExtValue()); + return (-N->getAPIntValue()).isSignedIntN(32); }], NEGSIMM32, "S32Imm">; defm imm64zx32 : Immediate<i64, [{ - return isUInt<32>(N->getZExtValue()); + return N->getAPIntValue().isIntN(32); }], UIMM32, "U32Imm">; defm imm64zx32n : Immediate<i64, [{ - return isUInt<32>(-N->getSExtValue()); + return (-N->getAPIntValue()).isIntN(32); }], NEGUIMM32, "U32Imm">; defm imm64zx48 : Immediate<i64, [{ - return isUInt<48>(N->getZExtValue()); + return N->getAPIntValue().isIntN(64); }], UIMM48, "U48Imm">; class Imm64 : ImmLeaf, Operand { diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 4f0f23fe3ef8e..af6cf340f8a32 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -59,6 +59,15 @@ def SDT_ZBinaryWithCarry : SDTypeProfile<2, 3, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<1, i32>]>; +def SDT_ZBinaryConv : SDTypeProfile<1, 2, + [SDTCisInt<0>, + SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def SDT_ZTernary : SDTypeProfile<1, 3, + [SDTCisInt<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>]>; def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, @@ -283,6 +292,12 @@ def z_uaddo : SDNode<"SystemZISD::UADDO", SDT_ZBinaryWithFlags>; def z_usubo : SDNode<"SystemZISD::USUBO", SDT_ZBinaryWithFlags>; def z_addcarry_1 : SDNode<"SystemZISD::ADDCARRY", SDT_ZBinaryWithCarry>; def z_subcarry_1 : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>; +def z_vacc : SDNode<"SystemZISD::VACC", SDTIntBinOp>; +def z_vac : SDNode<"SystemZISD::VAC", SDT_ZTernary>; +def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>; +def z_vscbi : SDNode<"SystemZISD::VSCBI", SDTIntBinOp>; +def z_vsbi : SDNode<"SystemZISD::VSBI", SDT_ZTernary>; +def z_vsbcbi : SDNode<"SystemZISD::VSBCBI", SDT_ZTernary>; def z_loadbswap : SDNode<"SystemZISD::LRV", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -326,7 +341,7 @@ def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vrotl_by_scalar : SDNode<"SystemZISD::VROTL_BY_SCALAR", SDT_ZVecBinaryInt>; -def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>; +def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZBinaryConv>; def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>; def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>; def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecBinary>; @@ -358,6 +373,8 @@ def z_vround : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>; def z_strict_vround : SDNode<"SystemZISD::STRICT_VROUND", SDT_ZVecUnaryConv, [SDNPHasChain]>; def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp>; +def z_scmp128hi : SDNode<"SystemZISD::SCMP128HI", SDT_ZCmp>; +def z_ucmp128hi : SDNode<"SystemZISD::UCMP128HI", SDT_ZCmp>; def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryIntCC>; def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC",
SDT_ZVecTernaryIntCC>; def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinaryCC>; @@ -757,10 +774,27 @@ class shiftop<SDPatternOperator operator> [(operator node:$val, node:$count), (operator node:$val, (and node:$count, imm32bottom6set))]>; +// Create a shift operator that optionally ignores an AND of the +// shift count with an immediate if the bottom 7 bits are all set. +def imm32bottom7set : PatLeaf<(i32 imm), [{ + return (N->getZExtValue() & 0x7f) == 0x7f; +}]>; +class vshiftop<SDPatternOperator operator> + : PatFrags<(ops node:$val, node:$count), + [(operator node:$val, node:$count), + (operator node:$val, (and node:$count, imm32bottom7set))]>; + def imm32mod64 : PatLeaf<(i32 imm), [{ return (N->getZExtValue() % 64 == 0); }]>; +def imm32nobits : PatLeaf<(i32 imm), [{ + return (N->getZExtValue() & 0x07) == 0; +}]>; +def imm32nobytes : PatLeaf<(i32 imm), [{ + return (N->getZExtValue() & 0x78) == 0; +}]>; + // Load a scalar and replicate it in all elements of a vector. class z_replicate_load<ValueType scalartype, SDPatternOperator load> : PatFrag<(ops node:$addr), diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td index 5d66501172b28..8f9bb56f2eb3b 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -124,7 +124,7 @@ defm GRX32 : SystemZRegClass<"GRX32", [i32], 32, R12L,R12H,R13L,R13H,R14L,R14H,R15L,R15H) ]>; -// The architecture doesn't really have any i128 support, so model the +// On machines without SIMD support, i128 is not a legal type, so model the // register pairs as untyped instead. // XPLINK64: Allocate all registers in natural order defm GR128 : SystemZRegClass<"GR128", [untyped], 128, @@ -285,7 +285,8 @@ defm VF128 : SystemZRegClass<"VF128", // All vector registers. defm VR128 : SystemZRegClass<"VR128", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], + [v16i8, v8i16, v4i32, v2i64, i128, + v4f32, v2f64, f128], 128, (add (sequence "V%u", 0, 7), (sequence "V%u", 16, 31), (sequence "V%u", 8, 15))>; @@ -305,7 +306,7 @@ def v128b : TypedReg<v16i8, VR128>; def v128h : TypedReg<v8i16, VR128>; def v128f : TypedReg<v4i32, VR128>; def v128g : TypedReg<v2i64, VR128>; -def v128q : TypedReg<v16i8, VR128>; +def v128q : TypedReg<i128, VR128>; def v128sb : TypedReg<v4f32, VR128>; def v128db : TypedReg<v2f64, VR128>; def v128xb : TypedReg<f128, VR128>; diff --git a/llvm/test/CodeGen/SystemZ/and-10.ll b/llvm/test/CodeGen/SystemZ/and-10.ll new file mode 100644 index 0000000000000..912d823feb6eb --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/and-10.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit AND / AND-NOT in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; And. +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vn %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = and i128 %a, %b + ret i128 %res +} + +; And with complement.
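; (The complement has no direct IR form; the xor with -1 below is its
; canonical equivalent, which the new i128 bitwise patterns are expected
; to fold into the and-with-complement instruction, VNC.)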
+define i128 @f2(i128 %a, i128 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vnc %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %notb = xor i128 %b, -1 + %res = and i128 %a, %notb + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/and-11.ll b/llvm/test/CodeGen/SystemZ/and-11.ll new file mode 100644 index 0000000000000..12b429b92d28c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/and-11.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit NAND in vector registers on z14 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vnn %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %op = and i128 %a, %b + %res = xor i128 %op, -1 + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/args-12.ll b/llvm/test/CodeGen/SystemZ/args-12.ll new file mode 100644 index 0000000000000..d6d533f22d3a3 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/args-12.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test the handling of i128 argument values +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare void @bar(i64, i64, i64, i64, i128, + i64, i64, i64, i64, i128) + +; There are two indirect i128 slots, one at offset 200 (the first available +; byte after the outgoing arguments) and one immediately after it at 216. + +define void @foo() { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r6, %r15, 48(%r15) +; CHECK-NEXT: .cfi_offset %r6, -112 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -232 +; CHECK-NEXT: .cfi_def_cfa_offset 392 +; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: la %r0, 200(%r15) +; CHECK-NEXT: larl %r1, .LCPI0_1 +; CHECK-NEXT: stg %r0, 192(%r15) +; CHECK-NEXT: vst %v0, 176(%r15), 3 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: la %r6, 216(%r15) +; CHECK-NEXT: lghi %r2, 1 +; CHECK-NEXT: lghi %r3, 2 +; CHECK-NEXT: lghi %r4, 3 +; CHECK-NEXT: lghi %r5, 4 +; CHECK-NEXT: vst %v0, 200(%r15), 3 +; CHECK-NEXT: vst %v0, 216(%r15), 3 +; CHECK-NEXT: brasl %r14, bar@PLT +; CHECK-NEXT: lmg %r6, %r15, 280(%r15) +; CHECK-NEXT: br %r14 + call void @bar (i64 1, i64 2, i64 3, i64 4, i128 0, + i64 5, i64 6, i64 7, i64 8, i128 0) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/args-13.ll b/llvm/test/CodeGen/SystemZ/args-13.ll new file mode 100644 index 0000000000000..50636f23e859d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/args-13.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test incoming i128 arguments. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Do some arithmetic so that we can see the register being used. 
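; Reader's note: i128 is passed and returned indirectly on SystemZ, so in
; f1 below %r6 carries the address of the argument and the result is
; stored through the explicit ptr in %r2.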
+define void @f1(ptr %r2, i16 %r3, i32 %r4, i64 %r5, i128 %r6) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r6), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %y = add i128 %r6, %r6 + store i128 %y, ptr %r2 + ret void +} + +; Test a case where the i128 address is passed on the stack. +define void @f2(ptr %r2, i16 %r3, i32 %r4, i64 %r5, i128 %r6, i64 %s1, i64 %s2, i128 %s4) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r1, 176(%r15) +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %y = add i128 %s4, %s4 + store i128 %y, ptr %r2 + ret void +} + +; Explicit i128 return values are likewise passed indirectly. +define i128 @f14(i128 %r3) { +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %y = add i128 %r3, %r3 + ret i128 %y +} + diff --git a/llvm/test/CodeGen/SystemZ/asm-21.ll b/llvm/test/CodeGen/SystemZ/asm-21.ll new file mode 100644 index 0000000000000..9ef76b3b691d9 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/asm-21.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test i128 in GPRs versus VRs. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -no-integrated-as | FileCheck %s + +define i128 @f1() { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: blah %v1 %v0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vst %v1, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %val = call i128 asm "blah $0 $1", "=&v,v" (i128 0) + ret i128 %val +} + +define i128 @f2() { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v4, 0 +; CHECK-NEXT: #APP +; CHECK-NEXT: blah %v4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: vst %v4, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %val = call i128 asm "blah $0", "={v4},0" (i128 0) + ret i128 %val +} + +define i128 @f3() { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: lghi %r0, 0 +; CHECK-NEXT: lgr %r1, %r0 +; CHECK-NEXT: #APP +; CHECK-NEXT: blah %r4 %r0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: stg %r5, 8(%r2) +; CHECK-NEXT: stg %r4, 0(%r2) +; CHECK-NEXT: br %r14 + %val = call i128 asm "blah $0 $1", "=&r,r" (i128 0) + ret i128 %val +} + +define i128 @f4() { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r8, %r15, 64(%r15) +; CHECK-NEXT: .cfi_offset %r8, -96 +; CHECK-NEXT: .cfi_offset %r9, -88 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: lghi %r8, 0 +; CHECK-NEXT: lgr %r9, %r8 +; CHECK-NEXT: #APP +; CHECK-NEXT: blah %r8 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: stg %r9, 8(%r2) +; CHECK-NEXT: stg %r8, 0(%r2) +; CHECK-NEXT: lmg %r8, %r15, 64(%r15) +; CHECK-NEXT: br %r14 + %val = call i128 asm "blah $0", "={r8},0" (i128 0) + ret i128 %val +} diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-05.ll b/llvm/test/CodeGen/SystemZ/atomic-load-05.ll index ff9f8867a77c2..979f1e684e89a 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-load-05.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-load-05.ll @@ -1,6 +1,7 @@ ; Test 128-bit atomic loads. 
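; The added z13 RUN line checks that the configuration where i128 is legal
; in vector registers still satisfies the existing CHECK lines (assuming
; those checks are CPU-independent).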
; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s define i128 @f1(ptr %src) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-05.ll b/llvm/test/CodeGen/SystemZ/atomic-store-05.ll index be1fd8d5dac0d..dad7d9527b848 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-store-05.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-store-05.ll @@ -1,6 +1,7 @@ ; Test 128-bit atomic stores. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s define void @f1(i128 %val, ptr %src) { ; CHECK-LABEL: f1: diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll index 41b0964be05e5..604fd50baadeb 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-ops-i128.ll @@ -11,26 +11,19 @@ define i128 @atomicrmw_xchg(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_xchg: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r12, %r15, 96(%r15) -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r1, 8(%r4) -; CHECK-NEXT: lg %r0, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 4 +; CHECK-NEXT: vlgvg %r1, %v1, 1 +; CHECK-NEXT: vlgvg %r0, %v1, 0 ; CHECK-NEXT: .LBB0_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lgr %r12, %r5 -; CHECK-NEXT: lgr %r13, %r4 -; CHECK-NEXT: cdsg %r12, %r0, 0(%r3) -; CHECK-NEXT: lgr %r4, %r13 -; CHECK-NEXT: lgr %r5, %r12 +; CHECK-NEXT: vlgvg %r5, %v0, 1 +; CHECK-NEXT: vlgvg %r4, %v0, 0 +; CHECK-NEXT: cdsg %r4, %r0, 0(%r3) +; CHECK-NEXT: vlvgp %v0, %r4, %r5 ; CHECK-NEXT: jl .LBB0_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r12, %r15, 96(%r15) +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw xchg ptr %src, i128 %b seq_cst ret i128 %res @@ -40,31 +33,20 @@ define i128 @atomicrmw_xchg(ptr %src, i128 %b) { define i128 @atomicrmw_add(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_add: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 ; CHECK-NEXT: .LBB1_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: algrk %r13, %r4, %r0 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: alcgr %r5, %r1 -; CHECK-NEXT: lgr %r12, %r5 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: vaq %v2, %v1, %v0 +; CHECK-NEXT: vlgvg %r1, %v2, 1 +; CHECK-NEXT: vlgvg %r0, %v2, 0 +; CHECK-NEXT: vlgvg %r5, %v1, 1 +; CHECK-NEXT: vlgvg %r4, %v1, 0 +; CHECK-NEXT: cdsg %r4, %r0, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r4, %r5 ; CHECK-NEXT: jl .LBB1_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw add ptr %src, i128 %b seq_cst ret i128 %res @@ -74,31 
+56,20 @@ define i128 @atomicrmw_add(ptr %src, i128 %b) { define i128 @atomicrmw_sub(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_sub: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 ; CHECK-NEXT: .LBB2_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: slgrk %r13, %r4, %r0 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: slbgr %r5, %r1 -; CHECK-NEXT: lgr %r12, %r5 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: vsq %v2, %v1, %v0 +; CHECK-NEXT: vlgvg %r1, %v2, 1 +; CHECK-NEXT: vlgvg %r0, %v2, 0 +; CHECK-NEXT: vlgvg %r5, %v1, 1 +; CHECK-NEXT: vlgvg %r4, %v1, 0 +; CHECK-NEXT: cdsg %r4, %r0, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r4, %r5 ; CHECK-NEXT: jl .LBB2_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw sub ptr %src, i128 %b seq_cst ret i128 %res @@ -108,30 +79,20 @@ define i128 @atomicrmw_sub(ptr %src, i128 %b) { define i128 @atomicrmw_and(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_and: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 ; CHECK-NEXT: .LBB3_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ngrk %r12, %r5, %r1 -; CHECK-NEXT: ngrk %r13, %r4, %r0 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: vn %v2, %v1, %v0 +; CHECK-NEXT: vlgvg %r1, %v2, 1 +; CHECK-NEXT: vlgvg %r0, %v2, 0 +; CHECK-NEXT: vlgvg %r5, %v1, 1 +; CHECK-NEXT: vlgvg %r4, %v1, 0 +; CHECK-NEXT: cdsg %r4, %r0, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r4, %r5 ; CHECK-NEXT: jl .LBB3_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw and ptr %src, i128 %b seq_cst ret i128 %res @@ -141,30 +102,20 @@ define i128 @atomicrmw_and(ptr %src, i128 %b) { define i128 @atomicrmw_nand(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_nand: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 ; CHECK-NEXT: .LBB4_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: nngrk %r12, %r5, %r1 -; 
CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: nngrk %r13, %r4, %r0 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: vlgvg %r1, %v1, 1 +; CHECK-NEXT: vlgvg %r0, %v1, 0 +; CHECK-NEXT: vnn %v1, %v1, %v0 +; CHECK-NEXT: vlgvg %r5, %v1, 1 +; CHECK-NEXT: vlgvg %r4, %v1, 0 +; CHECK-NEXT: cdsg %r0, %r4, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r0, %r1 ; CHECK-NEXT: jl .LBB4_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw nand ptr %src, i128 %b seq_cst ret i128 %res @@ -174,30 +125,20 @@ define i128 @atomicrmw_nand(ptr %src, i128 %b) { define i128 @atomicrmw_or(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_or: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 ; CHECK-NEXT: .LBB5_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ogrk %r12, %r5, %r1 -; CHECK-NEXT: ogrk %r13, %r4, %r0 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: vo %v2, %v1, %v0 +; CHECK-NEXT: vlgvg %r1, %v2, 1 +; CHECK-NEXT: vlgvg %r0, %v2, 0 +; CHECK-NEXT: vlgvg %r5, %v1, 1 +; CHECK-NEXT: vlgvg %r4, %v1, 0 +; CHECK-NEXT: cdsg %r4, %r0, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r4, %r5 ; CHECK-NEXT: jl .LBB5_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw or ptr %src, i128 %b seq_cst ret i128 %res @@ -207,30 +148,20 @@ define i128 @atomicrmw_or(ptr %src, i128 %b) { define i128 @atomicrmw_xor(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_xor: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 ; CHECK-NEXT: .LBB6_1: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: xgrk %r12, %r5, %r1 -; CHECK-NEXT: xgrk %r13, %r4, %r0 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: vx %v2, %v1, %v0 +; CHECK-NEXT: vlgvg %r1, %v2, 1 +; CHECK-NEXT: vlgvg %r0, %v2, 0 +; CHECK-NEXT: vlgvg %r5, %v1, 1 +; CHECK-NEXT: vlgvg %r4, %v1, 0 +; CHECK-NEXT: cdsg %r4, %r0, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r4, %r5 ; CHECK-NEXT: jl .LBB6_1 ; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw xor ptr %src, i128 %b seq_cst ret i128 %res @@ -240,39 
+171,35 @@ define i128 @atomicrmw_xor(ptr %src, i128 %b) { define i128 @atomicrmw_min(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_min: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 +; CHECK-NEXT: j .LBB7_2 ; CHECK-NEXT: .LBB7_1: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: vlgvg %r1, %v1, 1 +; CHECK-NEXT: vlgvg %r0, %v1, 0 +; CHECK-NEXT: vlgvg %r5, %v2, 1 +; CHECK-NEXT: vlgvg %r4, %v2, 0 +; CHECK-NEXT: cdsg %r0, %r4, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r0, %r1 +; CHECK-NEXT: je .LBB7_6 +; CHECK-NEXT: .LBB7_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: clgr %r4, %r0 -; CHECK-NEXT: lhi %r14, 0 -; CHECK-NEXT: lochile %r14, 1 -; CHECK-NEXT: cgr %r5, %r1 -; CHECK-NEXT: lhi %r13, 0 -; CHECK-NEXT: lochile %r13, 1 -; CHECK-NEXT: locrlh %r14, %r13 -; CHECK-NEXT: chi %r14, 0 -; CHECK-NEXT: selgrlh %r13, %r4, %r0 -; CHECK-NEXT: selgrlh %r12, %r5, %r1 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 -; CHECK-NEXT: jl .LBB7_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: jlh .LBB7_4 +; CHECK-NEXT: # %bb.3: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: .LBB7_4: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v1 +; CHECK-NEXT: jnl .LBB7_1 +; CHECK-NEXT: # %bb.5: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v0 +; CHECK-NEXT: j .LBB7_1 +; CHECK-NEXT: .LBB7_6: # %atomicrmw.end +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw min ptr %src, i128 %b seq_cst ret i128 %res @@ -282,39 +209,35 @@ define i128 @atomicrmw_min(ptr %src, i128 %b) { define i128 @atomicrmw_max(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_max: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 +; CHECK-NEXT: j .LBB8_2 ; CHECK-NEXT: .LBB8_1: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1 +; CHECK-NEXT: vlgvg %r1, %v1, 1 +; CHECK-NEXT: vlgvg %r0, %v1, 0 +; CHECK-NEXT: vlgvg %r5, %v2, 1 +; CHECK-NEXT: vlgvg %r4, %v2, 0 +; CHECK-NEXT: cdsg %r0, %r4, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r0, %r1 +; CHECK-NEXT: je .LBB8_6 +; CHECK-NEXT: .LBB8_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: clgr %r4, %r0 -; CHECK-NEXT: lhi %r14, 0 -; CHECK-NEXT: lochih %r14, 1 -; CHECK-NEXT: cgr %r5, %r1 -; CHECK-NEXT: lhi %r13, 0 -; CHECK-NEXT: lochih %r13, 1 -; CHECK-NEXT: locrlh %r14, 
%r13 -; CHECK-NEXT: chi %r14, 0 -; CHECK-NEXT: selgrlh %r13, %r4, %r0 -; CHECK-NEXT: selgrlh %r12, %r5, %r1 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: jlh .LBB8_4 +; CHECK-NEXT: # %bb.3: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1 +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: .LBB8_4: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v1 ; CHECK-NEXT: jl .LBB8_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: # %bb.5: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v0 +; CHECK-NEXT: j .LBB8_1 +; CHECK-NEXT: .LBB8_6: # %atomicrmw.end +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw max ptr %src, i128 %b seq_cst ret i128 %res @@ -324,40 +247,35 @@ define i128 @atomicrmw_max(ptr %src, i128 %b) { define i128 @atomicrmw_umin(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_umin: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 +; CHECK-NEXT: j .LBB9_2 ; CHECK-NEXT: .LBB9_1: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: vlgvg %r1, %v1, 1 +; CHECK-NEXT: vlgvg %r0, %v1, 0 +; CHECK-NEXT: vlgvg %r5, %v2, 1 +; CHECK-NEXT: vlgvg %r4, %v2, 0 +; CHECK-NEXT: cdsg %r0, %r4, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r0, %r1 +; CHECK-NEXT: je .LBB9_6 +; CHECK-NEXT: .LBB9_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: clgr %r5, %r1 -; CHECK-NEXT: lhi %r14, 0 -; CHECK-NEXT: lochile %r14, 1 -; CHECK-NEXT: clgr %r4, %r0 -; CHECK-NEXT: lhi %r13, 0 -; CHECK-NEXT: lochile %r13, 1 -; CHECK-NEXT: cgr %r5, %r1 -; CHECK-NEXT: locre %r14, %r13 -; CHECK-NEXT: chi %r14, 0 -; CHECK-NEXT: selgrlh %r13, %r4, %r0 -; CHECK-NEXT: selgrlh %r12, %r5, %r1 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 -; CHECK-NEXT: jl .LBB9_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: jlh .LBB9_4 +; CHECK-NEXT: # %bb.3: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: .LBB9_4: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v1 +; CHECK-NEXT: jnl .LBB9_1 +; CHECK-NEXT: # %bb.5: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v0 +; CHECK-NEXT: j .LBB9_1 +; CHECK-NEXT: .LBB9_6: # %atomicrmw.end +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw umin ptr %src, i128 %b seq_cst ret i128 %res @@ -367,40 +285,35 @@ define i128 @atomicrmw_umin(ptr %src, i128 %b) { define i128 @atomicrmw_umax(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_umax: ; CHECK: # %bb.0: -; 
CHECK-NEXT: stmg %r10, %r15, 80(%r15) -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r4, 8(%r3) -; CHECK-NEXT: lg %r5, 0(%r3) +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 4 +; CHECK-NEXT: j .LBB10_2 ; CHECK-NEXT: .LBB10_1: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB10_2 Depth=1 +; CHECK-NEXT: vlgvg %r1, %v1, 1 +; CHECK-NEXT: vlgvg %r0, %v1, 0 +; CHECK-NEXT: vlgvg %r5, %v2, 1 +; CHECK-NEXT: vlgvg %r4, %v2, 0 +; CHECK-NEXT: cdsg %r0, %r4, 0(%r3) +; CHECK-NEXT: vlvgp %v1, %r0, %r1 +; CHECK-NEXT: je .LBB10_6 +; CHECK-NEXT: .LBB10_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: clgr %r5, %r1 -; CHECK-NEXT: lhi %r14, 0 -; CHECK-NEXT: lochih %r14, 1 -; CHECK-NEXT: clgr %r4, %r0 -; CHECK-NEXT: lhi %r13, 0 -; CHECK-NEXT: lochih %r13, 1 -; CHECK-NEXT: cgr %r5, %r1 -; CHECK-NEXT: locre %r14, %r13 -; CHECK-NEXT: chi %r14, 0 -; CHECK-NEXT: selgrlh %r13, %r4, %r0 -; CHECK-NEXT: selgrlh %r12, %r5, %r1 -; CHECK-NEXT: lgr %r10, %r5 -; CHECK-NEXT: lgr %r11, %r4 -; CHECK-NEXT: cdsg %r10, %r12, 0(%r3) -; CHECK-NEXT: lgr %r4, %r11 -; CHECK-NEXT: lgr %r5, %r10 +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: jlh .LBB10_4 +; CHECK-NEXT: # %bb.3: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB10_2 Depth=1 +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: .LBB10_4: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB10_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v1 ; CHECK-NEXT: jl .LBB10_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: stg %r4, 8(%r2) -; CHECK-NEXT: lmg %r10, %r15, 80(%r15) +; CHECK-NEXT: # %bb.5: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB10_2 Depth=1 +; CHECK-NEXT: vlr %v2, %v0 +; CHECK-NEXT: j .LBB10_1 +; CHECK-NEXT: .LBB10_6: # %atomicrmw.end +; CHECK-NEXT: vst %v1, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw umax ptr %src, i128 %b seq_cst ret i128 %res @@ -410,46 +323,37 @@ define i128 @atomicrmw_umax(ptr %src, i128 %b) { define i128 @atomicrmw_uinc_wrap(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_uinc_wrap: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r8, %r15, 64(%r15) -; CHECK-NEXT: .cfi_offset %r8, -96 -; CHECK-NEXT: .cfi_offset %r9, -88 -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r5, 8(%r3) -; CHECK-NEXT: lg %r14, 0(%r3) -; CHECK-NEXT: lghi %r4, 0 +; CHECK-NEXT: larl %r1, .LCPI11_0 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r3), 4 +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: j .LBB11_2 ; CHECK-NEXT: .LBB11_1: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: vlgvg %r1, %v2, 1 +; CHECK-NEXT: vlgvg %r0, %v2, 0 +; CHECK-NEXT: vlgvg %r5, %v3, 1 +; CHECK-NEXT: vlgvg %r4, %v3, 0 +; CHECK-NEXT: cdsg %r0, %r4, 0(%r3) +; CHECK-NEXT: vlvgp %v2, %r0, %r1 +; CHECK-NEXT: je .LBB11_6 +; CHECK-NEXT: .LBB11_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: alghsik %r13, %r5, 1 -; CHECK-NEXT: lgr %r12, %r14 -; CHECK-NEXT: lhi %r11, 0 -; CHECK-NEXT: alcgr %r12, %r4 -; CHECK-NEXT: clgr %r14, %r1 -; 
CHECK-NEXT: lochihe %r11, 1 -; CHECK-NEXT: clgr %r5, %r0 -; CHECK-NEXT: lhi %r10, 0 -; CHECK-NEXT: lochihe %r10, 1 -; CHECK-NEXT: cgr %r14, %r1 -; CHECK-NEXT: locre %r11, %r10 -; CHECK-NEXT: chi %r11, 0 -; CHECK-NEXT: locghilh %r13, 0 -; CHECK-NEXT: locghilh %r12, 0 -; CHECK-NEXT: lgr %r8, %r14 -; CHECK-NEXT: lgr %r9, %r5 -; CHECK-NEXT: cdsg %r8, %r12, 0(%r3) -; CHECK-NEXT: lgr %r5, %r9 -; CHECK-NEXT: lgr %r14, %r8 -; CHECK-NEXT: jl .LBB11_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r14, 0(%r2) -; CHECK-NEXT: stg %r5, 8(%r2) -; CHECK-NEXT: lmg %r8, %r15, 64(%r15) +; CHECK-NEXT: veclg %v2, %v0 +; CHECK-NEXT: jlh .LBB11_4 +; CHECK-NEXT: # %bb.3: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: vchlgs %v3, %v0, %v2 +; CHECK-NEXT: .LBB11_4: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: vgbm %v3, 0 +; CHECK-NEXT: jnl .LBB11_1 +; CHECK-NEXT: # %bb.5: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: vaq %v3, %v2, %v1 +; CHECK-NEXT: j .LBB11_1 +; CHECK-NEXT: .LBB11_6: # %atomicrmw.end +; CHECK-NEXT: vst %v2, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw uinc_wrap ptr %src, i128 %b seq_cst ret i128 %res @@ -459,50 +363,45 @@ define i128 @atomicrmw_uinc_wrap(ptr %src, i128 %b) { define i128 @atomicrmw_udec_wrap(ptr %src, i128 %b) { ; CHECK-LABEL: atomicrmw_udec_wrap: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r6, %r15, 48(%r15) -; CHECK-NEXT: .cfi_offset %r6, -112 -; CHECK-NEXT: .cfi_offset %r7, -104 -; CHECK-NEXT: .cfi_offset %r9, -88 -; CHECK-NEXT: .cfi_offset %r10, -80 -; CHECK-NEXT: .cfi_offset %r11, -72 -; CHECK-NEXT: .cfi_offset %r12, -64 -; CHECK-NEXT: .cfi_offset %r13, -56 -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r4) -; CHECK-NEXT: lg %r1, 0(%r4) -; CHECK-NEXT: lg %r5, 8(%r3) -; CHECK-NEXT: lg %r14, 0(%r3) -; CHECK-NEXT: lghi %r4, -1 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v3, 0(%r3), 4 +; CHECK-NEXT: vgbm %v1, 65535 +; CHECK-NEXT: vgbm %v2, 0 +; CHECK-NEXT: j .LBB12_2 ; CHECK-NEXT: .LBB12_1: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: vlgvg %r1, %v3, 1 +; CHECK-NEXT: vlgvg %r0, %v3, 0 +; CHECK-NEXT: vlgvg %r5, %v4, 1 +; CHECK-NEXT: vlgvg %r4, %v4, 0 +; CHECK-NEXT: cdsg %r0, %r4, 0(%r3) +; CHECK-NEXT: vlvgp %v3, %r0, %r1 +; CHECK-NEXT: je .LBB12_8 +; CHECK-NEXT: .LBB12_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: alghsik %r13, %r5, -1 -; CHECK-NEXT: lgr %r12, %r14 -; CHECK-NEXT: lhi %r10, 0 -; CHECK-NEXT: alcgr %r12, %r4 -; CHECK-NEXT: ogrk %r11, %r5, %r14 -; CHECK-NEXT: lhi %r11, 0 -; CHECK-NEXT: lochie %r11, 1 -; CHECK-NEXT: clgr %r14, %r1 -; CHECK-NEXT: lochih %r10, 1 -; CHECK-NEXT: clgr %r5, %r0 -; CHECK-NEXT: lhi %r9, 0 -; CHECK-NEXT: lochih %r9, 1 -; CHECK-NEXT: cgr %r14, %r1 -; CHECK-NEXT: locre %r10, %r9 -; CHECK-NEXT: or %r11, %r10 -; CHECK-NEXT: selgrl %r11, %r0, %r13 -; CHECK-NEXT: selgrl %r10, %r1, %r12 -; CHECK-NEXT: lgr %r6, %r14 -; CHECK-NEXT: lgr %r7, %r5 -; CHECK-NEXT: cdsg %r6, %r10, 0(%r3) -; CHECK-NEXT: lgr %r5, %r7 -; CHECK-NEXT: lgr %r14, %r6 -; CHECK-NEXT: jl .LBB12_1 -; CHECK-NEXT: # %bb.2: # %atomicrmw.end -; CHECK-NEXT: stg %r14, 0(%r2) -; CHECK-NEXT: stg %r5, 8(%r2) -; CHECK-NEXT: lmg %r6, %r15, 48(%r15) +; CHECK-NEXT: veclg %v0, %v3 +; CHECK-NEXT: jlh .LBB12_4 +; CHECK-NEXT: # %bb.3: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: vchlgs %v4, %v3, %v0 +; 
CHECK-NEXT: .LBB12_4: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: vlr %v5, %v0 +; CHECK-NEXT: jl .LBB12_6 +; CHECK-NEXT: # %bb.5: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: vaq %v5, %v3, %v1 +; CHECK-NEXT: .LBB12_6: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: vceqgs %v4, %v3, %v2 +; CHECK-NEXT: vlr %v4, %v0 +; CHECK-NEXT: je .LBB12_1 +; CHECK-NEXT: # %bb.7: # %atomicrmw.start +; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: vlr %v4, %v5 +; CHECK-NEXT: j .LBB12_1 +; CHECK-NEXT: .LBB12_8: # %atomicrmw.end +; CHECK-NEXT: vst %v3, 0(%r2), 3 ; CHECK-NEXT: br %r14 %res = atomicrmw udec_wrap ptr %src, i128 %b seq_cst ret i128 %res diff --git a/llvm/test/CodeGen/SystemZ/bswap-09.ll b/llvm/test/CodeGen/SystemZ/bswap-09.ll new file mode 100644 index 0000000000000..a2d8273c89695 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/bswap-09.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test i128 byteswaps on z13 and higher. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare i128 @llvm.bswap.i128(i128 %a) + +; Check 128-bit register-to-register byteswaps. +define i128 @f1(i128 %a, i128 %b, i128 %c) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r3), 3 +; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: vaq %v1, %v2, %v1 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vl %v0, 0(%r5), 3 +; CHECK-NEXT: vperm %v1, %v1, %v1, %v2 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %in = add i128 %a, %b + %swapped = call i128 @llvm.bswap.i128(i128 %in) + %out = add i128 %swapped, %c + ret i128 %out +} + +; Check 128-bit register-to-memory byteswaps. +define i128 @f2(i128 %a, i128 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: larl %r1, .LCPI1_0 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vperm %v0, %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %in = add i128 %a, %b + %swapped = call i128 @llvm.bswap.i128(i128 %in) + ret i128 %swapped +} + +; Check 128-bit memory-to-register byteswaps. +define i128 @f3(i128 %a, i128 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, .LCPI2_0 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vperm %v1, %v1, %v1, %v2 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %swapped = call i128 @llvm.bswap.i128(i128 %a) + %out = add i128 %swapped, %b + ret i128 %out +} + diff --git a/llvm/test/CodeGen/SystemZ/bswap-10.ll b/llvm/test/CodeGen/SystemZ/bswap-10.ll new file mode 100644 index 0000000000000..6de2970b80e2e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/bswap-10.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test i128 byteswaps on z15 and higher. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s + +declare i128 @llvm.bswap.i128(i128 %a) + +; Check 128-bit register-to-register byteswaps. 
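+; On z15 (vector-enhancements-2), the element-reversal instructions VLBRQ and
+; VSTBRQ handle the memory forms directly (see f2 and f3 below), so only this
+; register-to-register case still needs a VPERM with a literal-pool mask.
+; A C-level source producing this pattern might look like the following
+; sketch (assuming a compiler that provides __builtin_bswap128):
+;   unsigned __int128 f(unsigned __int128 a, unsigned __int128 b,
+;                       unsigned __int128 c) {
+;     return __builtin_bswap128(a + b) + c;
+;   }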
+define i128 @f1(i128 %a, i128 %b, i128 %c) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r3), 3 +; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: vaq %v1, %v2, %v1 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vl %v0, 0(%r5), 3 +; CHECK-NEXT: vperm %v1, %v1, %v1, %v2 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %in = add i128 %a, %b + %swapped = call i128 @llvm.bswap.i128(i128 %in) + %out = add i128 %swapped, %c + ret i128 %out +} + +; Check 128-bit register-to-memory byteswaps. +define i128 @f2(i128 %a, i128 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vstbrq %v0, 0(%r2) +; CHECK-NEXT: br %r14 + %in = add i128 %a, %b + %swapped = call i128 @llvm.bswap.i128(i128 %in) + ret i128 %swapped +} + +; Check 128-bit memory-to-register byteswaps. +define i128 @f3(i128 %a, i128 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vlbrq %v1, 0(%r3) +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %swapped = call i128 @llvm.bswap.i128(i128 %a) + %out = add i128 %swapped, %b + ret i128 %out +} + diff --git a/llvm/test/CodeGen/SystemZ/call-zos-i128.ll b/llvm/test/CodeGen/SystemZ/call-zos-i128.ll new file mode 100644 index 0000000000000..ccdac161d08a9 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/call-zos-i128.ll @@ -0,0 +1,31 @@ +; Test the passing of scalar i128 values on z/OS. +; +; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z13 | FileCheck %s + +; CHECK-LABEL: call_i128: +; CHECK-DAG: larl 1, @CPI0_0 +; CHECK-DAG: vl 0, 0(1), 3 +; CHECK-DAG: vst 0, 2256(4), 3 +; CHECK-DAG: larl 1, @CPI0_1 +; CHECK-DAG: vl 0, 0(1), 3 +; CHECK-DAG: vst 0, 2272(4), 3 +; CHECK-DAG: la 1, 2288(4) +; CHECK-DAG: la 2, 2272(4) +; CHECK-DAG: la 3, 2256(4) + +define i128 @call_i128() { +entry: + %retval = call i128 (i128, i128) @pass_i128(i128 64, i128 65) + ret i128 %retval +} + +; CHECK-LABEL: pass_i128: +; CHECK: vl 0, 0(3), 3 +; CHECK: vl 1, 0(2), 3 +; CHECK: vaq 0, 1, 0 +; CHECK: vst 0, 0(1), 3 +define i128 @pass_i128(i128 %arg0, i128 %arg1) { +entry: + %N = add i128 %arg0, %arg1 + ret i128 %N +} diff --git a/llvm/test/CodeGen/SystemZ/cmpxchg-06.ll b/llvm/test/CodeGen/SystemZ/cmpxchg-06.ll index b9a26abc35069..0aff13aa59b59 100644 --- a/llvm/test/CodeGen/SystemZ/cmpxchg-06.ll +++ b/llvm/test/CodeGen/SystemZ/cmpxchg-06.ll @@ -1,6 +1,7 @@ ; Test 128-bit compare and swap. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-Z10 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-Z13 ; Check CDSG without a displacement. 
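; CDSG (compare double and swap) atomically compares and replaces a 16-byte
; storage operand using even/odd 64-bit register pairs; the atomicrmw tests
; earlier in this patch retry the same instruction in a loop, while a plain
; i128 cmpxchg needs no loop at all. A C-level equivalent is sketched below
; (using the standard GCC/Clang atomic builtins):
;   _Bool cas(__int128 *p, __int128 *expected, __int128 desired) {
;     return __atomic_compare_exchange_n(p, expected, desired, 0,
;                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
;   }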
define i128 @f1(i128 %cmp, i128 %swap, ptr %src) { @@ -118,10 +119,12 @@ define i128 @f9(i128 %cmp, ptr %ptr) { ; CHECK-DAG: lg %r0, 0(%r3) ; CHECK-DAG: lg %r13, 8(%r2) ; CHECK-DAG: lg %r12, 0(%r2) +; CHECK-Z13: lhi %r2, 0 ; CHECK: cdsg %r12, %r0, 0(%r4) -; CHECK-NEXT: ipm %r2 -; CHECK-NEXT: afi %r2, -268435456 -; CHECK-NEXT: srl %r2, 31 +; CHECK-Z10-NEXT: ipm %r2 +; CHECK-Z10-NEXT: afi %r2, -268435456 +; CHECK-Z10-NEXT: srl %r2, 31 +; CHECK-Z13-NEXT: lochie %r2, 1 ; CHECK: br %r14 define i32 @f10(i128 %cmp, i128 %swap, ptr %src) { %pairval = cmpxchg ptr %src, i128 %cmp, i128 %swap seq_cst seq_cst diff --git a/llvm/test/CodeGen/SystemZ/ctpop-03.ll b/llvm/test/CodeGen/SystemZ/ctpop-03.ll new file mode 100644 index 0000000000000..cf1af03050656 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/ctpop-03.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit arithmetic in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare i128 @llvm.ctpop.i128(i128) + +define i128 @f1(i128 %a) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vpopct %v0, %v0, 0 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsumb %v0, %v0, %v1 +; CHECK-NEXT: vsumgf %v0, %v0, %v1 +; CHECK-NEXT: vsumqg %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = tail call i128 @llvm.ctpop.i128(i128 %a) + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/ctpop-04.ll b/llvm/test/CodeGen/SystemZ/ctpop-04.ll new file mode 100644 index 0000000000000..f0627a36fb5e7 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/ctpop-04.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit arithmetic in vector registers on z14 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +declare i128 @llvm.ctpop.i128(i128) + +define i128 @f1(i128 %a) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vpopctg %v0, %v0 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsumqg %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = tail call i128 @llvm.ctpop.i128(i128 %a) + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-19.ll b/llvm/test/CodeGen/SystemZ/fp-conv-19.ll index 034def904af2e..77d6eb1bffa87 100644 --- a/llvm/test/CodeGen/SystemZ/fp-conv-19.ll +++ b/llvm/test/CodeGen/SystemZ/fp-conv-19.ll @@ -84,12 +84,10 @@ define i128 @bitcast_128(ptr %0, ptr %1) { ; Z14-NEXT: vl %v0, 0(%r3), 3 ; Z14-NEXT: vl %v1, 0(%r4), 3 ; Z14-NEXT: wfaxb %v0, %v0, %v1 -; Z14-NEXT: vlgvg %r0, %v0, 1 -; Z14-NEXT: vlgvg %r1, %v0, 0 -; Z14-NEXT: oill %r1, 1 -; Z14-NEXT: oill %r0, 3 -; Z14-NEXT: stg %r0, 8(%r2) -; Z14-NEXT: stg %r1, 0(%r2) +; Z14-NEXT: larl %r1, .LCPI2_0 +; Z14-NEXT: vl %v1, 0(%r1), 3 +; Z14-NEXT: vo %v0, %v0, %v1 +; Z14-NEXT: vst %v0, 0(%r2), 3 ; Z14-NEXT: br %r14 entry: %x = load fp128, ptr %0 diff --git a/llvm/test/CodeGen/SystemZ/fp-conv-20.ll b/llvm/test/CodeGen/SystemZ/fp-conv-20.ll new file mode 100644 index 0000000000000..8006a8beb0789 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-conv-20.ll @@ -0,0 +1,112 @@ +; Test floating-point conversion to/from 128-bit integers. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test signed i128->f128. 
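+; None of the i128<->FP conversions have machine instructions on any current
+; model, so they all lower to compiler-rt/libgcc libcalls regardless of
+; -mcpu (hence the two RUN lines checking the same output). A C-level
+; equivalent of this first case is sketched below (on s390x-linux,
+; long double is an IEEE-format fp128):
+;   long double f(__int128 i) { return i; }   /* calls __floattitf */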
+define fp128 @f1(i128 %i) {
+; CHECK-LABEL: f1:
+; CHECK: brasl %r14, __floattitf@PLT
+; CHECK: br %r14
+  %conv = sitofp i128 %i to fp128
+  ret fp128 %conv
+}
+
+; Test signed i128->f64.
+define double @f2(i128 %i) {
+; CHECK-LABEL: f2:
+; CHECK: brasl %r14, __floattidf@PLT
+; CHECK: br %r14
+  %conv = sitofp i128 %i to double
+  ret double %conv
+}
+
+; Test signed i128->f32.
+define float @f3(i128 %i) {
+; CHECK-LABEL: f3:
+; CHECK: brasl %r14, __floattisf@PLT
+; CHECK: br %r14
+  %conv = sitofp i128 %i to float
+  ret float %conv
+}
+
+; Test unsigned i128->f128.
+define fp128 @f4(i128 %i) {
+; CHECK-LABEL: f4:
+; CHECK: brasl %r14, __floatuntitf@PLT
+; CHECK: br %r14
+  %conv = uitofp i128 %i to fp128
+  ret fp128 %conv
+}
+
+; Test unsigned i128->f64.
+define double @f5(i128 %i) {
+; CHECK-LABEL: f5:
+; CHECK: brasl %r14, __floatuntidf@PLT
+; CHECK: br %r14
+  %conv = uitofp i128 %i to double
+  ret double %conv
+}
+
+; Test unsigned i128->f32.
+define float @f6(i128 %i) {
+; CHECK-LABEL: f6:
+; CHECK: brasl %r14, __floatuntisf@PLT
+; CHECK: br %r14
+  %conv = uitofp i128 %i to float
+  ret float %conv
+}
+
+; Test signed f128->i128.
+define i128 @f7(fp128 %f) {
+; CHECK-LABEL: f7:
+; CHECK: brasl %r14, __fixtfti@PLT
+; CHECK: br %r14
+  %conv = fptosi fp128 %f to i128
+  ret i128 %conv
+}
+
+; Test signed f64->i128.
+define i128 @f8(double %f) {
+; CHECK-LABEL: f8:
+; CHECK: brasl %r14, __fixdfti@PLT
+; CHECK: br %r14
+  %conv = fptosi double %f to i128
+  ret i128 %conv
+}
+
+; Test signed f32->i128.
+define i128 @f9(float %f) {
+; CHECK-LABEL: f9:
+; CHECK: brasl %r14, __fixsfti@PLT
+; CHECK: br %r14
+  %conv = fptosi float %f to i128
+  ret i128 %conv
+}
+
+; Test unsigned f128->i128.
+define i128 @f10(fp128 %f) {
+; CHECK-LABEL: f10:
+; CHECK: brasl %r14, __fixunstfti@PLT
+; CHECK: br %r14
+  %conv = fptoui fp128 %f to i128
+  ret i128 %conv
+}
+
+; Test unsigned f64->i128.
+define i128 @f11(double %f) {
+; CHECK-LABEL: f11:
+; CHECK: brasl %r14, __fixunsdfti@PLT
+; CHECK: br %r14
+  %conv = fptoui double %f to i128
+  ret i128 %conv
+}
+
+; Test unsigned f32->i128.
+define i128 @f12(float %f) {
+; CHECK-LABEL: f12:
+; CHECK: brasl %r14, __fixunssfti@PLT
+; CHECK: br %r14
+  %conv = fptoui float %f to i128
+  ret i128 %conv
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
new file mode 100644
index 0000000000000..eb0ff4b825609
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
@@ -0,0 +1,148 @@
+; Test floating-point strict conversion to/from 128-bit integers.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare fp128 @llvm.experimental.constrained.sitofp.f128.i128(i128, metadata, metadata)
+declare double @llvm.experimental.constrained.sitofp.f64.i128(i128, metadata, metadata)
+declare float @llvm.experimental.constrained.sitofp.f32.i128(i128, metadata, metadata)
+
+declare fp128 @llvm.experimental.constrained.uitofp.f128.i128(i128, metadata, metadata)
+declare double @llvm.experimental.constrained.uitofp.f64.i128(i128, metadata, metadata)
+declare float @llvm.experimental.constrained.uitofp.f32.i128(i128, metadata, metadata)
+
+declare i128 @llvm.experimental.constrained.fptosi.i128.f128(fp128, metadata)
+declare i128 @llvm.experimental.constrained.fptosi.i128.f64(double, metadata)
+declare i128 @llvm.experimental.constrained.fptosi.i128.f32(float, metadata)
+
+declare i128 @llvm.experimental.constrained.fptoui.i128.f128(fp128, metadata)
+declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata)
+declare i128 @llvm.experimental.constrained.fptoui.i128.f32(float, metadata)
+
+; Test signed i128->f128.
+define fp128 @f1(i128 %i) {
+; CHECK-LABEL: f1:
+; CHECK: brasl %r14, __floattitf@PLT
+; CHECK: br %r14
+  %conv = call fp128 @llvm.experimental.constrained.sitofp.f128.i128(i128 %i,
+                                        metadata !"round.dynamic",
+                                        metadata !"fpexcept.strict") #0
+  ret fp128 %conv
+}
+
+; Test signed i128->f64.
+define double @f2(i128 %i) {
+; CHECK-LABEL: f2:
+; CHECK: brasl %r14, __floattidf@PLT
+; CHECK: br %r14
+  %conv = call double @llvm.experimental.constrained.sitofp.f64.i128(i128 %i,
+                                        metadata !"round.dynamic",
+                                        metadata !"fpexcept.strict") #0
+  ret double %conv
+}
+
+; Test signed i128->f32.
+define float @f3(i128 %i) {
+; CHECK-LABEL: f3:
+; CHECK: brasl %r14, __floattisf@PLT
+; CHECK: br %r14
+  %conv = call float @llvm.experimental.constrained.sitofp.f32.i128(i128 %i,
+                                        metadata !"round.dynamic",
+                                        metadata !"fpexcept.strict") #0
+  ret float %conv
+}
+
+; Test unsigned i128->f128.
+define fp128 @f4(i128 %i) {
+; CHECK-LABEL: f4:
+; CHECK: brasl %r14, __floatuntitf@PLT
+; CHECK: br %r14
+  %conv = call fp128 @llvm.experimental.constrained.uitofp.f128.i128(i128 %i,
+                                        metadata !"round.dynamic",
+                                        metadata !"fpexcept.strict") #0
+  ret fp128 %conv
+}
+
+; Test unsigned i128->f64.
+define double @f5(i128 %i) {
+; CHECK-LABEL: f5:
+; CHECK: brasl %r14, __floatuntidf@PLT
+; CHECK: br %r14
+  %conv = call double @llvm.experimental.constrained.uitofp.f64.i128(i128 %i,
+                                        metadata !"round.dynamic",
+                                        metadata !"fpexcept.strict") #0
+  ret double %conv
+}
+
+; Test unsigned i128->f32.
+define float @f6(i128 %i) {
+; CHECK-LABEL: f6:
+; CHECK: brasl %r14, __floatuntisf@PLT
+; CHECK: br %r14
+  %conv = call float @llvm.experimental.constrained.uitofp.f32.i128(i128 %i,
+                                        metadata !"round.dynamic",
+                                        metadata !"fpexcept.strict") #0
+  ret float %conv
+}
+
+; Test signed f128->i128.
+define i128 @f7(fp128 %f) {
+; CHECK-LABEL: f7:
+; CHECK: brasl %r14, __fixtfti@PLT
+; CHECK: br %r14
+  %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f128(fp128 %f,
+                                        metadata !"fpexcept.strict") #0
+  ret i128 %conv
+}
+
+; Test signed f64->i128.
+define i128 @f8(double %f) {
+; CHECK-LABEL: f8:
+; CHECK: brasl %r14, __fixdfti@PLT
+; CHECK: br %r14
+  %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f64(double %f,
+                                        metadata !"fpexcept.strict") #0
+  ret i128 %conv
+}
+
+; Test signed f32->i128.
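+; The constrained intrinsics carry rounding and exception metadata but lower
+; to the same libcalls as the non-strict forms, since a call already has the
+; required side-effect ordering. A C-level sketch of this case (assuming
+; strict FP semantics are requested, e.g. via FENV_ACCESS):
+;   #pragma STDC FENV_ACCESS ON
+;   __int128 g(float f) { return (__int128)f; }   /* calls __fixsfti */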
+define i128 @f9(float %f) { +; CHECK-LABEL: f9: +; CHECK: brasl %r14, __fixsfti@PLT +; CHECK: br %r14 + %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f32(float %f, + metadata !"fpexcept.strict") #0 + ret i128 %conv +} + +; Test unsigned f128->i128. +define i128 @f10(fp128 %f) { +; CHECK-LABEL: f10: +; CHECK: brasl %r14, __fixunstfti@PLT +; CHECK: br %r14 + %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f128(fp128 %f, + metadata !"fpexcept.strict") #0 + ret i128 %conv +} + +; Test unsigned f64->i128. +define i128 @f11(double %f) { +; CHECK-LABEL: f11: +; CHECK: brasl %r14, __fixunsdfti@PLT +; CHECK: br %r14 + %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f64(double %f, + metadata !"fpexcept.strict") #0 + ret i128 %conv +} + +; Test unsigned f32->i128. +define i128 @f12(float %f) { +; CHECK-LABEL: f12: +; CHECK: brasl %r14, __fixunssfti@PLT +; CHECK: br %r14 + %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f32(float %f, + metadata !"fpexcept.strict") #0 + ret i128 %conv +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/SystemZ/int-abs-02.ll b/llvm/test/CodeGen/SystemZ/int-abs-02.ll new file mode 100644 index 0000000000000..67082b50b3a0f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-abs-02.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit absolute value in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define i128 @f1(i128 %src) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 127 +; CHECK-NEXT: vsrab %v2, %v0, %v1 +; CHECK-NEXT: vsra %v1, %v2, %v1 +; CHECK-NEXT: vx %v0, %v0, %v1 +; CHECK-NEXT: vsq %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp slt i128 %src, 0 + %neg = sub i128 0, %src + %res = select i1 %cmp, i128 %neg, i128 %src + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/int-add-19.ll b/llvm/test/CodeGen/SystemZ/int-add-19.ll new file mode 100644 index 0000000000000..6a4eea1027db8 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-add-19.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit addition in vector registers on z13 and later +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = add i128 %a, %b + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-63.ll b/llvm/test/CodeGen/SystemZ/int-cmp-63.ll new file mode 100644 index 0000000000000..0c0e6b7b33630 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-cmp-63.ll @@ -0,0 +1,237 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit comparisons in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -verify-machineinstrs | FileCheck %s + +; Equality comparison. 
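+; VCEQGS sets the condition code (CC 0 when all doubleword elements compare
+; equal), so an i128 equality test feeds a load-on-condition directly with
+; no scalar compare. A C-level equivalent (sketch):
+;   long f(__int128 a, __int128 b, long x, long y) { return a == b ? x : y; }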
+define i64 @f1(i128 %value1, i128 %value2, i64 %a, i64 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vceqgs %v0, %v1, %v0 +; CHECK-NEXT: locgrnhe %r4, %r5 +; CHECK-NEXT: lgr %r2, %r4 +; CHECK-NEXT: br %r14 + %cond = icmp eq i128 %value1, %value2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Inequality comparison. +define i64 @f2(i128 %value1, i128 %value2, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vceqgs %v0, %v1, %v0 +; CHECK-NEXT: locgre %r4, %r5 +; CHECK-NEXT: lgr %r2, %r4 +; CHECK-NEXT: br %r14 + %cond = icmp ne i128 %value1, %value2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Signed greater-than comparison. +define i64 @f3(i128 %value1, i128 %value2, i64 %a, i64 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: jlh .LBB2_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v1, %v0 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: locgrl %r5, %r4 +; CHECK-NEXT: lgr %r2, %r5 +; CHECK-NEXT: br %r14 + %cond = icmp sgt i128 %value1, %value2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Signed less-than comparison. +define i64 @f4(i128 %value1, i128 %value2, i64 %a, i64 %b) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: jlh .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v1, %v0 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: locgrl %r5, %r4 +; CHECK-NEXT: lgr %r2, %r5 +; CHECK-NEXT: br %r14 + %cond = icmp slt i128 %value1, %value2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Signed greater-or-equal comparison. +define i64 @f5(i128 %value1, i128 %value2, i64 %a, i64 %b) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: jlh .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v1, %v0 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: locgrnl %r5, %r4 +; CHECK-NEXT: lgr %r2, %r5 +; CHECK-NEXT: br %r14 + %cond = icmp sge i128 %value1, %value2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Signed less-or-equal comparison. +define i64 @f6(i128 %value1, i128 %value2, i64 %a, i64 %b) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: jlh .LBB5_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v1, %v0 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: locgrnl %r5, %r4 +; CHECK-NEXT: lgr %r2, %r5 +; CHECK-NEXT: br %r14 + %cond = icmp sle i128 %value1, %value2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Unsigned greater-than comparison. +define i64 @f7(i128 %value1, i128 %value2, i64 %a, i64 %b) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: jlh .LBB6_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: vchlgs %v0, %v1, %v0 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: locgrl %r5, %r4 +; CHECK-NEXT: lgr %r2, %r5 +; CHECK-NEXT: br %r14 + %cond = icmp ugt i128 %value1, %value2 + %res = select i1 %cond, i64 %a, i64 %b + ret i64 %res +} + +; Unsigned less-than comparison. 
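+; The ordered comparisons use a two-step idiom: VECG (signed) or VECLG
+; (unsigned) first compares the high doublewords and sets the condition code;
+; only when those are equal (the JLH branch falls through) does VCHLGS decide
+; the result from an unsigned compare of the low parts. A C-level equivalent
+; of this case (sketch):
+;   long f(unsigned __int128 a, unsigned __int128 b, long x, long y) {
+;     return a < b ? x : y;
+;   }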
+define i64 @f8(i128 %value1, i128 %value2, i64 %a, i64 %b) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: veclg %v0, %v1
+; CHECK-NEXT: jlh .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vchlgs %v0, %v1, %v0
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: locgrl %r5, %r4
+; CHECK-NEXT: lgr %r2, %r5
+; CHECK-NEXT: br %r14
+  %cond = icmp ult i128 %value1, %value2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; Unsigned greater-or-equal comparison.
+define i64 @f9(i128 %value1, i128 %value2, i64 %a, i64 %b) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r2), 3
+; CHECK-NEXT: vl %v1, 0(%r3), 3
+; CHECK-NEXT: veclg %v0, %v1
+; CHECK-NEXT: jlh .LBB8_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vchlgs %v0, %v1, %v0
+; CHECK-NEXT: .LBB8_2:
+; CHECK-NEXT: locgrnl %r5, %r4
+; CHECK-NEXT: lgr %r2, %r5
+; CHECK-NEXT: br %r14
+  %cond = icmp uge i128 %value1, %value2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; Unsigned less-or-equal comparison.
+define i64 @f10(i128 %value1, i128 %value2, i64 %a, i64 %b) {
+; CHECK-LABEL: f10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: veclg %v0, %v1
+; CHECK-NEXT: jlh .LBB9_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vchlgs %v0, %v1, %v0
+; CHECK-NEXT: .LBB9_2:
+; CHECK-NEXT: locgrnl %r5, %r4
+; CHECK-NEXT: lgr %r2, %r5
+; CHECK-NEXT: br %r14
+  %cond = icmp ule i128 %value1, %value2
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; Use VTM for "x & y == 0" comparison.
+define i64 @f11(i128 %value1, i128 %value2, i64 %a, i64 %b) {
+; CHECK-LABEL: f11:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vtm %v1, %v0
+; CHECK-NEXT: locgrnhe %r4, %r5
+; CHECK-NEXT: lgr %r2, %r4
+; CHECK-NEXT: br %r14
+  %and = and i128 %value1, %value2
+  %cond = icmp eq i128 %and, 0
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; Use VTM for "x & y != 0" comparison.
+define i64 @f12(i128 %value1, i128 %value2, i64 %a, i64 %b) {
+; CHECK-LABEL: f12:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vl %v1, 0(%r2), 3
+; CHECK-NEXT: vtm %v1, %v0
+; CHECK-NEXT: locgre %r4, %r5
+; CHECK-NEXT: lgr %r2, %r4
+; CHECK-NEXT: br %r14
+  %and = and i128 %value1, %value2
+  %cond = icmp ne i128 %and, 0
+  %res = select i1 %cond, i64 %a, i64 %b
+  ret i64 %res
+}
+
+; Select between i128 values.
+define i128 @f13(i64 %value1, i64 %value2, i128 %a, i128 %b) {
+; CHECK-LABEL: f13:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r5), 3
+; CHECK-NEXT: cgrje %r3, %r4, .LBB12_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vl %v1, 0(%r6), 3
+; CHECK-NEXT: vaq %v0, %v0, %v1
+; CHECK-NEXT: .LBB12_2:
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+  %cond = icmp eq i64 %value1, %value2
+  %sum = add i128 %a, %b
+  %res = select i1 %cond, i128 %a, i128 %sum
+  ret i128 %res
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/int-const-07.ll b/llvm/test/CodeGen/SystemZ/int-const-07.ll
new file mode 100644
index 0000000000000..cb691c078408d
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/int-const-07.ll
@@ -0,0 +1,47 @@
+; Test loading of 128-bit constants in vector registers on z13
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Constant zero.
+define i128 @f1() {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vgbm %v0, 0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+  ret i128 0
+}
+
+; Constant created using VGBM.
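+; VGBM (vector generate byte mask) expands each of the 16 mask bits to a
+; 0x00 or 0xff byte, so a mask of 1 sets only the rightmost byte. For
+; example:
+;   vgbm %v0, 1   ; bytes 0..14 = 0x00, byte 15 = 0xff, i.e. i128 255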
+define i128 @f2() { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + ret i128 255 +} + +; Constant created using VREPIB. +define i128 @f3() { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + ret i128 1334440654591915542993625911497130241 +} + +; Constant loaded from literal pool. +define i128 @f4() { +; CHECK-LABEL: .LCPI3_0: +; CHECK-NEXT: .quad 54210108624275221 +; CHECK-NEXT: .quad -5527149226598858752 +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, .LCPI3_0 +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + ret i128 1000000000000000000000000000000000000 +} diff --git a/llvm/test/CodeGen/SystemZ/int-conv-14.ll b/llvm/test/CodeGen/SystemZ/int-conv-14.ll new file mode 100644 index 0000000000000..98dc88f289620 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-conv-14.ll @@ -0,0 +1,416 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit arithmetic in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Sign extension from i64. +define i128 @f1(i64 %a) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: srag %r0, %r3, 63 +; CHECK-NEXT: vlvgp %v0, %r0, %r3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = sext i64 %a to i128 + ret i128 %res +} + +; Sign extension from i64 from memory. +define i128 @f2(ptr %ptr) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepg %v0, 0(%r3) +; CHECK-NEXT: vrepib %v1, 64 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i64, ptr %ptr + %res = sext i64 %a to i128 + ret i128 %res +} + +; Zero extension from i64. +define i128 @f3(i64 %a) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vlvgg %v0, %r3, 1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = zext i64 %a to i128 + ret i128 %res +} + +; Zero extension from i64 from memory. +define i128 @f4(ptr %ptr) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vleg %v0, 0(%r3), 1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i64, ptr %ptr + %res = zext i64 %a to i128 + ret i128 %res +} + +; Truncation to i64. +define i64 @f5(i128 %a) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i64 + ret i64 %res +} + +; Truncation to i64 in memory. +define void @f6(ptr %ptr, i128 %a) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vsteg %v0, 0(%r2), 1 +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i64 + store i64 %res, ptr %ptr + ret void +} + +; Sign extension from i32. +define i128 @f7(i32 %a) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: lgfr %r0, %r3 +; CHECK-NEXT: srag %r1, %r0, 63 +; CHECK-NEXT: vlvgp %v0, %r1, %r0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = sext i32 %a to i128 + ret i128 %res +} + +; Sign extension from i32 from memory. 
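+; Sign extensions from memory replicate the loaded value with VLREP and then
+; shift the whole 128-bit register right arithmetically by 128 minus the
+; source width (here 96 bits, done with VSRAB alone since the amount is a
+; byte multiple), smearing the sign bit across the high part. C-level sketch:
+;   __int128 f(const int *p) { return *p; }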
+define i128 @f8(ptr %ptr) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r3) +; CHECK-NEXT: vrepib %v1, 96 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i32, ptr %ptr + %res = sext i32 %a to i128 + ret i128 %res +} + +; Zero extension from i32. +define i128 @f9(i32 %a) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vlvgf %v0, %r3, 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = zext i32 %a to i128 + ret i128 %res +} + +; Zero extension from i32 from memory. +define i128 @f10(ptr %ptr) { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vlef %v0, 0(%r3), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i32, ptr %ptr + %res = zext i32 %a to i128 + ret i128 %res +} + +; Truncation to i32. +define i32 @f11(i128 %a) { +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vlgvf %r2, %v0, 3 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i32 + ret i32 %res +} + +; Truncation to i32 in memory. +define void @f12(ptr %ptr, i128 %a) { +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vstef %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i32 + store i32 %res, ptr %ptr + ret void +} + +; Sign extension from i16. +define i128 @f13(i16 %a) { +; CHECK-LABEL: f13: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: lghr %r0, %r3 +; CHECK-NEXT: srag %r1, %r0, 63 +; CHECK-NEXT: vlvgp %v0, %r1, %r0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = sext i16 %a to i128 + ret i128 %res +} + +; Sign extension from i16 from memory. +define i128 @f14(ptr %ptr) { +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r3) +; CHECK-NEXT: vrepib %v1, 112 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i16, ptr %ptr + %res = sext i16 %a to i128 + ret i128 %res +} + +; Zero extension from i16. +define i128 @f15(i16 %a) { +; CHECK-LABEL: f15: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vlvgh %v0, %r3, 7 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = zext i16 %a to i128 + ret i128 %res +} + +; Zero extension from i16 from memory. +define i128 @f16(ptr %ptr) { +; CHECK-LABEL: f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vleh %v0, 0(%r3), 7 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i16, ptr %ptr + %res = zext i16 %a to i128 + ret i128 %res +} + +; Truncation to i16. +define i16 @f17(i128 %a) { +; CHECK-LABEL: f17: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vlgvf %r2, %v0, 3 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i16 + ret i16 %res +} + +; Truncation to i16 in memory. +define void @f18(ptr %ptr, i128 %a) { +; CHECK-LABEL: f18: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vsteh %v0, 0(%r2), 7 +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i16 + store i16 %res, ptr %ptr + ret void +} + +; Sign extension from i8. 
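+; For register inputs the extension happens in GPRs (LGBR, plus SRAG for the
+; sign half) and VLVGP then packs the two 64-bit halves into a vector
+; register. C-level sketch:
+;   __int128 f(signed char c) { return c; }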
+define i128 @f19(i8 %a) { +; CHECK-LABEL: f19: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3l killed $r3l def $r3d +; CHECK-NEXT: lgbr %r0, %r3 +; CHECK-NEXT: srag %r1, %r0, 63 +; CHECK-NEXT: vlvgp %v0, %r1, %r0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = sext i8 %a to i128 + ret i128 %res +} + +; Sign extension from i8 from memory. +define i128 @f20(ptr %ptr) { +; CHECK-LABEL: f20: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r3) +; CHECK-NEXT: vrepib %v1, 120 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i8, ptr %ptr + %res = sext i8 %a to i128 + ret i128 %res +} + +; Zero extension from i8. +define i128 @f21(i8 %a) { +; CHECK-LABEL: f21: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vlvgb %v0, %r3, 15 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = zext i8 %a to i128 + ret i128 %res +} + +; Zero extension from i8 from memory. +define i128 @f22(ptr %ptr) { +; CHECK-LABEL: f22: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vleb %v0, 0(%r3), 15 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i8, ptr %ptr + %res = zext i8 %a to i128 + ret i128 %res +} + +; Truncation to i8. +define i8 @f23(i128 %a) { +; CHECK-LABEL: f23: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vlgvf %r2, %v0, 3 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i8 + ret i8 %res +} + +; Truncation to i8 in memory. +define void @f24(ptr %ptr, i128 %a) { +; CHECK-LABEL: f24: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vsteb %v0, 0(%r2), 15 +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i8 + store i8 %res, ptr %ptr + ret void +} + +; Sign extension from i1. +define i128 @f25(i1 %a) { +; CHECK-LABEL: f25: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, .LCPI24_0 +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vn %v0, %v0, %v1 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = sext i1 %a to i128 + ret i128 %res +} + +; Sign extension from i1 from memory. +define i128 @f26(ptr %ptr) { +; CHECK-LABEL: f26: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vleb %v1, 0(%r3), 15 +; CHECK-NEXT: larl %r1, .LCPI25_0 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vn %v1, %v1, %v2 +; CHECK-NEXT: vsq %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i1, ptr %ptr + %res = sext i1 %a to i128 + ret i128 %res +} + +; Zero extension from i1. +define i128 @f27(i1 %a) { +; CHECK-LABEL: f27: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, .LCPI26_0 +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vn %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = zext i1 %a to i128 + ret i128 %res +} + +; Zero extension from i1 from memory. +define i128 @f28(ptr %ptr) { +; CHECK-LABEL: f28: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vleb %v0, 0(%r3), 15 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %a = load i1, ptr %ptr + %res = zext i1 %a to i128 + ret i128 %res +} + +; Truncation to i1. 
+define i1 @f29(i128 %a) { +; CHECK-LABEL: f29: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vlgvf %r2, %v0, 3 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i1 + ret i1 %res +} + +; Truncation to i1 in memory. +define void @f30(ptr %ptr, i128 %a) { +; CHECK-LABEL: f30: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: larl %r1, .LCPI29_0 +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vaq %v0, %v0, %v0 +; CHECK-NEXT: vn %v0, %v0, %v1 +; CHECK-NEXT: vsteb %v0, 0(%r2), 15 +; CHECK-NEXT: br %r14 + %op = add i128 %a, %a + %res = trunc i128 %op to i1 + store i1 %res, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/int-div-07.ll b/llvm/test/CodeGen/SystemZ/int-div-07.ll new file mode 100644 index 0000000000000..9cee91f7b3c63 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-div-07.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit division and remainder in vector registers on z13 using libcalls +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Divide signed. +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -208 +; CHECK-NEXT: .cfi_def_cfa_offset 368 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: la %r2, 192(%r15) +; CHECK-NEXT: la %r3, 176(%r15) +; CHECK-NEXT: la %r4, 160(%r15) +; CHECK-NEXT: vst %v1, 160(%r15), 3 +; CHECK-NEXT: vst %v0, 176(%r15), 3 +; CHECK-NEXT: brasl %r14, __divti3@PLT +; CHECK-NEXT: vl %v0, 192(%r15), 3 +; CHECK-NEXT: vst %v0, 0(%r13), 3 +; CHECK-NEXT: lmg %r13, %r15, 312(%r15) +; CHECK-NEXT: br %r14 + %res = sdiv i128 %a, %b + ret i128 %res +} + +; Divide unsigned. +define i128 @f2(i128 %a, i128 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -208 +; CHECK-NEXT: .cfi_def_cfa_offset 368 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: la %r2, 192(%r15) +; CHECK-NEXT: la %r3, 176(%r15) +; CHECK-NEXT: la %r4, 160(%r15) +; CHECK-NEXT: vst %v1, 160(%r15), 3 +; CHECK-NEXT: vst %v0, 176(%r15), 3 +; CHECK-NEXT: brasl %r14, __udivti3@PLT +; CHECK-NEXT: vl %v0, 192(%r15), 3 +; CHECK-NEXT: vst %v0, 0(%r13), 3 +; CHECK-NEXT: lmg %r13, %r15, 312(%r15) +; CHECK-NEXT: br %r14 + %res = udiv i128 %a, %b + ret i128 %res +} + +; Remainder signed. 
+define i128 @f3(i128 %a, i128 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -208 +; CHECK-NEXT: .cfi_def_cfa_offset 368 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: la %r2, 192(%r15) +; CHECK-NEXT: la %r3, 176(%r15) +; CHECK-NEXT: la %r4, 160(%r15) +; CHECK-NEXT: vst %v1, 160(%r15), 3 +; CHECK-NEXT: vst %v0, 176(%r15), 3 +; CHECK-NEXT: brasl %r14, __modti3@PLT +; CHECK-NEXT: vl %v0, 192(%r15), 3 +; CHECK-NEXT: vst %v0, 0(%r13), 3 +; CHECK-NEXT: lmg %r13, %r15, 312(%r15) +; CHECK-NEXT: br %r14 + %res = srem i128 %a, %b + ret i128 %res +} + +; Remainder unsigned. +define i128 @f4(i128 %a, i128 %b) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -208 +; CHECK-NEXT: .cfi_def_cfa_offset 368 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: la %r2, 192(%r15) +; CHECK-NEXT: la %r3, 176(%r15) +; CHECK-NEXT: la %r4, 160(%r15) +; CHECK-NEXT: vst %v1, 160(%r15), 3 +; CHECK-NEXT: vst %v0, 176(%r15), 3 +; CHECK-NEXT: brasl %r14, __umodti3@PLT +; CHECK-NEXT: vl %v0, 192(%r15), 3 +; CHECK-NEXT: vst %v0, 0(%r13), 3 +; CHECK-NEXT: lmg %r13, %r15, 312(%r15) +; CHECK-NEXT: br %r14 + %res = urem i128 %a, %b + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/int-max-01.ll b/llvm/test/CodeGen/SystemZ/int-max-01.ll new file mode 100644 index 0000000000000..404a9a159304f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-max-01.ll @@ -0,0 +1,204 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test i128 maximum on z13. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define i128 @f1(i128 %val1, i128 %val2) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vecg %v1, %v0 +; CHECK-NEXT: je .LBB0_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jl .LBB0_2 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp slt i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with sle. +define i128 @f2(i128 %val1, i128 %val2) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: je .LBB1_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB1_4 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jnl .LBB1_2 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp sle i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with sgt. 
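+; The remaining predicates reuse the same VECG/VECLG + VCHLGS skeleton with
+; the operands and branch senses permuted; only the select direction changes.
+; C-level sketch of this case:
+;   __int128 f(__int128 a, __int128 b) { return a > b ? a : b; }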
+define i128 @f3(i128 %val1, i128 %val2) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vecg %v1, %v0 +; CHECK-NEXT: je .LBB2_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB2_4 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jl .LBB2_2 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} + +; Test with sge. +define i128 @f4(i128 %val1, i128 %val2) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: je .LBB3_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB3_4 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jnl .LBB3_2 +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp sge i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} + +; Test with ult. +define i128 @f5(i128 %val1, i128 %val2) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: veclg %v1, %v0 +; CHECK-NEXT: je .LBB4_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB4_4 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jl .LBB4_2 +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp ult i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with ule. +define i128 @f6(i128 %val1, i128 %val2) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: je .LBB5_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB5_4 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB5_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jnl .LBB5_2 +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp ule i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with ugt. +define i128 @f7(i128 %val1, i128 %val2) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: veclg %v1, %v0 +; CHECK-NEXT: je .LBB6_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB6_4 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB6_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jl .LBB6_2 +; CHECK-NEXT: .LBB6_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} + +; Test with uge. 
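+; The unsigned forms differ only in comparing the high doublewords with
+; VECLG (logical) instead of VECG. C-level sketch of this case:
+;   unsigned __int128 f(unsigned __int128 a, unsigned __int128 b) {
+;     return a >= b ? a : b;
+;   }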
+define i128 @f8(i128 %val1, i128 %val2) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: je .LBB7_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB7_4 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jnl .LBB7_2 +; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp uge i128 %val1, %val2 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} diff --git a/llvm/test/CodeGen/SystemZ/int-min-01.ll b/llvm/test/CodeGen/SystemZ/int-min-01.ll new file mode 100644 index 0000000000000..8280be9c61339 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-min-01.ll @@ -0,0 +1,204 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test i128 minimum on z13. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test with slt. +define i128 @f1(i128 %val1, i128 %val2) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: je .LBB0_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB0_4 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jl .LBB0_2 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp slt i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with sle. +define i128 @f2(i128 %val1, i128 %val2) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vecg %v1, %v0 +; CHECK-NEXT: je .LBB1_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB1_4 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jnl .LBB1_2 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp sle i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with sgt. +define i128 @f3(i128 %val1, i128 %val2) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vecg %v0, %v1 +; CHECK-NEXT: je .LBB2_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB2_4 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jl .LBB2_2 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp sgt i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} + +; Test with sge. 
+define i128 @f4(i128 %val1, i128 %val2) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vecg %v1, %v0 +; CHECK-NEXT: je .LBB3_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB3_4 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jnl .LBB3_2 +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp sge i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} + +; Test with ult. +define i128 @f5(i128 %val1, i128 %val2) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: je .LBB4_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB4_4 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB4_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jl .LBB4_2 +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp ult i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with ule. +define i128 @f6(i128 %val1, i128 %val2) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: veclg %v1, %v0 +; CHECK-NEXT: je .LBB5_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB5_4 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB5_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jnl .LBB5_2 +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp ule i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val2, i128 %val1 + ret i128 %ret +} + +; Test with ugt. +define i128 @f7(i128 %val1, i128 %val2) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: veclg %v0, %v1 +; CHECK-NEXT: je .LBB6_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jnl .LBB6_4 +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB6_3: +; CHECK-NEXT: vchlgs %v2, %v1, %v0 +; CHECK-NEXT: jl .LBB6_2 +; CHECK-NEXT: .LBB6_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp ugt i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} + +; Test with uge. 
+define i128 @f8(i128 %val1, i128 %val2) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: veclg %v1, %v0 +; CHECK-NEXT: je .LBB7_3 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: jl .LBB7_4 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB7_3: +; CHECK-NEXT: vchlgs %v2, %v0, %v1 +; CHECK-NEXT: jnl .LBB7_2 +; CHECK-NEXT: .LBB7_4: +; CHECK-NEXT: vlr %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %cmp = icmp uge i128 %val2, %val1 + %ret = select i1 %cmp, i128 %val1, i128 %val2 + ret i128 %ret +} diff --git a/llvm/test/CodeGen/SystemZ/int-mul-12.ll b/llvm/test/CodeGen/SystemZ/int-mul-12.ll new file mode 100644 index 0000000000000..e7005f50a12fd --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-mul-12.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit multiplication on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Multiplication. +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r12, %r15, 96(%r15) +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: lg %r13, 8(%r3) +; CHECK-NEXT: lg %r0, 8(%r4) +; CHECK-NEXT: lgr %r1, %r13 +; CHECK-NEXT: mlgr %r12, %r0 +; CHECK-NEXT: msg %r1, 0(%r4) +; CHECK-NEXT: msg %r0, 0(%r3) +; CHECK-NEXT: agr %r1, %r12 +; CHECK-NEXT: agr %r0, %r1 +; CHECK-NEXT: stg %r13, 8(%r2) +; CHECK-NEXT: stg %r0, 0(%r2) +; CHECK-NEXT: lmg %r12, %r15, 96(%r15) +; CHECK-NEXT: br %r14 + %res = mul i128 %a, %b + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/int-mul-13.ll b/llvm/test/CodeGen/SystemZ/int-mul-13.ll new file mode 100644 index 0000000000000..82937cf66c629 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-mul-13.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; Test high-part i64->i128 multiplications on z13. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Check zero-extended multiplication in which only the high part is used. +define i64 @f1(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlgr %r2, %r4 +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check sign-extended multiplication in which only the high part is used. +; This needs a rather convoluted sequence. +define i64 @f2(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: srag %r1, %r4, 63 +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: srag %r0, %r3, 63 +; CHECK-NEXT: ngr %r1, %r3 +; CHECK-NEXT: mlgr %r2, %r4 +; CHECK-NEXT: ngr %r0, %r4 +; CHECK-NEXT: agr %r0, %r1 +; CHECK-NEXT: sgr %r2, %r0 +; CHECK-NEXT: br %r14 + %ax = sext i64 %a to i128 + %bx = sext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check zero-extended multiplication in which only part of the high half +; is used. 
+define i64 @f3(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlgr %r2, %r4 +; CHECK-NEXT: srlg %r2, %r2, 3 +; CHECK-NEXT: br %r14 + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 67 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check zero-extended multiplication in which the result is split into +; high and low halves. +define i64 @f4(i64 %dummy, i64 %a, i64 %b) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlgr %r2, %r4 +; CHECK-NEXT: ogr %r2, %r3 +; CHECK-NEXT: br %r14 + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + %low = trunc i128 %mulx to i64 + %or = or i64 %high, %low + ret i64 %or +} + +; Check division by a constant, which should use multiplication instead. +define i64 @f5(i64 %dummy, i64 %a) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: llihf %r0, 1782028570 +; CHECK-NEXT: oilf %r0, 598650223 +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlgr %r2, %r0 +; CHECK-NEXT: srlg %r2, %r2, 9 +; CHECK-NEXT: br %r14 + %res = udiv i64 %a, 1234 + ret i64 %res +} + +; Check MLG with no displacement. +define i64 @f6(i64 %dummy, i64 %a, ptr %src) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlg %r2, 0(%r4) +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %b = load i64, ptr %src + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the aligned MLG range. +define i64 @f7(i64 %dummy, i64 %a, ptr %src) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlg %r2, 524280(%r4) +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %ptr = getelementptr i64, ptr %src, i64 65535 + %b = load i64, ptr %ptr + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword up, which requires separate address logic. +; Other sequences besides this one would be OK. +define i64 @f8(i64 %dummy, i64 %a, ptr %src) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: agfi %r4, 524288 +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlg %r2, 0(%r4) +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %ptr = getelementptr i64, ptr %src, i64 65536 + %b = load i64, ptr %ptr + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the high end of the negative aligned MLG range. 
+define i64 @f9(i64 %dummy, i64 %a, ptr %src) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlg %r2, -8(%r4) +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %ptr = getelementptr i64, ptr %src, i64 -1 + %b = load i64, ptr %ptr + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the low end of the MLG range. +define i64 @f10(i64 %dummy, i64 %a, ptr %src) { +; CHECK-LABEL: f10: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlg %r2, -524288(%r4) +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %ptr = getelementptr i64, ptr %src, i64 -65536 + %b = load i64, ptr %ptr + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check the next doubleword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i64 @f11(ptr %dest, i64 %a, ptr %src) { +; CHECK-LABEL: f11: +; CHECK: # %bb.0: +; CHECK-NEXT: agfi %r4, -524296 +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlg %r2, 0(%r4) +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %ptr = getelementptr i64, ptr %src, i64 -65537 + %b = load i64, ptr %ptr + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + +; Check that MLG allows an index. +define i64 @f12(ptr %dest, i64 %a, i64 %src, i64 %index) { +; CHECK-LABEL: f12: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r3d killed $r3d def $r2q +; CHECK-NEXT: mlg %r2, 524287(%r5,%r4) +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %add1 = add i64 %src, %index + %add2 = add i64 %add1, 524287 + %ptr = inttoptr i64 %add2 to ptr + %b = load i64, ptr %ptr + %ax = zext i64 %a to i128 + %bx = zext i64 %b to i128 + %mulx = mul i128 %ax, %bx + %highx = lshr i128 %mulx, 64 + %high = trunc i128 %highx to i64 + ret i64 %high +} + diff --git a/llvm/test/CodeGen/SystemZ/int-neg-03.ll b/llvm/test/CodeGen/SystemZ/int-neg-03.ll new file mode 100644 index 0000000000000..89af30f9353e6 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-neg-03.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit negation in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define i128 @f1(i128 %src) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = sub i128 0, %src + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/int-sub-12.ll b/llvm/test/CodeGen/SystemZ/int-sub-12.ll new file mode 100644 index 0000000000000..2e30f1c33aebe --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-sub-12.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit subtraction in vector registers on z13 and later +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = sub i128 %a, %b + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/int-uadd-13.ll b/llvm/test/CodeGen/SystemZ/int-uadd-13.ll new file mode 100644 index 0000000000000..6e319a8110e48 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-uadd-13.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; Test 128-bit addition on z13 and higher +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define zeroext i1 @f1(i128 %a, i128 %b, ptr %res) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaccq %v2, %v1, %v0 +; CHECK-NEXT: vlgvg %r2, %v2, 1 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + %t = call {i128, i1} @llvm.uadd.with.overflow.i128(i128 %a, i128 %b) + %val = extractvalue {i128, i1} %t, 0 + %obit = extractvalue {i128, i1} %t, 1 + store i128 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @f2(i128 %a, i128 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaccq %v0, %v1, %v0 +; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: br %r14 + %t = call {i128, i1} @llvm.uadd.with.overflow.i128(i128 %a, i128 %b) + %obit = extractvalue {i128, i1} %t, 1 + ret i1 %obit +} + +define i128 @f3(i128 %a, i128 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %t = call {i128, i1} @llvm.uadd.with.overflow.i128(i128 %a, i128 %b) + %val = extractvalue {i128, i1} %t, 0 + ret i128 %val +} + +declare {i128, i1} @llvm.uadd.with.overflow.i128(i128, i128) nounwind readnone + diff --git a/llvm/test/CodeGen/SystemZ/int-uadd-14.ll b/llvm/test/CodeGen/SystemZ/int-uadd-14.ll new file mode 100644 index 0000000000000..c8873a4dfadef --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-uadd-14.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; Test 256-bit addition on z13 and higher +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v2, 16(%r3), 3 +; CHECK-NEXT: vl %v3, 16(%r2), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaccq %v4, %v3, %v2 +; CHECK-NEXT: vacccq %v5, %v1, %v0, %v4 +; CHECK-NEXT: vlgvg %r2, %v5, 1 +; CHECK-NEXT: vacq %v0, %v1, %v0, %v4 +; CHECK-NEXT: vaq %v1, %v3, %v2 +; CHECK-NEXT: vst %v1, 16(%r4), 3 +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + %t = call {i256, i1} @llvm.uadd.with.overflow.i256(i256 %a, i256 %b) + %val = extractvalue {i256, i1} %t, 0 + %obit = extractvalue {i256, i1} %t, 1 + store i256 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @f2(i256 %a, i256 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v2, 16(%r3), 3 +; CHECK-NEXT: vl %v3, 16(%r2), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vaccq %v2, %v3, %v2 +; CHECK-NEXT: vacccq %v0, %v1, %v0, %v2 +; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: br %r14 + %t = call {i256, i1} @llvm.uadd.with.overflow.i256(i256 %a, i256 %b) + %obit = 
extractvalue {i256, i1} %t, 1 + ret i1 %obit +} + +define i256 @f3(i256 %a, i256 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v2, 16(%r4), 3 +; CHECK-NEXT: vl %v3, 16(%r3), 3 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vaccq %v4, %v3, %v2 +; CHECK-NEXT: vacq %v0, %v1, %v0, %v4 +; CHECK-NEXT: vaq %v1, %v3, %v2 +; CHECK-NEXT: vst %v1, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %t = call {i256, i1} @llvm.uadd.with.overflow.i256(i256 %a, i256 %b) + %val = extractvalue {i256, i1} %t, 0 + ret i256 %val +} + +declare {i256, i1} @llvm.uadd.with.overflow.i256(i256, i256) nounwind readnone + diff --git a/llvm/test/CodeGen/SystemZ/int-usub-12.ll b/llvm/test/CodeGen/SystemZ/int-usub-12.ll new file mode 100644 index 0000000000000..c39a6da37048d --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-usub-12.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; Test 128-bit subtraction on z13 and higher +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define zeroext i1 @f1(i128 %a, i128 %b, ptr %res) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vscbiq %v2, %v1, %v0 +; CHECK-NEXT: vlgvg %r2, %v2, 1 +; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b) + %val = extractvalue {i128, i1} %t, 0 + %obit = extractvalue {i128, i1} %t, 1 + store i128 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @f2(i128 %a, i128 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vscbiq %v0, %v1, %v0 +; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: br %r14 + %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b) + %obit = extractvalue {i128, i1} %t, 1 + ret i1 %obit +} + +define i128 @f3(i128 %a, i128 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %t = call {i128, i1} @llvm.usub.with.overflow.i128(i128 %a, i128 %b) + %val = extractvalue {i128, i1} %t, 0 + ret i128 %val +} + +declare {i128, i1} @llvm.usub.with.overflow.i128(i128, i128) nounwind readnone + diff --git a/llvm/test/CodeGen/SystemZ/int-usub-13.ll b/llvm/test/CodeGen/SystemZ/int-usub-13.ll new file mode 100644 index 0000000000000..637e1a81de996 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/int-usub-13.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; Test 256-bit subtraction on z13 and higher +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +define zeroext i1 @f1(i256 %a, i256 %b, ptr %res) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v2, 16(%r3), 3 +; CHECK-NEXT: vl %v3, 16(%r2), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vscbiq %v4, %v3, %v2 +; CHECK-NEXT: vsbcbiq %v5, %v1, %v0, %v4 +; CHECK-NEXT: vlgvg %r2, %v5, 1 +; CHECK-NEXT: vsbiq %v0, %v1, %v0, %v4 +; CHECK-NEXT: vsq %v1, %v3, %v2 +; CHECK-NEXT: vst %v1, 16(%r4), 3 +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + %t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b) + %val = extractvalue {i256, i1} %t, 0 + %obit = extractvalue {i256, i1} %t, 1 
+ store i256 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @f2(i256 %a, i256 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v2, 16(%r3), 3 +; CHECK-NEXT: vl %v3, 16(%r2), 3 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r2), 3 +; CHECK-NEXT: vscbiq %v2, %v3, %v2 +; CHECK-NEXT: vsbcbiq %v0, %v1, %v0, %v2 +; CHECK-NEXT: vlgvg %r2, %v0, 1 +; CHECK-NEXT: br %r14 + %t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b) + %obit = extractvalue {i256, i1} %t, 1 + ret i1 %obit +} + +define i256 @f3(i256 %a, i256 %b) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v2, 16(%r4), 3 +; CHECK-NEXT: vl %v3, 16(%r3), 3 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vscbiq %v4, %v3, %v2 +; CHECK-NEXT: vsbiq %v0, %v1, %v0, %v4 +; CHECK-NEXT: vsq %v1, %v3, %v2 +; CHECK-NEXT: vst %v1, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %t = call {i256, i1} @llvm.usub.with.overflow.i256(i256 %a, i256 %b) + %val = extractvalue {i256, i1} %t, 0 + ret i256 %val +} + +declare {i256, i1} @llvm.usub.with.overflow.i256(i256, i256) nounwind readnone + diff --git a/llvm/test/CodeGen/SystemZ/or-09.ll b/llvm/test/CodeGen/SystemZ/or-09.ll new file mode 100644 index 0000000000000..645d47ed5d4c0 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/or-09.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit OR in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Or. +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vo %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = or i128 %a, %b + ret i128 %res +} + +; NOR. +define i128 @f2(i128 %a, i128 %b) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vno %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %op = or i128 %a, %b + %res = xor i128 %op, -1 + ret i128 %res +} + +; Complement. +define i128 @f3(i128 %a) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vno %v0, %v0, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = xor i128 %a, -1 + ret i128 %res +} + +; Select. +define i128 @f4(i128 %mask, i128 %true, i128 %false) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 0(%r5), 3 +; CHECK-NEXT: vl %v2, 0(%r4), 3 +; CHECK-NEXT: vsel %v0, %v2, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %notmask = xor i128 %mask, -1 + %res1 = and i128 %true, %mask + %res2 = and i128 %false, %notmask + %res = or i128 %res1, %res2 + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/or-10.ll b/llvm/test/CodeGen/SystemZ/or-10.ll new file mode 100644 index 0000000000000..25429593306d2 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/or-10.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit OR-NOT in vector registers on z14 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; Or with complement. 
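+; (The or-with-complement pattern maps to a single voc, part of the
+; vector-enhancements-1 facility, hence the -mcpu=z14 run line.)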
+define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: voc %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %notb = xor i128 %b, -1 + %res = or i128 %a, %notb + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/regalloc-GR128.ll b/llvm/test/CodeGen/SystemZ/regalloc-GR128.ll index e84e23613d9cb..0970ceb33adf5 100644 --- a/llvm/test/CodeGen/SystemZ/regalloc-GR128.ll +++ b/llvm/test/CodeGen/SystemZ/regalloc-GR128.ll @@ -1,3 +1,4 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -O3 -o /dev/null ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -O3 -o /dev/null ; ; Test that regalloc does not run out of registers diff --git a/llvm/test/CodeGen/SystemZ/rot-03.ll b/llvm/test/CodeGen/SystemZ/rot-03.ll new file mode 100644 index 0000000000000..22e4b13cc8d02 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/rot-03.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Test 128-bit rotate by constant amount. +define i128 @f1(i128 %val) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 100 +; CHECK-NEXT: vsrlb %v2, %v0, %v1 +; CHECK-NEXT: vsrl %v1, %v2, %v1 +; CHECK-NEXT: vrepib %v2, 28 +; CHECK-NEXT: vslb %v0, %v0, %v2 +; CHECK-NEXT: vsl %v0, %v0, %v2 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + + %parta = shl i128 %val, 28 + %partb = lshr i128 %val, 100 + + %rotl = or i128 %parta, %partb + + ret i128 %rotl +} + +; Test 128-bit rotate by constant amount (full bytes). +define i128 @f2(i128 %val) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 96 +; CHECK-NEXT: vrepib %v2, 32 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vslb %v0, %v0, %v2 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + + %parta = shl i128 %val, 32 + %partb = lshr i128 %val, 96 + + %rotl = or i128 %parta, %partb + + ret i128 %rotl +} + +; Test 128-bit rotate by variable amount. +define i128 @f3(i128 %val, i128 %amt) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vslb %v2, %v0, %v1 +; CHECK-NEXT: lhi %r1, 128 +; CHECK-NEXT: sr %r1, %r0 +; CHECK-NEXT: vsl %v1, %v2, %v1 +; CHECK-NEXT: vlvgp %v2, %r1, %r1 +; CHECK-NEXT: vrepb %v2, %v2, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v2 +; CHECK-NEXT: vsrl %v0, %v0, %v2 +; CHECK-NEXT: vo %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + + %inv = sub i128 128, %amt + %parta = shl i128 %val, %amt + %partb = lshr i128 %val, %inv + + %rotl = or i128 %parta, %partb + + ret i128 %rotl +} + diff --git a/llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll b/llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll new file mode 100644 index 0000000000000..e932210d3e71f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s +; +; FIXME: two consecutive immediate adds not fused in i16/i8 functions. 
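+; (In f4-f7 the trailing aghi -32 / ahik -16 or -24 pairs could fold into a
+; single ahik of -48 or -56.)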
+ +declare i64 @llvm.ctlz.i64(i64, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i16 @llvm.ctlz.i16(i16, i1) +declare i8 @llvm.ctlz.i8(i8, i1) + +define i64 @f0(i64 %arg) { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: flogr %r2, %r2 +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 false) + ret i64 %1 +} + +define i64 @f1(i64 %arg) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: flogr %r2, %r2 +; CHECK-NEXT: # kill: def $r2d killed $r2d killed $r2q +; CHECK-NEXT: br %r14 + %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 true) + ret i64 %1 +} + +define i32 @f2(i32 %arg) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: aghi %r2, -32 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 false) + ret i32 %1 +} + +define i32 @f3(i32 %arg) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r0, %r2 +; CHECK-NEXT: flogr %r2, %r0 +; CHECK-NEXT: aghi %r2, -32 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 true) + ret i32 %1 +} + +define i16 @f4(i16 %arg) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llghr %r0, %r2 +; CHECK-NEXT: flogr %r0, %r0 +; CHECK-NEXT: aghi %r0, -32 +; CHECK-NEXT: ahik %r2, %r0, -16 +; CHECK-NEXT: br %r14 + %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false) + ret i16 %1 +} + +define i16 @f5(i16 %arg) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llghr %r0, %r2 +; CHECK-NEXT: flogr %r0, %r0 +; CHECK-NEXT: aghi %r0, -32 +; CHECK-NEXT: ahik %r2, %r0, -16 +; CHECK-NEXT: br %r14 + %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true) + ret i16 %1 +} + +define i8 @f6(i8 %arg) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llgcr %r0, %r2 +; CHECK-NEXT: flogr %r0, %r0 +; CHECK-NEXT: aghi %r0, -32 +; CHECK-NEXT: ahik %r2, %r0, -24 +; CHECK-NEXT: br %r14 + %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false) + ret i8 %1 +} + +define i8 @f7(i8 %arg) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d +; CHECK-NEXT: llgcr %r0, %r2 +; CHECK-NEXT: flogr %r0, %r0 +; CHECK-NEXT: aghi %r0, -32 +; CHECK-NEXT: ahik %r2, %r0, -24 +; CHECK-NEXT: br %r14 + %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true) + ret i8 %1 +} diff --git a/llvm/test/CodeGen/SystemZ/scalar-ctlz-02.ll b/llvm/test/CodeGen/SystemZ/scalar-ctlz-02.ll new file mode 100644 index 0000000000000..2c3bf944cdf89 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/scalar-ctlz-02.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; FIXME: can do better here ... 
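+; (The generic expansion smears the leading one across all lower bits with
+; shifted ORs, inverts, and counts the remaining ones with vpopct plus a
+; vsumb/vsumgf/vsumqg reduction.)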
+ +declare i128 @llvm.ctlz.i128(i128, i1) + +define i128 @f1(i128 %a) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 1 +; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 2 +; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 4 +; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 8 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 16 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 32 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 64 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vno %v0, %v0, %v1 +; CHECK-NEXT: vpopct %v0, %v0, 0 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsumb %v0, %v0, %v1 +; CHECK-NEXT: vsumgf %v0, %v0, %v1 +; CHECK-NEXT: vsumqg %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = tail call i128 @llvm.ctlz.i128(i128 %a, i1 false) + ret i128 %res +} + +define i128 @f2(i128 %a) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 1 +; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 2 +; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 4 +; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 8 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 16 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 32 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vrepib %v1, 64 +; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vno %v0, %v0, %v1 +; CHECK-NEXT: vpopct %v0, %v0, 0 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsumb %v0, %v0, %v1 +; CHECK-NEXT: vsumgf %v0, %v0, %v1 +; CHECK-NEXT: vsumqg %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = tail call i128 @llvm.ctlz.i128(i128 %a, i1 true) + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/scalar-ctlz.ll b/llvm/test/CodeGen/SystemZ/scalar-ctlz.ll deleted file mode 100644 index b3839ecdd995a..0000000000000 --- a/llvm/test/CodeGen/SystemZ/scalar-ctlz.ll +++ /dev/null @@ -1,103 +0,0 @@ -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -; -; FIXME: two consecutive immediate adds not fused in i16/i8 functions. 
- -declare i64 @llvm.ctlz.i64(i64, i1) -declare i32 @llvm.ctlz.i32(i32, i1) -declare i16 @llvm.ctlz.i16(i16, i1) -declare i8 @llvm.ctlz.i8(i8, i1) - -define i64 @f0(i64 %arg) { -; CHECK-LABEL: f0: -; CHECK-LABEL: %bb.0: -; CHECK-NOT: %bb.1: -; CHECK: flogr - %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 false) - ret i64 %1 -} - -define i64 @f1(i64 %arg) { -; CHECK-LABEL: f1: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: flogr -; CHECK-NEXT: # kill -; CHECK-NEXT: br %r14 - %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 true) - ret i64 %1 -} - -define i32 @f2(i32 %arg) { -; CHECK-LABEL: f2: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: llgfr %r0, %r2 -; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: # kill -; CHECK-NEXT: br %r14 - %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 false) - ret i32 %1 -} - -define i32 @f3(i32 %arg) { -; CHECK-LABEL: f3: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: llgfr %r0, %r2 -; CHECK-NEXT: flogr %r2, %r0 -; CHECK-NEXT: aghi %r2, -32 -; CHECK-NEXT: # kill -; CHECK-NEXT: br %r14 - %1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 true) - ret i32 %1 -} - -define i16 @f4(i16 %arg) { -; CHECK-LABEL: f4: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llghr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -16 -; CHECK-NEXT: br %r14 - %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false) - ret i16 %1 -} - -define i16 @f5(i16 %arg) { -; CHECK-LABEL: f5: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llghr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -16 -; CHECK-NEXT: br %r14 - %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true) - ret i16 %1 -} - -define i8 @f6(i8 %arg) { -; CHECK-LABEL: f6: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llgcr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -24 -; CHECK-NEXT: br %r14 - %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false) - ret i8 %1 -} - -define i8 @f7(i8 %arg) { -; CHECK-LABEL: f7: -; CHECK-LABEL: %bb.0: -; CHECK-NEXT: # kill -; CHECK-NEXT: llgcr %r0, %r2 -; CHECK-NEXT: flogr %r0, %r0 -; CHECK-NEXT: aghi %r0, -32 -; CHECK-NEXT: ahik %r2, %r0, -24 -; CHECK-NEXT: br %r14 - %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true) - ret i8 %1 -} diff --git a/llvm/test/CodeGen/SystemZ/scalar-cttz-01.ll b/llvm/test/CodeGen/SystemZ/scalar-cttz-01.ll new file mode 100644 index 0000000000000..7c4747854219b --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/scalar-cttz-01.ll @@ -0,0 +1,129 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare i64 @llvm.cttz.i64(i64, i1) +declare i32 @llvm.cttz.i32(i32, i1) +declare i16 @llvm.cttz.i16(i16, i1) +declare i8 @llvm.cttz.i8(i8, i1) + +define i64 @f0(i64 %arg) { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: lay %r0, -1(%r2) +; CHECK-NEXT: ngr %r2, %r0 +; CHECK-NEXT: xgr %r2, %r0 +; CHECK-NEXT: flogr %r0, %r2 +; CHECK-NEXT: lghi %r2, 64 +; CHECK-NEXT: sgr %r2, %r0 +; CHECK-NEXT: br %r14 + %1 = tail call i64 @llvm.cttz.i64(i64 %arg, i1 false) + ret i64 %1 +} + +define i64 @f1(i64 %arg) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: lay %r0, -1(%r2) +; CHECK-NEXT: ngr %r2, %r0 +; CHECK-NEXT: xgr %r2, %r0 +; CHECK-NEXT: flogr %r0, %r2 +; CHECK-NEXT: lghi %r2, 64 +; CHECK-NEXT: sgr %r2, %r0 +; CHECK-NEXT: br %r14 + %1 = tail call i64 @llvm.cttz.i64(i64 %arg, i1 true) + ret 
i64 %1 +} + +define i32 @f2(i32 %arg) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: ahik %r0, %r2, -1 +; CHECK-NEXT: xilf %r2, 4294967295 +; CHECK-NEXT: nr %r0, %r2 +; CHECK-NEXT: popcnt %r0, %r0 +; CHECK-NEXT: sllk %r1, %r0, 16 +; CHECK-NEXT: ar %r0, %r1 +; CHECK-NEXT: sllk %r1, %r0, 8 +; CHECK-NEXT: ar %r0, %r1 +; CHECK-NEXT: srlk %r2, %r0, 24 +; CHECK-NEXT: br %r14 + %1 = tail call i32 @llvm.cttz.i32(i32 %arg, i1 false) + ret i32 %1 +} + +define i32 @f3(i32 %arg) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: ahik %r0, %r2, -1 +; CHECK-NEXT: xilf %r2, 4294967295 +; CHECK-NEXT: nr %r0, %r2 +; CHECK-NEXT: popcnt %r0, %r0 +; CHECK-NEXT: sllk %r1, %r0, 16 +; CHECK-NEXT: ar %r0, %r1 +; CHECK-NEXT: sllk %r1, %r0, 8 +; CHECK-NEXT: ar %r0, %r1 +; CHECK-NEXT: srlk %r2, %r0, 24 +; CHECK-NEXT: br %r14 + %1 = tail call i32 @llvm.cttz.i32(i32 %arg, i1 true) + ret i32 %1 +} + +define i16 @f4(i16 %arg) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: ahik %r0, %r2, -1 +; CHECK-NEXT: xilf %r2, 4294967295 +; CHECK-NEXT: nr %r0, %r2 +; CHECK-NEXT: llhr %r0, %r0 +; CHECK-NEXT: popcnt %r0, %r0 +; CHECK-NEXT: risblg %r1, %r0, 16, 151, 8 +; CHECK-NEXT: ar %r0, %r1 +; CHECK-NEXT: srlk %r2, %r0, 8 +; CHECK-NEXT: br %r14 + %1 = tail call i16 @llvm.cttz.i16(i16 %arg, i1 false) + ret i16 %1 +} + +define i16 @f5(i16 %arg) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: ahik %r0, %r2, -1 +; CHECK-NEXT: xilf %r2, 4294967295 +; CHECK-NEXT: nr %r0, %r2 +; CHECK-NEXT: llhr %r0, %r0 +; CHECK-NEXT: popcnt %r0, %r0 +; CHECK-NEXT: risblg %r1, %r0, 16, 151, 8 +; CHECK-NEXT: ar %r0, %r1 +; CHECK-NEXT: srlk %r2, %r0, 8 +; CHECK-NEXT: br %r14 + %1 = tail call i16 @llvm.cttz.i16(i16 %arg, i1 true) + ret i16 %1 +} + +define i8 @f6(i8 %arg) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: ahik %r0, %r2, -1 +; CHECK-NEXT: xilf %r2, 4294967295 +; CHECK-NEXT: nr %r0, %r2 +; CHECK-NEXT: llcr %r0, %r0 +; CHECK-NEXT: popcnt %r2, %r0 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %1 = tail call i8 @llvm.cttz.i8(i8 %arg, i1 false) + ret i8 %1 +} + +define i8 @f7(i8 %arg) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: ahik %r0, %r2, -1 +; CHECK-NEXT: xilf %r2, 4294967295 +; CHECK-NEXT: nr %r0, %r2 +; CHECK-NEXT: llcr %r0, %r0 +; CHECK-NEXT: popcnt %r2, %r0 +; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d +; CHECK-NEXT: br %r14 + %1 = tail call i8 @llvm.cttz.i8(i8 %arg, i1 true) + ret i8 %1 +} diff --git a/llvm/test/CodeGen/SystemZ/scalar-cttz-02.ll b/llvm/test/CodeGen/SystemZ/scalar-cttz-02.ll new file mode 100644 index 0000000000000..a8d11be21be2f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/scalar-cttz-02.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit arithmetic in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +declare i128 @llvm.cttz.i128(i128, i1) + +define i128 @f1(i128 %a) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vgbm %v1, 65535 +; CHECK-NEXT: vaq %v1, %v0, %v1 +; CHECK-NEXT: vnc %v0, %v1, %v0 +; CHECK-NEXT: vpopct %v0, %v0, 0 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsumb %v0, %v0, %v1 +; CHECK-NEXT: vsumgf %v0, %v0, %v1 +; CHECK-NEXT: vsumqg %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = tail call i128 @llvm.cttz.i128(i128 %a, i1 false) + ret i128 %res +} + +define i128 @f2(i128 %a) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vgbm %v1, 65535 +; CHECK-NEXT: vaq %v1, %v0, %v1 +; CHECK-NEXT: vnc %v0, %v1, %v0 +; CHECK-NEXT: vpopct %v0, %v0, 0 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vsumb %v0, %v0, %v1 +; CHECK-NEXT: vsumgf %v0, %v0, %v1 +; CHECK-NEXT: vsumqg %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = tail call i128 @llvm.cttz.i128(i128 %a, i1 true) + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/shift-12.ll b/llvm/test/CodeGen/SystemZ/shift-12.ll index 421928f286985..419caeebbbf9d 100644 --- a/llvm/test/CodeGen/SystemZ/shift-12.ll +++ b/llvm/test/CodeGen/SystemZ/shift-12.ll @@ -122,25 +122,12 @@ define i32 @f10(i32 %a, i32 %sh) { define i128 @f11(i128 %a, i32 %sh) { ; CHECK-LABEL: f11: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r14, %r15, 112(%r15) -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 8(%r3) -; CHECK-NEXT: lg %r1, 0(%r3) -; CHECK-NEXT: risblg %r3, %r4, 25, 159, 0 -; CHECK-NEXT: lcr %r14, %r3 -; CHECK-NEXT: sllg %r5, %r1, 0(%r4) -; CHECK-NEXT: srlg %r14, %r0, 0(%r14) -; CHECK-NEXT: ogr %r5, %r14 -; CHECK-NEXT: sllg %r3, %r0, -64(%r3) -; CHECK-NEXT: tmll %r4, 127 -; CHECK-NEXT: locgrle %r3, %r5 -; CHECK-NEXT: sllg %r0, %r0, 0(%r4) -; CHECK-NEXT: locgre %r3, %r1 -; CHECK-NEXT: locghinle %r0, 0 -; CHECK-NEXT: stg %r0, 8(%r2) -; CHECK-NEXT: stg %r3, 0(%r2) -; CHECK-NEXT: lmg %r14, %r15, 112(%r15) +; CHECK-NEXT: vlvgp %v1, %r4, %r4 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vslb %v0, %v0, %v1 +; CHECK-NEXT: vsl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 %and = and i32 %sh, 127 %ext = zext i32 %and to i128 @@ -151,25 +138,12 @@ define i128 @f11(i128 %a, i32 %sh) { define i128 @f12(i128 %a, i32 %sh) { ; CHECK-LABEL: f12: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r14, %r15, 112(%r15) -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 0(%r3) -; CHECK-NEXT: lg %r1, 8(%r3) -; CHECK-NEXT: risblg %r3, %r4, 25, 159, 0 -; CHECK-NEXT: lcr %r14, %r3 -; CHECK-NEXT: srlg %r5, %r1, 0(%r4) -; CHECK-NEXT: sllg %r14, %r0, 0(%r14) -; CHECK-NEXT: ogr %r5, %r14 -; CHECK-NEXT: srlg %r3, %r0, -64(%r3) -; CHECK-NEXT: tmll %r4, 127 -; CHECK-NEXT: locgrle %r3, %r5 -; CHECK-NEXT: srlg %r0, %r0, 0(%r4) -; CHECK-NEXT: locgre %r3, %r1 -; CHECK-NEXT: locghinle %r0, 0 -; CHECK-NEXT: stg %r0, 0(%r2) -; CHECK-NEXT: stg %r3, 8(%r2) -; CHECK-NEXT: lmg %r14, %r15, 112(%r15) +; CHECK-NEXT: vlvgp %v1, %r4, %r4 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vsrl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 %and = and i32 %sh, 127 %ext = zext i32 %and to i128 @@ -180,26 +154,12 @@ define i128 @f12(i128 %a, i32 %sh) { define i128 @f13(i128 %a, i32 %sh) { ; CHECK-LABEL: f13: ; CHECK: # %bb.0: -; CHECK-NEXT: stmg %r14, %r15, 112(%r15) -; CHECK-NEXT: .cfi_offset %r14, -48 -; CHECK-NEXT: .cfi_offset %r15, -40 -; CHECK-NEXT: lg %r0, 0(%r3) -; CHECK-NEXT: lg %r1, 8(%r3) -; CHECK-NEXT: risblg %r3, %r4, 25, 159, 0 -; CHECK-NEXT: lcr %r14, %r3 -; CHECK-NEXT: srlg %r5, %r1, 0(%r4) -; CHECK-NEXT: sllg %r14, %r0, 0(%r14) -; CHECK-NEXT: ogr %r5, %r14 -; CHECK-NEXT: srag %r14, %r0, 0(%r4) -; CHECK-NEXT: srag %r3, %r0, -64(%r3) -; CHECK-NEXT: srag %r0, %r0, 63 -; CHECK-NEXT: tmll %r4, 127 -; CHECK-NEXT: locgrle %r3, %r5 -; CHECK-NEXT: locgre %r3, %r1 -; CHECK-NEXT: locgrle %r0, %r14 -; CHECK-NEXT: stg %r0, 0(%r2) -; 
CHECK-NEXT: stg %r3, 8(%r2) -; CHECK-NEXT: lmg %r14, %r15, 112(%r15) +; CHECK-NEXT: vlvgp %v1, %r4, %r4 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 %and = and i32 %sh, 127 %ext = zext i32 %and to i128 diff --git a/llvm/test/CodeGen/SystemZ/shift-13.ll b/llvm/test/CodeGen/SystemZ/shift-13.ll new file mode 100644 index 0000000000000..e214a18861172 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/shift-13.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit shift left in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Shift left immediate (general case). +define i128 @f1(i128 %a) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 100 +; CHECK-NEXT: vslb %v0, %v0, %v1 +; CHECK-NEXT: vsl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = shl i128 %a, 100 + ret i128 %res +} + +; Shift left immediate (< 8 bits). +define i128 @f2(i128 %a) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 7 +; CHECK-NEXT: vsl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = shl i128 %a, 7 + ret i128 %res +} + +; Shift left immediate (full bytes). +define i128 @f3(i128 %a) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 32 +; CHECK-NEXT: vslb %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = shl i128 %a, 32 + ret i128 %res +} + +; Shift left variable. +define i128 @f4(i128 %a, i128 %sh) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vslb %v0, %v0, %v1 +; CHECK-NEXT: vsl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = shl i128 %a, %sh + ret i128 %res +} + +; Test removal of AND mask with only bottom 7 bits set. +define i128 @f5(i128 %a, i128 %sh) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vslb %v0, %v0, %v1 +; CHECK-NEXT: vsl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 127 + %shift = shl i128 %a, %and + ret i128 %shift +} + +; Test removal of AND mask including but not limited to bottom 7 bits. +define i128 @f6(i128 %a, i128 %sh) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vslb %v0, %v0, %v1 +; CHECK-NEXT: vsl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 511 + %shift = shl i128 %a, %and + ret i128 %shift +} + +; Test that AND is not removed when some lower 7 bits are not set. 
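+; (A mask of 63 clears bit 6 of the amount, so it is kept and applied with an
+; lhi/n pair before the splat.)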
+define i128 @f7(i128 %a, i128 %sh) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lhi %r0, 63
+; CHECK-NEXT: n %r0, 12(%r4)
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vslb %v0, %v0, %v1
+; CHECK-NEXT: vsl %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %and = and i128 %sh, 63
+ %shift = shl i128 %a, %and
+ ret i128 %shift
+}
+
+; Test that AND with two register operands is not affected.
+define i128 @f8(i128 %a, i128 %b, i128 %sh) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v2, 0(%r5), 3
+; CHECK-NEXT: vn %v1, %v2, %v1
+; CHECK-NEXT: vlgvf %r0, %v1, 3
+; CHECK-NEXT: vlvgp %v1, %r0, %r0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepb %v1, %v1, 15
+; CHECK-NEXT: vslb %v0, %v0, %v1
+; CHECK-NEXT: vsl %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %and = and i128 %sh, %b
+ %shift = shl i128 %a, %and
+ ret i128 %shift
+}
+
+; Test that AND is not entirely removed if the result is reused.
+define i128 @f9(i128 %a, i128 %sh) {
+; CHECK-LABEL: f9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: larl %r1, .LCPI8_0
+; CHECK-NEXT: vl %v1, 0(%r4), 3
+; CHECK-NEXT: vl %v2, 0(%r1), 3
+; CHECK-NEXT: vn %v1, %v1, %v2
+; CHECK-NEXT: vlgvf %r0, %v1, 3
+; CHECK-NEXT: vlvgp %v2, %r0, %r0
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepb %v2, %v2, 15
+; CHECK-NEXT: vslb %v0, %v0, %v2
+; CHECK-NEXT: vsl %v0, %v0, %v2
+; CHECK-NEXT: vaq %v0, %v1, %v0
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %and = and i128 %sh, 127
+ %shift = shl i128 %a, %and
+ %reuse = add i128 %and, %shift
+ ret i128 %reuse
+}
+
diff --git a/llvm/test/CodeGen/SystemZ/shift-14.ll b/llvm/test/CodeGen/SystemZ/shift-14.ll
new file mode 100644
index 0000000000000..e45126043f273
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/shift-14.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; Test 128-bit shift right logical in vector registers on z13
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Shift right logical immediate (general case).
+define i128 @f1(i128 %a) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepib %v1, 100
+; CHECK-NEXT: vsrlb %v0, %v0, %v1
+; CHECK-NEXT: vsrl %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %res = lshr i128 %a, 100
+ ret i128 %res
+}
+
+; Shift right logical immediate (< 8 bits).
+define i128 @f2(i128 %a) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepib %v1, 7
+; CHECK-NEXT: vsrl %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %res = lshr i128 %a, 7
+ ret i128 %res
+}
+
+; Shift right logical immediate (full bytes).
+define i128 @f3(i128 %a) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vl %v0, 0(%r3), 3
+; CHECK-NEXT: vrepib %v1, 32
+; CHECK-NEXT: vsrlb %v0, %v0, %v1
+; CHECK-NEXT: vst %v0, 0(%r2), 3
+; CHECK-NEXT: br %r14
+ %res = lshr i128 %a, 32
+ ret i128 %res
+}
+
+; Shift right logical variable.
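+; (Only word 3 of the i128 amount is loaded; its low byte is then splatted
+; with vlvgp/vrepb to feed vsrlb/vsrl.)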
+define i128 @f4(i128 %a, i128 %sh) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vsrl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = lshr i128 %a, %sh + ret i128 %res +} + +; Test removal of AND mask with only bottom 7 bits set. +define i128 @f5(i128 %a, i128 %sh) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vsrl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 127 + %shift = lshr i128 %a, %and + ret i128 %shift +} + +; Test removal of AND mask including but not limited to bottom 7 bits. +define i128 @f6(i128 %a, i128 %sh) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vsrl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 511 + %shift = lshr i128 %a, %and + ret i128 %shift +} + +; Test that AND is not removed when some lower 7 bits are not set. +define i128 @f7(i128 %a, i128 %sh) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: lhi %r0, 63 +; CHECK-NEXT: n %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vsrl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 63 + %shift = lshr i128 %a, %and + ret i128 %shift +} + +; Test that AND with two register operands is not affected. +define i128 @f8(i128 %a, i128 %b, i128 %sh) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r5), 3 +; CHECK-NEXT: vn %v1, %v2, %v1 +; CHECK-NEXT: vlgvf %r0, %v1, 3 +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vsrl %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, %b + %shift = lshr i128 %a, %and + ret i128 %shift +} + +; Test that AND is not entirely removed if the result is reused. 
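+; (The masked amount also feeds the vaq, so the vn against the 127 mask from
+; the constant pool cannot be dropped.)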
+define i128 @f9(i128 %a, i128 %sh) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, .LCPI8_0 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vn %v1, %v1, %v2 +; CHECK-NEXT: vlgvf %r0, %v1, 3 +; CHECK-NEXT: vlvgp %v2, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v2, %v2, 15 +; CHECK-NEXT: vsrlb %v0, %v0, %v2 +; CHECK-NEXT: vsrl %v0, %v0, %v2 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 127 + %shift = lshr i128 %a, %and + %reuse = add i128 %and, %shift + ret i128 %reuse +} + diff --git a/llvm/test/CodeGen/SystemZ/shift-15.ll b/llvm/test/CodeGen/SystemZ/shift-15.ll new file mode 100644 index 0000000000000..e21d05c4c91c8 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/shift-15.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit shift right arithmetic in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Shift right arithmetic immediate (general case). +define i128 @f1(i128 %a) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 100 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = ashr i128 %a, 100 + ret i128 %res +} + +; Shift right arithmetic immediate (< 8 bits). +define i128 @f2(i128 %a) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 7 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = ashr i128 %a, 7 + ret i128 %res +} + +; Shift right arithmetic immediate (full bytes). +define i128 @f3(i128 %a) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepib %v1, 32 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = ashr i128 %a, 32 + ret i128 %res +} + +; Shift right arithmetic variable. +define i128 @f4(i128 %a, i128 %sh) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = ashr i128 %a, %sh + ret i128 %res +} + +; Test removal of AND mask with only bottom 7 bits set. +define i128 @f5(i128 %a, i128 %sh) { +; CHECK-LABEL: f5: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 127 + %shift = ashr i128 %a, %and + ret i128 %shift +} + +; Test removal of AND mask including but not limited to bottom 7 bits. +define i128 @f6(i128 %a, i128 %sh) { +; CHECK-LABEL: f6: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 511 + %shift = ashr i128 %a, %and + ret i128 %shift +} + +; Test that AND is not removed when some lower 7 bits are not set. 
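+; (As in the logical-shift tests, the 63 mask survives as an lhi/n pair.)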
+define i128 @f7(i128 %a, i128 %sh) { +; CHECK-LABEL: f7: +; CHECK: # %bb.0: +; CHECK-NEXT: lhi %r0, 63 +; CHECK-NEXT: n %r0, 12(%r4) +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 63 + %shift = ashr i128 %a, %and + ret i128 %shift +} + +; Test that AND with two register operands is not affected. +define i128 @f8(i128 %a, i128 %b, i128 %sh) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r5), 3 +; CHECK-NEXT: vn %v1, %v2, %v1 +; CHECK-NEXT: vlgvf %r0, %v1, 3 +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v1, %v1, 15 +; CHECK-NEXT: vsrab %v0, %v0, %v1 +; CHECK-NEXT: vsra %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, %b + %shift = ashr i128 %a, %and + ret i128 %shift +} + +; Test that AND is not entirely removed if the result is reused. +define i128 @f9(i128 %a, i128 %sh) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, .LCPI8_0 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vn %v1, %v1, %v2 +; CHECK-NEXT: vlgvf %r0, %v1, 3 +; CHECK-NEXT: vlvgp %v2, %r0, %r0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vrepb %v2, %v2, 15 +; CHECK-NEXT: vsrab %v0, %v0, %v2 +; CHECK-NEXT: vsra %v0, %v0, %v2 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %and = and i128 %sh, 127 + %shift = ashr i128 %a, %and + %reuse = add i128 %and, %shift + ret i128 %reuse +} + diff --git a/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll b/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll index 3c0d6cbc4e5a1..a7b5eddc5985e 100644 --- a/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll +++ b/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll @@ -366,14 +366,11 @@ define void @fun_3Eltsx2x4i(ptr %Dst) { ret void } -; i128 replicated '1': not using vrepib, but should compile. 
define void @fun_16x1i(ptr %Dst) { ; CHECK-LABEL: fun_16x1i: ; CHECK: # %bb.0: -; CHECK-NEXT: llihf %r0, 16843009 -; CHECK-NEXT: oilf %r0, 16843009 -; CHECK-NEXT: stg %r0, 8(%r2) -; CHECK-NEXT: stg %r0, 0(%r2) +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 store i128 1334440654591915542993625911497130241, ptr %Dst ret void diff --git a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll index 1507f2c3581b2..249136af1c737 100644 --- a/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -5,18 +5,35 @@ define void @fun0(<4 x i31> %src, ptr %p) ; CHECK-LABEL: fun0: ; CHECK: # %bb.0: -; CHECK-NEXT: vlgvf %r1, %v24, 0 +; CHECK-NEXT: vlgvf %r0, %v24, 0 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: vrepib %v1, 93 ; CHECK-NEXT: vlgvf %r0, %v24, 1 -; CHECK-NEXT: sllg %r1, %r1, 29 -; CHECK-NEXT: rosbg %r1, %r0, 35, 63, 62 -; CHECK-NEXT: nihh %r1, 4095 -; CHECK-NEXT: stg %r1, 0(%r2) -; CHECK-NEXT: vlgvf %r1, %v24, 2 -; CHECK-NEXT: sllg %r0, %r0, 62 -; CHECK-NEXT: rosbg %r0, %r1, 2, 32, 31 -; CHECK-NEXT: vlgvf %r1, %v24, 3 -; CHECK-NEXT: rosbg %r0, %r1, 33, 63, 0 -; CHECK-NEXT: stg %r0, 8(%r2) +; CHECK-NEXT: vslb %v0, %v0, %v1 +; CHECK-NEXT: larl %r1, .LCPI0_0 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vsl %v0, %v0, %v1 +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vn %v1, %v1, %v2 +; CHECK-NEXT: vrepib %v3, 62 +; CHECK-NEXT: vslb %v1, %v1, %v3 +; CHECK-NEXT: vlgvf %r0, %v24, 2 +; CHECK-NEXT: vsl %v1, %v1, %v3 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vn %v1, %v1, %v2 +; CHECK-NEXT: vrepib %v3, 31 +; CHECK-NEXT: vslb %v1, %v1, %v3 +; CHECK-NEXT: vlgvf %r0, %v24, 3 +; CHECK-NEXT: vsl %v1, %v1, %v3 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: larl %r1, .LCPI0_1 +; CHECK-NEXT: vn %v1, %v1, %v2 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 0(%r1), 3 +; CHECK-NEXT: vn %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 { store <4 x i31> %src, ptr %p @@ -73,42 +90,81 @@ define i16 @fun1(<16 x i1> %src) define void @fun2(<8 x i32> %src, ptr %p) ; CHECK-LABEL: fun2: ; CHECK: # %bb.0: -; CHECK-NEXT: vlgvf %r1, %v26, 3 -; CHECK-NEXT: vlgvf %r5, %v24, 0 -; CHECK-NEXT: vlgvf %r3, %v24, 1 -; CHECK-NEXT: srlk %r0, %r1, 8 -; CHECK-NEXT: sth %r0, 28(%r2) -; CHECK-NEXT: vlgvf %r0, %v24, 2 -; CHECK-NEXT: sllg %r5, %r5, 33 -; CHECK-NEXT: sllg %r4, %r3, 58 -; CHECK-NEXT: risbgn %r0, %r0, 6, 164, 27 -; CHECK-NEXT: rosbg %r5, %r3, 31, 55, 2 -; CHECK-NEXT: vlgvf %r3, %v26, 2 -; CHECK-NEXT: stc %r1, 30(%r2) -; CHECK-NEXT: ogr %r4, %r0 -; CHECK-NEXT: risbgn %r1, %r1, 33, 167, 0 -; CHECK-NEXT: rosbg %r5, %r4, 56, 63, 8 -; CHECK-NEXT: risbgn %r3, %r3, 2, 160, 31 -; CHECK-NEXT: ogr %r1, %r3 -; CHECK-NEXT: vlgvf %r4, %v24, 3 -; CHECK-NEXT: srlg %r1, %r1, 24 -; CHECK-NEXT: rosbg %r0, %r4, 37, 63, 60 -; CHECK-NEXT: st %r1, 24(%r2) -; CHECK-NEXT: vlgvf %r1, %v26, 0 -; CHECK-NEXT: stg %r5, 0(%r2) -; CHECK-NEXT: risbgn %r1, %r1, 4, 162, 29 -; CHECK-NEXT: sllg %r5, %r4, 60 -; CHECK-NEXT: ogr %r5, %r1 -; CHECK-NEXT: sllg %r0, %r0, 8 -; CHECK-NEXT: rosbg %r0, %r5, 56, 63, 8 -; CHECK-NEXT: stg %r0, 8(%r2) +; CHECK-NEXT: vlgvf %r0, %v26, 3 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: srl %r0, 8 +; CHECK-NEXT: vsteb %v0, 30(%r2), 15 +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vsteh %v1, 28(%r2), 7 +; CHECK-NEXT: larl %r1, .LCPI2_0 +; CHECK-NEXT: vl %v1, 0(%r1), 
3 +; CHECK-NEXT: vlgvf %r0, %v26, 2 +; CHECK-NEXT: larl %r1, .LCPI2_1 +; CHECK-NEXT: vl %v2, 0(%r1), 3 +; CHECK-NEXT: vn %v0, %v0, %v1 +; CHECK-NEXT: vlvgp %v1, %r0, %r0 +; CHECK-NEXT: vn %v1, %v1, %v2 +; CHECK-NEXT: vrepib %v3, 31 +; CHECK-NEXT: vslb %v1, %v1, %v3 +; CHECK-NEXT: vsl %v1, %v1, %v3 +; CHECK-NEXT: vo %v0, %v1, %v0 +; CHECK-NEXT: vrepib %v3, 24 +; CHECK-NEXT: vlgvf %r0, %v24, 3 +; CHECK-NEXT: vsrlb %v0, %v0, %v3 +; CHECK-NEXT: vstef %v0, 24(%r2), 3 +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: vrepib %v3, 124 +; CHECK-NEXT: vlgvf %r0, %v26, 0 +; CHECK-NEXT: vslb %v4, %v0, %v3 +; CHECK-NEXT: vsl %v3, %v4, %v3 +; CHECK-NEXT: vlvgp %v4, %r0, %r0 +; CHECK-NEXT: vn %v4, %v4, %v2 ; CHECK-NEXT: vlgvf %r0, %v26, 1 -; CHECK-NEXT: sllg %r4, %r0, 62 -; CHECK-NEXT: ogr %r3, %r4 -; CHECK-NEXT: rosbg %r1, %r0, 35, 63, 62 -; CHECK-NEXT: sllg %r0, %r1, 8 -; CHECK-NEXT: rosbg %r0, %r3, 56, 63, 8 -; CHECK-NEXT: stg %r0, 16(%r2) +; CHECK-NEXT: larl %r1, .LCPI2_2 +; CHECK-NEXT: vrepib %v5, 93 +; CHECK-NEXT: vslb %v4, %v4, %v5 +; CHECK-NEXT: vsl %v4, %v4, %v5 +; CHECK-NEXT: vo %v3, %v3, %v4 +; CHECK-NEXT: vlvgp %v4, %r0, %r0 +; CHECK-NEXT: vlgvf %r0, %v24, 0 +; CHECK-NEXT: vn %v4, %v4, %v2 +; CHECK-NEXT: vrepib %v5, 62 +; CHECK-NEXT: vslb %v4, %v4, %v5 +; CHECK-NEXT: vsl %v4, %v4, %v5 +; CHECK-NEXT: vo %v4, %v3, %v4 +; CHECK-NEXT: vo %v1, %v4, %v1 +; CHECK-NEXT: vrepib %v4, 56 +; CHECK-NEXT: vrepib %v5, 58 +; CHECK-NEXT: vsrlb %v1, %v1, %v4 +; CHECK-NEXT: vsteg %v1, 16(%r2), 1 +; CHECK-NEXT: vrepib %v1, 120 +; CHECK-NEXT: vrepib %v4, 89 +; CHECK-NEXT: vsrlb %v1, %v3, %v1 +; CHECK-NEXT: vlvgp %v3, %r0, %r0 +; CHECK-NEXT: vlgvf %r0, %v24, 1 +; CHECK-NEXT: vslb %v3, %v3, %v4 +; CHECK-NEXT: vsl %v3, %v3, %v4 +; CHECK-NEXT: vlvgp %v4, %r0, %r0 +; CHECK-NEXT: vlgvf %r0, %v24, 2 +; CHECK-NEXT: vn %v4, %v4, %v2 +; CHECK-NEXT: vslb %v4, %v4, %v5 +; CHECK-NEXT: vsl %v4, %v4, %v5 +; CHECK-NEXT: vo %v3, %v3, %v4 +; CHECK-NEXT: vlvgp %v4, %r0, %r0 +; CHECK-NEXT: vn %v2, %v4, %v2 +; CHECK-NEXT: vrepib %v4, 27 +; CHECK-NEXT: vslb %v2, %v2, %v4 +; CHECK-NEXT: vsl %v2, %v2, %v4 +; CHECK-NEXT: vo %v2, %v3, %v2 +; CHECK-NEXT: vl %v3, 0(%r1), 3 +; CHECK-NEXT: vn %v0, %v0, %v3 +; CHECK-NEXT: vrepib %v3, 4 +; CHECK-NEXT: vsrl %v0, %v0, %v3 +; CHECK-NEXT: vo %v0, %v2, %v0 +; CHECK-NEXT: vrepib %v2, 8 +; CHECK-NEXT: vslb %v0, %v0, %v2 +; CHECK-NEXT: vo %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 4 ; CHECK-NEXT: br %r14 { %tmp = trunc <8 x i32> %src to <8 x i31> @@ -120,10 +176,16 @@ define void @fun2(<8 x i32> %src, ptr %p) define void @fun3(ptr %src, ptr %p) ; CHECK-LABEL: fun3: ; CHECK: # %bb.0: -; CHECK-NEXT: llgf %r0, 8(%r2) -; CHECK-NEXT: lg %r1, 0(%r2) -; CHECK-NEXT: stg %r1, 0(%r3) -; CHECK-NEXT: st %r0, 8(%r3) +; CHECK-NEXT: vgbm %v0, 0 +; CHECK-NEXT: vleg %v0, 0(%r2), 1 +; CHECK-NEXT: vgbm %v1, 0 +; CHECK-NEXT: vlef %v1, 8(%r2), 3 +; CHECK-NEXT: vrepib %v2, 32 +; CHECK-NEXT: vslb %v0, %v0, %v2 +; CHECK-NEXT: vo %v0, %v1, %v0 +; CHECK-NEXT: vstef %v0, 8(%r3), 3 +; CHECK-NEXT: vsrlb %v0, %v0, %v2 +; CHECK-NEXT: vsteg %v0, 0(%r3), 1 ; CHECK-NEXT: br %r14 { %tmp = load <3 x i31>, ptr %src diff --git a/llvm/test/CodeGen/SystemZ/tdc-04.ll b/llvm/test/CodeGen/SystemZ/tdc-04.ll index 929285b0ba8fe..8cc78f3de7522 100644 --- a/llvm/test/CodeGen/SystemZ/tdc-04.ll +++ b/llvm/test/CodeGen/SystemZ/tdc-04.ll @@ -2,6 +2,7 @@ ; signbit extraction. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s ; ; Extract sign bit. 
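(Illustration, not part of the patch.) The lshr/ashr tests above pivot on one fact: a 128-bit shift consumes only the low 7 bits of its amount, so an AND whose mask covers all of those bits (127, 511) is redundant and may be dropped by the backend, while a mask that clears some of them (63) must survive. A minimal C sketch of the source pattern, with illustrative function names not taken from the patch:

/* Hypothetical example: the explicit mask is a no-op for the shift
 * itself, so codegen for both functions should be identical
 * (cf. the f4 vs. f5 shift tests above). */
unsigned __int128 shr_unmasked(unsigned __int128 a, unsigned int sh) {
  return a >> sh;            /* caller guarantees sh < 128 */
}
unsigned __int128 shr_masked(unsigned __int128 a, unsigned int sh) {
  return a >> (sh & 127);    /* mask covers all 7 significant bits */
}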
diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll index e69dc9d009a54..8f3867f8aee48 100644 --- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll @@ -37,10 +37,10 @@ declare <16 x i8> @llvm.s390.vaccb(<16 x i8>, <16 x i8>) declare <8 x i16> @llvm.s390.vacch(<8 x i16>, <8 x i16>) declare <4 x i32> @llvm.s390.vaccf(<4 x i32>, <4 x i32>) declare <2 x i64> @llvm.s390.vaccg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vaq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vacq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vaccq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vacccq(<16 x i8>, <16 x i8>, <16 x i8>) +declare i128 @llvm.s390.vaq(i128, i128) +declare i128 @llvm.s390.vacq(i128, i128, i128) +declare i128 @llvm.s390.vaccq(i128, i128) +declare i128 @llvm.s390.vacccq(i128, i128, i128) declare <16 x i8> @llvm.s390.vavgb(<16 x i8>, <16 x i8>) declare <8 x i16> @llvm.s390.vavgh(<8 x i16>, <8 x i16>) declare <4 x i32> @llvm.s390.vavgf(<4 x i32>, <4 x i32>) @@ -53,11 +53,11 @@ declare <4 x i32> @llvm.s390.vcksm(<4 x i32>, <4 x i32>) declare <8 x i16> @llvm.s390.vgfmb(<16 x i8>, <16 x i8>) declare <4 x i32> @llvm.s390.vgfmh(<8 x i16>, <8 x i16>) declare <2 x i64> @llvm.s390.vgfmf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vgfmg(<2 x i64>, <2 x i64>) +declare i128 @llvm.s390.vgfmg(<2 x i64>, <2 x i64>) declare <8 x i16> @llvm.s390.vgfmab(<16 x i8>, <16 x i8>, <8 x i16>) declare <4 x i32> @llvm.s390.vgfmah(<8 x i16>, <8 x i16>, <4 x i32>) declare <2 x i64> @llvm.s390.vgfmaf(<4 x i32>, <4 x i32>, <2 x i64>) -declare <16 x i8> @llvm.s390.vgfmag(<2 x i64>, <2 x i64>, <16 x i8>) +declare i128 @llvm.s390.vgfmag(<2 x i64>, <2 x i64>, i128) declare <16 x i8> @llvm.s390.vmahb(<16 x i8>, <16 x i8>, <16 x i8>) declare <8 x i16> @llvm.s390.vmahh(<8 x i16>, <8 x i16>, <8 x i16>) declare <4 x i32> @llvm.s390.vmahf(<4 x i32>, <4 x i32>, <4 x i32>) @@ -109,16 +109,16 @@ declare <16 x i8> @llvm.s390.vscbib(<16 x i8>, <16 x i8>) declare <8 x i16> @llvm.s390.vscbih(<8 x i16>, <8 x i16>) declare <4 x i32> @llvm.s390.vscbif(<4 x i32>, <4 x i32>) declare <2 x i64> @llvm.s390.vscbig(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.vsq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsbiq(<16 x i8>, <16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vscbiq(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vsbcbiq(<16 x i8>, <16 x i8>, <16 x i8>) +declare i128 @llvm.s390.vsq(i128, i128) +declare i128 @llvm.s390.vsbiq(i128, i128, i128) +declare i128 @llvm.s390.vscbiq(i128, i128) +declare i128 @llvm.s390.vsbcbiq(i128, i128, i128) declare <4 x i32> @llvm.s390.vsumb(<16 x i8>, <16 x i8>) declare <4 x i32> @llvm.s390.vsumh(<8 x i16>, <8 x i16>) declare <2 x i64> @llvm.s390.vsumgh(<8 x i16>, <8 x i16>) declare <2 x i64> @llvm.s390.vsumgf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vsumqf(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) +declare i128 @llvm.s390.vsumqf(<4 x i32>, <4 x i32>) +declare i128 @llvm.s390.vsumqg(<2 x i64>, <2 x i64>) declare i32 @llvm.s390.vtm(<16 x i8>, <16 x i8>) declare {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8>, <16 x i8>) declare {<8 x i16>, i32} @llvm.s390.vceqhs(<8 x i16>, <8 x i16>) @@ -886,45 +886,57 @@ define <2 x i64> @test_vaccg(<2 x i64> %a, <2 x i64> %b) { } ; VAQ. 
-define <16 x i8> @test_vaq(<16 x i8> %a, <16 x i8> %b) { +define i128 @test_vaq(i128 %a, i128 %b) { ; CHECK-LABEL: test_vaq: ; CHECK: # %bb.0: -; CHECK-NEXT: vaq %v24, %v24, %v26 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vaq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vaq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vaq(i128 %a, i128 %b) + ret i128 %res } ; VACQ. -define <16 x i8> @test_vacq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +define i128 @test_vacq(i128 %a, i128 %b, i128 %c) { ; CHECK-LABEL: test_vacq: ; CHECK: # %bb.0: -; CHECK-NEXT: vacq %v24, %v24, %v26, %v28 +; CHECK-NEXT: vl %v0, 0(%r5), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r3), 3 +; CHECK-NEXT: vacq %v0, %v2, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vacq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vacq(i128 %a, i128 %b, i128 %c) + ret i128 %res } ; VACCQ. -define <16 x i8> @test_vaccq(<16 x i8> %a, <16 x i8> %b) { +define i128 @test_vaccq(i128 %a, i128 %b) { ; CHECK-LABEL: test_vaccq: ; CHECK: # %bb.0: -; CHECK-NEXT: vaccq %v24, %v24, %v26 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vaccq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vaccq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vaccq(i128 %a, i128 %b) + ret i128 %res } ; VACCCQ. -define <16 x i8> @test_vacccq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +define i128 @test_vacccq(i128 %a, i128 %b, i128 %c) { ; CHECK-LABEL: test_vacccq: ; CHECK: # %bb.0: -; CHECK-NEXT: vacccq %v24, %v24, %v26, %v28 +; CHECK-NEXT: vl %v0, 0(%r5), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r3), 3 +; CHECK-NEXT: vacccq %v0, %v2, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vacccq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vacccq(i128 %a, i128 %b, i128 %c) + ret i128 %res } ; VAVGB. @@ -1048,13 +1060,14 @@ define <2 x i64> @test_vgfmf(<4 x i32> %a, <4 x i32> %b) { } ; VGFMG. -define <16 x i8> @test_vgfmg(<2 x i64> %a, <2 x i64> %b) { +define i128 @test_vgfmg(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vgfmg: ; CHECK: # %bb.0: -; CHECK-NEXT: vgfmg %v24, %v24, %v26 +; CHECK-NEXT: vgfmg %v0, %v24, %v26 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vgfmg(<2 x i64> %a, <2 x i64> %b) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vgfmg(<2 x i64> %a, <2 x i64> %b) + ret i128 %res } ; VGFMAB. @@ -1091,14 +1104,15 @@ define <2 x i64> @test_vgfmaf(<4 x i32> %a, <4 x i32> %b, <2 x i64> %c) { } ; VGFMAG. -define <16 x i8> @test_vgfmag(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +define i128 @test_vgfmag(<2 x i64> %a, <2 x i64> %b, i128 %c) { ; CHECK-LABEL: test_vgfmag: ; CHECK: # %bb.0: -; CHECK-NEXT: vgfmag %v24, %v24, %v26, %v28 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vgfmag %v0, %v24, %v26, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vgfmag(<2 x i64> %a, <2 x i64> %b, - <16 x i8> %c) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vgfmag(<2 x i64> %a, <2 x i64> %b, i128 %c) + ret i128 %res } ; VMAHB. 
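(Illustration, not part of the patch.) VGFMG and VGFMAG produce a full quadword result, which this change exposes as unsigned __int128 rather than a byte vector. A hedged sketch of source-level use, assuming a z13 target with vector-language support (-march=z13 -mzvector); the wrapper name is invented:

/* Illustrative wrapper: the builtin now yields unsigned __int128
 * directly, per the updated BuiltinsSystemZ.def signature. */
unsigned __int128 gfm_doubleword(__vector unsigned long long a,
                                 __vector unsigned long long b) {
  return __builtin_s390_vgfmg(a, b);
}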
@@ -1650,45 +1664,57 @@ define <2 x i64> @test_vscbig(<2 x i64> %a, <2 x i64> %b) { } ; VSQ. -define <16 x i8> @test_vsq(<16 x i8> %a, <16 x i8> %b) { +define i128 @test_vsq(i128 %a, i128 %b) { ; CHECK-LABEL: test_vsq: ; CHECK: # %bb.0: -; CHECK-NEXT: vsq %v24, %v24, %v26 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vsq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vsq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vsq(i128 %a, i128 %b) + ret i128 %res } ; VSBIQ. -define <16 x i8> @test_vsbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +define i128 @test_vsbiq(i128 %a, i128 %b, i128 %c) { ; CHECK-LABEL: test_vsbiq: ; CHECK: # %bb.0: -; CHECK-NEXT: vsbiq %v24, %v24, %v26, %v28 +; CHECK-NEXT: vl %v0, 0(%r5), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r3), 3 +; CHECK-NEXT: vsbiq %v0, %v2, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vsbiq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vsbiq(i128 %a, i128 %b, i128 %c) + ret i128 %res } ; VSCBIQ. -define <16 x i8> @test_vscbiq(<16 x i8> %a, <16 x i8> %b) { +define i128 @test_vscbiq(i128 %a, i128 %b) { ; CHECK-LABEL: test_vscbiq: ; CHECK: # %bb.0: -; CHECK-NEXT: vscbiq %v24, %v24, %v26 +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vscbiq %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vscbiq(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vscbiq(i128 %a, i128 %b) + ret i128 %res } ; VSBCBIQ. -define <16 x i8> @test_vsbcbiq(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +define i128 @test_vsbcbiq(i128 %a, i128 %b, i128 %c) { ; CHECK-LABEL: test_vsbcbiq: ; CHECK: # %bb.0: -; CHECK-NEXT: vsbcbiq %v24, %v24, %v26, %v28 +; CHECK-NEXT: vl %v0, 0(%r5), 3 +; CHECK-NEXT: vl %v1, 0(%r4), 3 +; CHECK-NEXT: vl %v2, 0(%r3), 3 +; CHECK-NEXT: vsbcbiq %v0, %v2, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vsbcbiq(<16 x i8> %a, <16 x i8> %b, - <16 x i8> %c) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vsbcbiq(i128 %a, i128 %b, i128 %c) + ret i128 %res } ; VSUMB. @@ -1732,23 +1758,25 @@ define <2 x i64> @test_vsumgf(<4 x i32> %a, <4 x i32> %b) { } ; VSUMQF. -define <16 x i8> @test_vsumqf(<4 x i32> %a, <4 x i32> %b) { +define i128 @test_vsumqf(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test_vsumqf: ; CHECK: # %bb.0: -; CHECK-NEXT: vsumqf %v24, %v24, %v26 +; CHECK-NEXT: vsumqf %v0, %v24, %v26 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vsumqf(<4 x i32> %a, <4 x i32> %b) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vsumqf(<4 x i32> %a, <4 x i32> %b) + ret i128 %res } ; VSUMQG. -define <16 x i8> @test_vsumqg(<2 x i64> %a, <2 x i64> %b) { +define i128 @test_vsumqg(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_vsumqg: ; CHECK: # %bb.0: -; CHECK-NEXT: vsumqg %v24, %v24, %v26 +; CHECK-NEXT: vsumqg %v0, %v24, %v26 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vsumqg(<2 x i64> %a, <2 x i64> %b) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vsumqg(<2 x i64> %a, <2 x i64> %b) + ret i128 %res } ; VTM with no processing of the result. 
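(Illustration, not part of the patch.) The quadword add/subtract intrinsics now traffic in i128 directly, matching how plain 128-bit arithmetic is lowered into a vector register. A minimal C sketch, assuming a z13 target; function names are illustrative:

/* Expected to select VAQ/VSQ in a vector register on z13
 * (cf. test_vaq/test_vsq above); no builtin call is needed. */
unsigned __int128 add_u128(unsigned __int128 a, unsigned __int128 b) {
  return a + b;
}
unsigned __int128 sub_u128(unsigned __int128 a, unsigned __int128 b) {
  return a - b;
}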
diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll index 9da1cee3bb4fe..ded83a999f4b1 100644 --- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s declare <2 x i64> @llvm.s390.vbperm(<16 x i8>, <16 x i8>) -declare <16 x i8> @llvm.s390.vmslg(<2 x i64>, <2 x i64>, <16 x i8>, i32) +declare i128 @llvm.s390.vmslg(<2 x i64>, <2 x i64>, i128, i32) declare <16 x i8> @llvm.s390.vlrl(i32, ptr) declare void @llvm.s390.vstrl(<16 x i8>, i32, ptr) @@ -30,23 +30,27 @@ define <2 x i64> @test_vbperm(<16 x i8> %a, <16 x i8> %b) { } ; VMSLG with no shifts. -define <16 x i8> @test_vmslg1(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +define i128 @test_vmslg1(<2 x i64> %a, <2 x i64> %b, i128 %c) { ; CHECK-LABEL: test_vmslg1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmslg %v24, %v24, %v26, %v28, 0 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vmslg %v0, %v24, %v26, %v0, 0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 0) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, i128 %c, i32 0) + ret i128 %res } ; VMSLG with both shifts. -define <16 x i8> @test_vmslg2(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c) { +define i128 @test_vmslg2(<2 x i64> %a, <2 x i64> %b, i128 %c) { ; CHECK-LABEL: test_vmslg2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmslg %v24, %v24, %v26, %v28, 12 +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vmslg %v0, %v24, %v26, %v0, 12 +; CHECK-NEXT: vst %v0, 0(%r2), 3 ; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, <16 x i8> %c, i32 12) - ret <16 x i8> %res + %res = call i128 @llvm.s390.vmslg(<2 x i64> %a, <2 x i64> %b, i128 %c, i32 12) + ret i128 %res } ; VLRLR with the lowest in-range displacement. diff --git a/llvm/test/CodeGen/SystemZ/xor-09.ll b/llvm/test/CodeGen/SystemZ/xor-09.ll new file mode 100644 index 0000000000000..d0287f7fdd77e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/xor-09.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit XOR in vector registers on z13 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; Exclusive or. +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vx %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %res = xor i128 %a, %b + ret i128 %res +} diff --git a/llvm/test/CodeGen/SystemZ/xor-10.ll b/llvm/test/CodeGen/SystemZ/xor-10.ll new file mode 100644 index 0000000000000..bbd4a7861a3dc --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/xor-10.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; Test 128-bit NOT-XOR in vector registers on z14 +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s + +; NOT exclusive or. +define i128 @f1(i128 %a, i128 %b) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 0(%r4), 3 +; CHECK-NEXT: vl %v1, 0(%r3), 3 +; CHECK-NEXT: vnx %v0, %v1, %v0 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 + %op = xor i128 %a, %b + %res = xor i128 %op, -1 + ret i128 %res +}
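(Illustration, not part of the patch.) xor-10.ll checks that the xor-plus-complement pair is matched as one unit. A one-function C sketch of the pattern, assuming -march=z14; the name is illustrative:

/* Expected to fold to a single VNX on z14 rather than separate
 * XOR and complement instructions (cf. f1 in xor-10.ll). */
unsigned __int128 xnor_u128(unsigned __int128 a, unsigned __int128 b) {
  return ~(a ^ b);
}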