diff --git a/clang/include/clang/Basic/BuiltinsSystemZ.def b/clang/include/clang/Basic/BuiltinsSystemZ.def index 079e411364885..b84cf5b9cec9f 100644 --- a/clang/include/clang/Basic/BuiltinsSystemZ.def +++ b/clang/include/clang/Basic/BuiltinsSystemZ.def @@ -105,10 +105,10 @@ TARGET_BUILTIN(__builtin_s390_verimb, "V16UcV16UcV16UcV16UcIi", "nc", "vector") TARGET_BUILTIN(__builtin_s390_verimh, "V8UsV8UsV8UsV8UsIi", "nc", "vector") TARGET_BUILTIN(__builtin_s390_verimf, "V4UiV4UiV4UiV4UiIi", "nc", "vector") TARGET_BUILTIN(__builtin_s390_verimg, "V2ULLiV2ULLiV2ULLiV2ULLiIi", "nc", "vector") -TARGET_BUILTIN(__builtin_s390_verllb, "V16UcV16UcUi", "nc", "vector") -TARGET_BUILTIN(__builtin_s390_verllh, "V8UsV8UsUi", "nc", "vector") -TARGET_BUILTIN(__builtin_s390_verllf, "V4UiV4UiUi", "nc", "vector") -TARGET_BUILTIN(__builtin_s390_verllg, "V2ULLiV2ULLiUi", "nc", "vector") +TARGET_BUILTIN(__builtin_s390_verllb, "V16UcV16UcUc", "nc", "vector") +TARGET_BUILTIN(__builtin_s390_verllh, "V8UsV8UsUc", "nc", "vector") +TARGET_BUILTIN(__builtin_s390_verllf, "V4UiV4UiUc", "nc", "vector") +TARGET_BUILTIN(__builtin_s390_verllg, "V2ULLiV2ULLiUc", "nc", "vector") TARGET_BUILTIN(__builtin_s390_verllvb, "V16UcV16UcV16Uc", "nc", "vector") TARGET_BUILTIN(__builtin_s390_verllvh, "V8UsV8UsV8Us", "nc", "vector") TARGET_BUILTIN(__builtin_s390_verllvf, "V4UiV4UiV4Ui", "nc", "vector") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 65d9862621061..a0f4172002613 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18337,6 +18337,32 @@ Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {X, Undef}); } + case SystemZ::BI__builtin_s390_verllb: + case SystemZ::BI__builtin_s390_verllh: + case SystemZ::BI__builtin_s390_verllf: + case SystemZ::BI__builtin_s390_verllg: { + llvm::Type *ResultType = ConvertType(E->getType()); + llvm::Value *Src = EmitScalarExpr(E->getArg(0)); + llvm::Value *Amt = EmitScalarExpr(E->getArg(1)); + // Splat scalar rotate amount to vector type. 
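+ // llvm.fshl takes a per-element amount vector, so the scalar amount is
+ // zero-extended to the element type and broadcast; fshl with Src as both
+ // value operands is a rotate left.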
+ unsigned NumElts = cast<llvm::FixedVectorType>(ResultType)->getNumElements();
+ Amt = Builder.CreateIntCast(Amt, ResultType->getScalarType(), false);
+ Amt = Builder.CreateVectorSplat(NumElts, Amt);
+ Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
+ return Builder.CreateCall(F, { Src, Src, Amt });
+ }
+
+ case SystemZ::BI__builtin_s390_verllvb:
+ case SystemZ::BI__builtin_s390_verllvh:
+ case SystemZ::BI__builtin_s390_verllvf:
+ case SystemZ::BI__builtin_s390_verllvg: {
+ llvm::Type *ResultType = ConvertType(E->getType());
+ llvm::Value *Src = EmitScalarExpr(E->getArg(0));
+ llvm::Value *Amt = EmitScalarExpr(E->getArg(1));
+ // The rotate amounts are already per-element here; fshl with Src as
+ // both value operands is an element-wise rotate left.
+ Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
+ return Builder.CreateCall(F, { Src, Src, Amt });
+ }
+
 case SystemZ::BI__builtin_s390_vfsqsb:
 case SystemZ::BI__builtin_s390_vfsqdb: {
   llvm::Type *ResultType = ConvertType(E->getType());
diff --git a/clang/lib/Headers/vecintrin.h b/clang/lib/Headers/vecintrin.h
index ec1dbfd015f6e..0c535225c78e5 100644
--- a/clang/lib/Headers/vecintrin.h
+++ b/clang/lib/Headers/vecintrin.h
@@ -6565,45 +6565,45 @@ vec_rl(__vector unsigned long long __a, __vector unsigned long long __b) {
 static inline __ATTRS_o_ai __vector signed char
 vec_rli(__vector signed char __a, unsigned long __b) {
   return (__vector signed char)__builtin_s390_verllb(
-    (__vector unsigned char)__a, (int)__b);
+    (__vector unsigned char)__a, (unsigned char)__b);
 }

 static inline __ATTRS_o_ai __vector unsigned char
 vec_rli(__vector unsigned char __a, unsigned long __b) {
-  return __builtin_s390_verllb(__a, (int)__b);
+  return __builtin_s390_verllb(__a, (unsigned char)__b);
 }

 static inline __ATTRS_o_ai __vector signed short
 vec_rli(__vector signed short __a, unsigned long __b) {
   return (__vector signed short)__builtin_s390_verllh(
-    (__vector unsigned short)__a, (int)__b);
+    (__vector unsigned short)__a, (unsigned char)__b);
 }

 static inline __ATTRS_o_ai __vector unsigned short
 vec_rli(__vector unsigned short __a, unsigned long __b) {
-  return __builtin_s390_verllh(__a, (int)__b);
+  return __builtin_s390_verllh(__a, (unsigned char)__b);
 }

 static inline __ATTRS_o_ai __vector signed int
 vec_rli(__vector signed int __a, unsigned long __b) {
   return (__vector signed int)__builtin_s390_verllf(
-    (__vector unsigned int)__a, (int)__b);
+    (__vector unsigned int)__a, (unsigned char)__b);
 }

 static inline __ATTRS_o_ai __vector unsigned int
 vec_rli(__vector unsigned int __a, unsigned long __b) {
-  return __builtin_s390_verllf(__a, (int)__b);
+  return __builtin_s390_verllf(__a, (unsigned char)__b);
 }

 static inline __ATTRS_o_ai __vector signed long long
 vec_rli(__vector signed long long __a, unsigned long __b) {
   return (__vector signed long long)__builtin_s390_verllg(
-    (__vector unsigned long long)__a, (int)__b);
+    (__vector unsigned long long)__a, (unsigned char)__b);
 }

 static inline __ATTRS_o_ai __vector unsigned long long
 vec_rli(__vector unsigned long long __a, unsigned long __b) {
-  return __builtin_s390_verllg(__a, (int)__b);
+  return __builtin_s390_verllg(__a, (unsigned char)__b);
 }

 /*-- vec_rl_mask ------------------------------------------------------------*/
diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
index f01813ee76034..d17daaf35ca4b 100644
--- a/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
+++ b/clang/test/CodeGen/SystemZ/builtins-systemz-vector.c
@@ -23,6 +23,7 @@ volatile vec_ulong vul;
 volatile vec_double vd;

 volatile unsigned int len;
+volatile unsigned char amt;
 const void *
volatile cptr; void * volatile ptr; int cc; @@ -184,23 +185,23 @@ void test_integer(void) { vul = __builtin_s390_verimg(vul, vul, vul, 255); // CHECK: call <2 x i64> @llvm.s390.verimg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i32 255) - vuc = __builtin_s390_verllb(vuc, len); - // CHECK: call <16 x i8> @llvm.s390.verllb(<16 x i8> %{{.*}}, i32 %{{.*}}) - vus = __builtin_s390_verllh(vus, len); - // CHECK: call <8 x i16> @llvm.s390.verllh(<8 x i16> %{{.*}}, i32 %{{.*}}) - vui = __builtin_s390_verllf(vui, len); - // CHECK: call <4 x i32> @llvm.s390.verllf(<4 x i32> %{{.*}}, i32 %{{.*}}) - vul = __builtin_s390_verllg(vul, len); - // CHECK: call <2 x i64> @llvm.s390.verllg(<2 x i64> %{{.*}}, i32 %{{.*}}) + vuc = __builtin_s390_verllb(vuc, amt); + // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + vus = __builtin_s390_verllh(vus, amt); + // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + vui = __builtin_s390_verllf(vui, amt); + // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + vul = __builtin_s390_verllg(vul, amt); + // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) vuc = __builtin_s390_verllvb(vuc, vuc); - // CHECK: call <16 x i8> @llvm.s390.verllvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) vus = __builtin_s390_verllvh(vus, vus); - // CHECK: call <8 x i16> @llvm.s390.verllvh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) vui = __builtin_s390_verllvf(vui, vui); - // CHECK: call <4 x i32> @llvm.s390.verllvf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) vul = __builtin_s390_verllvg(vul, vul); - // CHECK: call <2 x i64> @llvm.s390.verllvg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) vus = __builtin_s390_vgfmb(vuc, vuc); // CHECK: call <8 x i16> @llvm.s390.vgfmb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c index 44f8cbe2cc017..0dc2fa7c66dd2 100644 --- a/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c +++ b/clang/test/CodeGen/SystemZ/builtins-systemz-zvector.c @@ -2564,53 +2564,53 @@ void test_integer(void) { // (emulated) vsc = vec_rl(vsc, vuc); - // CHECK: call <16 x i8> @llvm.s390.verllvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: verllvb vuc = vec_rl(vuc, vuc); - // CHECK: call <16 x i8> @llvm.s390.verllvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK-ASM: verllvb vss = vec_rl(vss, vus); - // CHECK: call <8 x i16> @llvm.s390.verllvh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK-ASM: verllvh vus = vec_rl(vus, vus); - // CHECK: call <8 x i16> @llvm.s390.verllvh(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> 
%{{.*}})
   // CHECK-ASM: verllvh
   vsi = vec_rl(vsi, vui);
-  // CHECK: call <4 x i32> @llvm.s390.verllvf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK-ASM: verllvf
   vui = vec_rl(vui, vui);
-  // CHECK: call <4 x i32> @llvm.s390.verllvf(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK-ASM: verllvf
   vsl = vec_rl(vsl, vul);
-  // CHECK: call <2 x i64> @llvm.s390.verllvg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK-ASM: verllvg
   vul = vec_rl(vul, vul);
-  // CHECK: call <2 x i64> @llvm.s390.verllvg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK-ASM: verllvg

   vsc = vec_rli(vsc, ul);
-  // CHECK: call <16 x i8> @llvm.s390.verllb(<16 x i8> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK-ASM: verllb
   vuc = vec_rli(vuc, ul);
-  // CHECK: call <16 x i8> @llvm.s390.verllb(<16 x i8> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   // CHECK-ASM: verllb
   vss = vec_rli(vss, ul);
-  // CHECK: call <8 x i16> @llvm.s390.verllh(<8 x i16> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK-ASM: verllh
   vus = vec_rli(vus, ul);
-  // CHECK: call <8 x i16> @llvm.s390.verllh(<8 x i16> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK-ASM: verllh
   vsi = vec_rli(vsi, ul);
-  // CHECK: call <4 x i32> @llvm.s390.verllf(<4 x i32> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK-ASM: verllf
   vui = vec_rli(vui, ul);
-  // CHECK: call <4 x i32> @llvm.s390.verllf(<4 x i32> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   // CHECK-ASM: verllf
   vsl = vec_rli(vsl, ul);
-  // CHECK: call <2 x i64> @llvm.s390.verllg(<2 x i64> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK-ASM: verllg
   vul = vec_rli(vul, ul);
-  // CHECK: call <2 x i64> @llvm.s390.verllg(<2 x i64> %{{.*}}, i32 %{{.*}})
+  // CHECK: call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   // CHECK-ASM: verllg

   vsc = vec_rl_mask(vsc, vuc, 0);
diff --git a/llvm/include/llvm/IR/IntrinsicsSystemZ.td b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
index 9d21f3eb5352e..9f79bdfa9d2d2 100644
--- a/llvm/include/llvm/IR/IntrinsicsSystemZ.td
+++ b/llvm/include/llvm/IR/IntrinsicsSystemZ.td
@@ -30,10 +30,6 @@ class SystemZBinaryConv<string name, LLVMType result, LLVMType arg>
 class SystemZBinary<string name, LLVMType type>
   : SystemZBinaryConv<name, type, type>;

-class SystemZBinaryInt<string name, LLVMType type>
-  : ClangBuiltin<"__builtin_s390_" # name>,
-    Intrinsic<[type], [type, llvm_i32_ty], [IntrNoMem]>;
-
 class SystemZBinaryConvCC<LLVMType result, LLVMType arg>
   : Intrinsic<[result, llvm_i32_ty], [arg, arg], [IntrNoMem]>;

@@ -131,13 +127,6 @@ multiclass SystemZBinaryBHFG<string name> : SystemZBinaryBHF<name> {
   def g : SystemZBinary<name#"g", llvm_v2i64_ty>;
 }

-multiclass SystemZBinaryIntBHFG<string name> {
-  def b : SystemZBinaryInt<name#"b", llvm_v16i8_ty>;
-  def h : SystemZBinaryInt<name#"h", llvm_v8i16_ty>;
-  def f : SystemZBinaryInt<name#"f", llvm_v4i32_ty>;
-  def g : SystemZBinaryInt<name#"g", llvm_v2i64_ty>;
-}
-
 multiclass SystemZBinaryCCBHF {
   def bs : SystemZBinaryCC<llvm_v16i8_ty>;
   def hs : SystemZBinaryCC<llvm_v8i16_ty>;
@@ -303,8 +292,6 @@ let TargetPrefix = "s390" in {
   defm int_s390_vmo : SystemZBinaryExtBHF<"vmo">;
   defm int_s390_vmlo : SystemZBinaryExtBHF<"vmlo">;

-  defm int_s390_verllv : SystemZBinaryBHFG<"verllv">;
-  defm int_s390_verll : SystemZBinaryIntBHFG<"verll">;
   defm int_s390_verim : SystemZQuaternaryIntBHFG<"verim">;

   def int_s390_vsl : SystemZBinary<"vsl", llvm_v16i8_ty>;
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 4e57986206dc6..d0eb0255f7d92 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -385,16 +385,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);

-      // Detect shifts by a scalar amount and convert them into
+      // Detect shifts/rotates by a scalar amount and convert them into
       // V*_BY_SCALAR.
       setOperationAction(ISD::SHL, VT, Custom);
       setOperationAction(ISD::SRA, VT, Custom);
       setOperationAction(ISD::SRL, VT, Custom);
-
-      // At present ROTL isn't matched by DAGCombiner.  ROTR should be
-      // converted into ROTL.
-      setOperationAction(ISD::ROTL, VT, Expand);
-      setOperationAction(ISD::ROTR, VT, Expand);
+      setOperationAction(ISD::ROTL, VT, Custom);

       // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
       // and inverting the result as necessary.
@@ -5979,6 +5975,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
   case ISD::SRA:
     return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);
+  case ISD::ROTL:
+    return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
   case ISD::IS_FPCLASS:
     return lowerIS_FPCLASS(Op, DAG);
   case ISD::GET_ROUNDING:
@@ -6143,6 +6141,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(VSHL_BY_SCALAR);
     OPCODE(VSRL_BY_SCALAR);
     OPCODE(VSRA_BY_SCALAR);
+    OPCODE(VROTL_BY_SCALAR);
     OPCODE(VSUM);
     OPCODE(VICMPE);
     OPCODE(VICMPH);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index fd951b935702a..40fe433f816fa 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -215,11 +215,12 @@ enum NodeType : unsigned {
   UNPACK_LOW,
   UNPACKL_LOW,

-  // Shift each element of vector operand 0 by the number of bits specified
-  // by scalar operand 1.
+  // Shift/rotate each element of vector operand 0 by the number of bits
+  // specified by scalar operand 1.
   VSHL_BY_SCALAR,
   VSRL_BY_SCALAR,
   VSRA_BY_SCALAR,
+  VROTL_BY_SCALAR,

   // For each element of the output type, sum across all sub-elements of
   // operand 0 belonging to the corresponding element, and add in the
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 82863d7838a95..37d6945dc7a05 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -732,21 +732,17 @@ let Predicates = [FeatureVector] in {
   // Element rotate left logical (with vector shift amount).
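+  // The rotl operator below is the generic ISD::ROTL node; DAGCombiner
+  // forms it from shl/srl/or rotate idioms and from fshl with equal
+  // value operands.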
def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>; - def VERLLVB : BinaryVRRc<"verllvb", 0xE773, int_s390_verllvb, - v128b, v128b, 0>; - def VERLLVH : BinaryVRRc<"verllvh", 0xE773, int_s390_verllvh, - v128h, v128h, 1>; - def VERLLVF : BinaryVRRc<"verllvf", 0xE773, int_s390_verllvf, - v128f, v128f, 2>; - def VERLLVG : BinaryVRRc<"verllvg", 0xE773, int_s390_verllvg, - v128g, v128g, 3>; + def VERLLVB : BinaryVRRc<"verllvb", 0xE773, rotl, v128b, v128b, 0>; + def VERLLVH : BinaryVRRc<"verllvh", 0xE773, rotl, v128h, v128h, 1>; + def VERLLVF : BinaryVRRc<"verllvf", 0xE773, rotl, v128f, v128f, 2>; + def VERLLVG : BinaryVRRc<"verllvg", 0xE773, rotl, v128g, v128g, 3>; // Element rotate left logical (with scalar shift amount). def VERLL : BinaryVRSaGeneric<"verll", 0xE733>; - def VERLLB : BinaryVRSa<"verllb", 0xE733, int_s390_verllb, v128b, v128b, 0>; - def VERLLH : BinaryVRSa<"verllh", 0xE733, int_s390_verllh, v128h, v128h, 1>; - def VERLLF : BinaryVRSa<"verllf", 0xE733, int_s390_verllf, v128f, v128f, 2>; - def VERLLG : BinaryVRSa<"verllg", 0xE733, int_s390_verllg, v128g, v128g, 3>; + def VERLLB : BinaryVRSa<"verllb", 0xE733, z_vrotl_by_scalar, v128b, v128b, 0>; + def VERLLH : BinaryVRSa<"verllh", 0xE733, z_vrotl_by_scalar, v128h, v128h, 1>; + def VERLLF : BinaryVRSa<"verllf", 0xE733, z_vrotl_by_scalar, v128f, v128f, 2>; + def VERLLG : BinaryVRSa<"verllg", 0xE733, z_vrotl_by_scalar, v128g, v128g, 3>; // Element rotate and insert under mask. def VERIM : QuaternaryVRIdGeneric<"verim", 0xE772>; diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 6713cac2a7807..4f0f23fe3ef8e 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -324,6 +324,8 @@ def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", SDT_ZVecBinaryInt>; +def z_vrotl_by_scalar : SDNode<"SystemZISD::VROTL_BY_SCALAR", + SDT_ZVecBinaryInt>; def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZVecBinaryConv>; def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecBinary>; def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecBinary>; diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll index 5338ccc9b4292..e69dc9d009a54 100644 --- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll @@ -94,14 +94,6 @@ declare <2 x i64> @llvm.s390.vmof(<4 x i32>, <4 x i32>) declare <8 x i16> @llvm.s390.vmlob(<16 x i8>, <16 x i8>) declare <4 x i32> @llvm.s390.vmloh(<8 x i16>, <8 x i16>) declare <2 x i64> @llvm.s390.vmlof(<4 x i32>, <4 x i32>) -declare <16 x i8> @llvm.s390.verllvb(<16 x i8>, <16 x i8>) -declare <8 x i16> @llvm.s390.verllvh(<8 x i16>, <8 x i16>) -declare <4 x i32> @llvm.s390.verllvf(<4 x i32>, <4 x i32>) -declare <2 x i64> @llvm.s390.verllvg(<2 x i64>, <2 x i64>) -declare <16 x i8> @llvm.s390.verllb(<16 x i8>, i32) -declare <8 x i16> @llvm.s390.verllh(<8 x i16>, i32) -declare <4 x i32> @llvm.s390.verllf(<4 x i32>, i32) -declare <2 x i64> @llvm.s390.verllg(<2 x i64>, i32) declare <16 x i8> @llvm.s390.verimb(<16 x i8>, <16 x i8>, <16 x i8>, i32) declare <8 x i16> @llvm.s390.verimh(<8 x i16>, <8 x i16>, <8 x i16>, i32) declare <4 x i32> @llvm.s390.verimf(<4 x i32>, <4 x i32>, <4 x i32>, i32) @@ -1487,117 +1479,6 @@ define <2 x i64> @test_vmlof(<4 x i32> %a, <4 x i32> %b) { ret <2 x i64> %res } -; VERLLVB. 
-define <16 x i8> @test_verllvb(<16 x i8> %a, <16 x i8> %b) { -; CHECK-LABEL: test_verllvb: -; CHECK: # %bb.0: -; CHECK-NEXT: verllvb %v24, %v24, %v26 -; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.verllvb(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %res -} - -; VERLLVH. -define <8 x i16> @test_verllvh(<8 x i16> %a, <8 x i16> %b) { -; CHECK-LABEL: test_verllvh: -; CHECK: # %bb.0: -; CHECK-NEXT: verllvh %v24, %v24, %v26 -; CHECK-NEXT: br %r14 - %res = call <8 x i16> @llvm.s390.verllvh(<8 x i16> %a, <8 x i16> %b) - ret <8 x i16> %res -} - -; VERLLVF. -define <4 x i32> @test_verllvf(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: test_verllvf: -; CHECK: # %bb.0: -; CHECK-NEXT: verllvf %v24, %v24, %v26 -; CHECK-NEXT: br %r14 - %res = call <4 x i32> @llvm.s390.verllvf(<4 x i32> %a, <4 x i32> %b) - ret <4 x i32> %res -} - -; VERLLVG. -define <2 x i64> @test_verllvg(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: test_verllvg: -; CHECK: # %bb.0: -; CHECK-NEXT: verllvg %v24, %v24, %v26 -; CHECK-NEXT: br %r14 - %res = call <2 x i64> @llvm.s390.verllvg(<2 x i64> %a, <2 x i64> %b) - ret <2 x i64> %res -} - -; VERLLB. -define <16 x i8> @test_verllb(<16 x i8> %a, i32 %b) { -; CHECK-LABEL: test_verllb: -; CHECK: # %bb.0: -; CHECK-NEXT: verllb %v24, %v24, 0(%r2) -; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 %b) - ret <16 x i8> %res -} - -; VERLLH. -define <8 x i16> @test_verllh(<8 x i16> %a, i32 %b) { -; CHECK-LABEL: test_verllh: -; CHECK: # %bb.0: -; CHECK-NEXT: verllh %v24, %v24, 0(%r2) -; CHECK-NEXT: br %r14 - %res = call <8 x i16> @llvm.s390.verllh(<8 x i16> %a, i32 %b) - ret <8 x i16> %res -} - -; VERLLF. -define <4 x i32> @test_verllf(<4 x i32> %a, i32 %b) { -; CHECK-LABEL: test_verllf: -; CHECK: # %bb.0: -; CHECK-NEXT: verllf %v24, %v24, 0(%r2) -; CHECK-NEXT: br %r14 - %res = call <4 x i32> @llvm.s390.verllf(<4 x i32> %a, i32 %b) - ret <4 x i32> %res -} - -; VERLLG. -define <2 x i64> @test_verllg(<2 x i64> %a, i32 %b) { -; CHECK-LABEL: test_verllg: -; CHECK: # %bb.0: -; CHECK-NEXT: verllg %v24, %v24, 0(%r2) -; CHECK-NEXT: br %r14 - %res = call <2 x i64> @llvm.s390.verllg(<2 x i64> %a, i32 %b) - ret <2 x i64> %res -} - -; VERLLB with the smallest count. -define <16 x i8> @test_verllb_1(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_1: -; CHECK: # %bb.0: -; CHECK-NEXT: verllb %v24, %v24, 1 -; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 1) - ret <16 x i8> %res -} - -; VERLLB with the largest count. -define <16 x i8> @test_verllb_4095(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_4095: -; CHECK: # %bb.0: -; CHECK-NEXT: verllb %v24, %v24, 4095 -; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4095) - ret <16 x i8> %res -} - -; VERLLB with the largest count + 1. -define <16 x i8> @test_verllb_4096(<16 x i8> %a) { -; CHECK-LABEL: test_verllb_4096: -; CHECK: # %bb.0: -; CHECK-NEXT: lhi %r1, 4096 -; CHECK-NEXT: verllb %v24, %v24, 0(%r1) -; CHECK-NEXT: br %r14 - %res = call <16 x i8> @llvm.s390.verllb(<16 x i8> %a, i32 4096) - ret <16 x i8> %res -} - ; VERIMB. 
define <16 x i8> @test_verimb(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK-LABEL: test_verimb: @@ -1888,7 +1769,7 @@ define void @test_vtm_all_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) { ; CHECK: # %bb.0: ; CHECK-NEXT: vtm %v24, %v26 ; CHECK-NEXT: bler %r14 -; CHECK-NEXT: .LBB151_1: # %store +; CHECK-NEXT: .LBB140_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 %res = call i32 @llvm.s390.vtm(<16 x i8> %a, <16 x i8> %b) @@ -1937,7 +1818,7 @@ define <16 x i8> @test_vceqbs_any_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) { ; CHECK: # %bb.0: ; CHECK-NEXT: vceqbs %v24, %v24, %v26 ; CHECK-NEXT: bor %r14 -; CHECK-NEXT: .LBB154_1: # %store +; CHECK-NEXT: .LBB143_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 %call = call {<16 x i8>, i32} @llvm.s390.vceqbs(<16 x i8> %a, <16 x i8> %b) @@ -1988,7 +1869,7 @@ define <8 x i16> @test_vceqhs_notall_store(<8 x i16> %a, <8 x i16> %b, ; CHECK: # %bb.0: ; CHECK-NEXT: vceqhs %v24, %v24, %v26 ; CHECK-NEXT: ber %r14 -; CHECK-NEXT: .LBB157_1: # %store +; CHECK-NEXT: .LBB146_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 ptr %ptr) { @@ -2040,7 +1921,7 @@ define <4 x i32> @test_vceqfs_none_store(<4 x i32> %a, <4 x i32> %b, ; CHECK: # %bb.0: ; CHECK-NEXT: vceqfs %v24, %v24, %v26 ; CHECK-NEXT: bler %r14 -; CHECK-NEXT: .LBB160_1: # %store +; CHECK-NEXT: .LBB149_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 ptr %ptr) { @@ -2092,7 +1973,7 @@ define <2 x i64> @test_vceqgs_all_store(<2 x i64> %a, <2 x i64> %b, ptr %ptr) { ; CHECK: # %bb.0: ; CHECK-NEXT: vceqgs %v24, %v24, %v26 ; CHECK-NEXT: bnher %r14 -; CHECK-NEXT: .LBB163_1: # %store +; CHECK-NEXT: .LBB152_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 %call = call {<2 x i64>, i32} @llvm.s390.vceqgs(<2 x i64> %a, <2 x i64> %b) @@ -2143,7 +2024,7 @@ define <16 x i8> @test_vchbs_any_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) { ; CHECK: # %bb.0: ; CHECK-NEXT: vchbs %v24, %v24, %v26 ; CHECK-NEXT: bor %r14 -; CHECK-NEXT: .LBB166_1: # %store +; CHECK-NEXT: .LBB155_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 %call = call {<16 x i8>, i32} @llvm.s390.vchbs(<16 x i8> %a, <16 x i8> %b) @@ -2194,7 +2075,7 @@ define <8 x i16> @test_vchhs_notall_store(<8 x i16> %a, <8 x i16> %b, ; CHECK: # %bb.0: ; CHECK-NEXT: vchhs %v24, %v24, %v26 ; CHECK-NEXT: ber %r14 -; CHECK-NEXT: .LBB169_1: # %store +; CHECK-NEXT: .LBB158_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 ptr %ptr) { @@ -2246,7 +2127,7 @@ define <4 x i32> @test_vchfs_none_store(<4 x i32> %a, <4 x i32> %b, ptr %ptr) { ; CHECK: # %bb.0: ; CHECK-NEXT: vchfs %v24, %v24, %v26 ; CHECK-NEXT: bler %r14 -; CHECK-NEXT: .LBB172_1: # %store +; CHECK-NEXT: .LBB161_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 %call = call {<4 x i32>, i32} @llvm.s390.vchfs(<4 x i32> %a, <4 x i32> %b) @@ -2297,7 +2178,7 @@ define <2 x i64> @test_vchgs_all_store(<2 x i64> %a, <2 x i64> %b, ptr %ptr) { ; CHECK: # %bb.0: ; CHECK-NEXT: vchgs %v24, %v24, %v26 ; CHECK-NEXT: bnher %r14 -; CHECK-NEXT: .LBB175_1: # %store +; CHECK-NEXT: .LBB164_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 %call = call {<2 x i64>, i32} @llvm.s390.vchgs(<2 x i64> %a, <2 x i64> %b) @@ -2348,7 +2229,7 @@ define <16 x i8> @test_vchlbs_any_store(<16 x i8> %a, <16 x i8> %b, ptr %ptr) { ; CHECK: # %bb.0: ; CHECK-NEXT: vchlbs %v24, %v24, %v26 ; CHECK-NEXT: bor %r14 -; CHECK-NEXT: .LBB178_1: # %store +; CHECK-NEXT: .LBB167_1: # %store ; CHECK-NEXT: mvhi 0(%r2), 0 ; CHECK-NEXT: br %r14 %call 
= call {<16 x i8>, i32} @llvm.s390.vchlbs(<16 x i8> %a, <16 x i8> %b)
@@ -2399,7 +2280,7 @@ define <8 x i16> @test_vchlhs_notall_store(<8 x i16> %a, <8 x i16> %b,
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vchlhs %v24, %v24, %v26
 ; CHECK-NEXT: ber %r14
-; CHECK-NEXT: .LBB181_1: # %store
+; CHECK-NEXT: .LBB170_1: # %store
 ; CHECK-NEXT: mvhi 0(%r2), 0
 ; CHECK-NEXT: br %r14
 ptr %ptr) {
@@ -2451,7 +2332,7 @@ define <4 x i32> @test_vchlfs_none_store(<4 x i32> %a, <4 x i32> %b,
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vchlfs %v24, %v24, %v26
 ; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB184_1: # %store
+; CHECK-NEXT: .LBB173_1: # %store
 ; CHECK-NEXT: mvhi 0(%r2), 0
 ; CHECK-NEXT: br %r14
 ptr %ptr) {
@@ -2503,7 +2384,7 @@ define <2 x i64> @test_vchlgs_all_store(<2 x i64> %a, <2 x i64> %b, ptr %ptr) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vchlgs %v24, %v24, %v26
 ; CHECK-NEXT: bnher %r14
-; CHECK-NEXT: .LBB187_1: # %store
+; CHECK-NEXT: .LBB176_1: # %store
 ; CHECK-NEXT: mvhi 0(%r2), 0
 ; CHECK-NEXT: br %r14
 %call = call {<2 x i64>, i32} @llvm.s390.vchlgs(<2 x i64> %a, <2 x i64> %b)
@@ -3450,7 +3331,7 @@ define <2 x i64> @test_vfcedbs_any_store(<2 x double> %a, <2 x double> %b,
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vfcedbs %v24, %v24, %v26
 ; CHECK-NEXT: bor %r14
-; CHECK-NEXT: .LBB260_1: # %store
+; CHECK-NEXT: .LBB249_1: # %store
 ; CHECK-NEXT: mvhi 0(%r2), 0
 ; CHECK-NEXT: br %r14
 ptr %ptr) {
@@ -3505,7 +3386,7 @@ define <2 x i64> @test_vfchdbs_notall_store(<2 x double> %a, <2 x double> %b,
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vfchdbs %v24, %v24, %v26
 ; CHECK-NEXT: ber %r14
-; CHECK-NEXT: .LBB263_1: # %store
+; CHECK-NEXT: .LBB252_1: # %store
 ; CHECK-NEXT: mvhi 0(%r2), 0
 ; CHECK-NEXT: br %r14
 ptr %ptr) {
@@ -3560,7 +3441,7 @@ define <2 x i64> @test_vfchedbs_none_store(<2 x double> %a, <2 x double> %b,
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vfchedbs %v24, %v24, %v26
 ; CHECK-NEXT: bler %r14
-; CHECK-NEXT: .LBB266_1: # %store
+; CHECK-NEXT: .LBB255_1: # %store
 ; CHECK-NEXT: mvhi 0(%r2), 0
 ; CHECK-NEXT: br %r14
 ptr %ptr) {
diff --git a/llvm/test/CodeGen/SystemZ/vec-rot-01.ll b/llvm/test/CodeGen/SystemZ/vec-rot-01.ll
new file mode 100644
index 0000000000000..fae20350f3caf
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-rot-01.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test vector rotate left instructions with vector rotate amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+; Test a v16i8 rotate left.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val, <16 x i8> %amt) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvb %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %inv = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8,
+                        i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %amt
+  %parta = shl <16 x i8> %val, %amt
+  %partb = lshr <16 x i8> %val, %inv
+
+  %rotl = or <16 x i8> %parta, %partb
+
+  ret <16 x i8> %rotl
+}
+
+; Test a v16i8 rotate left (matched from fshl).
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val, <16 x i8> %amt) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvb %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %rotl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %val, <16 x i8> %val, <16 x i8> %amt)
+
+  ret <16 x i8> %rotl
+}
+
+; Test a v8i16 rotate left.
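+; (The shl/lshr/or sequence with the inverse amount "16 - %amt" below is
+; the idiom DAGCombiner recognizes as rotl; 16 is the element bit width.)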
+define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val, <8 x i16> %amt) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvh %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %inv = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16,
+                        i16 16, i16 16, i16 16, i16 16>, %amt
+  %parta = shl <8 x i16> %val, %amt
+  %partb = lshr <8 x i16> %val, %inv
+
+  %rotl = or <8 x i16> %parta, %partb
+
+  ret <8 x i16> %rotl
+}
+
+; Test a v8i16 rotate left (matched from fshl).
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val, <8 x i16> %amt) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvh %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %rotl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %val, <8 x i16> %val, <8 x i16> %amt)
+
+  ret <8 x i16> %rotl
+}
+
+; Test a v4i32 rotate left.
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val, <4 x i32> %amt) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvf %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %inv = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %amt
+  %parta = shl <4 x i32> %val, %amt
+  %partb = lshr <4 x i32> %val, %inv
+
+  %rotl = or <4 x i32> %parta, %partb
+
+  ret <4 x i32> %rotl
+}
+
+; Test a v4i32 rotate left (matched from fshl).
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val, <4 x i32> %amt) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvf %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %rotl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %val, <4 x i32> %val, <4 x i32> %amt)
+
+  ret <4 x i32> %rotl
+}
+
+; Test a v2i64 rotate left.
+define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val, <2 x i64> %amt) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvg %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %inv = sub <2 x i64> <i64 64, i64 64>, %amt
+  %parta = shl <2 x i64> %val, %amt
+  %partb = lshr <2 x i64> %val, %inv
+
+  %rotl = or <2 x i64> %parta, %partb
+
+  ret <2 x i64> %rotl
+}
+
+; Test a v2i64 rotate left (matched from fshl).
+define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val, <2 x i64> %amt) {
+; CHECK-LABEL: f8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllvg %v24, %v26, %v28
+; CHECK-NEXT: br %r14
+
+  %rotl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %val, <2 x i64> %val, <2 x i64> %amt)
+
+  ret <2 x i64> %rotl
+}
diff --git a/llvm/test/CodeGen/SystemZ/vec-rot-02.ll b/llvm/test/CodeGen/SystemZ/vec-rot-02.ll
new file mode 100644
index 0000000000000..1331c6290af17
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/vec-rot-02.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test vector rotate left instructions with scalar rotate amount.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+; Test a v16i8 rotate left.
+define <16 x i8> @f1(<16 x i8> %dummy, <16 x i8> %val, i32 %scalar) {
+; CHECK-LABEL: f1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllb %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+  %scalar_tmp = trunc i32 %scalar to i8
+  %tmp = insertelement <16 x i8> undef, i8 %scalar_tmp, i32 0
+  %amt = shufflevector <16 x i8> %tmp, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+
+  %inv = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8,
+                        i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %amt
+  %parta = shl <16 x i8> %val, %amt
+  %partb = lshr <16 x i8> %val, %inv
+
+  %rotl = or <16 x i8> %parta, %partb
+
+  ret <16 x i8> %rotl
+}
+
+; Test a v16i8 rotate left (matched from fshl).
+define <16 x i8> @f2(<16 x i8> %dummy, <16 x i8> %val, i32 %scalar) {
+; CHECK-LABEL: f2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllb %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+  %scalar_tmp = trunc i32 %scalar to i8
+  %tmp = insertelement <16 x i8> undef, i8 %scalar_tmp, i32 0
+  %amt = shufflevector <16 x i8> %tmp, <16 x i8> undef,
+                       <16 x i32> zeroinitializer
+
+  %rotl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %val, <16 x i8> %val, <16 x i8> %amt)
+
+  ret <16 x i8> %rotl
+}
+
+; Test a v8i16 rotate left.
+define <8 x i16> @f3(<8 x i16> %dummy, <8 x i16> %val, i32 %scalar) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllh %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+  %scalar_tmp = trunc i32 %scalar to i16
+  %tmp = insertelement <8 x i16> undef, i16 %scalar_tmp, i32 0
+  %amt = shufflevector <8 x i16> %tmp, <8 x i16> undef,
+                       <8 x i32> zeroinitializer
+
+  %inv = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16,
+                        i16 16, i16 16, i16 16, i16 16>, %amt
+  %parta = shl <8 x i16> %val, %amt
+  %partb = lshr <8 x i16> %val, %inv
+
+  %rotl = or <8 x i16> %parta, %partb
+
+  ret <8 x i16> %rotl
+}
+
+; Test a v8i16 rotate left (matched from fshl).
+define <8 x i16> @f4(<8 x i16> %dummy, <8 x i16> %val, i32 %scalar) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllh %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+  %scalar_tmp = trunc i32 %scalar to i16
+  %tmp = insertelement <8 x i16> undef, i16 %scalar_tmp, i32 0
+  %amt = shufflevector <8 x i16> %tmp, <8 x i16> undef,
+                       <8 x i32> zeroinitializer
+
+  %rotl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %val, <8 x i16> %val, <8 x i16> %amt)
+
+  ret <8 x i16> %rotl
+}
+
+; Test a v4i32 rotate left.
+define <4 x i32> @f5(<4 x i32> %dummy, <4 x i32> %val, i32 %scalar) {
+; CHECK-LABEL: f5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllf %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+  %tmp = insertelement <4 x i32> undef, i32 %scalar, i32 0
+  %amt = shufflevector <4 x i32> %tmp, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+
+  %inv = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %amt
+  %parta = shl <4 x i32> %val, %amt
+  %partb = lshr <4 x i32> %val, %inv
+
+  %rotl = or <4 x i32> %parta, %partb
+
+  ret <4 x i32> %rotl
+}
+
+; Test a v4i32 rotate left (matched from fshl).
+define <4 x i32> @f6(<4 x i32> %dummy, <4 x i32> %val, i32 %scalar) {
+; CHECK-LABEL: f6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllf %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+  %tmp = insertelement <4 x i32> undef, i32 %scalar, i32 0
+  %amt = shufflevector <4 x i32> %tmp, <4 x i32> undef,
+                       <4 x i32> zeroinitializer
+
+  %rotl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %val, <4 x i32> %val, <4 x i32> %amt)
+
+  ret <4 x i32> %rotl
+}
+
+; Test a v2i64 rotate left.
+define <2 x i64> @f7(<2 x i64> %dummy, <2 x i64> %val, i32 %scalar) {
+; CHECK-LABEL: f7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: verllg %v24, %v26, 0(%r2)
+; CHECK-NEXT: br %r14
+
+  %scalar_tmp = zext i32 %scalar to i64
+  %tmp = insertelement <2 x i64> undef, i64 %scalar_tmp, i32 0
+  %amt = shufflevector <2 x i64> %tmp, <2 x i64> undef,
+                       <2 x i32> zeroinitializer
+
+  %inv = sub <2 x i64> <i64 64, i64 64>, %amt
+  %parta = shl <2 x i64> %val, %amt
+  %partb = lshr <2 x i64> %val, %inv
+
+  %rotl = or <2 x i64> %parta, %partb
+
+  ret <2 x i64> %rotl
+}
+
+; Test a v2i64 rotate left (matched from fshl).
+define <2 x i64> @f8(<2 x i64> %dummy, <2 x i64> %val, i32 %scalar) { +; CHECK-LABEL: f8: +; CHECK: # %bb.0: +; CHECK-NEXT: verllg %v24, %v26, 0(%r2) +; CHECK-NEXT: br %r14 + + %scalar_tmp = zext i32 %scalar to i64 + %tmp = insertelement <2 x i64> undef, i64 %scalar_tmp, i32 0 + %amt = shufflevector <2 x i64> %tmp, <2 x i64> undef, + <2 x i32> zeroinitializer + + %rotl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %val, <2 x i64> %val, <2 x i64> %amt) + + ret <2 x i64> %rotl +} + +; Test a v2i64 rotate left (matched from fshl). +define <2 x i64> @f9(<2 x i64> %dummy, <2 x i64> %val, i64 %scalar) { +; CHECK-LABEL: f9: +; CHECK: # %bb.0: +; CHECK-NEXT: verllg %v24, %v26, 0(%r2) +; CHECK-NEXT: br %r14 + + %tmp = insertelement <2 x i64> undef, i64 %scalar, i32 0 + %amt = shufflevector <2 x i64> %tmp, <2 x i64> undef, + <2 x i32> zeroinitializer + + %rotl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %val, <2 x i64> %val, <2 x i64> %amt) + + ret <2 x i64> %rotl +}
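Usage sketch (not part of the patch; the function names below are illustrative only): with this change both rotate flavors are emitted as the generic llvm.fshl intrinsic and select back to the same verllv*/verll* instructions, so existing zvector code like the following should behave as before. Assuming a SystemZ-enabled clang, it can be built with something like clang -target s390x-linux-gnu -march=z13 -mzvector -O2 -S.

#include <vecintrin.h>

// Per-element rotate: each byte of val rotates left by the matching
// byte of amt (expected to select verllvb).
__vector unsigned char rot_each(__vector unsigned char val,
                                __vector unsigned char amt) {
  return vec_rl(val, amt);
}

// Uniform rotate: every byte rotates left by the same scalar amount,
// which vec_rli now passes to the builtin as unsigned char (expected
// to select verllb).
__vector unsigned char rot_all(__vector unsigned char val,
                               unsigned long amt) {
  return vec_rli(val, amt);
}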