diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index f76e88eab8e4a..06f9c0445bcea 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1998,12 +1998,14 @@ def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
   [IntrNoMem, IntrSpeculatable]
 >;
 
-def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+// mul24 intrinsics can return i32 or i64.
+// When returning i64, they're lowered to a mul24/mulhi24 pair.
+def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
   [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
 
-def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
+def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
   [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
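With the result type turned into an `anyint` overload, the intrinsic names get a type suffix in IR. A minimal sketch of both forms (illustrative only, not taken from the patch; `%a` and `%b` are assumed to already hold values that fit in 24 bits):

```llvm
declare i32 @llvm.amdgcn.mul.u24.i32(i32, i32)
declare i64 @llvm.amdgcn.mul.u24.i64(i32, i32)

define i64 @mul24_overloads(i32 %a, i32 %b) {
  ; 32-bit overload: behaves like the old fixed-type intrinsic.
  %narrow = call i32 @llvm.amdgcn.mul.u24.i32(i32 %a, i32 %b)
  ; 64-bit overload: the full 48-bit product widened to i64; the backend
  ; lowers it to a mul24/mulhi24 pair, per the comment above.
  %wide = call i64 @llvm.amdgcn.mul.u24.i64(i32 %a, i32 %b)
  %narrow.64 = zext i32 %narrow to i64
  %sum = add i64 %wide, %narrow.64
  ret i64 %sum
}
```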
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4cce34bdeabcf..4caa9cd9225b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -624,34 +624,6 @@ static Value *insertValues(IRBuilder<> &Builder,
   return NewVal;
 }
 
-// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
-// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
-// width of the original destination.
-static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
-                       unsigned Size, unsigned NumBits, bool IsSigned) {
-  if (Size <= 32 || NumBits <= 32) {
-    Intrinsic::ID ID =
-        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
-    return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
-  }
-
-  assert(NumBits <= 48);
-
-  Intrinsic::ID LoID =
-      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
-  Intrinsic::ID HiID =
-      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
-
-  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
-  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
-
-  IntegerType *I64Ty = Builder.getInt64Ty();
-  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
-  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
-
-  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
-}
-
 bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
   if (I.getOpcode() != Instruction::Mul)
     return false;
@@ -691,26 +663,20 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
   extractValues(Builder, RHSVals, RHS);
 
   IntegerType *I32Ty = Builder.getInt32Ty();
-  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
-    Value *LHS, *RHS;
-    if (IsSigned) {
-      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
-      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
-    } else {
-      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
-      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
-    }
+  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
+  Type *DstTy = LHSVals[0]->getType();
 
-    Value *Result =
-        getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
-
-    if (IsSigned) {
-      ResultVals.push_back(
-          Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
-    } else {
-      ResultVals.push_back(
-          Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
-    }
+  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
+                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
+                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+    Intrinsic::ID ID =
+        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+    Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
+    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
+                      : Builder.CreateZExtOrTrunc(Result, DstTy);
+    ResultVals.push_back(Result);
   }
 
   Value *NewVal = insertValues(Builder, Ty, ResultVals);
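The effect of the rewrite is easiest to see on a 48-bit-wide multiply. A sketch of the pass's output, distilled from the updated smul24_i64 checks further below (value names are mine; the real checks use numbered temporaries):

```llvm
declare i64 @llvm.amdgcn.mul.i24.i64(i32, i32)

; %lhs24/%rhs24 are assumed to be known 24-bit signed values.
define i64 @smul24_i64_sketch(i64 %lhs24, i64 %rhs24) {
  %l = trunc i64 %lhs24 to i32
  %r = trunc i64 %rhs24 to i32
  ; Old: getMul24() emitted mul.i24 + mulhi.i24, then zext/shl/or to
  ; reassemble an i64. New: one overloaded call, split later by ISel.
  %mul = call i64 @llvm.amdgcn.mul.i24.i64(i32 %l, i32 %r)
  ret i64 %mul
}
```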
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 324285e580bba..fd38739876c4d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -281,11 +281,15 @@ def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
 def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
 
 // Signed and unsigned 24-bit multiply. The highest 8 bits are ignored
-// when performing the multiply. The result is a 32-bit value.
-def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+// when performing the multiply. The result is a 32- or 64-bit value.
+def AMDGPUMul24Op : SDTypeProfile<1, 2, [
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>
+]>;
+
+def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", AMDGPUMul24Op,
   [SDNPCommutative, SDNPAssociative]
 >;
-def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", AMDGPUMul24Op,
   [SDNPCommutative, SDNPAssociative]
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index b97d979b0336b..1917b24539d09 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -862,6 +862,17 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
 def : divergent_i64_BinOp <or, V_OR_B32_e64>;
 def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
 
+// mul24 w/ 64-bit output.
+class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
+  (i64 (Op i32:$src0, i32:$src1)),
+  (REG_SEQUENCE VReg_64,
+    (InstLo $src0, $src1), sub0,
+    (InstHi $src0, $src1), sub1)
+>;
+
+def : mul24_64_Pat<AMDGPUmul_i24, V_MUL_I32_I24_e32, V_MUL_HI_I32_I24_e32>;
+def : mul24_64_Pat<AMDGPUmul_u24, V_MUL_U32_U24_e32, V_MUL_HI_U32_U24_e32>;
+
 //===----------------------------------------------------------------------===//
 // 16-Bit Operand Instructions
 //===----------------------------------------------------------------------===//
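Selection of the 64-bit node is just a REG_SEQUENCE over the existing lo/hi multiplies, so no v_mov shuffling is needed. A hand-written sanity check in the spirit of the updated mul_uint24-amdgcn.ll test (not part of the patch; the expected registers are what test_umul48_i64 below checks for):

```llvm
declare i64 @llvm.amdgcn.mul.u24.i64(i32, i32)

; Expected machine code, with %lhs in v[0:1] and %rhs in v[2:3]:
;   v_mul_hi_u32_u24_e32 v1, v0, v2   ; high half -> sub1
;   v_mul_u32_u24_e32    v0, v0, v2   ; low half  -> sub0
define i64 @umul48_sketch(i64 %lhs, i64 %rhs) {
  %lhs24 = and i64 %lhs, 16777215
  %rhs24 = and i64 %rhs, 16777215
  %l = trunc i64 %lhs24 to i32
  %r = trunc i64 %rhs24 to i32
  %mul = call i64 @llvm.amdgcn.mul.u24.i64(i32 %l, i32 %r)
  ret i64 %mul
}
```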
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
index 62bc145f1387e..d938c16bf6134 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
@@ -7,7 +7,7 @@ define i16 @mul_i16(i16 %lhs, i16 %rhs) {
 ; SI-LABEL: @mul_i16(
 ; SI-NEXT:    [[TMP1:%.*]] = zext i16 [[LHS:%.*]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = zext i16 [[RHS:%.*]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i16
 ; SI-NEXT:    ret i16 [[MUL]]
 ;
@@ -29,7 +29,7 @@ define i32 @smul24_i32(i32 %lhs, i32 %rhs) {
 ; SI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
 ; SI-NEXT:    [[SHL_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i32 [[SHL_RHS]], 8
-; SI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[LHS24]], i32 [[RHS24]])
 ; SI-NEXT:    ret i32 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i32(
@@ -37,7 +37,7 @@ define i32 @smul24_i32(i32 %lhs, i32 %rhs) {
 ; VI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
 ; VI-NEXT:    [[SHL_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i32 [[SHL_RHS]], 8
-; VI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[LHS24]], i32 [[RHS24]])
 ; VI-NEXT:    ret i32 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i32(
@@ -61,7 +61,7 @@ define <2 x i8> @mul_v1i16(<1 x i16> %arg) {
 ; SI-NEXT:  BB:
 ; SI-NEXT:    [[TMP0:%.*]] = extractelement <1 x i16> [[ARG:%.*]], i64 0
 ; SI-NEXT:    [[TMP1:%.*]] = zext i16 [[TMP0]] to i32
-; SI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 42)
+; SI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP1]], i32 42)
 ; SI-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; SI-NEXT:    [[MUL:%.*]] = insertelement <1 x i16> poison, i16 [[TMP3]], i64 0
 ; SI-NEXT:    [[CAST:%.*]] = bitcast <1 x i16> [[MUL]] to <2 x i8>
@@ -90,7 +90,7 @@ define <1 x i8> @mul_v1i8(<1 x i8> %arg) {
 ; SI-NEXT:  BB:
 ; SI-NEXT:    [[TMP0:%.*]] = extractelement <1 x i8> [[ARG:%.*]], i64 0
 ; SI-NEXT:    [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
-; SI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 42)
+; SI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP1]], i32 42)
 ; SI-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
 ; SI-NEXT:    [[MUL:%.*]] = insertelement <1 x i8> poison, i8 [[TMP3]], i64 0
 ; SI-NEXT:    ret <1 x i8> [[MUL]]
@@ -120,8 +120,8 @@ define <2 x i32> @smul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
 ; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
-; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
-; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP1]], i32 [[TMP3]])
+; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP2]], i32 [[TMP4]])
 ; SI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
 ; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
 ; SI-NEXT:    ret <2 x i32> [[MUL]]
@@ -135,8 +135,8 @@ define <2 x i32> @smul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
 ; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
-; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
-; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP1]], i32 [[TMP3]])
+; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP2]], i32 [[TMP4]])
 ; VI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
 ; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
 ; VI-NEXT:    ret <2 x i32> [[MUL]]
@@ -161,13 +161,13 @@ define i32 @umul24_i32(i32 %lhs, i32 %rhs) {
 ; SI-LABEL: @umul24_i32(
 ; SI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
 ; SI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
-; SI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[LHS24]], i32 [[RHS24]])
 ; SI-NEXT:    ret i32 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i32(
 ; VI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
 ; VI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
-; VI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[LHS24]], i32 [[RHS24]])
 ; VI-NEXT:    ret i32 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i32(
@@ -190,8 +190,8 @@ define <2 x i32> @umul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
 ; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
-; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
-; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP1]], i32 [[TMP3]])
+; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP2]], i32 [[TMP4]])
 ; SI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
 ; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
 ; SI-NEXT:    ret <2 x i32> [[MUL]]
@@ -203,8 +203,8 @@ define <2 x i32> @umul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
 ; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
-; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
-; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP1]], i32 [[TMP3]])
+; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP2]], i32 [[TMP4]])
 ; VI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
 ; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
 ; VI-NEXT:    ret <2 x i32> [[MUL]]
@@ -229,12 +229,7 @@ define i64 @smul24_i64(i64 %lhs, i64 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i64 [[SHL_RHS]], 40
 ; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i64(
@@ -244,12 +239,7 @@ define i64 @smul24_i64(i64 %lhs, i64 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i64 [[SHL_RHS]], 40
 ; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i64(
@@ -276,8 +266,7 @@ define i64 @smul24_i64_2(i64 %lhs, i64 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i64 [[SHL_RHS]], 49
 ; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[MUL:%.*]] = sext i32 [[TMP3]] to i64
+; SI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i64_2(
@@ -287,8 +276,7 @@ define i64 @smul24_i64_2(i64 %lhs, i64 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i64 [[SHL_RHS]], 49
 ; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[MUL:%.*]] = sext i32 [[TMP3]] to i64
+; VI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i64_2(
@@ -315,12 +303,7 @@ define i64 @smul24_i64_3(i64 %lhs, i64 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64
 ; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i64_3(
@@ -330,12 +313,7 @@ define i64 @smul24_i64_3(i64 %lhs, i64 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64
 ; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i64_3(
@@ -393,12 +371,7 @@ define i64 @umul24_i64(i64 %lhs, i64 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
 ; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.u24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i64(
@@ -406,12 +379,7 @@ define i64 @umul24_i64(i64 %lhs, i64 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
 ; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.u24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i64(
@@ -432,8 +400,7 @@ define i64 @umul24_i64_2(i64 %lhs, i64 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 65535
 ; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[MUL:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.u24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i64_2(
@@ -441,8 +408,7 @@ define i64 @umul24_i64_2(i64 %lhs, i64 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 65535
 ; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[MUL:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[MUL:%.*]] = call i64 @llvm.amdgcn.mul.u24.i64(i32 [[TMP1]], i32 [[TMP2]])
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i64_2(
@@ -465,7 +431,7 @@ define i31 @smul24_i31(i31 %lhs, i31 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i31 [[SHL_RHS]], 7
 ; SI-NEXT:    [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
 ; SI-NEXT:    ret i31 [[MUL]]
 ;
@@ -476,7 +442,7 @@ define i31 @smul24_i31(i31 %lhs, i31 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i31 [[SHL_RHS]], 7
 ; VI-NEXT:    [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP1]], i32 [[TMP2]])
 ; VI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
 ; VI-NEXT:    ret i31 [[MUL]]
 ;
@@ -502,7 +468,7 @@ define i31 @umul24_i31(i31 %lhs, i31 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
 ; SI-NEXT:    [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP1]], i32 [[TMP2]])
 ; SI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
 ; SI-NEXT:    ret i31 [[MUL]]
 ;
@@ -511,7 +477,7 @@ define i31 @umul24_i31(i31 %lhs, i31 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
 ; VI-NEXT:    [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP1]], i32 [[TMP2]])
 ; VI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
 ; VI-NEXT:    ret i31 [[MUL]]
 ;
@@ -537,11 +503,11 @@ define <2 x i31> @umul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
 ; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
 ; SI-NEXT:    [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
 ; SI-NEXT:    [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
-; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; SI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
 ; SI-NEXT:    [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
 ; SI-NEXT:    [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
-; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
+; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP9]], i32 [[TMP10]])
 ; SI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
 ; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> poison, i31 [[TMP8]], i64 0
 ; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
@@ -556,11 +522,11 @@ define <2 x i31> @umul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
 ; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
 ; VI-NEXT:    [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
-; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; VI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
 ; VI-NEXT:    [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
 ; VI-NEXT:    [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
-; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
+; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24.i32(i32 [[TMP9]], i32 [[TMP10]])
 ; VI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
 ; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> poison, i31 [[TMP8]], i64 0
 ; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
@@ -590,11 +556,11 @@ define <2 x i31> @smul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
 ; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
 ; SI-NEXT:    [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
 ; SI-NEXT:    [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
-; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; SI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
 ; SI-NEXT:    [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
 ; SI-NEXT:    [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
-; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
+; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP9]], i32 [[TMP10]])
 ; SI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
 ; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> poison, i31 [[TMP8]], i64 0
 ; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
@@ -611,11 +577,11 @@ define <2 x i31> @smul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
 ; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
 ; VI-NEXT:    [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
-; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP5]], i32 [[TMP6]])
 ; VI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
 ; VI-NEXT:    [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
 ; VI-NEXT:    [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
-; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
+; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24.i32(i32 [[TMP9]], i32 [[TMP10]])
 ; VI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
 ; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> poison, i31 [[TMP8]], i64 0
 ; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
@@ -645,13 +611,8 @@ define i33 @smul24_i33(i33 %lhs, i33 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i33 [[SHL_RHS]], 9
 ; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
+; SI-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP3]] to i33
 ; SI-NEXT:    ret i33 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i33(
@@ -661,13 +622,8 @@ define i33 @smul24_i33(i33 %lhs, i33 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i33 [[SHL_RHS]], 9
 ; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
+; VI-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP3]] to i33
 ; VI-NEXT:    ret i33 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i33(
@@ -692,13 +648,8 @@ define i33 @umul24_i33(i33 %lhs, i33 %rhs) {
 ; SI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
 ; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
-; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
+; SI-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.mul.u24.i64(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP3]] to i33
 ; SI-NEXT:    ret i33 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i33(
@@ -706,13 +657,8 @@ define i33 @umul24_i33(i33 %lhs, i33 %rhs) {
 ; VI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
 ; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
-; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
-; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
-; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
-; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
-; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
+; VI-NEXT:    [[TMP3:%.*]] = call i64 @llvm.amdgcn.mul.u24.i64(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP3]] to i33
 ; VI-NEXT:    ret i33 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i33(
@@ -797,24 +743,14 @@ define <2 x i33> @smul24_v2i33(<2 x i33> %lhs, <2 x i33> %rhs) {
 ; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
 ; SI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
 ; SI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
-; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
-; SI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
-; SI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
-; SI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
-; SI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
-; SI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
-; SI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i33
-; SI-NEXT:    [[TMP14:%.*]] = trunc i33 [[TMP2]] to i32
-; SI-NEXT:    [[TMP15:%.*]] = trunc i33 [[TMP4]] to i32
-; SI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
-; SI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
-; SI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
-; SI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
-; SI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
-; SI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
-; SI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i33
-; SI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i33> poison, i33 [[TMP13]], i64 0
-; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP23]], i33 [[TMP22]], i64 1
+; SI-NEXT:    [[TMP7:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i33
+; SI-NEXT:    [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
+; SI-NEXT:    [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
+; SI-NEXT:    [[TMP11:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP9]], i32 [[TMP10]])
+; SI-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i33
+; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i33> poison, i33 [[TMP8]], i64 0
+; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
 ; SI-NEXT:    ret <2 x i33> [[MUL]]
 ;
 ; VI-LABEL: @smul24_v2i33(
@@ -828,24 +764,14 @@ define <2 x i33> @smul24_v2i33(<2 x i33> %lhs, <2 x i33> %rhs) {
 ; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
 ; VI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
-; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
-; VI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
-; VI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
-; VI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
-; VI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
-; VI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
-; VI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i33
-; VI-NEXT:    [[TMP14:%.*]] = trunc i33 [[TMP2]] to i32
-; VI-NEXT:    [[TMP15:%.*]] = trunc i33 [[TMP4]] to i32
-; VI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
-; VI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
-; VI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
-; VI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
-; VI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
-; VI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
-; VI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i33
-; VI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i33> poison, i33 [[TMP13]], i64 0
-; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP23]], i33 [[TMP22]], i64 1
+; VI-NEXT:    [[TMP7:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i33
+; VI-NEXT:    [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
+; VI-NEXT:    [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
+; VI-NEXT:    [[TMP11:%.*]] = call i64 @llvm.amdgcn.mul.i24.i64(i32 [[TMP9]], i32 [[TMP10]])
+; VI-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i33
+; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i33> poison, i33 [[TMP8]], i64 0
+; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
 ; VI-NEXT:    ret <2 x i33> [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_v2i33(
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
index 75f3b5463c394..d9c6fbb319019 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll
@@ -451,23 +451,24 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
 ; CHECK-NEXT:    v_lshrrev_b64 v[1:2], 16, v[45:46]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v8, 6, v72
+; CHECK-NEXT:    v_add_co_u32 v11, vcc_lo, s46, v11
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v10, 12, v63
+; CHECK-NEXT:    v_or_b32_e32 v4, v7, v4
+; CHECK-NEXT:    v_mul_hi_u32_u24_e32 v7, 0x180, v73
 ; CHECK-NEXT:    v_xor_b32_e32 v6, v61, v59
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v9, 16, v56
-; CHECK-NEXT:    v_or_b32_e32 v4, v7, v4
-; CHECK-NEXT:    v_add_co_u32 v7, s5, s46, v11
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v11, null, s47, 0, s5
 ; CHECK-NEXT:    v_or3_b32 v10, v8, v10, v62
-; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v7, v0
-; CHECK-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v11, vcc_lo
+; CHECK-NEXT:    ; implicit-def: $vgpr42
+; CHECK-NEXT:    ; implicit-def: $vgpr43
+; CHECK-NEXT:    ; implicit-def: $vgpr44
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, s47, v7, vcc_lo
+; CHECK-NEXT:    v_add_co_u32 v7, vcc_lo, v11, v0
 ; CHECK-NEXT:    v_lshrrev_b64 v[5:6], 16, v[5:6]
+; CHECK-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, 0, v12, vcc_lo
 ; CHECK-NEXT:    v_or_b32_e32 v2, v9, v2
 ; CHECK-NEXT:    global_store_dword v[7:8], v10, off offset:4
 ; CHECK-NEXT:    global_store_dwordx4 v[7:8], v[1:4], off offset:8
 ; CHECK-NEXT:    global_store_dwordx2 v[7:8], v[5:6], off offset:24
-; CHECK-NEXT:    ; implicit-def: $vgpr42
-; CHECK-NEXT:    ; implicit-def: $vgpr43
-; CHECK-NEXT:    ; implicit-def: $vgpr44
 ; CHECK-NEXT:  .LBB0_31: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_28 Depth=1
 ; CHECK-NEXT:    s_andn2_saveexec_b32 s4, s4
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index c3529debe693d..dc5041b7eb286 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -187,25 +187,22 @@ define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
 ; SI-LABEL: test_smul48_i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
 ; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
-; SI-NEXT:    v_mov_b32_e32 v0, v3
+; SI-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: test_smul48_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
 ; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
-; VI-NEXT:    v_mov_b32_e32 v0, v3
+; VI-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_smul48_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
 ; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: test_smul48_i64:
@@ -229,52 +226,28 @@ define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; SI-LABEL: test_smul48_v2i64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT:    v_ashr_i64 v[5:6], v[0:1], 40
-; SI-NEXT:    v_ashr_i64 v[1:2], v[1:2], 40
-; SI-NEXT:    v_ashr_i64 v[6:7], v[2:3], 40
-; SI-NEXT:    v_ashr_i64 v[2:3], v[3:4], 40
-; SI-NEXT:    v_mul_i32_i24_e32 v0, v1, v2
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v2
-; SI-NEXT:    v_mul_i32_i24_e32 v2, v5, v6
-; SI-NEXT:    v_mul_hi_i32_i24_e32 v3, v5, v6
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v4
+; SI-NEXT:    v_mul_i32_i24_e32 v0, v0, v4
+; SI-NEXT:    v_mul_hi_i32_i24_e32 v3, v2, v6
+; SI-NEXT:    v_mul_i32_i24_e32 v2, v2, v6
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: test_smul48_v2i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; VI-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
-; VI-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
-; VI-NEXT:    v_ashrrev_i64 v[3:4], 40, v[2:3]
-; VI-NEXT:    v_ashrrev_i64 v[4:5], 40, v[1:2]
-; VI-NEXT:    v_mul_i32_i24_e32 v0, v1, v3
-; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
-; VI-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
-; VI-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
+; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v4
+; VI-NEXT:    v_mul_i32_i24_e32 v0, v0, v4
+; VI-NEXT:    v_mul_hi_i32_i24_e32 v3, v2, v6
+; VI-NEXT:    v_mul_i32_i24_e32 v2, v2, v6
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_smul48_v2i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; GFX9-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
-; GFX9-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
-; GFX9-NEXT:    v_ashrrev_i64 v[3:4], 40, v[2:3]
-; GFX9-NEXT:    v_ashrrev_i64 v[4:5], 40, v[1:2]
-; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v1, v3
-; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
-; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
+; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v4
+; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v4
+; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v3, v2, v6
+; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v2, v6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; EG-LABEL: test_smul48_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index ffc533decc042..987f9e95b65ee 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -555,9 +555,8 @@ define i64 @test_umul48_i64(i64 %lhs, i64 %rhs) {
 ; GCN-LABEL: test_umul48_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_u32_u24_e32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v0, v3
+; GCN-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lhs24 = and i64 %lhs, 16777215
   %rhs24 = and i64 %rhs, 16777215
@@ -569,12 +568,10 @@ define <2 x i64> @test_umul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
 ; GCN-LABEL: test_umul48_v2i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_u32_u24_e32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32_u24_e32 v1, v0, v4
-; GCN-NEXT:    v_mul_u32_u24_e32 v4, v2, v6
+; GCN-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v6
-; GCN-NEXT:    v_mov_b32_e32 v0, v5
-; GCN-NEXT:    v_mov_b32_e32 v2, v4
+; GCN-NEXT:    v_mul_u32_u24_e32 v2, v2, v6
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %lhs24 = and <2 x i64> %lhs, <i64 16777215, i64 16777215>
   %rhs24 = and <2 x i64> %rhs, <i64 16777215, i64 16777215>