diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
index 59dcbea8dcf6f8..4fc3dee36c0522 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -306,6 +306,18 @@ m_GAShr(const LHS &L, const RHS &R) {
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>
+m_GSMax(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>
+m_GSMin(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>(L, R);
+}
+
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
 template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
   SrcTy L;
@@ -460,6 +472,7 @@ struct TernaryOp_match {
     return false;
   }
 };
+
 template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
 inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty,
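Note on usage: the new m_GSMax / m_GSMin helpers compose with mi_match like the existing binary-op matchers, so a min-of-max clamp can be peeled off in two steps. A minimal sketch, not part of the patch (the function name and locals are hypothetical; the usual GlobalISel headers and using-directives are assumed):

// Sketch only: match Reg = smin(smax(Src, Lo), Hi) where both bounds are constants.
static bool matchSMinOfSMax(Register Reg, const MachineRegisterInfo &MRI,
                            Register &Src, int64_t &Lo, int64_t &Hi) {
  using namespace MIPatternMatch;
  Register Mid;
  return mi_match(Reg, MRI, m_GSMin(m_Reg(Mid), m_ICst(Hi))) &&
         mi_match(Mid, MRI, m_GSMax(m_Reg(Src), m_ICst(Lo)));
}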
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index f2b15134f4831d..b6a6fb3e77db09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,27 +37,27 @@ def cvt_f32_ubyteN : GICombineRule<
   [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
-def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
+def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
 
 def clamp_i64_to_i16 : GICombineRule<
   (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
   (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
-      [{ return PostLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
-  (apply [{ PostLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
+      [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
+  (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
 
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
-
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
-  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
+  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
+  let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
 }
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines,
-  uchar_to_float, cvt_f32_ubyteN, clamp_i64_to_i16]> {
+  uchar_to_float, cvt_f32_ubyteN]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];
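For orientation, StateClass works the same way it already does for the post-legalizer combiner: the TableGen backend emits AMDGPUGenPreLegalizerCombinerHelper deriving from the named state class, which is what lets the rule bodies above call PreLegalizerHelper.matchClampI64ToI16 / applyClampI64ToI16. A rough, hand-written approximation of the generated shape, for illustration only (the real declaration is emitted into AMDGPUGenPreLegalizeGICombiner.inc and may differ in detail):

// Illustrative approximation of the TableGen-emitted combiner class.
class AMDGPUGenPreLegalizerCombinerHelper
    : public AMDGPUPreLegalizerCombinerHelperState {
public:
  AMDGPUGenPreLegalizerCombinerHelper(
      const AMDGPUGenPreLegalizerCombinerHelperRuleConfig &RuleConfig,
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : AMDGPUPreLegalizerCombinerHelperState(Helper, PreLegalizerHelper),
        RuleConfig(RuleConfig) {}

  // The [{ ... }] match/apply blocks above are pasted into per-rule helpers
  // called from here, so they can reference Helper and PreLegalizerHelper.
  bool tryCombineAll(GISelChangeObserver &Observer, MachineInstr &MI,
                     MachineIRBuilder &B, CombinerHelper &Helper) const;

private:
  const AMDGPUGenPreLegalizerCombinerHelperRuleConfig &RuleConfig;
};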
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 16621a816922c7..8182a35ed45a6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -75,19 +75,6 @@ class AMDGPUPostLegalizerCombinerHelper {
   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
   void applyCvtF32UByteN(MachineInstr &MI,
                          const CvtF32UByteMatchInfo &MatchInfo);
-
-  struct ClampI64ToI16MatchInfo {
-    int64_t Cmp1;
-    int64_t Cmp2;
-    Register Origin;
-  };
-
-  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
-                          MachineFunction &MF,
-                          ClampI64ToI16MatchInfo &MatchInfo);
-
-  void applyClampI64ToI16(MachineInstr &MI,
-                          const ClampI64ToI16MatchInfo &MatchInfo);
 };
 
 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -267,121 +254,6 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
   MI.eraseFromParent();
 }
 
-bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
-    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
-    ClampI64ToI16MatchInfo &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
-  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
-
-  // we want to check if a 64-bit number gets clamped to 16-bit boundaries (or
-  // below).
-  if (SrcType != LLT::scalar(64))
-    return false;
-
-  MachineIRBuilder B(MI);
-
-  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16");
-
-  CmpInst::Predicate Predicate1;
-  Register Base;
-
-  if (!mi_match(MI.getOperand(1).getReg(), MRI,
-                m_GISelect(m_GICmp(m_Pred(Predicate1), m_Reg(), m_Reg()),
-                           m_Reg(Base), m_ICst(MatchInfo.Cmp1))))
-    return false;
-
-  CmpInst::Predicate Predicate2;
-
-  if (!mi_match(Base, MRI,
-                m_GISelect(m_GICmp(m_Pred(Predicate2), m_Reg(), m_Reg()),
-                           m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2))))
-    return false;
-
-  if ((Predicate1 == CmpInst::ICMP_SLT &&
-       Predicate2 == CmpInst::ICMP_SGT) ||
-      (Predicate1 == CmpInst::ICMP_SGT &&
-       Predicate2 == CmpInst::ICMP_SLT)) {
-    const auto Cmp1 = MatchInfo.Cmp1;
-    const auto Cmp2 = MatchInfo.Cmp2;
-    const auto Diff = std::abs(Cmp2 - Cmp1);
-
-    // we don't need to clamp here.
-    if (Diff == 0 || Diff == 1) {
-      return false;
-    }
-
-    const int64_t Min = std::numeric_limits<int16_t>::min();
-    const int64_t Max = std::numeric_limits<int16_t>::max();
-
-    // are we really trying to clamp against the relevant boundaries?
-    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
-            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
-  }
-
-  return false;
-}
-
-/**
- * We want to find a combination of instructions that
- * gets generated when an i64 gets clamped to i16.
- * The corresponding pattern is:
- * G_SELECT MIN/MAX, G_ICMP, G_SELECT MIN/MAX, G_ICMP, G_TRUNC.
- * This can be efficiently written as following:
- * v_cvt_pk_i16_i32 v0, v0, v1
- * v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
- */
-void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
-    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
-  LLVM_DEBUG(dbgs() << "Combining MI");
-
-  MachineIRBuilder B(MI);
-  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
-
-  Register Src = MatchInfo.Origin;
-  assert(MRI.getType(Src) == LLT::scalar(64));
-  const LLT S32 = LLT::scalar(32);
-
-  auto Unmerge = B.buildUnmerge(S32, Src);
-  Register Hi32 = Unmerge->getOperand(0).getReg();
-  Register Lo32 = Unmerge->getOperand(1).getReg();
-  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
-  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
-
-  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
-  assert(MI.getOpcode() != CvtOpcode);
-
-  const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;
-
-  Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
-  MRI.setType(CvtDst, S32);
-
-  auto CvtPk = B.buildInstr(CvtOpcode);
-  CvtPk.addDef(CvtDst);
-  CvtPk.addReg(Hi32);
-  CvtPk.addReg(Lo32);
-  CvtPk.setMIFlags(MI.getFlags());
-
-  auto min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
-  auto max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
-
-  Register MinBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
-  MRI.setType(MinBoundaryDst, S32);
-  B.buildConstant(MinBoundaryDst, min);
-
-  Register MaxBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
-  MRI.setType(MaxBoundaryDst, S32);
-  B.buildConstant(MaxBoundaryDst, max);
-
-  Register MedDst = MRI.createVirtualRegister(REG_CLASS);
-  MRI.setType(MedDst, S32);
-
-  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
-  Med.addDef(MedDst);
-  Med.addReg(MinBoundaryDst);
-  Med.addReg(CvtDst);
-  Med.addReg(MaxBoundaryDst);
-  Med.setMIFlags(MI.getFlags());
-
-  B.buildCopy(MI.getOperand(0).getReg(), MedDst);
-
-  MI.eraseFromParent();
-}
 
 class AMDGPUPostLegalizerCombinerHelperState {
 protected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index e4b628bf6b2381..f0c180b68b8eac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -11,7 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPULegalizerInfo.h"
+#include "AMDGPUTargetMachine.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -26,6 +28,156 @@
 using namespace llvm;
 using namespace MIPatternMatch;
 
+class AMDGPUPreLegalizerCombinerHelper {
+protected:
+  MachineIRBuilder &B;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  CombinerHelper &Helper;
+
+public:
+  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+
+  struct ClampI64ToI16MatchInfo {
+    int64_t Cmp1;
+    int64_t Cmp2;
+    Register Origin;
+  };
+
+  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
+                          MachineFunction &MF,
+                          ClampI64ToI16MatchInfo &MatchInfo);
+
+  void applyClampI64ToI16(MachineInstr &MI,
+                          const ClampI64ToI16MatchInfo &MatchInfo);
+};
+
+bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
+    ClampI64ToI16MatchInfo &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
+
+  // We want to check if a 64-bit number gets clamped to 16-bit boundaries (or
+  // below).
+  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
+
+  if (SrcType != LLT::scalar(64))
+    return false;
+
+  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+
+  if (DstType != LLT::scalar(16))
+    return false;
+
+  MachineIRBuilder B(MI);
+
+  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
+
+  Register Base;
+
+  // Match the smin(smax(x, Cmp2), Cmp1) chain feeding the trunc.
+  if (!mi_match(MI.getOperand(1).getReg(), MRI,
+                m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1))))
+    return false;
+
+  if (!mi_match(Base, MRI,
+                m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2))))
+    return false;
+
+  const auto Cmp1 = MatchInfo.Cmp1;
+  const auto Cmp2 = MatchInfo.Cmp2;
+  const auto Diff = std::abs(Cmp2 - Cmp1);
+
+  // If the bounds are equal or adjacent, we don't need to clamp here.
+  if (Diff == 0 || Diff == 1)
+    return false;
+
+  const int64_t Min = std::numeric_limits<int16_t>::min();
+  const int64_t Max = std::numeric_limits<int16_t>::max();
+
+  // Are we really trying to clamp against the relevant boundaries?
+  return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
+          (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
+}
+
+// We want to find a combination of instructions that
+// gets generated when an i64 gets clamped to i16.
+// The corresponding pattern is:
+// G_SMIN / G_SMAX on i64, followed by a G_TRUNC to i16.
+// This can be written efficiently as:
+// v_cvt_pk_i16_i32 v0, v0, v1
+// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
+
+void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
+    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
+  LLVM_DEBUG(dbgs() << "Combining MI\n");
+
+  MachineIRBuilder B(MI);
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  Register Src = MatchInfo.Origin;
+  assert(MRI.getType(Src) == LLT::scalar(64));
+  const LLT S32 = LLT::scalar(32);
+
+  auto Unmerge = B.buildUnmerge(S32, Src);
+  Register Hi32 = Unmerge->getOperand(0).getReg();
+  Register Lo32 = Unmerge->getOperand(1).getReg();
+  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
+  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
+
+  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
+  assert(MI.getOpcode() != CvtOpcode);
+
+  const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;
+
+  Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
+  MRI.setType(CvtDst, S32);
+
+  auto CvtPk = B.buildInstr(CvtOpcode);
+  CvtPk.addDef(CvtDst);
+  CvtPk.addReg(Hi32);
+  CvtPk.addReg(Lo32);
+  CvtPk.setMIFlags(MI.getFlags());
+
+  auto min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
+  auto max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
+
+  Register MinBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
+  MRI.setType(MinBoundaryDst, S32);
+  B.buildConstant(MinBoundaryDst, min);
+
+  Register MaxBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
+  MRI.setType(MaxBoundaryDst, S32);
+  B.buildConstant(MaxBoundaryDst, max);
+
+  Register MedDst = MRI.createVirtualRegister(REG_CLASS);
+  MRI.setType(MedDst, S32);
+
+  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
+  Med.addDef(MedDst);
+  Med.addReg(MinBoundaryDst);
+  Med.addReg(CvtDst);
+  Med.addReg(MaxBoundaryDst);
+  Med.setMIFlags(MI.getFlags());
+
+  Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
+  B.buildTrunc(TruncDst, MedDst);
+  B.buildCopy(MI.getOperand(0).getReg(), TruncDst);
+
+  MI.eraseFromParent();
+}
+
+class AMDGPUPreLegalizerCombinerHelperState {
+protected:
+  CombinerHelper &Helper;
+  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
+
+public:
+  AMDGPUPreLegalizerCombinerHelperState(
+      CombinerHelper &Helper,
+      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
+      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
+};
+
 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenPreLegalizeGICombiner.inc"
 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -43,9 +195,10 @@ class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
 
   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+                                 const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
-                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
         KB(KB), MDT(MDT) {
     if (!GeneratedRuleCfg.parseCommandLineOption())
       report_fatal_error("Invalid rule identifier");
@@ -59,7 +212,9 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B, KB, MDT);
-  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+  AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
+  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
+                                                PreLegalizerHelper);
 
   if (Generated.tryCombineAll(Observer, MI, B, Helper))
     return true;
@@ -125,11 +280,16 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
   const Function &F = MF.getFunction();
   bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None &&
                    !skipFunction(F);
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const AMDGPULegalizerInfo *LI =
+      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
-                                        F.hasMinSize(), KB, MDT);
+                                        F.hasMinSize(), LI, KB, MDT);
   Combiner C(PCInfo, TPC);
   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
 }
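As a sanity check on the sequence built by applyClampI64ToI16: v_cvt_pk_i16_i32 converts its two 32-bit sources to i16 with signed saturation, and v_med3_i32 of (min bound, value, max bound) is exactly a clamp. A standalone model, independent of LLVM and not part of the patch (the packed conversion is reduced to a plain saturation of the half that is kept):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Median of three, which is what v_med3_i32 computes.
static int64_t med3(int64_t A, int64_t B, int64_t C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

// Model of the emitted sequence: saturate to i16 (the cvt_pk step), then
// take the median with the two clamp bounds.
static int16_t clampViaMed3(int64_t In, int64_t Lo, int64_t Hi) {
  int64_t Sat = std::clamp<int64_t>(In, INT16_MIN, INT16_MAX);
  return static_cast<int16_t>(med3(Lo, Sat, Hi));
}

int main() {
  assert(clampViaMed3(40000, -32768, 32767) == 32767); // clamps to the max bound
  assert(clampViaMed3(-70000, -255, 256) == -255);     // clamps to the min bound
  assert(clampViaMed3(100, -32768, 32767) == 100);     // in-range values pass through
  return 0;
}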
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
index 8f2b3dd8dea69e..492318d37f5796 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -2,6 +2,9 @@
 ; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
 ; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 
+declare i64 @llvm.smax.i64(i64, i64)
+declare i64 @llvm.smin.i64(i64, i64)
+
 ; GFX10-LABEL: {{^}}v_clamp_i64_i16
 ; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
 ; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
@@ -9,15 +12,13 @@
 ; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
 ; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
 ; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
-; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
-; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x7fff
 define i16 @v_clamp_i64_i16(i64 %in) #0 {
 entry:
-  %greater = icmp sgt i64 %in, -32768
-  %i1 = select i1 %greater, i64 %in, i64 -32768
-  %lower = icmp slt i64 %i1, 32767
-  %intermed = select i1 %lower, i64 %i1, i64 32767
-  %result = trunc i64 %intermed to i16
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
+  %result = trunc i64 %min to i16
   ret i16 %result
 }
 
@@ -32,11 +33,9 @@ entry:
 ; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
 define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
 entry:
-  %lower = icmp slt i64 %in, 32767
-  %i1 = select i1 %lower, i64 %in, i64 32767
-  %greater = icmp sgt i64 %i1, -32768
-  %intermed = select i1 %greater, i64 %i1, i64 -32768
-  %result = trunc i64 %intermed to i16
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
+  %result = trunc i64 %max to i16
   ret i16 %result
 }
 
@@ -49,25 +48,21 @@ entry:
 ; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
 define i16 @v_clamp_i64_i16_wrong_lower(i64 %in) #0 {
 entry:
-  %lower = icmp slt i64 %in, 32769
-  %i1 = select i1 %lower, i64 %in, i64 32769
-  %greater = icmp sgt i64 %i1, -32768
-  %intermed = select i1 %greater, i64 %i1, i64 -32768
-  %result = trunc i64 %intermed to i16
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 32769)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
+  %result = trunc i64 %max to i16
   ret i16 %result
 }
 
-; GFX10-LABEL: {{^}}v_clamp_i64_i16_wrong_lower_and_higher
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
 ; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
 ; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
 ; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
-define i16 @v_clamp_i64_i16_wrong_lower_and_higher(i64 %in) #0 {
+define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
 entry:
-  %greater = icmp sgt i64 %in, -32769
-  %i1 = select i1 %greater, i64 %in, i64 -32769
-  %lower = icmp slt i64 %i1, 32768
-  %intermed = select i1 %lower, i64 %i1, i64 32768
-  %result = trunc i64 %intermed to i16
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32769)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 32768)
+  %result = trunc i64 %min to i16
   ret i16 %result
 }
 
@@ -82,11 +77,9 @@ entry:
 ; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
 define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
 entry:
-  %lower = icmp slt i64 %in, 256
-  %i1 = select i1 %lower, i64 %in, i64 256
-  %greater = icmp sgt i64 %i1, -255
-  %intermed = select i1 %greater, i64 %i1, i64 -255
-  %result = trunc i64 %intermed to i16
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 256)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -255)
+  %result = trunc i64 %max to i16
   ret i16 %result
 }
 
@@ -101,11 +94,9 @@ entry:
 ; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
 define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {
 entry:
-  %greater = icmp sgt i64 %in, -255
-  %i1 = select i1 %greater, i64 %in, i64 -255
-  %lower = icmp slt i64 %i1, 256
-  %intermed = select i1 %lower, i64 %i1, i64 256
-  %result = trunc i64 %intermed to i16
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -255)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 256)
+  %result = trunc i64 %min to i16
   ret i16 %result
 }
 
@@ -114,10 +105,8 @@ entry:
 ; GFX10: v_mov_b32_e32 [[A:v[0-9]+]], 0
 define i16 @v_clamp_i64_i16_zero(i64 %in) #0 {
 entry:
-  %greater = icmp sgt i64 %in, 0
-  %i1 = select i1 %greater, i64 %in, i64 0
-  %lower = icmp slt i64 %i1, 0
-  %intermed = select i1 %lower, i64 %i1, i64 0
-  %result = trunc i64 %intermed to i16
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 0)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 0)
+  %result = trunc i64 %min to i16
   ret i16 %result
 }
\ No newline at end of file