
Commit f89f6d1

[AMDGPU] Fix an invalid clamp selection pattern.

When running the tests on PowerPC and x86, the lit test GlobalISel/trunc.ll fails at the memory-sanitizer step. This seems to be due to invalid match logic (the pattern matches even when it should not) and a likely missing variable initialization.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D95878
Parent: 86bde76

File tree

8 files changed: 302 additions, 4 deletions

llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h

Lines changed: 20 additions & 1 deletion
@@ -306,6 +306,18 @@ m_GAShr(const LHS &L, const RHS &R) {
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>
+m_GSMax(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, false>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>
+m_GSMin(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, false>(L, R);
+}
+
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
 template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
   SrcTy L;
@@ -468,6 +480,13 @@ m_GInsertVecElt(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
                          TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
 }
 
+template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
+inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>
+m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
+  return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>(
+      Src0, Src1, Src2);
+}
+
 /// Matches a register negated by a G_SUB.
 /// G_SUB 0, %negated_reg
 template <typename SrcTy>
@@ -484,7 +503,7 @@ m_Not(const SrcTy &&Src) {
   return m_GXor(Src, m_AllOnesInt());
 }
 
-} // namespace GMIPatternMatch
+} // namespace MIPatternMatch
 } // namespace llvm
 
 #endif
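
For orientation, a minimal sketch of how the new matchers compose with mi_match (not part of the patch; the helper name matchSignedClamp is hypothetical):

// Illustrative only: recognizes smin(smax(X, Lo), Hi), i.e. a signed clamp
// of X to [Lo, Hi]. Matchers nest, so one mi_match call covers both ops.
static bool matchSignedClamp(llvm::Register Reg,
                             const llvm::MachineRegisterInfo &MRI,
                             llvm::Register &X, int64_t &Lo, int64_t &Hi) {
  using namespace llvm::MIPatternMatch;
  return mi_match(Reg, MRI,
                  m_GSMin(m_GSMax(m_Reg(X), m_ICst(Lo)), m_ICst(Hi)));
}

The patch itself matches the two operations in separate mi_match calls (see AMDGPUPreLegalizerCombiner.cpp below) so it can accept the min/max pair in either order.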

llvm/lib/Target/AMDGPU/AMDGPUCombine.td

Lines changed: 10 additions & 2 deletions
@@ -37,13 +37,21 @@ def cvt_f32_ubyteN : GICombineRule<
          [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
+
+def clamp_i64_to_i16 : GICombineRule<
+  (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
+  (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
+         [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
+  (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
-
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
-  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
+  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
+  let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
 }
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
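
At combine time, the TableGen backend turns this rule into match/apply calls inside the generated tryCombineAll. A hand-written approximation of that dispatch, for illustration only (the real generated code also consults the DisableRuleOption bitset, and the function name tryClampI64ToI16 is invented here):

// Rough sketch of what the generated rule does: guard on the root opcode
// declared via wip_match_opcode, then run the match and apply hooks.
// PreLegalizerHelper comes from AMDGPUPreLegalizerCombinerHelperState.
bool tryClampI64ToI16(MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_TRUNC)
    return false;
  MachineFunction *MF = MI.getMF();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo MatchInfo;
  if (!PreLegalizerHelper.matchClampI64ToI16(MI, MRI, *MF, MatchInfo))
    return false;
  PreLegalizerHelper.applyClampI64ToI16(MI, MatchInfo);
  return true;
}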

llvm/lib/Target/AMDGPU/AMDGPUGISel.td

Lines changed: 3 additions & 0 deletions
@@ -174,6 +174,9 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
 def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
 def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
 
+def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
+def : GINodeEquiv<G_AMDGPU_MED3, AMDGPUsmed3>;
+
 def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 2 additions & 0 deletions
@@ -213,6 +213,8 @@ def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
 def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
                                   SDTIntToFPOp, []>;
 
+def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32",
+                                  AMDGPUIntPackOp, []>;
 
 // urecip - This operation is a helper for integer division, it returns the
 // result of 1 / a as a fractional unsigned integer.

llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp

Lines changed: 141 additions & 1 deletion
@@ -12,6 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPULegalizerInfo.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -26,6 +29,141 @@
 using namespace llvm;
 using namespace MIPatternMatch;
 
+class AMDGPUPreLegalizerCombinerHelper {
+protected:
+  MachineIRBuilder &B;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  CombinerHelper &Helper;
+
+public:
+  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+
+  struct ClampI64ToI16MatchInfo {
+    int64_t Cmp1 = 0;
+    int64_t Cmp2 = 0;
+    Register Origin;
+  };
+
+  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
+                          MachineFunction &MF,
+                          ClampI64ToI16MatchInfo &MatchInfo);
+
+  void applyClampI64ToI16(MachineInstr &MI,
+                          const ClampI64ToI16MatchInfo &MatchInfo);
+};
+
+bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
+    ClampI64ToI16MatchInfo &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
+
+  // Try to find a pattern where an i64 value should get clamped to short.
+  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
+  if (SrcType != LLT::scalar(64))
+    return false;
+
+  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+  if (DstType != LLT::scalar(16))
+    return false;
+
+  Register Base;
+
+  auto IsApplicableForCombine = [&MatchInfo]() -> bool {
+    const auto Cmp1 = MatchInfo.Cmp1;
+    const auto Cmp2 = MatchInfo.Cmp2;
+    const auto Diff = std::abs(Cmp2 - Cmp1);
+
+    // If the difference between both comparison values is 0 or 1, there is no
+    // need to clamp.
+    if (Diff == 0 || Diff == 1)
+      return false;
+
+    const int64_t Min = std::numeric_limits<int16_t>::min();
+    const int64_t Max = std::numeric_limits<int16_t>::max();
+
+    // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
+    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
+            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
+  };
+
+  // Try to match a combination of min / max MIR opcodes.
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+    if (mi_match(Base, MRI,
+                 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+      return IsApplicableForCombine();
+    }
+  }
+
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+    if (mi_match(Base, MRI,
+                 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+      return IsApplicableForCombine();
+    }
+  }
+
+  return false;
+}
+
+// We want to find the combination of instructions that gets generated
+// when an i64 is clamped to i16: a G_SMIN / G_SMAX pair feeding a
+// G_TRUNC from i64 to i16. This can be written more efficiently as:
+// v_cvt_pk_i16_i32 v0, v0, v1
+// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
+void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
+    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
+
+  Register Src = MatchInfo.Origin;
+  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
+         LLT::scalar(64));
+  const LLT S32 = LLT::scalar(32);
+
+  B.setMBB(*MI.getParent());
+  B.setInstrAndDebugLoc(MI);
+
+  auto Unmerge = B.buildUnmerge(S32, Src);
+
+  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
+
+  const LLT V2S16 = LLT::vector(2, 16);
+  auto CvtPk =
+      B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
+                   {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
+
+  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
+  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
+  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
+  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
+
+  auto Bitcast = B.buildBitcast({S32}, CvtPk);
+
+  auto Med3 = B.buildInstr(
+      AMDGPU::G_AMDGPU_MED3, {S32},
+      {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
+      MI.getFlags());
+
+  B.buildTrunc(MI.getOperand(0).getReg(), Med3);
+
+  MI.eraseFromParent();
+}
+
+class AMDGPUPreLegalizerCombinerHelperState {
+protected:
+  CombinerHelper &Helper;
+  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
+
+public:
+  AMDGPUPreLegalizerCombinerHelperState(
+      CombinerHelper &Helper,
+      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
+      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
+};
+
 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenPreLegalizeGICombiner.inc"
 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -59,7 +197,9 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B, KB, MDT);
-  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+  AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
+  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
+                                                PreLegalizerHelper);
 
   if (Generated.tryCombineAll(Observer, MI, B, Helper))
     return true;
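
To make the guard conditions concrete, here is a standalone restatement of the IsApplicableForCombine predicate above, with worked inputs that line up with the tests below (illustrative sketch, not part of the patch):

#include <cstdint>
#include <cstdlib>
#include <limits>

// Standalone restatement of the IsApplicableForCombine lambda.
static bool isApplicableForCombine(int64_t Cmp1, int64_t Cmp2) {
  const int64_t Diff = std::abs(Cmp2 - Cmp1);
  // A range of width 0 or 1 leaves nothing worth clamping.
  if (Diff == 0 || Diff == 1)
    return false;
  const int64_t Min = std::numeric_limits<int16_t>::min(); // -32768
  const int64_t Max = std::numeric_limits<int16_t>::max(); //  32767
  // Both bounds must lie inside the i16 range, in either order.
  return (Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
         (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min);
}

// isApplicableForCombine(32767, -32768) -> true  (full i16 clamp combines)
// isApplicableForCombine(32769, -32768) -> false (32769 > INT16_MAX;
//                                          see v_clamp_i64_i16_invalid_lower)
// isApplicableForCombine(0, 0)          -> false (Diff == 0;
//                                          see v_clamp_i64_i16_zero)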

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
@@ -3507,6 +3507,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
+  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
+  case AMDGPU::G_AMDGPU_MED3:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_UMULH:
   case AMDGPU::G_SMULH: {

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 12 additions & 0 deletions
@@ -2577,6 +2577,18 @@ def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
 }
 }
 
+def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type0:$src1);
+  let hasSideEffects = 0;
+}
+
+def G_AMDGPU_MED3 : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
 // Atomic cmpxchg. $cmpval and $newval are packed in a single vector
 // operand. Expects a MachineMemOperand in addition to explicit
 // operands.
New test file — Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX678,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
+
+declare i64 @llvm.smax.i64(i64, i64)
+declare i64 @llvm.smin.i64(i64, i64)
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[B]], 0x7fff
+; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
+define i16 @v_clamp_i64_i16(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_reverse
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[B]], 0x7fff
+; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
+define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
+entry:
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
+  %result = trunc i64 %max to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
+; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
+
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
+; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
+define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
+entry:
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 32769)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
+  %result = trunc i64 %max to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
+define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32769)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 32768)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[B]], 0x100
+; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
+define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
+entry:
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 256)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -255)
+  %result = trunc i64 %max to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[B]], 0x100
+; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -255)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 256)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_zero
+; GFX6789: v_mov_b32_e32 v0, 0
+; GFX10: v_mov_b32_e32 v0, 0
+define i16 @v_clamp_i64_i16_zero(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 0)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 0)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}
