Skip to content

Commit

Permalink
AMDGPU/GlobalISel: Implement expansion for rsq.clamp
Browse files Browse the repository at this point in the history
Not sure why we handle this removed instruction on newer subtargets
for this one and no others, but maintain compatibility with the DAG.
  • Loading branch information
arsenm committed Aug 6, 2020
1 parent c015cbc commit 5a50352
Show file tree
Hide file tree
Showing 4 changed files with 287 additions and 0 deletions.
51 changes: 51 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Expand Up @@ -3160,6 +3160,55 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
return true;
}

// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return true;

Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(2).getReg();
auto Flags = MI.getFlags();

LLT Ty = MRI.getType(Dst);

const fltSemantics *FltSemantics;
if (Ty == LLT::scalar(32))
FltSemantics = &APFloat::IEEEsingle();
else if (Ty == LLT::scalar(64))
FltSemantics = &APFloat::IEEEdouble();
else
return false;

auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
.addUse(Src)
.setMIFlags(Flags);

// We don't need to concern ourselves with the snan handling difference, since
// the rsq quieted (or not) so use the one which will directly select.
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
const bool UseIEEE = MFI->getMode().IEEE;

auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

if (UseIEEE)
B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
else
B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
MI.eraseFromParent();
return true;
}

bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
Expand Down Expand Up @@ -4393,6 +4442,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeTrapIntrinsic(MI, MRI, B);
case Intrinsic::debugtrap:
return legalizeDebugTrapIntrinsic(MI, MRI, B);
case Intrinsic::amdgcn_rsq_clamp:
return legalizeRsqClampIntrinsic(MI, MRI, B);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Expand Up @@ -128,6 +128,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

Expand Down
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-amdgcn.rsq.clamp.mir
@@ -0,0 +1,63 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s

---
name: test_rsq_clamp_flags_ieee_on_f32
tracksRegLiveness: true
machineFunctionInfo:
mode:
ieee: true

body: |
bb.0:
liveins: $vgpr0
; SI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32
; SI: liveins: $vgpr0
; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32)
; SI: $vgpr0 = COPY [[INT]](s32)
; VI-LABEL: name: test_rsq_clamp_flags_ieee_on_f32
; VI: liveins: $vgpr0
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000
; VI: [[FMINNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM_IEEE [[INT]], [[C]]
; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000
; VI: [[FMAXNUM_IEEE:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM_IEEE [[FMINNUM_IEEE]], [[C1]]
; VI: $vgpr0 = COPY [[FMAXNUM_IEEE]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
$vgpr0 = COPY %1
...

---
name: test_rsq_clamp_flags_ieee_off_f32
tracksRegLiveness: true
machineFunctionInfo:
mode:
ieee: false

body: |
bb.0:
liveins: $vgpr0
; SI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32
; SI: liveins: $vgpr0
; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; SI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), [[COPY]](s32)
; SI: $vgpr0 = COPY [[INT]](s32)
; VI-LABEL: name: test_rsq_clamp_flags_ieee_off_f32
; VI: liveins: $vgpr0
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[INT:%[0-9]+]]:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x47EFFFFFE0000000
; VI: [[FMINNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMINNUM [[INT]], [[C]]
; VI: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0xC7EFFFFFE0000000
; VI: [[FMAXNUM:%[0-9]+]]:_(s32) = nnan ninf nsz G_FMAXNUM [[FMINNUM]], [[C1]]
; VI: $vgpr0 = COPY [[FMAXNUM]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = nnan ninf nsz G_INTRINSIC intrinsic(@llvm.amdgcn.rsq.clamp), %0
$vgpr0 = COPY %1
...
170 changes: 170 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll
@@ -0,0 +1,170 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s

define float @v_rsq_clamp_f32(float %src) #0 {
; SI-LABEL: v_rsq_clamp_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e32 v0, v0
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
}

define float @v_rsq_clamp_fabs_f32(float %src) #0 {
; SI-LABEL: v_rsq_clamp_fabs_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e64 v0, |v0|
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_fabs_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e64 v0, |v0|
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call float @llvm.fabs.f32(float %src)
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
ret float %rsq_clamp
}

define double @v_rsq_clamp_f64(double %src) #0 {
; SI-LABEL: v_rsq_clamp_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s4, -1
; VI-NEXT: s_mov_b32 s5, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_mov_b32 s5, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
}

define double @v_rsq_clamp_fabs_f64(double %src) #0 {
; SI-LABEL: v_rsq_clamp_fabs_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e64 v[0:1], |v[0:1]|
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_fabs_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e64 v[0:1], |v[0:1]|
; VI-NEXT: s_mov_b32 s4, -1
; VI-NEXT: s_mov_b32 s5, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_mov_b32 s5, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
%fabs.src = call double @llvm.fabs.f64(double %src)
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
ret double %rsq_clamp
}

define float @v_rsq_clamp_undef_f32() #0 {
; SI-LABEL: v_rsq_clamp_undef_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e32 v0, s4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_undef_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e32 v0, s4
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
ret float %rsq_clamp
}

define double @v_rsq_clamp_undef_f64() #0 {
; SI-LABEL: v_rsq_clamp_undef_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_undef_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], s[4:5]
; VI-NEXT: s_mov_b32 s4, -1
; VI-NEXT: s_mov_b32 s5, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_mov_b32 s5, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
ret double %rsq_clamp
}

define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
; SI-LABEL: v_rsq_clamp_f32_non_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f32_non_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f32_e32 v0, v0
; VI-NEXT: v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT: v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
ret float %rsq_clamp
}

define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
; SI-LABEL: v_rsq_clamp_f64_non_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_rsq_clamp_f64_e32 v[0:1], v[0:1]
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f64_non_ieee:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_rsq_f64_e32 v[0:1], v[0:1]
; VI-NEXT: s_mov_b32 s4, -1
; VI-NEXT: s_mov_b32 s5, 0x7fefffff
; VI-NEXT: v_min_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_mov_b32 s5, 0xffefffff
; VI-NEXT: v_max_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
ret double %rsq_clamp
}

declare float @llvm.fabs.f32(float) #1
declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.amdgcn.rsq.clamp.f64(double) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-ieee"="false" }

0 comments on commit 5a50352

Please sign in to comment.