Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
Expand All @@ -34,9 +35,17 @@

using namespace llvm;
using namespace AMDGPU;
using namespace llvm::MIPatternMatch;

namespace {

// AMDGPU-specific pattern matchers
/// Pattern-match helper: wraps the sub-pattern \p Src in a UnaryOp_match keyed
/// on the G_AMDGPU_READANYLANE opcode, so it can be composed with the generic
/// MIPatternMatch matchers (e.g. inside mi_match).
template <typename SrcTy>
inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>
m_GAMDGPUReadAnyLane(const SrcTy &Src) {
  using ReadAnyLaneMatch = UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>;
  return ReadAnyLaneMatch(Src);
}

class AMDGPURegBankLegalize : public MachineFunctionPass {
public:
static char ID;
Expand Down Expand Up @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) {

Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) {
// Src = G_AMDGPU_READANYLANE RALSrc
auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
if (RAL)
Register RALSrc;
if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc))))
return RALSrc;

// TruncSrc = G_AMDGPU_READANYLANE RALSrc
// AextSrc = G_TRUNC TruncSrc
// Src = G_ANYEXT AextSrc
if (mi_match(Src, MRI,
m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) {
return RALSrc;
}

// LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
// LoSgpr = G_AMDGPU_READANYLANE LoVgpr
// HiSgpr = G_AMDGPU_READANYLANE HiVgpr
Expand Down
22 changes: 22 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
MI.eraseFromParent();
}

/// Scalarize a two-source V2S16 instruction: rebuild it as two S16
/// instructions with the same opcode and flags (one for the low halves, one
/// for the high halves), merge the two results back into the original
/// destination register, and erase \p MI.
void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();
  const auto Flags = MI.getFlags();
  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == V2S16);

  // Break each source into a (lo, hi) pair of wider values.
  // NOTE(review): presumably 32-bit any-extended halves, going by the helper
  // name; confirm against unpackAExt.
  auto [LhsLo32, LhsHi32] = unpackAExt(MI.getOperand(1).getReg());
  auto [RhsLo32, RhsHi32] = unpackAExt(MI.getOperand(2).getReg());

  // Narrow every half to S16 on the SGPR bank. The emission order
  // (lhs lo, lhs hi, rhs lo, rhs hi) is kept deliberately.
  auto TruncToS16 = [&](const auto &Wide) {
    return B.buildTrunc(SgprRB_S16, Wide);
  };
  auto LhsLo = TruncToS16(LhsLo32);
  auto LhsHi = TruncToS16(LhsHi32);
  auto RhsLo = TruncToS16(RhsLo32);
  auto RhsHi = TruncToS16(RhsHi32);

  // Apply the original operation per half, then reassemble the result.
  auto LoRes = B.buildInstr(Opc, {SgprRB_S16}, {LhsLo, RhsLo}, Flags);
  auto HiRes = B.buildInstr(Opc, {SgprRB_S16}, {LhsHi, RhsHi}, Flags);
  B.buildMergeLikeInstr(Dst, {LoRes, HiRes});
  MI.eraseFromParent();
}

void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
Expand Down Expand Up @@ -688,6 +705,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
return lowerUnpackBitShift(MI);
case UnpackMinMax:
return lowerUnpackMinMax(MI);
case ScalarizeToS16:
return lowerSplitTo16(MI);
case Ext32To64: {
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
MachineInstrBuilder Hi;
Expand Down Expand Up @@ -837,6 +856,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::scalar(32);
case Sgpr64:
case Vgpr64:
case UniInVgprS64:
return LLT::scalar(64);
case Sgpr128:
case Vgpr128:
Expand Down Expand Up @@ -960,6 +980,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVcc:
case UniInVgprS16:
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
Expand Down Expand Up @@ -1092,6 +1113,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class RegBankLegalizeHelper {
static constexpr LLT P6 = LLT::pointer(6, 32);

MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32};
MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16};
MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32};
MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1};

Expand Down Expand Up @@ -121,6 +122,7 @@ class RegBankLegalizeHelper {
void lowerV_BFE(MachineInstr &MI);
void lowerS_BFE(MachineInstr &MI);
void lowerSplitTo32(MachineInstr &MI);
void lowerSplitTo16(MachineInstr &MI);
void lowerSplitTo32Select(MachineInstr &MI);
void lowerSplitTo32SExtInReg(MachineInstr &MI);
void lowerUnpackMinMax(MachineInstr &MI);
Expand Down
13 changes: 12 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -906,9 +906,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
bool hasSALUFloat = ST->hasSALUFloatInsts();

addRulesForGOpcs({G_FADD}, Standard)
.Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
.Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
.Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
.Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16},
hasSALUFloat)
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});

addRulesForGOpcs({G_FPTOUI})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
V4S32,

UniV2S16,
UniV2S32,

DivV2S16,
DivV2S32,

// B types
B32,
Expand Down Expand Up @@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
UniInVcc,
UniInVgprS16,
UniInVgprS32,
UniInVgprS64,
UniInVgprV2S16,
UniInVgprV2S32,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
Expand Down Expand Up @@ -217,6 +221,7 @@ enum LoweringMethodID {
V_BFE,
VgprToVccCopy,
SplitTo32,
ScalarizeToS16,
SplitTo32Select,
SplitTo32SExtInReg,
Ext32To64,
Expand Down
165 changes: 165 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s

; Scalar half fadd with both operands passed inreg (they show up as s0/s1 in
; the checks). The gfx1100 runs expect a VALU v_add_f16; the gfx1200 runs
; expect a scalar s_add_f16 whose result is then copied into v0.
define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
; GFX11-FAKE16-LABEL: fadd_s16_uniform:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: fadd_s16_uniform:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_s16_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_f16 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd half %a, %b
ret half %fadd
}

; Scalar half fadd with non-inreg operands (they show up as v0/v1 in the
; checks). Every run configuration expects a single v_add_f16; the real-true16
; runs address the low 16-bit register halves (.l).
define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
; GFX11-FAKE16-LABEL: fadd_s16_div:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: fadd_s16_div:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX12-FAKE16-LABEL: fadd_s16_div:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
; GFX12-FAKE16-NEXT: ; return to shader part epilog
;
; GFX12-TRUE16-LABEL: fadd_s16_div:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
; GFX12-TRUE16-NEXT: ; return to shader part epilog
%fadd = fadd half %a, %b
ret half %fadd
}

; Scalar float fadd with both operands passed inreg (s0/s1 in the checks).
; The gfx1100 runs expect a VALU v_add_f32; the gfx1200 runs expect a scalar
; s_add_f32 whose result is then copied into v0.
define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
; GFX11-LABEL: fadd_s32_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_s32_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_f32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd float %a, %b
ret float %fadd
}

; Scalar float fadd with non-inreg operands (v0/v1 in the checks). All run
; configurations share one expectation: a single v_add_f32.
define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
; GCN-LABEL: fadd_s32_div:
; GCN: ; %bb.0:
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: ; return to shader part epilog
%fadd = fadd float %a, %b
ret float %fadd
}

; Double fadd with both operands passed inreg (SGPR pairs s[0:1] and s[2:3] in
; the checks); the sum is stored through %ptr. Both targets are expected to use
; a VALU v_add_f64 reading the SGPR pairs directly, followed by a global store.
define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) {
; GFX11-LABEL: fadd_s64_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fadd_s64_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3]
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
%fadd = fadd double %a, %b
store double %fadd, ptr addrspace(1) %ptr
ret void
}

; Double fadd with non-inreg operands (VGPR pairs v[0:1] and v[2:3] in the
; checks); the sum is stored through %ptr. Both targets are expected to emit
; one v_add_f64 followed by a global store.
define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) {
; GFX11-LABEL: fadd_s64_div:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fadd_s64_div:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX12-NEXT: s_endpgm
%fadd = fadd double %a, %b
store double %fadd, ptr addrspace(1) %ptr
ret void
}

; <2 x half> fadd with both operands passed inreg (s0/s1 in the checks).
; The gfx1100 runs expect a packed VALU v_pk_add_f16. The gfx1200 runs expect
; the vector to be scalarized: the high halves are shifted out with
; s_lshr_b32, each half is added with s_add_f16, and the two results are
; repacked with s_pack_ll_b32_b16 before being copied into v0.
define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
; GFX11-LABEL: fadd_v2s16_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_v2s16_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
; GFX12-NEXT: s_add_f16 s0, s0, s1
; GFX12-NEXT: s_add_f16 s1, s2, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd <2 x half> %a, %b
ret <2 x half> %fadd
}

; <2 x half> fadd with non-inreg operands (v0/v1 in the checks). All run
; configurations share one expectation: a single packed v_pk_add_f16.
define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
; GCN-LABEL: fadd_v2s16_div:
; GCN: ; %bb.0:
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
; GCN-NEXT: ; return to shader part epilog
%fadd = fadd <2 x half> %a, %b
ret <2 x half> %fadd
}

; <2 x float> fadd with both operands passed inreg (s0..s3 in the checks).
; The expected code adds the two lanes separately: two v_add_f32 on gfx1100,
; and two s_add_f32 on gfx1200 followed by copies of both results into v0/v1.
define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
; GFX11-LABEL: fadd_v2s32_uniform:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: fadd_v2s32_uniform:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_add_f32 s0, s0, s2
; GFX12-NEXT: s_add_f32 s1, s1, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: ; return to shader part epilog
%fadd = fadd <2 x float> %a, %b
ret <2 x float> %fadd
}

; <2 x float> fadd with non-inreg operands (v0..v3 in the checks). All run
; configurations share one expectation: both lane additions are issued as one
; dual-issue v_dual_add_f32 pair.
define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
; GCN-LABEL: fadd_v2s32_div:
; GCN: ; %bb.0:
; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
; GCN-NEXT: ; return to shader part epilog
%fadd = fadd <2 x float> %a, %b
ret <2 x float> %fadd
}