Skip to content

Commit

Permalink
[AMDGPU] SDWA operands should not intersect with potential MIs
Browse files Browse the repository at this point in the history
Summary:
There should be no intesection between SDWA operands and potential MIs. E.g.:
```
v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
v_add_u32 v3, v4, v2
```
In that example it is possible that we would fold 2nd instruction into 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was already destroyed). So if SDWAOperand is also a potential MI then do not apply it.

Reviewers: vpykhtin, arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye

Differential Revision: https://reviews.llvm.org/D32804

llvm-svn: 303347
  • Loading branch information
SamWot committed May 18, 2017
1 parent d19632f commit ebfdaf7
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 13 deletions.
45 changes: 32 additions & 13 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Expand Up @@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include <unordered_map>
#include <unordered_set>

using namespace llvm;

Expand All @@ -44,26 +45,29 @@ namespace {
class SDWAOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;

private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;

std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;

Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
static char ID;

typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;

SIPeepholeSDWA() : MachineFunctionPass(ID) {
initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
}

bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineFunction &MF);
bool isConvertibleToSDWA(const MachineInstr &MI) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);

StringRef getPassName() const override { return "SI Peephole SDWA"; }
Expand Down Expand Up @@ -468,7 +472,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {

if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
auto SDWADst =
make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
SDWAOperands[&MI] = std::move(SDWADst);
++NumSDWAPatternsFound;
Expand Down Expand Up @@ -575,8 +579,7 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
}
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
// Check if this instruction can be converted to SDWA:
// 1. Does this opcode support SDWA
if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
Expand All @@ -588,6 +591,11 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
return false;
}

return true;
}

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
// Convert to sdwa
int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
assert(SDWAOpcode != -1);
Expand Down Expand Up @@ -664,7 +672,18 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// Apply all sdwa operand pattenrs
bool Converted = false;
for (auto &Operand : SDWAOperands) {
Converted |= Operand->convertToSDWA(*SDWAInst, TII);
// There should be no intesection between SDWA operands and potential MIs
// e.g.:
// v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
// v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
// v_add_u32 v3, v4, v2
//
// In that example it is possible that we would fold 2nd instruction into 3rd
// (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
// already destroyed). So if SDWAOperand is also a potential MI then do not
// apply it.
if (PotentialMatches.count(Operand->getParentInst()) == 0)
Converted |= Operand->convertToSDWA(*SDWAInst, TII);
}
if (!Converted) {
SDWAInst->eraseFromParent();
Expand All @@ -690,16 +709,15 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
TII = ST.getInstrInfo();

std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;


// Find all SDWA operands in MF.
matchSDWAOperands(MF);

for (auto &OperandPair : SDWAOperands) {
auto &Operand = OperandPair.second;
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
if (PotentialMI) {
PotentialMatches[PotentialMI].push_back(std::move(Operand));
if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}

Expand All @@ -708,6 +726,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
convertToSDWA(PotentialMI, PotentialPair.second);
}

PotentialMatches.clear();
SDWAOperands.clear();
return false;
}
50 changes: 50 additions & 0 deletions llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
Expand Up @@ -393,3 +393,53 @@ store_label:
store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
ret void
}


; Check that "pulling out" SDWA operands works correctly.
; GCN-LABEL: {{^}}pulled_out_test:
; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_and_b32_sdwa
; NOSDWA-NOT: v_or_b32_sdwa

; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD

define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
entry:
%idxprom = ashr exact i64 15, 32
%arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom
%tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8

%tmp1 = extractelement <8 x i8> %tmp, i32 0
%tmp2 = extractelement <8 x i8> %tmp, i32 1
%tmp3 = extractelement <8 x i8> %tmp, i32 2
%tmp4 = extractelement <8 x i8> %tmp, i32 3
%tmp5 = extractelement <8 x i8> %tmp, i32 4
%tmp6 = extractelement <8 x i8> %tmp, i32 5
%tmp7 = extractelement <8 x i8> %tmp, i32 6
%tmp8 = extractelement <8 x i8> %tmp, i32 7

%tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0
%tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1
%tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0
%tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1
%tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0
%tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1
%tmp15 = insertelement <2 x i8> undef, i8 %tmp7, i32 0
%tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1

%tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

%arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom
store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8
ret void
}

0 comments on commit ebfdaf7

Please sign in to comment.