-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[AMDGPU][SDAG] Add missing cases for SI_INDIRECT_SRC/DST #170323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][SDAG] Add missing cases for SI_INDIRECT_SRC/DST #170323
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Juan Manuel Martinez Caamaño (jmmartinez) Changes: Before this patch, […] To solve this issue, we allow this expansion with […]. Patch is 413.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170323.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a4f376aceaa4b..80ea9fc0789d5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15336,22 +15336,6 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
}
}
- // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
- if (shouldExpandVectorDynExt(N)) {
- SDLoc SL(N);
- SDValue Idx = N->getOperand(1);
- SDValue V;
- for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
- SDValue IC = DAG.getVectorIdxConstant(I, SL);
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
- if (I == 0)
- V = Elt;
- else
- V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
- }
- return V;
- }
-
if (!DCI.isBeforeLegalize())
return SDValue();
@@ -15393,19 +15377,45 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N,
return SDValue();
}
-SDValue
-SITargetLowering::performInsertVectorEltCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
+SDValue SITargetLowering::performExtractVectorDynEltCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ if (!shouldExpandVectorDynExt(N))
+ return SDValue();
+
SDValue Vec = N->getOperand(0);
- SDValue Idx = N->getOperand(2);
+ SelectionDAG &DAG = DCI.DAG;
+
EVT VecVT = Vec.getValueType();
- EVT EltVT = VecVT.getVectorElementType();
+ EVT ResVT = N->getValueType(0);
+
+ SDLoc SL(N);
+ SDValue Idx = N->getOperand(1);
+ SDValue V;
+ for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+ SDValue IC = DAG.getVectorIdxConstant(I, SL);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
+ if (I == 0)
+ V = Elt;
+ else
+ V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
+ }
+ return V;
+}
- // INSERT_VECTOR_ELT (<n x e>, var-idx)
- // => BUILD_VECTOR n x select (e, const-idx)
+// INSERT_VECTOR_ELT (<n x e>, var-idx)
+// => BUILD_VECTOR n x select (e, const-idx)
+SDValue
+SITargetLowering::performInsertVectorDynEltCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
if (!shouldExpandVectorDynExt(N))
return SDValue();
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
SDValue Ins = N->getOperand(1);
@@ -16943,12 +16953,21 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
return Res;
break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (SDValue V = performExtractVectorDynEltCombine(N, DCI))
+ return V;
+ break;
+ case ISD::INSERT_VECTOR_ELT:
+ if (SDValue V = performInsertVectorDynEltCombine(N, DCI))
+ return V;
+ break;
default:
break;
}
- if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
+ if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
return SDValue();
+ }
switch (N->getOpcode()) {
case ISD::ADD:
@@ -17063,8 +17082,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
}
case ISD::EXTRACT_VECTOR_ELT:
return performExtractVectorEltCombine(N, DCI);
- case ISD::INSERT_VECTOR_ELT:
- return performInsertVectorEltCombine(N, DCI);
case ISD::FP_ROUND:
return performFPRoundCombine(N, DCI);
case ISD::LOAD: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 40c03ca024c6c..55e883e4c78c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -223,7 +223,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performExtractVectorDynEltCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
+ SDValue performInsertVectorDynEltCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index c69b0cce3d208..a4037c817c359 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=fiji < %s | FileCheck --check-prefixes=GCN-O0 %s
define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: float4_extelt:
@@ -20,6 +21,36 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: float4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s3, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 1.0
+; GCN-O0-NEXT: s_mov_b32 s6, 0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s6
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 2.0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s4, 4.0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dword v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
store float %ext, ptr addrspace(1) %out
@@ -43,6 +74,28 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: int4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s1, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT: s_mov_b32 s4, 0
+; GCN-O0-NEXT: s_cselect_b32 s4, s1, s4
+; GCN-O0-NEXT: s_mov_b32 s1, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT: s_cselect_b32 s1, s1, s4
+; GCN-O0-NEXT: s_mov_b32 s4, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4
+; GCN-O0-NEXT: s_mov_b32 s0, 4
+; GCN-O0-NEXT: s_cselect_b32 s0, s0, s1
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
+; GCN-O0-NEXT: flat_store_dword v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
store i32 %ext, ptr addrspace(1) %out
@@ -72,6 +125,56 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: double4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s5, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3f847ae1
+; GCN-O0-NEXT: s_mov_b32 s8, 0x47ae147b
+; GCN-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
+; GCN-O0-NEXT: s_mov_b32 s9, s2
+; GCN-O0-NEXT: s_mov_b32 s3, s9
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3ff028f5
+; GCN-O0-NEXT: s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s4, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s4
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s4, s8
+; GCN-O0-NEXT: s_mov_b32 s2, s6
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, 0x4000147a
+; GCN-O0-NEXT: s_mov_b32 s6, 0xe147ae14
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s6
+; GCN-O0-NEXT: s_mov_b32 s8, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, 0x40100a3d
+; GCN-O0-NEXT: s_mov_b32 s6, 0x70a3d70a
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s8, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s3, s6
+; GCN-O0-NEXT: s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
store double %ext, ptr addrspace(1) %out
@@ -109,6 +212,65 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: double5_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s5, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s4, 0x3f847ae1
+; GCN-O0-NEXT: s_mov_b32 s2, 0x47ae147b
+; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
+; GCN-O0-NEXT: s_mov_b32 s3, s4
+; GCN-O0-NEXT: s_mov_b32 s8, s2
+; GCN-O0-NEXT: s_mov_b32 s4, 0x3ff028f5
+; GCN-O0-NEXT: s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s4
+; GCN-O0-NEXT: s_mov_b32 s4, s6
+; GCN-O0-NEXT: s_mov_b32 s9, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s9
+; GCN-O0-NEXT: s_cselect_b32 s4, s4, s8
+; GCN-O0-NEXT: ; kill: def $sgpr3 killed $sgpr3 killed $sgpr2_sgpr3
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, 0x4000147a
+; GCN-O0-NEXT: s_mov_b32 s6, 0xe147ae14
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s8, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, s6
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, 0x40100a3d
+; GCN-O0-NEXT: s_mov_b32 s6, 0x70a3d70a
+; GCN-O0-NEXT: s_mov_b32 s8, s6
+; GCN-O0-NEXT: s_mov_b32 s9, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s8
+; GCN-O0-NEXT: s_mov_b32 s7, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s7
+; GCN-O0-NEXT: s_cselect_b32 s4, s2, s4
+; GCN-O0-NEXT: s_mov_b32 s2, s9
+; GCN-O0-NEXT: s_cselect_b32 s3, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s2, 0x40140a3d
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s8, 4
+; GCN-O0-NEXT: s_cmp_eq_u32 s5, s8
+; GCN-O0-NEXT: s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT: s_mov_b32 s3, s6
+; GCN-O0-NEXT: s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
store double %ext, ptr addrspace(1) %out
@@ -130,6 +292,25 @@ define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: half4_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s4, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s5, 0x44004200
+; GCN-O0-NEXT: s_mov_b32 s0, 0x40003c00
+; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; GCN-O0-NEXT: s_mov_b32 s1, s5
+; GCN-O0-NEXT: s_mov_b32 s5, 4
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_lshl_b32 s4, s4, s5
+; GCN-O0-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s0
+; GCN-O0-NEXT: flat_store_short v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
store half %ext, ptr addrspace(1) %out
@@ -149,6 +330,24 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: float2_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s3, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s4, 1.0
+; GCN-O0-NEXT: s_mov_b32 s5, 0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s5
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dword v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
store float %ext, ptr addrspace(1) %out
@@ -172,6 +371,36 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: double2_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3f847ae1
+; GCN-O0-NEXT: s_mov_b32 s4, 0x47ae147b
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
+; GCN-O0-NEXT: s_mov_b32 s5, s2
+; GCN-O0-NEXT: s_mov_b32 s3, s5
+; GCN-O0-NEXT: s_mov_b32 s2, 0x3ff028f5
+; GCN-O0-NEXT: s_mov_b32 s6, 0xc28f5c29
+; GCN-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
+; GCN-O0-NEXT: s_mov_b32 s7, s2
+; GCN-O0-NEXT: s_mov_b32 s2, s7
+; GCN-O0-NEXT: s_mov_b32 s9, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s8, s9
+; GCN-O0-NEXT: s_cselect_b32 s2, s2, s3
+; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
+; GCN-O0-NEXT: s_mov_b32 s3, s6
+; GCN-O0-NEXT: s_cselect_b32 s3, s3, s4
+; GCN-O0-NEXT: v_mov_b32_e32 v2, s3
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s2
+; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GCN-O0-NEXT: v_mov_b32_e32 v3, v0
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
store double %ext, ptr addrspace(1) %out
@@ -217,6 +446,60 @@ define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: half8_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s3, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4000
+; GCN-O0-NEXT: s_mov_b32 s6, 0x3c00
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s6
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 2
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4200
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 3
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4400
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 4
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4500
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 5
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4600
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 6
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s3, 0x4700
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s3
+; GCN-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GCN-O0-NEXT: s_mov_b32 s3, 7
+; GCN-O0-NEXT: s_cmp_eq_u32 s2, s3
+; GCN-O0-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-O0-NEXT: s_mov_b32 s4, 0x4800
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
+; GCN-O0-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[2:3]
+; GCN-O0-NEXT: v_mov_b32_e32 v0, s0
+; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
+; GCN-O0-NEXT: flat_store_short v[0:1], v2
+; GCN-O0-NEXT: s_endpgm
entry:
%ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
store half %ext, ptr addrspace(1) %out
@@ -248,6 +531,39 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
+;
+; GCN-O0-LABEL: short8_extelt:
+; GCN-O0: ; %bb.0: ; %entry
+; GCN-O0-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GCN-O0-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-O0-NEXT: s_mov_b32 s1, 1
+; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s1
+; GCN-O0-NEXT: s_mov_b32 s4, 2
+; GCN-O0-NEXT: s_cselect_b32 s1, s4, s1
+; GCN-O0-NEXT: s_cmp_eq_u32 s0, s4
+; GCN-O0-NEX...
[truncated]
|
Should just fix those to have the patterns? This shouldn't be a mandatory combine |
93d6942 to
0481f7a
Compare
You're right. I thought the problem was there for other "weird" vector sizes but it's not the case (it's only for cases that do not match to a SI_INDIRECT_SRC/DST pattern).
0481f7a to
417134d
Compare
…tract element with a dynamic index
Before, instruction selection would fail to select extract/insert elements for i32/float vectors of sizes 3, 5, 6 and 7 when -O0 was used. This patch adds the missing SI_INDIRECT_SRC/DST cases for those sizes.
417134d to
53a07b2
Compare
|
I've updated the PR and added tests for vector sizes 3, 5, 6 and 7. |
|
Got a buildbot failure that is likely related to this PR: reverting it in #171787 |
…0323) (#171787) ``` Step 7 (test-check-all) failure: Test just built components: check-all completed (failure) ******************** TEST 'LLVM :: CodeGen/AMDGPU/insert_vector_dynelt.ll' FAILED ******************** Exit Code: 1 Command Output (stdout): -- # RUN: at line 2 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -mtriple=amdgcn -mcpu=fiji < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -enable-var-scope -check-prefixes=GCN /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll # executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -mtriple=amdgcn -mcpu=fiji # executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck -enable-var-scope -check-prefixes=GCN /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll # RUN: at line 3 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -O0 -mtriple=amdgcn -mcpu=fiji < /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/FileCheck --check-prefixes=GCN-O0 /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/llvm-project/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll # executed command: /home/buildbot/worker/as-builder-4/ramdisk/expensive-checks/build/bin/llc -O0 -mtriple=amdgcn -mcpu=fiji # .---command stderr------------ # | # | # After Instruction Selection # | # Machine code for function insert_dyn_i32_6: IsSSA, TracksLiveness # | Function Live Ins: $sgpr16 in %8, $sgpr17 in %9, $sgpr18 in %10, $sgpr19 in %11, $sgpr20 in %12, $sgpr21 in %13, $vgpr0 in %14, 
$vgpr1 in %15 # | # | bb.0 (%ir-block.0): # | successors: %bb.1(0x80000000); %bb.1(100.00%) # | liveins: $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $vgpr0, $vgpr1 # | %15:vgpr_32 = COPY $vgpr1 # | %14:vgpr_32 = COPY $vgpr0 # | %13:sgpr_32 = COPY $sgpr21 # | %12:sgpr_32 = COPY $sgpr20 # | %11:sgpr_32 = COPY $sgpr19 # | %10:sgpr_32 = COPY $sgpr18 # | %9:sgpr_32 = COPY $sgpr17 # | %8:sgpr_32 = COPY $sgpr16 # | %17:sgpr_192 = REG_SEQUENCE %8:sgpr_32, %subreg.sub0, %9:sgpr_32, %subreg.sub1, %10:sgpr_32, %subreg.sub2, %11:sgpr_32, %subreg.sub3, %12:sgpr_32, %subreg.sub4, %13:sgpr_32, %subreg.sub5 # | %16:sgpr_192 = COPY %17:sgpr_192 # | %19:vreg_192 = COPY %17:sgpr_192 # | %28:sreg_64_xexec = IMPLICIT_DEF # | %27:sreg_64_xexec = S_MOV_B64 $exec # | # | bb.1: # | ; predecessors: %bb.1, %bb.0 # | successors: %bb.1(0x40000000), %bb.3(0x40000000); %bb.1(50.00%), %bb.3(50.00%) # | # | %26:vreg_192 = PHI %19:vreg_192, %bb.0, %18:vreg_192, %bb.1 # | %29:sreg_64 = PHI %28:sreg_64_xexec, %bb.0, %30:sreg_64, %bb.1 # | %31:sreg_32_xm0 = V_READFIRSTLANE_B32 %14:vgpr_32, implicit $exec # | %32:sreg_64 = V_CMP_EQ_U32_e64 %31:sreg_32_xm0, %14:vgpr_32, implicit $exec # | %30:sreg_64 = S_AND_SAVEEXEC_B64 killed %32:sreg_64, implicit-def $exec, implicit-def $scc, implicit $exec # | $m0 = COPY killed %31:sreg_32_xm0 # | %18:vreg_192 = V_INDIRECT_REG_WRITE_MOVREL_B32_V8 %26:vreg_192(tied-def 0), %15:vgpr_32, 3, implicit $m0, implicit $exec # | $exec = S_XOR_B64_term $exec, %30:sreg_64, implicit-def $scc # | S_CBRANCH_EXECNZ %bb.1, implicit $exec # | # | bb.3: ``` This reverts commit 15df9e7.
…d V/S_INDIRECT_REG_WRITE_MOVREL A buildbot failure in #170323 when expensive checks were used highlighted that some of these patterns were missing.
…d V/S_INDIRECT_REG_WRITE_MOVREL A buildbot failure in #170323 when expensive checks were used highlighted that some of these patterns were missing. This patch adds V_INDIRECT_REG_{READ/WRITE}_GPR_IDX and V/S_INDIRECT_REG_WRITE_MOVREL for V6 and V7 vector sizes
…_GPR_IDX and V/S_INDIRECT_REG_WRITE_MOVREL (#171835) A buildbot failure in llvm/llvm-project#170323 when expensive checks were used highlighted that some of these patterns were missing. This patch adds `V_INDIRECT_REG_{READ/WRITE}_GPR_IDX` and `V/S_INDIRECT_REG_WRITE_MOVREL` for `V6` and `V7` vector sizes.
Before this patch,
insertelement/extractelement with dynamic indices would fail to select with
-O0 for vector 32-bit element types with sizes 3, 5, 6 and 7, which did not map to a
SI_INDIRECT_SRC/DST pattern. Other "weird" sizes bigger than 8 (like 13) are properly handled already.
To solve this issue we add the missing patterns for the problematic sizes.
Solves SWDEV-568862