[AMDGPU] Make getDWordFromOffset robust against exotic types #70153

jrbyrnes · 2023-10-25T01:45:49Z

Passing MVT::getIntegerVT(Src.getValueSizeInBits()) into getBitcastedAnyExtOrTrunc with, for example, v6i6 results in trying to produce i96 bitcast which causes problems.

Instead, get the relevant sub-dword bits, then getBitcastedAnyExtOrTrunc into MVT::i32

llvmbot · 2023-10-25T01:47:01Z

@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Passing MVT::getIntegerVT(Src.getValueSizeInBits()) into getBitcastedAnyExtOrTrunc with, for example, v6i6 results in trying to produce i96 bitcast which causes problems.

Instead, get the relevant sub-dword bits, then getBitcastedAnyExtOrTrunc into MVT::i32

Full diff: https://github.com/llvm/llvm-project/pull/70153.diff

2 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31-16)
(modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+164-18)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c97486437ed83f1..9e1e0780f19ac8b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11230,26 +11230,41 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                   unsigned DWordOffset) {
   SDValue Ret;
-  if (Src.getValueSizeInBits() <= 32)
+
+  auto ValueSize = Src.getValueSizeInBits().getFixedValue();
+
+  if (ValueSize <= 32)
     return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
 
-  if (Src.getValueSizeInBits() >= 256) {
-    assert(!(Src.getValueSizeInBits() % 32));
-    Ret = DAG.getBitcast(
-        MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src);
+  if (!(ValueSize % 32)) {
+    Ret = DAG.getBitcast(MVT::getVectorVT(MVT::i32, ValueSize / 32), Src);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret,
                        DAG.getConstant(DWordOffset, SL, MVT::i32));
   }
 
-  Ret = DAG.getBitcastedAnyExtOrTrunc(
-      Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits()));
-  if (DWordOffset) {
-    auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
-                               DAG.getConstant(DWordOffset * 32, SL, MVT::i32));
-    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
-  }
+  // ByteProvider must be at least 8 bits
+  assert(!(ValueSize % 8));
+
+  auto BaseSize = ValueSize % 16 ? 8 : 16;
+  auto NumElements = ValueSize / BaseSize;
+  auto Trunc32Elements = (BaseSize * NumElements) / 32;
+  auto NormalizedTrunc = Trunc32Elements * 32 / BaseSize;
+  auto NumElementsIn32 = 32 / BaseSize;
+  auto NumAvailElements = DWordOffset < Trunc32Elements
+                              ? NumElementsIn32
+                              : NumElements - NormalizedTrunc;
 
-  return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
+  SmallVector<SDValue, 4> VecSrcs;
+  for (int i = 0; i < (int)NumAvailElements; i++) {
+    auto ElIdx = DWordOffset * NumElementsIn32 + i;
+    VecSrcs.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
+                                  MVT::getIntegerVT(BaseSize), Src,
+                                  DAG.getConstant(ElIdx, SL, MVT::i32)));
+  }
+  Ret = DAG.getBuildVector(
+      MVT::getVectorVT(MVT::getIntegerVT(BaseSize), NumAvailElements), SL,
+      VecSrcs);
+  return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
 }
 
 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
@@ -11320,10 +11335,10 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   SDValue OtherOp =
       SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op;
 
-  if (SecondSrc)
+  if (SecondSrc) {
     OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
-
-  assert(Op.getValueSizeInBits() == 32);
+    assert(OtherOp.getValueSizeInBits() == 32);
+  }
 
   if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
 
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index a8d53856c7c6161..a5a05829d6e76a7 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3409,21 +3409,21 @@ define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 ; GFX10-LABEL: extract_hilo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off
+; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3060505
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3060505
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_hilo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[2:3], off
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3060505
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3451,21 +3451,21 @@ define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 ; GFX10-LABEL: extract_lohi:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off
-; GFX10-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x70404
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x70404
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_lohi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off
-; GFX9-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x70404
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3493,21 +3493,21 @@ define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 ; GFX10-LABEL: extract_hihi:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x2070505
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x2070505
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_hihi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2070505
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3664,3 +3664,149 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
   store i32 %res, ptr addrspace(1) %out0, align 4
   ret void
 }
+
+; Should not result in crash
+define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v6i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:6
+; GFX10-NEXT:    global_load_ushort v3, v[0:1], off
+; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:2
+; GFX10-NEXT:    global_load_ushort v9, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshl_or_b32 v0, v8, 16, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v9
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    global_store_dword v[6:7], v1, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v6i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v2, v[0:1], off offset:6
+; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_ushort v9, v[0:1], off offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshl_or_b32 v1, v9, 16, v3
+; GFX9-NEXT:    global_store_dword v[4:5], v1, off
+; GFX9-NEXT:    global_store_dword v[6:7], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <6 x i16>, ptr addrspace(1) %in0, align 2
+  %el0 = extractelement <6 x i16> %vec, i32 0
+  %el1 = extractelement <6 x i16> %vec, i32 1
+  %el2 = extractelement <6 x i16> %vec, i32 2
+  %el3 = extractelement <6 x i16> %vec, i32 3
+  %z0 = zext i16 %el0 to i32
+  %z1 = zext i16 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+  %z2 = zext i16 %el2 to i32
+  %z3 = zext i16 %el3 to i32
+  %s3 = shl nuw i32 %z3, 16
+  %o1 = or i32 %z2, %s3
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  store i32 %o1, ptr addrspace(1) %out1, align 4
+  ret void
+}
+
+
+; Should not result in crash
+define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v7i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    global_store_dword v[6:7], v1, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v7i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    global_store_dword v[6:7], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <7 x i16>, ptr addrspace(1) %in0, align 2
+  %el0 = extractelement <7 x i16> %vec, i32 0
+  %el1 = extractelement <7 x i16> %vec, i32 1
+  %el2 = extractelement <7 x i16> %vec, i32 2
+  %el3 = extractelement <7 x i16> %vec, i32 3
+  %z0 = zext i16 %el0 to i32
+  %z1 = zext i16 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+  %z2 = zext i16 %el2 to i32
+  %z3 = zext i16 %el3 to i32
+  %s3 = shl nuw i32 %z3, 16
+  %o1 = or i32 %z2, %s3
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  store i32 %o1, ptr addrspace(1) %out1, align 4
+  ret void
+}
+
+; Should not result in crash
+define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v13i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_bfe_u32 v0, v2, 8, 8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v8
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040c00
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040c03
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    global_store_dword v[6:7], v1, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v13i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040c00
+; GFX9-NEXT:    s_mov_b32 s5, 0x5040c03
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_bfe_u32 v0, v2, 8, 8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v8
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s5
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    global_store_dword v[6:7], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <13 x i8>, ptr addrspace(1) %in0, align 2
+  %el0 = extractelement <13 x i8> %vec, i32 0
+  %el1 = extractelement <13 x i8> %vec, i32 1
+  %el2 = extractelement <13 x i8> %vec, i32 7
+  %el3 = extractelement <13 x i8> %vec, i32 8
+  %z0 = zext i8 %el0 to i32
+  %z1 = zext i8 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+  %z2 = zext i8 %el2 to i32
+  %z3 = zext i8 %el3 to i32
+  %s3 = shl nuw i32 %z3, 16
+  %o1 = or i32 %z2, %s3
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  store i32 %o1, ptr addrspace(1) %out1, align 4
+  ret void
+}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Change-Id: Iace875bd241ace9153a50e65526ad4724f3889e7

jrbyrnes · 2023-10-25T03:44:50Z

Just extract from the existing vector instead of trying to convert to EVT vector first.

llvmbot added the backend:AMDGPU label Oct 25, 2023

jrbyrnes requested a review from arsenm October 25, 2023 01:46

arsenm reviewed Oct 25, 2023

View reviewed changes

llvm/lib/Target/AMDGPU/SIISelLowering.cpp Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/SIISelLowering.cpp Outdated Show resolved Hide resolved

[AMDGPU] Make getDWordFromOffset robust against exotic types

5c09345

Change-Id: Iace875bd241ace9153a50e65526ad4724f3889e7

jrbyrnes force-pushed the FixArbRebase0 branch from b487f62 to 5c09345 Compare October 25, 2023 03:43

jrbyrnes closed this Oct 25, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Make getDWordFromOffset robust against exotic types #70153

[AMDGPU] Make getDWordFromOffset robust against exotic types #70153

jrbyrnes commented Oct 25, 2023

llvmbot commented Oct 25, 2023

jrbyrnes commented Oct 25, 2023

[AMDGPU] Make getDWordFromOffset robust against exotic types #70153

[AMDGPU] Make getDWordFromOffset robust against exotic types #70153

Conversation

jrbyrnes commented Oct 25, 2023

llvmbot commented Oct 25, 2023

jrbyrnes commented Oct 25, 2023