Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Make getDWordFromOffset robust against exotic types #70153

Closed
wants to merge 1 commit into from

Conversation

jrbyrnes
Copy link
Contributor

Passing MVT::getIntegerVT(Src.getValueSizeInBits()) into getBitcastedAnyExtOrTrunc with, for example, v6i6 results in trying to produce i96 bitcast which causes problems.

Instead, get the relevant sub-dword bits, then getBitcastedAnyExtOrTrunc into MVT::i32

@llvmbot
Copy link
Collaborator

llvmbot commented Oct 25, 2023

@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Passing MVT::getIntegerVT(Src.getValueSizeInBits()) into getBitcastedAnyExtOrTrunc with, for example, v6i6 results in trying to produce i96 bitcast which causes problems.

Instead, get the relevant sub-dword bits, then getBitcastedAnyExtOrTrunc into MVT::i32


Full diff: https://github.com/llvm/llvm-project/pull/70153.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31-16)
  • (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+164-18)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c97486437ed83f1..9e1e0780f19ac8b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11230,26 +11230,41 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                   unsigned DWordOffset) {
   SDValue Ret;
-  if (Src.getValueSizeInBits() <= 32)
+
+  auto ValueSize = Src.getValueSizeInBits().getFixedValue();
+
+  if (ValueSize <= 32)
     return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
 
-  if (Src.getValueSizeInBits() >= 256) {
-    assert(!(Src.getValueSizeInBits() % 32));
-    Ret = DAG.getBitcast(
-        MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src);
+  if (!(ValueSize % 32)) {
+    Ret = DAG.getBitcast(MVT::getVectorVT(MVT::i32, ValueSize / 32), Src);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret,
                        DAG.getConstant(DWordOffset, SL, MVT::i32));
   }
 
-  Ret = DAG.getBitcastedAnyExtOrTrunc(
-      Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits()));
-  if (DWordOffset) {
-    auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
-                               DAG.getConstant(DWordOffset * 32, SL, MVT::i32));
-    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted);
-  }
+  // ByteProvider must be at least 8 bits
+  assert(!(ValueSize % 8));
+
+  auto BaseSize = ValueSize % 16 ? 8 : 16;
+  auto NumElements = ValueSize / BaseSize;
+  auto Trunc32Elements = (BaseSize * NumElements) / 32;
+  auto NormalizedTrunc = Trunc32Elements * 32 / BaseSize;
+  auto NumElementsIn32 = 32 / BaseSize;
+  auto NumAvailElements = DWordOffset < Trunc32Elements
+                              ? NumElementsIn32
+                              : NumElements - NormalizedTrunc;
 
-  return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
+  SmallVector<SDValue, 4> VecSrcs;
+  for (int i = 0; i < (int)NumAvailElements; i++) {
+    auto ElIdx = DWordOffset * NumElementsIn32 + i;
+    VecSrcs.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
+                                  MVT::getIntegerVT(BaseSize), Src,
+                                  DAG.getConstant(ElIdx, SL, MVT::i32)));
+  }
+  Ret = DAG.getBuildVector(
+      MVT::getVectorVT(MVT::getIntegerVT(BaseSize), NumAvailElements), SL,
+      VecSrcs);
+  return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
 }
 
 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
@@ -11320,10 +11335,10 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   SDValue OtherOp =
       SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op;
 
-  if (SecondSrc)
+  if (SecondSrc) {
     OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
-
-  assert(Op.getValueSizeInBits() == 32);
+    assert(OtherOp.getValueSizeInBits() == 32);
+  }
 
   if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
 
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index a8d53856c7c6161..a5a05829d6e76a7 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -3409,21 +3409,21 @@ define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 ; GFX10-LABEL: extract_hilo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v7, v[2:3], off
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off
+; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x3060505
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x3060505
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_hilo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v7, v[2:3], off
+; GFX9-NEXT:    global_load_dword v6, v[2:3], off
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x3060505
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3451,21 +3451,21 @@ define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 ; GFX10-LABEL: extract_lohi:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off
-; GFX10-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x70404
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x70404
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_lohi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off
-; GFX9-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x70404
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3493,21 +3493,21 @@ define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
 ; GFX10-LABEL: extract_hihi:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX10-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX10-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v6, v7, 0x2070505
+; GFX10-NEXT:    v_perm_b32 v0, v7, v6, 0x2070505
 ; GFX10-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: extract_hihi:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v6, v[0:1], off offset:4
-; GFX9-NEXT:    global_load_dword v7, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v6, v[2:3], off offset:4
+; GFX9-NEXT:    global_load_dword v7, v[0:1], off offset:4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2070505
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT:    v_perm_b32 v0, v7, v6, s4
 ; GFX9-NEXT:    global_store_dword v[4:5], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -3664,3 +3664,149 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p
   store i32 %res, ptr addrspace(1) %out0, align 4
   ret void
 }
+
+; Should not result in crash
+define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v6i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x3
+; GFX10-NEXT:    global_load_ushort v2, v[0:1], off offset:6
+; GFX10-NEXT:    global_load_ushort v3, v[0:1], off
+; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:2
+; GFX10-NEXT:    global_load_ushort v9, v[0:1], off offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_lshl_or_b32 v0, v8, 16, v3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 16, v9
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    global_store_dword v[6:7], v1, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v6i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v2, v[0:1], off offset:6
+; GFX9-NEXT:    global_load_ushort v3, v[0:1], off
+; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:4
+; GFX9-NEXT:    global_load_ushort v9, v[0:1], off offset:2
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshl_or_b32 v1, v9, 16, v3
+; GFX9-NEXT:    global_store_dword v[4:5], v1, off
+; GFX9-NEXT:    global_store_dword v[6:7], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <6 x i16>, ptr addrspace(1) %in0, align 2
+  %el0 = extractelement <6 x i16> %vec, i32 0
+  %el1 = extractelement <6 x i16> %vec, i32 1
+  %el2 = extractelement <6 x i16> %vec, i32 2
+  %el3 = extractelement <6 x i16> %vec, i32 3
+  %z0 = zext i16 %el0 to i32
+  %z1 = zext i16 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+  %z2 = zext i16 %el2 to i32
+  %z3 = zext i16 %el3 to i32
+  %s3 = shl nuw i32 %z3, 16
+  %o1 = or i32 %z2, %s3
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  store i32 %o1, ptr addrspace(1) %out1, align 4
+  ret void
+}
+
+
+; Should not result in crash
+define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v7i16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    global_store_dword v[6:7], v1, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v7i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    global_store_dword v[6:7], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <7 x i16>, ptr addrspace(1) %in0, align 2
+  %el0 = extractelement <7 x i16> %vec, i32 0
+  %el1 = extractelement <7 x i16> %vec, i32 1
+  %el2 = extractelement <7 x i16> %vec, i32 2
+  %el3 = extractelement <7 x i16> %vec, i32 3
+  %z0 = zext i16 %el0 to i32
+  %z1 = zext i16 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+  %z2 = zext i16 %el2 to i32
+  %z3 = zext i16 %el3 to i32
+  %s3 = shl nuw i32 %z3, 16
+  %o1 = or i32 %z2, %s3
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  store i32 %o1, ptr addrspace(1) %out1, align 4
+  ret void
+}
+
+; Should not result in crash
+define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX10-LABEL: extract_v13i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    global_load_ushort v8, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_bfe_u32 v0, v2, 8, 8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v8
+; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040c00
+; GFX10-NEXT:    v_perm_b32 v1, v1, v3, 0x5040c03
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:    global_store_dword v[6:7], v1, off
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: extract_v13i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    global_load_ushort v8, v[0:1], off offset:8
+; GFX9-NEXT:    s_mov_b32 s4, 0x5040c00
+; GFX9-NEXT:    s_mov_b32 s5, 0x5040c03
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_bfe_u32 v0, v2, 8, 8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v8
+; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT:    v_perm_b32 v1, v1, v3, s5
+; GFX9-NEXT:    global_store_dword v[4:5], v0, off
+; GFX9-NEXT:    global_store_dword v[6:7], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %vec = load <13 x i8>, ptr addrspace(1) %in0, align 2
+  %el0 = extractelement <13 x i8> %vec, i32 0
+  %el1 = extractelement <13 x i8> %vec, i32 1
+  %el2 = extractelement <13 x i8> %vec, i32 7
+  %el3 = extractelement <13 x i8> %vec, i32 8
+  %z0 = zext i8 %el0 to i32
+  %z1 = zext i8 %el1 to i32
+  %s1 = shl nuw i32 %z1, 16
+  %o0 = or i32 %s1, %z0
+  %z2 = zext i8 %el2 to i32
+  %z3 = zext i8 %el3 to i32
+  %s3 = shl nuw i32 %z3, 16
+  %o1 = or i32 %z2, %s3
+
+  store i32 %o0, ptr addrspace(1) %out0, align 4
+  store i32 %o1, ptr addrspace(1) %out1, align 4
+  ret void
+}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp Outdated Show resolved Hide resolved
llvm/lib/Target/AMDGPU/SIISelLowering.cpp Outdated Show resolved Hide resolved
Change-Id: Iace875bd241ace9153a50e65526ad4724f3889e7
@jrbyrnes
Copy link
Contributor Author

Just extract from the existing vector instead of trying to convert to EVT vector first.

@jrbyrnes jrbyrnes closed this Oct 25, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants