Skip to content

Conversation

@arsenm
Copy link
Contributor

@arsenm arsenm commented Nov 13, 2025

No description provided.

Copy link
Contributor Author

arsenm commented Nov 13, 2025

@llvmbot
Copy link
Member

llvmbot commented Nov 13, 2025

@llvm/pr-subscribers-backend-nvptx
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-selectiondag

Author: Matt Arsenault (arsenm)

Changes

Patch is 44.32 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167909.diff

12 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+10-6)
  • (modified) llvm/test/CodeGen/AMDGPU/load-select-ptr.ll (+49-48)
  • (modified) llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll (+15-12)
  • (modified) llvm/test/CodeGen/AMDGPU/select-vectors.ll (+34-29)
  • (modified) llvm/test/CodeGen/AMDGPU/select64.ll (+2-2)
  • (modified) llvm/test/CodeGen/NVPTX/bf16-instructions.ll (+7-6)
  • (modified) llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll (+7-5)
  • (modified) llvm/test/CodeGen/NVPTX/bug22246.ll (+9-8)
  • (modified) llvm/test/CodeGen/NVPTX/fast-math.ll (+36-28)
  • (modified) llvm/test/CodeGen/NVPTX/i1-select.ll (+43-34)
  • (modified) llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (+7-5)
  • (modified) llvm/test/CodeGen/NVPTX/lower-byval-args.ll (+19-38)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index df353c4d91b1a..c5f3cd29f684e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -29031,9 +29031,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
         // over-conservative. It would be beneficial to be able to remember
         // both potential memory locations.  Since we are discarding
         // src value info, don't do the transformation if the memory
-        // locations are not in the default address space.
-        LLD->getPointerInfo().getAddrSpace() != 0 ||
-        RLD->getPointerInfo().getAddrSpace() != 0 ||
+        // locations are not in the same address space.
+        LLD->getPointerInfo().getAddrSpace() !=
+            RLD->getPointerInfo().getAddrSpace() ||
         // We can't produce a CMOV of a TargetFrameIndex since we won't
         // generate the address generation required.
         LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
@@ -29115,6 +29115,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     // but the new load must be the minimum (most restrictive) alignment of the
     // inputs.
     Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
+    unsigned AddrSpace = LLD->getAddressSpace();
+    assert(AddrSpace == RLD->getAddressSpace());
+
     MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
     if (!RLD->isInvariant())
       MMOFlags &= ~MachineMemOperand::MOInvariant;
@@ -29123,15 +29126,16 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
       // FIXME: Discards pointer and AA info.
       Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
-                         LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
-                         MMOFlags);
+                         LLD->getChain(), Addr, MachinePointerInfo(AddrSpace),
+                         Alignment, MMOFlags);
     } else {
       // FIXME: Discards pointer and AA info.
       Load = DAG.getExtLoad(
           LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
                                                   : LLD->getExtensionType(),
           SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
-          MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
+          MachinePointerInfo(AddrSpace), LLD->getMemoryVT(), Alignment,
+          MMOFlags);
     }
 
     // Users of the select now use the result of the load.
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index d9ad9590d9762..5aabad682ad30 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -7,27 +7,31 @@
 define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
 ; GCN-LABEL: select_ptr_crash_i64_flat:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[8:9], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x28
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x50
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x78
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_load_dword s2, s[8:9], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x78
+; GCN-NEXT:    s_add_u32 s4, s8, 40
+; GCN-NEXT:    s_addc_u32 s3, s9, 0
+; GCN-NEXT:    s_add_u32 s5, s8, 0x50
+; GCN-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_eq_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, s2
-; GCN-NEXT:    s_cselect_b32 s1, s1, s3
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_add_u32 s0, s0, 4
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cselect_b32 s3, s3, s6
+; GCN-NEXT:    s_cselect_b32 s2, s4, s5
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_add_u32 s2, s2, 4
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NEXT:    flat_load_dword v1, v[1:2]
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
@@ -45,25 +49,28 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p
 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr addrspace(1) %ptr0, [8 x i32], ptr addrspace(1) %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
 ; GCN-LABEL: select_ptr_crash_i64_global:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x28
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x50
-; GCN-NEXT:    s_load_dword s6, s[8:9], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x78
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_load_dword s2, s[8:9], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x78
+; GCN-NEXT:    s_add_u32 s4, s8, 40
+; GCN-NEXT:    s_addc_u32 s3, s9, 0
+; GCN-NEXT:    s_add_u32 s5, s8, 0x50
+; GCN-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cselect_b32 s3, s3, s6
+; GCN-NEXT:    s_cselect_b32 s2, s4, s5
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NEXT:    s_cmp_eq_u32 s6, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cselect_b32 s1, s1, s3
-; GCN-NEXT:    s_cselect_b32 s0, s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
   %tmp3 = load i64, ptr addrspace(1) %ptr0, align 8
@@ -78,22 +85,18 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3)
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    ds_read_b64 v[0:1], v0
-; GCN-NEXT:    ds_read_b64 v[2:3], v2
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT:    s_cselect_b32 s0, s1, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ds_read_b64 v[0:1], v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
@@ -112,22 +115,20 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    ds_read_b64 v[0:1], v0 offset:128
-; GCN-NEXT:    ds_read_b64 v[2:3], v2 offset:512
+; GCN-NEXT:    s_addk_i32 s1, 0x80
+; GCN-NEXT:    s_addk_i32 s2, 0x200
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT:    s_cselect_b32 s0, s1, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ds_read_b64 v[0:1], v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
index 423fb7d52d3e3..cc5ae2717faf0 100644
--- a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
@@ -22,12 +22,12 @@ define i32 @select_load_i32_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %
 ; CHECK-LABEL: select_load_i32_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dword v5, v[1:2], off
-; CHECK-NEXT:    global_load_dword v6, v[3:4], off
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    global_load_dword v0, v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i32, ptr addrspace(1) %a
   %ld1 = load i32, ptr addrspace(1) %b
@@ -39,12 +39,11 @@ define i32 @select_load_i32_p3(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %
 ; CHECK-LABEL: select_load_i32_p3:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v1, v1
-; CHECK-NEXT:    ds_read_b32 v2, v2
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; CHECK-NEXT:    ds_read_b32 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i32, ptr addrspace(3) %a
   %ld1 = load i32, ptr addrspace(3) %b
@@ -90,12 +89,12 @@ define i8 @select_load_i8_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %b)
 ; CHECK-LABEL: select_load_i8_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v6, v[3:4], off
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    global_load_ubyte v0, v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i8, ptr addrspace(1) %a
   %ld1 = load i8, ptr addrspace(1) %b
@@ -107,12 +106,16 @@ define i32 @select_load_i32_p1_offset(i1 %cond, ptr addrspace(1) %a, ptr addrspa
 ; CHECK-LABEL: select_load_i32_p1_offset:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dword v3, v[1:2], off offset:256
-; CHECK-NEXT:    global_load_dword v4, v[1:2], off offset:512
+; CHECK-NEXT:    v_add_co_u32_e32 v3, vcc, 0x100, v1
+; CHECK-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_add_co_u32_e32 v5, vcc, 0x200, v1
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; CHECK-NEXT:    global_load_dword v0, v[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %gep.a = getelementptr i8, ptr addrspace(1) %a, i64 256
   %gep.b = getelementptr i8, ptr addrspace(1) %a, i64 512
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index bee00f6efbd12..e754f665c5f43 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -16,9 +16,9 @@
 ; SelectionDAGBuilder for some reason changes the select type.
 ; VI: s_cselect_b64
 ; VI: v_cndmask_b32
-define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
-  %b = load <2 x i8>, ptr addrspace(1) %b.ptr, align 2
+  %b = load <2 x i8>, ptr addrspace(4) %b.ptr, align 2
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
   store <2 x i8> %select, ptr addrspace(1) %out, align 2
@@ -28,9 +28,9 @@ define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-LABEL: {{^}}v_select_v4i8:
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <4 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <4 x i8>, ptr addrspace(1) %b.ptr
+  %b = load <4 x i8>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %select, ptr addrspace(1) %out, align 4
@@ -41,9 +41,9 @@ define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <8 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <8 x i8>, ptr addrspace(1) %b.ptr
+  %b = load <8 x i8>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
   store <8 x i8> %select, ptr addrspace(1) %out, align 4
@@ -56,9 +56,9 @@ define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <16 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <16 x i8>, ptr addrspace(1) %b.ptr
+  %b = load <16 x i8>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
   store <16 x i8> %select, ptr addrspace(1) %out, align 4
@@ -93,13 +93,16 @@ define amdgpu_kernel void @select_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2
 }
 
 ; GCN-LABEL: {{^}}v_select_v2i16:
-; GCN: buffer_load_dword v
-; GCN: buffer_load_dword v
+; GCN: {{buffer|flat|global}}_load_dword v
+; GCN: {{buffer|flat|global}}_load_dword v
 ; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
-  %a = load <2 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <2 x i16>, ptr addrspace(1) %b.ptr
+define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.a = getelementptr <2 x i16>, ptr addrspace(1) %a.ptr, i32 %id
+  %gep.b = getelementptr <2 x i16>, ptr addrspace(4) %b.ptr, i32 %id
+  %a = load <2 x i16>, ptr addrspace(1) %gep.a
+  %b = load <2 x i16>, ptr addrspace(4) %gep.b
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
   store <2 x i16> %select, ptr addrspace(1) %out, align 4
@@ -114,9 +117,9 @@ define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; VI: s_cselect_b64
 ; GFX9: cndmask
 ; GFX9: cndmask
-define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <3 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <3 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <3 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
   store <3 x i16> %select, ptr addrspace(1) %out, align 4
@@ -127,9 +130,9 @@ define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <4 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <4 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <4 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %select, ptr addrspace(1) %out, align 4
@@ -142,9 +145,9 @@ define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <8 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <8 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <8 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
   store <8 x i16> %select, ptr addrspace(1) %out, align 4
@@ -161,9 +164,9 @@ define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <16 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <16 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <16 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <16 x i16> %a, <16 x i16> %b
   store <16 x i16> %select, ptr addrspace(1) %out, align 4
@@ -188,9 +191,9 @@ define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <32 x i16>, ptr addr...
[truncated]

Copy link
Contributor

@shiltian shiltian left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. The change looks pretty mechanical but quite surprised to see this was not done already.

@arsenm
Copy link
Contributor Author

arsenm commented Nov 13, 2025

Interesting. The change looks pretty mechanical but quite surprised to see this was not done already.

I've had this sitting in a branch since 2018

Base automatically changed from users/arsenm/amdgpu/baseline-test-load-select-combine to main November 13, 2025 18:17
@arsenm arsenm force-pushed the users/arsenm/dag/ptr-select-combine-addrspace branch from 780323f to 57cf94b Compare November 13, 2025 18:18
@arsenm arsenm enabled auto-merge (squash) November 13, 2025 18:36
@arsenm arsenm merged commit e5f499f into main Nov 13, 2025
9 of 10 checks passed
@arsenm arsenm deleted the users/arsenm/dag/ptr-select-combine-addrspace branch November 13, 2025 18:58
@ronlieb
Copy link
Contributor

ronlieb commented Nov 16, 2025

This PR is breaking a few select related tests in the oclConformance
conformance/2.0/relationals/test_relationals relational_select_signed

Initializing random seed to 0.
Requesting Default device based on command line for platform index 0 and device index 0
Compute Device Name = gfx90a:sramecc+:xnack-, Compute Device Vendor = Advanced Micro Devices, Inc., Compute Device Version = OpenCL 2.0
, CL C Version = OpenCL C 2.0
Supports single precision denormals: YES
sizeof( void*) = 8 (host)
sizeof( void*) = 8 (device)
relational_select_signed...
ERROR: Data sample 1:0 does not validate! Expected (0x160d68cd), got (0x04ff885a) from (0x38ca7033) and (0x160d68cd) with test (0xd8d86
f1e)
inA: 0x33 0x70 0xca 0x38 0x23 0x29 0xf5 0x46 0x61 0xb3 0x6d 0xbf 0xcd 0x59 0xd7 0x07
inB: 0xcd 0x68 0x0d 0x16 0xd9 0x47 0xad 0x65 0x5e 0x00 0xd6 0xea 0xec 0x19 0xaa 0x33
inC: 0x1e 0x6f 0xd8 0xd8 0xe8 0x6b 0x40 0x5d 0xfb 0x35 0xb6 0x77 0xfd 0x8e 0x1a 0xf8
exp: 0xcd 0x68 0x0d 0x16 0x00 0x00 0x00 0x00 0xe0 0x3c 0x13 0x24 0x2d 0x7f 0x00 0x00
got: 0x5a 0x88 0xff 0x04 0x5a 0x88 0xff 0x04 0x5a 0x88 0xff 0x04 0x5a 0x88 0xff 0x04 Vector int1, test vector int1 FAILED
ERROR: Data sample 1:0 does not validate! Expected (0x56f8521f), got (0x6c9e2097) from (0x997c80e0) and (0x56f8521f) with test (0xac65c
7b3)
inA: 0xe0 0x80 0x7c 0x99 0x2e 0xc0 0xa9 0x1d 0x85 0xb3 0x9a 0xa1 0xc4 0x46 0x83 0xd3
inB: 0x1f 0x52 0xf8 0x56 0x86 0x8c 0x87 0xac 0x58 0x1f 0x2e 0xbf 0x21 0xf7 0xe5 0xe5
inC: 0xb3 0xc7 0x65 0xac 0x8e 0xba 0x2d 0x7e 0x03 0xeb 0x31 0x01 0xd1 0xf4 0xb9 0xa8
exp: 0x1f 0x52 0xf8 0x56 0x86 0x8c 0x87 0xac 0xe0 0x3c 0x13 0x24 0x2d 0x7f 0x00 0x00
got: 0x97 0x20 0x9e 0x6c 0x20 0xc5 0x96 0x64 0x97 0x20 0x9e 0x6c 0x20 0xc5 0x96 0x64 Vector long1, test vector long1 FAILED
Testing doubles.
relational_select_signed FAILED
FAILED test.

ronlieb added a commit to ROCm/llvm-project that referenced this pull request Nov 16, 2025
…167909)"

breaks oclConformance
conformance/2.0/relationals/test_relationals relational_select_signed

This reverts commit e5f499f.
ronlieb added a commit that referenced this pull request Nov 16, 2025
llvm-sync bot pushed a commit to arm/arm-toolchain that referenced this pull request Nov 17, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants