Conversation

ronlieb (Contributor) commented Nov 16, 2025

Reverts #167909

llvmbot (Member) commented Nov 16, 2025

@llvm/pr-subscribers-backend-nvptx
@llvm/pr-subscribers-llvm-selectiondag
@llvm/pr-subscribers-backend-amdgpu
Author: None (ronlieb)

Changes

Reverts llvm/llvm-project#167909


Patch is 44.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168292.diff

12 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+6-10)
  • (modified) llvm/test/CodeGen/AMDGPU/load-select-ptr.ll (+48-49)
  • (modified) llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll (+12-15)
  • (modified) llvm/test/CodeGen/AMDGPU/select-vectors.ll (+29-34)
  • (modified) llvm/test/CodeGen/AMDGPU/select64.ll (+2-2)
  • (modified) llvm/test/CodeGen/NVPTX/bf16-instructions.ll (+6-7)
  • (modified) llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll (+5-7)
  • (modified) llvm/test/CodeGen/NVPTX/bug22246.ll (+8-9)
  • (modified) llvm/test/CodeGen/NVPTX/fast-math.ll (+28-36)
  • (modified) llvm/test/CodeGen/NVPTX/i1-select.ll (+34-43)
  • (modified) llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (+5-7)
  • (modified) llvm/test/CodeGen/NVPTX/lower-byval-args.ll (+38-19)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6fbac0f8c8cdf..c9513611e6dcb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -29033,9 +29033,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
         // over-conservative. It would be beneficial to be able to remember
         // both potential memory locations.  Since we are discarding
         // src value info, don't do the transformation if the memory
-        // locations are not in the same address space.
-        LLD->getPointerInfo().getAddrSpace() !=
-            RLD->getPointerInfo().getAddrSpace() ||
+        // locations are not in the default address space.
+        LLD->getPointerInfo().getAddrSpace() != 0 ||
+        RLD->getPointerInfo().getAddrSpace() != 0 ||
         // We can't produce a CMOV of a TargetFrameIndex since we won't
         // generate the address generation required.
         LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
@@ -29117,9 +29117,6 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     // but the new load must be the minimum (most restrictive) alignment of the
     // inputs.
     Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
-    unsigned AddrSpace = LLD->getAddressSpace();
-    assert(AddrSpace == RLD->getAddressSpace());
-
     MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
     if (!RLD->isInvariant())
       MMOFlags &= ~MachineMemOperand::MOInvariant;
@@ -29128,16 +29125,15 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
       // FIXME: Discards pointer and AA info.
       Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
-                         LLD->getChain(), Addr, MachinePointerInfo(AddrSpace),
-                         Alignment, MMOFlags);
+                         LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
+                         MMOFlags);
     } else {
       // FIXME: Discards pointer and AA info.
       Load = DAG.getExtLoad(
           LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
                                                   : LLD->getExtensionType(),
           SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
-          MachinePointerInfo(AddrSpace), LLD->getMemoryVT(), Alignment,
-          MMOFlags);
+          MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
     }
 
     // Users of the select now use the result of the load.
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 5aabad682ad30..d9ad9590d9762 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -7,31 +7,27 @@
 define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
 ; GCN-LABEL: select_ptr_crash_i64_flat:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x28
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x50
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x78
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_load_dword s2, s[8:9], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x78
-; GCN-NEXT:    s_add_u32 s4, s8, 40
-; GCN-NEXT:    s_addc_u32 s3, s9, 0
-; GCN-NEXT:    s_add_u32 s5, s8, 0x50
-; GCN-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_eq_u32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s3, s3, s6
-; GCN-NEXT:    s_cselect_b32 s2, s4, s5
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NEXT:    s_cmp_eq_u32 s6, 0
+; GCN-NEXT:    s_cselect_b32 s0, s0, s2
+; GCN-NEXT:    s_cselect_b32 s1, s1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_add_u32 s0, s0, 4
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_mov_b32_e32 v1, s3
-; GCN-NEXT:    s_add_u32 s2, s2, 4
+; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
-; GCN-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    flat_load_dword v1, v[1:2]
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
@@ -49,28 +45,25 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p
 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr addrspace(1) %ptr0, [8 x i32], ptr addrspace(1) %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
 ; GCN-LABEL: select_ptr_crash_i64_global:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x28
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x50
+; GCN-NEXT:    s_load_dword s6, s[8:9], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x78
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_load_dword s2, s[8:9], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x78
-; GCN-NEXT:    s_add_u32 s4, s8, 40
-; GCN-NEXT:    s_addc_u32 s3, s9, 0
-; GCN-NEXT:    s_add_u32 s5, s8, 0x50
-; GCN-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_eq_u32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s3, s3, s6
-; GCN-NEXT:    s_cselect_b32 s2, s4, s5
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_cmp_eq_u32 s6, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GCN-NEXT:    s_cselect_b32 s1, s1, s3
+; GCN-NEXT:    s_cselect_b32 s0, s0, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
   %tmp3 = load i64, ptr addrspace(1) %ptr0, align 8
@@ -85,18 +78,22 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3)
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s1, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    ds_read_b64 v[0:1], v0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    ds_read_b64 v[2:3], v2
+; GCN-NEXT:    s_cmp_eq_u32 s0, 0
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
@@ -115,20 +112,22 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_addk_i32 s1, 0x80
-; GCN-NEXT:    s_addk_i32 s2, 0x200
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    ds_read_b64 v[0:1], v0 offset:128
+; GCN-NEXT:    ds_read_b64 v[2:3], v2 offset:512
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b32 s0, s1, s2
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    ds_read_b64 v[0:1], v0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
index cc5ae2717faf0..423fb7d52d3e3 100644
--- a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
@@ -22,12 +22,12 @@ define i32 @select_load_i32_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %
 ; CHECK-LABEL: select_load_i32_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    global_load_dword v5, v[1:2], off
+; CHECK-NEXT:    global_load_dword v6, v[3:4], off
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT:    global_load_dword v0, v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i32, ptr addrspace(1) %a
   %ld1 = load i32, ptr addrspace(1) %b
@@ -39,11 +39,12 @@ define i32 @select_load_i32_p3(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %
 ; CHECK-LABEL: select_load_i32_p3:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ds_read_b32 v1, v1
+; CHECK-NEXT:    ds_read_b32 v2, v2
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; CHECK-NEXT:    ds_read_b32 v0, v0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i32, ptr addrspace(3) %a
   %ld1 = load i32, ptr addrspace(3) %b
@@ -89,12 +90,12 @@ define i8 @select_load_i8_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %b)
 ; CHECK-LABEL: select_load_i8_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off
+; CHECK-NEXT:    global_load_ubyte v6, v[3:4], off
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT:    global_load_ubyte v0, v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i8, ptr addrspace(1) %a
   %ld1 = load i8, ptr addrspace(1) %b
@@ -106,16 +107,12 @@ define i32 @select_load_i32_p1_offset(i1 %cond, ptr addrspace(1) %a, ptr addrspa
 ; CHECK-LABEL: select_load_i32_p1_offset:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_add_co_u32_e32 v3, vcc, 0x100, v1
-; CHECK-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
-; CHECK-NEXT:    v_add_co_u32_e32 v5, vcc, 0x200, v1
+; CHECK-NEXT:    global_load_dword v3, v[1:2], off offset:256
+; CHECK-NEXT:    global_load_dword v4, v[1:2], off offset:512
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; CHECK-NEXT:    global_load_dword v0, v[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %gep.a = getelementptr i8, ptr addrspace(1) %a, i64 256
   %gep.b = getelementptr i8, ptr addrspace(1) %a, i64 512
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index e754f665c5f43..bee00f6efbd12 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -16,9 +16,9 @@
 ; SelectionDAGBuilder for some reason changes the select type.
 ; VI: s_cselect_b64
 ; VI: v_cndmask_b32
-define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
-  %b = load <2 x i8>, ptr addrspace(4) %b.ptr, align 2
+  %b = load <2 x i8>, ptr addrspace(1) %b.ptr, align 2
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
   store <2 x i8> %select, ptr addrspace(1) %out, align 2
@@ -28,9 +28,9 @@ define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-LABEL: {{^}}v_select_v4i8:
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <4 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <4 x i8>, ptr addrspace(4) %b.ptr
+  %b = load <4 x i8>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %select, ptr addrspace(1) %out, align 4
@@ -41,9 +41,9 @@ define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <8 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <8 x i8>, ptr addrspace(4) %b.ptr
+  %b = load <8 x i8>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
   store <8 x i8> %select, ptr addrspace(1) %out, align 4
@@ -56,9 +56,9 @@ define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <16 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <16 x i8>, ptr addrspace(4) %b.ptr
+  %b = load <16 x i8>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
   store <16 x i8> %select, ptr addrspace(1) %out, align 4
@@ -93,16 +93,13 @@ define amdgpu_kernel void @select_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2
 }
 
 ; GCN-LABEL: {{^}}v_select_v2i16:
-; GCN: {{buffer|flat|global}}_load_dword v
-; GCN: {{buffer|flat|global}}_load_dword v
+; GCN: buffer_load_dword v
+; GCN: buffer_load_dword v
 ; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.a = getelementptr <2 x i16>, ptr addrspace(1) %a.ptr, i32 %id
-  %gep.b = getelementptr <2 x i16>, ptr addrspace(4) %b.ptr, i32 %id
-  %a = load <2 x i16>, ptr addrspace(1) %gep.a
-  %b = load <2 x i16>, ptr addrspace(4) %gep.b
+define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+  %a = load <2 x i16>, ptr addrspace(1) %a.ptr
+  %b = load <2 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
   store <2 x i16> %select, ptr addrspace(1) %out, align 4
@@ -117,9 +114,9 @@ define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1
 ; VI: s_cselect_b64
 ; GFX9: cndmask
 ; GFX9: cndmask
-define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <3 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <3 x i16>, ptr addrspace(4) %b.ptr
+  %b = load <3 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
   store <3 x i16> %select, ptr addrspace(1) %out, align 4
@@ -130,9 +127,9 @@ define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <4 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <4 x i16>, ptr addrspace(4) %b.ptr
+  %b = load <4 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %select, ptr addrspace(1) %out, align 4
@@ -145,9 +142,9 @@ define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <8 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <8 x i16>, ptr addrspace(4) %b.ptr
+  %b = load <8 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
   store <8 x i16> %select, ptr addrspace(1) %out, align 4
@@ -164,9 +161,9 @@ define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <16 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <16 x i16>, ptr addrspace(4) %b.ptr
+  %b = load <16 x i16>, ptr addrspace(1) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <16 x i16> %a, <16 x i16> %b
   store <16 x i16> %select, ptr addrspace(1) %out, align 4
@@ -191,9 +188,9 @@ define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
   %a = load <32 x i16>, ptr addrspace(1) %a.ptr
-  %b...
[truncated]
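
For context on the DAGCombiner hunk above: SimplifySelectOps can fold a select between two loaded values into a single load from a selected pointer. The reverted change (#167909) permitted that fold whenever both loads shared an address space, preserving it via MachinePointerInfo(AddrSpace); this revert restores the older behavior of folding only when both loads are in the default address space, since the rebuilt MachinePointerInfo() carries no address-space information. A minimal LLVM IR sketch of the affected pattern, modeled on the select-load-to-load-select-ptr-combine.ll tests in the diff (the function name here is illustrative, not taken from the patch):

; Two loads feed a select. When the fold fires, SelectionDAG instead
; selects between the pointers and emits a single load; after this
; revert, that only happens for default (address space 0) pointers.
define i32 @select_load_sketch(i1 %cond, ptr %a, ptr %b) {
  %ld0 = load i32, ptr %a
  %ld1 = load i32, ptr %b
  %v = select i1 %cond, i32 %ld0, i32 %ld1
  ret i32 %v
}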

ronlieb merged commit 6d5f87f into main Nov 16, 2025
14 checks passed
ronlieb deleted the revert-167909-users/arsenm/dag/ptr-select-combine-addrspace branch November 16, 2025 23:35
jayfoad (Contributor) commented Nov 17, 2025

As the Developer Policy says: "The commit message for the reverting commit should explain why patch is being reverted."
