diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index df353c4d91b1a..c5f3cd29f684e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -29031,9 +29031,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
         // over-conservative. It would be beneficial to be able to remember
         // both potential memory locations. Since we are discarding
         // src value info, don't do the transformation if the memory
-        // locations are not in the default address space.
-        LLD->getPointerInfo().getAddrSpace() != 0 ||
-        RLD->getPointerInfo().getAddrSpace() != 0 ||
+        // locations are not in the same address space.
+        LLD->getPointerInfo().getAddrSpace() !=
+            RLD->getPointerInfo().getAddrSpace() ||
         // We can't produce a CMOV of a TargetFrameIndex since we won't
         // generate the address generation required.
         LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
@@ -29115,6 +29115,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     // but the new load must be the minimum (most restrictive) alignment of the
     // inputs.
     Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
+    unsigned AddrSpace = LLD->getAddressSpace();
+    assert(AddrSpace == RLD->getAddressSpace());
+
     MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
     if (!RLD->isInvariant())
       MMOFlags &= ~MachineMemOperand::MOInvariant;
@@ -29123,15 +29126,16 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
       // FIXME: Discards pointer and AA info.
       Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
-                         LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
-                         MMOFlags);
+                         LLD->getChain(), Addr, MachinePointerInfo(AddrSpace),
+                         Alignment, MMOFlags);
     } else {
       // FIXME: Discards pointer and AA info.
       Load = DAG.getExtLoad(
           LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
                                                   : LLD->getExtensionType(),
          SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
-          MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
+          MachinePointerInfo(AddrSpace), LLD->getMemoryVT(), Alignment,
+          MMOFlags);
     }
 
     // Users of the select now use the result of the load.
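The effect of the DAGCombiner change above, sketched in LLVM IR (a hypothetical @example kernel, not one of the tests below; the combine itself runs on the SelectionDAG, so this is only the conceptual shape): a select between two loaded values can now become a single load through a selected pointer in any address space, provided both loads agree on it.

; Input pattern: two loads feeding a select, both from addrspace(3).
define i32 @example(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %b) {
  %ld0 = load i32, ptr addrspace(3) %a
  %ld1 = load i32, ptr addrspace(3) %b
  %sel = select i1 %cond, i32 %ld0, i32 %ld1
  ret i32 %sel
}

; Conceptual result of the combine: select the pointer, then load once.
; The shared address space is stamped on the new MachinePointerInfo,
; which the assert added above guarantees is well defined.
define i32 @example.combined(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %b) {
  %ptr = select i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %b
  %ld = load i32, ptr addrspace(3) %ptr
  ret i32 %ld
}

Previously the transformation fired only when both pointers were in the default address space 0; the AMDGPU and NVPTX test updates below show it firing for the global, LDS, and PTX parameter address spaces.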
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index d9ad9590d9762..5aabad682ad30 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -7,27 +7,31 @@
 define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
 ; GCN-LABEL: select_ptr_crash_i64_flat:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s6, s[8:9], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x28
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x50
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x78
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_load_dword s2, s[8:9], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x78
+; GCN-NEXT:    s_add_u32 s4, s8, 40
+; GCN-NEXT:    s_addc_u32 s3, s9, 0
+; GCN-NEXT:    s_add_u32 s5, s8, 0x50
+; GCN-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_eq_u32 s6, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, s2
-; GCN-NEXT:    s_cselect_b32 s1, s1, s3
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_add_u32 s0, s0, 4
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cselect_b32 s3, s3, s6
+; GCN-NEXT:    s_cselect_b32 s2, s4, s5
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    s_add_u32 s2, s2, 4
 ; GCN-NEXT:    flat_load_dword v0, v[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v2, s1
-; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NEXT:    flat_load_dword v1, v[1:2]
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    v_mov_b32_e32 v3, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
@@ -45,25 +49,28 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p
 define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr addrspace(1) %ptr0, [8 x i32], ptr addrspace(1) %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
 ; GCN-LABEL: select_ptr_crash_i64_global:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x28
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x50
-; GCN-NEXT:    s_load_dword s6, s[8:9], 0x0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x78
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_load_dword s2, s[8:9], 0x0
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x78
+; GCN-NEXT:    s_add_u32 s4, s8, 40
+; GCN-NEXT:    s_addc_u32 s3, s9, 0
+; GCN-NEXT:    s_add_u32 s5, s8, 0x50
+; GCN-NEXT:    s_addc_u32 s6, s9, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GCN-NEXT:    s_cmp_eq_u32 s2, 0
+; GCN-NEXT:    s_cselect_b32 s3, s3, s6
+; GCN-NEXT:    s_cselect_b32 s2, s4, s5
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NEXT:    s_cmp_eq_u32 s6, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cselect_b32 s1, s1, s3
-; GCN-NEXT:    s_cselect_b32 s0, s0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
   %tmp3 = load i64, ptr addrspace(1) %ptr0, align 8
@@ -78,22 +85,18 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3)
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    ds_read_b64 v[0:1], v0
-; GCN-NEXT:    ds_read_b64 v[2:3], v2
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT:    s_cselect_b32 s0, s1, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ds_read_b64 v[0:1], v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
@@ -112,22 +115,20 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_add_i32 s12, s12, s17
 ; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT:    s_mov_b32 m0, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    ds_read_b64 v[0:1], v0 offset:128
-; GCN-NEXT:    ds_read_b64 v[2:3], v2 offset:512
+; GCN-NEXT:    s_addk_i32 s1, 0x80
+; GCN-NEXT:    s_addk_i32 s2, 0x200
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
-; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT:    s_cselect_b32 s0, s1, s2
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ds_read_b64 v[0:1], v0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
   %tmp2 = icmp eq i32 %tmp, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
index 423fb7d52d3e3..cc5ae2717faf0 100644
--- a/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll
@@ -22,12 +22,12 @@ define i32 @select_load_i32_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %
 ; CHECK-LABEL: select_load_i32_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dword v5, v[1:2], off
-; CHECK-NEXT:    global_load_dword v6, v[3:4], off
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    global_load_dword v0, v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i32, ptr addrspace(1) %a
   %ld1 = load i32, ptr addrspace(1) %b
@@ -39,12 +39,11 @@ define i32 @select_load_i32_p3(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %
 ; CHECK-LABEL: select_load_i32_p3:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v1, v1
-; CHECK-NEXT:    ds_read_b32 v2, v2
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; CHECK-NEXT:    ds_read_b32 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i32, ptr addrspace(3) %a
   %ld1 = load i32, ptr addrspace(3) %b
@@ -90,12 +89,12 @@ define i8 @select_load_i8_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %b)
 ; CHECK-LABEL: select_load_i8_p1:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v6, v[3:4], off
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    global_load_ubyte v0, v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v5, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %ld0 = load i8, ptr addrspace(1) %a
   %ld1 = load i8, ptr addrspace(1) %b
@@ -107,12 +106,16 @@ define i32 @select_load_i32_p1_offset(i1 %cond, ptr addrspace(1) %a, ptr addrspa
 ; CHECK-LABEL: select_load_i32_p1_offset:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dword v3, v[1:2], off offset:256
-; CHECK-NEXT:    global_load_dword v4, v[1:2], off offset:512
+; CHECK-NEXT:    v_add_co_u32_e32 v3, vcc, 0x100, v1
+; CHECK-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_add_co_u32_e32 v5, vcc, 0x200, v1
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; CHECK-NEXT:    global_load_dword v0, v[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %gep.a = getelementptr i8, ptr addrspace(1) %a, i64 256
   %gep.b = getelementptr i8, ptr addrspace(1) %a, i64 512
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index bee00f6efbd12..e754f665c5f43 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -16,9 +16,9 @@
 ; SelectionDAGBuilder for some reason changes the select type.
 ; VI: s_cselect_b64
 ; VI: v_cndmask_b32
-define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <2 x i8>, ptr addrspace(1) %a.ptr, align 2
-  %b = load <2 x i8>, ptr addrspace(1) %b.ptr, align 2
+  %b = load <2 x i8>, ptr addrspace(4) %b.ptr, align 2
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
   store <2 x i8> %select, ptr addrspace(1) %out, align 2
@@ -28,9 +28,9 @@ define amdgpu_kernel void @v_select_v2i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-LABEL: {{^}}v_select_v4i8:
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <4 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <4 x i8>, ptr addrspace(1) %b.ptr
+  %b = load <4 x i8>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %select, ptr addrspace(1) %out, align 4
@@ -41,9 +41,9 @@ define amdgpu_kernel void @v_select_v4i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <8 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <8 x i8>, ptr addrspace(1) %b.ptr
+  %b = load <8 x i8>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
   store <8 x i8> %select, ptr addrspace(1) %out, align 4
@@ -56,9 +56,9 @@ define amdgpu_kernel void @v_select_v8i8(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <16 x i8>, ptr addrspace(1) %a.ptr
-  %b = load <16 x i8>, ptr addrspace(1) %b.ptr
+  %b = load <16 x i8>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
   store <16 x i8> %select, ptr addrspace(1) %out, align 4
@@ -93,13 +93,16 @@ define amdgpu_kernel void @select_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2
 }
 
 ; GCN-LABEL: {{^}}v_select_v2i16:
-; GCN: buffer_load_dword v
-; GCN: buffer_load_dword v
+; GCN: {{buffer|flat|global}}_load_dword v
+; GCN: {{buffer|flat|global}}_load_dword v
 ; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
-  %a = load <2 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <2 x i16>, ptr addrspace(1) %b.ptr
+define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.a = getelementptr <2 x i16>, ptr addrspace(1) %a.ptr, i32 %id
+  %gep.b = getelementptr <2 x i16>, ptr addrspace(4) %b.ptr, i32 %id
+  %a = load <2 x i16>, ptr addrspace(1) %gep.a
+  %b = load <2 x i16>, ptr addrspace(4) %gep.b
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
   store <2 x i16> %select, ptr addrspace(1) %out, align 4
@@ -114,9 +117,9 @@ define amdgpu_kernel void @v_select_v2i16(ptr addrspace(1
 ; VI: s_cselect_b64
 ; GFX9: cndmask
 ; GFX9: cndmask
-define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <3 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <3 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <3 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
   store <3 x i16> %select, ptr addrspace(1) %out, align 4
@@ -127,9 +130,9 @@ define amdgpu_kernel void @v_select_v3i16(ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <4 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <4 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <4 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %select, ptr addrspace(1) %out, align 4
@@ -142,9 +145,9 @@ define amdgpu_kernel void @v_select_v4i16(ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <8 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <8 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <8 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
   store <8 x i16> %select, ptr addrspace(1) %out, align 4
@@ -161,9 +164,9 @@ define amdgpu_kernel void @v_select_v8i16(ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <16 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <16 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <16 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <16 x i16> %a, <16 x i16> %b
   store <16 x i16> %select, ptr addrspace(1) %out, align 4
@@ -188,9 +191,9 @@ define amdgpu_kernel void @v_select_v16i16(ptr addrspace(
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <32 x i16>, ptr addrspace(1) %a.ptr
-  %b = load <32 x i16>, ptr addrspace(1) %b.ptr
+  %b = load <32 x i16>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <32 x i16> %a, <32 x i16> %b
   store <32 x i16> %select, ptr addrspace(1) %out, align 4
@@ -333,6 +336,7 @@ bb:
 define amdgpu_kernel void @s_select_v5f32(ptr addrspace(1) %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <5 x float> %a, <5 x float> %b
+  call void asm "; use $0", "s"(<5 x float> %a)
   store <5 x float> %select, ptr addrspace(1) %out, align 16
   ret void
 }
@@ -400,6 +404,7 @@ define amdgpu_kernel void @select_v4f64(ptr addrspace(1) %out, <4 x double> %a,
 ; GCN: s_cselect_b32
 define amdgpu_kernel void @select_v8f64(ptr addrspace(1) %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
+  call void asm "; use $0", "s"(<8 x double> %a)
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
   store <8 x double> %select, ptr addrspace(1) %out, align 16
   ret void
@@ -408,9 +413,9 @@ define amdgpu_kernel void @select_v8f64(ptr addrspace(1) %out, <8 x double> %a,
 ; GCN-LABEL: {{^}}v_select_v2f16:
 ; GCN: v_cndmask_b32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <2 x half>, ptr addrspace(1) %a.ptr
-  %b = load <2 x half>, ptr addrspace(1) %b.ptr
+  %b = load <2 x half>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
   store <2 x half> %select, ptr addrspace(1) %out, align 4
@@ -421,9 +426,9 @@ define amdgpu_kernel void @v_select_v2f16(ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <3 x half>, ptr addrspace(1) %a.ptr
-  %b = load <3 x half>, ptr addrspace(1) %b.ptr
+  %b = load <3 x half>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
   store <3 x half> %select, ptr addrspace(1) %out, align 4
@@ -434,9 +439,9 @@ define amdgpu_kernel void @v_select_v3f16(ptr addrspace(1
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
 ; GCN-NOT: cndmask
-define amdgpu_kernel void @v_select_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, i32 %c) #0 {
+define amdgpu_kernel void @v_select_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(4) %b.ptr, i32 %c) #0 {
   %a = load <4 x half>, ptr addrspace(1) %a.ptr
-  %b = load <4 x half>, ptr addrspace(1) %b.ptr
+  %b = load <4 x half>, ptr addrspace(4) %b.ptr
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
   store <4 x half> %select, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll
index de154b5ced8a8..2013db7bc7be3 100644
--- a/llvm/test/CodeGen/AMDGPU/select64.ll
+++ b/llvm/test/CodeGen/AMDGPU/select64.ll
@@ -40,10 +40,10 @@ define amdgpu_kernel void @select_trunc_i64_2(ptr addrspace(1) %out, i32 %cond,
 ; GCN-LABEL: {{^}}v_select_trunc_i64_2:
 ; GCN: s_cselect_b32
 ; GCN-NOT: s_cselect_b32
-define amdgpu_kernel void @v_select_trunc_i64_2(ptr addrspace(1) %out, i32 %cond, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
+define amdgpu_kernel void @v_select_trunc_i64_2(ptr addrspace(1) %out, i32 %cond, ptr addrspace(1) %aptr, ptr addrspace(4) %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, ptr addrspace(1) %aptr, align 8
-  %b = load i64, ptr addrspace(1) %bptr, align 8
+  %b = load i64, ptr addrspace(4) %bptr, align 8
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
   store i32 %trunc, ptr addrspace(1) %out, align 4
diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
index 3626613cf8511..41f77b5337e6d 100644
--- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll
@@ -768,17 +768,18 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %
 ; CHECK-LABEL: test_select_cc_bf16_f64(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b16 %rs<4>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_select_cc_bf16_f64_param_0];
 ; CHECK-NEXT:    ld.param.b64 %rd2, [test_select_cc_bf16_f64_param_1];
 ; CHECK-NEXT:    setp.lt.f64 %p1, %rd1, %rd2;
-; CHECK-NEXT:    ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2];
-; CHECK-NEXT:    ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3];
-; CHECK-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    mov.b64 %rd3, test_select_cc_bf16_f64_param_3;
+; CHECK-NEXT:    mov.b64 %rd4, test_select_cc_bf16_f64_param_2;
+; CHECK-NEXT:    selp.b64 %rd5, %rd4, %rd3, %p1;
+; CHECK-NEXT:    ld.param.b16 %rs1, [%rd5];
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %cc = fcmp olt double %a, %b
   %r = select i1 %cc, bfloat %c, bfloat %d
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index 3c6fb4b7517b8..c19e66559af86 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -229,16 +229,18 @@ define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [test_select_param_2];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.b32 %r1, [test_select_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [test_select_param_1];
-; CHECK-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    mov.b64 %rd1, test_select_param_1;
+; CHECK-NEXT:    mov.b64 %rd2, test_select_param_0;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT:    ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %r = select i1 %c, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %r
diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll
index 198878c1b96ff..1d7a396f694dc 100644
--- a/llvm/test/CodeGen/NVPTX/bug22246.ll
+++ b/llvm/test/CodeGen/NVPTX/bug22246.ll
@@ -9,19 +9,20 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc
 ; CHECK-LABEL: _Z3foobbbPb(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
-; CHECK-NEXT:    .reg .b16 %rs<7>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ld.param.b8 %rs1, [_Z3foobbbPb_param_0];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.b8 %rs3, [_Z3foobbbPb_param_1];
-; CHECK-NEXT:    ld.param.b8 %rs4, [_Z3foobbbPb_param_2];
-; CHECK-NEXT:    selp.b16 %rs5, %rs3, %rs4, %p1;
-; CHECK-NEXT:    and.b16 %rs6, %rs5, 1;
-; CHECK-NEXT:    ld.param.b64 %rd1, [_Z3foobbbPb_param_3];
-; CHECK-NEXT:    st.b8 [%rd1], %rs6;
+; CHECK-NEXT:    mov.b64 %rd1, _Z3foobbbPb_param_2;
+; CHECK-NEXT:    mov.b64 %rd2, _Z3foobbbPb_param_1;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT:    ld.param.b8 %rs3, [%rd3];
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT:    ld.param.b64 %rd4, [_Z3foobbbPb_param_3];
+; CHECK-NEXT:    st.b8 [%rd4], %rs4;
 ; CHECK-NEXT:    ret;
 entry:
   %.sink.v = select i1 %p1, i1 %p2, i1 %p3
diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
index 8561c60a46948..7e778c40b8302 100644
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -312,18 +312,20 @@ define float @repeated_div_recip_allowed_sel(i1 %pred, float %a, float %b, float
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_sel_param_0];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_sel_param_1];
-; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_2];
-; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_recip_allowed_sel_param_3];
-; CHECK-NEXT:    div.rn.f32 %r5, %r3, %r4;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    mov.b64 %rd1, repeated_div_recip_allowed_sel_param_2;
+; CHECK-NEXT:    mov.b64 %rd2, repeated_div_recip_allowed_sel_param_1;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT:    ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_3];
+; CHECK-NEXT:    div.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
@@ -364,18 +366,20 @@ define float @repeated_div_recip_allowed_ftz_sel(i1 %pred, float %a, float %b, f
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_ftz_sel_param_0];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_ftz_sel_param_1];
-; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_2];
-; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_recip_allowed_ftz_sel_param_3];
-; CHECK-NEXT:    div.rn.ftz.f32 %r5, %r3, %r4;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    mov.b64 %rd1, repeated_div_recip_allowed_ftz_sel_param_2;
+; CHECK-NEXT:    mov.b64 %rd2, repeated_div_recip_allowed_ftz_sel_param_1;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT:    ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_3];
+; CHECK-NEXT:    div.rn.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
@@ -416,18 +420,20 @@ define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_sel_param_0];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_sel_param_1];
-; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_sel_param_2];
-; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_fast_sel_param_3];
-; CHECK-NEXT:    div.approx.f32 %r5, %r3, %r4;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    mov.b64 %rd1, repeated_div_fast_sel_param_2;
+; CHECK-NEXT:    mov.b64 %rd2, repeated_div_fast_sel_param_1;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT:    ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_sel_param_3];
+; CHECK-NEXT:    div.approx.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = fdiv afn float %a, %divisor
   %y = fdiv afn float %b, %divisor
@@ -468,18 +474,20 @@ define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %div
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<2>;
 ; CHECK-NEXT:    .reg .b16 %rs<3>;
-; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_ftz_sel_param_0];
 ; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
 ; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_ftz_sel_param_1];
-; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_2];
-; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
-; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_fast_ftz_sel_param_3];
-; CHECK-NEXT:    div.approx.ftz.f32 %r5, %r3, %r4;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    mov.b64 %rd1, repeated_div_fast_ftz_sel_param_2;
+; CHECK-NEXT:    mov.b64 %rd2, repeated_div_fast_ftz_sel_param_1;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT:    ld.param.b32 %r1, [%rd3];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_3];
+; CHECK-NEXT:    div.approx.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
 ; CHECK-NEXT:    ret;
   %x = fdiv afn float %a, %divisor
   %y = fdiv afn float %b, %divisor
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index 264f38021e1de..c91a3df204d80 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -8,21 +8,24 @@ define i32 @test_select_i1_trunc(i32 %a, i32 %b, i32 %c, i32 %true, i32 %false)
 ; CHECK-LABEL: test_select_i1_trunc(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
-; CHECK-NEXT:    .reg .b32 %r<10>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_trunc_param_0];
 ; CHECK-NEXT:    and.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r2, 0;
-; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_trunc_param_1];
-; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_trunc_param_2];
-; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_trunc_param_3];
-; CHECK-NEXT:    selp.b32 %r6, %r3, %r4, %p1;
-; CHECK-NEXT:    and.b32 %r7, %r6, 1;
-; CHECK-NEXT:    setp.ne.b32 %p2, %r7, 0;
-; CHECK-NEXT:    ld.param.b32 %r8, [test_select_i1_trunc_param_4];
-; CHECK-NEXT:    selp.b32 %r9, %r5, %r8, %p2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
+; CHECK-NEXT:    mov.b64 %rd1, test_select_i1_trunc_param_2;
+; CHECK-NEXT:    mov.b64 %rd2, test_select_i1_trunc_param_1;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; CHECK-NEXT:    ld.param.b32 %r3, [%rd3];
+; CHECK-NEXT:    and.b32 %r4, %r3, 1;
+; CHECK-NEXT:    setp.ne.b32 %p2, %r4, 0;
+; CHECK-NEXT:    mov.b64 %rd4, test_select_i1_trunc_param_4;
+; CHECK-NEXT:    mov.b64 %rd5, test_select_i1_trunc_param_3;
+; CHECK-NEXT:    selp.b64 %rd6, %rd5, %rd4, %p2;
+; CHECK-NEXT:    ld.param.b32 %r5, [%rd6];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
   %a_trunc = trunc i32 %a to i1
   %b_trunc = trunc i32 %b to i1
@@ -36,23 +39,25 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false
 ; CHECK-LABEL: test_select_i1_trunc_2(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<3>;
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_select_i1_trunc_2_param_0];
 ; CHECK-NEXT:    and.b64 %rd2, %rd1, 1;
 ; CHECK-NEXT:    setp.ne.b64 %p1, %rd2, 0;
-; CHECK-NEXT:    ld.param.b16 %rs1, [test_select_i1_trunc_2_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs2, [test_select_i1_trunc_2_param_2];
-; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_trunc_2_param_3];
-; CHECK-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
-; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT:    setp.ne.b16 %p2, %rs4, 0;
-; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_trunc_2_param_4];
-; CHECK-NEXT:    selp.b32 %r3, %r1, %r2, %p2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    mov.b64 %rd3, test_select_i1_trunc_2_param_2;
+; CHECK-NEXT:    mov.b64 %rd4, test_select_i1_trunc_2_param_1;
+; CHECK-NEXT:    selp.b64 %rd5, %rd4, %rd3, %p1;
+; CHECK-NEXT:    ld.param.b16 %rs1, [%rd5];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p2, %rs2, 0;
+; CHECK-NEXT:    mov.b64 %rd6, test_select_i1_trunc_2_param_4;
+; CHECK-NEXT:    mov.b64 %rd7, test_select_i1_trunc_2_param_3;
+; CHECK-NEXT:    selp.b64 %rd8, %rd7, %rd6, %p2;
+; CHECK-NEXT:    ld.param.b32 %r1, [%rd8];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %a_trunc = trunc i64 %a to i1
   %b_trunc = trunc i16 %b to i1
@@ -66,7 +71,8 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 ; CHECK-LABEL: test_select_i1_basic(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<4>;
-; CHECK-NEXT:    .reg .b32 %r<10>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_param_0];
@@ -75,13 +81,14 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 ; CHECK-NEXT:    setp.ne.b32 %p1, %r1, 0;
 ; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_param_2];
 ; CHECK-NEXT:    setp.eq.b32 %p2, %r4, 0;
-; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_param_3];
 ; CHECK-NEXT:    setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT:    ld.param.b32 %r6, [test_select_i1_basic_param_4];
-; CHECK-NEXT:    selp.b32 %r7, %r5, %r6, %p2;
-; CHECK-NEXT:    selp.b32 %r8, %r7, %r6, %p1;
-; CHECK-NEXT:    selp.b32 %r9, %r5, %r8, %p3;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r9;
+; CHECK-NEXT:    mov.b64 %rd1, test_select_i1_basic_param_4;
+; CHECK-NEXT:    mov.b64 %rd2, test_select_i1_basic_param_3;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p2;
+; CHECK-NEXT:    selp.b64 %rd4, %rd3, %rd1, %p1;
+; CHECK-NEXT:    selp.b64 %rd5, %rd2, %rd4, %p3;
+; CHECK-NEXT:    ld.param.b32 %r5, [%rd5];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
   %b1 = icmp eq i32 %v1, 0
   %b2 = icmp eq i32 %v2, 0
@@ -95,7 +102,8 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
 ; CHECK-LABEL: test_select_i1_basic_folding(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<11>;
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
@@ -105,16 +113,17 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
 ; CHECK-NEXT:    setp.eq.b32 %p3, %r2, 0;
 ; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
 ; CHECK-NEXT:    setp.eq.b32 %p4, %r3, 0;
-; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
 ; CHECK-NEXT:    xor.pred %p5, %p1, %p3;
-; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
 ; CHECK-NEXT:    and.pred %p6, %p5, %p4;
 ; CHECK-NEXT:    and.pred %p7, %p2, %p4;
 ; CHECK-NEXT:    and.pred %p8, %p3, %p6;
 ; CHECK-NEXT:    or.pred %p9, %p8, %p7;
 ; CHECK-NEXT:    xor.pred %p10, %p9, %p3;
-; CHECK-NEXT:    selp.b32 %r6, %r4, %r5, %p10;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    mov.b64 %rd1, test_select_i1_basic_folding_param_4;
+; CHECK-NEXT:    mov.b64 %rd2, test_select_i1_basic_folding_param_3;
+; CHECK-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p10;
+; CHECK-NEXT:    ld.param.b32 %r4, [%rd3];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NEXT:    ret;
   %b1 = icmp eq i32 %v1, 0
   %b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 40d6a07310265..bfac2b4ffd431 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1442,16 +1442,18 @@ define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
 ; O3:       {
 ; O3-NEXT:    .reg .pred %p<2>;
 ; O3-NEXT:    .reg .b16 %rs<3>;
-; O3-NEXT:    .reg .b32 %r<4>;
+; O3-NEXT:    .reg .b32 %r<2>;
+; O3-NEXT:    .reg .b64 %rd<4>;
 ; O3-EMPTY:
 ; O3-NEXT:  // %bb.0:
 ; O3-NEXT:    ld.param.b8 %rs1, [test_select_param_2];
 ; O3-NEXT:    and.b16 %rs2, %rs1, 1;
 ; O3-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; O3-NEXT:    ld.param.b32 %r1, [test_select_param_0];
-; O3-NEXT:    ld.param.b32 %r2, [test_select_param_1];
-; O3-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
-; O3-NEXT:    st.param.b32 [func_retval0], %r3;
+; O3-NEXT:    mov.b64 %rd1, test_select_param_1;
+; O3-NEXT:    mov.b64 %rd2, test_select_param_0;
+; O3-NEXT:    selp.b64 %rd3, %rd2, %rd1, %p1;
+; O3-NEXT:    ld.param.b32 %r1, [%rd3];
+; O3-NEXT:    st.param.b32 [func_retval0], %r1;
 ; O3-NEXT:    ret;
   %r = select i1 %c, <4 x i8> %a, <4 x i8> %b
   ret <4 x i8> %r
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index 21257e21bea9f..ca2914a2e8043 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -584,44 +584,25 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
 ; COPY-NEXT:    store i32 [[VALLOADED]], ptr [[OUT]], align 4
 ; COPY-NEXT:    ret void
 ;
-; PTX_60-LABEL: test_select(
-; PTX_60:       {
-; PTX_60-NEXT:    .reg .pred %p<2>;
-; PTX_60-NEXT:    .reg .b16 %rs<3>;
-; PTX_60-NEXT:    .reg .b32 %r<4>;
-; PTX_60-NEXT:    .reg .b64 %rd<3>;
-; PTX_60-EMPTY:
-; PTX_60-NEXT:  // %bb.0: // %bb
-; PTX_60-NEXT:    ld.param.b8 %rs1, [test_select_param_3];
-; PTX_60-NEXT:    and.b16 %rs2, %rs1, 1;
-; PTX_60-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; PTX_60-NEXT:    ld.param.b64 %rd1, [test_select_param_2];
-; PTX_60-NEXT:    cvta.to.global.u64 %rd2, %rd1;
-; PTX_60-NEXT:    ld.param.b32 %r1, [test_select_param_1];
-; PTX_60-NEXT:    ld.param.b32 %r2, [test_select_param_0];
-; PTX_60-NEXT:    selp.b32 %r3, %r2, %r1, %p1;
-; PTX_60-NEXT:    st.global.b32 [%rd2], %r3;
-; PTX_60-NEXT:    ret;
-;
-; PTX_70-LABEL: test_select(
-; PTX_70:       {
-; PTX_70-NEXT:    .reg .pred %p<2>;
-; PTX_70-NEXT:    .reg .b16 %rs<3>;
-; PTX_70-NEXT:    .reg .b32 %r<2>;
-; PTX_70-NEXT:    .reg .b64 %rd<6>;
-; PTX_70-EMPTY:
-; PTX_70-NEXT:  // %bb.0: // %bb
-; PTX_70-NEXT:    ld.param.b8 %rs1, [test_select_param_3];
-; PTX_70-NEXT:    and.b16 %rs2, %rs1, 1;
-; PTX_70-NEXT:    setp.ne.b16 %p1, %rs2, 0;
-; PTX_70-NEXT:    mov.b64 %rd1, test_select_param_0;
-; PTX_70-NEXT:    ld.param.b64 %rd2, [test_select_param_2];
-; PTX_70-NEXT:    cvta.to.global.u64 %rd3, %rd2;
-; PTX_70-NEXT:    mov.b64 %rd4, test_select_param_1;
-; PTX_70-NEXT:    selp.b64 %rd5, %rd1, %rd4, %p1;
-; PTX_70-NEXT:    ld.param.b32 %r1, [%rd5];
-; PTX_70-NEXT:    st.global.b32 [%rd3], %r1;
-; PTX_70-NEXT:    ret;
+; PTX-LABEL: test_select(
+; PTX:       {
+; PTX-NEXT:    .reg .pred %p<2>;
+; PTX-NEXT:    .reg .b16 %rs<3>;
+; PTX-NEXT:    .reg .b32 %r<2>;
+; PTX-NEXT:    .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0: // %bb
+; PTX-NEXT:    ld.param.b8 %rs1, [test_select_param_3];
+; PTX-NEXT:    and.b16 %rs2, %rs1, 1;
+; PTX-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT:    mov.b64 %rd1, test_select_param_0;
+; PTX-NEXT:    ld.param.b64 %rd2, [test_select_param_2];
+; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT:    mov.b64 %rd4, test_select_param_1;
+; PTX-NEXT:    selp.b64 %rd5, %rd1, %rd4, %p1;
+; PTX-NEXT:    ld.param.b32 %r1, [%rd5];
+; PTX-NEXT:    st.global.b32 [%rd3], %r1;
+; PTX-NEXT:    ret;
 bb:
   %ptrnew = select i1 %cond, ptr %input1, ptr %input2
   %valloaded = load i32, ptr %ptrnew, align 4
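For contrast, a minimal sketch (hypothetical @example.mixed) of the case the combine must still reject, which is also why the select-vectors.ll tests above switch %b.ptr to addrspace(4): when the two loads disagree on address space, there is no single address space to stamp on the merged MachinePointerInfo, so the two loads and the value select are kept.

; Not combined: the pointers live in different address spaces, so
; codegen still emits both loads plus a value select (v_cndmask_b32
; on AMDGPU) rather than one load of a selected pointer.
define i32 @example.mixed(i1 %cond, ptr addrspace(1) %a, ptr addrspace(4) %b) {
  %ld0 = load i32, ptr addrspace(1) %a
  %ld1 = load i32, ptr addrspace(4) %b
  %sel = select i1 %cond, i32 %ld0, i32 %ld1
  ret i32 %sel
}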