diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d93021a..9308934c8baf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -470,6 +470,24 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); } +SDNode *AMDGPUDAGToDAGISel::packConstantV2I16(const SDNode *N, + SelectionDAG &DAG) const { + // TODO: Handle undef as zero + + assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); + uint32_t LHSVal, RHSVal; + if (getConstantValue(N->getOperand(0), LHSVal) && + getConstantValue(N->getOperand(1), RHSVal)) { + SDLoc SL(N); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); + return DAG.getMachineNode( + isVGPRImm(N) ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32, SL, + N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); + } + + return nullptr; +} + void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 4fa0d3f72e1c7..c902b7e7f1d87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -45,21 +45,6 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { return false; } -// TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); - uint32_t LHSVal, RHSVal; - if (getConstantValue(N->getOperand(0), LHSVal) && - getConstantValue(N->getOperand(1), RHSVal)) { - SDLoc SL(N); - uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); - return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), - DAG.getTargetConstant(K, SL, MVT::i32)); - } - - return nullptr; -} - /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -115,6 +100,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; + SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) const; + SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; SDNode *glueCopyToM0(SDNode *N, SDValue Val) const; SDNode *glueCopyToM0LDSInit(SDNode *N) const; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 5b6fc6ae2cb91..69ed10c7c02a9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -497,10 +497,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 +; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 @@ -810,11 +810,11 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray ; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 +; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 +; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 +; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200 -; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500 -; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_add_co_u32 v0, s4, s6, v0 ; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 7d3b316915923..c98feeb96232d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -746,9 +746,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_mov_b64 exec, 0 ; GFX9-NEXT: .LBB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB6_7: ; GFX9-NEXT: s_mov_b64 exec, 0 @@ -792,9 +792,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-32-NEXT: .LBB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-32-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-32-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB6_7: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 @@ -838,9 +838,9 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_mov_b64 exec, 0 ; GFX10-64-NEXT: .LBB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-64-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-64-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB6_7: ; GFX10-64-NEXT: s_mov_b64 exec, 0 @@ -1005,9 +1005,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: .LBB7_8: ; %.return ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX9-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB7_9: ; GFX9-NEXT: s_mov_b64 exec, 0 @@ -1068,9 +1068,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: .LBB7_8: ; %.return ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-32-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-32-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-32-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-32-NEXT: s_endpgm ; GFX10-32-NEXT: .LBB7_9: ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 @@ -1131,9 +1131,9 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: .LBB7_8: ; %.return ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 -; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 -; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm +; GFX10-64-NEXT: v_bfrev_b32_e32 v0, 60 +; GFX10-64-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX10-64-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm ; GFX10-64-NEXT: s_endpgm ; GFX10-64-NEXT: .LBB7_9: ; GFX10-64-NEXT: s_mov_b64 exec, 0