diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 792e17eeedab1..2d46cf3b70a34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4040,47 +4040,48 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
-
-  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS)
-    return SDValue();
-
   SDValue LHS = N->getOperand(0);
-  unsigned RHSVal = RHS->getZExtValue();
-  if (!RHSVal)
-    return LHS;
-
+  SDValue RHS = N->getOperand(1);
+  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
-  switch (LHS->getOpcode()) {
-  default:
-    break;
-  case ISD::ZERO_EXTEND:
-  case ISD::SIGN_EXTEND:
-  case ISD::ANY_EXTEND: {
-    SDValue X = LHS->getOperand(0);
-
-    if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
-        isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
-      // Prefer build_vector as the canonical form if packed types are legal.
-      // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
-      SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
-          { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
-      return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
-    }
+  unsigned RHSVal;
+  if (CRHS) {
+    RHSVal = CRHS->getZExtValue();
+    if (!RHSVal)
+      return LHS;
 
-    // shl (ext x) => zext (shl x), if shift does not overflow int
-    if (VT != MVT::i64)
-      break;
-    KnownBits Known = DAG.computeKnownBits(X);
-    unsigned LZ = Known.countMinLeadingZeros();
-    if (LZ < RHSVal)
+    switch (LHS->getOpcode()) {
+    default:
       break;
-    EVT XVT = X.getValueType();
-    SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
-    return DAG.getZExtOrTrunc(Shl, SL, VT);
-  }
+    case ISD::ZERO_EXTEND:
+    case ISD::SIGN_EXTEND:
+    case ISD::ANY_EXTEND: {
+      SDValue X = LHS->getOperand(0);
+
+      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
+          isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
+        // Prefer build_vector as the canonical form if packed types are legal.
+        // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
+        SDValue Vec = DAG.getBuildVector(
+            MVT::v2i16, SL,
+            {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
+        return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+      }
+
+      // shl (ext x) => zext (shl x), if shift does not overflow int
+      if (VT != MVT::i64)
+        break;
+      KnownBits Known = DAG.computeKnownBits(X);
+      unsigned LZ = Known.countMinLeadingZeros();
+      if (LZ < RHSVal)
+        break;
+      EVT XVT = X.getValueType();
+      SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
+      return DAG.getZExtOrTrunc(Shl, SL, VT);
+    }
+    }
   }
 
   if (VT != MVT::i64)
@@ -4091,18 +4092,34 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
-  if (RHSVal < 32)
+  EVT TargetType = VT.getHalfSizedIntegerVT(*DAG.getContext());
+  EVT TargetVecPairType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
+  KnownBits Known = DAG.computeKnownBits(RHS);
+
+  if (Known.getMinValue().getZExtValue() < TargetType.getSizeInBits())
     return SDValue();
+  SDValue ShiftAmt;
 
-  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
+  if (CRHS) {
+    ShiftAmt =
+        DAG.getConstant(RHSVal - TargetType.getSizeInBits(), SL, TargetType);
+  } else {
+    SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
+    const SDValue ShiftMask =
+        DAG.getConstant(TargetType.getSizeInBits() - 1, SL, TargetType);
+    // This AND instruction will clamp out of bounds shift values.
+    // It will also be removed during later instruction selection.
+    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
+  }
 
-  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
-  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
+  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
+  SDValue NewShift =
+      DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
 
-  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue Zero = DAG.getConstant(0, SL, TargetType);
 
-  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
-  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+  SDValue Vec = DAG.getBuildVector(TargetVecPairType, SL, {Zero, NewShift});
+  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
 }
 
 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
new file mode 100644
index 0000000000000..05430213c17d2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shl64_reduce.ll
@@ -0,0 +1,477 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;; Test reduction of:
+;;
+;;   DST = shl i64 X, Y
+;;
+;; where Y is in the range [63-32] to:
+;;
+;;   DST = [0, shl i32 X, (Y & 0x1F)]
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Test range with metadata
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; FIXME: This case should be reduced, but SelectionDAG::computeKnownBits() cannot
+; determine the minimum from metadata in this case. Match current results
+; for now.
+ +define i64 @shl_metadata(i64 %arg0, ptr %arg1.ptr) { +; CHECK-LABEL: shl_metadata: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v2, v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %shift.amt = load i64, ptr %arg1.ptr, !range !0 + %shl = shl i64 %arg0, %shift.amt + ret i64 %shl +} + +define <2 x i64> @shl_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) { +; CHECK-LABEL: shl_v2_metadata: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !0 + %shl = shl <2 x i64> %arg0, %shift.amt + ret <2 x i64> %shl +} + +define <3 x i64> @shl_v3_metadata(<3 x i64> %arg0, ptr %arg1.ptr) { +; CHECK-LABEL: shl_v3_metadata: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v12, v[6:7] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !0 + %shl = shl <3 x i64> %arg0, %shift.amt + ret <3 x i64> %shl +} + +define <4 x i64> @shl_v4_metadata(<4 x i64> %arg0, ptr %arg1.ptr) { +; CHECK-LABEL: shl_v4_metadata: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[8:9] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dwordx4 v[13:16], v[8:9] offset:16 +; CHECK-NEXT: ; kill: killed $vgpr8 killed $vgpr9 +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v10, v[0:1] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v12, v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b64 v[4:5], v13, v[4:5] +; CHECK-NEXT: v_lshlrev_b64 v[6:7], v15, v[6:7] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !0 + %shl = shl <4 x i64> %arg0, %shift.amt + ret <4 x i64> %shl +} + +!0 = !{i64 32, i64 64} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Test range with an "or X, 16" +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; These cases must not be reduced because the known minimum, 16, is not in range. 
+ +define i64 @shl_or16(i64 %arg0, i64 %shift_amt) { +; CHECK-LABEL: shl_or16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v2, 16, v2 +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or i64 %shift_amt, 16 + %shl = shl i64 %arg0, %or + ret i64 %shl +} + +define <2 x i64> @shl_v2_or16(<2 x i64> %arg0, <2 x i64> %shift_amt) { +; CHECK-LABEL: shl_v2_or16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v5, 16, v6 +; CHECK-NEXT: v_or_b32_e32 v4, 16, v4 +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <2 x i64> %shift_amt, splat (i64 16) + %shl = shl <2 x i64> %arg0, %or + ret <2 x i64> %shl +} + +define <3 x i64> @shl_v3_or16(<3 x i64> %arg0, <3 x i64> %shift_amt) { +; CHECK-LABEL: shl_v3_or16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v7, 16, v10 +; CHECK-NEXT: v_or_b32_e32 v8, 16, v8 +; CHECK-NEXT: v_or_b32_e32 v6, 16, v6 +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] +; CHECK-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <3 x i64> %shift_amt, splat (i64 16) + %shl = shl <3 x i64> %arg0, %or + ret <3 x i64> %shl +} + +define <4 x i64> @shl_v4_or16(<4 x i64> %arg0, <4 x i64> %shift_amt) { +; CHECK-LABEL: shl_v4_or16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v9, 16, v14 +; CHECK-NEXT: v_or_b32_e32 v11, 16, v12 +; CHECK-NEXT: v_or_b32_e32 v10, 16, v10 +; CHECK-NEXT: v_or_b32_e32 v8, 16, v8 +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] +; CHECK-NEXT: v_lshlrev_b64 v[4:5], v11, v[4:5] +; CHECK-NEXT: v_lshlrev_b64 v[6:7], v9, v[6:7] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <4 x i64> %shift_amt, splat (i64 16) + %shl = shl <4 x i64> %arg0, %or + ret <4 x i64> %shl +} + +; test SGPR + +define i64 @shl_or16_sgpr(i64 inreg %arg0, i64 inreg %shift_amt) { +; CHECK-LABEL: shl_or16_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_b32 s4, s18, 16 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[16:17], s4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or i64 %shift_amt, 16 + %shl = shl i64 %arg0, %or + ret i64 %shl +} + +define <2 x i64> @shl_v2_or16_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shift_amt) { +; CHECK-LABEL: shl_v2_or16_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_b32 s6, s22, 16 +; CHECK-NEXT: s_or_b32 s4, s20, 16 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[16:17], s4 +; CHECK-NEXT: s_lshl_b64 s[6:7], s[18:19], s6 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <2 x i64> %shift_amt, splat (i64 16) + %shl = shl <2 x i64> %arg0, %or + ret <2 x i64> %shl +} + +define <3 x i64> @shl_v3_or16_sgpr(<3 x i64> inreg %arg0, <3 x i64> inreg %shift_amt) { +; CHECK-LABEL: shl_v3_or16_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_b32 s8, s26, 16 +; CHECK-NEXT: s_or_b32 s6, s24, 16 +; CHECK-NEXT: s_or_b32 s4, s22, 16 +; CHECK-NEXT: s_lshl_b64 s[4:5], 
s[16:17], s4 +; CHECK-NEXT: s_lshl_b64 s[6:7], s[18:19], s6 +; CHECK-NEXT: s_lshl_b64 s[8:9], s[20:21], s8 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: v_mov_b32_e32 v4, s8 +; CHECK-NEXT: v_mov_b32_e32 v5, s9 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <3 x i64> %shift_amt, splat (i64 16) + %shl = shl <3 x i64> %arg0, %or + ret <3 x i64> %shl +} + +define <4 x i64> @shl_v4_or16_sgpr(<4 x i64> inreg %arg0, <4 x i64> inreg %shift_amt) { +; CHECK-LABEL: shl_v4_or16_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v0, 16, v0 +; CHECK-NEXT: s_or_b32 s8, s28, 16 +; CHECK-NEXT: s_or_b32 s6, s26, 16 +; CHECK-NEXT: s_or_b32 s4, s24, 16 +; CHECK-NEXT: s_lshl_b64 s[4:5], s[16:17], s4 +; CHECK-NEXT: s_lshl_b64 s[6:7], s[18:19], s6 +; CHECK-NEXT: s_lshl_b64 s[8:9], s[20:21], s8 +; CHECK-NEXT: v_lshlrev_b64 v[6:7], v0, s[22:23] +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: v_mov_b32_e32 v4, s8 +; CHECK-NEXT: v_mov_b32_e32 v5, s9 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <4 x i64> %shift_amt, splat (i64 16) + %shl = shl <4 x i64> %arg0, %or + ret <4 x i64> %shl +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Test range with an "or X, 32" +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; These cases are reduced because computeKnownBits() can calculate a minimum of 32 +; based on the OR with 32. + +define i64 @shl_or32(i64 %arg0, i64 %shift_amt) { +; CHECK-LABEL: shl_or32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v1, v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or i64 %shift_amt, 32 + %shl = shl i64 %arg0, %or + ret i64 %shl +} + +define <2 x i64> @shl_v2_or32(<2 x i64> %arg0, <2 x i64> %shift_amt) { +; CHECK-LABEL: shl_v2_or32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v1, v4, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, v6, v2 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <2 x i64> %shift_amt, splat (i64 32) + %shl = shl <2 x i64> %arg0, %or + ret <2 x i64> %shl +} + +define <3 x i64> @shl_v3_or32(<3 x i64> %arg0, <3 x i64> %shift_amt) { +; CHECK-LABEL: shl_v3_or32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v1, v6, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, v8, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v5, v10, v4 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <3 x i64> %shift_amt, splat (i64 32) + %shl = shl <3 x i64> %arg0, %or + ret <3 x i64> %shl +} + +define <4 x i64> @shl_v4_or32(<4 x i64> %arg0, <4 x i64> %shift_amt) { +; CHECK-LABEL: shl_v4_or32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e32 v1, v8, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, v10, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v5, v12, v4 +; CHECK-NEXT: v_lshlrev_b32_e32 v7, v14, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: 
s_setpc_b64 s[30:31] + %or = or <4 x i64> %shift_amt, splat (i64 32) + %shl = shl <4 x i64> %arg0, %or + ret <4 x i64> %shl +} + +; test SGPR + +define i64 @shl_or32_sgpr(i64 inreg %arg0, i64 inreg %shift_amt) { +; CHECK-LABEL: shl_or32_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s4, s16, s18 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or i64 %shift_amt, 32 + %shl = shl i64 %arg0, %or + ret i64 %shl +} + +define <2 x i64> @shl_v2_or32_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shift_amt) { +; CHECK-LABEL: shl_v2_or32_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s4, s16, s20 +; CHECK-NEXT: s_lshl_b32 s5, s18, s22 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <2 x i64> %shift_amt, splat (i64 32) + %shl = shl <2 x i64> %arg0, %or + ret <2 x i64> %shl +} + +define <3 x i64> @shl_v3_or32_sgpr(<3 x i64> inreg %arg0, <3 x i64> inreg %shift_amt) { +; CHECK-LABEL: shl_v3_or32_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s4, s16, s22 +; CHECK-NEXT: s_lshl_b32 s5, s18, s24 +; CHECK-NEXT: s_lshl_b32 s6, s20, s26 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, s6 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <3 x i64> %shift_amt, splat (i64 32) + %shl = shl <3 x i64> %arg0, %or + ret <3 x i64> %shl +} + +define <4 x i64> @shl_v4_or32_sgpr(<4 x i64> inreg %arg0, <4 x i64> inreg %shift_amt) { +; CHECK-LABEL: shl_v4_or32_sgpr: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_lshl_b32 s4, s16, s24 +; CHECK-NEXT: s_lshl_b32 s5, s18, s26 +; CHECK-NEXT: s_lshl_b32 s6, s20, s28 +; CHECK-NEXT: v_lshlrev_b32_e64 v7, v0, s22 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, s6 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %or = or <4 x i64> %shift_amt, splat (i64 32) + %shl = shl <4 x i64> %arg0, %or + ret <4 x i64> %shl +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Test range from max/min +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; FIXME: This case should be reduced too, but computeKnownBits() cannot +; determine the range. Match current results for now. 
+ +define i64 @shl_maxmin(i64 %arg0, i64 noundef %arg1) { +; CHECK-LABEL: shl_maxmin: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[2:3] +; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, 32, v2, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[2:3] +; CHECK-NEXT: v_cndmask_b32_e32 v2, 63, v2, vcc +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %max = call i64 @llvm.umax.i64(i64 %arg1, i64 32) + %min = call i64 @llvm.umin.i64(i64 %max, i64 63) + %shl = shl i64 %arg0, %min + ret i64 %shl +} + +define <2 x i64> @shl_v2_maxmin(<2 x i64> %arg0, <2 x i64> noundef %arg1) { +; CHECK-LABEL: shl_v2_maxmin: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v4, 32, v4, vcc +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v6, 32, v6, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[4:5] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; CHECK-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %max = call <2 x i64> @llvm.umax.i64(<2 x i64> %arg1, <2 x i64> splat (i64 32)) + %min = call <2 x i64> @llvm.umin.i64(<2 x i64> %max, <2 x i64> splat (i64 63)) + %shl = shl <2 x i64> %arg0, %min + ret <2 x i64> %shl +} + +define <3 x i64> @shl_v3_maxmin(<3 x i64> %arg0, <3 x i64> noundef %arg1) { +; CHECK-LABEL: shl_v3_maxmin: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v6, 32, v6, vcc +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[8:9] +; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v8, 32, v8, vcc +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[10:11] +; CHECK-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v10, 32, v10, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[10:11] +; CHECK-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[8:9] +; CHECK-NEXT: v_lshlrev_b64 v[4:5], v10, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[6:7] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] +; CHECK-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %max = call <3 x i64> @llvm.umax.i64(<3 x i64> %arg1, <3 x i64> splat (i64 32)) + %min = call <3 x i64> @llvm.umin.i64(<3 x i64> %max, <3 x i64> splat (i64 63)) + %shl = shl <3 x i64> %arg0, %min + ret <3 x i64> %shl +} + +define <4 x i64> @shl_v4_maxmin(<4 x i64> %arg0, <4 x i64> noundef %arg1) { +; CHECK-LABEL: shl_v4_maxmin: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[8:9] +; CHECK-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v8, 32, v8, vcc +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[10:11] +; CHECK-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v10, 32, v10, vcc +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[12:13] +; CHECK-NEXT: v_cndmask_b32_e32 v13, 0, 
v13, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v12, 32, v12, vcc +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, 32, v[14:15] +; CHECK-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v14, 32, v14, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[14:15] +; CHECK-NEXT: v_cndmask_b32_e32 v14, 63, v14, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[12:13] +; CHECK-NEXT: v_lshlrev_b64 v[6:7], v14, v[6:7] +; CHECK-NEXT: v_cndmask_b32_e32 v12, 63, v12, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[10:11] +; CHECK-NEXT: v_lshlrev_b64 v[4:5], v12, v[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v10, 63, v10, vcc +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, 63, v[8:9] +; CHECK-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] +; CHECK-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc +; CHECK-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; CHECK-NEXT: s_setpc_b64 s[30:31] + %max = call <4 x i64> @llvm.umax.i64(<4 x i64> %arg1, <4 x i64> splat (i64 32)) + %min = call <4 x i64> @llvm.umin.i64(<4 x i64> %max, <4 x i64> splat (i64 63)) + %shl = shl <4 x i64> %arg0, %min + ret <4 x i64> %shl +} diff --git a/llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll b/llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll new file mode 100644 index 0000000000000..7c75303c3463f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shl64_reduce_flags.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=finalize-isel -o - %s | FileCheck %s + +;; Test that reduction of: +;; +;; DST = shl i64 X, Y +;; +;; where Y is in the range [63-32] to: +;; +;; DST = [0, shl i32 X, (Y & 0x1F)] +;; +;; preserves flags + +define i64 @shl_nsw(i64 %arg0, i64 %shift_amt) { + ; CHECK-LABEL: name: shl_nsw + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nsw V_LSHLREV_B32_e64 killed [[COPY4]], killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: $vgpr1 = COPY [[V_LSHLREV_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + %or = or i64 %shift_amt, 32 + %shl = shl nsw i64 %arg0, %or + ret i64 %shl +} + +define i64 @shl_nuw(i64 %arg0, i64 %shift_amt) { + ; CHECK-LABEL: name: shl_nuw + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; 
CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw V_LSHLREV_B32_e64 killed [[COPY4]], killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: $vgpr1 = COPY [[V_LSHLREV_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + %or = or i64 %shift_amt, 32 + %shl = shl nuw i64 %arg0, %or + ret i64 %shl +} + +define i64 @shl_nsw_nuw(i64 %arg0, i64 %shift_amt) { + ; CHECK-LABEL: name: shl_nsw_nuw + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, killed [[COPY3]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[COPY4]], killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: $vgpr1 = COPY [[V_LSHLREV_B32_e64_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 + %or = or i64 %shift_amt, 32 + %shl = shl nsw nuw i64 %arg0, %or + ret i64 %shl +}