diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c97486437ed83..eda0a21f79c96 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11230,25 +11230,53 @@ static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset) { SDValue Ret; - if (Src.getValueSizeInBits() <= 32) - return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32); - if (Src.getValueSizeInBits() >= 256) { - assert(!(Src.getValueSizeInBits() % 32)); - Ret = DAG.getBitcast( - MVT::getVectorVT(MVT::i32, Src.getValueSizeInBits() / 32), Src); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ret, - DAG.getConstant(DWordOffset, SL, MVT::i32)); - } + auto ValueSize = Src.getValueSizeInBits().getFixedValue(); + // ByteProvider must be at least 8 bits + assert(!(ValueSize % 8)); - Ret = DAG.getBitcastedAnyExtOrTrunc( - Src, SL, MVT::getIntegerVT(Src.getValueSizeInBits())); - if (DWordOffset) { - auto Shifted = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret, - DAG.getConstant(DWordOffset * 32, SL, MVT::i32)); - return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Shifted); - } + if (ValueSize <= 32) + return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32); + if (Src.getValueType().isVector()) { + auto BaseSize = Src.getScalarValueSizeInBits(); + if (BaseSize == 32) { + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src, + DAG.getConstant(DWordOffset, SL, MVT::i32)); + } + if (BaseSize > 32) { + Ret = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src, + DAG.getConstant(DWordOffset / (BaseSize / 32), SL, MVT::i32)); + auto ShiftVal = 32 * DWordOffset % (BaseSize / 32); + if (ShiftVal) + Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret, + DAG.getConstant(ShiftVal, SL, MVT::i32)); + return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); + } + + auto NumElements = ValueSize / BaseSize; + auto Trunc32Elements = (BaseSize * NumElements) / 32; + auto NormalizedTrunc = Trunc32Elements * 32 / BaseSize; + auto NumElementsIn32 = 32 / BaseSize; + auto NumAvailElements = DWordOffset < Trunc32Elements + ? NumElementsIn32 + : NumElements - NormalizedTrunc; + + SmallVector VecSrcs; + DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32, + NumAvailElements); + + Ret = DAG.getBuildVector( + MVT::getVectorVT(MVT::getIntegerVT(BaseSize), NumAvailElements), SL, + VecSrcs); + return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); + } + + /// Scalar Type + auto ShiftVal = 32 * DWordOffset; + Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src, + DAG.getConstant(ShiftVal, SL, MVT::i32)); return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32); } @@ -11320,10 +11348,10 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op; - if (SecondSrc) + if (SecondSrc) { OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second); - - assert(Op.getValueSizeInBits() == 32); + assert(OtherOp.getValueSizeInBits() == 32); + } if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index a8d53856c7c61..08040816db0c2 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -3409,21 +3409,21 @@ define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: extract_hilo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[2:3], off +; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3060505 +; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3060505 ; GFX10-NEXT: global_store_dword v[4:5], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: extract_hilo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[2:3], off +; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4 ; GFX9-NEXT: s_mov_b32 s4, 0x3060505 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3451,21 +3451,21 @@ define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: extract_lohi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off -; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v7, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x70404 +; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x70404 ; GFX10-NEXT: global_store_dword v[4:5], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: extract_lohi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off -; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v7, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x70404 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3493,21 +3493,21 @@ define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p ; GFX10-LABEL: extract_hihi: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2070505 +; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x2070505 ; GFX10-NEXT: global_store_dword v[4:5], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: extract_hihi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4 ; GFX9-NEXT: s_mov_b32 s4, 0x2070505 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3664,3 +3664,202 @@ define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, p store i32 %res, ptr addrspace(1) %out0, align 4 ret void } + +; Should not result in crash +define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: extract_v6i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6 +; GFX10-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2 +; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v9 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: global_store_dword v[6:7], v1, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_v6i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:6 +; GFX9-NEXT: global_load_ushort v3, v[0:1], off +; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4 +; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v1, v9, 16, v3 +; GFX9-NEXT: global_store_dword v[4:5], v1, off +; GFX9-NEXT: global_store_dword v[6:7], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec = load <6 x i16>, ptr addrspace(1) %in0, align 2 + %el0 = extractelement <6 x i16> %vec, i32 0 + %el1 = extractelement <6 x i16> %vec, i32 1 + %el2 = extractelement <6 x i16> %vec, i32 2 + %el3 = extractelement <6 x i16> %vec, i32 3 + %z0 = zext i16 %el0 to i32 + %z1 = zext i16 %el1 to i32 + %s1 = shl nuw i32 %z1, 16 + %o0 = or i32 %s1, %z0 + %z2 = zext i16 %el2 to i32 + %z3 = zext i16 %el3 to i32 + %s3 = shl nuw i32 %z3, 16 + %o1 = or i32 %z2, %s3 + + store i32 %o0, ptr addrspace(1) %out0, align 4 + store i32 %o1, ptr addrspace(1) %out1, align 4 + ret void +} + + +; Should not result in crash +define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: extract_v7i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: global_store_dword v[6:7], v1, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_v7i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: global_store_dword v[6:7], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec = load <7 x i16>, ptr addrspace(1) %in0, align 2 + %el0 = extractelement <7 x i16> %vec, i32 0 + %el1 = extractelement <7 x i16> %vec, i32 1 + %el2 = extractelement <7 x i16> %vec, i32 2 + %el3 = extractelement <7 x i16> %vec, i32 3 + %z0 = zext i16 %el0 to i32 + %z1 = zext i16 %el1 to i32 + %s1 = shl nuw i32 %z1, 16 + %o0 = or i32 %s1, %z0 + %z2 = zext i16 %el2 to i32 + %z3 = zext i16 %el3 to i32 + %s3 = shl nuw i32 %z3, 16 + %o1 = or i32 %z2, %s3 + + store i32 %o0, ptr addrspace(1) %out0, align 4 + store i32 %o1, ptr addrspace(1) %out1, align 4 + ret void +} + +; Should not result in crash +define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: extract_v13i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_bfe_u32 v0, v2, 8, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v8 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040c00 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040c03 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: global_store_dword v[6:7], v1, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_v13i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8 +; GFX9-NEXT: s_mov_b32 s4, 0x5040c00 +; GFX9-NEXT: s_mov_b32 s5, 0x5040c03 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_bfe_u32 v0, v2, 8, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v8 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s5 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: global_store_dword v[6:7], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec = load <13 x i8>, ptr addrspace(1) %in0, align 2 + %el0 = extractelement <13 x i8> %vec, i32 0 + %el1 = extractelement <13 x i8> %vec, i32 1 + %el2 = extractelement <13 x i8> %vec, i32 7 + %el3 = extractelement <13 x i8> %vec, i32 8 + %z0 = zext i8 %el0 to i32 + %z1 = zext i8 %el1 to i32 + %s1 = shl nuw i32 %z1, 16 + %o0 = or i32 %s1, %z0 + %z2 = zext i8 %el2 to i32 + %z3 = zext i8 %el3 to i32 + %s3 = shl nuw i32 %z3, 16 + %o1 = or i32 %z2, %s3 + + store i32 %o0, ptr addrspace(1) %out0, align 4 + store i32 %o1, ptr addrspace(1) %out1, align 4 + ret void +} + +; Should not result in crash +define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: extract_v13i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64 +; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshl_or_b32 v0, v13, 16, v12 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v1, v14, 16, v10 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: global_store_dword v[6:7], v1, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_v13i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48 +; GFX9-NEXT: global_load_dwordx4 v[11:14], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64 +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshl_or_b32 v0, v13, 16, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v1, v14, 16, v10 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: global_store_dword v[6:7], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec = load <13 x i64>, ptr addrspace(1) %in0, align 2 + %el0 = extractelement <13 x i64> %vec, i32 0 + %el1 = extractelement <13 x i64> %vec, i32 1 + %el2 = extractelement <13 x i64> %vec, i32 7 + %el3 = extractelement <13 x i64> %vec, i32 8 + %el00 = lshr i64 %el0, 32 + %z0 = trunc i64 %el00 to i32 + %z1 = trunc i64 %el1 to i32 + %s1 = shl nuw i32 %z1, 16 + %o0 = or i32 %s1, %z0 + %z2 = trunc i64 %el2 to i32 + %z3 = trunc i64 %el3 to i32 + %s3 = shl nuw i32 %z3, 16 + %o1 = or i32 %z2, %s3 + + store i32 %o0, ptr addrspace(1) %out0, align 4 + store i32 %o1, ptr addrspace(1) %out1, align 4 + ret void +}