diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index d925c53a57502..e819dca5bdf0f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -307,6 +307,10 @@ class LegalizerHelper {
   LegalizeResult narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
   LegalizeResult narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
 
+  /// Perform Bitcast legalize action on G_EXTRACT_VECTOR_ELT.
+  LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
+                                         LLT CastTy);
+
   LegalizeResult lowerBitcast(MachineInstr &MI);
   LegalizeResult lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index abb983dac6bdc..920c9e008012e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2330,6 +2330,122 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) {
   return UnableToLegalize;
 }
 
+/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
+/// is casting to a vector with a smaller element size, perform multiple element
+/// extracts and merge the results. If this is coercing to a vector with larger
+/// elements, index the bitcasted vector and extract the target element with bit
+/// operations. This is intended to force the indexing in the native register
+/// size for architectures that can dynamically index the register file.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
+                                         LLT CastTy) {
+  if (TypeIdx != 1)
+    return UnableToLegalize;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register SrcVec = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(2).getReg();
+  LLT SrcVecTy = MRI.getType(SrcVec);
+  LLT IdxTy = MRI.getType(Idx);
+
+  LLT SrcEltTy = SrcVecTy.getElementType();
+  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
+  unsigned OldNumElts = SrcVecTy.getNumElements();
+
+  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
+  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
+
+  const unsigned NewEltSize = NewEltTy.getSizeInBits();
+  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
+  if (NewNumElts > OldNumElts) {
+    // Decreasing the vector element size
+    //
+    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
+    //  =>
+    //  v4i32:castx = bitcast x:v2i64
+    //
+    // i64 = bitcast
+    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
+    //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
+    //
+    if (NewNumElts % OldNumElts != 0)
+      return UnableToLegalize;
+
+    // Type of the intermediate result vector.
+    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
+    LLT MidTy = LLT::scalarOrVector(NewEltsPerOldElt, NewEltTy);
+
+    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
+
+    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
+    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
+
+    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
+      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
+      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
+      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
+      NewOps[I] = Elt.getReg(0);
+    }
+
+    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
+    MIRBuilder.buildBitcast(Dst, NewVec);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  if (NewNumElts < OldNumElts) {
+    if (NewEltSize % OldEltSize != 0)
+      return UnableToLegalize;
+
+    // This only depends on powers of 2 because we use bit tricks to figure out
+    // the bit offset we need to shift to get the target element. A general
+    // expansion could emit division/multiply.
+    if (!isPowerOf2_32(NewEltSize / OldEltSize))
+      return UnableToLegalize;
+
+    // Increasing the vector element size.
+    //   %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
+    //
+    //   =>
+    //
+    //   %cast = G_BITCAST %vec
+    //   %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
+    //   %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
+    //   %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
+    //   %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
+    //   %elt_bits = G_LSHR %wide_elt, %offset_bits
+    //   %elt = G_TRUNC %elt_bits
+
+    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
+    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
+
+    // Divide to get the index in the wider element type.
+    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
+
+    Register WideElt = CastVec;
+    if (CastTy.isVector()) {
+      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
+                                                     ScaledIdx).getReg(0);
+    }
+
+    // Now figure out the amount we need to shift to get the target bits.
+    auto OffsetMask = MIRBuilder.buildConstant(
+        IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio));
+    auto OffsetIdx = MIRBuilder.buildAnd(IdxTy, Idx, OffsetMask);
+    auto OffsetBits = MIRBuilder.buildShl(
+        IdxTy, OffsetIdx,
+        MIRBuilder.buildConstant(IdxTy, Log2_32(OldEltSize)));
+
+    // Shift the wide element to get the target element.
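+    // For example, extracting element 5 of <8 x s16> viewed as <4 x s32>:
+    // %scaled_idx = 5 >> 1 = 2 picks the wide element, and
+    // %offset_bits = (5 & 1) << 4 = 16 selects its high half before the
+    // final truncate.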
+    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
+    MIRBuilder.buildTrunc(Dst, ExtractedBits);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+
+  return UnableToLegalize;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
   switch (MI.getOpcode()) {
@@ -2378,6 +2494,8 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
     Observer.changedInstr(MI);
     return Legalized;
   }
+  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
   default:
     return UnableToLegalize;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c5d5f1675bc8d..cc97e11707ab1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -70,6 +70,13 @@ static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
   };
 }
 
+static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
+  return [=](const LegalityQuery &Query) {
+    const LLT Ty = Query.Types[TypeIdx];
+    return Ty.getSizeInBits() % 32 == 0;
+  };
+}
+
 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
   return [=](const LegalityQuery &Query) {
     const LLT Ty = Query.Types[TypeIdx];
@@ -132,6 +139,15 @@ static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
   };
 }
 
+static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
+  return [=](const LegalityQuery &Query) {
+    const LLT Ty = Query.Types[TypeIdx];
+    unsigned Size = Ty.getSizeInBits();
+    assert(Size % 32 == 0);
+    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
+  };
+}
+
 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
   return [=](const LegalityQuery &Query) {
     const LLT QueryTy = Query.Types[TypeIdx];
@@ -1279,11 +1295,29 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       const LLT EltTy = Query.Types[EltTypeIdx];
       const LLT VecTy = Query.Types[VecTypeIdx];
       const LLT IdxTy = Query.Types[IdxTypeIdx];
-      return (EltTy.getSizeInBits() == 16 ||
-              EltTy.getSizeInBits() % 32 == 0) &&
-             VecTy.getSizeInBits() % 32 == 0 &&
-             VecTy.getSizeInBits() <= MaxRegisterSize &&
-             IdxTy.getSizeInBits() == 32;
+      const unsigned EltSize = EltTy.getSizeInBits();
+      return (EltSize == 32 || EltSize == 64) &&
+             VecTy.getSizeInBits() % 32 == 0 &&
+             VecTy.getSizeInBits() <= MaxRegisterSize &&
+             IdxTy.getSizeInBits() == 32;
+    })
+    .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)),
+               bitcastToVectorElement32(1))
+    //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+    .bitcastIf(
+      all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)),
+      [=](const LegalityQuery &Query) {
+        // For > 64-bit element types, try to turn this into a 64-bit
+        // element vector since we may be able to do better indexing
+        // if this is scalar. If not, fall back to 32.
+        const LLT EltTy = Query.Types[EltTypeIdx];
+        const LLT VecTy = Query.Types[VecTypeIdx];
+        const unsigned DstEltSize = EltTy.getSizeInBits();
+        const unsigned VecSize = VecTy.getSizeInBits();
+
+        const unsigned TargetEltSize = DstEltSize % 64 == 0 ?
64 : 32; + return std::make_pair( + VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); }) .clampScalar(EltTypeIdx, S32, S64) .clampScalar(VecTypeIdx, S32, S64) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll new file mode 100644 index 0000000000000..28c0651b10fd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -0,0 +1,769 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_sgpr_v4i128_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX9-NEXT: s_lshl_b32 m0, s4, 1 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GFX9-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i128_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX8-NEXT: s_lshl_b32 m0, s4, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GFX8-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i128_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GFX7-NEXT: s_lshl_b32 m0, s4, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GFX7-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 %idx + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(<4 x i128> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX9-NEXT: s_lshl_b32 s0, s2, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_set_gpr_idx_on s0, gpr_idx(SRC0) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, v3 +; GFX9-NEXT: s_set_gpr_idx_off +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v18 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, 
v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1] +; GFX8-NEXT: s_lshl_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 m0, s0, 1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_movrels_b32_e32 v1, v3 +; GFX8-NEXT: v_movrels_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: s_lshl_b32 s0, s2, 1 +; GFX7-NEXT: s_lshl_b32 m0, s0, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_movrels_b32_e32 v1, v3 +; GFX7-NEXT: v_movrels_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 %idx + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, 
v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[3:4] +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v11, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v2, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v3, v7, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 
v14, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[8:11], 0 addr64 offset:16 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 %idx + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, s10 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX9-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX9-NEXT: v_mov_b32_e32 v14, s13 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_mov_b32_e32 v15, s14 +; GFX9-NEXT: v_mov_b32_e32 v16, s15 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, s3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_mov_b32_e32 v11, s8 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, s9 +; GFX9-NEXT: v_mov_b32_e32 v13, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-NEXT: v_mov_b32_e32 v9, s15 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: 
v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX8-NEXT: v_mov_b32_e32 v11, s10 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX8-NEXT: v_mov_b32_e32 v13, s12 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX8-NEXT: v_mov_b32_e32 v14, s13 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc +; GFX8-NEXT: v_mov_b32_e32 v15, s14 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_mov_b32_e32 v16, s15 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_mov_b32_e32 v7, s4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_mov_b32_e32 v11, s8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX8-NEXT: v_mov_b32_e32 v12, s9 +; GFX8-NEXT: v_mov_b32_e32 v13, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, s12 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s14 +; GFX8-NEXT: v_mov_b32_e32 v9, s15 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s3 +; GFX7-NEXT: v_mov_b32_e32 
v5, s4 +; GFX7-NEXT: v_mov_b32_e32 v6, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, s7 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, s8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX7-NEXT: v_mov_b32_e32 v10, s9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GFX7-NEXT: v_mov_b32_e32 v11, s10 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX7-NEXT: v_mov_b32_e32 v12, s11 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; GFX7-NEXT: v_mov_b32_e32 v13, s12 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX7-NEXT: v_mov_b32_e32 v14, s13 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v14, vcc +; GFX7-NEXT: v_mov_b32_e32 v15, s14 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX7-NEXT: v_mov_b32_e32 v16, s15 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v16, vcc +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX7-NEXT: v_mov_b32_e32 v7, s4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, s5 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v9, s6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 +; GFX7-NEXT: v_mov_b32_e32 v10, s7 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX7-NEXT: v_mov_b32_e32 v11, s8 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 +; GFX7-NEXT: v_mov_b32_e32 v12, s9 +; GFX7-NEXT: v_mov_b32_e32 v13, s10 +; GFX7-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX7-NEXT: v_mov_b32_e32 v6, s12 +; GFX7-NEXT: v_mov_b32_e32 v7, s13 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2 +; GFX7-NEXT: v_mov_b32_e32 v8, s14 +; GFX7-NEXT: v_mov_b32_e32 v9, s15 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v8, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7-NEXT: v_readfirstlane_b32 s3, v3 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 %idx + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x 
i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 0 + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mov_b32 s2, s6 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 1 + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s8 +; GCN-NEXT: s_mov_b32 s1, s9 +; GCN-NEXT: s_mov_b32 s2, s10 +; GCN-NEXT: s_mov_b32 s3, s11 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 2 + ret i128 %element +} + +define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(<4 x i128> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i128_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s12 +; GCN-NEXT: s_mov_b32 s1, s13 +; GCN-NEXT: s_mov_b32 s2, s14 +; GCN-NEXT: s_mov_b32 s3, s15 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr + %element = extractelement <4 x i128> %vector, i32 3 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx0(<4 x i128> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 0 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx1(<4 x i128> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-NEXT: v_mov_b32_e32 v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, v6 +; GFX8-NEXT: v_mov_b32_e32 v3, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: v_mov_b32_e32 v2, v6 +; GFX7-NEXT: v_mov_b32_e32 v3, v7 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 1 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx2(<4 x i128> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v8 +; GFX8-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-NEXT: v_mov_b32_e32 v2, v10 +; GFX8-NEXT: v_mov_b32_e32 v3, v11 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-NEXT: v_mov_b32_e32 v2, v10 +; GFX7-NEXT: v_mov_b32_e32 v3, v11 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 2 + ret i128 %element +} + +define i128 @extractelement_vgpr_v4i128_idx3(<4 x i128> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-NEXT: v_mov_b32_e32 v3, v15 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v12 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v3, v15 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v12 +; GFX7-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v14 +; GFX7-NEXT: v_mov_b32_e32 v3, v15 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr + %element = extractelement <4 x i128> %vector, i32 3 + ret i128 %element +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll new file mode 100644 index 0000000000000..13d7fbeda0f6d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -0,0 +1,802 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_lshr_b32 s2, s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s4, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 4 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(<4 x i16> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshl_b32 s0, s1, 4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s1, 4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_lshl_b32 s0, s1, 4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(1)* 
%ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_vgpr_idx(<4 x i16> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(<4 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 0 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(<4 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 1 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(<4 x i16> 
addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 2 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(<4 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i16_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %element = extractelement <4 x i16> %vector, i32 3 + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_idx0(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 0 + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_idx1(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 1 + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_idx2(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX7: ; 
%bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 2 + ret i16 %element +} + +define i16 @extractelement_vgpr_v4i16_idx3(<4 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %element = extractelement <4 x i16> %vector, i32 3 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_lshr_b32 s5, s4, 1 +; GCN-NEXT: s_cmp_eq_u32 s5, 1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 2 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 3 +; GCN-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-NEXT: s_and_b32 s1, s4, 1 +; GCN-NEXT: s_lshl_b32 s1, s1, 4 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: s_lshl_b32 s0, s1, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: s_lshl_b32 s0, s1, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 
v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: s_lshl_b32 s0, s1, 4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_vgpr_idx(<8 x i16> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(<8 x i16> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx: 
+; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 %idx + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 0 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 1 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 2 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 3 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s2 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 4 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx5: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s2, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 5 + ret i16 %element +} + +define amdgpu_ps 
i16 @extractelement_sgpr_v8i16_idx6(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s0, s3 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 6 + ret i16 %element +} + +define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(<8 x i16> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i16_idx7: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s3, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %element = extractelement <8 x i16> %vector, i32 7 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx0(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 0 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx1(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 1 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx2(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 2 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx3(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 3 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx4(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 4 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx5(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 5 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx6(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 6 + ret i16 %element +} + +define i16 @extractelement_vgpr_v8i16_idx7(<8 x i16> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %element = extractelement <8 x i16> %vector, i32 7 + ret i16 %element +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll new file mode 100644 index 0000000000000..95b4177abbcab --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -0,0 +1,3135 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc 
-global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s5, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s5 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s5 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s5 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s5 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s4, 3 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX9-NEXT: s_lshl_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_and_b32 s0, s2, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: s_waitcnt 
vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: s_lshl_b32 s0, s1, 3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_vgpr_idx(<4 x i8> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v3, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: 
v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s1, s0, 8 +; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s3, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s3, s4 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s1, s0, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: s_and_b32 s1, s1, s4 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s3, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: 
s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 0 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshr_b32 s0, s0, 8 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 1 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 2 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(<4 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v4i8_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s1, s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s0, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s1, 8 +; GCN-NEXT: s_and_b32 s2, s2, s0 +; GCN-NEXT: s_lshr_b32 s3, s1, 16 +; GCN-NEXT: s_lshr_b32 s4, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s3, s0 +; GCN-NEXT: s_and_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshl_b32 s0, s0, 24 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_lshr_b32 s0, s0, 24 +; GCN-NEXT: ; return to shader part epilog + %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %element = extractelement <4 x i8> %vector, i32 3 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx0(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: 
s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 0 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx1(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: 
v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 1 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx2(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 2 + ret i8 %element +} + +define i8 @extractelement_vgpr_v4i8_idx3(<4 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %element = extractelement <4 x i8> %vector, i32 3 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s9, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s0, 8 +; GCN-NEXT: s_and_b32 s2, s2, s9 +; GCN-NEXT: s_lshr_b32 s3, s0, 16 +; GCN-NEXT: s_lshr_b32 s5, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s3, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s5, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshr_b32 s6, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s6, s9 +; GCN-NEXT: s_lshr_b32 s7, s1, 16 +; GCN-NEXT: s_lshr_b32 s8, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s7, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s8, s9 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_lshr_b32 s2, s4, 2 +; GCN-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s4, 3 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: s_lshr_b32 s3, s2, 2 +; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX9-NEXT: v_and_b32_sdwa v7, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v2 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_lshl_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: 
v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 2 +; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s1, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s2, 2 +; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: s_lshl_b32 s0, s2, 3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = 
extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v5 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v9, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 
+; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s8, 0xff +; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s2, s0, 8 +; GCN-NEXT: s_and_b32 s2, s2, s8 +; GCN-NEXT: s_lshr_b32 s3, s0, 16 +; GCN-NEXT: s_lshr_b32 s4, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s3, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s4, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_lshr_b32 s5, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s5, s8 +; GCN-NEXT: s_lshr_b32 s6, s1, 16 +; GCN-NEXT: s_lshr_b32 s7, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s6, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 16 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s2, s7, s8 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_or_b32 s1, s1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx0: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 0 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 8 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 1 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 2 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx3: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s1, s0, 8 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshr_b32 s2, s0, 16 +; GCN-NEXT: s_lshr_b32 s3, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 24 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 3 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(<8 x i8> addrspace(4)* inreg 
%ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 4 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx5: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 8 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 5 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx6: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 6 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(<8 x i8> addrspace(4)* inreg %ptr) { +; GCN-LABEL: extractelement_sgpr_v8i8_idx7: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s4, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s0, s1, 8 +; GCN-NEXT: s_and_b32 s0, s0, s4 +; GCN-NEXT: s_lshr_b32 s2, s1, 16 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s4 +; GCN-NEXT: s_lshl_b32 s0, s0, 8 +; GCN-NEXT: s_or_b32 s0, s1, s0 +; GCN-NEXT: s_and_b32 s1, s2, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 16 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s3, s4 +; GCN-NEXT: s_lshl_b32 s1, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_lshr_b32 s0, s0, 24 +; GCN-NEXT: ; return to shader part epilog + %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %element = extractelement <8 x i8> %vector, i32 7 + ret i8 %element +} + 
+define i8 @extractelement_vgpr_v8i8_idx0(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 0 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx1(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, 
v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 1 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx2(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 2 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx3(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], 
s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 3 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx4(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> 
%vector, i32 4 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx5(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 5 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx6(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: 
v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 6 + ret i8 %element +} + +define i8 @extractelement_vgpr_v8i8_idx7(<8 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, 
v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %element = extractelement <8 x i8> %vector, i32 7 + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { +; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s17, 0xff +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s5, s0, 8 +; GCN-NEXT: s_and_b32 s5, s5, s17 +; GCN-NEXT: s_lshr_b32 s6, s0, 16 +; GCN-NEXT: s_lshr_b32 s7, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_and_b32 s5, s6, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_and_b32 s5, s7, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshr_b32 s8, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s5 +; GCN-NEXT: s_and_b32 s5, s8, s17 +; GCN-NEXT: s_lshr_b32 s9, s1, 16 +; GCN-NEXT: s_lshr_b32 s10, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_and_b32 s5, s9, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_and_b32 s5, s10, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshr_b32 s11, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s5 +; GCN-NEXT: s_and_b32 s5, s11, s17 +; GCN-NEXT: s_lshr_b32 s12, s2, 16 +; GCN-NEXT: s_lshr_b32 s13, s2, 24 +; GCN-NEXT: s_and_b32 s2, s2, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_and_b32 s5, s12, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_and_b32 s5, s13, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_lshr_b32 s14, s3, 8 +; GCN-NEXT: s_or_b32 s2, s2, s5 +; GCN-NEXT: s_and_b32 s5, s14, s17 +; GCN-NEXT: s_lshr_b32 s15, s3, 16 +; GCN-NEXT: s_lshr_b32 s16, s3, 24 +; GCN-NEXT: s_and_b32 s3, s3, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_and_b32 s5, s15, 
s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 16 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_and_b32 s5, s16, s17 +; GCN-NEXT: s_lshl_b32 s5, s5, 24 +; GCN-NEXT: s_or_b32 s3, s3, s5 +; GCN-NEXT: s_lshr_b32 s5, s4, 2 +; GCN-NEXT: s_cmp_eq_u32 s5, 1 +; GCN-NEXT: s_cselect_b32 s0, s1, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 2 +; GCN-NEXT: s_cselect_b32 s0, s2, s0 +; GCN-NEXT: s_cmp_eq_u32 s5, 3 +; GCN-NEXT: s_cselect_b32 s0, s3, s0 +; GCN-NEXT: s_and_b32 s1, s4, 3 +; GCN-NEXT: s_lshl_b32 s1, s1, 3 +; GCN-NEXT: s_lshr_b32 s0, s0, s1 +; GCN-NEXT: ; return to shader part epilog + %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 inreg %idx) { +; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: s_lshr_b32 s3, s2, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v6 +; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v7 +; GFX9-NEXT: v_and_b32_sdwa v14, v2, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v2, s1, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v11 +; GFX9-NEXT: v_or3_b32 v1, v1, v12, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_sdwa v16, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v3, v4, v5 +; GFX9-NEXT: v_or3_b32 v2, v2, v14, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_or3_b32 v3, v3, v16, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: s_lshl_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: 
s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: s_lshr_b32 s0, s2, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v12, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_and_b32_sdwa v15, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX8-NEXT: v_and_b32_sdwa v17, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: s_lshl_b32 s0, s1, 3 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s2, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v11, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX7-NEXT: v_and_b32_e32 v11, v11, v4 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 +; GFX7-NEXT: v_and_b32_e32 v12, v12, v4 +; GFX7-NEXT: v_and_b32_e32 v14, v14, v4 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_and_b32_e32 v13, v13, v4 +; GFX7-NEXT: v_and_b32_e32 v15, v15, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX7-NEXT: v_and_b32_e32 v4, v16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 2 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: s_lshl_b32 s0, s2, 3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7-NEXT: ; return to shader part epilog + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 %idx) { +; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v12, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v4, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v3, v3, s5, v8 +; GFX9-NEXT: v_and_or_b32 v4, v4, s5, v9 +; GFX9-NEXT: v_and_b32_sdwa v16, v5, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v6, v0, v1 +; GFX9-NEXT: v_or3_b32 v1, v3, v12, v13 +; GFX9-NEXT: v_or3_b32 v3, v4, v14, v15 +; GFX9-NEXT: v_and_or_b32 v5, v5, s5, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_or3_b32 v4, v5, v16, v17 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_or3_b32 v0, v0, v18, v19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v7, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v14, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX8-NEXT: v_and_b32_sdwa v17, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: 
v_or_b32_sdwa v4, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v18, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX8-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_and_b32_sdwa v0, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_mov_b32_e32 v0, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v9, s4, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s4, v10 +; GFX7-NEXT: v_and_b32_e32 v12, v12, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 24, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v8, s4, v8 +; GFX7-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX7-NEXT: v_and_b32_e32 v13, v13, v0 +; GFX7-NEXT: v_and_b32_e32 v15, v15, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_and_b32_e32 v14, v14, v0 +; GFX7-NEXT: v_and_b32_e32 v16, v16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v6, v6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_and_b32_e32 v0, v17, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GFX7-NEXT: v_or_b32_e32 
v4, v4, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v15 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v16 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v14 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v18 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* inreg %ptr, i32 %idx) { +; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GCN-NEXT: s_movk_i32 s16, 0xff +; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b32 s4, s0, 8 +; GCN-NEXT: s_and_b32 s4, s4, s16 +; GCN-NEXT: s_lshr_b32 s5, s0, 16 +; GCN-NEXT: s_lshr_b32 s6, s0, 24 +; GCN-NEXT: s_and_b32 s0, s0, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_and_b32 s4, s5, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_and_b32 s4, s6, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshr_b32 s7, s1, 8 +; GCN-NEXT: s_or_b32 s0, s0, s4 +; GCN-NEXT: s_and_b32 s4, s7, s16 +; GCN-NEXT: s_lshr_b32 s8, s1, 16 +; GCN-NEXT: s_lshr_b32 s9, s1, 24 +; GCN-NEXT: s_and_b32 s1, s1, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s1, s1, s4 +; GCN-NEXT: s_and_b32 s4, s8, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s1, s1, s4 +; GCN-NEXT: s_and_b32 s4, s9, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshr_b32 s10, s2, 8 +; GCN-NEXT: s_or_b32 s1, s1, s4 +; GCN-NEXT: s_and_b32 s4, s10, s16 +; GCN-NEXT: s_lshr_b32 s11, s2, 16 +; GCN-NEXT: s_lshr_b32 s12, s2, 24 +; GCN-NEXT: s_and_b32 s2, s2, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_and_b32 s4, s11, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_and_b32 s4, s12, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: s_lshr_b32 s13, s3, 8 +; GCN-NEXT: s_or_b32 s2, s2, s4 +; GCN-NEXT: s_and_b32 s4, s13, s16 +; GCN-NEXT: s_lshr_b32 s14, s3, 16 +; GCN-NEXT: s_lshr_b32 s15, s3, 24 +; GCN-NEXT: s_and_b32 s3, s3, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_or_b32 s3, s3, s4 +; GCN-NEXT: s_and_b32 s4, s14, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 16 +; GCN-NEXT: s_or_b32 s3, s3, s4 +; GCN-NEXT: s_and_b32 s4, s15, s16 +; GCN-NEXT: s_lshl_b32 s4, s4, 24 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_or_b32 s3, s3, s4 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: ; return to shader part epilog + 
%vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %element = extractelement <16 x i8> %vector, i32 %idx + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx0(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 0 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx1(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, 
v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 1 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx2(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 2 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx3(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 3 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx4(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 
v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 4 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx5(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 5 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx6(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 6 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx7(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v2, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v1, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff 
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 7 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx8(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 8 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx9(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: 
v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 9 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx10(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 10 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx11(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v2, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v2, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 11 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx12(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX8-LABEL: extractelement_vgpr_v16i8_idx12: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx12: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 12 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx13(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; 
GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 13 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx14(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x i8> %vector, i32 14 + ret i8 %element +} + +define i8 @extractelement_vgpr_v16i8_idx15(<16 x i8> addrspace(1)* %ptr) { +; GFX9-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v1, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v3, s5, v0 +; GFX9-NEXT: v_and_b32_sdwa v2, v3, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %element = extractelement <16 x 
i8> %vector, i32 15 + ret i8 %element +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir index 1d3311673cf6c..b548ff5503435 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extract-vector-elt.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s --- name: extract_vector_elt_0_v2i32 @@ -228,8 +228,10 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_0_v2i16_i32 ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[DEF]](<2 x s16>) - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; CHECK: $vgpr0 = COPY [[COPY]](s32) %0:_(<2 x s16>) = G_IMPLICIT_DEF %1:_(s32) = G_CONSTANT i32 0 @@ -290,25 +292,155 @@ name: extract_vector_elt_v2s8_varidx_i32 body: | bb.0: - liveins: $vgpr0_vgpr1, $vgpr2 + liveins: $vgpr0, $vgpr1 ; CHECK-LABEL: name: extract_vector_elt_v2s8_varidx_i32 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[COPY]](<2 x s32>) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) - ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV]], 8 - ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV1]], 8 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[COPY1]](s32) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) - ; CHECK: $vgpr0 = COPY [[COPY3]](s32) - %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 - %1:_(s32) = COPY $vgpr2 - %2:_(<2 x s8>) = G_TRUNC %0 - %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 - %4:_(s32) = G_ANYEXT %3 - $vgpr0 = COPY %4 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<2 x s8>) = G_BITCAST %2 + %4:_(s8) = G_EXTRACT_VECTOR_ELT %3, %1 + %5:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 +... 
+ +--- +name: extract_vector_elt_v2s8_constidx_0_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v2s8_constidx_0_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BUILD_VECTOR]](<2 x s32>), 0 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EXTRACT]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<2 x s8>) = G_BITCAST %2 + %4:_(s32) = G_CONSTANT i32 0 + %5:_(s8) = G_EXTRACT_VECTOR_ELT %3, %4 + %6:_(s32) = G_ANYEXT %5 + $vgpr0 = COPY %6 +... + +--- +name: extract_vector_elt_v2s8_constidx_1_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v2s8_constidx_1_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY4]], 8 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32) + ; CHECK: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BUILD_VECTOR]](<2 x s32>), 32 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[EXTRACT]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<2 x s8>) = G_BITCAST %2 + %4:_(s32) = G_CONSTANT i32 1 + %5:_(s8) = G_EXTRACT_VECTOR_ELT %3, %4 + %6:_(s32) = G_ANYEXT %5 + $vgpr0 = COPY %6 +... 
+ +--- +name: extract_vector_elt_v4s4_varidx_i32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v4s4_varidx_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C4]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C5]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C6]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 4 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 4 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY8]], 4 + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY9]], 4 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK: $vgpr0 = COPY [[COPY10]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(<4 x s4>) = G_BITCAST %2 + %4:_(s4) = G_EXTRACT_VECTOR_ELT %3, %1 + %5:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 ... 
--- @@ -343,24 +475,559 @@ name: extract_vector_elt_v4s8_varidx_i32 body: | bb.0: - liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + liveins: $vgpr0, $vgpr1 ; CHECK-LABEL: name: extract_vector_elt_v4s8_varidx_i32 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<4 x s32>) - ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV]], 8 - ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV1]], 8 - ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV2]], 8 - ; CHECK: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 8 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32), [[SEXT_INREG3]](s32) - ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<4 x s32>), [[COPY1]](s32) - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) - ; CHECK: $vgpr0 = COPY [[COPY3]](s32) - %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - %1:_(s32) = COPY $vgpr4 - %2:_(<4 x s8>) = G_TRUNC %0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C4]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[SHL3]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<4 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v4s8_constidx_0_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_0_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C4]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C4]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C4]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C3]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v4s8_constidx_1_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_1_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 1 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v4s8_constidx_2_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_2_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v4s8_constidx_3_i32 + +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v4s8_constidx_3_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR2]], [[C2]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(<4 x s8>) = G_BITCAST %0 + %2:_(s32) = G_CONSTANT i32 3 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ + + +--- +name: extract_vector_elt_v8s8_varidx_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_varidx_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C3]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR5]](s16) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]] + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C5]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[LSHR6]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C6]] + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C6]](s32) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL6]](s32) + ; CHECK: 
[[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; CHECK: $vgpr0 = COPY [[COPY6]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... + + +--- +name: extract_vector_elt_v8s8_constidx_0_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_0_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C2]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C2]](s16) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 0 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_1_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_1_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 1 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_3_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_3_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C4]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 3 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_4_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_4_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C5]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 4 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_5_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_5_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 5 + %2:_(<8 x s8>) = G_BITCAST %0 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
+ +--- +name: extract_vector_elt_v8s8_constidx_7_i32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v8s8_constidx_7_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C4]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = G_CONSTANT i32 7 + %2:_(<8 x s8>) = G_BITCAST %0 %3:_(s8) = G_EXTRACT_VECTOR_ELT %2, %1 %4:_(s32) = G_ANYEXT %3 $vgpr0 = COPY %4 @@ -376,9 +1043,14 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_v2s16_varidx_i32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s16>), [[COPY1]](s32) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[SHL]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY2]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -395,8 +1067,10 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx0_i32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) ; CHECK: $vgpr0 = COPY 
[[COPY1]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 0 @@ -435,8 +1109,11 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_v2s16_idx2_i32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; CHECK: $vgpr0 = COPY [[DEF]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: $vgpr0 = COPY [[COPY1]](s32) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 2 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -579,9 +1256,16 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_v4s16_varidx_i32 ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK: [[EVEC:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[COPY1]](s32) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC]](s16) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<2 x s32>), [[LSHR]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: $vgpr0 = COPY [[COPY2]](s32) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s16) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -599,8 +1283,18 @@ body: | ; CHECK-LABEL: name: extract_vector_elt_v2s128_varidx_i32 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s128>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; CHECK: [[EVEC:%[0-9]+]]:_(s128) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s128>), [[COPY1]](s32) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[EVEC]](s128) + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s64>) = G_BITCAST [[COPY]](<2 x s128>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY1]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[C1]] + ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<4 x s64>), [[ADD]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[C2]] + ; CHECK: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<4 x s64>), [[ADD1]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[EVEC]](s64), [[EVEC1]](s64) + ; CHECK: [[BITCAST1:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<2 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST1]](s128) %0:_(<2 x s128>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 %1:_(s32) = COPY $vgpr8 %2:_(s128) = G_EXTRACT_VECTOR_ELT %0, %1 @@ -1146,3 +1840,335 @@ body: | %3:_(s32) = G_EXTRACT_VECTOR_ELT %2, %1 S_ENDPGM 0, implicit %3 ... 
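+# Variable-index extract from <32 x s1>: the expected output repacks the bits into a single
+# s32 and shifts the selected bit down by the masked index.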
+ +--- +name: extract_vector_elt_v32s1_varidx_i32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: extract_vector_elt_v32s1_varidx_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C4]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C5]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C6]](s32) + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C7]](s32) + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9 + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C8]](s32) + ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C9]](s32) + ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11 + ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C10]](s32) + ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C11]](s32) + ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C12]](s32) + ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C13]](s32) + ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C14]](s32) + ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C15]](s32) + ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 + ; CHECK: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C16]](s32) + ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 18 + ; CHECK: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C17]](s32) + ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 19 + ; CHECK: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C18]](s32) + ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C19]](s32) + ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 21 + ; CHECK: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C20]](s32) + ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 22 + ; CHECK: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C21]](s32) + ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; CHECK: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C22]](s32) + ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C23]](s32) + ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 + ; CHECK: [[LSHR24:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C24]](s32) + ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 26 + ; CHECK: [[LSHR25:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C25]](s32) + ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 27 + ; CHECK: [[LSHR26:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C26]](s32) + ; CHECK: 
[[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; CHECK: [[LSHR27:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C27]](s32) + ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 29 + ; CHECK: [[LSHR28:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C28]](s32) + ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK: [[LSHR29:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C29]](s32) + ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK: [[LSHR30:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C30]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C3]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR2]], [[SHL3]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C]] + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C5]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C]] + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C6]](s32) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL6]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C]] + ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C7]](s32) + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C]] + ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C8]](s32) + ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR9]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C]] + ; CHECK: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C9]](s32) + ; CHECK: [[OR9:%[0-9]+]]:_(s32) = G_OR [[OR8]], [[SHL9]] + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C]] + ; CHECK: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C10]](s32) + ; CHECK: [[OR10:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL10]] + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR11]](s32) + ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C]] + ; CHECK: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C11]](s32) + ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR10]], [[SHL11]] + ; CHECK: 
[[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32) + ; CHECK: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C]] + ; CHECK: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C12]](s32) + ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR11]], [[SHL12]] + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR13]](s32) + ; CHECK: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C]] + ; CHECK: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C13]](s32) + ; CHECK: [[OR13:%[0-9]+]]:_(s32) = G_OR [[OR12]], [[SHL13]] + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR14]](s32) + ; CHECK: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C]] + ; CHECK: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[AND15]], [[C14]](s32) + ; CHECK: [[OR14:%[0-9]+]]:_(s32) = G_OR [[OR13]], [[SHL14]] + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR15]](s32) + ; CHECK: [[AND16:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C]] + ; CHECK: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[AND16]], [[C15]](s32) + ; CHECK: [[OR15:%[0-9]+]]:_(s32) = G_OR [[OR14]], [[SHL15]] + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[LSHR16]](s32) + ; CHECK: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C]] + ; CHECK: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[AND17]], [[C16]](s32) + ; CHECK: [[OR16:%[0-9]+]]:_(s32) = G_OR [[OR15]], [[SHL16]] + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[LSHR17]](s32) + ; CHECK: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C]] + ; CHECK: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C17]](s32) + ; CHECK: [[OR17:%[0-9]+]]:_(s32) = G_OR [[OR16]], [[SHL17]] + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR18]](s32) + ; CHECK: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C]] + ; CHECK: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[AND19]], [[C18]](s32) + ; CHECK: [[OR18:%[0-9]+]]:_(s32) = G_OR [[OR17]], [[SHL18]] + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LSHR19]](s32) + ; CHECK: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C]] + ; CHECK: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[AND20]], [[C19]](s32) + ; CHECK: [[OR19:%[0-9]+]]:_(s32) = G_OR [[OR18]], [[SHL19]] + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[LSHR20]](s32) + ; CHECK: [[AND21:%[0-9]+]]:_(s32) = G_AND [[COPY23]], [[C]] + ; CHECK: [[SHL20:%[0-9]+]]:_(s32) = G_SHL [[AND21]], [[C20]](s32) + ; CHECK: [[OR20:%[0-9]+]]:_(s32) = G_OR [[OR19]], [[SHL20]] + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[LSHR21]](s32) + ; CHECK: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C]] + ; CHECK: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[AND22]], [[C21]](s32) + ; CHECK: [[OR21:%[0-9]+]]:_(s32) = G_OR [[OR20]], [[SHL21]] + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LSHR22]](s32) + ; CHECK: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C]] + ; CHECK: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[AND23]], [[C22]](s32) + ; CHECK: [[OR22:%[0-9]+]]:_(s32) = G_OR [[OR21]], [[SHL22]] + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR23]](s32) + ; CHECK: [[AND24:%[0-9]+]]:_(s32) = G_AND [[COPY26]], [[C]] + ; CHECK: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[AND24]], [[C23]](s32) + ; CHECK: [[OR23:%[0-9]+]]:_(s32) = G_OR [[OR22]], [[SHL23]] + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[LSHR24]](s32) + ; CHECK: [[AND25:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C]] + ; CHECK: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[AND25]], [[C24]](s32) + ; CHECK: [[OR24:%[0-9]+]]:_(s32) = G_OR [[OR23]], [[SHL24]] + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR25]](s32) + ; CHECK: [[AND26:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C]] + ; CHECK: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[AND26]], [[C25]](s32) + ; CHECK: [[OR25:%[0-9]+]]:_(s32) = G_OR [[OR24]], [[SHL25]] + ; CHECK: [[COPY29:%[0-9]+]]:_(s32) 
= COPY [[LSHR26]](s32) + ; CHECK: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C]] + ; CHECK: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[AND27]], [[C26]](s32) + ; CHECK: [[OR26:%[0-9]+]]:_(s32) = G_OR [[OR25]], [[SHL26]] + ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY [[LSHR27]](s32) + ; CHECK: [[AND28:%[0-9]+]]:_(s32) = G_AND [[COPY30]], [[C]] + ; CHECK: [[SHL27:%[0-9]+]]:_(s32) = G_SHL [[AND28]], [[C27]](s32) + ; CHECK: [[OR27:%[0-9]+]]:_(s32) = G_OR [[OR26]], [[SHL27]] + ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LSHR28]](s32) + ; CHECK: [[AND29:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C]] + ; CHECK: [[SHL28:%[0-9]+]]:_(s32) = G_SHL [[AND29]], [[C28]](s32) + ; CHECK: [[OR28:%[0-9]+]]:_(s32) = G_OR [[OR27]], [[SHL28]] + ; CHECK: [[COPY32:%[0-9]+]]:_(s32) = COPY [[LSHR29]](s32) + ; CHECK: [[AND30:%[0-9]+]]:_(s32) = G_AND [[COPY32]], [[C]] + ; CHECK: [[SHL29:%[0-9]+]]:_(s32) = G_SHL [[AND30]], [[C29]](s32) + ; CHECK: [[OR29:%[0-9]+]]:_(s32) = G_OR [[OR28]], [[SHL29]] + ; CHECK: [[COPY33:%[0-9]+]]:_(s32) = COPY [[LSHR30]](s32) + ; CHECK: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY33]], [[C]] + ; CHECK: [[SHL30:%[0-9]+]]:_(s32) = G_SHL [[AND31]], [[C30]](s32) + ; CHECK: [[OR30:%[0-9]+]]:_(s32) = G_OR [[OR29]], [[SHL30]] + ; CHECK: [[AND32:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C30]] + ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[SHL31:%[0-9]+]]:_(s32) = G_SHL [[AND32]], [[C31]](s32) + ; CHECK: [[LSHR31:%[0-9]+]]:_(s32) = G_LSHR [[OR30]], [[SHL31]](s32) + ; CHECK: [[COPY34:%[0-9]+]]:_(s32) = COPY [[LSHR31]](s32) + ; CHECK: $vgpr0 = COPY [[COPY34]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<32 x s1>) = G_BITCAST %0 + %3:_(s1) = G_EXTRACT_VECTOR_ELT %2, %1 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... + +--- +name: extract_vector_elt_v12s8_varidx_s32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3 + ; CHECK-LABEL: name: extract_vector_elt_v12s8_varidx_s32 + ; CHECK: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C2]](s32) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C1]](s32) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C2]](s32) + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C1]](s32) + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C2]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; 
CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C1]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C2]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[UV2]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR7]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C1]](s32) + ; CHECK: [[OR7:%[0-9]+]]:_(s32) = G_OR [[OR6]], [[SHL7]] + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C2]](s32) + ; CHECK: [[OR8:%[0-9]+]]:_(s32) = G_OR [[OR7]], [[SHL8]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C4]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[LSHR9]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C5]] + ; CHECK: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C5]](s32) + ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[EVEC]], [[SHL9]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; CHECK: $vgpr0 = COPY [[COPY14]](s32) + %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + %1:_(<12 x s8>) = G_BITCAST %0 + %2:_(s32) = COPY $vgpr3 + %3:_(s8) = G_EXTRACT_VECTOR_ELT %1, %2 + %4:_(s32) = G_ANYEXT %3 + $vgpr0 = COPY %4 +... 
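+# The <3 x s8> case is expected to widen to a <3 x s32> build_vector and index it directly,
+# rather than bitcasting and shifting within a 32-bit element.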
+ +--- +name: extract_vector_elt_v3s8_varidx_s32 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; CHECK-LABEL: name: extract_vector_elt_v3s8_varidx_s32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY5]], 8 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY6]], 8 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY7]], 8 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[SEXT_INREG]](s32), [[SEXT_INREG1]](s32), [[SEXT_INREG2]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<3 x s32>), [[COPY1]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK: $vgpr0 = COPY [[COPY8]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s24) = G_TRUNC %0 + %3:_(<3 x s8>) = G_BITCAST %2 + %4:_(s8) = G_EXTRACT_VECTOR_ELT %3, %1 + %5:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll index 86d7a2f4e4dbd..e566572763e57 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.dim.a16.ll @@ -16,8 +16,10 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -38,8 +40,10 @@ define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), 
[[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -67,12 +71,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -94,12 +100,14 @@ define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY 
[[LSHR]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -130,15 +138,18 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -163,15 +174,18 @@ define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_l ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: 
[[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -205,15 +219,18 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -238,15 +255,18 @@ define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) 
= COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -279,12 +299,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -306,12 +328,14 @@ define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: 
[[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1darray), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -342,15 +366,18 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -375,15 +402,18 @@ define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coo ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) 
= COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -417,15 +447,18 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -450,15 +483,18 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; 
GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -492,18 +528,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -527,18 +566,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), 
[[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -571,12 +613,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: 
(dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -598,12 +642,14 @@ define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.1d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) @@ -634,15 +680,18 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = 
G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -667,15 +716,18 @@ define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -709,18 +761,21 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: 
[[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -744,18 +799,21 @@ define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.3d), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -789,18 +847,21 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: 
[[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -824,18 +885,21 @@ define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], 
[[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.cube), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -869,15 +933,18 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -902,15 +969,18 @@ define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 
- ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -944,18 +1014,21 @@ define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -979,18 +1052,21 @@ define amdgpu_ps <4 x float> 
@load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.mip.2darray), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -1028,8 +1104,10 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d @@ -1050,8 +1128,10 @@ define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> 
%vdata, <2 x ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -1079,12 +1159,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 @@ -1106,12 +1188,14 @@ define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) 
= COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 @@ -1142,15 +1226,18 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1175,15 +1262,18 @@ define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: 
[[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1217,15 +1307,18 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1250,15 +1343,18 @@ define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; 
GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1291,12 +1387,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 @@ -1318,12 +1416,14 @@ define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 @@ -1354,15 +1454,18 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1387,15 +1490,18 @@ define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT 
i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1429,15 +1535,18 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1462,15 +1571,18 @@ define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; 
GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1504,18 +1616,21 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; 
GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -1539,18 +1654,21 @@ define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2darraymsaa), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -1583,12 +1701,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) 
= G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 @@ -1610,12 +1730,14 @@ define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 @@ -1646,15 +1768,18 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: 
[[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1679,15 +1804,18 @@ define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1721,18 +1849,21 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; 
GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -1756,18 +1887,21 @@ define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) 
+ ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.3d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -1801,18 +1935,21 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -1836,18 +1973,21 @@ define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: 
[[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.cube), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -1881,15 +2021,18 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: 
[[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1914,15 +2057,18 @@ define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[DEF]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -1956,18 +2102,21 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX9: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; 
GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -1991,18 +2140,21 @@ define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vda ; GFX10NSA: [[COPY13:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY13]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[COPY15]](s32) - ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX10NSA: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY16]](s32), [[COPY17]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.mip.2darray), [[BUILD_VECTOR1]](<4 x s32>), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 
0, 0, 3 :: (dereferenceable store 16 into custom "TargetCustom8") @@ -2030,8 +2182,10 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2052,8 +2206,10 @@ define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2081,8 +2237,10 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2103,8 +2261,10 @@ define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x 
s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2132,8 +2292,10 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2154,8 +2316,10 @@ define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %co ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.3d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2183,8 +2347,10 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2205,8 +2371,10 @@ define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.cube), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2234,8 +2402,10 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2256,8 +2426,10 @@ define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR 
[[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.1darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2285,8 +2457,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2307,8 +2481,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16 ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darray), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2336,8 +2512,10 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, 
[[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2358,8 +2536,10 @@ define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2dmsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2387,8 +2567,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2409,8 +2591,10 @@ define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.getresinfo.2darraymsaa), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2438,8 +2622,10 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) { ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8") ; GFX9: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX9: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -2456,8 +2642,10 @@ define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) { ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 8, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 4 from custom "TargetCustom8") ; GFX10NSA: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10NSA: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -2481,8 +2669,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2501,8 +2691,10 @@ define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coord ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR 
[[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<2 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 9, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable load 8 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<2 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2529,8 +2721,10 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16 ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_V1 @@ -2547,8 +2741,10 @@ define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[COPY8]](s32), 2, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 4 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2574,8 +2770,10 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: 
G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_V2 @@ -2594,8 +2792,10 @@ define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 ; GFX10NSA: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY10]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<2 x s32>), 12, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 0, 3 :: (dereferenceable store 8 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2618,8 +2818,10 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2640,8 +2842,10 @@ define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES 
[[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2669,8 +2873,10 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2691,8 +2897,10 @@ define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2720,8 +2928,10 @@ define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX9: $vgpr0 = COPY [[UV]](s32) @@ -2742,8 +2952,10 @@ define 
amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> % ; GFX10NSA: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>) ; GFX10NSA: $vgpr0 = COPY [[UV]](s32) @@ -2776,8 +2988,10 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_glc @@ -2798,8 +3012,10 @@ define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 1, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2827,8 +3043,10 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), 
[[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_slc @@ -2849,8 +3067,10 @@ define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, < ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 2, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2878,8 +3098,10 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat ; GFX9: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX9: S_ENDPGM 0 ; GFX10NSA-LABEL: name: store_1d_glc_slc @@ -2900,8 +3122,10 @@ define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdat ; GFX10NSA: [[COPY12:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY12]](<2 x s16>) - ; 
GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.1d), [[BUILD_VECTOR1]](<4 x s32>), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 0, 3, 3 :: (dereferenceable store 16 into custom "TargetCustom8") ; GFX10NSA: S_ENDPGM 0 main_body: @@ -2970,8 +3194,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX9: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -2994,8 +3220,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10NSA: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.1d), 15, [[TRUNC]](s16), [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) ; GFX10NSA: G_STORE [[UV4]](s32), [[DEF]](p1) :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) @@ -3028,12 +3256,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: 
[[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) @@ -3057,12 +3287,14 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY8:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10NSA: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2d), 15, [[BUILD_VECTOR_TRUNC]](<2 x s16>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") ; GFX10NSA: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AMDGPU_INTRIN_IMAGE_LOAD]](<5 x s32>) @@ -3098,15 +3330,18 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -3133,15 +3368,18 @@ define amdgpu_ps <4 x float> @load_3d_tfe(<8 x i32> inreg %rsrc, <2 x i16> %coor ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) ; GFX10NSA: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) @@ -3180,18 +3418,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i ; GFX9: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX9: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: 
[[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX9: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") @@ -3217,18 +3458,21 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, <2 x i ; GFX10NSA: [[COPY9:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX10NSA: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; GFX10NSA: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10NSA: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) + ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX10NSA: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY8]](<2 x s16>) - ; GFX10NSA: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10NSA: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10NSA: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) ; GFX10NSA: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) + ; GFX10NSA: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) ; GFX10NSA: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY9]](<2 x s16>) - ; GFX10NSA: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C1]](s32) + ; GFX10NSA: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10NSA: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) - ; GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) - ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; 
GFX10NSA: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX10NSA: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) ; GFX10NSA: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) ; GFX10NSA: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>) ; GFX10NSA: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(<5 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.2darraymsaa), 15, [[CONCAT_VECTORS]](<4 x s16>), $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 1, 0, 3 :: (dereferenceable load 16 from custom "TargetCustom8") diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir index 9f614bef378ed..692078edbe65f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.mir @@ -217,16 +217,18 @@ body: | ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -250,15 +252,17 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; CHECK: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; CHECK: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; CHECK: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; CHECK: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir index d4de328b679a7..d4b80ce4a5721 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-shuffle-vector.s16.mir @@ -45,13 +45,15 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) @@ -80,14 +82,16 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_0 @@ -115,16 +119,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 
- ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -154,15 +160,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -191,15 +199,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], 
[[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -267,15 +278,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -304,13 +318,15 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C1]], [[C2]](s32) + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) @@ -339,14 +355,16 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C2]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[C]], [[SHL]] + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY2]], [[SHL]] ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST1]](<2 x s16>) ; GFX9-LABEL: name: shufflevector_v2s16_v2s16_undef_2 @@ -374,16 +392,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -413,15 +433,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY 
[[BITCAST2]](<2 x s16>) @@ -561,15 +583,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -577,10 +602,13 @@ body: | ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -602,15 +630,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C]] - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = 
G_AND [[COPY3]], [[C1]] + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C2]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -640,15 +671,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -657,11 +690,13 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -683,16 +718,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], 
[[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -700,12 +737,14 @@ body: | ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -728,15 +767,17 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -745,11 +786,13 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; 
GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -771,16 +814,18 @@ body: | ; GFX8: liveins: $vgpr0, $vgpr1 ; GFX8: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX8: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX8: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] - ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) - ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) @@ -788,12 +833,14 @@ body: | ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) - ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 @@ -933,24 +980,38 @@ body: | ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX8: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16 - ; GFX8: 
[[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT1]](s16) - ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] - ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0 + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C1]](s32) + ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 0 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32) + ; GFX8: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C1]](s32) + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; GFX9-LABEL: name: shufflevector_v2s16_v4s16_1_0 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 0 - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C1]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 0 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 @@ -971,24 +1032,36 @@ body: | ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX8: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16 - ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 48 - ; GFX8: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT]](s16) - ; GFX8: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[EXTRACT1]](s16) + ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x 
s16>) + ; GFX8: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0 ; GFX8: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) - ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] - ; GFX8: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) - ; GFX8: $vgpr0 = COPY [[BITCAST]](<2 x s16>) + ; GFX8: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C]](s32) + ; GFX8: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX8: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 32 + ; GFX8: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32) + ; GFX8: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX8: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; GFX8: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX8: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; GFX8: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; GFX8: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; GFX8: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) + ; GFX8: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) ; GFX9-LABEL: name: shufflevector_v2s16_v4s16_1_3 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9: [[EXTRACT:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 16 - ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s16) = G_EXTRACT [[COPY]](<4 x s16>), 48 - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT]](s16) - ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EXTRACT1]](s16) - ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[ANYEXT]](s32), [[ANYEXT1]](s32) + ; GFX9: [[BITCAST:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX9: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST]](<2 x s32>), 0 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(<2 x s32>) = G_BITCAST [[COPY]](<4 x s16>) + ; GFX9: [[EXTRACT1:%[0-9]+]]:_(s32) = G_EXTRACT [[BITCAST1]](<2 x s32>), 32 + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[EXTRACT1]], [[C]](s32) + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY2]](s32), [[COPY3]](s32) ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3