diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index df8c35ffd4575..b27edb1e9e14b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -719,6 +719,18 @@ def FeatureFlatAtomicFaddF32Inst "Has flat_atomic_add_f32 instruction" >; +def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero", + "HasDefaultComponentZero", + "true", + "BUFFER/IMAGE store instructions set unspecified components to zero" +>; + +def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast", + "HasDefaultComponentBroadcast", + "true", + "BUFFER/IMAGE store instructions set unspecified components to x component" +>; + def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support", "SupportsSRAMECC", "true", @@ -1003,7 +1015,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts, - FeatureGDS, FeatureGWS + FeatureGDS, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1014,7 +1026,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess, - FeatureImageInsts, FeatureGDS, FeatureGWS + FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1029,7 +1041,8 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, - FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS + 
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero ] >; @@ -1047,7 +1060,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS + FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1067,7 +1080,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, - FeatureGDS, FeatureGWS + FeatureGDS, FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1087,7 +1100,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS + FeatureGWS, FeatureDefaultComponentZero ] >; @@ -1107,7 +1120,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts + FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast ] >; @@ -2013,6 +2026,13 @@ def HasFlatAtomicFaddF32Inst : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; +def HasDefaultComponentZero + : Predicate<"Subtarget->hasDefaultComponentZero()">, + AssemblerPredicate<(all_of FeatureDefaultComponentZero)>; +def HasDefaultComponentBroadcast + : Predicate<"Subtarget->hasDefaultComponentBroadcast()">, + AssemblerPredicate<(all_of FeatureDefaultComponentBroadcast)>; + def HasDsSrc2Insts : 
Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 2bb7b6bd0674a..898289019c718 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, return DemandedElts; } +// Trim elements from the end of the vector \p V, if they are +// equal to the first element of the vector. +static APInt defaultComponentBroadcast(Value *V) { + auto *VTy = cast<FixedVectorType>(V->getType()); + unsigned VWidth = VTy->getNumElements(); + APInt DemandedElts = APInt::getAllOnes(VWidth); + Value *FirstComponent = findScalarElement(V, 0); + + SmallVector<int> ShuffleMask; + if (auto *SVI = dyn_cast<ShuffleVectorInst>(V)) + SVI->getShuffleMask(ShuffleMask); + + for (int I = VWidth - 1; I > 0; --I) { + if (ShuffleMask.empty()) { + auto *Elt = findScalarElement(V, I); + if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt))) + break; + } else { + // Detect identical elements in the shufflevector result, even though + // findScalarElement cannot tell us what that element is. 
+ if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem) + break; + } + DemandedElts.clearBit(I); + } + + return DemandedElts; +} + static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, @@ -1140,8 +1169,13 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (!isa<FixedVectorType>(II.getArgOperand(0)->getType())) break; - APInt DemandedElts = - trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); + APInt DemandedElts; + if (ST->hasDefaultComponentBroadcast()) + DemandedElts = defaultComponentBroadcast(II.getArgOperand(0)); + else if (ST->hasDefaultComponentZero()) + DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); + else + break; int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1; if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx, diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 85d062a9a6f5e..070d165cdaadb 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -165,6 +165,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasAtomicCSubNoRtnInsts = false; bool HasAtomicGlobalPkAddBF16Inst = false; bool HasFlatAtomicFaddF32Inst = false; + bool HasDefaultComponentZero = false; + bool HasDefaultComponentBroadcast = false; bool SupportsSRAMECC = false; // This should not be used directly. 
'TargetID' tracks the dynamic settings @@ -802,6 +804,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } + bool hasDefaultComponentZero() const { return HasDefaultComponentZero; } + + bool hasDefaultComponentBroadcast() const { + return HasDefaultComponentBroadcast; + } + bool hasNoSdstCMPX() const { return HasNoSdstCMPX; } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll index f2d904cce7f00..9cef4a3c7cc0f 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll @@ -3,6 +3,7 @@ ; RUN: opt -mcpu=gfx1010 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s ; RUN: opt -mcpu=gfx1100 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s ; RUN: opt -mcpu=gfx1200 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GFX12 %s +; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GFXUNKNOWN %s define amdgpu_ps void @image_store_1d_store_all_zeros(<8 x i32> inreg %rsrc, i32 %s) #0 { ; GCN-LABEL: @image_store_1d_store_all_zeros( @@ -12,6 +13,10 @@ define amdgpu_ps void @image_store_1d_store_all_zeros(<8 x i32> inreg %rsrc, i32 ; GFX12-LABEL: @image_store_1d_store_all_zeros( ; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.f32.i32(float 0.000000e+00, i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @image_store_1d_store_all_zeros( +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFXUNKNOWN-NEXT: ret void ; call void 
@llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret void @@ -23,8 +28,14 @@ define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg ; GCN-NEXT: ret void ; ; GFX12-LABEL: @image_store_1d_store_insert_zeros_at_end( -; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @image_store_1d_store_insert_zeros_at_end( +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFXUNKNOWN-NEXT: ret void ; %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1 @@ -46,6 +57,12 @@ define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> in ; GFX12-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2 ; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> [[TMP2]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @image_store_mip_1d_store_insert_zeros_at_end( +; GFXUNKNOWN-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 1 +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[NEWVDATA4]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFXUNKNOWN-NEXT: 
ret void ; %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1 @@ -63,10 +80,16 @@ define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg % ; GCN-NEXT: ret void ; ; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end( -; GFX12-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0 -; GFX12-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer -; GFX12-NEXT: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) +; GFX12-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1 +; GFX12-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @buffer_store_format_insert_zeros_at_end( +; GFXUNKNOWN-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false) +; GFXUNKNOWN-NEXT: ret void ; %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1 @@ -84,10 +107,16 @@ define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg % ; GCN-NEXT: ret void ; ; GFX12-LABEL: @struct_buffer_store_format_insert_zeros( -; GFX12-NEXT: [[TMP1:%.*]] = insertelement <3 x float> , float [[VDATA1:%.*]], i64 0 -; GFX12-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2 -; GFX12-NEXT: 
call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0) +; GFX12-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 2 +; GFX12-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @struct_buffer_store_format_insert_zeros( +; GFXUNKNOWN-NEXT: [[TMP1:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 2 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0) +; GFXUNKNOWN-NEXT: ret void ; %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1 @@ -107,6 +136,11 @@ define amdgpu_ps void @struct_tbuffer_store_insert_zeros_at_beginning(<4 x i32> ; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 3 ; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @struct_tbuffer_store_insert_zeros_at_beginning( +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 3 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GFXUNKNOWN-NEXT: ret void ; %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1 @@ -126,6 +160,11 @@ define amdgpu_ps void 
@struct_tbuffer_store_insert_undefs(<4 x i32> inreg %a, fl ; GFX12-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[VDATA1:%.*]], i64 0 ; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @struct_tbuffer_store_insert_undefs( +; GFXUNKNOWN-NEXT: [[NEWVDATA2:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GFXUNKNOWN-NEXT: ret void ; %newvdata1 = insertelement <4 x float> poison, float %vdata1, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float 1.0, i32 1 @@ -140,9 +179,14 @@ define amdgpu_ps void @image_store_1d_store_shufflevector_same(<8 x i32> inreg % ; GCN-NEXT: ret void ; ; GFX12-LABEL: @image_store_1d_store_shufflevector_same( -; GFX12-NEXT: [[DATA:%.*]] = shufflevector <4 x float> [[VDATA1:%.*]], <4 x float> poison, <4 x i32> zeroinitializer -; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[DATA]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFX12-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[VDATA1:%.*]], i64 0 +; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.f32.i32(float [[TMP1]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @image_store_1d_store_shufflevector_same( +; GFXUNKNOWN-NEXT: [[DATA:%.*]] = shufflevector <4 x float> [[VDATA1:%.*]], <4 x float> poison, <4 x i32> zeroinitializer +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[DATA]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFXUNKNOWN-NEXT: ret void ; %data = shufflevector <4 x float> %vdata1, <4 x float> poison, <4 x i32> call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> 
%data, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -155,8 +199,12 @@ define amdgpu_ps void @image_store_1d_store_shufflevector(<8 x i32> inreg %rsrc, ; GCN-NEXT: ret void ; ; GFX12-LABEL: @image_store_1d_store_shufflevector( -; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> , i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFX12-NEXT: call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> , i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @image_store_1d_store_shufflevector( +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> , i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; GFXUNKNOWN-NEXT: ret void ; %data = shufflevector <4 x float> , <4 x float> poison, <4 x i32> call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %data, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -172,11 +220,16 @@ define amdgpu_ps void @struct_buffer_store_format_insert_first_at_end(<4 x i32> ; GCN-NEXT: ret void ; ; GFX12-LABEL: @struct_buffer_store_format_insert_first_at_end( -; GFX12-NEXT: [[NEWVDATA2:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 -; GFX12-NEXT: [[NEWVDATA3:%.*]] = insertelement <4 x float> [[NEWVDATA2]], float [[VDATA1]], i64 2 -; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3 -; GFX12-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0) +; GFX12-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @struct_buffer_store_format_insert_first_at_end( +; GFXUNKNOWN-NEXT: [[NEWVDATA2:%.*]] = insertelement <4 x float> , float 
[[VDATA1:%.*]], i64 0 +; GFXUNKNOWN-NEXT: [[NEWVDATA3:%.*]] = insertelement <4 x float> [[NEWVDATA2]], float [[VDATA1]], i64 2 +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0) +; GFXUNKNOWN-NEXT: ret void ; %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1 @@ -194,10 +247,15 @@ define amdgpu_ps void @struct_tbuffer_store_insert(<4 x i32> inreg %a, float %vd ; GCN-NEXT: ret void ; ; GFX12-LABEL: @struct_tbuffer_store_insert( -; GFX12-NEXT: [[NEWVDATA3:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 -; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3 -; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GFX12-NEXT: [[TMP1:%.*]] = insertelement <3 x float> , float [[VDATA1:%.*]], i64 0 +; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v3f32(<3 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @struct_tbuffer_store_insert( +; GFXUNKNOWN-NEXT: [[NEWVDATA3:%.*]] = insertelement <4 x float> , float [[VDATA1:%.*]], i64 0 +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[NEWVDATA3]], float [[VDATA1]], i64 3 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GFXUNKNOWN-NEXT: ret void ; %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 %newvdata2 = insertelement <4 x float> %newvdata1, float 1.0, i32 1 @@ -215,6 +273,10 @@ define amdgpu_ps void 
@struct_tbuffer_store_argument(<4 x i32> inreg %a, <4 x fl ; GFX12-LABEL: @struct_tbuffer_store_argument( ; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[VDATA4:%.*]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @struct_tbuffer_store_argument( +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[VDATA4:%.*]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GFXUNKNOWN-NEXT: ret void ; call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15) ret void @@ -230,6 +292,11 @@ define amdgpu_ps void @struct_tbuffer_store_argument_insert_first(<4 x i32> inre ; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[VDATA4:%.*]], float [[VDATA1:%.*]], i64 0 ; GFX12-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) ; GFX12-NEXT: ret void +; +; GFXUNKNOWN-LABEL: @struct_tbuffer_store_argument_insert_first( +; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[VDATA4:%.*]], float [[VDATA1:%.*]], i64 0 +; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GFXUNKNOWN-NEXT: ret void ; %newvdata4 = insertelement <4 x float> %vdata4, float %vdata1, i32 0 call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)