diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 47926734d64d4..d6ca517944600 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1737,6 +1737,27 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { NewII->takeName(&II); return IC.replaceInstUsesWith(II, NewII); } + case Intrinsic::amdgcn_tensor_load_to_lds: + case Intrinsic::amdgcn_tensor_store_from_lds: { + Value *D2 = II.getArgOperand(2); + Value *D3 = II.getArgOperand(3); + // We know that not passing the second and third tensor DMA groups is + // equivalent to passing zeroes for those registers, so we rewrite to the + // shorter form here. Undef or poison are replaced by 0. + auto Pred = m_CombineOr(m_Zero(), m_Undef()); + if (!match(D2, Pred) || !match(D3, Pred)) + return std::nullopt; + + auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds + ? Intrinsic::amdgcn_tensor_load_to_lds_d2 + : Intrinsic::amdgcn_tensor_store_from_lds_d2; + CallInst *NewII = IC.Builder.CreateIntrinsic( + ShortIntrinsic, + {II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)}); + NewII->takeName(&II); + NewII->copyMetadata(II); + return IC.eraseInstFromFunction(II); + } } if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll b/llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll new file mode 100644 index 0000000000000..bd19e7d47b320 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s + +; -------------------------------------------------------------------- +; tensor_load_to_lds: D2 and D3 are zero/poison -> convert to _d2 variant +; -------------------------------------------------------------------- + +define void @test_tensor_load_to_lds_d2_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_d3_zero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 0) + ret void +} + +define void @test_tensor_load_to_lds_d2_d3_poison(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_d3_poison( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> poison, <4 x i32> poison, i32 0) + ret void +} + +define void @test_tensor_load_to_lds_d2_zero_d3_poison(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_zero_d3_poison( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void 
@llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> poison, i32 0) + ret void +} + +define void @test_tensor_load_to_lds_d2_poison_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_poison_d3_zero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> poison, <4 x i32> zeroinitializer, i32 0) + ret void +} + +; -------------------------------------------------------------------- +; non-matching patterns for tensor_load_to_lds simplification +; -------------------------------------------------------------------- + +define void @test_tensor_load_to_lds_d2_zero_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d3) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_zero_d3_nonzero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D3:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> zeroinitializer, <4 x i32> [[D3]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> %d3, i32 0) + ret void +} + +define void @test_tensor_load_to_lds_d2_nonzero_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_nonzero_d3_zero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> zeroinitializer, i32 0) + ret void +} + +define void @test_tensor_load_to_lds_d2_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2, <4 x i32> inreg %d3) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_d3_nonzero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]], <4 x i32> inreg [[D3:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> [[D3]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> %d3, i32 0) + ret void +} + +; -------------------------------------------------------------------- +; tensor_store_from_lds: D2 and D3 are zero/poison -> convert to _d2 variant +; -------------------------------------------------------------------- + +define void @test_tensor_store_from_lds_d2_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_d3_zero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 0) + ret void +} + +define void @test_tensor_store_from_lds_d2_d3_poison(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; 
CHECK-LABEL: define void @test_tensor_store_from_lds_d2_d3_poison( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> poison, <4 x i32> poison, i32 0) + ret void +} + +define void @test_tensor_store_from_lds_d2_zero_d3_poison(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_zero_d3_poison( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> poison, i32 0) + ret void +} + +define void @test_tensor_store_from_lds_d2_poison_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_poison_d3_zero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> poison, <4 x i32> zeroinitializer, i32 0) + ret void +} + +; -------------------------------------------------------------------- +; non-matching patterns for tensor_store_from_lds simplification +; -------------------------------------------------------------------- + +define void @test_tensor_store_from_lds_d2_zero_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d3) { +; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_zero_d3_nonzero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D3:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> zeroinitializer, <4 x i32> [[D3]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> %d3, i32 0) + ret void +} + +define void @test_tensor_store_from_lds_d2_nonzero_d3_zero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2) { +; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_nonzero_d3_zero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> zeroinitializer, i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> zeroinitializer, i32 0) + ret void +} + +define void @test_tensor_store_from_lds_d2_d3_nonzero(<4 x i32> inreg %d0, <8 x i32> inreg %d1, <4 x i32> inreg %d2, <4 x i32> inreg %d3) { +; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_d3_nonzero( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]], <4 x i32> inreg [[D2:%.*]], <4 x i32> inreg [[D3:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[D0]], <8 x i32> [[D1]], <4 x i32> [[D2]], <4 x i32> [[D3]], i32 0) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> %d2, <4 x i32> %d3, i32 0) + ret void +} + 
+; -------------------------------------------------------------------- +; ensure cachepolicy is preserved +; -------------------------------------------------------------------- + +define void @test_tensor_load_to_lds_d2_d3_zero_cachepolicy(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_load_to_lds_d2_d3_zero_cachepolicy( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1) + ret void +} + +define void @test_tensor_store_from_lds_d2_d3_zero_cachepolicy(<4 x i32> inreg %d0, <8 x i32> inreg %d1) { +; CHECK-LABEL: define void @test_tensor_store_from_lds_d2_d3_zero_cachepolicy( +; CHECK-SAME: <4 x i32> inreg [[D0:%.*]], <8 x i32> inreg [[D1:%.*]]) { +; CHECK-NEXT: call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[D0]], <8 x i32> [[D1]], i32 1) +; CHECK-NEXT: ret void +; + call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 1) + ret void +} + +declare void @llvm.amdgcn.tensor.load.to.lds(<4 x i32>, <8 x i32>, <4 x i32>, <4 x i32>, i32 immarg) +declare void @llvm.amdgcn.tensor.store.from.lds(<4 x i32>, <8 x i32>, <4 x i32>, <4 x i32>, i32 immarg)
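
A minimal before/after sketch of the fold this patch adds, distilled from the tests above (no names or operands beyond what the test file already uses): when both tensor DMA groups D2 and D3 are zeroinitializer or poison, InstCombine rewrites the five-operand call

  call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %d0, <8 x i32> %d1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, i32 0)

into the shorter _d2 form, dropping D2/D3 and preserving the trailing cachepolicy immediate:

  call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> %d0, <8 x i32> %d1, i32 0)

@llvm.amdgcn.tensor.store.from.lds is rewritten to @llvm.amdgcn.tensor.store.from.lds.d2 in the same way; calls where either D2 or D3 is a non-zero, non-poison value are left untouched, as covered by the non-matching test cases above.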