diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ebe740f884ea6..02d40d38d307a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1118,9 +1118,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   }
   case Intrinsic::amdgcn_permlane64:
   case Intrinsic::amdgcn_readfirstlane:
-  case Intrinsic::amdgcn_readlane: {
-    // If the first argument is uniform these intrinsics return it unchanged.
-    const Use &Src = II.getArgOperandUse(0);
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_ds_bpermute: {
+    // If the data argument is uniform these intrinsics return it unchanged.
+    unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
+    const Use &Src = II.getArgOperandUse(SrcIdx);
     if (isTriviallyUniform(Src))
       return IC.replaceInstUsesWith(II, Src.get());
 
@@ -1129,7 +1131,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       return &II;
 
     // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
-    if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
+    if (auto *BC = dyn_cast<BitCastInst>(Src);
+        BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
       Value *BCSrc = BC->getOperand(0);
 
       // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
@@ -1152,6 +1155,22 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     }
 
+    // If the lane argument of bpermute is uniform, change it to readlane. This
+    // generates better code and can enable further optimizations because
+    // readlane is AlwaysUniform.
+    if (IID == Intrinsic::amdgcn_ds_bpermute) {
+      const Use &Lane = II.getArgOperandUse(0);
+      if (isTriviallyUniform(Lane)) {
+        Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
+        Function *NewDecl = Intrinsic::getOrInsertDeclaration(
+            II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
+        II.setCalledFunction(NewDecl);
+        II.setOperand(0, Src);
+        II.setOperand(1, NewLane);
+        return &II;
+      }
+    }
+
     return std::nullopt;
   }
   case Intrinsic::amdgcn_writelane: {
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index 3c190efca7acf..843b436aa1b0f 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -6583,3 +6583,42 @@ define i32 @prng_poison_i32() {
   %prng = call i32 @llvm.amdgcn.prng.b32(i32 poison)
   ret i32 %prng
 }
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.ds.bpermute
+; --------------------------------------------------------------------
+
+define amdgpu_kernel void @ds_bpermute_uniform_src(ptr addrspace(1) %out, i32 %lane) {
+; CHECK-LABEL: @ds_bpermute_uniform_src(
+; CHECK-NEXT:    store i32 7, ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane, i32 7)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_constant_lane(ptr addrspace(1) %out, i32 %src) {
+; CHECK-LABEL: @ds_bpermute_constant_lane(
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC:%.*]], i32 7)
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %v = call i32 @llvm.amdgcn.ds.bpermute(i32 28, i32 %src)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @ds_bpermute_uniform_lane(ptr addrspace(1) %out, i32 %lanearg, i32 %src) {
+; CHECK-LABEL: @ds_bpermute_uniform_lane(
+; CHECK-NEXT:    [[LANE:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[LANEARG:%.*]])
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 [[LANE]], 2
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[SRC:%.*]], i32 [[TMP1]])
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %lane = call i32 @llvm.amdgcn.readfirstlane(i32 %lanearg)
+  %v = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane, i32 %src)
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
index e458fbd712370..02f50228339b1 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/bitcast-fold-lane-ops.ll
@@ -311,3 +311,15 @@ define i32 @test_bitcast_f32_to_i32_readlane_convergencetoken(float %val, i32 in
   %result = call i32 @llvm.amdgcn.readlane.i32(i32 %bitcast, i32 %lane.index) [ "convergencectrl"(token %t) ]
   ret i32 %result
 }
+
+define i32 @test_bitcast_f32_to_i32_ds_bpermute(float %val, i32 %addr) {
+; CHECK-LABEL: define i32 @test_bitcast_f32_to_i32_ds_bpermute(
+; CHECK-SAME: float [[VAL:%.*]], i32 [[ADDR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float [[VAL]] to i32
+; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[ADDR]], i32 [[BITCAST]])
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+  %bitcast = bitcast float %val to i32
+  %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %addr, i32 %bitcast)
+  ret i32 %result
+}
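
Note on the ds_bpermute -> readlane rewrite: ds_bpermute_b32 selects its
source lane from a byte address (each lane's dword sits at lane_id * 4),
while readlane takes the lane index directly, which is why the combine
inserts a lshr-by-2. A minimal before/after sketch in IR, mirroring the
@ds_bpermute_uniform_lane test above (value names are illustrative only):

  ; before: %lane is uniform, but ds_bpermute still routes through LDS hardware
  %lane = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %lanearg)
  %v = call i32 @llvm.amdgcn.ds.bpermute(i32 %lane, i32 %src)

  ; after: the byte address is converted to a lane index for readlane,
  ; whose result is known uniform (AlwaysUniform)
  %idx = lshr i32 %lane, 2
  %v = call i32 @llvm.amdgcn.readlane.i32(i32 %src, i32 %idx)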