From 0fcff69bcb3d589e1feef4cd4902a2b48b6c7435 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 23 Mar 2022 09:06:47 -0400 Subject: [PATCH] [InstCombine] try to narrow shifted bswap-of-zext (2nd try) The first attempt at this missed a validity check. This version includes a test of the narrow source type for modulo-16-bits. Original commit message: This is the IR counterpart to 370ebc9d9a573d6 which provided a bswap narrowing fix for issue #53867. Here we can be more general (although I'm not sure yet what would happen for illegal types in codegen - too rare to worry about?): https://alive2.llvm.org/ce/z/3-CPfo This will be more effective if we have moved the shift after the bswap as proposed in D122010, but it is independent of that patch. Differential Revision: https://reviews.llvm.org/D122166 --- .../InstCombine/InstCombineShifts.cpp | 16 +++++++++ llvm/test/Transforms/InstCombine/lshr.ll | 33 ++++++++++--------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 03214118a2cf7..b38ee71e7bf67 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -1173,6 +1173,22 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) { MulC->logBase2() == ShAmtC) return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *MulC - 2)); + // Try to narrow a bswap: + // (bswap (zext X)) >> C --> zext (bswap X >> C') + // In the case where the shift amount equals the bitwidth difference, the + // shift is eliminated. + if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::bswap>( + m_OneUse(m_ZExt(m_Value(X))))))) { + // TODO: If the shift amount is less than the zext, we could shift left. 
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits(); + unsigned WidthDiff = BitWidth - SrcWidth; + if (SrcWidth % 16 == 0 && ShAmtC >= WidthDiff) { + Value *NarrowSwap = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, X); + Value *NewShift = Builder.CreateLShr(NarrowSwap, ShAmtC - WidthDiff); + return new ZExtInst(NewShift, Ty); + } + } + // If the shifted-out value is known-zero, then this is an exact shift. if (!I.isExact() && MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmtC), 0, &I)) { diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll index 34a5facceafd5..2db2c09911769 100644 --- a/llvm/test/Transforms/InstCombine/lshr.ll +++ b/llvm/test/Transforms/InstCombine/lshr.ll @@ -831,9 +831,8 @@ define i1 @icmp_sge(i32 %x, i32 %y) { define i32 @narrow_bswap(i16 %x) { ; CHECK-LABEL: @narrow_bswap( -; CHECK-NEXT: [[Z:%.*]] = zext i16 [[X:%.*]] to i32 -; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.bswap.i32(i32 [[Z]]) -; CHECK-NEXT: [[S:%.*]] = lshr exact i32 [[B]], 16 +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[X:%.*]]) +; CHECK-NEXT: [[S:%.*]] = zext i16 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[S]] ; %z = zext i16 %x to i32 @@ -844,9 +843,8 @@ define i32 @narrow_bswap(i16 %x) { define i128 @narrow_bswap_extra_wide(i16 %x) { ; CHECK-LABEL: @narrow_bswap_extra_wide( -; CHECK-NEXT: [[Z:%.*]] = zext i16 [[X:%.*]] to i128 -; CHECK-NEXT: [[B:%.*]] = call i128 @llvm.bswap.i128(i128 [[Z]]) -; CHECK-NEXT: [[S:%.*]] = lshr exact i128 [[B]], 112 +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.bswap.i16(i16 [[X:%.*]]) +; CHECK-NEXT: [[S:%.*]] = zext i16 [[TMP1]] to i128 ; CHECK-NEXT: ret i128 [[S]] ; %z = zext i16 %x to i128 @@ -855,6 +853,8 @@ define i128 @narrow_bswap_extra_wide(i16 %x) { ret i128 %s } +; TODO: The bswap can be narrowed followed by shl. 
+ define i32 @narrow_bswap_undershift(i16 %x) { ; CHECK-LABEL: @narrow_bswap_undershift( ; CHECK-NEXT: [[Z:%.*]] = zext i16 [[X:%.*]] to i32 @@ -870,9 +870,8 @@ define i32 @narrow_bswap_undershift(i16 %x) { define <2 x i64> @narrow_bswap_splat(<2 x i16> %x) { ; CHECK-LABEL: @narrow_bswap_splat( -; CHECK-NEXT: [[Z:%.*]] = zext <2 x i16> [[X:%.*]] to <2 x i64> -; CHECK-NEXT: [[B:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[Z]]) -; CHECK-NEXT: [[S:%.*]] = lshr exact <2 x i64> [[B]], <i64 48, i64 48> +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> [[X:%.*]]) +; CHECK-NEXT: [[S:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[S]] ; %z = zext <2 x i16> %x to <2 x i64> @@ -881,6 +880,8 @@ define <2 x i64> @narrow_bswap_splat(<2 x i16> %x) { ret <2 x i64> %s } +; TODO: poison/undef in the shift amount is ok to propagate. + define <2 x i64> @narrow_bswap_splat_poison_elt(<2 x i16> %x) { ; CHECK-LABEL: @narrow_bswap_splat_poison_elt( ; CHECK-NEXT: [[Z:%.*]] = zext <2 x i16> [[X:%.*]] to <2 x i64> @@ -896,9 +897,9 @@ define <2 x i64> @narrow_bswap_splat_poison_elt(<2 x i16> %x) { define <2 x i64> @narrow_bswap_overshift(<2 x i32> %x) { ; CHECK-LABEL: @narrow_bswap_overshift( -; CHECK-NEXT: [[Z:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64> -; CHECK-NEXT: [[B:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[Z]]) -; CHECK-NEXT: [[S:%.*]] = lshr <2 x i64> [[B]], +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> [[X:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i32> [[TMP1]], +; CHECK-NEXT: [[S:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[S]] ; %z = zext <2 x i32> %x to <2 x i64> @@ -909,9 +910,9 @@ define <2 x i64> @narrow_bswap_overshift(<2 x i32> %x) { define i128 @narrow_bswap_overshift2(i96 %x) { ; CHECK-LABEL: @narrow_bswap_overshift2( -; CHECK-NEXT: [[Z:%.*]] = zext i96 [[X:%.*]] to i128 -; CHECK-NEXT: [[B:%.*]] = call i128 @llvm.bswap.i128(i128 [[Z]]) -; CHECK-NEXT: 
[[S:%.*]] = lshr i128 [[B]], 61 +; CHECK-NEXT: [[TMP1:%.*]] = call i96 @llvm.bswap.i96(i96 [[X:%.*]]) +; CHECK-NEXT: [[TMP2:%.*]] = lshr i96 [[TMP1]], 29 +; CHECK-NEXT: [[S:%.*]] = zext i96 [[TMP2]] to i128 ; CHECK-NEXT: ret i128 [[S]] ; %z = zext i96 %x to i128 @@ -920,6 +921,8 @@ define i128 @narrow_bswap_overshift2(i96 %x) { ret i128 %s } +; negative test - can't make a bswap with an odd number of bytes + define i32 @not_narrow_bswap(i24 %x) { ; CHECK-LABEL: @not_narrow_bswap( ; CHECK-NEXT: [[Z:%.*]] = zext i24 [[X:%.*]] to i32