diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 44059fcba73a4c..4937b48acdec6a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2763,6 +2763,30 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
       if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
         return new BitCastInst(InsElt->getOperand(1), DestTy);
     }
+
+    // Convert an artificial vector insert into more analyzable bitwise logic.
+    unsigned BitWidth = DestTy->getScalarSizeInBits();
+    Value *X, *Y;
+    uint64_t IndexC;
+    if (match(Src, m_OneUse(m_InsertElt(m_OneUse(m_BitCast(m_Value(X))),
+                                        m_Value(Y), m_ConstantInt(IndexC)))) &&
+        DestTy->isIntegerTy() && X->getType() == DestTy &&
+        isDesirableIntType(BitWidth)) {
+      // Adjust for big endian - the LSBs are at the high index.
+      if (DL.isBigEndian())
+        IndexC = SrcVTy->getNumElements() - 1 - IndexC;
+
+      // We only handle (endian-normalized) insert to index 0. Any other insert
+      // would require a left-shift, so that is an extra instruction.
+      if (IndexC == 0) {
+        // bitcast (inselt (bitcast X), Y, 0) --> or (and X, MaskC), (zext Y)
+        unsigned EltWidth = Y->getType()->getScalarSizeInBits();
+        APInt MaskC = APInt::getHighBitsSet(BitWidth, BitWidth - EltWidth);
+        Value *AndX = Builder.CreateAnd(X, MaskC);
+        Value *ZextY = Builder.CreateZExt(Y, DestTy);
+        return BinaryOperator::CreateOr(AndX, ZextY);
+      }
+    }
   }
 
   if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
index 2c3c59fdc68b6b..464a438f286f7c 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-inselt-bitcast.ll
@@ -1,15 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S -data-layout="E-n64" | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -instcombine -S -data-layout="e-n64" | FileCheck %s --check-prefixes=ALL
+; RUN: opt < %s -instcombine -S -data-layout="E-n64" | FileCheck %s --check-prefixes=ALL,BE
+; RUN: opt < %s -instcombine -S -data-layout="e-n64" | FileCheck %s --check-prefixes=ALL,LE
 
 declare void @use(<2 x i8>)
 
+; i16 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i16 @insert0_v2i8(i16 %x, i8 %y) {
-; ALL-LABEL: @insert0_v2i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
-; ALL-NEXT:    ret i16 [[R]]
+; BE-LABEL: @insert0_v2i8(
+; BE-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
+; BE-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
+; BE-NEXT:    ret i16 [[R]]
+;
+; LE-LABEL: @insert0_v2i8(
+; LE-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], -256
+; LE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i16
+; LE-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i16 [[R]]
 ;
   %v = bitcast i16 %x to <2 x i8>
   %i = insertelement <2 x i8> %v, i8 %y, i8 0
@@ -17,12 +26,21 @@ define i16 @insert0_v2i8(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; i16 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i16 @insert1_v2i8(i16 %x, i8 %y) {
-; ALL-LABEL: @insert1_v2i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 1
-; ALL-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
-; ALL-NEXT:    ret i16 [[R]]
+; BE-LABEL: @insert1_v2i8(
+; BE-NEXT:    [[TMP1:%.*]] = and i16 [[X:%.*]], -256
+; BE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i16
+; BE-NEXT:    [[R:%.*]] = or i16 [[TMP1]], [[TMP2]]
+; BE-NEXT:    ret i16 [[R]]
+;
+; LE-LABEL: @insert1_v2i8(
+; LE-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
+; LE-NEXT:    [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[Y:%.*]], i8 1
+; LE-NEXT:    [[R:%.*]] = bitcast <2 x i8> [[I]] to i16
+; LE-NEXT:    ret i16 [[R]]
 ;
   %v = bitcast i16 %x to <2 x i8>
   %i = insertelement <2 x i8> %v, i8 %y, i8 1
@@ -30,12 +48,21 @@ define i16 @insert1_v2i8(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; i32 is a common type, so we can convert independently of the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i32 @insert0_v4i8(i32 %x, i8 %y) {
-; ALL-LABEL: @insert0_v4i8(
-; ALL-NEXT:    [[V:%.*]] = bitcast i32 [[X:%.*]] to <4 x i8>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i8> [[V]], i8 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[I]] to i32
-; ALL-NEXT:    ret i32 [[R]]
+; BE-LABEL: @insert0_v4i8(
+; BE-NEXT:    [[V:%.*]] = bitcast i32 [[X:%.*]] to <4 x i8>
+; BE-NEXT:    [[I:%.*]] = insertelement <4 x i8> [[V]], i8 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[I]] to i32
+; BE-NEXT:    ret i32 [[R]]
+;
+; LE-LABEL: @insert0_v4i8(
+; LE-NEXT:    [[TMP1:%.*]] = and i32 [[X:%.*]], -256
+; LE-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
+; LE-NEXT:    [[R:%.*]] = or i32 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i32 [[R]]
 ;
   %v = bitcast i32 %x to <4 x i8>
   %i = insertelement <4 x i8> %v, i8 %y, i8 0
@@ -43,12 +70,21 @@ define i32 @insert0_v4i8(i32 %x, i8 %y) {
   ret i32 %r
 }
 
+; i64 is a legal type, so we can convert based on the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i64 @insert0_v4i16(i64 %x, i16 %y) {
-; ALL-LABEL: @insert0_v4i16(
-; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 0
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
-; ALL-NEXT:    ret i64 [[R]]
+; BE-LABEL: @insert0_v4i16(
+; BE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
+; BE-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 0
+; BE-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
+; BE-NEXT:    ret i64 [[R]]
+;
+; LE-LABEL: @insert0_v4i16(
+; LE-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], -65536
+; LE-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i64
+; LE-NEXT:    [[R:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; LE-NEXT:    ret i64 [[R]]
 ;
   %v = bitcast i64 %x to <4 x i16>
   %i = insertelement <4 x i16> %v, i16 %y, i8 0
@@ -56,6 +92,8 @@ define i64 @insert0_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; Negative test - shifts needed for both endians.
+
 define i64 @insert1_v4i16(i64 %x, i16 %y) {
 ; ALL-LABEL: @insert1_v4i16(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
@@ -69,12 +107,21 @@ define i64 @insert1_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; i64 is a legal type, so we can convert based on the data layout.
+; Endian determines if a shift is needed (and so the transform is avoided).
+
 define i64 @insert3_v4i16(i64 %x, i16 %y) {
-; ALL-LABEL: @insert3_v4i16(
-; ALL-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; ALL-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 3
-; ALL-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
-; ALL-NEXT:    ret i64 [[R]]
+; BE-LABEL: @insert3_v4i16(
+; BE-NEXT:    [[TMP1:%.*]] = and i64 [[X:%.*]], -65536
+; BE-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i64
+; BE-NEXT:    [[R:%.*]] = or i64 [[TMP1]], [[TMP2]]
+; BE-NEXT:    ret i64 [[R]]
+;
+; LE-LABEL: @insert3_v4i16(
+; LE-NEXT:    [[V:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
+; LE-NEXT:    [[I:%.*]] = insertelement <4 x i16> [[V]], i16 [[Y:%.*]], i8 3
+; LE-NEXT:    [[R:%.*]] = bitcast <4 x i16> [[I]] to i64
+; LE-NEXT:    ret i64 [[R]]
 ;
   %v = bitcast i64 %x to <4 x i16>
   %i = insertelement <4 x i16> %v, i16 %y, i8 3
@@ -82,6 +129,8 @@ define i64 @insert3_v4i16(i64 %x, i16 %y) {
   ret i64 %r
 }
 
+; Negative test - i128 is not a legal type, so we do not convert based on the data layout.
+
 define i128 @insert0_v4i32(i128 %x, i32 %y) {
 ; ALL-LABEL: @insert0_v4i32(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i128 [[X:%.*]] to <4 x i32>
@@ -95,6 +144,8 @@ define i128 @insert0_v4i32(i128 %x, i32 %y) {
   ret i128 %r
 }
 
+; Negative test - extra use requires more instructions.
+
 define i16 @insert0_v2i8_use1(i16 %x, i8 %y) {
 ; ALL-LABEL: @insert0_v2i8_use1(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>
@@ -110,6 +161,8 @@ define i16 @insert0_v2i8_use1(i16 %x, i8 %y) {
   ret i16 %r
 }
 
+; Negative test - extra use requires more instructions.
+
 define i16 @insert0_v2i8_use2(i16 %x, i8 %y) {
 ; ALL-LABEL: @insert0_v2i8_use2(
 ; ALL-NEXT:    [[V:%.*]] = bitcast i16 [[X:%.*]] to <2 x i8>