diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 46af9bf5eed003..1910ebca4004cc 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2699,7 +2699,8 @@ Instruction *InstCombinerImpl::matchBSwapOrBitReverse(Instruction &I, } /// Match UB-safe variants of the funnel shift intrinsic. -static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) { +static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC, + const DominatorTree &DT) { // TODO: Can we reduce the code duplication between this and the related // rotate matching code under visitSelect and visitTrunc? unsigned Width = Or.getType()->getScalarSizeInBits(); @@ -2804,6 +2805,64 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) { return nullptr; FShiftArgs = {ShVal0, ShVal1, ShAmt}; + } else if (isa<ZExtInst>(Or0) || isa<ZExtInst>(Or1)) { + // If there are two 'or' instructions concat variables in opposite order: + // + // Slot1 and Slot2 are all zero bits. + // | Slot1 | Low | Slot2 | High | + // LowHigh = or (shl (zext Low), ZextLowShlAmt), (zext High) + // | Slot2 | High | Slot1 | Low | + // HighLow = or (shl (zext High), ZextHighShlAmt), (zext Low) + // + // the latter 'or' can be safely converted to + // -> HighLow = fshl LowHigh, LowHigh, ZextHighShlAmt + // if ZextLowShlAmt + ZextHighShlAmt == Width. 
+ if (!isa<ZExtInst>(Or1)) + std::swap(Or0, Or1); + + Value *High, *ZextHigh, *Low; + const APInt *ZextHighShlAmt; + if (!match(Or0, + m_OneUse(m_Shl(m_Value(ZextHigh), m_APInt(ZextHighShlAmt))))) + return nullptr; + + if (!match(Or1, m_ZExt(m_Value(Low))) || + !match(ZextHigh, m_ZExt(m_Value(High)))) + return nullptr; + + unsigned HighSize = High->getType()->getScalarSizeInBits(); + unsigned LowSize = Low->getType()->getScalarSizeInBits(); + // Make sure High does not overlap with Low and most significant bits of + // High aren't shifted out. + if (ZextHighShlAmt->ult(LowSize) || ZextHighShlAmt->ugt(Width - HighSize)) + return nullptr; + + for (User *U : ZextHigh->users()) { + Value *X, *Y; + if (!match(U, m_Or(m_Value(X), m_Value(Y)))) + continue; + + if (!isa<ZExtInst>(Y)) + std::swap(X, Y); + + const APInt *ZextLowShlAmt; + if (!match(X, m_Shl(m_Specific(Or1), m_APInt(ZextLowShlAmt))) || + !match(Y, m_Specific(ZextHigh)) || !DT.dominates(U, &Or)) + continue; + + // HighLow is a good concat. If the sum of the two shift amounts equals + // Width, LowHigh must also be a good concat. + if (*ZextLowShlAmt + *ZextHighShlAmt != Width) + continue; + + // Low must not overlap with High and most significant bits of Low must + // not be shifted out. 
+ assert(ZextLowShlAmt->uge(HighSize) && + ZextLowShlAmt->ule(Width - LowSize) && "Invalid concat"); + + FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)}; + break; + } } if (FShiftArgs.empty()) @@ -3305,7 +3364,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { /*MatchBitReversals*/ true)) return BitOp; - if (Instruction *Funnel = matchFunnelShift(I, *this)) + if (Instruction *Funnel = matchFunnelShift(I, *this, DT)) return Funnel; if (Instruction *Concat = matchOrConcat(I, Builder)) diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll index 60ce49a1635623..dd8cb2d153fdac 100644 --- a/llvm/test/Transforms/InstCombine/funnel.ll +++ b/llvm/test/Transforms/InstCombine/funnel.ll @@ -354,6 +354,150 @@ define <2 x i64> @fshl_select_vector(<2 x i64> %x, <2 x i64> %y, <2 x i64> %sham ret <2 x i64> %r } +; Convert 'or concat' to fshl if opposite 'or concat' exists. + +define i32 @fshl_concat_i8_i24(i8 %x, i24 %y, ptr %addr) { +; CHECK-LABEL: @fshl_concat_i8_i24( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 24 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i24 [[Y:%.*]] to i32 +; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]] +; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 8) +; CHECK-NEXT: ret i32 [[YX]] +; + %zext.x = zext i8 %x to i32 + %slx = shl i32 %zext.x, 24 + %zext.y = zext i24 %y to i32 + %xy = or i32 %zext.y, %slx + store i32 %xy, ptr %addr, align 4 + %sly = shl i32 %zext.y, 8 + %yx = or i32 %zext.x, %sly + ret i32 %yx +} + +define i32 @fshl_concat_i8_i8(i8 %x, i8 %y, ptr %addr) { +; CHECK-LABEL: @fshl_concat_i8_i8( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 13 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32 +; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], 
[[ZEXT_Y]] +; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 19) +; CHECK-NEXT: ret i32 [[YX]] +; + %zext.x = zext i8 %x to i32 + %slx = shl i32 %zext.x, 13 + %zext.y = zext i8 %y to i32 + %xy = or i32 %zext.y, %slx + store i32 %xy, ptr %addr, align 4 + %sly = shl i32 %zext.y, 19 + %yx = or i32 %zext.x, %sly + ret i32 %yx +} + +define i32 @fshl_concat_i8_i8_overlap(i8 %x, i8 %y, ptr %addr) { +; CHECK-LABEL: @fshl_concat_i8_i8_overlap( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X]], 25 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32 +; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]] +; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 7 +; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]] +; CHECK-NEXT: ret i32 [[YX]] +; + ; Test sly overlap. + %zext.x = zext i8 %x to i32 + %slx = shl i32 %zext.x, 25 + %zext.y = zext i8 %y to i32 + %xy = or i32 %zext.y, %slx + store i32 %xy, ptr %addr, align 4 + %sly = shl i32 %zext.y, 7 + %yx = or i32 %zext.x, %sly + ret i32 %yx +} + +define i32 @fshl_concat_i8_i8_drop(i8 %x, i8 %y, ptr %addr) { +; CHECK-LABEL: @fshl_concat_i8_i8_drop( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 7 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32 +; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]] +; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 25 +; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]] +; CHECK-NEXT: ret i32 [[YX]] +; + ; Test sly drop. 
+ %zext.x = zext i8 %x to i32 + %slx = shl i32 %zext.x, 7 + %zext.y = zext i8 %y to i32 + %xy = or i32 %zext.y, %slx + store i32 %xy, ptr %addr, align 4 + %sly = shl i32 %zext.y, 25 + %yx = or i32 %zext.x, %sly + ret i32 %yx +} + +define i32 @fshl_concat_i8_i8_different_slot(i8 %x, i8 %y, ptr %addr) { +; CHECK-LABEL: @fshl_concat_i8_i8_different_slot( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 9 +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32 +; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]] +; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 22 +; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]] +; CHECK-NEXT: ret i32 [[YX]] +; + %zext.x = zext i8 %x to i32 + %slx = shl i32 %zext.x, 9 + %zext.y = zext i8 %y to i32 + %xy = or i32 %zext.y, %slx + store i32 %xy, ptr %addr, align 4 + %sly = shl i32 %zext.y, 22 + %yx = or i32 %zext.x, %sly + ret i32 %yx +} + +define i32 @fshl_concat_unknown_source(i32 %zext.x, i32 %zext.y, ptr %addr) { +; CHECK-LABEL: @fshl_concat_unknown_source( +; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X:%.*]], 16 +; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y:%.*]] +; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 16 +; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]] +; CHECK-NEXT: ret i32 [[YX]] +; + %slx = shl i32 %zext.x, 16 + %xy = or i32 %zext.y, %slx + store i32 %xy, ptr %addr, align 4 + %sly = shl i32 %zext.y, 16 + %yx = or i32 %zext.x, %sly + ret i32 %yx +} + +define <2 x i32> @fshl_concat_vector(<2 x i8> %x, <2 x i24> %y, ptr %addr) { +; CHECK-LABEL: @fshl_concat_vector( +; CHECK-NEXT: [[ZEXT_X:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[SLX:%.*]] = shl nuw <2 x i32> [[ZEXT_X]], <i32 24, i32 24> +; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext <2 x i24> [[Y:%.*]] to <2 x i32> +; CHECK-NEXT: [[XY:%.*]] = or <2 x i32> [[SLX]], 
[[ZEXT_Y]] +; CHECK-NEXT: store <2 x i32> [[XY]], ptr [[ADDR:%.*]], align 4 +; CHECK-NEXT: [[YX:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[XY]], <2 x i32> [[XY]], <2 x i32> <i32 8, i32 8>) +; CHECK-NEXT: ret <2 x i32> [[YX]] +; + %zext.x = zext <2 x i8> %x to <2 x i32> + %slx = shl <2 x i32> %zext.x, <i32 24, i32 24> + %zext.y = zext <2 x i24> %y to <2 x i32> + %xy = or <2 x i32> %slx, %zext.y + store <2 x i32> %xy, ptr %addr, align 4 + %sly = shl <2 x i32> %zext.y, <i32 8, i32 8> + %yx = or <2 x i32> %sly, %zext.x + ret <2 x i32> %yx +} + ; Negative test - an oversized shift in the narrow type would produce the wrong value. define i8 @unmasked_shlop_unmasked_shift_amount(i32 %x, i32 %y, i32 %shamt) {