Skip to content

Commit

Permalink
[InstCombine] Convert or concat to fshl if opposite or concat exists (#…
Browse files Browse the repository at this point in the history
…68502)

If two 'or' instructions concatenate the same variables in opposite order
and the first 'or' dominates the second one, the second 'or' can be
rewritten as an fshl that rotates the result of the first 'or'. This can
eliminate an shl and expose more optimization opportunities for
bswap/bitreverse.
  • Loading branch information
HaohaiWen committed Nov 20, 2023
1 parent 3494c55 commit 95d584c
Show file tree
Hide file tree
Showing 2 changed files with 205 additions and 2 deletions.
63 changes: 61 additions & 2 deletions llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2699,7 +2699,8 @@ Instruction *InstCombinerImpl::matchBSwapOrBitReverse(Instruction &I,
}

/// Match UB-safe variants of the funnel shift intrinsic.
static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC,
const DominatorTree &DT) {
// TODO: Can we reduce the code duplication between this and the related
// rotate matching code under visitSelect and visitTrunc?
unsigned Width = Or.getType()->getScalarSizeInBits();
Expand Down Expand Up @@ -2804,6 +2805,64 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
return nullptr;

FShiftArgs = {ShVal0, ShVal1, ShAmt};
} else if (isa<ZExtInst>(Or0) || isa<ZExtInst>(Or1)) {
// If there are two 'or' instructions that concat variables in opposite order:
//
// Slot1 and Slot2 are all zero bits.
// | Slot1 | Low | Slot2 | High |
// LowHigh = or (shl (zext Low), ZextLowShlAmt), (zext High)
// | Slot2 | High | Slot1 | Low |
// HighLow = or (shl (zext High), ZextHighShlAmt), (zext Low)
//
// the latter 'or' can be safely converted to
// -> HighLow = fshl LowHigh, LowHigh, ZextHighShlAmt
// if ZextLowShlAmt + ZextHighShlAmt == Width.
if (!isa<ZExtInst>(Or1))
std::swap(Or0, Or1);

Value *High, *ZextHigh, *Low;
const APInt *ZextHighShlAmt;
if (!match(Or0,
m_OneUse(m_Shl(m_Value(ZextHigh), m_APInt(ZextHighShlAmt)))))
return nullptr;

if (!match(Or1, m_ZExt(m_Value(Low))) ||
!match(ZextHigh, m_ZExt(m_Value(High))))
return nullptr;

unsigned HighSize = High->getType()->getScalarSizeInBits();
unsigned LowSize = Low->getType()->getScalarSizeInBits();
// Make sure High does not overlap with Low and most significant bits of
// High aren't shifted out.
if (ZextHighShlAmt->ult(LowSize) || ZextHighShlAmt->ugt(Width - HighSize))
return nullptr;

for (User *U : ZextHigh->users()) {
Value *X, *Y;
if (!match(U, m_Or(m_Value(X), m_Value(Y))))
continue;

if (!isa<ZExtInst>(Y))
std::swap(X, Y);

const APInt *ZextLowShlAmt;
if (!match(X, m_Shl(m_Specific(Or1), m_APInt(ZextLowShlAmt))) ||
!match(Y, m_Specific(ZextHigh)) || !DT.dominates(U, &Or))
continue;

// HighLow is a good concat. If the sum of the two shift amounts equals
// Width, LowHigh must also be a good concat.
if (*ZextLowShlAmt + *ZextHighShlAmt != Width)
continue;

// Low must not overlap with High and most significant bits of Low must
// not be shifted out.
assert(ZextLowShlAmt->uge(HighSize) &&
ZextLowShlAmt->ule(Width - LowSize) && "Invalid concat");

FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)};
break;
}
}

if (FShiftArgs.empty())
Expand Down Expand Up @@ -3305,7 +3364,7 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
/*MatchBitReversals*/ true))
return BitOp;

if (Instruction *Funnel = matchFunnelShift(I, *this))
if (Instruction *Funnel = matchFunnelShift(I, *this, DT))
return Funnel;

if (Instruction *Concat = matchOrConcat(I, Builder))
Expand Down
144 changes: 144 additions & 0 deletions llvm/test/Transforms/InstCombine/funnel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,150 @@ define <2 x i64> @fshl_select_vector(<2 x i64> %x, <2 x i64> %y, <2 x i64> %sham
ret <2 x i64> %r
}

; Convert 'or concat' to fshl if opposite 'or concat' exists.

; Positive test: %xy places the zero-extended i8 %x (shifted left by 24) above
; the zero-extended i24 %y, and %yx combines the same two zexts in the opposite
; order (%y shifted left by 8, %x in the low bits). %xy is defined before %yx
; in the same block and is also stored to %addr. The shift amounts 24 and 8 sum
; to the 32-bit width, and the CHECK lines expect %yx to become
; fshl(%xy, %xy, 8).
define i32 @fshl_concat_i8_i24(i8 %x, i24 %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_i8_i24(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SLX:%.*]] = shl nuw i32 [[ZEXT_X]], 24
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i24 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 8)
; CHECK-NEXT: ret i32 [[YX]]
;
%zext.x = zext i8 %x to i32
%slx = shl i32 %zext.x, 24
%zext.y = zext i24 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
%sly = shl i32 %zext.y, 8
%yx = or i32 %zext.x, %sly
ret i32 %yx
}

; Positive test with zero-filled gaps: both sources are i8, so the two 8-bit
; fields do not fill the 32-bit result. %xy holds %x shifted left by 13 and %y
; in the low bits; %yx holds %y shifted left by 19 and %x in the low bits.
; The shift amounts 13 and 19 still sum to 32, and the CHECK lines expect %yx
; to become fshl(%xy, %xy, 19).
define i32 @fshl_concat_i8_i8(i8 %x, i8 %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_i8_i8(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 13
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[YX:%.*]] = call i32 @llvm.fshl.i32(i32 [[XY]], i32 [[XY]], i32 19)
; CHECK-NEXT: ret i32 [[YX]]
;
%zext.x = zext i8 %x to i32
%slx = shl i32 %zext.x, 13
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
%sly = shl i32 %zext.y, 19
%yx = or i32 %zext.x, %sly
ret i32 %yx
}

; Negative test: the shift amounts 25 and 7 sum to 32, but in %yx the fields
; overlap, so the CHECK lines expect the shl/or sequence to be kept and no
; fshl to be formed.
define i32 @fshl_concat_i8_i8_overlap(i8 %x, i8 %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_i8_i8_overlap(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X]], 25
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 7
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
; CHECK-NEXT: ret i32 [[YX]]
;
; Test sly overlap: the 8-bit %y shifted left by only 7 lands in bits 7-14 and
; collides with the 8-bit %x in bits 0-7, so %yx is not a clean concat.
%zext.x = zext i8 %x to i32
%slx = shl i32 %zext.x, 25
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
%sly = shl i32 %zext.y, 7
%yx = or i32 %zext.x, %sly
ret i32 %yx
}

; Negative test: the shift amounts 7 and 25 sum to 32, but in %yx the shift
; discards a bit of %y, so the CHECK lines expect the shl/or sequence to be
; kept and no fshl to be formed.
define i32 @fshl_concat_i8_i8_drop(i8 %x, i8 %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_i8_i8_drop(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 7
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 25
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
; CHECK-NEXT: ret i32 [[YX]]
;
; Test sly drop: shifting the 8-bit %y left by 25 would need bits 25-32, so its
; most significant bit is shifted out of the 32-bit value.
%zext.x = zext i8 %x to i32
%slx = shl i32 %zext.x, 7
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
%sly = shl i32 %zext.y, 25
%yx = or i32 %zext.x, %sly
ret i32 %yx
}

; Negative test: each 'or' is a clean concat on its own (no overlap, no dropped
; bits), but the shift amounts 9 and 22 sum to 31 rather than the 32-bit width.
; %yx is therefore not a rotation of %xy, and the CHECK lines expect the
; shl/or sequence to be kept.
define i32 @fshl_concat_i8_i8_different_slot(i8 %x, i8 %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_i8_i8_different_slot(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[SLX:%.*]] = shl nuw nsw i32 [[ZEXT_X]], 9
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[SLY:%.*]] = shl nuw nsw i32 [[ZEXT_Y]], 22
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
; CHECK-NEXT: ret i32 [[YX]]
;
%zext.x = zext i8 %x to i32
%slx = shl i32 %zext.x, 9
%zext.y = zext i8 %y to i32
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
%sly = shl i32 %zext.y, 22
%yx = or i32 %zext.x, %sly
ret i32 %yx
}

; Negative test: the operands are plain i32 arguments rather than zext
; instructions (despite the %zext.* names), so nothing is known about their
; upper bits. Even though the shift amounts 16 and 16 sum to 32, the CHECK
; lines expect the shl/or sequence to be kept and no fshl to be formed.
define i32 @fshl_concat_unknown_source(i32 %zext.x, i32 %zext.y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_unknown_source(
; CHECK-NEXT: [[SLX:%.*]] = shl i32 [[ZEXT_X:%.*]], 16
; CHECK-NEXT: [[XY:%.*]] = or i32 [[SLX]], [[ZEXT_Y:%.*]]
; CHECK-NEXT: store i32 [[XY]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[SLY:%.*]] = shl i32 [[ZEXT_Y]], 16
; CHECK-NEXT: [[YX:%.*]] = or i32 [[SLY]], [[ZEXT_X]]
; CHECK-NEXT: ret i32 [[YX]]
;
%slx = shl i32 %zext.x, 16
%xy = or i32 %zext.y, %slx
store i32 %xy, ptr %addr, align 4
%sly = shl i32 %zext.y, 16
%yx = or i32 %zext.x, %sly
ret i32 %yx
}

; Positive vector test: the <2 x i32> analogue of the i8/i24 concat, using
; splat shift amounts of 24 and 8. Here the 'or' operands are written with the
; shl first and the zext second. The CHECK lines expect %yx to become
; fshl.v2i32(%xy, %xy, <8, 8>).
define <2 x i32> @fshl_concat_vector(<2 x i8> %x, <2 x i24> %y, ptr %addr) {
; CHECK-LABEL: @fshl_concat_vector(
; CHECK-NEXT: [[ZEXT_X:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
; CHECK-NEXT: [[SLX:%.*]] = shl nuw <2 x i32> [[ZEXT_X]], <i32 24, i32 24>
; CHECK-NEXT: [[ZEXT_Y:%.*]] = zext <2 x i24> [[Y:%.*]] to <2 x i32>
; CHECK-NEXT: [[XY:%.*]] = or <2 x i32> [[SLX]], [[ZEXT_Y]]
; CHECK-NEXT: store <2 x i32> [[XY]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[YX:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[XY]], <2 x i32> [[XY]], <2 x i32> <i32 8, i32 8>)
; CHECK-NEXT: ret <2 x i32> [[YX]]
;
%zext.x = zext <2 x i8> %x to <2 x i32>
%slx = shl <2 x i32> %zext.x, <i32 24, i32 24>
%zext.y = zext <2 x i24> %y to <2 x i32>
%xy = or <2 x i32> %slx, %zext.y
store <2 x i32> %xy, ptr %addr, align 4
%sly = shl <2 x i32> %zext.y, <i32 8, i32 8>
%yx = or <2 x i32> %sly, %zext.x
ret <2 x i32> %yx
}

; Negative test - an oversized shift in the narrow type would produce the wrong value.

define i8 @unmasked_shlop_unmasked_shift_amount(i32 %x, i32 %y, i32 %shamt) {
Expand Down

0 comments on commit 95d584c

Please sign in to comment.