-
Notifications
You must be signed in to change notification settings - Fork 11.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[InstCombine] Do not simplify lshr/shl arg if it is part of fshl rotate pattern. #66115
Conversation
…te pattern. The fshl/fshr intrinsics having their first two arguments the same get lowered to a target-specific rotate. But based on the uses, one of the arguments can get simplified, resulting in different arguments that perform an equivalent operation. This patch prevents the simplification of the arguments of lshr/shl if they are part of an fshl pattern.
More context for the patch in issue #66185. |
@llvm/pr-subscribers-llvm-transforms ChangesThe fshl/fshr having first two arguments as same gets lowered to targets specific rotate. But based on the uses, one of the arguments can get simplified resulting in different arguments performing equivalent operation.This patch prevents the simplification of the arguments of lshr/shl if they are part of fshl pattern.Full diff: https://github.com/llvm/llvm-project/pull/66115.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index be005e61a8d2d89..c3adbaa44448fe4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -611,6 +611,23 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, DemandedMask, Known)) return R; + // Do not simplify if lshr is part of fshl rotate pattern + if (I->hasOneUser()) { + auto *Op = I->user_back(); + if (Op->getOpcode() == BinaryOperator::Or) { + const APInt *LShrAmt; + Value *LShrVal; + auto *Operand = + Op->getOperand(0) == I ? Op->getOperand(1) : Op->getOperand(0); + if (match(Operand, + m_OneUse(m_LShr(m_Value(LShrVal), m_APInt(LShrAmt))))) + if (I->getOperand(0) == LShrVal) + if (SA->ult(BitWidth) && LShrAmt->ult(BitWidth) && + (*SA + *LShrAmt) == BitWidth) + return nullptr; + } + } + // TODO: If we only want bits that already match the signbit then we don't // need to shift. @@ -669,6 +686,22 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (match(I->getOperand(1), m_APInt(SA))) { uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + // Do not simplify if shl is part of fshl rotate pattern + if (I->hasOneUser()) { + auto *Op = I->user_back(); + if (Op->getOpcode() == BinaryOperator::Or) { + const APInt *ShlAmt; + Value *ShlVal; + auto *Operand = + Op->getOperand(0) == I ? Op->getOperand(1) : Op->getOperand(0); + if (match(Operand, m_OneUse(m_Shl(m_Value(ShlVal), m_APInt(ShlAmt))))) + if (I->getOperand(0) == ShlVal) + if (SA->ult(BitWidth) && ShlAmt->ult(BitWidth) && + (*SA + *ShlAmt) == BitWidth) + return nullptr; + } + } + // If we are just demanding the shifted sign bit and below, then this can // be treated as an ASHR in disguise. 
if (DemandedMask.countl_zero() >= ShiftAmt) { diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll index 48bf296993f6ac2..34761623028b942 100644 --- a/llvm/test/Transforms/InstCombine/fsh.ll +++ b/llvm/test/Transforms/InstCombine/fsh.ll @@ -722,6 +722,134 @@ define i32 @fsh_orconst_rotate(i32 %a) { ret i32 %t2 } +define i32 @fsh_rotate_25(i8 %x, i32 %y) { +; CHECK-LABEL: @fsh_rotate_25( +; CHECK-NEXT: [[T1:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[T1]], [[Y:%.*]] +; CHECK-NEXT: [[OR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR1]], i32 [[OR1]], i32 25) +; CHECK-NEXT: ret i32 [[OR2]] +; + + %t1 = zext i8 %x to i32 + %or1 = or i32 %t1, %y + %shr = lshr i32 %or1, 7 + %shl = shl i32 %or1, 25 + %or2 = or i32 %shr, %shl + ret i32 %or2 +} + +define i32 @fsh_rotate_18(i8 %x, i32 %y) { +; CHECK-LABEL: @fsh_rotate_18( +; CHECK-NEXT: [[T1:%.*]] = zext i8 [[X:%.*]] to i32 +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[T1]], [[Y:%.*]] +; CHECK-NEXT: [[OR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR1]], i32 [[OR1]], i32 18) +; CHECK-NEXT: ret i32 [[OR2]] +; + + %t1 = zext i8 %x to i32 + %or1 = or i32 %t1, %y + %shr = lshr i32 %or1, 14 + %shl = shl i32 %or1, 18 + %or2 = or i32 %shr, %shl + ret i32 %or2 +} + +define i32 @fsh_load_rotate_12(ptr %data) { +; CHECK-LABEL: @fsh_load_rotate_12( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[DATA:%.*]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 [[CONV]], 24 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 16 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL3]], [[SHL]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX4]], 
align 1 +; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 8 +; CHECK-NEXT: [[OR7:%.*]] = or i32 [[OR]], [[SHL6]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[OR10:%.*]] = or i32 [[OR7]], [[CONV9]] +; CHECK-NEXT: [[OR15:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR10]], i32 [[OR10]], i32 12) +; CHECK-NEXT: ret i32 [[OR15]] +; + +entry: + %0 = load i8, ptr %data + %conv = zext i8 %0 to i32 + %shl = shl nuw i32 %conv, 24 + %arrayidx1 = getelementptr inbounds i8, ptr %data, i64 1 + %1 = load i8, ptr %arrayidx1 + %conv2 = zext i8 %1 to i32 + %shl3 = shl nuw nsw i32 %conv2, 16 + %or = or i32 %shl3, %shl + %arrayidx4 = getelementptr inbounds i8, ptr %data, i64 2 + %2 = load i8, ptr %arrayidx4 + %conv5 = zext i8 %2 to i32 + %shl6 = shl nuw nsw i32 %conv5, 8 + %or7 = or i32 %or, %shl6 + %arrayidx8 = getelementptr inbounds i8, ptr %data, i64 3 + %3 = load i8, ptr %arrayidx8 + %conv9 = zext i8 %3 to i32 + %or10 = or i32 %or7, %conv9 + %shr = lshr i32 %or10, 20 + %shl7 = shl i32 %or10, 12 + %or15 = or i32 %shr, %shl7 + ret i32 %or15 +} + +define i32 @fsh_load_rotate_5(ptr %data) { +; CHECK-LABEL: @fsh_load_rotate_5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[DATA:%.*]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 [[CONV]], 24 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 16 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL3]], [[SHL]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr 
[[ARRAYIDX4]], align 1 +; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[TMP2]] to i32 +; CHECK-NEXT: [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 8 +; CHECK-NEXT: [[OR7:%.*]] = or i32 [[OR]], [[SHL6]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[DATA]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP3]] to i32 +; CHECK-NEXT: [[OR10:%.*]] = or i32 [[OR7]], [[CONV9]] +; CHECK-NEXT: [[OR15:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR10]], i32 [[OR10]], i32 5) +; CHECK-NEXT: ret i32 [[OR15]] +; + +entry: + %0 = load i8, ptr %data + %conv = zext i8 %0 to i32 + %shl = shl nuw i32 %conv, 24 + %arrayidx1 = getelementptr inbounds i8, ptr %data, i64 1 + %1 = load i8, ptr %arrayidx1 + %conv2 = zext i8 %1 to i32 + %shl3 = shl nuw nsw i32 %conv2, 16 + %or = or i32 %shl3, %shl + %arrayidx4 = getelementptr inbounds i8, ptr %data, i64 2 + %2 = load i8, ptr %arrayidx4 + %conv5 = zext i8 %2 to i32 + %shl6 = shl nuw nsw i32 %conv5, 8 + %or7 = or i32 %or, %shl6 + %arrayidx8 = getelementptr inbounds i8, ptr %data, i64 3 + %3 = load i8, ptr %arrayidx8 + %conv9 = zext i8 %3 to i32 + %or10 = or i32 %or7, %conv9 + %shr = lshr i32 %or10, 27 + %shl7 = shl i32 %or10, 5 + %or15 = or i32 %shr, %shl7 + ret i32 %or15 +} + define <2 x i31> @fshr_mask_args_same_vector(<2 x i31> %a) { ; CHECK-LABEL: @fshr_mask_args_same_vector( ; CHECK-NEXT: [[T3:%.*]] = shl <2 x i31> [[A:%.*]], <i31 10, i31 10> |
ping. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks.
@@ -722,6 +722,134 @@ define i32 @fsh_orconst_rotate(i32 %a) { | |||
ret i32 %t2 | |||
} | |||
|
|||
define i32 @fsh_rotate_25(i8 %x, i32 %y) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to explicitly mark with a comment the tests where the simplification didn't trigger anyway.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 to indicate negative tests
Also, you can add the tests as a stacked first commit so the second commit in the patch can show the test changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A couple of minors - try to use m_Specific instead of matching the operand values afterward
auto *Operand = | ||
Op->getOperand(0) == I ? Op->getOperand(1) : Op->getOperand(0); | ||
if (match(Operand, | ||
m_OneUse(m_LShr(m_Value(LShrVal), m_APInt(LShrAmt))))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use m_Specific(I->getOperand(0))?
Value *ShlVal; | ||
auto *Operand = | ||
Op->getOperand(0) == I ? Op->getOperand(1) : Op->getOperand(0); | ||
if (match(Operand, m_OneUse(m_Shl(m_Value(ShlVal), m_APInt(ShlAmt))))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use m_Specific(I->getOperand(0))?
(*SA + *ShlAmt) == BitWidth) | ||
return nullptr; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Somewhat opposed to this, because its easy for logic to detect "should we be able to do X" to de-sync from the logic that does X.
I would prefer adding a help to InstructionCombiner that can be used for both the rotate creation folds and here to keep the two in sync.
Something along the following lines:
std::optional<std::tuple<Intrinsic::ID, Value *, Value *>>
ConvertShlOrLShrToFShlOrFShr(BinaryOperator *I) {
Value *X, *ShAmt1, *ShAmt2;
unsigned BitWidth = I->getType()->getScalarSizeInBits();
if (!match(I, m_c_Or(m_OneUse(m_Shl(m_Value(X), m_Value(ShAmt1))),
m_OneUse(m_LShr(m_Deferred(X), m_Value(ShAmt2))))))
return std::nullopt;
const APInt *C1;
const APInt *C2;
if (match(ShAmt1, m_APInt(C1)) && match(ShAmt2, m_APInt(C2))) {
if (C1->uge(BitWidth) || C2->uge(BitWidth) || (*C1 + *C2) != BitWidth)
return std::nullopt;
return {Intrinsic::fshl, X, ShAmt1};
}
auto ShAmtsMatchForRotate = [BitWidth](Value *Amt1, Value *Amt2) {
return match(Amt1,
m_And(m_Neg(m_Specific(Amt2)), m_SpecificInt(BitWidth - 1)));
};
if (ShAmtsMatchForRotate(ShAmt1, ShAmt2))
return {Intrinsic::fshl, X, ShAmt1};
if (ShAmtsMatchForRotate(ShAmt2, ShAmt1))
return {Intrinsic::fshr, X, ShAmt2};
return std::nullopt;
}
If here you do:
if (I->hasOneUse() && ConvertShlOrLShrToFShlOrFShr(I->user_back()).has_value())
return nullptr;
And at rotate fold you switch to using ConvertShlOrLShrToFShlOrFShr
to actually create rotates.
Then as pattern detection improves both will stay aligned.
@quic-eikansh reverse-ping? |
This has landed in #73441. |
The fshl/fshr intrinsics having their first two arguments the same get lowered to a target-specific rotate. But based on the uses, one of the arguments can get simplified, resulting in different arguments that perform an equivalent operation.
This patch prevents the simplification of the arguments of lshr/shl if they are part of an fshl pattern.