Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SimplifyCFG fails to merge all == -1 into a single block #62311

Closed
RKSimon opened this issue Apr 23, 2023 · 5 comments
Closed

SimplifyCFG fails to merge all == -1 into a single block #62311

RKSimon opened this issue Apr 23, 2023 · 5 comments

Comments

@RKSimon
Copy link
Collaborator

RKSimon commented Apr 23, 2023

https://gcc.godbolt.org/z/vn1v37Ger

#include <x86intrin.h>

bool allones(__m512i x) {
  return
    x[0] == -1 && x[1] == -1 &&
    x[2] == -1 && x[3] == -1 &&
    x[4] == -1 && x[5] == -1 &&
    x[6] == -1 && x[7] == -1;
}

Branched off from #59998 - all but the last comparisons are merged into the same basic block, preventing vectorization.

define dso_local noundef zeroext i1 @allones(long long __vector(8))(<8 x i64> noundef %x) {
entry:
  %x.addr = alloca <8 x i64>, align 64
  store <8 x i64> %x, ptr %x.addr, align 64
  %0 = load <8 x i64>, ptr %x.addr, align 64
  %vecext = extractelement <8 x i64> %0, i32 0
  %cmp = icmp eq i64 %vecext, -1
  br i1 %cmp, label %land.lhs.true, label %land.end

land.lhs.true:                                    ; preds = %entry
  %1 = load <8 x i64>, ptr %x.addr, align 64
  %vecext1 = extractelement <8 x i64> %1, i32 1
  %cmp2 = icmp eq i64 %vecext1, -1
  br i1 %cmp2, label %land.lhs.true3, label %land.end

land.lhs.true3:                                   ; preds = %land.lhs.true
  %2 = load <8 x i64>, ptr %x.addr, align 64
  %vecext4 = extractelement <8 x i64> %2, i32 2
  %cmp5 = icmp eq i64 %vecext4, -1
  br i1 %cmp5, label %land.lhs.true6, label %land.end

land.lhs.true6:                                   ; preds = %land.lhs.true3
  %3 = load <8 x i64>, ptr %x.addr, align 64
  %vecext7 = extractelement <8 x i64> %3, i32 3
  %cmp8 = icmp eq i64 %vecext7, -1
  br i1 %cmp8, label %land.lhs.true9, label %land.end

land.lhs.true9:                                   ; preds = %land.lhs.true6
  %4 = load <8 x i64>, ptr %x.addr, align 64
  %vecext10 = extractelement <8 x i64> %4, i32 4
  %cmp11 = icmp eq i64 %vecext10, -1
  br i1 %cmp11, label %land.lhs.true12, label %land.end

land.lhs.true12:                                  ; preds = %land.lhs.true9
  %5 = load <8 x i64>, ptr %x.addr, align 64
  %vecext13 = extractelement <8 x i64> %5, i32 5
  %cmp14 = icmp eq i64 %vecext13, -1
  br i1 %cmp14, label %land.lhs.true15, label %land.end

land.lhs.true15:                                  ; preds = %land.lhs.true12
  %6 = load <8 x i64>, ptr %x.addr, align 64
  %vecext16 = extractelement <8 x i64> %6, i32 6
  %cmp17 = icmp eq i64 %vecext16, -1
  br i1 %cmp17, label %land.rhs, label %land.end

land.rhs:                                         ; preds = %land.lhs.true15
  %7 = load <8 x i64>, ptr %x.addr, align 64
  %vecext18 = extractelement <8 x i64> %7, i32 7
  %cmp19 = icmp eq i64 %vecext18, -1
  br label %land.end

land.end:                                         ; preds = %land.rhs, %land.lhs.true15, %land.lhs.true12, %land.lhs.true9, %land.lhs.true6, %land.lhs.true3, %land.lhs.true, %entry
  %8 = phi i1 [ false, %land.lhs.true15 ], [ false, %land.lhs.true12 ], [ false, %land.lhs.true9 ], [ false, %land.lhs.true6 ], [ false, %land.lhs.true3 ], [ false, %land.lhs.true ], [ false, %entry ], [ %cmp19, %land.rhs ]
  ret i1 %8
}

to

define dso_local noundef zeroext i1 @allones(<8 x i64> noundef %x) {
entry:
  %x.addr = alloca <8 x i64>, align 64
  store <8 x i64> %x, ptr %x.addr, align 64
  %0 = load <8 x i64>, ptr %x.addr, align 64
  %vecext = extractelement <8 x i64> %0, i32 0
  %cmp = icmp eq i64 %vecext, -1
  %1 = load <8 x i64>, ptr %x.addr, align 64
  %vecext1 = extractelement <8 x i64> %1, i32 1
  %cmp2 = icmp eq i64 %vecext1, -1
  %or.cond = select i1 %cmp, i1 %cmp2, i1 false
  %2 = load <8 x i64>, ptr %x.addr, align 64
  %vecext4 = extractelement <8 x i64> %2, i32 2
  %cmp5 = icmp eq i64 %vecext4, -1
  %or.cond20 = select i1 %or.cond, i1 %cmp5, i1 false
  %3 = load <8 x i64>, ptr %x.addr, align 64
  %vecext7 = extractelement <8 x i64> %3, i32 3
  %cmp8 = icmp eq i64 %vecext7, -1
  %or.cond21 = select i1 %or.cond20, i1 %cmp8, i1 false
  %4 = load <8 x i64>, ptr %x.addr, align 64
  %vecext10 = extractelement <8 x i64> %4, i32 4
  %cmp11 = icmp eq i64 %vecext10, -1
  %or.cond22 = select i1 %or.cond21, i1 %cmp11, i1 false
  %5 = load <8 x i64>, ptr %x.addr, align 64
  %vecext13 = extractelement <8 x i64> %5, i32 5
  %cmp14 = icmp eq i64 %vecext13, -1
  %or.cond23 = select i1 %or.cond22, i1 %cmp14, i1 false
  %6 = load <8 x i64>, ptr %x.addr, align 64
  %vecext16 = extractelement <8 x i64> %6, i32 6
  %cmp17 = icmp eq i64 %vecext16, -1
  %or.cond24 = select i1 %or.cond23, i1 %cmp17, i1 false
  br i1 %or.cond24, label %land.rhs, label %land.end

land.rhs:                                         ; preds = %entry
  %7 = load <8 x i64>, ptr %x.addr, align 64
  %vecext18 = extractelement <8 x i64> %7, i32 7
  %cmp19 = icmp eq i64 %vecext18, -1
  br label %land.end

land.end:                                         ; preds = %land.rhs, %entry
  %8 = phi i1 [ false, %entry ], [ %cmp19, %land.rhs ]
  ret i1 %8
}

Interestingly, the equivalent 'allzeros' has the same problem on the first SimplifyCFG pass, but a later pass merges the last comparison as well: https://gcc.godbolt.org/z/oe8cjY137

@khei4
Copy link
Contributor

khei4 commented May 6, 2023

Let me know if I'm off the mark

  1. convert -1 comparison to consecutive integral AND ops on InstCombine
    https://alive2.llvm.org/ce/z/RvmDGE

  2. merge last comparisons on SimplifyCFG
    https://alive2.llvm.org/ce/z/G2bdfn

These changes would fix this 🙂

ref:
(current consecutive AND ops https://llvm.godbolt.org/z/1srzcndGW
(current consecutive OR ops https://llvm.godbolt.org/z/4qT86rf9c

@xgupta
Copy link
Contributor

xgupta commented May 19, 2023

Hi @khei4,
Are you working on this issue?
If not I would like to work on it.
Thanks

@khei4
Copy link
Contributor

khei4 commented May 19, 2023

Are you working on this issue?
If not I would like to work on it.

@xgupta Not yet. You can do it, thanks!

@xgupta
Copy link
Contributor

xgupta commented May 27, 2023

I did some investigation. It is like this working for allzeros case (combining two cmp instruction to single one) -

 InstCombinerImpl::visitSelectInst -> InstCombinerImpl::foldSelectOfBools -> foldAndOrOfSelectUsingImpliedCond -> foldAndOrOfICmps -> foldLogOpOfMaskedICmps
 /// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
/// into a single (icmp(A & X) ==/!= Y).
static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
                                     bool IsLogical,
                                     InstCombiner::BuilderTy &Builder) {
  Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;

  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
  std::optional<std::pair<unsigned, unsigned>> MaskPair =
      getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
  unsigned LHSMask = MaskPair->first;
  unsigned RHSMask = MaskPair->second;

  unsigned Mask = LHSMask & RHSMask;
  // In most cases we're going to produce an EQ for the "&&" case.
  ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
 
  if (Mask & Mask_AllZeros) {
    // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
    -> (icmp eq (A & (B|D)), 0)

    Value *NewOr = Builder.CreateOr(B, D);
    Value *NewAnd = Builder.CreateAnd(A, NewOr);
    // We can't use C as zero because we might actually handle
    //   (icmp ne (A & B), B) & (icmp ne (A & D), D)
    // with B and D, having a single bit set.
    Value *Zero = Constant::getNullValue(A->getType());
    return Builder.CreateICmp(NewCC, NewAnd, Zero);
  }
  
  return nullptr;
}

I am next working on making instcombine to work with 1 case.

@RKSimon
Copy link
Collaborator Author

RKSimon commented Jun 10, 2023

Thanks @xgupta !

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

4 participants