Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrong code at -O1 on x86_64-linux_gnu(recent regression) #72855

Closed
shao-hua-li opened this issue Nov 20, 2023 · 3 comments · Fixed by #73007
Closed

Wrong code at -O1 on x86_64-linux_gnu(recent regression) #72855

shao-hua-li opened this issue Nov 20, 2023 · 3 comments · Fixed by #73007

Comments

@shao-hua-li
Copy link

Clang at -O1 produces the wrong code.

Bisected to 63917e1, which was committed by @igogo-x86

Compiler explorer: https://godbolt.org/z/jdPPzP3P6

% cat a.c
int printf(const char *, ...);
int a;
int *g = &a, *m = &a, *n = &a;
short o;
int p(char *b, long c, long *d) {
  char *e = b;
  long f = *d;
  for (; c && f; --c, ++f, ++e)
    ;
  return e - b;
}
int t(long h, long i) {
  char j;
  long k[] = {i};
  int l = p(&j, h, k);
  return l;
}
long q(int *r) {
  *m = 4;
  for (;;)
    for (; *r >= 0;) {
      *g = *n % 5 == 1;
      for (;;)
        for (; t(*n + 5, 3124342948) <= 5;)
          return 0;
    }
}
int main() {
  int s = a;
  q(&s);
  printf("%d\n", o);
}
%
% clang -O0 -fsanitize=address,undefined a.c && ./a.out
0
% clang -O1 a.c && ./a.out
Timeout
%
@dtcxzyw
Copy link
Member

dtcxzyw commented Nov 20, 2023

Before MachineLICM:

# Machine code for function main: IsSSA, TracksLiveness
Frame Objects:
  fi#0: size=1, align=1, at location [SP+8]

bb.0.entry:
  successors: %bb.3(0x50000000), %bb.1(0x30000000); %bb.3(62.50%), %bb.1(37.50%)

  %4:gr64 = MOV64rm $rip, 1, $noreg, @m, $noreg :: (dereferenceable load (s64) from @m, !tbaa !9)
  CMP32mi $rip, 1, $noreg, @a, $noreg, 0, implicit-def $eflags :: (dereferenceable load (s32) from @a, !tbaa !11)
  MOV32mi killed %4:gr64, 1, $noreg, 0, $noreg, 4 :: (store (s32) into %ir.1, !tbaa !11)
  JCC_1 %bb.3, 9, implicit $eflags
  JMP_1 %bb.1

bb.1.for.cond.i.preheader:
; predecessors: %bb.0
  successors: %bb.2(0x80000000); %bb.2(100.00%)


bb.2.for.cond.i:
; predecessors: %bb.1, %bb.2
  successors: %bb.2(0x80000000); %bb.2(100.00%)

  JMP_1 %bb.2

bb.3.for.body.i:
; predecessors: %bb.0
  successors: %bb.4(0x80000000); %bb.4(100.00%)

  %0:gr64 = MOV64rm $rip, 1, $noreg, @n, $noreg :: (dereferenceable load (s64) from @n, !tbaa !9)
  %5:gr64 = MOVSX64rm32 %0:gr64, 1, $noreg, 0, $noreg :: (load (s32) from %ir.2, !tbaa !11)
  %6:gr32 = COPY %5.sub_32bit:gr64
  %7:gr64 = IMUL64rri32 %5:gr64, 1717986919, implicit-def dead $eflags
  %8:gr64 = SHR64ri %7:gr64(tied-def 0), 63, implicit-def dead $eflags
  %9:gr32 = COPY %8.sub_32bit:gr64
  %10:gr64 = SAR64ri %7:gr64(tied-def 0), 33, implicit-def dead $eflags
  %11:gr32 = COPY %10.sub_32bit:gr64
  %12:gr32 = ADD32rr %11:gr32(tied-def 0), killed %9:gr32, implicit-def dead $eflags
  %14:gr64 = IMPLICIT_DEF
  %13:gr64_nosp = INSERT_SUBREG %14:gr64(tied-def 0), killed %12:gr32, %subreg.sub_32bit
  %15:gr32 = LEA64_32r %13:gr64_nosp, 4, %13:gr64_nosp, 0, $noreg
  %16:gr32 = SUB32rr %6:gr32(tied-def 0), killed %15:gr32, implicit-def dead $eflags
  %17:gr32 = SUB32ri %16:gr32(tied-def 0), 1, implicit-def $eflags
  %18:gr8 = SETCCr 4, implicit $eflags
  %19:gr32 = MOVZX32rr8 killed %18:gr8
  %20:gr64 = MOV64rm $rip, 1, $noreg, @g, $noreg :: (dereferenceable load (s64) from @g, !tbaa !9)
  MOV32mr killed %20:gr64, 1, $noreg, 0, $noreg, killed %19:gr32 :: (store (s32) into %ir.4, !tbaa !11)

bb.4.for.cond3.i:
; predecessors: %bb.3, %bb.6
  successors: %bb.6(0x40000000), %bb.5(0x40000000); %bb.6(50.00%), %bb.5(50.00%)

  %1:gr64 = MOVSX64rm32 %0:gr64, 1, $noreg, 0, $noreg :: (load (s32) from %ir.2, !tbaa !11)
  %22:gr64 = SUB64ri32 %1:gr64(tied-def 0), -5, implicit-def $eflags
  %21:gr64 = LEA64r %stack.0.j.i.i, 1, $noreg, 0, $noreg
  JCC_1 %bb.6, 4, implicit $eflags
  JMP_1 %bb.5

bb.5.for.end.loopexit.i.i.i:
; predecessors: %bb.4
  successors: %bb.6(0x80000000); %bb.6(100.00%)

  %23:gr64 = MOV64ri32 -5
  %24:gr64 = nsw SUB64rr %23:gr64(tied-def 0), %1:gr64, implicit-def dead $eflags
  %25:gr64 = MOV32ri64 3124342948
  %26:gr64 = SUB64rr %24:gr64(tied-def 0), %25:gr64, implicit-def $eflags
  %27:gr64 = CMOV64rr %25:gr64(tied-def 0), %24:gr64, 7, implicit $eflags
  %28:gr64_nosp = NEG64r %27:gr64(tied-def 0), implicit-def dead $eflags
  %2:gr64 = LEA64r %stack.0.j.i.i, 1, killed %28:gr64_nosp, 0, $noreg

bb.6.t.exit.i:
; predecessors: %bb.4, %bb.5
  successors: %bb.7(0x04000000), %bb.4(0x7c000000); %bb.7(3.12%), %bb.4(96.88%)

  %3:gr64 = PHI %21:gr64, %bb.4, %2:gr64, %bb.5
  %29:gr64 = LEA64r %stack.0.j.i.i, 1, $noreg, 0, $noreg
  %30:gr32 = COPY %29.sub_32bit:gr64
  %31:gr32 = COPY %3.sub_32bit:gr64
  %32:gr32 = SUB32rr %31:gr32(tied-def 0), killed %30:gr32, implicit-def dead $eflags
  %33:gr32 = SUB32ri %32:gr32(tied-def 0), 5, implicit-def $eflags
  JCC_1 %bb.4, 15, implicit $eflags
  JMP_1 %bb.7

bb.7.q.exit:
; predecessors: %bb.6

  %34:gr32 = MOVSX32rm16 $rip, 1, $noreg, @o, $noreg :: (dereferenceable load (s16) from @o, !tbaa !16)
  ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
  %35:gr64 = LEA64r $rip, 1, $noreg, @.str, $noreg
  %36:gr32 = MOV32r0 implicit-def dead $eflags
  %37:gr8 = COPY %36.sub_8bit:gr32
  $rdi = COPY %35:gr64
  $esi = COPY %34:gr32
  $al = COPY %37:gr8
  CALL64pcrel32 target-flags(x86-plt) @printf, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit $rdi, implicit $esi, implicit $al, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
  ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
  $eax = COPY %36:gr32
  RET 0, $eax

# End machine code for function main.

After:

*** IR Dump After Early Machine Loop Invariant Code Motion (early-machinelicm) on main ***
# Machine code for function main: IsSSA, TracksLiveness
Frame Objects:
  fi#0: size=1, align=1, at location [SP+8]

bb.0.entry:
  successors: %bb.3(0x50000000), %bb.1(0x30000000); %bb.3(62.50%), %bb.1(37.50%)

  %4:gr64 = MOV64rm $rip, 1, $noreg, @m, $noreg :: (dereferenceable load (s64) from @m, !tbaa !9)
  CMP32mi $rip, 1, $noreg, @a, $noreg, 0, implicit-def $eflags :: (dereferenceable load (s32) from @a, !tbaa !11)
  MOV32mi killed %4:gr64, 1, $noreg, 0, $noreg, 4 :: (store (s32) into %ir.1, !tbaa !11)
  JCC_1 %bb.3, 9, implicit $eflags
  JMP_1 %bb.1

bb.1.for.cond.i.preheader:
; predecessors: %bb.0
  successors: %bb.2(0x80000000); %bb.2(100.00%)


bb.2.for.cond.i:
; predecessors: %bb.1, %bb.2
  successors: %bb.2(0x80000000); %bb.2(100.00%)

  JMP_1 %bb.2

bb.3.for.body.i:
; predecessors: %bb.0
  successors: %bb.4(0x80000000); %bb.4(100.00%)

  %0:gr64 = MOV64rm $rip, 1, $noreg, @n, $noreg :: (dereferenceable load (s64) from @n, !tbaa !9)
  %5:gr64 = MOVSX64rm32 %0:gr64, 1, $noreg, 0, $noreg :: (load (s32) from %ir.2, !tbaa !11)
  %6:gr32 = COPY %5.sub_32bit:gr64
  %7:gr64 = IMUL64rri32 %5:gr64, 1717986919, implicit-def dead $eflags
  %8:gr64 = SHR64ri %7:gr64(tied-def 0), 63, implicit-def dead $eflags
  %9:gr32 = COPY %8.sub_32bit:gr64
  %10:gr64 = SAR64ri %7:gr64(tied-def 0), 33, implicit-def dead $eflags
  %11:gr32 = COPY %10.sub_32bit:gr64
  %12:gr32 = ADD32rr %11:gr32(tied-def 0), killed %9:gr32, implicit-def dead $eflags
  %14:gr64 = IMPLICIT_DEF
  %13:gr64_nosp = INSERT_SUBREG %14:gr64(tied-def 0), killed %12:gr32, %subreg.sub_32bit
  %15:gr32 = LEA64_32r %13:gr64_nosp, 4, %13:gr64_nosp, 0, $noreg
  %16:gr32 = SUB32rr %6:gr32(tied-def 0), killed %15:gr32, implicit-def dead $eflags
  %17:gr32 = SUB32ri %16:gr32(tied-def 0), 1, implicit-def $eflags
  %18:gr8 = SETCCr 4, implicit $eflags
  %19:gr32 = MOVZX32rr8 killed %18:gr8
  %20:gr64 = MOV64rm $rip, 1, $noreg, @g, $noreg :: (dereferenceable load (s64) from @g, !tbaa !9)
  MOV32mr killed %20:gr64, 1, $noreg, 0, $noreg, killed %19:gr32 :: (store (s32) into %ir.4, !tbaa !11)
  %21:gr64 = LEA64r %stack.0.j.i.i, 1, $noreg, 0, $noreg
  %23:gr64 = MOV64ri32 -5
  %24:gr64 = nsw SUB64rr %23:gr64(tied-def 0), %5:gr64, implicit-def dead $eflags
  %25:gr64 = MOV32ri64 3124342948

bb.4.for.cond3.i:
; predecessors: %bb.3, %bb.6
  successors: %bb.6(0x40000000), %bb.5(0x40000000); %bb.6(50.00%), %bb.5(50.00%)

  %22:gr64 = SUB64ri32 %5:gr64(tied-def 0), -5, implicit-def $eflags
  JCC_1 %bb.6, 4, implicit $eflags
  JMP_1 %bb.5

bb.5.for.end.loopexit.i.i.i:
; predecessors: %bb.4
  successors: %bb.6(0x80000000); %bb.6(100.00%)

  %26:gr64 = SUB64rr %24:gr64(tied-def 0), %25:gr64, implicit-def $eflags
  %27:gr64 = CMOV64rr %25:gr64(tied-def 0), %24:gr64, 7, implicit $eflags
  %28:gr64_nosp = NEG64r %27:gr64(tied-def 0), implicit-def dead $eflags
  %2:gr64 = LEA64r %stack.0.j.i.i, 1, killed %28:gr64_nosp, 0, $noreg

bb.6.t.exit.i:
; predecessors: %bb.4, %bb.5
  successors: %bb.7(0x04000000), %bb.4(0x7c000000); %bb.7(3.12%), %bb.4(96.88%)

  %3:gr64 = PHI %21:gr64, %bb.4, %2:gr64, %bb.5
  %30:gr32 = COPY %21.sub_32bit:gr64
  %31:gr32 = COPY %3.sub_32bit:gr64
  %32:gr32 = SUB32rr %31:gr32(tied-def 0), killed %30:gr32, implicit-def dead $eflags
  %33:gr32 = SUB32ri %32:gr32(tied-def 0), 5, implicit-def $eflags
  JCC_1 %bb.4, 15, implicit $eflags
  JMP_1 %bb.7

bb.7.q.exit:
; predecessors: %bb.6

  %34:gr32 = MOVSX32rm16 $rip, 1, $noreg, @o, $noreg :: (dereferenceable load (s16) from @o, !tbaa !16)
  ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
  %35:gr64 = LEA64r $rip, 1, $noreg, @.str, $noreg
  %36:gr32 = MOV32r0 implicit-def dead $eflags
  %37:gr8 = COPY %36.sub_8bit:gr32
  $rdi = COPY %35:gr64
  $esi = COPY %34:gr32
  $al = COPY %37:gr8
  CALL64pcrel32 target-flags(x86-plt) @printf, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit $rdi, implicit $esi, implicit $al, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
  ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
  $eax = COPY %36:gr32
  RET 0, $eax

# End machine code for function main.

@igogo-x86
Copy link
Contributor

Thanks for reporting, I'll have a look

igogo-x86 added a commit to igogo-x86/llvm-project that referenced this issue Nov 27, 2023
When hoisting an invariant load, we should not combine it with an existing
load through common subexpression elimination (CSE). This is because there
might be memory-changing instructions between the existing load and the end
of the block entering the loop.

Fixes llvm#72855
igogo-x86 added a commit that referenced this issue Nov 27, 2023
When hoisting an invariant load, we should not combine it with an
existing load through common subexpression elimination (CSE). This is
because there might be memory-changing instructions between the existing
load and the end of the block entering the loop.

Fixes #72855
@zmodem
Copy link
Collaborator

zmodem commented Dec 1, 2023

We just tracked down a test failure in Chromium to this bug.

Given that the bug was found pretty shortly after the commit causing the breakage, I wish it had been reverted quickly per https://llvm.org/docs/DeveloperPolicy.html#patch-reversion-policy but I'm also very happy it got fixed :-)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

5 participants