Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LLVM 14 regression: __muloti4 is lowered with a recursive call despite nobuiltin attribute #56403

Closed
andrewrk opened this issue Jul 6, 2022 · 8 comments · Fixed by llvm/llvm-project-release-prs#114
Labels
bug Indicates an unexpected problem or unintended behavior llvm:codegen miscompilation regression release:merged

Comments

@andrewrk
Copy link
Member

andrewrk commented Jul 6, 2022

This is a regression from 13.0.1 to 14.0.6. The reproduction I have here is for aarch64-linux, however, I can also reproduce it for wasm32-wasi.

To help provide an overview of the IR, here is the high level source code:

const std = @import("std");
const builtin = @import("builtin");

export fn __muloti4(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 {
    return muloXi4_genericSmall(i128, a, b, overflow);
}

inline fn muloXi4_genericSmall(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
    overflow.* = 0;
    const min = std.math.minInt(ST);
    var res: ST = a *% b;
    if ((a < 0 and b == min) or (a != 0 and @divTrunc(res, a) != b))
        overflow.* = 1;
    return res;
}

Input LLVM IR:

; ModuleID = 'compiler_rt'
source_filename = "compiler_rt"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-unknown"

@builtin.zig_backend = internal unnamed_addr constant i64 2, align 8
@builtin.output_mode = internal unnamed_addr constant i2 -2, align 1

; Function Attrs: nobuiltin nounwind
define dso_local i128 @__muloti4(i128 %0, i128 %1, i32* nonnull align 4 %2) #0 {
Entry:
  %3 = alloca i128, align 16
  %4 = alloca i128, align 16
  store i32 0, i32* %2, align 4
  %5 = mul i128 %0, %1
  store i128 %5, i128* %3, align 16
  %6 = icmp slt i128 %0, 0
  br i1 %6, label %Then, label %Else

Then:                                             ; preds = %Entry
  %7 = icmp eq i128 %1, -170141183460469231731687303715884105728
  br label %Block

Else:                                             ; preds = %Entry
  br label %Block

Block:                                            ; preds = %Else, %Then
  %8 = phi i1 [ %7, %Then ], [ false, %Else ]
  br i1 %8, label %Then1, label %Else2

Then1:                                            ; preds = %Block
  br label %Block6

Else2:                                            ; preds = %Block
  %9 = icmp ne i128 %0, 0
  br i1 %9, label %Then3, label %Else4

Then3:                                            ; preds = %Else2
  %10 = load i128, i128* %3, align 16
  %11 = sdiv i128 %10, %0
  %12 = icmp ne i128 %11, %1
  br label %Block5

Else4:                                            ; preds = %Else2
  br label %Block5

Block5:                                           ; preds = %Else4, %Then3
  %13 = phi i1 [ %12, %Then3 ], [ false, %Else4 ]
  br label %Block6

Block6:                                           ; preds = %Block5, %Then1
  %14 = phi i1 [ true, %Then1 ], [ %13, %Block5 ]
  br i1 %14, label %Then7, label %Else8

Then7:                                            ; preds = %Block6
  store i32 1, i32* %2, align 4
  br label %Block9

Else8:                                            ; preds = %Block6
  br label %Block9

Block9:                                           ; preds = %Else8, %Then7
  %15 = load i128, i128* %3, align 16
  store i128 %15, i128* %4, align 16
  %16 = load i128, i128* %4, align 16
  ret i128 %16
}

attributes #0 = { nobuiltin nounwind "frame-pointer"="none" "target-cpu"="generic" "target-features"="-a510,-a65,-a710,-a76,-a78,-a78c,-aes,-aggressive-fma,-alternate-sextload-cvt-f32-pattern,-altnzcv,-am,-amvs,-arith-bcc-fusion,-arith-cbz-fusion,-balance-fp-ops,-bf16,-brbe,-bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,-ccdp,-ccidx,-ccpp,-cmp-bcc-fusion,-complxnum,-CONTEXTIDREL2,-cortex-r82,-crc,-crypto,-custom-cheap-as-move,-disable-latency-sched-heuristic,-dit,-dotprod,-ecv,-el2vmsa,-el3,+ete,-exynos-cheap-as-move,-f32mm,-f64mm,-fgt,-fix-cortex-a53-835769,-flagm,-force-32bit-jump-tables,-fp16fml,+fp-armv8,-fptoint,-fullfp16,-fuse-address,+fuse-aes,-fuse-arith-logic,-fuse-crypto-eor,-fuse-csel,-fuse-literals,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,-hcx,-i8mm,-jsconv,-lor,-ls64,-lse,-lse2,-lsl-fast,-mops,-mpam,-mte,+neon,-no-bti-at-return-twice,-no-neg-immediates,-no-zcz-fp,-nv,-outline-atomics,-pan,-pan-rwv,-pauth,+perfmon,-predictable-select-expensive,-predres,-rand,-ras,-rcpc,-rcpc-immo,-rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,-sb,-sel2,-sha2,-sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme-f64,-sme-i64,-spe,-spe-eef,-specrestrict,-ssbs,-streaming-sve,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-tagged-globals,-tlb-rmi,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tracev8.4,+trbe,-uaops,-use-experimental-zeroing-pseudos,+use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,-v8.1a,-v8.2a,-v8.3a,-v8.4a,-v8.5a,-v8.6a,-v8.7a,-v8.8a,-v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9a,-vh,-wfxt,-xs,-zcm,-zcz,-zcz-fp-workaround,-zcz-gp" }

Note that the __muloti4 function has the nobuiltin attribute. After optimizations are applied, the IR transforms to:

; ModuleID = 'compiler_rt'
source_filename = "compiler_rt"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-unknown"

; Function Attrs: mustprogress nobuiltin nofree nosync nounwind willreturn writeonly
define dso_local i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align 4 %2) local_unnamed_addr #0 {
Entry:
  store i32 0, i32* %2, align 4
  %.fr = freeze i128 %1
  %mul = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %0, i128 %.fr)
  %3 = icmp slt i128 %0, 0
  %4 = icmp eq i128 %.fr, -170141183460469231731687303715884105728
  %5 = and i1 %3, %4
  br i1 %5, label %Then7, label %Else2

Else2:                                            ; preds = %Entry
  %mul.ov = extractvalue { i128, i1 } %mul, 1
  br i1 %mul.ov, label %Then7, label %Block9

Then7:                                            ; preds = %Else2, %Entry
  store i32 1, i32* %2, align 4
  br label %Block9

Block9:                                           ; preds = %Else2, %Then7
  %mul.val = extractvalue { i128, i1 } %mul, 0
  ret i128 %mul.val
}

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128) #1

attributes #0 = { mustprogress nobuiltin nofree nosync nounwind willreturn writeonly "frame-pointer"="none" "target-cpu"="generic" "target-features"="-a510,-a65,-a710,-a76,-a78,-a78c,-aes,-aggressive-fma,-alternate-sextload-cvt-f32-pattern,-altnzcv,-am,-amvs,-arith-bcc-fusion,-arith-cbz-fusion,-balance-fp-ops,-bf16,-brbe,-bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,-ccdp,-ccidx,-ccpp,-cmp-bcc-fusion,-complxnum,-CONTEXTIDREL2,-cortex-r82,-crc,-crypto,-custom-cheap-as-move,-disable-latency-sched-heuristic,-dit,-dotprod,-ecv,-el2vmsa,-el3,+ete,-exynos-cheap-as-move,-f32mm,-f64mm,-fgt,-fix-cortex-a53-835769,-flagm,-force-32bit-jump-tables,-fp16fml,+fp-armv8,-fptoint,-fullfp16,-fuse-address,+fuse-aes,-fuse-arith-logic,-fuse-crypto-eor,-fuse-csel,-fuse-literals,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,-hcx,-i8mm,-jsconv,-lor,-ls64,-lse,-lse2,-lsl-fast,-mops,-mpam,-mte,+neon,-no-bti-at-return-twice,-no-neg-immediates,-no-zcz-fp,-nv,-outline-atomics,-pan,-pan-rwv,-pauth,+perfmon,-predictable-select-expensive,-predres,-rand,-ras,-rcpc,-rcpc-immo,-rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,-sb,-sel2,-sha2,-sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme-f64,-sme-i64,-spe,-spe-eef,-specrestrict,-ssbs,-streaming-sve,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-tagged-globals,-tlb-rmi,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tracev8.4,+trbe,-uaops,-use-experimental-zeroing-pseudos,+use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,-v8.1a,-v8.2a,-v8.3a,-v8.4a,-v8.5a,-v8.6a,-v8.7a,-v8.8a,-v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9a,-vh,-wfxt,-xs,-zcm,-zcz,-zcz-fp-workaround,-zcz-gp" }
attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }

Note in particular the call to @llvm.smul.with.overflow.i128. This will lower to a call to __muloti4 itself, causing an infinite loop. In LLVM 13.0.1, this instead optimized to a mul instruction, avoiding the problem.

If this is all working as designed, then can I please be advised - how is this use case intended to be solved? As far as I can tell, LLVM's own compiler-rt is just getting "lucky" that this is not happening to its __muloti4 symbol, since its implementation is slightly more convoluted.

@andrewrk andrewrk added bug Indicates an unexpected problem or unintended behavior regression miscompilation labels Jul 6, 2022
@andrewrk andrewrk added this to the LLVM 15.0.0 Release milestone Jul 6, 2022
@llvmbot
Copy link
Collaborator

llvmbot commented Jul 6, 2022

@llvm/issue-subscribers-bug

andrewrk added a commit to ziglang/zig that referenced this issue Jul 6, 2022
@mstorsjo
Copy link
Member

mstorsjo commented Jul 9, 2022

I bisected this issue down to 35fa7b8 - CC @LebedevRI.

andrewrk added a commit to ziglang/zig that referenced this issue Jul 19, 2022
wooster0 pushed a commit to wooster0/zig that referenced this issue Jul 24, 2022
@RKSimon
Copy link
Collaborator

RKSimon commented Aug 8, 2022

CC @rotateright who suggested this might be better in Aggressive InstCombine where we can put a better legality check on it

@rotateright
Copy link
Contributor

I spent all day mucking around with AggressiveInstCombine, but then realized we can probably get away with a tiny patch to the existing code to avoid the bug:
https://reviews.llvm.org/D131452

rotateright added a commit that referenced this issue Aug 9, 2022
@rotateright
Copy link
Contributor

We're leaning towards a backend quick-fix instead of or possibly alongside the IR change:
https://reviews.llvm.org/D131521

rotateright added a commit that referenced this issue Aug 17, 2022
This is a potentially better alternative to D131452 that also
should avoid the infinite loop bug from:
issue #56403

This is again a minimal fix to reduce merging pain for the
release. But if this makes sense, then we might want to guard
all of the RTLIB generation (and other libcalls?) with a
similar name check.

Differential Revision: https://reviews.llvm.org/D131521
@rotateright
Copy link
Contributor

/cherry-pick 8eddd1e 7f72a0f

@llvmbot
Copy link
Collaborator

llvmbot commented Aug 18, 2022

/branch llvm/llvm-project-release-prs/issue56403

@llvmbot
Copy link
Collaborator

llvmbot commented Aug 18, 2022

/pull-request llvm/llvm-project-release-prs#114

tru pushed a commit that referenced this issue Aug 22, 2022
tru pushed a commit that referenced this issue Aug 22, 2022
This is a potentially better alternative to D131452 that also
should avoid the infinite loop bug from:
issue #56403

This is again a minimal fix to reduce merging pain for the
release. But if this makes sense, then we might want to guard
all of the RTLIB generation (and other libcalls?) with a
similar name check.

Differential Revision: https://reviews.llvm.org/D131521

(cherry picked from commit 7f72a0f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Indicates an unexpected problem or unintended behavior llvm:codegen miscompilation regression release:merged
Projects
Archived in project
Development

Successfully merging a pull request may close this issue.

6 participants