LLVM 14 regression: __muloti4 is lowered with a recursive call despite nobuiltin attribute #56403

andrewrk · 2022-07-06T04:03:06Z

This is a regression from 13.0.1 to 14.0.6. The reproduction I have here is for aarch64-linux, however, I can also reproduce it for wasm32-wasi.

To help provide an overview of the IR, here is the high level source code:

const std = @import("std");
const builtin = @import("builtin");

export fn __muloti4(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 {
    return muloXi4_genericSmall(i128, a, b, overflow);
}

inline fn muloXi4_genericSmall(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST {
    overflow.* = 0;
    const min = std.math.minInt(ST);
    var res: ST = a *% b;
    if ((a < 0 and b == min) or (a != 0 and @divTrunc(res, a) != b))
        overflow.* = 1;
    return res;
}

Input LLVM IR:

; ModuleID = 'compiler_rt'
source_filename = "compiler_rt"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-unknown"

@builtin.zig_backend = internal unnamed_addr constant i64 2, align 8
@builtin.output_mode = internal unnamed_addr constant i2 -2, align 1

; Function Attrs: nobuiltin nounwind
define dso_local i128 @__muloti4(i128 %0, i128 %1, i32* nonnull align 4 %2) #0 {
Entry:
  %3 = alloca i128, align 16
  %4 = alloca i128, align 16
  store i32 0, i32* %2, align 4
  %5 = mul i128 %0, %1
  store i128 %5, i128* %3, align 16
  %6 = icmp slt i128 %0, 0
  br i1 %6, label %Then, label %Else

Then:                                             ; preds = %Entry
  %7 = icmp eq i128 %1, -170141183460469231731687303715884105728
  br label %Block

Else:                                             ; preds = %Entry
  br label %Block

Block:                                            ; preds = %Else, %Then
  %8 = phi i1 [ %7, %Then ], [ false, %Else ]
  br i1 %8, label %Then1, label %Else2

Then1:                                            ; preds = %Block
  br label %Block6

Else2:                                            ; preds = %Block
  %9 = icmp ne i128 %0, 0
  br i1 %9, label %Then3, label %Else4

Then3:                                            ; preds = %Else2
  %10 = load i128, i128* %3, align 16
  %11 = sdiv i128 %10, %0
  %12 = icmp ne i128 %11, %1
  br label %Block5

Else4:                                            ; preds = %Else2
  br label %Block5

Block5:                                           ; preds = %Else4, %Then3
  %13 = phi i1 [ %12, %Then3 ], [ false, %Else4 ]
  br label %Block6

Block6:                                           ; preds = %Block5, %Then1
  %14 = phi i1 [ true, %Then1 ], [ %13, %Block5 ]
  br i1 %14, label %Then7, label %Else8

Then7:                                            ; preds = %Block6
  store i32 1, i32* %2, align 4
  br label %Block9

Else8:                                            ; preds = %Block6
  br label %Block9

Block9:                                           ; preds = %Else8, %Then7
  %15 = load i128, i128* %3, align 16
  store i128 %15, i128* %4, align 16
  %16 = load i128, i128* %4, align 16
  ret i128 %16
}

attributes #0 = { nobuiltin nounwind "frame-pointer"="none" "target-cpu"="generic" "target-features"="-a510,-a65,-a710,-a76,-a78,-a78c,-aes,-aggressive-fma,-alternate-sextload-cvt-f32-pattern,-altnzcv,-am,-amvs,-arith-bcc-fusion,-arith-cbz-fusion,-balance-fp-ops,-bf16,-brbe,-bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,-ccdp,-ccidx,-ccpp,-cmp-bcc-fusion,-complxnum,-CONTEXTIDREL2,-cortex-r82,-crc,-crypto,-custom-cheap-as-move,-disable-latency-sched-heuristic,-dit,-dotprod,-ecv,-el2vmsa,-el3,+ete,-exynos-cheap-as-move,-f32mm,-f64mm,-fgt,-fix-cortex-a53-835769,-flagm,-force-32bit-jump-tables,-fp16fml,+fp-armv8,-fptoint,-fullfp16,-fuse-address,+fuse-aes,-fuse-arith-logic,-fuse-crypto-eor,-fuse-csel,-fuse-literals,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,-hcx,-i8mm,-jsconv,-lor,-ls64,-lse,-lse2,-lsl-fast,-mops,-mpam,-mte,+neon,-no-bti-at-return-twice,-no-neg-immediates,-no-zcz-fp,-nv,-outline-atomics,-pan,-pan-rwv,-pauth,+perfmon,-predictable-select-expensive,-predres,-rand,-ras,-rcpc,-rcpc-immo,-rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,-sb,-sel2,-sha2,-sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme-f64,-sme-i64,-spe,-spe-eef,-specrestrict,-ssbs,-streaming-sve,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-tagged-globals,-tlb-rmi,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tracev8.4,+trbe,-uaops,-use-experimental-zeroing-pseudos,+use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,-v8.1a,-v8.2a,-v8.3a,-v8.4a,-v8.5a,-v8.6a,-v8.7a,-v8.8a,-v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9a,-vh,-wfxt,-xs,-zcm,-zcz,-zcz-fp-workaround,-zcz-gp" }

Note that the __muloti4 function has the nobuiltin attribute. After optimizations are applied, the IR transforms to:

; ModuleID = 'compiler_rt'
source_filename = "compiler_rt"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-unknown"

; Function Attrs: mustprogress nobuiltin nofree nosync nounwind willreturn writeonly
define dso_local i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align 4 %2) local_unnamed_addr #0 {
Entry:
  store i32 0, i32* %2, align 4
  %.fr = freeze i128 %1
  %mul = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %0, i128 %.fr)
  %3 = icmp slt i128 %0, 0
  %4 = icmp eq i128 %.fr, -170141183460469231731687303715884105728
  %5 = and i1 %3, %4
  br i1 %5, label %Then7, label %Else2

Else2:                                            ; preds = %Entry
  %mul.ov = extractvalue { i128, i1 } %mul, 1
  br i1 %mul.ov, label %Then7, label %Block9

Then7:                                            ; preds = %Else2, %Entry
  store i32 1, i32* %2, align 4
  br label %Block9

Block9:                                           ; preds = %Else2, %Then7
  %mul.val = extractvalue { i128, i1 } %mul, 0
  ret i128 %mul.val
}

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128) #1

attributes #0 = { mustprogress nobuiltin nofree nosync nounwind willreturn writeonly "frame-pointer"="none" "target-cpu"="generic" "target-features"="-a510,-a65,-a710,-a76,-a78,-a78c,-aes,-aggressive-fma,-alternate-sextload-cvt-f32-pattern,-altnzcv,-am,-amvs,-arith-bcc-fusion,-arith-cbz-fusion,-balance-fp-ops,-bf16,-brbe,-bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,-ccdp,-ccidx,-ccpp,-cmp-bcc-fusion,-complxnum,-CONTEXTIDREL2,-cortex-r82,-crc,-crypto,-custom-cheap-as-move,-disable-latency-sched-heuristic,-dit,-dotprod,-ecv,-el2vmsa,-el3,+ete,-exynos-cheap-as-move,-f32mm,-f64mm,-fgt,-fix-cortex-a53-835769,-flagm,-force-32bit-jump-tables,-fp16fml,+fp-armv8,-fptoint,-fullfp16,-fuse-address,+fuse-aes,-fuse-arith-logic,-fuse-crypto-eor,-fuse-csel,-fuse-literals,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,-hcx,-i8mm,-jsconv,-lor,-ls64,-lse,-lse2,-lsl-fast,-mops,-mpam,-mte,+neon,-no-bti-at-return-twice,-no-neg-immediates,-no-zcz-fp,-nv,-outline-atomics,-pan,-pan-rwv,-pauth,+perfmon,-predictable-select-expensive,-predres,-rand,-ras,-rcpc,-rcpc-immo,-rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,-sb,-sel2,-sha2,-sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme-f64,-sme-i64,-spe,-spe-eef,-specrestrict,-ssbs,-streaming-sve,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-tagged-globals,-tlb-rmi,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tracev8.4,+trbe,-uaops,-use-experimental-zeroing-pseudos,+use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,-v8.1a,-v8.2a,-v8.3a,-v8.4a,-v8.5a,-v8.6a,-v8.7a,-v8.8a,-v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9a,-vh,-wfxt,-xs,-zcm,-zcz,-zcz-fp-workaround,-zcz-gp" }
attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }

Note in particular the call to @llvm.smul.with.overflow.i128. This will lower to a call to __muloti4 itself, causing an infinite loop. In LLVM 13.0.1, this instead optimized to a mul instruction, avoiding the problem.

If this is all working as designed, then can I please be advised - how is this use case intended to be solved? As far as I can tell, LLVM's own compiler-rt is just getting "lucky" that this is not happening to its __muloti4 symbol, since its implementation is slightly more convoluted.

The text was updated successfully, but these errors were encountered:

llvmbot · 2022-07-06T04:03:23Z

@llvm/issue-subscribers-bug

This is a workaround for llvm/llvm-project#56403

mstorsjo · 2022-07-09T19:13:55Z

I bisected this issue down to 35fa7b8 - CC @LebedevRI.

This is a workaround for llvm/llvm-project#56403

RKSimon · 2022-08-08T13:08:59Z

CC @rotateright who suggested this might be better in Aggressive InstCombine where we can put a better legality check on it

rotateright · 2022-08-08T23:51:13Z

I spent all day mucking around with AggressiveInstCombine, but then realized we can probably get away with a tiny patch to the existing code to avoid the bug:
https://reviews.llvm.org/D131452

Reduced from issue #56403

rotateright · 2022-08-17T17:54:14Z

We're leaning towards a backend quick-fix instead of or possibly alongside the IR change:
https://reviews.llvm.org/D131521

Issue #56403

This is a potentially better alternative to D131452 that also should avoid the infinite loop bug from: issue #56403 This is again a minimal fix to reduce merging pain for the release. But if this makes sense, then we might want to guard all of the RTLIB generation (and other libcalls?) with a similar name check. Differential Revision: https://reviews.llvm.org/D131521

rotateright · 2022-08-18T11:44:43Z

/cherry-pick 8eddd1e 7f72a0f

llvmbot · 2022-08-18T11:50:14Z

/branch llvm/llvm-project-release-prs/issue56403

llvmbot · 2022-08-18T11:56:45Z

/pull-request llvm/llvm-project-release-prs#114

Issue #56403 (cherry picked from commit 8eddd1e)

This is a potentially better alternative to D131452 that also should avoid the infinite loop bug from: issue #56403 This is again a minimal fix to reduce merging pain for the release. But if this makes sense, then we might want to guard all of the RTLIB generation (and other libcalls?) with a similar name check. Differential Revision: https://reviews.llvm.org/D131521 (cherry picked from commit 7f72a0f)

andrewrk added bug Indicates an unexpected problem or unintended behavior regression miscompilation labels Jul 6, 2022

andrewrk added this to the LLVM 15.0.0 Release milestone Jul 6, 2022

andrewrk added a commit to ziglang/zig that referenced this issue Jul 6, 2022

compiler_rt: work around LLVM optimizing __muloti4 to call itself

d8fc8d0

This is a workaround for llvm/llvm-project#56403

andrewrk added a commit to ziglang/zig that referenced this issue Jul 19, 2022

compiler_rt: work around LLVM optimizing __muloti4 to call itself

ee81619

This is a workaround for llvm/llvm-project#56403

wooster0 pushed a commit to wooster0/zig that referenced this issue Jul 24, 2022

compiler_rt: work around LLVM optimizing __muloti4 to call itself

9aa4813

This is a workaround for llvm/llvm-project#56403

rotateright added a commit that referenced this issue Aug 9, 2022

[PhaseOrdering][AArch64] add test for mul-with-overflow; NFC

56a1c61

Reduced from issue #56403

rotateright added a commit that referenced this issue Aug 17, 2022

[AArch64] add test for recursive libcall lowering; NFC

8eddd1e

Issue #56403

llvmbot mentioned this issue Aug 18, 2022

PR for llvm/llvm-project#56403 llvm/llvm-project-release-prs#114

Merged

EugeneZelenko added the llvm:codegen label Aug 18, 2022

tru closed this as completed in llvm/llvm-project-release-prs#114 Aug 22, 2022

tru pushed a commit that referenced this issue Aug 22, 2022

[AArch64] add test for recursive libcall lowering; NFC

e2613c5

Issue #56403 (cherry picked from commit 8eddd1e)

EugeneZelenko added the release:merged label Aug 22, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

LLVM 14 regression: __muloti4 is lowered with a recursive call despite nobuiltin attribute #56403

LLVM 14 regression: __muloti4 is lowered with a recursive call despite nobuiltin attribute #56403

andrewrk commented Jul 6, 2022

llvmbot commented Jul 6, 2022

mstorsjo commented Jul 9, 2022

RKSimon commented Aug 8, 2022

rotateright commented Aug 8, 2022

rotateright commented Aug 17, 2022

rotateright commented Aug 18, 2022

llvmbot commented Aug 18, 2022

llvmbot commented Aug 18, 2022

LLVM 14 regression: __muloti4 is lowered with a recursive call despite nobuiltin attribute #56403

LLVM 14 regression: __muloti4 is lowered with a recursive call despite nobuiltin attribute #56403

Comments

andrewrk commented Jul 6, 2022

llvmbot commented Jul 6, 2022

mstorsjo commented Jul 9, 2022

RKSimon commented Aug 8, 2022

rotateright commented Aug 8, 2022

rotateright commented Aug 17, 2022

rotateright commented Aug 18, 2022

llvmbot commented Aug 18, 2022

llvmbot commented Aug 18, 2022