Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missed optimization for combination of conditionals #52622

Open
JakobDegen opened this issue Dec 11, 2021 · 2 comments
Open

Missed optimization for combination of conditionals #52622

JakobDegen opened this issue Dec 11, 2021 · 2 comments

Comments

@JakobDegen
Copy link

JakobDegen commented Dec 11, 2021

rustc emits the following LLVM-IR:

Original IR
; ModuleID = 'test.6a688027-cgu.0'
source_filename = "test.6a688027-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; core::hint::unreachable_unchecked
; Function Attrs: inlinehint noreturn nonlazybind uwtable
define internal void @_ZN4core4hint21unreachable_unchecked17h79872ecff0660357E() unnamed_addr #0 {
start:
  unreachable
}

; <core::num::nonzero::NonZeroU32 as core::cmp::PartialEq>::eq
; Function Attrs: inlinehint nonlazybind uwtable
define internal zeroext i1 @"_ZN71_$LT$core..num..nonzero..NonZeroU32$u20$as$u20$core..cmp..PartialEq$GT$2eq17hcc6145d807d01b45E"(i32* noalias readonly align 4 dereferenceable(4) %self, i32* noalias readonly align 4 dereferenceable(4) %other) unnamed_addr #1 {
start:
  %_5 = load i32, i32* %self, align 4
  %_6 = load i32, i32* %other, align 4
  %0 = icmp eq i32 %_5, %_6
  ret i1 %0
}

; test::disc
; Function Attrs: nonlazybind uwtable
define internal i8 @_ZN4test4disc17hd5bc60725a718a4dE(i32* noalias readonly align 4 dereferenceable(4) %a) unnamed_addr #2 {
start:
  %0 = alloca i8, align 1
  %1 = load i32, i32* %a, align 4
  %2 = icmp eq i32 %1, 0
  %_2 = select i1 %2, i64 0, i64 1
  switch i64 %_2, label %bb2 [
    i64 0, label %bb1
    i64 1, label %bb3
  ]

bb2:                                              ; preds = %start
  unreachable

bb1:                                              ; preds = %start
  store i8 0, i8* %0, align 1
  br label %bb4

bb3:                                              ; preds = %start
  store i8 1, i8* %0, align 1
  br label %bb4

bb4:                                              ; preds = %bb1, %bb3
  %3 = load i8, i8* %0, align 1
  ret i8 %3
}

; test::cmp3
; Function Attrs: nonlazybind uwtable
define zeroext i1 @_ZN4test4cmp317h6f7d157bae19d84dE(i32 %0, i32 %1) unnamed_addr #2 {
start:
  %y = alloca i32, align 4
  %x = alloca i32, align 4
  %_10 = alloca { i32, i32 }, align 4
  %2 = alloca i8, align 1
  %b = alloca i32, align 4
  %a = alloca i32, align 4
  store i32 %0, i32* %a, align 4
  store i32 %1, i32* %b, align 4
; call test::disc
  %_4 = call i8 @_ZN4test4disc17hd5bc60725a718a4dE(i32* noalias readonly align 4 dereferenceable(4) %a)
  br label %bb1

bb1:                                              ; preds = %start
; call test::disc
  %_7 = call i8 @_ZN4test4disc17hd5bc60725a718a4dE(i32* noalias readonly align 4 dereferenceable(4) %b)
  br label %bb2

bb2:                                              ; preds = %bb1
  %_3 = icmp eq i8 %_4, %_7
  br i1 %_3, label %bb3, label %bb11

bb11:                                             ; preds = %bb2
  store i8 0, i8* %2, align 1
  br label %bb12

bb3:                                              ; preds = %bb2
  %3 = bitcast { i32, i32 }* %_10 to i8*
  call void @llvm.lifetime.start.p0i8(i64 8, i8* %3)
  %_11 = load i32, i32* %a, align 4
  %_12 = load i32, i32* %b, align 4
  %4 = bitcast { i32, i32 }* %_10 to i32*
  store i32 %_11, i32* %4, align 4
  %5 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %_10, i32 0, i32 1
  store i32 %_12, i32* %5, align 4
  %6 = bitcast { i32, i32 }* %_10 to i32*
  %7 = load i32, i32* %6, align 4
  %8 = icmp eq i32 %7, 0
  %_15 = select i1 %8, i64 0, i64 1
  switch i64 %_15, label %bb5 [
    i64 0, label %bb4
    i64 1, label %bb6
  ]

bb5:                                              ; preds = %bb4, %bb6, %bb3
; call core::hint::unreachable_unchecked
  call void @_ZN4core4hint21unreachable_unchecked17h79872ecff0660357E()
  unreachable

bb4:                                              ; preds = %bb3
  %9 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %_10, i32 0, i32 1
  %10 = load i32, i32* %9, align 4
  %11 = icmp eq i32 %10, 0
  %_13 = select i1 %11, i64 0, i64 1
  %12 = icmp eq i64 %_13, 0
  br i1 %12, label %bb9, label %bb5

bb6:                                              ; preds = %bb3
  %13 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %_10, i32 0, i32 1
  %14 = load i32, i32* %13, align 4
  %15 = icmp eq i32 %14, 0
  %_14 = select i1 %15, i64 0, i64 1
  %16 = icmp eq i64 %_14, 1
  br i1 %16, label %bb7, label %bb5

bb7:                                              ; preds = %bb6
  %17 = bitcast i32* %x to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %17)
  %18 = bitcast { i32, i32 }* %_10 to i32*
  %19 = load i32, i32* %18, align 4, !range !2
  store i32 %19, i32* %x, align 4
  %20 = bitcast i32* %y to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %20)
  %21 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %_10, i32 0, i32 1
  %22 = load i32, i32* %21, align 4, !range !2
  store i32 %22, i32* %y, align 4
; call <core::num::nonzero::NonZeroU32 as core::cmp::PartialEq>::eq
  %23 = call zeroext i1 @"_ZN71_$LT$core..num..nonzero..NonZeroU32$u20$as$u20$core..cmp..PartialEq$GT$2eq17hcc6145d807d01b45E"(i32* noalias readonly align 4 dereferenceable(4) %x, i32* noalias readonly align 4 dereferenceable(4) %y)
  %24 = zext i1 %23 to i8
  store i8 %24, i8* %2, align 1
  br label %bb8

bb8:                                              ; preds = %bb7
  %25 = bitcast i32* %y to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %25)
  %26 = bitcast i32* %x to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %26)
  br label %bb10

bb10:                                             ; preds = %bb9, %bb8
  %27 = bitcast { i32, i32 }* %_10 to i8*
  call void @llvm.lifetime.end.p0i8(i64 8, i8* %27)
  br label %bb12

bb9:                                              ; preds = %bb4
  store i8 1, i8* %2, align 1
  br label %bb10

bb12:                                             ; preds = %bb11, %bb10
  %28 = load i8, i8* %2, align 1, !range !3
  %29 = trunc i8 %28 to i1
  ret i1 %29
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #3

attributes #0 = { inlinehint noreturn nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #1 = { inlinehint nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #2 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #3 = { argmemonly nofree nosync nounwind willreturn }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 7, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}
!2 = !{i32 1, i32 0}
!3 = !{i8 0, i8 2}

which after optimization becomes

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define zeroext i1 @_ZN4test4cmp317h6f7d157bae19d84dE(i32 %0, i32 %1) unnamed_addr #0 {
start:
  %2 = icmp eq i32 %0, 0
  %3 = icmp ne i32 %1, 0
  %_3 = xor i1 %2, %3
  br i1 %_3, label %bb3, label %bb12

bb3:                                              ; preds = %start
  br i1 %2, label %bb4, label %bb6

bb4:                                              ; preds = %bb3
  %4 = icmp eq i32 %1, 0
  tail call void @llvm.assume(i1 %4)
  br label %bb12

bb6:                                              ; preds = %bb3
  tail call void @llvm.assume(i1 %3)
  %5 = icmp eq i32 %0, %1
  br label %bb12

bb12:                                             ; preds = %bb4, %bb6, %start
  %.1 = phi i1 [ false, %start ], [ %5, %bb6 ], [ true, %bb4 ]
  ret i1 %.1
}

declare void @llvm.assume(i1 noundef) #1

attributes #0 = { mustprogress nofree nosync nounwind nonlazybind readnone uwtable willreturn "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 7, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}

I believe this is a missed optimization, and it would be correct to compile the function into a simple equality check on the inputs. This is however my first time reading llvm-ir, so I could also be wrong; please let me know if so.

I have tested this on both the rustc LLVM artifacts and on godbolt using trunk. If I should provide anymore metadata, please let me know. This example is the result of compiling fairly idiomatic Rust, and I can provide other similar (but probably ultimately equivalent) examples if needed.

@asl asl added help wanted Indicates that a maintainer wants help. Not [good first issue]. new issue and removed help wanted Indicates that a maintainer wants help. Not [good first issue]. labels Dec 12, 2021
@duk-37 duk-37 self-assigned this Jan 12, 2022
@duk-37
Copy link
Contributor

duk-37 commented Jan 12, 2022

Here's a godbolt link to some code you might find interesting. The topmost source editor is the LLVM IR you pasted here with one interesting change: the llvm.assume calls are removed. Unfortunately, this seems to produce marginally better codegen since some of the basic blocks are then removed. You can see this with the llc view in the middle.

Removing the problematic assume instructions seems to be a relatively easy case; walk through assume operands (with some bounds to prevent time blowup), then kill the assume if none of them are used after the assume (i.e. there are no code paths from the assume that influence its operands after it). This could be done with a simple dominance check, so nothing too bad here.

The rest of the code is similar to the C++ sample at the bottom.

This seems like it could be a missing case in InstCombine or SimplifyCFG with the general case being (a op1 n) op2 (b op1 n) implying a != b on the other branch, where op1 is any integer comparison operation and op2 is a boolean comparison. For example, (a < 2) == (b < 2) implying a != b in the false branch. Note that we can't infer that a == b in the true branch. For example, with a = 3 and b = 4, we'd hit the true branch without the two values being equivalent.

@scottmcm
Copy link

scottmcm commented Jan 28, 2023

Another repro, which doesn't get the assumes, based on rust-lang/rust#107022, https://rust.godbolt.org/z/nMrfjY7a3

After optimizations it ends up as

define noundef zeroext i1 @_ZN7example4demo17h8aa2bedc8aa1e47fE(i8 noundef %x, i8 noundef %y) unnamed_addr #1 {
  %0 = icmp eq i8 %x, 3
  br i1 %0, label %bb1, label %bb3

bb1:                                              ; preds = %start
  %1 = icmp eq i8 %y, 3
  br label %bb7

bb3:                                              ; preds = %start
  %.not = icmp ne i8 %y, 3
  %2 = icmp eq i8 %x, %y
  %spec.select = and i1 %.not, %2
  br label %bb7

bb7:                                              ; preds = %bb3, %bb1
  %.0 = phi i1 [ %1, %bb1 ], [ %spec.select, %bb3 ]
  ret i1 %.0
}

Which https://alive2.llvm.org/ce/z/RmgVaQ confirms could just be

  %0 = icmp eq i8 %x, %y
  ret i1 %0

Direct repro for whether opt can improve it yet: https://llvm.godbolt.org/z/9qGcxdfd8

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

6 participants