Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization: combine unlikely branches #53015

Open
tczajka opened this issue Jan 5, 2022 · 2 comments
Open

Optimization: combine unlikely branches #53015

tczajka opened this issue Jan 5, 2022 · 2 comments

Comments

@tczajka
Copy link

tczajka commented Jan 5, 2022

This Rust code:

fn sum(a: &[i32]) -> i32 {
    a[0] + a[1] + a[2]
}

performs bounds checks on each array access in order. Effectively it translates to something like this:

fn sum(a: &[i32]) -> i32 {
    if a.len() == 0 {
        panic!("0 out of range");
    } else if a.len() == 1 {
        panic!("1 out of range");
    } else if a.len() <= 2 {
        panic!("2 out of range");
    } else {
        unsafe { a.get_unchecked(0) + a.get_unchecked(1) + a.get_unchecked(2) }
    }
}

Generated assembly:

        push    rax
        test    rsi, rsi
        je      .LBB0_4
        cmp     rsi, 1
        je      .LBB0_5
        cmp     rsi, 2
        jbe     .LBB0_3
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        add     eax, dword ptr [rdi + 8]
        pop     rcx
        ret

All three "panic" cases unconditionally call an error handling function marked cold.

This could be optimized by combining the tests into one test covering all three unlikely cases, so that there is only one check on the hot path. If we have to go into the cold code, only then differentiate between the different cold cases. Like this:

fn sum(a: &[i32]) -> i32 {
    if a.len() <= 2 {
        if a.len() == 0 {
            panic!("0 out of range");
        } else if a.len() == 1 {
            panic!("1 out of range");
        } else {
            panic!("2 out of range");
        }
    } else {
        unsafe { a.get_unchecked(0) + a.get_unchecked(1) + a.get_unchecked(2) }
    }
}
@LebedevRI
Copy link
Member

I don't know how the IR looks like in those rust examples,
but i essentially tried doing that in https://reviews.llvm.org/D104870
(there were prior attempts at this)

@CAD97
Copy link

CAD97 commented Jan 5, 2022

(I'll provide a compiler explorer link later, on mobile currently...)

Rust:

fn sum(a: &[i32]) -> i32 {
    a[0] + a[1] + a[2]
}
Unoptimized LLVM IR:
; ModuleID = 'playground.7b426eab-cgu.0'
source_filename = "playground.7b426eab-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"core::panic::location::Location" = type { { [0 x i8]*, i64 }, i32, i32 }

@alloc2 = private unnamed_addr constant <{ [10 x i8] }> <{ [10 x i8] c"src/lib.rs" }>, align 1
@alloc3 = private unnamed_addr constant <{ i8*, [16 x i8] }> <{ i8* getelementptr inbounds (<{ [10 x i8] }>, <{ [10 x i8] }>* @alloc2, i32 0, i32 0, i32 0), [16 x i8] c"\0A\00\00\00\00\00\00\00\02\00\00\00\05\00\00\00" }>, align 8
@alloc5 = private unnamed_addr constant <{ i8*, [16 x i8] }> <{ i8* getelementptr inbounds (<{ [10 x i8] }>, <{ [10 x i8] }>* @alloc2, i32 0, i32 0, i32 0), [16 x i8] c"\0A\00\00\00\00\00\00\00\02\00\00\00\0C\00\00\00" }>, align 8
@str.0 = internal constant [28 x i8] c"attempt to add with overflow"
@alloc7 = private unnamed_addr constant <{ i8*, [16 x i8] }> <{ i8* getelementptr inbounds (<{ [10 x i8] }>, <{ [10 x i8] }>* @alloc2, i32 0, i32 0, i32 0), [16 x i8] c"\0A\00\00\00\00\00\00\00\02\00\00\00\13\00\00\00" }>, align 8
@__rustc_debug_gdb_scripts_section__ = linkonce_odr unnamed_addr constant [34 x i8] c"\01gdb_load_rust_pretty_printers.py\00", section ".debug_gdb_scripts", align 1

; playground::sum
; Function Attrs: nonlazybind uwtable
define i32 @_ZN10playground3sum17he5ba01730143811fE([0 x i32]* nonnull align 4 %a.0, i64 %a.1) unnamed_addr #0 !dbg !6 {
start:
  %a.dbg.spill = alloca { [0 x i32]*, i64 }, align 8
  %0 = getelementptr inbounds { [0 x i32]*, i64 }, { [0 x i32]*, i64 }* %a.dbg.spill, i32 0, i32 0
  store [0 x i32]* %a.0, [0 x i32]** %0, align 8
  %1 = getelementptr inbounds { [0 x i32]*, i64 }, { [0 x i32]*, i64 }* %a.dbg.spill, i32 0, i32 1
  store i64 %a.1, i64* %1, align 8
  call void @llvm.dbg.declare(metadata { [0 x i32]*, i64 }* %a.dbg.spill, metadata !20, metadata !DIExpression()), !dbg !21
  %_6 = icmp ult i64 0, %a.1, !dbg !22
  %2 = call i1 @llvm.expect.i1(i1 %_6, i1 true), !dbg !22
  br i1 %2, label %bb1, label %panic, !dbg !22

bb1:                                              ; preds = %start
  %3 = getelementptr inbounds [0 x i32], [0 x i32]* %a.0, i64 0, i64 0, !dbg !22
  %_3 = load i32, i32* %3, align 4, !dbg !22
  %_10 = icmp ult i64 1, %a.1, !dbg !23
  %4 = call i1 @llvm.expect.i1(i1 %_10, i1 true), !dbg !23
  br i1 %4, label %bb2, label %panic1, !dbg !23

panic:                                            ; preds = %start
; call core::panicking::panic_bounds_check
  call void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64 0, i64 %a.1, %"core::panic::location::Location"* align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc3 to %"core::panic::location::Location"*)) #4, !dbg !22
  unreachable, !dbg !22

bb2:                                              ; preds = %bb1
  %5 = getelementptr inbounds [0 x i32], [0 x i32]* %a.0, i64 0, i64 1, !dbg !23
  %_7 = load i32, i32* %5, align 4, !dbg !23
  %6 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %_3, i32 %_7), !dbg !22
  %_11.0 = extractvalue { i32, i1 } %6, 0, !dbg !22
  %_11.1 = extractvalue { i32, i1 } %6, 1, !dbg !22
  %7 = call i1 @llvm.expect.i1(i1 %_11.1, i1 false), !dbg !22
  br i1 %7, label %panic2, label %bb3, !dbg !22

panic1:                                           ; preds = %bb1
; call core::panicking::panic_bounds_check
  call void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64 1, i64 %a.1, %"core::panic::location::Location"* align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc5 to %"core::panic::location::Location"*)) #4, !dbg !23
  unreachable, !dbg !23

bb3:                                              ; preds = %bb2
  %_15 = icmp ult i64 2, %a.1, !dbg !24
  %8 = call i1 @llvm.expect.i1(i1 %_15, i1 true), !dbg !24
  br i1 %8, label %bb4, label %panic3, !dbg !24

panic2:                                           ; preds = %bb2
; call core::panicking::panic
  call void @_ZN4core9panicking5panic17h0ba7146865b2f9d6E([0 x i8]* nonnull align 1 bitcast ([28 x i8]* @str.0 to [0 x i8]*), i64 28, %"core::panic::location::Location"* align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc3 to %"core::panic::location::Location"*)) #4, !dbg !22
  unreachable, !dbg !22

bb4:                                              ; preds = %bb3
  %9 = getelementptr inbounds [0 x i32], [0 x i32]* %a.0, i64 0, i64 2, !dbg !24
  %_12 = load i32, i32* %9, align 4, !dbg !24
  %10 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %_11.0, i32 %_12), !dbg !22
  %_16.0 = extractvalue { i32, i1 } %10, 0, !dbg !22
  %_16.1 = extractvalue { i32, i1 } %10, 1, !dbg !22
  %11 = call i1 @llvm.expect.i1(i1 %_16.1, i1 false), !dbg !22
  br i1 %11, label %panic4, label %bb5, !dbg !22

panic3:                                           ; preds = %bb3
; call core::panicking::panic_bounds_check
  call void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64 2, i64 %a.1, %"core::panic::location::Location"* align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc7 to %"core::panic::location::Location"*)) #4, !dbg !24
  unreachable, !dbg !24

bb5:                                              ; preds = %bb4
  ret i32 %_16.0, !dbg !25

panic4:                                           ; preds = %bb4
; call core::panicking::panic
  call void @_ZN4core9panicking5panic17h0ba7146865b2f9d6E([0 x i8]* nonnull align 1 bitcast ([28 x i8]* @str.0 to [0 x i8]*), i64 28, %"core::panic::location::Location"* align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc3 to %"core::panic::location::Location"*)) #4, !dbg !22
  unreachable, !dbg !22
}

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1

; Function Attrs: nofree nosync nounwind readnone willreturn
declare i1 @llvm.expect.i1(i1, i1) #2

; core::panicking::panic_bounds_check
; Function Attrs: cold noinline noreturn nonlazybind uwtable
declare void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64, i64, %"core::panic::location::Location"* align 8 dereferenceable(24)) unnamed_addr #3

; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1

; core::panicking::panic
; Function Attrs: cold noinline noreturn nonlazybind uwtable
declare void @_ZN4core9panicking5panic17h0ba7146865b2f9d6E([0 x i8]* nonnull align 1, i64, %"core::panic::location::Location"* align 8 dereferenceable(24)) unnamed_addr #3

attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #1 = { nofree nosync nounwind readnone speculatable willreturn }
attributes #2 = { nofree nosync nounwind readnone willreturn }
attributes #3 = { cold noinline noreturn nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #4 = { noreturn }

!llvm.module.flags = !{!0, !1, !2}
!llvm.dbg.cu = !{!3}

!0 = !{i32 7, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}
!2 = !{i32 2, !"Debug Info Version", i32 3}
!3 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !4, producer: "clang LLVM (rustc version 1.57.0 (f1edd0429 2021-11-29))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !5)
!4 = !DIFile(filename: "src/lib.rs", directory: "/playground")
!5 = !{}
!6 = distinct !DISubprogram(name: "sum", linkageName: "_ZN10playground3sum17he5ba01730143811fE", scope: !8, file: !7, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !3, templateParams: !5, retainedNodes: !19)
!7 = !DIFile(filename: "src/lib.rs", directory: "/playground", checksumkind: CSK_MD5, checksum: "d4437818f085c36dbbb8cdce8d42bc47")
!8 = !DINamespace(name: "playground", scope: null)
!9 = !DISubroutineType(types: !10)
!10 = !{!11, !12}
!11 = !DIBasicType(name: "i32", size: 32, encoding: DW_ATE_signed)
!12 = !DICompositeType(tag: DW_TAG_structure_type, name: "&[i32]", file: !13, size: 128, align: 64, elements: !14, templateParams: !5, identifier: "eb06f4cf6efdc2bd8df73753b82fb221")
!13 = !DIFile(filename: "<unknown>", directory: "")
!14 = !{!15, !17}
!15 = !DIDerivedType(tag: DW_TAG_member, name: "data_ptr", scope: !12, file: !13, baseType: !16, size: 64, align: 64)
!16 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const i32", baseType: !11, size: 64, align: 64, dwarfAddressSpace: 0)
!17 = !DIDerivedType(tag: DW_TAG_member, name: "length", scope: !12, file: !13, baseType: !18, size: 64, align: 64, offset: 64)
!18 = !DIBasicType(name: "usize", size: 64, encoding: DW_ATE_unsigned)
!19 = !{!20}
!20 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !7, line: 1, type: !12)
!21 = !DILocation(line: 1, column: 12, scope: !6)
!22 = !DILocation(line: 2, column: 5, scope: !6)
!23 = !DILocation(line: 2, column: 12, scope: !6)
!24 = !DILocation(line: 2, column: 19, scope: !6)
!25 = !DILocation(line: 3, column: 2, scope: !6)
Optimized IR (rustc 1.57.0)
; ModuleID = 'playground.7284a82e-cgu.0'
source_filename = "playground.7284a82e-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"core::panic::location::Location" = type { { [0 x i8]*, i64 }, i32, i32 }

@alloc6 = private unnamed_addr constant <{ [10 x i8] }> <{ [10 x i8] c"src/lib.rs" }>, align 1
@alloc3 = private unnamed_addr constant <{ i8*, [16 x i8] }> <{ i8* getelementptr inbounds (<{ [10 x i8] }>, <{ [10 x i8] }>* @alloc6, i32 0, i32 0, i32 0), [16 x i8] c"\0A\00\00\00\00\00\00\00\02\00\00\00\05\00\00\00" }>, align 8
@alloc5 = private unnamed_addr constant <{ i8*, [16 x i8] }> <{ i8* getelementptr inbounds (<{ [10 x i8] }>, <{ [10 x i8] }>* @alloc6, i32 0, i32 0, i32 0), [16 x i8] c"\0A\00\00\00\00\00\00\00\02\00\00\00\0C\00\00\00" }>, align 8
@alloc7 = private unnamed_addr constant <{ i8*, [16 x i8] }> <{ i8* getelementptr inbounds (<{ [10 x i8] }>, <{ [10 x i8] }>* @alloc6, i32 0, i32 0, i32 0), [16 x i8] c"\0A\00\00\00\00\00\00\00\02\00\00\00\13\00\00\00" }>, align 8

; playground::sum
; Function Attrs: nonlazybind uwtable
define i32 @_ZN10playground3sum17h097620a467899888E([0 x i32]* noalias nocapture nonnull readonly align 4 %a.0, i64 %a.1) unnamed_addr #0 {
start:
  switch i64 %a.1, label %bb2 [
    i64 0, label %panic
    i64 1, label %panic1
  ], !prof !2

panic:                                            ; preds = %start
; call core::panicking::panic_bounds_check
  tail call void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64 0, i64 0, %"core::panic::location::Location"* noalias readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc3 to %"core::panic::location::Location"*)) #2
  unreachable

bb2:                                              ; preds = %start
  %_14 = icmp ugt i64 %a.1, 2
  br i1 %_14, label %bb3, label %panic2, !prof !3

panic1:                                           ; preds = %start
; call core::panicking::panic_bounds_check
  tail call void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64 1, i64 1, %"core::panic::location::Location"* noalias readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc5 to %"core::panic::location::Location"*)) #2
  unreachable

bb3:                                              ; preds = %bb2
  %0 = getelementptr inbounds [0 x i32], [0 x i32]* %a.0, i64 0, i64 0
  %_3 = load i32, i32* %0, align 4
  %1 = getelementptr inbounds [0 x i32], [0 x i32]* %a.0, i64 0, i64 1
  %_7 = load i32, i32* %1, align 4
  %_2 = add i32 %_7, %_3
  %2 = getelementptr inbounds [0 x i32], [0 x i32]* %a.0, i64 0, i64 2
  %_11 = load i32, i32* %2, align 4
  %3 = add i32 %_2, %_11
  ret i32 %3

panic2:                                           ; preds = %bb2
; call core::panicking::panic_bounds_check
  tail call void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64 2, i64 2, %"core::panic::location::Location"* noalias readonly align 8 dereferenceable(24) bitcast (<{ i8*, [16 x i8] }>* @alloc7 to %"core::panic::location::Location"*)) #2
  unreachable
}

; core::panicking::panic_bounds_check
; Function Attrs: cold noinline noreturn nonlazybind uwtable
declare void @_ZN4core9panicking18panic_bounds_check17h0ffbb3014c878e8aE(i64, i64, %"core::panic::location::Location"* noalias readonly align 8 dereferenceable(24)) unnamed_addr #1

attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #1 = { cold noinline noreturn nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #2 = { noreturn }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 7, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}
!2 = !{!"branch_weights", i32 4000000, i32 2001, i32 2000}
!3 = !{!"branch_weights", i32 2000, i32 1}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

4 participants