Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unnecessary stack usage #53348

Open
newpavlov opened this issue Jan 22, 2022 · 4 comments
Open

Unnecessary stack usage #53348

newpavlov opened this issue Jan 22, 2022 · 4 comments

Comments

@newpavlov
Copy link

Initially reported in rust-lang/rust#88930

I have the following simple SIMD-powered function which tests whether points are inside one of bounding boxes written in Rust (should be trivial to translate it to a different language):

pub unsafe fn foo(
    x: &[__m256i; N],
    y: &[__m256i; N],
    z: &[__m256i; N],
    bboxes: &[[__m256i; 6]],
) -> [__m256i; N] {
    let mut res = [_mm256_setzero_si256(); N];
    for bbox in bboxes {
        for i in 0..N {
            let tx = _mm256_and_si256(
                _mm256_cmpgt_epi32(x[i], bbox[0]),
                _mm256_cmpgt_epi32(bbox[1], x[i]),
            );
            let ty = _mm256_and_si256(
                _mm256_cmpgt_epi32(y[i], bbox[2]),
                _mm256_cmpgt_epi32(bbox[3], y[i]),
            );
            let t = _mm256_and_si256(tx, ty);
            let tz = _mm256_and_si256(
                _mm256_cmpgt_epi32(z[i], bbox[4]),
                _mm256_cmpgt_epi32(bbox[5], z[i]),
            );
            let t = _mm256_and_si256(t, tz);
            res[i] = _mm256_or_si256(res[i], t);
        }
    }
    res
}

By inspecting the generated assembly we can see that for some reason it caches coordinates to stack and reads them from it each iteration instead of using the input pointers. The same behavior can be observed for a function which processes coordinate slices. This caching looks quite redundant to me, especially considering that noalias is enabled (i.e. compiler should know that memory at which coordinates are stored can not change during function execution).

It looks like LLVM correctly moves coordinate loads from the inner loop using its infinite virtual registers. And it's exactly the behavior we want when there is enough physical registers. But when it's not true, it spills virtual register values to stack instead of relying on the original locations.

Looks like this issue also affects SHA-2 implementation on RISC-V targets.

@llvmbot
Copy link
Collaborator

llvmbot commented Jan 22, 2022

@llvm/issue-subscribers-backend-risc-v

@llvmbot
Copy link
Collaborator

llvmbot commented Jan 22, 2022

@llvm/issue-subscribers-backend-x86

@asl
Copy link
Collaborator

asl commented Jan 22, 2022

Will you please attach LLVM IR reproducer?

@newpavlov
Copy link
Author

newpavlov commented Jan 22, 2022

Will this do?

Click to expand
; ModuleID = 'foo.b13c2645-cgu.0'
source_filename = "foo.b13c2645-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"unwind::libunwind::_Unwind_Exception" = type { i64, void (i32, %"unwind::libunwind::_Unwind_Exception"*)*, [6 x i64] }
%"unwind::libunwind::_Unwind_Context" = type { [0 x i8] }

; Function Attrs: nonlazybind uwtable
define void @foo([4 x <4 x i64>]* noalias nocapture sret([4 x <4 x i64>]) dereferenceable(128) %res, [4 x <4 x i64>]* noalias nocapture readonly align 32 dereferenceable(128) %x, [4 x <4 x i64>]* noalias nocapture readonly align 32 dereferenceable(128) %y, [4 x <4 x i64>]* noalias nocapture readonly align 32 dereferenceable(128) %z, [0 x [6 x <4 x i64>]]* noalias nonnull readonly align 32 %bboxes.0, i64 %bboxes.1) unnamed_addr #0 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %res142 = bitcast [4 x <4 x i64>]* %res to i8*
  call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(128) %res142, i8 0, i64 128, i1 false)
  %0 = getelementptr inbounds [0 x [6 x <4 x i64>]], [0 x [6 x <4 x i64>]]* %bboxes.0, i64 0, i64 %bboxes.1, i64 0, i64 0
  %_12.i140 = icmp eq i64 %bboxes.1, 0
  br i1 %_12.i140, label %bb7, label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader"

"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader": ; preds = %start
  %1 = getelementptr [0 x [6 x <4 x i64>]], [0 x [6 x <4 x i64>]]* %bboxes.0, i64 0, i64 0, i64 0, i64 0
  %2 = bitcast [4 x <4 x i64>]* %x to <8 x i32>*
  %.phi.trans.insert143 = bitcast [4 x <4 x i64>]* %z to <8 x i32>*
  %_57135.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert143, align 32
  %.phi.trans.insert145 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %x, i64 0, i64 1
  %.phi.trans.insert146 = bitcast <4 x i64>* %.phi.trans.insert145 to <8 x i32>*
  %_24129.1.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert146, align 32
  %.phi.trans.insert147 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %y, i64 0, i64 1
  %.phi.trans.insert148 = bitcast <4 x i64>* %.phi.trans.insert147 to <8 x i32>*
  %_39132.1.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert148, align 32
  %.phi.trans.insert149 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %z, i64 0, i64 1
  %.phi.trans.insert150 = bitcast <4 x i64>* %.phi.trans.insert149 to <8 x i32>*
  %_57135.1.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert150, align 32
  %.phi.trans.insert152 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %x, i64 0, i64 2
  %.phi.trans.insert153 = bitcast <4 x i64>* %.phi.trans.insert152 to <8 x i32>*
  %_24129.2.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert153, align 32
  %.phi.trans.insert154 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %y, i64 0, i64 2
  %.phi.trans.insert155 = bitcast <4 x i64>* %.phi.trans.insert154 to <8 x i32>*
  %_39132.2.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert155, align 32
  %.phi.trans.insert156 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %z, i64 0, i64 2
  %.phi.trans.insert157 = bitcast <4 x i64>* %.phi.trans.insert156 to <8 x i32>*
  %_57135.2.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert157, align 32
  %.phi.trans.insert159 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %x, i64 0, i64 3
  %.phi.trans.insert160 = bitcast <4 x i64>* %.phi.trans.insert159 to <8 x i32>*
  %_24129.3.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert160, align 32
  %.phi.trans.insert161 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %y, i64 0, i64 3
  %.phi.trans.insert162 = bitcast <4 x i64>* %.phi.trans.insert161 to <8 x i32>*
  %_39132.3.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert162, align 32
  %.phi.trans.insert163 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %z, i64 0, i64 3
  %.phi.trans.insert164 = bitcast <4 x i64>* %.phi.trans.insert163 to <8 x i32>*
  %_57135.3.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert164, align 32
  %_24129 = load <8 x i32>, <8 x i32>* %2, align 32
  %3 = bitcast [4 x <4 x i64>]* %y to <8 x i32>*
  %_39132 = load <8 x i32>, <8 x i32>* %3, align 32
  %4 = bitcast [4 x <4 x i64>]* %res to <8 x i32>*
  %5 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %res, i64 0, i64 1
  %6 = bitcast <4 x i64>* %5 to <8 x i32>*
  %7 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %res, i64 0, i64 2
  %8 = bitcast <4 x i64>* %7 to <8 x i32>*
  %9 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %res, i64 0, i64 3
  %10 = bitcast <4 x i64>* %9 to <8 x i32>*
  %.promoted = load <8 x i32>, <8 x i32>* %10, align 32
  br label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"

"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit": ; preds = %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader", %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"
  %11 = phi <8 x i32> [ %70, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ %.promoted, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %_74138.2 = phi <8 x i32> [ %58, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ zeroinitializer, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %_74138.1 = phi <8 x i32> [ %46, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ zeroinitializer, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %_74138 = phi <8 x i32> [ %34, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ zeroinitializer, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %iter.sroa.0.0141 = phi i64* [ %71, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ %1, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %12 = bitcast i64* %iter.sroa.0.0141 to <8 x i32>*
  %_28130 = load <8 x i32>, <8 x i32>* %12, align 32
  %13 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 4
  %14 = bitcast i64* %13 to <8 x i32>*
  %_31131 = load <8 x i32>, <8 x i32>* %14, align 32
  %15 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 8
  %16 = bitcast i64* %15 to <8 x i32>*
  %_43133 = load <8 x i32>, <8 x i32>* %16, align 32
  %17 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 12
  %18 = bitcast i64* %17 to <8 x i32>*
  %_46134 = load <8 x i32>, <8 x i32>* %18, align 32
  %19 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 16
  %20 = bitcast i64* %19 to <8 x i32>*
  %_61136 = load <8 x i32>, <8 x i32>* %20, align 32
  %21 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 20
  %22 = bitcast i64* %21 to <8 x i32>*
  %_64137 = load <8 x i32>, <8 x i32>* %22, align 32
  %23 = icmp sgt <8 x i32> %_24129, %_28130
  %24 = icmp sgt <8 x i32> %_31131, %_24129
  %25 = icmp sgt <8 x i32> %_39132, %_43133
  %26 = icmp sgt <8 x i32> %_46134, %_39132
  %27 = icmp sgt <8 x i32> %_57135.pre, %_61136
  %28 = icmp sgt <8 x i32> %_64137, %_57135.pre
  %29 = and <8 x i1> %24, %23
  %30 = and <8 x i1> %29, %25
  %31 = and <8 x i1> %30, %26
  %32 = and <8 x i1> %31, %27
  %33 = and <8 x i1> %32, %28
  %34 = select <8 x i1> %33, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %_74138
  %35 = icmp sgt <8 x i32> %_24129.1.pre, %_28130
  %36 = icmp sgt <8 x i32> %_31131, %_24129.1.pre
  %37 = icmp sgt <8 x i32> %_39132.1.pre, %_43133
  %38 = icmp sgt <8 x i32> %_46134, %_39132.1.pre
  %39 = icmp sgt <8 x i32> %_57135.1.pre, %_61136
  %40 = icmp sgt <8 x i32> %_64137, %_57135.1.pre
  %41 = and <8 x i1> %36, %35
  %42 = and <8 x i1> %41, %37
  %43 = and <8 x i1> %42, %38
  %44 = and <8 x i1> %43, %39
  %45 = and <8 x i1> %44, %40
  %46 = select <8 x i1> %45, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %_74138.1
  %47 = icmp sgt <8 x i32> %_24129.2.pre, %_28130
  %48 = icmp sgt <8 x i32> %_31131, %_24129.2.pre
  %49 = icmp sgt <8 x i32> %_39132.2.pre, %_43133
  %50 = icmp sgt <8 x i32> %_46134, %_39132.2.pre
  %51 = icmp sgt <8 x i32> %_57135.2.pre, %_61136
  %52 = icmp sgt <8 x i32> %_64137, %_57135.2.pre
  %53 = and <8 x i1> %48, %47
  %54 = and <8 x i1> %53, %49
  %55 = and <8 x i1> %54, %50
  %56 = and <8 x i1> %55, %51
  %57 = and <8 x i1> %56, %52
  %58 = select <8 x i1> %57, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %_74138.2
  %59 = icmp sgt <8 x i32> %_24129.3.pre, %_28130
  %60 = icmp sgt <8 x i32> %_31131, %_24129.3.pre
  %61 = icmp sgt <8 x i32> %_39132.3.pre, %_43133
  %62 = icmp sgt <8 x i32> %_46134, %_39132.3.pre
  %63 = icmp sgt <8 x i32> %_57135.3.pre, %_61136
  %64 = icmp sgt <8 x i32> %_64137, %_57135.3.pre
  %65 = and <8 x i1> %60, %59
  %66 = and <8 x i1> %65, %61
  %67 = and <8 x i1> %66, %62
  %68 = and <8 x i1> %67, %63
  %69 = and <8 x i1> %68, %64
  %70 = select <8 x i1> %69, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %11
  %71 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 24
  %_12.i = icmp eq i64* %71, %0
  br i1 %_12.i, label %bb7.loopexit, label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"

bb7.loopexit:                                     ; preds = %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"
  store <8 x i32> %34, <8 x i32>* %4, align 32
  store <8 x i32> %46, <8 x i32>* %6, align 32
  store <8 x i32> %58, <8 x i32>* %8, align 32
  store <8 x i32> %70, <8 x i32>* %10, align 32
  br label %bb7

bb7:                                              ; preds = %bb7.loopexit, %start
  ret void
}

; Function Attrs: nonlazybind uwtable
declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #0

; Function Attrs: argmemonly nofree nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1

attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #1 = { argmemonly nofree nounwind willreturn writeonly }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 7, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}

Godbolt

UPD: Fixed the LLVM IR, the previous version was compiled without enabling AVX2.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

4 participants