Unnecessary stack usage #53348

newpavlov · 2022-01-22T01:20:26Z

Initially reported in rust-lang/rust#88930

I have the following simple SIMD-powered function which tests whether points are inside one of bounding boxes written in Rust (should be trivial to translate it to a different language):

pub unsafe fn foo(
    x: &[__m256i; N],
    y: &[__m256i; N],
    z: &[__m256i; N],
    bboxes: &[[__m256i; 6]],
) -> [__m256i; N] {
    let mut res = [_mm256_setzero_si256(); N];
    for bbox in bboxes {
        for i in 0..N {
            let tx = _mm256_and_si256(
                _mm256_cmpgt_epi32(x[i], bbox[0]),
                _mm256_cmpgt_epi32(bbox[1], x[i]),
            );
            let ty = _mm256_and_si256(
                _mm256_cmpgt_epi32(y[i], bbox[2]),
                _mm256_cmpgt_epi32(bbox[3], y[i]),
            );
            let t = _mm256_and_si256(tx, ty);
            let tz = _mm256_and_si256(
                _mm256_cmpgt_epi32(z[i], bbox[4]),
                _mm256_cmpgt_epi32(bbox[5], z[i]),
            );
            let t = _mm256_and_si256(t, tz);
            res[i] = _mm256_or_si256(res[i], t);
        }
    }
    res
}

By inspecting the generated assembly we can see that for some reason it caches coordinates to stack and reads them from it each iteration instead of using the input pointers. The same behavior can be observed for a function which processes coordinate slices. This caching looks quite redundant to me, especially considering that noalias is enabled (i.e. compiler should know that memory at which coordinates are stored can not change during function execution).

It looks like LLVM correctly moves coordinate loads from the inner loop using its infinite virtual registers. And it's exactly the behavior we want when there is enough physical registers. But when it's not true, it spills virtual register values to stack instead of relying on the original locations.

Looks like this issue also affects SHA-2 implementation on RISC-V targets.

The text was updated successfully, but these errors were encountered:

llvmbot · 2022-01-22T01:23:25Z

@llvm/issue-subscribers-backend-risc-v

llvmbot · 2022-01-22T01:23:28Z

@llvm/issue-subscribers-backend-x86

asl · 2022-01-22T08:13:50Z

Will you please attach LLVM IR reproducer?

newpavlov · 2022-01-22T08:23:59Z

Will this do?

Click to expand

; ModuleID = 'foo.b13c2645-cgu.0'
source_filename = "foo.b13c2645-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%"unwind::libunwind::_Unwind_Exception" = type { i64, void (i32, %"unwind::libunwind::_Unwind_Exception"*)*, [6 x i64] }
%"unwind::libunwind::_Unwind_Context" = type { [0 x i8] }

; Function Attrs: nonlazybind uwtable
define void @foo([4 x <4 x i64>]* noalias nocapture sret([4 x <4 x i64>]) dereferenceable(128) %res, [4 x <4 x i64>]* noalias nocapture readonly align 32 dereferenceable(128) %x, [4 x <4 x i64>]* noalias nocapture readonly align 32 dereferenceable(128) %y, [4 x <4 x i64>]* noalias nocapture readonly align 32 dereferenceable(128) %z, [0 x [6 x <4 x i64>]]* noalias nonnull readonly align 32 %bboxes.0, i64 %bboxes.1) unnamed_addr #0 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
  %res142 = bitcast [4 x <4 x i64>]* %res to i8*
  call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(128) %res142, i8 0, i64 128, i1 false)
  %0 = getelementptr inbounds [0 x [6 x <4 x i64>]], [0 x [6 x <4 x i64>]]* %bboxes.0, i64 0, i64 %bboxes.1, i64 0, i64 0
  %_12.i140 = icmp eq i64 %bboxes.1, 0
  br i1 %_12.i140, label %bb7, label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader"

"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader": ; preds = %start
  %1 = getelementptr [0 x [6 x <4 x i64>]], [0 x [6 x <4 x i64>]]* %bboxes.0, i64 0, i64 0, i64 0, i64 0
  %2 = bitcast [4 x <4 x i64>]* %x to <8 x i32>*
  %.phi.trans.insert143 = bitcast [4 x <4 x i64>]* %z to <8 x i32>*
  %_57135.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert143, align 32
  %.phi.trans.insert145 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %x, i64 0, i64 1
  %.phi.trans.insert146 = bitcast <4 x i64>* %.phi.trans.insert145 to <8 x i32>*
  %_24129.1.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert146, align 32
  %.phi.trans.insert147 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %y, i64 0, i64 1
  %.phi.trans.insert148 = bitcast <4 x i64>* %.phi.trans.insert147 to <8 x i32>*
  %_39132.1.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert148, align 32
  %.phi.trans.insert149 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %z, i64 0, i64 1
  %.phi.trans.insert150 = bitcast <4 x i64>* %.phi.trans.insert149 to <8 x i32>*
  %_57135.1.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert150, align 32
  %.phi.trans.insert152 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %x, i64 0, i64 2
  %.phi.trans.insert153 = bitcast <4 x i64>* %.phi.trans.insert152 to <8 x i32>*
  %_24129.2.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert153, align 32
  %.phi.trans.insert154 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %y, i64 0, i64 2
  %.phi.trans.insert155 = bitcast <4 x i64>* %.phi.trans.insert154 to <8 x i32>*
  %_39132.2.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert155, align 32
  %.phi.trans.insert156 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %z, i64 0, i64 2
  %.phi.trans.insert157 = bitcast <4 x i64>* %.phi.trans.insert156 to <8 x i32>*
  %_57135.2.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert157, align 32
  %.phi.trans.insert159 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %x, i64 0, i64 3
  %.phi.trans.insert160 = bitcast <4 x i64>* %.phi.trans.insert159 to <8 x i32>*
  %_24129.3.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert160, align 32
  %.phi.trans.insert161 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %y, i64 0, i64 3
  %.phi.trans.insert162 = bitcast <4 x i64>* %.phi.trans.insert161 to <8 x i32>*
  %_39132.3.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert162, align 32
  %.phi.trans.insert163 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %z, i64 0, i64 3
  %.phi.trans.insert164 = bitcast <4 x i64>* %.phi.trans.insert163 to <8 x i32>*
  %_57135.3.pre = load <8 x i32>, <8 x i32>* %.phi.trans.insert164, align 32
  %_24129 = load <8 x i32>, <8 x i32>* %2, align 32
  %3 = bitcast [4 x <4 x i64>]* %y to <8 x i32>*
  %_39132 = load <8 x i32>, <8 x i32>* %3, align 32
  %4 = bitcast [4 x <4 x i64>]* %res to <8 x i32>*
  %5 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %res, i64 0, i64 1
  %6 = bitcast <4 x i64>* %5 to <8 x i32>*
  %7 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %res, i64 0, i64 2
  %8 = bitcast <4 x i64>* %7 to <8 x i32>*
  %9 = getelementptr inbounds [4 x <4 x i64>], [4 x <4 x i64>]* %res, i64 0, i64 3
  %10 = bitcast <4 x i64>* %9 to <8 x i32>*
  %.promoted = load <8 x i32>, <8 x i32>* %10, align 32
  br label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"

"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit": ; preds = %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader", %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"
  %11 = phi <8 x i32> [ %70, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ %.promoted, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %_74138.2 = phi <8 x i32> [ %58, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ zeroinitializer, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %_74138.1 = phi <8 x i32> [ %46, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ zeroinitializer, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %_74138 = phi <8 x i32> [ %34, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ zeroinitializer, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %iter.sroa.0.0141 = phi i64* [ %71, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit" ], [ %1, %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit.preheader" ]
  %12 = bitcast i64* %iter.sroa.0.0141 to <8 x i32>*
  %_28130 = load <8 x i32>, <8 x i32>* %12, align 32
  %13 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 4
  %14 = bitcast i64* %13 to <8 x i32>*
  %_31131 = load <8 x i32>, <8 x i32>* %14, align 32
  %15 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 8
  %16 = bitcast i64* %15 to <8 x i32>*
  %_43133 = load <8 x i32>, <8 x i32>* %16, align 32
  %17 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 12
  %18 = bitcast i64* %17 to <8 x i32>*
  %_46134 = load <8 x i32>, <8 x i32>* %18, align 32
  %19 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 16
  %20 = bitcast i64* %19 to <8 x i32>*
  %_61136 = load <8 x i32>, <8 x i32>* %20, align 32
  %21 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 20
  %22 = bitcast i64* %21 to <8 x i32>*
  %_64137 = load <8 x i32>, <8 x i32>* %22, align 32
  %23 = icmp sgt <8 x i32> %_24129, %_28130
  %24 = icmp sgt <8 x i32> %_31131, %_24129
  %25 = icmp sgt <8 x i32> %_39132, %_43133
  %26 = icmp sgt <8 x i32> %_46134, %_39132
  %27 = icmp sgt <8 x i32> %_57135.pre, %_61136
  %28 = icmp sgt <8 x i32> %_64137, %_57135.pre
  %29 = and <8 x i1> %24, %23
  %30 = and <8 x i1> %29, %25
  %31 = and <8 x i1> %30, %26
  %32 = and <8 x i1> %31, %27
  %33 = and <8 x i1> %32, %28
  %34 = select <8 x i1> %33, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %_74138
  %35 = icmp sgt <8 x i32> %_24129.1.pre, %_28130
  %36 = icmp sgt <8 x i32> %_31131, %_24129.1.pre
  %37 = icmp sgt <8 x i32> %_39132.1.pre, %_43133
  %38 = icmp sgt <8 x i32> %_46134, %_39132.1.pre
  %39 = icmp sgt <8 x i32> %_57135.1.pre, %_61136
  %40 = icmp sgt <8 x i32> %_64137, %_57135.1.pre
  %41 = and <8 x i1> %36, %35
  %42 = and <8 x i1> %41, %37
  %43 = and <8 x i1> %42, %38
  %44 = and <8 x i1> %43, %39
  %45 = and <8 x i1> %44, %40
  %46 = select <8 x i1> %45, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %_74138.1
  %47 = icmp sgt <8 x i32> %_24129.2.pre, %_28130
  %48 = icmp sgt <8 x i32> %_31131, %_24129.2.pre
  %49 = icmp sgt <8 x i32> %_39132.2.pre, %_43133
  %50 = icmp sgt <8 x i32> %_46134, %_39132.2.pre
  %51 = icmp sgt <8 x i32> %_57135.2.pre, %_61136
  %52 = icmp sgt <8 x i32> %_64137, %_57135.2.pre
  %53 = and <8 x i1> %48, %47
  %54 = and <8 x i1> %53, %49
  %55 = and <8 x i1> %54, %50
  %56 = and <8 x i1> %55, %51
  %57 = and <8 x i1> %56, %52
  %58 = select <8 x i1> %57, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %_74138.2
  %59 = icmp sgt <8 x i32> %_24129.3.pre, %_28130
  %60 = icmp sgt <8 x i32> %_31131, %_24129.3.pre
  %61 = icmp sgt <8 x i32> %_39132.3.pre, %_43133
  %62 = icmp sgt <8 x i32> %_46134, %_39132.3.pre
  %63 = icmp sgt <8 x i32> %_57135.3.pre, %_61136
  %64 = icmp sgt <8 x i32> %_64137, %_57135.3.pre
  %65 = and <8 x i1> %60, %59
  %66 = and <8 x i1> %65, %61
  %67 = and <8 x i1> %66, %62
  %68 = and <8 x i1> %67, %63
  %69 = and <8 x i1> %68, %64
  %70 = select <8 x i1> %69, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> %11
  %71 = getelementptr inbounds i64, i64* %iter.sroa.0.0141, i64 24
  %_12.i = icmp eq i64* %71, %0
  br i1 %_12.i, label %bb7.loopexit, label %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"

bb7.loopexit:                                     ; preds = %"_ZN91_$LT$core..slice..iter..Iter$LT$T$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$4next17h843b76fd6aec3a2fE.exit"
  store <8 x i32> %34, <8 x i32>* %4, align 32
  store <8 x i32> %46, <8 x i32>* %6, align 32
  store <8 x i32> %58, <8 x i32>* %8, align 32
  store <8 x i32> %70, <8 x i32>* %10, align 32
  br label %bb7

bb7:                                              ; preds = %bb7.loopexit, %start
  ret void
}

; Function Attrs: nonlazybind uwtable
declare i32 @rust_eh_personality(i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*) unnamed_addr #0

; Function Attrs: argmemonly nofree nounwind willreturn writeonly
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1

attributes #0 = { nonlazybind uwtable "probe-stack"="__rust_probestack" "target-cpu"="x86-64" }
attributes #1 = { argmemonly nofree nounwind willreturn writeonly }

!llvm.module.flags = !{!0, !1}

!0 = !{i32 7, !"PIC Level", i32 2}
!1 = !{i32 2, !"RtLibUseGOT", i32 1}

Godbolt

UPD: Fixed the LLVM IR, the previous version was compiled without enabling AVX2.

github-actions bot added the new issue label Jan 22, 2022

EugeneZelenko added backend:RISC-V backend:X86 and removed new issue labels Jan 22, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Unnecessary stack usage #53348

Unnecessary stack usage #53348

newpavlov commented Jan 22, 2022

llvmbot commented Jan 22, 2022

llvmbot commented Jan 22, 2022

asl commented Jan 22, 2022

newpavlov commented Jan 22, 2022 •

edited

Loading

Unnecessary stack usage #53348

Unnecessary stack usage #53348

Comments

newpavlov commented Jan 22, 2022

llvmbot commented Jan 22, 2022

llvmbot commented Jan 22, 2022

asl commented Jan 22, 2022

newpavlov commented Jan 22, 2022 • edited Loading

newpavlov commented Jan 22, 2022 •

edited

Loading