
Inefficient code for Nxi1 masked operations on non-avx512 target #53760

Closed · jhorstmann opened this issue Feb 11, 2022 · 15 comments
Assignees: RKSimon
Labels: backend:X86, confirmed (Verified by a second party)

Comments

@jhorstmann

I noticed the following code generation issue when trying Rust's new portable_simd library, using a Rust nightly build. The full example can be seen in the Godbolt compiler explorer.

The original Rust code does an addition of two i32 vectors; the second operand should be masked using a u8 bitmask.

#![feature(platform_intrinsics)]
#![feature(portable_simd)]
use std::simd::*;

extern "platform-intrinsic" {
    pub(crate) fn simd_select<M, T>(m: M, a: T, b: T) -> T;
    pub(crate) fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
}

pub fn from_bitmask_i32x8(bitmask: u8) -> i32x8 {
    unsafe {
        simd_select_bitmask(bitmask, i32x8::splat(-1), i32x8::splat(0))
    }
}

pub fn add_masked_i32x8(a: i32x8, b: i32x8, bitmask: u8) -> i32x8 {
    unsafe {
        a + (b & from_bitmask_i32x8(bitmask))
    }
}

This compiles to the following LLVM IR (using -C opt-level=3 --emit=llvm-ir):

define void @_ZN7example18from_bitmask_i32x817h2e814f0b6cf811aeE(<8 x i32>* noalias nocapture sret(<8 x i32>) dereferenceable(32) %0, i8 %bitmask) unnamed_addr #0 !dbg !6 {
  %1 = bitcast i8 %bitmask to <8 x i1>, !dbg !10
  %2 = sext <8 x i1> %1 to <8 x i32>, !dbg !10
  store <8 x i32> %2, <8 x i32>* %0, align 32, !dbg !10
  ret void, !dbg !11
}

define void @_ZN7example16add_masked_i32x817h5456e14a2952ad51E(<8 x i32>* noalias nocapture sret(<8 x i32>) dereferenceable(32) %0, <8 x i32>* noalias nocapture readonly dereferenceable(32) %a, <8 x i32>* noalias nocapture readonly dereferenceable(32) %b, i8 %bitmask) unnamed_addr #0 !dbg !12 {
  %_4 = load <8 x i32>, <8 x i32>* %a, align 32, !dbg !13
  %_6 = load <8 x i32>, <8 x i32>* %b, align 32, !dbg !14
  %1 = bitcast i8 %bitmask to <8 x i1>, !dbg !15
  %2 = select <8 x i1> %1, <8 x i32> %_6, <8 x i32> zeroinitializer, !dbg !17
  %3 = add <8 x i32> %2, %_4, !dbg !25
  store <8 x i32> %3, <8 x i32>* %0, align 32, !dbg !25, !alias.scope !29
  ret void, !dbg !32
}

With an AVX-512 capable target, the generated code looks good: the vector mask gets optimized away and replaced by a masked load using k registers.

example::add_masked_i32x8:
        mov     rax, rdi
        kmovd   k1, ecx
        vmovdqa32       ymm0 {k1} {z}, ymmword ptr [rdx]
        vpaddd  ymm0, ymm0, ymmword ptr [rsi]
        vmovdqa ymmword ptr [rdi], ymm0
        vzeroupper
        ret

With a non-AVX-512 target, generating the vector mask on its own is optimized nicely: the bitmask is broadcast, ANDed with a constant holding each lane's bit value, and compared for equality against that constant.

.LCPI0_0:
        .long   1
        .long   2
        .long   4
        .long   8
        .long   16
        .long   32
        .long   64
        .long   128
example::from_bitmask_i32x8:
        mov     rax, rdi
        vmovd   xmm0, esi
        vpbroadcastb    ymm0, xmm0
        vmovdqa ymm1, ymmword ptr [rip + .LCPI0_0]
        vpand   ymm0, ymm0, ymm1
        vpcmpeqd        ymm0, ymm0, ymm1
        vmovdqa ymmword ptr [rdi], ymm0
        vzeroupper
        ret
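
For illustration, here is a rough Rust sketch of what this lowering computes, written with portable_simd. The function name is mine, the broadcast is modelled as an i32 splat (only the low byte of each lane participates in the AND), and method names such as lanes_eq may differ between portable_simd nightlies.

#![feature(portable_simd)]
use std::simd::*;

// Sketch: expand a u8 bitmask into an i32x8 lane mask by splatting the mask
// value, AND-ing with the per-lane bit values and comparing for equality, so
// lanes whose bit is set become -1 (all ones) and the others become 0.
pub fn from_bitmask_i32x8_broadcast(bitmask: u8) -> i32x8 {
    let bits = i32x8::from_array([1, 2, 4, 8, 16, 32, 64, 128]);
    let splat = i32x8::splat(bitmask as i32);
    (splat & bits).lanes_eq(bits).to_int()
}

The vpbroadcastb / vpand / vpcmpeqd sequence above is exactly this computation, with the mask byte replicated into every byte of the vector.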

The masked addition should then be able to just reuse the same code and blend using the generated vector mask. Instead it generates quite inefficient code that tests each bit in the bitmask individually and inserts the results into a vector register lane by lane:

example::add_masked_i32x8:
        push    rax
        mov     rax, rdi
        mov     edi, ecx
        shr     dil, 5
        movzx   r9d, dil
        and     r9d, 1
        neg     r9d
        mov     r8d, ecx
        shr     r8b, 4
        movzx   edi, r8b
        and     edi, 1
        neg     edi
        vmovd   xmm0, edi
        vpinsrd xmm0, xmm0, r9d, 1
        mov     edi, ecx
        shr     dil, 6
        movzx   edi, dil
        and     edi, 1
        neg     edi
        vpinsrd xmm0, xmm0, edi, 2
        mov     edi, ecx
        shr     dil, 7
        movzx   edi, dil
        neg     edi
        vpinsrd xmm0, xmm0, edi, 3
        mov     edi, ecx
        and     edi, 1
        neg     edi
        vmovd   xmm1, edi
        mov     edi, ecx
        shr     dil
        movzx   edi, dil
        and     edi, 1
        neg     edi
        vpinsrd xmm1, xmm1, edi, 1
        mov     edi, ecx
        shr     dil, 2
        movzx   edi, dil
        and     edi, 1
        neg     edi
        vpinsrd xmm1, xmm1, edi, 2
        shr     cl, 3
        movzx   ecx, cl
        and     ecx, 1
        neg     ecx
        vpinsrd xmm1, xmm1, ecx, 3
        vinserti128     ymm0, ymm1, xmm0, 1
        vpand   ymm0, ymm0, ymmword ptr [rdx]
        vpaddd  ymm0, ymm0, ymmword ptr [rsi]
        vmovdqa ymmword ptr [rax], ymm0
        pop     rcx
        vzeroupper
        ret
@jhorstmann (Author)

This seems to be happening in an optimization pass that transforms the bitcast + sext pattern into bitcast + select, as can be seen in a standalone example in Compiler Explorer.

This LLVM IR generates optimized assembly using llc --mcpu=skylake -O3:

define void @add_masked_i32x8_bitcast_sext(<8 x i32>* noalias nocapture sret(<8 x i32>) dereferenceable(32) %0, <8 x i32>* noalias nocapture readonly dereferenceable(32) %a, <8 x i32>* noalias nocapture readonly dereferenceable(32) %b, i8 %bitmask) unnamed_addr #0 {
  %_6 = load <8 x i32>, <8 x i32>* %a, align 32
  %_7 = load <8 x i32>, <8 x i32>* %b, align 32
  %2 = bitcast i8 %bitmask to <8 x i1>
  %3 = sext <8 x i1> %2 to <8 x i32>
  %4 = and <8 x i32> %3, %_7
  %5 = add <8 x i32> %4, %_6

  store <8 x i32> %5, <8 x i32>* %0, align 32
  ret void
}

Optimizing using opt --mcpu=skylake -O3 turns it into the same bitcast + select as above, which again leads to suboptimal code using vector inserts.

@llvmbot (Collaborator) commented Feb 11, 2022

@llvm/issue-subscribers-backend-x86

@RKSimon (Collaborator) commented Feb 11, 2022

The optimized code passed to the backend is where it all falls apart: https://godbolt.org/z/zGaEK77b8

define void @add_masked_i32x8_bitcast_sext_opt(<8 x i32>* noalias nocapture writeonly sret(<8 x i32>) dereferenceable(32) %0, <8 x i32>* noalias nocapture readonly dereferenceable(32) %a, <8 x i32>* noalias nocapture readonly dereferenceable(32) %b, i8 %bitmask) unnamed_addr #1 {
  %_6 = load <8 x i32>, <8 x i32>* %a, align 32
  %_7 = load <8 x i32>, <8 x i32>* %b, align 32
  %2 = bitcast i8 %bitmask to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %_7, <8 x i32> zeroinitializer
  %4 = add <8 x i32> %3, %_6
  store <8 x i32> %4, <8 x i32>* %0, align 32
  ret void
}

RKSimon self-assigned this Feb 11, 2022
RKSimon added a commit that referenced this issue Feb 11, 2022
Replace the *_EXTEND node with the raw operands, this will make it easier to use combineToExtendBoolVectorInReg for any boolvec extension combine.

Cleanup prep for Issue #53760
RKSimon added a commit that referenced this issue Feb 11, 2022
… NFC.

Avoid the need for a forward declaration.

Cleanup prep for Issue #53760
RKSimon added a commit that referenced this issue Feb 11, 2022
…t(vXi1),A,B)

For pre-AVX512 targets, attempt to sign-extend a vXi1 condition mask to pass to a X86ISD::BLENDV node

Fixes Issue #53760
RKSimon added the confirmed (Verified by a second party) label Feb 11, 2022
@RKSimon (Collaborator) commented Feb 17, 2022

@jhorstmann rust.godbolt.org seems to be taking forever to update the nightly build with my fix - can you confirm if it worked at all?

@jhorstmann (Author)

@RKSimon Thanks for the fast solution! I think I can already see the difference in the last godbolt link you posted (https://godbolt.org/z/zGaEK77b8) when comparing 13.0 and trunk.

I'm not familiar yet with the LLVM dev process; should I just close the ticket now? Will the fix become part of release 14 or of the next release after that?

@RKSimon (Collaborator) commented Feb 17, 2022

@jhorstmann Thanks for the confirmation! If you have a local trunk build of rust in which you can see the fix, then please close this. Otherwise, let's leave it open for now to see if the nightly godbolt build catches up, just in case I've missed something.

@jhorstmann (Author)

@RKSimon I managed to build a rustc with your 3 commits included and can confirm this fixes the issue. My benchmark, which sums 16k f64 numbers whose corresponding bit in a vector of u64 is set, got a speedup of more than 2x on an i7-10510U. I think another small improvement might come from #53791.

AFAIK rust is currently tracking the release/14.x branch, so this might take a while to show up in godbolt.
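
For illustration only, a minimal sketch of the kind of kernel described above (not the actual benchmark code), reusing the simd_select_bitmask intrinsic declaration from the example at the top of the issue and assuming the early-2022 portable_simd API, where the f64 reduction was still called horizontal_sum:

#![feature(platform_intrinsics)]
#![feature(portable_simd)]
use std::simd::*;

extern "platform-intrinsic" {
    fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
}

// Sketch: sum the f64 values whose corresponding bit in a slice of u64 mask
// words is set, 8 lanes at a time using an 8-bit sub-mask per iteration.
pub fn masked_sum(values: &[f64], masks: &[u64]) -> f64 {
    let mut acc = f64x8::splat(0.0);
    for (chunk, word) in values.chunks_exact(64).zip(masks) {
        for (i, lanes) in chunk.chunks_exact(8).enumerate() {
            let bits = ((*word >> (i * 8)) & 0xFF) as u8;
            // keep the selected lanes, zero the rest, then accumulate
            acc += unsafe {
                simd_select_bitmask(bits, f64x8::from_slice(lanes), f64x8::splat(0.0))
            };
        }
    }
    acc.horizontal_sum()
}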

@lukaslihotzki

When replacing all i32 with i8 in the latest godbolt example (https://godbolt.org/z/zesxz6qh4), each bit is extracted individually again, although the same optimization could be applied. The same problem occurs when replacing with i16.
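
For concreteness, a Rust analogue of the i8 variant described above (the godbolt link itself contains LLVM IR); this sketch mirrors the original example at the top of the issue and assumes the same feature gates and platform-intrinsic declarations:

use std::simd::*;

// Sketch: the same masked-add pattern as the original i32 example, but with
// i8 lanes; each of the 8 bits in the bitmask selects one i8 lane.
pub fn add_masked_i8x8(a: i8x8, b: i8x8, bitmask: u8) -> i8x8 {
    unsafe {
        a + (b & simd_select_bitmask(bitmask, i8x8::splat(-1), i8x8::splat(0)))
    }
}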

RKSimon reopened this Mar 14, 2022
@RKSimon (Collaborator) commented Mar 14, 2022

OK, I'll try to find a more general solution to this

@RKSimon (Collaborator) commented Mar 15, 2022

@lukaslihotzki please can you test against the latest trunk?

@lukaslihotzki

@RKSimon I don't have LLVM compilation set up locally. Can I use llc from https://alive2.llvm.org/ce/ for testing? How recent is the llc there?

This issue affects all non-x86 platforms I have tested (https://godbolt.org/z/n6d1fhE7d):

  • --march=arm --mattr=neon: 7 repetitions of and plus vmov.8
  • --march=aarch64 --mattr=sve2: 8 sbfx instructions, each extracting a single bit field
  • --march=ppc32 --mattr=altivec: 7 repetitions of rlwinm (rotate left then AND)
  • --march=wasm32 --mattr=simd128: 7 repetitions of i32.shr_u, i32.and and i16x8.replace_lane (inserts 16-bit values, then i8x16.shuffle to narrow to i8)
  • --march=riscv32 --mattr=v: 8 branches, no vectorization at all

@RKSimon (Collaborator) commented Mar 15, 2022

If you wait 24 hours, Compiler Explorer usually catches up to trunk. The fix is only for x86 - it's unlikely that we can generalize it to work on other targets.

@lukaslihotzki

The faster-than-scalar solution that can be generalized is: broadcast, bitwise AND, then compare with [1,2,4,8,16,32,64,128]. Currently, LLVM trunk uses this approach for --mcpu=haswell, but it could use the same approach for every architecture (at least for neon, sve, and wasm32+simd128).

Can this general approach be implemented for LLVM in a target-independent way, still allowing targets to use better solutions like AVX-512 mask registers or SVE predicate registers? If not, would it be helpful to create an issue for every target? Alternatively, should Rust portable_simd use this approach directly instead of sext <8 x i1>… for non-x86 platforms to fix rust-lang/portable-simd#264?

jhorstmann pushed a commit to jhorstmann/llvm-project that referenced this issue Mar 15, 2022
Replace the *_EXTEND node with the raw operands, this will make it easier to use combineToExtendBoolVectorInReg for any boolvec extension combine.

Cleanup prep for Issue llvm#53760
jhorstmann pushed a commit to jhorstmann/llvm-project that referenced this issue Mar 15, 2022
… NFC.

Avoid the need for a forward declaration.

Cleanup prep for Issue llvm#53760
jhorstmann pushed a commit to jhorstmann/llvm-project that referenced this issue Mar 15, 2022
…t(vXi1),A,B)

For pre-AVX512 targets, attempt to sign-extend a vXi1 condition mask to pass to a X86ISD::BLENDV node

Fixes Issue llvm#53760
jhorstmann pushed a commit to jhorstmann/llvm-project that referenced this issue Mar 15, 2022
…neToExtendBoolVectorInReg before legalization

This replaces the attempt in 20af71f to use combineToExtendBoolVectorInReg to create X86ISD::BLENDV masks directly, instead we use it to canonicalize the iX bitcast to a sign-extended mask and then truncate it back to vXi1 prior to legalization breaking it apart.

Fixes llvm#53760
@jhorstmann (Author)

I can confirm that mask generation now uses the optimized path for mask8x8, mask8x16, mask16x8 and mask16x16, using a local build of rust with llvm commit f591231 cherry-picked (because rust is currently on llvm 14). Thanks @RKSimon!

For reference, my benchmark uses portable_simd; I checked the generated code with perf record / perf report.

@RKSimon (Collaborator) commented Mar 16, 2022

Can this general approach be implemented for LLVM in a target-independent way, still allowing targets to use better solutions like AVX-512 mask registers or SVE predicate registers? If not, would it be helpful to create an issue for every target? Alternatively, should Rust portable_simd use this approach directly instead of sext <8 x i1>… for non-x86 platforms to fix rust-lang/portable-simd#264?

I'll see if I can move any of the X86 implementation into TargetLowering so other targets can reuse it.
