x86 SIMD intrinsics are sometimes compiled with a weird choice of width

When compiling the following code, which uses SIMD intrinsics:
```
#include <immintrin.h>

// Minimal reproducer: builds an 8-lane, 32-bit-per-lane mask from three
// inputs and uses it to keep or zero the eight floats at `f`, in place.
//
// f: eight floats, read with an unaligned load and overwritten with the result.
// d: eight 32-bit ints, read with an unaligned load.
// b: eight bytes, read with a single 64-bit load.
//
// Lane i of `f` is kept when any of the following holds, and is replaced by
// 0.0f otherwise:
//   - the bit pattern of f[i] is all zero,
//   - d[i] > 0 (signed compare),
//   - byte b[i], zero-extended to 32 bits, is > 0.
void fn(float *f, int *d, bool *b) {
    auto v_zero = _mm256_set1_epi32(0);
    auto v_zero_f = _mm256_set1_ps(0.0f);

    // Compare the raw float bits against integer zero (an integer compare,
    // not a floating-point one).
    auto v = _mm256_loadu_ps(f);
    auto mask = _mm256_cmpeq_epi32(_mm256_castps_si256(v), v_zero);

    // OR in the lanes where the signed 32-bit input is greater than zero.
    auto v_s = _mm256_loadu_si256((const __m256i*)(d));
    mask = _mm256_or_si256(mask, _mm256_cmpgt_epi32(v_s, v_zero));

    // Widen the eight bytes to 32-bit lanes (zero extension) so that this
    // comparison is also done at 32-bit width, then OR it in.
    auto v_b = _mm256_cvtepu8_epi32(_mm_loadu_si64(b));
    mask = _mm256_or_si256(mask, _mm256_cmpgt_epi32(v_b, v_zero));

    // blendv picks the second operand (v) where the mask lane's sign bit is
    // set, and the first operand (zero) elsewhere.
    v = _mm256_blendv_ps(v_zero_f, v, _mm256_castsi256_ps(mask));
    _mm256_storeu_ps(f, v);
}
```
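Even though `mask` nominally holds 8 packed 32-bit values, and the only other element width involved is 8 bits (the `bool` input), Clang chooses to perform some operations on it as packed 16-bit values (note the `vpackssdw`, `vpmovsxbw`, and `vpmovsxwd` below):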
```
        vmovdqu ymm0, ymmword ptr [rdi]
        vpxor   xmm1, xmm1, xmm1
        vpcmpeqd        ymm2, ymm0, ymm1
        vmovdqu ymm3, ymmword ptr [rsi]
        vpcmpgtd        ymm1, ymm3, ymm1
        vpor    ymm1, ymm2, ymm1
        vextracti128    xmm2, ymm1, 1
        vpackssdw       xmm1, xmm1, xmm2
        vmovq   xmm2, qword ptr [rdx]
        vpxor   xmm3, xmm3, xmm3
        vpcmpeqb        xmm2, xmm2, xmm3
        vpcmpeqd        xmm3, xmm3, xmm3
        vpxor   xmm2, xmm2, xmm3
        vpmovsxbw       xmm2, xmm2
        vpor    xmm1, xmm1, xmm2
        vpmovsxwd       ymm1, xmm1
        vpand   ymm0, ymm1, ymm0
        vmovdqu ymmword ptr [rdi], ymm0
        vzeroupper
        ret
```
Compare with GCC, which keeps the expected 32-bit width throughout:
```
        vmovups ymm2, YMMWORD PTR [rdi]
        vmovdqu ymm0, YMMWORD PTR [rsi]
        vpxor   xmm1, xmm1, xmm1
        vpmovzxbd       ymm3, QWORD PTR [rdx]
        vpcmpeqd        ymm4, ymm2, ymm1
        vpcmpgtd        ymm0, ymm0, ymm1
        vpcmpgtd        ymm3, ymm3, ymm1
        vpor    ymm0, ymm0, ymm4
        vpor    ymm0, ymm0, ymm3
        vpcmpgtd        ymm1, ymm1, ymm0
        vandps  ymm2, ymm2, ymm1
        vmovups YMMWORD PTR [rdi], ymm2
        vzeroupper
        ret
```
If any one of the comparisons that are OR-ed together into `mask` is removed, the issue disappears.

Example: https://godbolt.org/z/6zEYresqM

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

x86 SIMD intrinsics are sometimes compiled with a weird choice of width #157382

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

x86 SIMD intrinsics are sometimes compiled with a weird choice of width #157382

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions