
[AArch64] neon big endian miscompiled #65884

Closed
hstk30 opened this issue Sep 10, 2023 · 13 comments · Fixed by #68673
hstk30 commented Sep 10, 2023

https://godbolt.org/z/vWMz5K34r

#include <arm_neon.h>

extern void abort (void);

__attribute__((noinline)) uint8x16_t
wrap_vld1q_lane_u8 (const uint8_t *load, uint8x16_t vec) {
  return vld1q_lane_u8 (load, vec, 12);
}

int test_vld1q_lane_u8(const uint8_t *data) {
  uint8_t out[16];
  uint8_t overwrite = 7;
  int j;
  uint8x16_t in = vld1q_u8 (data);
  in = wrap_vld1q_lane_u8 (&overwrite, in);
  vst1q_u8 (out, in);
  for (j = 0; j < 13; j++)
    if (out[j] != (j == 12 ? overwrite : data[j])) {
      abort();
    }
  return 0;
}

int main (int argc, char **argv)
{
  uint64_t orig_data[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
  test_vld1q_lane_u8((const uint8_t *)orig_data);
  return 0;
}

This code fails when compiled with -O3 -fno-inline.

I have read https://llvm.org/docs/BigEndianNEON.html, but I am still confused about this asm:

        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8

This sequence seems to do nothing, yet it appears many times.

Can anyone give me a clue about this failure, or at least help narrow the problem down?

I ran opt-bisect on it; the failure appears with SLPVectorizerPass, but I suspect that pass merely exposes the problem and is not the root cause.


hstk30 commented Sep 11, 2023

A strange observation: with the lane index in the range 0-11 the code works, but with lane index 12 or greater it fails.


llvmbot commented Sep 11, 2023

@llvm/issue-subscribers-backend-aarch64


hstk30 commented Sep 11, 2023

I stepped through it in gdb; below is the annotated disassembly.

test_vld1q_lane_u8:                     // @test_vld1q_lane_u8
// %bb.0:                               // %entry
        sub     sp, sp, #48
        ld1     { v0.16b }, [x0]
  //  v0 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce}
        stp     x29, x30, [sp, #32]             // 16-byte Folded Spill
        add     x29, sp, #32
        mov     w8, #7
        sub     x0, x29, #4
        sturb   w8, [x29, #-4]
        str     q0, [sp]                        // 16-byte Folded Spill
        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        rev64   v0.16b, v0.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        bl      wrap_vld1q_lane_u8
        rev64   v0.16b, v0.16b
  // v0 = {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13}
        adrp    x8, .LCPI1_0
        add     x8, x8, :lo12:.LCPI1_0
        ldr     q5, [sp]                        // 16-byte Folded Reload
  // v5 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce}
        ext     v0.16b, v0.16b, v0.16b, #8
  // v0 = {0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13, 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12}
        ld1     { v2.8b }, [x8]         // v2: [6, 7, 5, 4, 3, 2, 1, 0, ...]
        adrp    x8, .LCPI1_1
        add     x8, x8, :lo12:.LCPI1_1
        mov     v1.16b, v5.16b          // v1 = v5
        mov     v1.d[1], v5.d[0]
  // v1 = {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef, 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef}
        ext     v3.16b, v0.16b, v0.16b, #8  // v3 <- v0
  // v3 = {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xce, 0x8a, 0x46, 0x7, 0xdf, 0x9b, 0x57, 0x13}
        ld1     { v4.8b }, [x8]         // v4: [1, 0, 2, 3, 4, 5, 6, 7, ...]
        umov    w8, v0.b[7]
        umov    w9, v0.b[6]
        ext     v5.16b, v5.16b, v5.16b, #8
  // v5 = {0x13, 0x57, 0x9b, 0xdf, 0x2, 0x46, 0x8a, 0xce, 0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef}
        mov     v3.d[1], v3.d[0]
  // v3 =  {0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12, 0xef, 0xcd, 0xab, 0x90, 0x78, 0x56, 0x34, 0x12}
        fmov    s6, w8                  // v6 handle [8-11]
        tbl     v1.8b, { v1.16b }, v4.8b  // v1 <- v1[v4]
        umov    w8, v0.b[5]
        tbl     v2.8b, { v3.16b }, v2.8b  // v2 <- v3[v2]
        zip1 v3.8b, v0.8b, v5.8b
  // v3 =  {0x13, 0xce, 0x57, 0x8a, 0x9b, 0x46, 0xdf, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
        mov     v6.h[1], w9
        umov    w9, v0.b[4]
        cmeq    v1.8b, v2.8b, v1.8b
  // {0x12, 0x34, 0x56, 0x78, 0x90, 0xab, 0xcd, 0xef} 0-7 done
        rev16   v2.8b, v3.8b            // v2 <- v3
  // v2 = {0xce13, 0x8a57, 0x469b, 0x7df, 0x0, 0x0, 0x0, 0x0}
        mov     v6.h[2], w8
        umov    w8, v1.b[0]
        umov    w10, v1.b[1]
        umov    w11, v1.b[2]
        umov    w12, v1.b[3]
        umov    w13, v1.b[4]
        mov     v6.h[3], w9
        bic     v2.4h, #255, lsl #8
  // v2 = {0xce00, 0x8a00, 0x4600, 0x700, 0x0, 0x0, 0x0, 0x0}
        and     w8, w8, #0x1
        and     w9, w10, #0x1
        umov    w10, v1.b[5]
        lsl     w9, w9, #6
        and     w11, w11, #0x1
        bfi     w9, w8, #7, #1
        umov    w8, v1.b[6]
        and     w12, w12, #0x1
        bic     v6.4h, #255, lsl #8
        and     w13, w13, #0x1
  // v6 = {0x1300, 0x5700, 0x9b00, 0xdf00, 0x0, 0x0, 0x0, 0x0}
        cmeq    v2.4h, v6.4h, v2.4h

Focus on the cmeq comparing v2 with v6, which handles indices [8-11]. v2 is

{0xce00, 0x8a00, 0x4600, 0x700, 0x0, 0x0, 0x0, 0x0}

and the v6 is

{0x1300, 0x5700, 0x9b00, 0xdf00, 0x0, 0x0, 0x0, 0x0}

The mismatch is caused by zip1 v3.8b, v0.8b, v5.8b or by rev16 v2.8b, v3.8b.

If I change zip1 v3.8b, v0.8b, v5.8b to zip1 v3.8b, v5.8b, v0.8b, it works.
Alternatively, changing rev16 v2.8b, v3.8b to mov v2.8b, v3.8b also works.

bzEq changed the title from 【AArch64】neon big endian miscompiled to [AArch64] neon big endian miscompiled on Sep 11, 2023

hstk30 commented Sep 12, 2023

Hi, do you have any idea about this problem? @davemgreen


hstk30 commented Sep 22, 2023

I compared the opt pipelines (https://godbolt.org/z/TGToW3jKM) and suspect the error is introduced when lowering the shufflevector <4 x i32> operations; the SLP + InstCombine logic itself looks correct.


hstk30-hw commented Sep 23, 2023

I found a tiny difference: if I let the compare range run from 8 to 13 (4+1), https://godbolt.org/z/sxcj49Goq , it works fine, and the shufflevector IR is

  %1 = shufflevector <16 x i8> %call, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 5, i32 4>
  %2 = shufflevector <16 x i8> %0, <16 x i8> undef, <4 x i32> <i32 9, i32 8, i32 10, i32 11>

With the compare range from 0 to 13 (8+4+1), the <4 x i32> shufflevector IR is

  %4 = shufflevector <16 x i8> %call, <16 x i8> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
  %5 = shufflevector <16 x i8> %0, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>

The shuffle masks differ, and that difference leads to the error.

@hstk30-hw

define <4 x i1> @work(<16 x i8>* %A, <16 x i8>* %B) nounwind {
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %tmp2 = load <16 x i8>, <16 x i8>* %B
   %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 5, i32 4>
   %tmp4 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <4 x i32> <i32 9, i32 8, i32 10, i32 11>
   %tmp5 = icmp eq <4 x i8> %tmp3, %tmp4
   %tmp6 = freeze <4 x i1> %tmp5
   ret <4 x i1> %tmp6
 }

 define <4 x i1> @fail(<16 x i8>* %A, <16 x i8>* %B) nounwind {
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %tmp2 = load <16 x i8>, <16 x i8>* %B
   %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
   %tmp4 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
   %tmp5 = icmp eq <4 x i8> %tmp3, %tmp4
   %tmp6 = freeze <4 x i1> %tmp5
   ret <4 x i1> %tmp6
 }

@hstk30-hw

https://godbolt.org/z/97qKq7rb6

 define <4 x i1> @fail(<16 x i8>* %A, <16 x i8>* %B) nounwind {
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %tmp2 = load <16 x i8>, <16 x i8>* %B
   %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %tmp4 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   %tmp5 = icmp eq <4 x i8> %tmp3, %tmp4
   %tmp6 = freeze <4 x i1> %tmp5
   ret <4 x i1> %tmp6
 }

Let the inputs be

v0 = {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}
v1 = {0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0}
fail:                                   // @fail
        ld1     { v0.16b }, [x0]
        ld1     { v1.16b }, [x1]
        umov    w8, v0.b[3]
        umov    w9, v0.b[2]
        ext     v1.16b, v1.16b, v1.16b, #8
// v1 = {0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0, ...}
        fmov    s2, w8
        umov    w8, v0.b[1]
        mov     v2.h[1], w9
        mov     v2.h[2], w8
        umov    w8, v0.b[0]
        zip2    v0.8b, v1.8b, v0.8b
// v0 = {0x3, 0x4, 0x2, 0x5, 0x1, 0x6, 0x0, 0x7, ...}
        mov     v2.h[3], w8
        rev16   v0.8b, v0.8b
// v0 = {0x4, 0x3, 0x5, 0x2, 0x6, 0x1, 0x7, 0x0, ...}
        bic     v2.4h, #255, lsl #8
        bic     v0.4h, #255, lsl #8
// v0 = {0x4, 0x0, 0x5, 0x0, 0x6, 0x0, 0x7, 0x0, ...}
// v2 = {0x3, 0x0, 0x2, 0x0, 0x1, 0x0, 0x0, 0x0, ...}
        cmeq    v0.4h, v2.4h, v0.4h
        rev64   v0.4h, v0.4h
        ret

I don't know what the correct asm should be; if I did, I could fix it myself.


hstk30-hw commented Sep 27, 2023

SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,

In the function ReconstructShuffle, I suspect the shuffle vector construction does not take big endian into account.

I fixed it by inserting one more REV at the bitcast. It works, but I'm not completely sure it is the right fix.

Same issue #65058

@ostannard

I don't think this code is valid, because it mixes ACLE intrinsics with the GCC vector extension. These have different semantics for lane ordering on big endian systems. The ACLE spec has a section on this, which recommends using the ACLE intrinsics consistently: https://github.com/ARM-software/acle/blob/main/main/acle.md#compatibility-with-other-vector-programming-models

@ostannard

No, ignore me; I had just misread the code. It isn't actually using the GCC vector extension.


hstk30 commented Sep 28, 2023

Actually, the code is from GCC's test suite: https://github.com/gcc-mirror/gcc/blob/master/gcc/testsuite/gcc.target/aarch64/vld1_lane.c.
At -O3, GCC just unrolls the loop and does not vectorize it, so the bug does not show up there. So cunning :(

https://godbolt.org/z/6bhPGxWTd

@davemgreen

> Hi, do you have any idea about this problem? @davemgreen

Apologies for not responding, I received no notifications for this issue past the first two messages (since I added #backend-aarch64). The fix above sounds very sensible.

sr-tream pushed a commit to sr-tream/llvm-project that referenced this issue Nov 20, 2023
zahiraam pushed a commit to zahiraam/llvm-project that referenced this issue Nov 20, 2023