diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
new file mode 100644
index 0000000000000..710eb85eca20c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -0,0 +1,341 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+
+define <8 x i8> @load4_v4i8_add(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
+; CHECK-LABEL: load4_v4i8_add:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp s0, s1, [x0]
+; CHECK-NEXT: ldp s2, s3, [x1]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v3.8h, v3.8b, #0
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: uzp1 v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %la = load <4 x i8>, <4 x i8> *%a
+  %lb = load <4 x i8>, <4 x i8> *%b
+  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
+  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
+  %lc = load <4 x i8>, <4 x i8> *%c
+  %ld = load <4 x i8>, <4 x i8> *%d
+  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32>
+  %s2 = shufflevector <4 x i8> %lc, <4 x i8> %ld, <8 x i32>
+  %add = add <8 x i8> %s1, %s2
+  ret <8 x i8> %add
+}
+
+define <8 x i16> @load4_v4i8_zext_add(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
+; CHECK-LABEL: load4_v4i8_zext_add:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp s0, s1, [x0]
+; CHECK-NEXT: ldp s2, s3, [x1]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v3.8h, v3.8b, #0
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: uzp1 v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %la = load <4 x i8>, <4 x i8> *%a
+  %lb = load <4 x i8>, <4 x i8> *%b
+  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
+  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
+  %lc = load <4 x i8>, <4 x i8> *%c
+  %ld = load <4 x i8>, <4 x i8> *%d
+  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32>
+  %s2 = shufflevector <4 x i8> %lc, <4 x i8> %ld, <8 x i32>
+  %z1 = zext <8 x i8> %s1 to <8 x i16>
+  %z2 = zext <8 x i8> %s2 to <8 x i16>
+  %add = add <8 x i16> %z1, %z2
+  ret <8 x i16> %add
+}
+
+define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* nocapture noundef readonly %p2, i32 noundef %st2) {
+; CHECK-LABEL: large:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sxtw x8, w1
+; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
+; CHECK-NEXT: sxtw x9, w3
+; CHECK-NEXT: ldp s0, s1, [x0]
+; CHECK-NEXT: ldp s2, s3, [x2]
+; CHECK-NEXT: add x10, x0, x8
+; CHECK-NEXT: add x11, x2, x9
+; CHECK-NEXT: ushll v4.8h, v0.8b, #0
+; CHECK-NEXT: ushll v0.8h, v3.8b, #0
+; CHECK-NEXT: ldp s5, s3, [x10]
+; CHECK-NEXT: add x10, x10, x8
+; CHECK-NEXT: add x8, x10, x8
+; CHECK-NEXT: ldp s6, s7, [x11]
+; CHECK-NEXT: ldp s16, s17, [x10]
+; CHECK-NEXT: ldp s18, s21, [x8]
+; CHECK-NEXT: add x11, x11, x9
+; CHECK-NEXT: add x9, x11, x9
+; CHECK-NEXT: ushll v5.8h, v5.8b, #0
+; CHECK-NEXT: ushll v16.8h, v16.8b, #0
+; CHECK-NEXT: ushll v18.8h, v18.8b, #0
+; CHECK-NEXT: ldp s19, s20, [x11]
+; CHECK-NEXT: uzp1 v16.8b, v18.8b, v16.8b
+; CHECK-NEXT: uzp1 v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: ldp s18, s5, [x9]
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: ushll v6.8h, v6.8b, #0
+; CHECK-NEXT: ushll v19.8h, v19.8b, #0
+; CHECK-NEXT: ushll v18.8h, v18.8b, #0
+; CHECK-NEXT: uzp1 v2.8b, v6.8b, v2.8b
+; CHECK-NEXT: uzp1 v18.8b, v18.8b, v19.8b
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v3.8h, v3.8b, #0
+; CHECK-NEXT: ushll v17.8h, v17.8b, #0
+; CHECK-NEXT: ushll v20.8h, v20.8b, #0
+; CHECK-NEXT: ushll v6.8h, v16.8b, #0
+; CHECK-NEXT: ushll v4.8h, v4.8b, #0
+; CHECK-NEXT: ushll v16.8h, v18.8b, #0
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: ushll v19.8h, v21.8b, #0
+; CHECK-NEXT: ushll v5.8h, v5.8b, #0
+; CHECK-NEXT: ushll v7.8h, v7.8b, #0
+; CHECK-NEXT: usubl v18.4s, v6.4h, v16.4h
+; CHECK-NEXT: usubl2 v6.4s, v6.8h, v16.8h
+; CHECK-NEXT: usubl v16.4s, v4.4h, v2.4h
+; CHECK-NEXT: usubl2 v2.4s, v4.8h, v2.8h
+; CHECK-NEXT: uzp1 v4.8b, v19.8b, v17.8b
+; CHECK-NEXT: uzp1 v1.8b, v3.8b, v1.8b
+; CHECK-NEXT: uzp1 v3.8b, v5.8b, v20.8b
+; CHECK-NEXT: uzp1 v0.8b, v7.8b, v0.8b
+; CHECK-NEXT: ushll v4.8h, v4.8b, #0
+; CHECK-NEXT: ushll v3.8h, v3.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: usubl2 v5.4s, v4.8h, v3.8h
+; CHECK-NEXT: usubl v3.4s, v4.4h, v3.4h
+; CHECK-NEXT: usubl2 v4.4s, v1.8h, v0.8h
+; CHECK-NEXT: usubl v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: shl v1.4s, v3.4s, #16
+; CHECK-NEXT: shl v3.4s, v5.4s, #16
+; CHECK-NEXT: shl v4.4s, v4.4s, #16
+; CHECK-NEXT: add v1.4s, v1.4s, v18.4s
+; CHECK-NEXT: shl v0.4s, v0.4s, #16
+; CHECK-NEXT: add v3.4s, v3.4s, v6.4s
+; CHECK-NEXT: add v2.4s, v4.4s, v2.4s
+; CHECK-NEXT: rev64 v4.4s, v3.4s
+; CHECK-NEXT: rev64 v5.4s, v1.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
+; CHECK-NEXT: rev64 v6.4s, v2.4s
+; CHECK-NEXT: rev64 v7.4s, v0.4s
+; CHECK-NEXT: add v16.4s, v3.4s, v4.4s
+; CHECK-NEXT: add v17.4s, v1.4s, v5.4s
+; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s
+; CHECK-NEXT: trn2 v5.4s, v16.4s, v17.4s
+; CHECK-NEXT: add v18.4s, v2.4s, v6.4s
+; CHECK-NEXT: add v19.4s, v0.4s, v7.4s
+; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s
+; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s
+; CHECK-NEXT: trn2 v4.4s, v19.4s, v18.4s
+; CHECK-NEXT: ext v6.16b, v5.16b, v16.16b, #8
+; CHECK-NEXT: zip1 v7.4s, v0.4s, v2.4s
+; CHECK-NEXT: trn2 v16.4s, v17.4s, v16.4s
+; CHECK-NEXT: ext v4.16b, v19.16b, v4.16b, #8
+; CHECK-NEXT: zip1 v20.4s, v3.4s, v1.4s
+; CHECK-NEXT: ext v7.16b, v0.16b, v7.16b, #8
+; CHECK-NEXT: ext v17.16b, v16.16b, v17.16b, #8
+; CHECK-NEXT: zip2 v18.4s, v19.4s, v18.4s
+; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s
+; CHECK-NEXT: mov v0.s[3], v2.s[2]
+; CHECK-NEXT: mov v5.d[1], v4.d[1]
+; CHECK-NEXT: mov v20.d[1], v7.d[1]
+; CHECK-NEXT: mov v17.d[1], v18.d[1]
+; CHECK-NEXT: mov v16.d[1], v4.d[1]
+; CHECK-NEXT: mov v1.d[1], v0.d[1]
+; CHECK-NEXT: mov v6.d[1], v18.d[1]
+; CHECK-NEXT: add v0.4s, v17.4s, v16.4s
+; CHECK-NEXT: add v2.4s, v1.4s, v20.4s
+; CHECK-NEXT: sub v3.4s, v5.4s, v6.4s
+; CHECK-NEXT: sub v1.4s, v20.4s, v1.4s
+; CHECK-NEXT: rev64 v4.4s, v0.4s
+; CHECK-NEXT: rev64 v5.4s, v3.4s
+; CHECK-NEXT: rev64 v6.4s, v1.4s
+; CHECK-NEXT: rev64 v7.4s, v2.4s
+; CHECK-NEXT: add v16.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v17.4s, v3.4s, v5.4s
+; CHECK-NEXT: add v18.4s, v1.4s, v6.4s
+; CHECK-NEXT: add v19.4s, v2.4s, v7.4s
+; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s
+; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s
+; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
+; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: ext v4.16b, v2.16b, v19.16b, #12
+; CHECK-NEXT: ext v5.16b, v1.16b, v18.16b, #12
+; CHECK-NEXT: ext v7.16b, v3.16b, v17.16b, #12
+; CHECK-NEXT: rev64 v16.4s, v16.4s
+; CHECK-NEXT: ext v6.16b, v4.16b, v2.16b, #4
+; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT: ext v18.16b, v5.16b, v1.16b, #4
+; CHECK-NEXT: ext v19.16b, v5.16b, v5.16b, #8
+; CHECK-NEXT: ext v20.16b, v7.16b, v3.16b, #4
+; CHECK-NEXT: ext v21.16b, v7.16b, v7.16b, #8
+; CHECK-NEXT: rev64 v7.4s, v7.4s
+; CHECK-NEXT: trn2 v0.4s, v16.4s, v0.4s
+; CHECK-NEXT: rev64 v5.4s, v5.4s
+; CHECK-NEXT: rev64 v4.4s, v4.4s
+; CHECK-NEXT: ext v6.16b, v6.16b, v17.16b, #12
+; CHECK-NEXT: ext v17.16b, v18.16b, v19.16b, #12
+; CHECK-NEXT: ext v18.16b, v20.16b, v21.16b, #12
+; CHECK-NEXT: ext v3.16b, v7.16b, v3.16b, #4
+; CHECK-NEXT: ext v7.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ext v1.16b, v5.16b, v1.16b, #4
+; CHECK-NEXT: ext v2.16b, v4.16b, v2.16b, #4
+; CHECK-NEXT: add v4.4s, v18.4s, v3.4s
+; CHECK-NEXT: add v5.4s, v0.4s, v7.4s
+; CHECK-NEXT: add v16.4s, v17.4s, v1.4s
+; CHECK-NEXT: add v19.4s, v6.4s, v2.4s
+; CHECK-NEXT: sub v3.4s, v18.4s, v3.4s
+; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s
+; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s
+; CHECK-NEXT: mov v19.d[1], v2.d[1]
+; CHECK-NEXT: mov v16.d[1], v1.d[1]
+; CHECK-NEXT: mov v4.d[1], v3.d[1]
+; CHECK-NEXT: mov v5.d[1], v0.d[1]
+; CHECK-NEXT: movi v0.8h, #1
+; CHECK-NEXT: movi v7.2d, #0x00ffff0000ffff
+; CHECK-NEXT: ushr v1.4s, v4.4s, #15
+; CHECK-NEXT: ushr v2.4s, v19.4s, #15
+; CHECK-NEXT: ushr v3.4s, v5.4s, #15
+; CHECK-NEXT: ushr v6.4s, v16.4s, #15
+; CHECK-NEXT: and v2.16b, v2.16b, v0.16b
+; CHECK-NEXT: and v6.16b, v6.16b, v0.16b
+; CHECK-NEXT: and v3.16b, v3.16b, v0.16b
+; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: mul v1.4s, v2.4s, v7.4s
+; CHECK-NEXT: mul v2.4s, v6.4s, v7.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v7.4s
+; CHECK-NEXT: mul v3.4s, v3.4s, v7.4s
+; CHECK-NEXT: add v6.4s, v1.4s, v19.4s
+; CHECK-NEXT: add v7.4s, v2.4s, v16.4s
+; CHECK-NEXT: add v4.4s, v0.4s, v4.4s
+; CHECK-NEXT: add v5.4s, v3.4s, v5.4s
+; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
+; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b
+; CHECK-NEXT: eor v2.16b, v7.16b, v2.16b
+; CHECK-NEXT: eor v1.16b, v6.16b, v1.16b
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsr w9, w8, #16
+; CHECK-NEXT: add w8, w9, w8, uxth
+; CHECK-NEXT: lsr w0, w8, #1
+; CHECK-NEXT: ret
+entry:
+  %idx.ext = sext i32 %st1 to i64
+  %idx.ext63 = sext i32 %st2 to i64
+  %arrayidx3 = getelementptr inbounds i8, i8* %p1, i64 4
+  %arrayidx5 = getelementptr inbounds i8, i8* %p2, i64 4
+  %0 = bitcast i8* %p1 to <4 x i8>*
+  %1 = load <4 x i8>, <4 x i8>* %0, align 1
+  %2 = bitcast i8* %p2 to <4 x i8>*
+  %3 = load <4 x i8>, <4 x i8>* %2, align 1
+  %4 = bitcast i8* %arrayidx3 to <4 x i8>*
+  %5 = load <4 x i8>, <4 x i8>* %4, align 1
+  %6 = bitcast i8* %arrayidx5 to <4 x i8>*
+  %7 = load <4 x i8>, <4 x i8>* %6, align 1
+  %add.ptr = getelementptr inbounds i8, i8* %p1, i64 %idx.ext
+  %add.ptr64 = getelementptr inbounds i8, i8* %p2, i64 %idx.ext63
+  %arrayidx3.1 = getelementptr inbounds i8, i8* %add.ptr, i64 4
+  %arrayidx5.1 = getelementptr inbounds i8, i8* %add.ptr64, i64 4
+  %8 = bitcast i8* %add.ptr to <4 x i8>*
+  %9 = load <4 x i8>, <4 x i8>* %8, align 1
+  %10 = bitcast i8* %add.ptr64 to <4 x i8>*
+  %11 = load <4 x i8>, <4 x i8>* %10, align 1
+  %12 = bitcast i8* %arrayidx3.1 to <4 x i8>*
+  %13 = load <4 x i8>, <4 x i8>* %12, align 1
+  %14 = bitcast i8* %arrayidx5.1 to <4 x i8>*
+  %15 = load <4 x i8>, <4 x i8>* %14, align 1
+  %add.ptr.1 = getelementptr inbounds i8, i8* %add.ptr, i64 %idx.ext
+  %add.ptr64.1 = getelementptr inbounds i8, i8* %add.ptr64, i64 %idx.ext63
+  %arrayidx3.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 4
+  %arrayidx5.2 = getelementptr inbounds i8, i8* %add.ptr64.1, i64 4
+  %16 = bitcast i8* %add.ptr.1 to <4 x i8>*
+  %17 = load <4 x i8>, <4 x i8>* %16, align 1
+  %18 = bitcast i8* %add.ptr64.1 to <4 x i8>*
+  %19 = load <4 x i8>, <4 x i8>* %18, align 1
+  %20 = bitcast i8* %arrayidx3.2 to <4 x i8>*
+  %21 = load <4 x i8>, <4 x i8>* %20, align 1
+  %22 = bitcast i8* %arrayidx5.2 to <4 x i8>*
+  %23 = load <4 x i8>, <4 x i8>* %22, align 1
+  %add.ptr.2 = getelementptr inbounds i8, i8* %add.ptr.1, i64 %idx.ext
+  %add.ptr64.2 = getelementptr inbounds i8, i8* %add.ptr64.1, i64 %idx.ext63
+  %arrayidx3.3 = getelementptr inbounds i8, i8* %add.ptr.2, i64 4
+  %arrayidx5.3 = getelementptr inbounds i8, i8* %add.ptr64.2, i64 4
+  %24 = bitcast i8* %add.ptr.2 to <4 x i8>*
+  %25 = load <4 x i8>, <4 x i8>* %24, align 1
+  %26 = shufflevector <4 x i8> %25, <4 x i8> %17, <16 x i32>
+  %27 = shufflevector <4 x i8> %9, <4 x i8> poison, <16 x i32>
+  %28 = shufflevector <16 x i8> %26, <16 x i8> %27, <16 x i32>
+  %29 = shufflevector <4 x i8> %1, <4 x i8> poison, <16 x i32>
+  %30 = shufflevector <16 x i8> %28, <16 x i8> %29, <16 x i32>
+  %31 = zext <16 x i8> %30 to <16 x i32>
+  %32 = bitcast i8* %add.ptr64.2 to <4 x i8>*
+  %33 = load <4 x i8>, <4 x i8>* %32, align 1
+  %34 = shufflevector <4 x i8> %33, <4 x i8> %19, <16 x i32>
+  %35 = shufflevector <4 x i8> %11, <4 x i8> poison, <16 x i32>
+  %36 = shufflevector <16 x i8> %34, <16 x i8> %35, <16 x i32>
+  %37 = shufflevector <4 x i8> %3, <4 x i8> poison, <16 x i32>
+  %38 = shufflevector <16 x i8> %36, <16 x i8> %37, <16 x i32>
+  %39 = zext <16 x i8> %38 to <16 x i32>
+  %40 = sub nsw <16 x i32> %31, %39
+  %41 = bitcast i8* %arrayidx3.3 to <4 x i8>*
+  %42 = load <4 x i8>, <4 x i8>* %41, align 1
+  %43 = shufflevector <4 x i8> %42, <4 x i8> %21, <16 x i32>
+  %44 = shufflevector <4 x i8> %13, <4 x i8> poison, <16 x i32>
+  %45 = shufflevector <16 x i8> %43, <16 x i8> %44, <16 x i32>
+  %46 = shufflevector <4 x i8> %5, <4 x i8> poison, <16 x i32>
+  %47 = shufflevector <16 x i8> %45, <16 x i8> %46, <16 x i32>
+  %48 = zext <16 x i8> %47 to <16 x i32>
+  %49 = bitcast i8* %arrayidx5.3 to <4 x i8>*
+  %50 = load <4 x i8>, <4 x i8>* %49, align 1
+  %51 = shufflevector <4 x i8> %50, <4 x i8> %23, <16 x i32>
+  %52 = shufflevector <4 x i8> %15, <4 x i8> poison, <16 x i32>
+  %53 = shufflevector <16 x i8> %51, <16 x i8> %52, <16 x i32>
+  %54 = shufflevector <4 x i8> %7, <4 x i8> poison, <16 x i32>
+  %55 = shufflevector <16 x i8> %53, <16 x i8> %54, <16 x i32>
+  %56 = zext <16 x i8> %55 to <16 x i32>
+  %57 = sub nsw <16 x i32> %48, %56
+  %58 = shl nsw <16 x i32> %57,
+  %59 = add nsw <16 x i32> %58, %40
+  %60 = shufflevector <16 x i32> %59, <16 x i32> undef, <16 x i32>
+  %61 = add nsw <16 x i32> %59, %60
+  %62 = sub nsw <16 x i32> %59, %60
+  %63 = shufflevector <16 x i32> %61, <16 x i32> %62, <16 x i32>
+  %64 = shufflevector <16 x i32> %61, <16 x i32> %62, <16 x i32>
+  %65 = add nsw <16 x i32> %63, %64
+  %66 = sub nsw <16 x i32> %63, %64
+  %67 = shufflevector <16 x i32> %65, <16 x i32> %66, <16 x i32>
+  %68 = shufflevector <16 x i32> %65, <16 x i32> %66, <16 x i32>
+  %69 = add nsw <16 x i32> %67, %68
+  %70 = sub nsw <16 x i32> %67, %68
+  %71 = shufflevector <16 x i32> %69, <16 x i32> %70, <16 x i32>
+  %72 = shufflevector <16 x i32> %69, <16 x i32> %70, <16 x i32>
+  %73 = add nsw <16 x i32> %71, %72
+  %74 = sub nsw <16 x i32> %71, %72
+  %75 = shufflevector <16 x i32> %73, <16 x i32> %74, <16 x i32>
+  %76 = lshr <16 x i32> %75,
+  %77 = and <16 x i32> %76,
+  %78 = mul nuw <16 x i32> %77,
+  %79 = add <16 x i32> %78, %75
+  %80 = xor <16 x i32> %79, %78
+  %81 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %80)
+  %conv118 = and i32 %81, 65535
+  %shr = lshr i32 %81, 16
+  %add119 = add nuw nsw i32 %conv118, %shr
+  %shr120 = lshr i32 %add119, 1
+  ret i32 %shr120
+}
+
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll
index e18999c892a9c..ae78ac5021da6 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll
@@ -558,3 +558,125 @@ define <4 x i32> @load_v4i32_2_2(float %tmp, <4 x i32> %b, <2 x i32> *%a) {
   %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32>
   ret <4 x i32> %s2
 }
+
+; More than a single vector
+
+define <8 x i8> @load2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
+; CHECK-LABEL: load2_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ldr s1, [x1]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: ret
+  %la = load <4 x i8>, <4 x i8> *%a
+  %lb = load <4 x i8>, <4 x i8> *%b
+  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32>
+  ret <8 x i8> %s1
+}
+
+define <16 x i8> @load3_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
+; CHECK-LABEL: load3_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp s0, s1, [x0]
+; CHECK-NEXT: ldr s3, [x1]
+; CHECK-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: umov w8, v2.h[0]
+; CHECK-NEXT: umov w9, v2.h[1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: umov w8, v2.h[2]
+; CHECK-NEXT: mov v0.b[1], w9
+; CHECK-NEXT: umov w9, v2.h[3]
+; CHECK-NEXT: ushll v2.8h, v3.8b, #0
+; CHECK-NEXT: mov v0.b[2], w8
+; CHECK-NEXT: umov w8, v2.h[0]
+; CHECK-NEXT: mov v0.b[3], w9
+; CHECK-NEXT: umov w9, v2.h[1]
+; CHECK-NEXT: mov v0.b[4], w8
+; CHECK-NEXT: umov w8, v2.h[2]
+; CHECK-NEXT: mov v0.b[5], w9
+; CHECK-NEXT: umov w9, v2.h[3]
+; CHECK-NEXT: mov v0.b[6], w8
+; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: mov v0.b[7], w9
+; CHECK-NEXT: umov w9, v1.h[1]
+; CHECK-NEXT: mov v0.b[8], w8
+; CHECK-NEXT: umov w8, v1.h[2]
+; CHECK-NEXT: mov v0.b[9], w9
+; CHECK-NEXT: umov w9, v1.h[3]
+; CHECK-NEXT: mov v0.b[10], w8
+; CHECK-NEXT: mov v0.b[11], w9
+; CHECK-NEXT: ret
+  %la = load <4 x i8>, <4 x i8> *%a
+  %lb = load <4 x i8>, <4 x i8> *%b
+  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
+  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
+  %lc = load <4 x i8>, <4 x i8> *%c
+  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32>
+  %s2 = shufflevector <4 x i8> %lc, <4 x i8> undef, <8 x i32>
+  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32>
+  ret <16 x i8> %s3
+}
+
+define <16 x i8> @load4_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
+; CHECK-LABEL: load4_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp s0, s1, [x0]
+; CHECK-NEXT: ldp s2, s3, [x1]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushll v3.8h, v3.8b, #0
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: mov v1.d[1], v3.d[0]
+; CHECK-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+  %la = load <4 x i8>, <4 x i8> *%a
+  %lb = load <4 x i8>, <4 x i8> *%b
+  %c = getelementptr <4 x i8>, <4 x i8> *%a, i64 1
+  %d = getelementptr <4 x i8>, <4 x i8> *%b, i64 1
+  %lc = load <4 x i8>, <4 x i8> *%c
+  %ld = load <4 x i8>, <4 x i8> *%d
+  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32>
+  %s2 = shufflevector <4 x i8> %lc, <4 x i8> %ld, <8 x i32>
+  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32>
+  ret <16 x i8> %s3
+}
+
+define <16 x i8> @load2multi1_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
+; CHECK-LABEL: load2multi1_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: ldr s1, [x1]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b
+; CHECK-NEXT: ret
+  %la = load <4 x i8>, <4 x i8> *%a
+  %lb = load <4 x i8>, <4 x i8> *%b
+  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32>
+  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s1, <16 x i32>
+  ret <16 x i8> %s3
+}
+
+define <16 x i8> @load2multi2_v4i8(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
+; CHECK-LABEL: load2multi2_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x1]
+; CHECK-NEXT: ldr s1, [x0]
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-NEXT: mov v1.d[1], v1.d[0]
+; CHECK-NEXT: uzp1 v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+  %la = load <4 x i8>, <4 x i8> *%a
+  %lb = load <4 x i8>, <4 x i8> *%b
+  %s1 = shufflevector <4 x i8> %la, <4 x i8> %la, <8 x i32>
+  %s2 = shufflevector <4 x i8> %lb, <4 x i8> %lb, <8 x i32>
+  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32>
+  ret <16 x i8> %s3
+}