124 changes: 42 additions & 82 deletions llvm/test/CodeGen/AArch64/zext-to-tbl.ll
@@ -993,61 +993,21 @@ exit:
ret void
}

; CHECK-LABEL: lCPI11_0:
; CHECK-NEXT: .byte 0 ; 0x0
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 1 ; 0x1
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 2 ; 0x2
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 3 ; 0x3
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff
; CHECK-NEXT: .byte 255 ; 0xff

; CHECK-BE-LABEL: .LCPI11_0:
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 0 // 0x0
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 1 // 0x1
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 2 // 0x2
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 255 // 0xff
; CHECK-BE-NEXT: .byte 3 // 0x3

define void @zext_v4i8_to_v4i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh12:
; CHECK-NEXT: adrp x9, lCPI11_0@PAGE
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: Lloh13:
; CHECK-NEXT: ldr q0, [x9, lCPI11_0@PAGEOFF]
; CHECK-NEXT: LBB11_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr s1, [x0, x8]
; CHECK-NEXT: ldr s0, [x0, x8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: cmp x8, #128
; CHECK-NEXT: tbl.16b v1, { v1 }, v0
; CHECK-NEXT: str q1, [x1], #64
; CHECK-NEXT: ushll.8h v0, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: str q0, [x1], #64
; CHECK-NEXT: b.ne LBB11_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh13
;
; CHECK-BE-LABEL: zext_v4i8_to_v4i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -1194,18 +1154,18 @@ exit:
define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v12i8_to_v12i32_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh14:
; CHECK-NEXT: Lloh12:
; CHECK-NEXT: adrp x9, lCPI12_0@PAGE
; CHECK-NEXT: Lloh15:
; CHECK-NEXT: Lloh13:
; CHECK-NEXT: adrp x10, lCPI12_1@PAGE
; CHECK-NEXT: Lloh16:
; CHECK-NEXT: Lloh14:
; CHECK-NEXT: adrp x11, lCPI12_2@PAGE
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: Lloh17:
; CHECK-NEXT: Lloh15:
; CHECK-NEXT: ldr q0, [x9, lCPI12_0@PAGEOFF]
; CHECK-NEXT: Lloh18:
; CHECK-NEXT: Lloh16:
; CHECK-NEXT: ldr q1, [x10, lCPI12_1@PAGEOFF]
; CHECK-NEXT: Lloh19:
; CHECK-NEXT: Lloh17:
; CHECK-NEXT: ldr q2, [x11, lCPI12_2@PAGEOFF]
; CHECK-NEXT: LBB12_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1220,9 +1180,9 @@ define void @zext_v12i8_to_v12i32_in_loop(ptr %src, ptr %dst) {
; CHECK-NEXT: b.ne LBB12_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh19
; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh18
; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh17
; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh16
; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh15
;
; CHECK-BE-LABEL: zext_v12i8_to_v12i32_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -2192,22 +2152,22 @@ exit:
define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v20i8_to_v20i24_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh20:
; CHECK-NEXT: Lloh18:
; CHECK-NEXT: adrp x9, lCPI20_0@PAGE
; CHECK-NEXT: Lloh21:
; CHECK-NEXT: Lloh19:
; CHECK-NEXT: adrp x10, lCPI20_1@PAGE
; CHECK-NEXT: Lloh22:
; CHECK-NEXT: Lloh20:
; CHECK-NEXT: adrp x11, lCPI20_2@PAGE
; CHECK-NEXT: Lloh23:
; CHECK-NEXT: Lloh21:
; CHECK-NEXT: adrp x12, lCPI20_3@PAGE
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: Lloh24:
; CHECK-NEXT: Lloh22:
; CHECK-NEXT: ldr q0, [x9, lCPI20_0@PAGEOFF]
; CHECK-NEXT: Lloh25:
; CHECK-NEXT: Lloh23:
; CHECK-NEXT: ldr q1, [x10, lCPI20_1@PAGEOFF]
; CHECK-NEXT: Lloh26:
; CHECK-NEXT: Lloh24:
; CHECK-NEXT: ldr q2, [x11, lCPI20_2@PAGEOFF]
; CHECK-NEXT: Lloh27:
; CHECK-NEXT: Lloh25:
; CHECK-NEXT: ldr q3, [x12, lCPI20_3@PAGEOFF]
; CHECK-NEXT: LBB20_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2228,10 +2188,10 @@ define void @zext_v20i8_to_v20i24_in_loop(ptr %src, ptr %dst) {
; CHECK-NEXT: b.ne LBB20_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh23, Lloh27
; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh26
; CHECK-NEXT: .loh AdrpLdr Lloh21, Lloh25
; CHECK-NEXT: .loh AdrpLdr Lloh20, Lloh24
; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23
; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22
;
; CHECK-BE-LABEL: zext_v20i8_to_v20i24_in_loop:
; CHECK-BE: // %bb.0: // %entry
@@ -2519,30 +2479,30 @@ exit:
define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: zext_v23i8_to_v23i48_in_loop:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh28:
; CHECK-NEXT: Lloh26:
; CHECK-NEXT: adrp x9, lCPI21_0@PAGE
; CHECK-NEXT: Lloh29:
; CHECK-NEXT: Lloh27:
; CHECK-NEXT: adrp x10, lCPI21_1@PAGE
; CHECK-NEXT: Lloh30:
; CHECK-NEXT: Lloh28:
; CHECK-NEXT: adrp x11, lCPI21_2@PAGE
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: Lloh31:
; CHECK-NEXT: Lloh29:
; CHECK-NEXT: ldr q0, [x9, lCPI21_0@PAGEOFF]
; CHECK-NEXT: Lloh32:
; CHECK-NEXT: Lloh30:
; CHECK-NEXT: adrp x9, lCPI21_3@PAGE
; CHECK-NEXT: Lloh33:
; CHECK-NEXT: Lloh31:
; CHECK-NEXT: ldr q1, [x10, lCPI21_1@PAGEOFF]
; CHECK-NEXT: Lloh34:
; CHECK-NEXT: Lloh32:
; CHECK-NEXT: adrp x10, lCPI21_4@PAGE
; CHECK-NEXT: Lloh35:
; CHECK-NEXT: Lloh33:
; CHECK-NEXT: ldr q2, [x11, lCPI21_2@PAGEOFF]
; CHECK-NEXT: Lloh36:
; CHECK-NEXT: Lloh34:
; CHECK-NEXT: adrp x11, lCPI21_5@PAGE
; CHECK-NEXT: Lloh37:
; CHECK-NEXT: Lloh35:
; CHECK-NEXT: ldr q3, [x9, lCPI21_3@PAGEOFF]
; CHECK-NEXT: Lloh38:
; CHECK-NEXT: Lloh36:
; CHECK-NEXT: ldr q4, [x10, lCPI21_4@PAGEOFF]
; CHECK-NEXT: Lloh39:
; CHECK-NEXT: Lloh37:
; CHECK-NEXT: ldr q5, [x11, lCPI21_5@PAGEOFF]
; CHECK-NEXT: LBB21_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -2570,15 +2530,15 @@ define void @zext_v23i8_to_v23i48_in_loop(ptr %src, ptr %dst) {
; CHECK-NEXT: b.ne LBB21_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
; CHECK-NEXT: .loh AdrpLdr Lloh36, Lloh39
; CHECK-NEXT: .loh AdrpLdr Lloh34, Lloh38
; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh37
; CHECK-NEXT: .loh AdrpAdrp Lloh30, Lloh36
; CHECK-NEXT: .loh AdrpLdr Lloh34, Lloh37
; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh36
; CHECK-NEXT: .loh AdrpLdr Lloh30, Lloh35
; CHECK-NEXT: .loh AdrpAdrp Lloh29, Lloh34
; CHECK-NEXT: .loh AdrpLdr Lloh29, Lloh33
; CHECK-NEXT: .loh AdrpAdrp Lloh28, Lloh32
; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh31
; CHECK-NEXT: .loh AdrpAdrp Lloh28, Lloh34
; CHECK-NEXT: .loh AdrpLdr Lloh28, Lloh33
; CHECK-NEXT: .loh AdrpAdrp Lloh27, Lloh32
; CHECK-NEXT: .loh AdrpLdr Lloh27, Lloh31
; CHECK-NEXT: .loh AdrpAdrp Lloh26, Lloh30
; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh29
;
; CHECK-BE-LABEL: zext_v23i8_to_v23i48_in_loop:
; CHECK-BE: // %bb.0: // %entry
207 changes: 184 additions & 23 deletions llvm/test/CodeGen/X86/vec_anyext.ll
@@ -1,74 +1,235 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64--
; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,X64

; PR 9267

define<4 x i16> @func_16_32() {
%F = load <4 x i32>, ptr undef
define <4 x i16> @func_16_32(ptr %a, ptr %b, ptr %c) nounwind {
; X86-LABEL: func_16_32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovdqa (%edx), %xmm0
; X86-NEXT: vpaddw (%ecx), %xmm0, %xmm0
; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X86-NEXT: vmovq %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: func_16_32:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rsi), %xmm0
; X64-NEXT: vpaddw (%rdi), %xmm0, %xmm0
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-NEXT: vmovq %xmm0, (%rdx)
; X64-NEXT: retq
%F = load <4 x i32>, ptr %a
%G = trunc <4 x i32> %F to <4 x i16>
%H = load <4 x i32>, ptr undef
%H = load <4 x i32>, ptr %b
%Y = trunc <4 x i32> %H to <4 x i16>
%T = add <4 x i16> %Y, %G
store <4 x i16>%T , ptr undef
store <4 x i16>%T , ptr %c
ret <4 x i16> %T
}

define<4 x i16> @func_16_64() {
%F = load <4 x i64>, ptr undef
define <4 x i16> @func_16_64(ptr %a, ptr %b, ptr %c) nounwind {
; X86-LABEL: func_16_64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovaps (%edx), %ymm0
; X86-NEXT: vxorps (%ecx), %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; X86-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; X86-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovq %xmm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: func_16_64:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rsi), %ymm0
; X64-NEXT: vpxor (%rdi), %ymm0, %ymm0
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, (%rdx)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%F = load <4 x i64>, ptr %a
%G = trunc <4 x i64> %F to <4 x i16>
%H = load <4 x i64>, ptr undef
%H = load <4 x i64>, ptr %b
%Y = trunc <4 x i64> %H to <4 x i16>
%T = xor <4 x i16> %Y, %G
store <4 x i16>%T , ptr undef
store <4 x i16>%T , ptr %c
ret <4 x i16> %T
}

define<4 x i32> @func_32_64() {
%F = load <4 x i64>, ptr undef
define <4 x i32> @func_32_64(ptr %a, ptr %b) nounwind {
; X86-LABEL: func_32_64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovaps (%ecx), %ymm0
; X86-NEXT: vorps (%eax), %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: func_32_64:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rsi), %ymm0
; X64-NEXT: vorps (%rdi), %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%F = load <4 x i64>, ptr %a
%G = trunc <4 x i64> %F to <4 x i32>
%H = load <4 x i64>, ptr undef
%H = load <4 x i64>, ptr %b
%Y = trunc <4 x i64> %H to <4 x i32>
%T = or <4 x i32> %Y, %G
ret <4 x i32> %T
}

define<4 x i8> @func_8_16() {
%F = load <4 x i16>, ptr undef
define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
; X86-LABEL: func_8_16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT: retl
;
; X64-LABEL: func_8_16:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $16, %ecx
; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: shrq $32, %rcx
; X64-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; X64-NEXT: shrq $48, %rax
; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT: movq (%rsi), %rax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $16, %ecx
; X64-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: shrq $32, %rcx
; X64-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
; X64-NEXT: shrq $48, %rax
; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
%F = load <4 x i16>, ptr %a
%G = trunc <4 x i16> %F to <4 x i8>
%H = load <4 x i16>, ptr undef
%H = load <4 x i16>, ptr %b
%Y = trunc <4 x i16> %H to <4 x i8>
%T = add <4 x i8> %Y, %G
ret <4 x i8> %T
}

define<4 x i8> @func_8_32() {
%F = load <4 x i32>, ptr undef
define <4 x i8> @func_8_32(ptr %a, ptr %b) nounwind {
; X86-LABEL: func_8_32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovdqa (%ecx), %xmm0
; X86-NEXT: vpsubb (%eax), %xmm0, %xmm0
; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-NEXT: retl
;
; X64-LABEL: func_8_32:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rsi), %xmm0
; X64-NEXT: vpsubb (%rdi), %xmm0, %xmm0
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: retq
%F = load <4 x i32>, ptr %a
%G = trunc <4 x i32> %F to <4 x i8>
%H = load <4 x i32>, ptr undef
%H = load <4 x i32>, ptr %b
%Y = trunc <4 x i32> %H to <4 x i8>
%T = sub <4 x i8> %Y, %G
ret <4 x i8> %T
}

define<4 x i8> @func_8_64() {
%F = load <4 x i64>, ptr undef
define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind {
; X86-LABEL: func_8_64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovdqa (%ecx), %xmm0
; X86-NEXT: vmovdqa 16(%ecx), %xmm1
; X86-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X86-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovdqa (%eax), %xmm1
; X86-NEXT: vmovdqa 16(%eax), %xmm3
; X86-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: func_8_64:
; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm0
; X64-NEXT: vmovdqa 16(%rdi), %xmm1
; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vmovdqa (%rsi), %xmm1
; X64-NEXT: vmovdqa 16(%rsi), %xmm3
; X64-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
%F = load <4 x i64>, ptr %a
%G = trunc <4 x i64> %F to <4 x i8>
%H = load <4 x i64>, ptr undef
%H = load <4 x i64>, ptr %b
%Y = trunc <4 x i64> %H to <4 x i8>
%T = add <4 x i8> %Y, %G
ret <4 x i8> %T
}

define<4 x i16> @const_16_32() {
define <4 x i16> @const_16_32() nounwind {
; CHECK-LABEL: const_16_32:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u>
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i32> <i32 0, i32 3, i32 8, i32 7> to <4 x i16>
ret <4 x i16> %G
}

define<4 x i16> @const_16_64() {
define <4 x i16> @const_16_64() nounwind {
; CHECK-LABEL: const_16_64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = <0,3,8,7,u,u,u,u>
; CHECK-NEXT: ret{{[l|q]}}
%G = trunc <4 x i64> <i64 0, i64 3, i64 8, i64 7> to <4 x i16>
ret <4 x i16> %G
}

define void @bugOnTruncBitwidthReduce() nounwind {
; CHECK-LABEL: bugOnTruncBitwidthReduce:
; CHECK: # %bb.0: # %meh
; CHECK-NEXT: ret{{[l|q]}}
meh:
%0 = xor <4 x i64> zeroinitializer, zeroinitializer
%1 = trunc <4 x i64> %0 to <4 x i32>