diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 595f8491b405c..26f076d450c15 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -353,6 +353,69 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp
   ret <4 x float> %res3
 }
 
+define <4 x float> @merge_v4f32_f32_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_v4f32_f32_3210:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: merge_v4f32_f32_3210:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: merge_v4f32_f32_3210:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    retq
+;
+; X86-SSE1-LABEL: merge_v4f32_f32_3210:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE1-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X86-SSE1-NEXT:    retl
+;
+; X86-SSE41-LABEL: merge_v4f32_f32_3210:
+; X86-SSE41:       # %bb.0:
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X86-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
+  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 2
+  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 1
+  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 0
+  %val0 = load float, ptr %ptr0, align 4
+  %val1 = load float, ptr %ptr1, align 4
+  %val2 = load float, ptr %ptr2, align 4
+  %val3 = load float, ptr %ptr3, align 4
+  %res0 = insertelement <4 x float> poison, float %val0, i64 0
+  %res1 = insertelement <4 x float> %res0, float %val1, i64 1
+  %res2 = insertelement <4 x float> %res1, float %val2, i64 2
+  %res3 = insertelement <4 x float> %res2, float %val3, i64 3
+  ret <4 x float> %res3
+}
+
 define <4 x i32> @merge_4i32_i32_23u5(ptr %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4i32_i32_23u5:
 ; SSE:       # %bb.0:
@@ -724,6 +787,81 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline s
   ret <4 x i32> %res1
 }
 
+define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_v4i32_i32_3210:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: merge_v4i32_i32_3210:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, 8(%rdi), %xmm0
+; SSE41-NEXT:    pinsrd $2, 4(%rdi), %xmm0
+; SSE41-NEXT:    pinsrd $3, (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: merge_v4i32_i32_3210:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X86-SSE1-LABEL: merge_v4i32_i32_3210:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %edi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    pushl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT:    .cfi_offset %esi, -12
+; X86-SSE1-NEXT:    .cfi_offset %edi, -8
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movl 12(%ecx), %edx
+; X86-SSE1-NEXT:    movl 8(%ecx), %esi
+; X86-SSE1-NEXT:    movl (%ecx), %edi
+; X86-SSE1-NEXT:    movl 4(%ecx), %ecx
+; X86-SSE1-NEXT:    movl %edi, 12(%eax)
+; X86-SSE1-NEXT:    movl %ecx, 8(%eax)
+; X86-SSE1-NEXT:    movl %esi, 4(%eax)
+; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    popl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    popl %edi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    retl $4
+;
+; X86-SSE41-LABEL: merge_v4i32_i32_3210:
+; X86-SSE41:       # %bb.0:
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE41-NEXT:    pinsrd $1, 8(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrd $2, 4(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrd $3, (%eax), %xmm0
+; X86-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
+  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2
+  %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 1
+  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 0
+  %val0 = load i32, ptr %ptr0, align 4
+  %val1 = load i32, ptr %ptr1, align 4
+  %val2 = load i32, ptr %ptr2, align 4
+  %val3 = load i32, ptr %ptr3, align 4
+  %res0 = insertelement <4 x i32> poison, i32 %val0, i64 0
+  %res1 = insertelement <4 x i32> %res0, i32 %val1, i64 1
+  %res2 = insertelement <4 x i32> %res1, i32 %val2, i64 2
+  %res3 = insertelement <4 x i32> %res2, i32 %val3, i64 3
+  ret <4 x i32> %res3
+}
+
 define <8 x i16> @merge_8i16_i16_23u567u9(ptr %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_8i16_i16_23u567u9:
 ; SSE:       # %bb.0:
@@ -862,6 +1000,150 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss
   ret <8 x i16> %res7
 }
 
+define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_8i16_i16_76543210:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzwl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl 2(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movzwl 4(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl 6(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    movzwl 8(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl 10(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movzwl 12(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    movzwl 14(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: merge_8i16_i16_76543210:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzwl 14(%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrw $1, 12(%rdi), %xmm0
+; SSE41-NEXT:    pinsrw $2, 10(%rdi), %xmm0
+; SSE41-NEXT:    pinsrw $3, 8(%rdi), %xmm0
+; SSE41-NEXT:    pinsrw $4, 6(%rdi), %xmm0
+; SSE41-NEXT:    pinsrw $5, 4(%rdi), %xmm0
+; SSE41-NEXT:    pinsrw $6, 2(%rdi), %xmm0
+; SSE41-NEXT:    pinsrw $7, (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: merge_8i16_i16_76543210:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzwl 14(%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X86-SSE1-LABEL: merge_8i16_i16_76543210:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %ebp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    pushl %ebx
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT:    pushl %edi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE1-NEXT:    pushl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
+; X86-SSE1-NEXT:    .cfi_offset %esi, -20
+; X86-SSE1-NEXT:    .cfi_offset %edi, -16
+; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
+; X86-SSE1-NEXT:    .cfi_offset %ebp, -8
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movzwl 14(%eax), %ecx
+; X86-SSE1-NEXT:    movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; X86-SSE1-NEXT:    movzwl 12(%eax), %ecx
+; X86-SSE1-NEXT:    movw %cx, (%esp) # 2-byte Spill
+; X86-SSE1-NEXT:    movzwl 10(%eax), %esi
+; X86-SSE1-NEXT:    movzwl 8(%eax), %edi
+; X86-SSE1-NEXT:    movzwl 6(%eax), %ebx
+; X86-SSE1-NEXT:    movzwl 4(%eax), %ebp
+; X86-SSE1-NEXT:    movzwl (%eax), %ecx
+; X86-SSE1-NEXT:    movzwl 2(%eax), %edx
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movw %cx, 14(%eax)
+; X86-SSE1-NEXT:    movw %dx, 12(%eax)
+; X86-SSE1-NEXT:    movw %bp, 10(%eax)
+; X86-SSE1-NEXT:    movw %bx, 8(%eax)
+; X86-SSE1-NEXT:    movw %di, 6(%eax)
+; X86-SSE1-NEXT:    movw %si, 4(%eax)
+; X86-SSE1-NEXT:    movzwl (%esp), %ecx # 2-byte Folded Reload
+; X86-SSE1-NEXT:    movw %cx, 2(%eax)
+; X86-SSE1-NEXT:    movzwl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 2-byte Folded Reload
+; X86-SSE1-NEXT:    movw %cx, (%eax)
+; X86-SSE1-NEXT:    addl $4, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
+; X86-SSE1-NEXT:    popl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE1-NEXT:    popl %edi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT:    popl %ebx
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    popl %ebp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    retl $4
+;
+; X86-SSE41-LABEL: merge_8i16_i16_76543210:
+; X86-SSE41:       # %bb.0:
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movzwl 14(%eax), %ecx
+; X86-SSE41-NEXT:    movd %ecx, %xmm0
+; X86-SSE41-NEXT:    pinsrw $1, 12(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrw $2, 10(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrw $3, 8(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrw $4, 6(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrw $5, 4(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrw $6, 2(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrw $7, (%eax), %xmm0
+; X86-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 7
+  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 6
+  %ptr2 = getelementptr inbounds i16, ptr %ptr, i64 5
+  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 4
+  %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 3
+  %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 2
+  %ptr6 = getelementptr inbounds i16, ptr %ptr, i64 1
+  %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 0
+  %val0 = load i16, ptr %ptr0
+  %val1 = load i16, ptr %ptr1
+  %val2 = load i16, ptr %ptr2
+  %val3 = load i16, ptr %ptr3
+  %val4 = load i16, ptr %ptr4
+  %val5 = load i16, ptr %ptr5
+  %val6 = load i16, ptr %ptr6
+  %val7 = load i16, ptr %ptr7
+  %res0 = insertelement <8 x i16> poison, i16 %val0, i64 0
+  %res1 = insertelement <8 x i16> %res0, i16 %val1, i64 1
+  %res2 = insertelement <8 x i16> %res1, i16 %val2, i64 2
+  %res3 = insertelement <8 x i16> %res2, i16 %val3, i64 3
+  %res4 = insertelement <8 x i16> %res3, i16 %val4, i64 4
+  %res5 = insertelement <8 x i16> %res4, i16 %val5, i64 5
+  %res6 = insertelement <8 x i16> %res5, i16 %val6, i64 6
+  %res7 = insertelement <8 x i16> %res6, i16 %val7, i64 7
+  ret <8 x i16> %res7
+}
+
 define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
 ; SSE:       # %bb.0:
@@ -1056,6 +1338,244 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin
   ret <16 x i8> %resF
 }
 
+define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movzbl (%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl 1(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movzbl 2(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl 3(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    movzbl 4(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl 5(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movzbl 6(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl 7(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movzbl 8(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl 9(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movzbl 10(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl 11(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    movzbl 12(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzbl 13(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movzbl 14(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movzbl 15(%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movzbl 15(%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    pinsrb $1, 14(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $2, 13(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $3, 12(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $4, 11(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $5, 10(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $6, 9(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $7, 8(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $8, 7(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $9, 6(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $10, 5(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $11, 4(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $12, 3(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $13, 2(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $14, 1(%rdi), %xmm0
+; SSE41-NEXT:    pinsrb $15, (%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movzbl 15(%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpinsrb $1, 14(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $2, 13(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $3, 12(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $4, 11(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $5, 10(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $6, 9(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $7, 8(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $8, 7(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $9, 6(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $10, 5(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $11, 4(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $12, 3(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $13, 2(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $14, 1(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; X86-SSE1-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %ebx
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    pushl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT:    subl $12, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
+; X86-SSE1-NEXT:    .cfi_offset %esi, -12
+; X86-SSE1-NEXT:    .cfi_offset %ebx, -8
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE1-NEXT:    movzbl 15(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 14(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 13(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 12(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 11(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 10(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 9(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 8(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 7(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movzbl 6(%esi), %ecx
+; X86-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT:    movb 5(%esi), %bh
+; X86-SSE1-NEXT:    movb 4(%esi), %bl
+; X86-SSE1-NEXT:    movb 3(%esi), %dh
+; X86-SSE1-NEXT:    movb 2(%esi), %ch
+; X86-SSE1-NEXT:    movb (%esi), %cl
+; X86-SSE1-NEXT:    movb 1(%esi), %dl
+; X86-SSE1-NEXT:    movb %cl, 15(%eax)
+; X86-SSE1-NEXT:    movb %dl, 14(%eax)
+; X86-SSE1-NEXT:    movb %ch, 13(%eax)
+; X86-SSE1-NEXT:    movb %dh, 12(%eax)
+; X86-SSE1-NEXT:    movb %bl, 11(%eax)
+; X86-SSE1-NEXT:    movb %bh, 10(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 9(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 8(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 7(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 6(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 5(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 4(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 3(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 2(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, 1(%eax)
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT:    movb %cl, (%eax)
+; X86-SSE1-NEXT:    addl $12, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT:    popl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    popl %ebx
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    retl $4
+;
+; X86-SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; X86-SSE41:       # %bb.0:
+; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT:    movzbl 15(%eax), %ecx
+; X86-SSE41-NEXT:    movd %ecx, %xmm0
+; X86-SSE41-NEXT:    pinsrb $1, 14(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $2, 13(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $3, 12(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $4, 11(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $5, 10(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $6, 9(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $7, 8(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $8, 7(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $9, 6(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $10, 5(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $11, 4(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $12, 3(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $13, 2(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $14, 1(%eax), %xmm0
+; X86-SSE41-NEXT:    pinsrb $15, (%eax), %xmm0
+; X86-SSE41-NEXT:    retl
+  %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 15
+  %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 14
+  %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 13
+  %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 12
+  %ptr4 = getelementptr inbounds i8, ptr %ptr, i64 11
+  %ptr5 = getelementptr inbounds i8, ptr %ptr, i64 10
+  %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 9
+  %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 8
+  %ptr8 = getelementptr inbounds i8, ptr %ptr, i64 7
+  %ptr9 = getelementptr inbounds i8, ptr %ptr, i64 6
+  %ptrA = getelementptr inbounds i8, ptr %ptr, i64 5
+  %ptrB = getelementptr inbounds i8, ptr %ptr, i64 4
+  %ptrC = getelementptr inbounds i8, ptr %ptr, i64 3
+  %ptrD = getelementptr inbounds i8, ptr %ptr, i64 2
+  %ptrE = getelementptr inbounds i8, ptr %ptr, i64 1
+  %ptrF = getelementptr inbounds i8, ptr %ptr, i64 0
+  %val0 = load i8, ptr %ptr0
+  %val1 = load i8, ptr %ptr1
+  %val2 = load i8, ptr %ptr2
+  %val3 = load i8, ptr %ptr3
+  %val4 = load i8, ptr %ptr4
+  %val5 = load i8, ptr %ptr5
+  %val6 = load i8, ptr %ptr6
+  %val7 = load i8, ptr %ptr7
+  %val8 = load i8, ptr %ptr8
+  %val9 = load i8, ptr %ptr9
+  %valA = load i8, ptr %ptrA
+  %valB = load i8, ptr %ptrB
+  %valC = load i8, ptr %ptrC
+  %valD = load i8, ptr %ptrD
+  %valE = load i8, ptr %ptrE
+  %valF = load i8, ptr %ptrF
+  %res0 = insertelement <16 x i8> poison, i8 %val0, i8 0
+  %res1 = insertelement <16 x i8> %res0, i8 %val1, i64 1
+  %res2 = insertelement <16 x i8> %res1, i8 %val2, i64 2
+  %res3 = insertelement <16 x i8> %res2, i8 %val3, i64 3
+  %res4 = insertelement <16 x i8> %res3, i8 %val4, i64 4
+  %res5 = insertelement <16 x i8> %res4, i8 %val5, i64 5
+  %res6 = insertelement <16 x i8> %res5, i8 %val6, i64 6
+  %res7 = insertelement <16 x i8> %res6, i8 %val7, i64 7
+  %res8 = insertelement <16 x i8> %res7, i8 %val8, i64 8
+  %res9 = insertelement <16 x i8> %res8, i8 %val9, i64 9
+  %resA = insertelement <16 x i8> %res9, i8 %valA, i64 10
+  %resB = insertelement <16 x i8> %resA, i8 %valB, i64 11
+  %resC = insertelement <16 x i8> %resB, i8 %valC, i64 12
+  %resD = insertelement <16 x i8> %resC, i8 %valD, i64 13
+  %resE = insertelement <16 x i8> %resD, i8 %valE, i64 14
+  %resF = insertelement <16 x i8> %resE, i8 %valF, i64 15
+  ret <16 x i8> %resF
+}
+
 define void @merge_4i32_i32_combine(ptr %dst, ptr %src) {
 ; SSE-LABEL: merge_4i32_i32_combine:
 ; SSE:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
index 33e8d62c00a4c..e5e99e17053a0 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -126,6 +126,40 @@ define <4 x double> @merge_4f64_f64_45zz(ptr %ptr) nounwind uwtable noinline ssp
   ret <4 x double> %res1
 }
 
+define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_v4f64_f64_3210:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    retq
+;
+; X86-AVX-LABEL: merge_v4f64_f64_3210:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    retl
+  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 3
+  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2
+  %ptr2 = getelementptr inbounds double, ptr %ptr, i64 1
+  %ptr3 = getelementptr inbounds double, ptr %ptr, i64 0
+  %val0 = load double, ptr %ptr0, align 4
+  %val1 = load double, ptr %ptr1, align 4
+  %val2 = load double, ptr %ptr2, align 4
+  %val3 = load double, ptr %ptr3, align 4
+  %res0 = insertelement <4 x double> poison, double %val0, i64 0
+  %res1 = insertelement <4 x double> %res0, double %val1, i64 1
+  %res2 = insertelement <4 x double> %res1, double %val2, i64 2
+  %res3 = insertelement <4 x double> %res2, double %val3, i64 3
+  ret <4 x double> %res3
+}
+
 define <4 x double> @merge_4f64_f64_34z6(ptr %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_4f64_f64_34z6:
 ; AVX:       # %bb.0:
@@ -234,6 +268,46 @@ define <4 x i64> @merge_4i64_i64_23zz(ptr %ptr) nounwind uwtable noinline ssp {
   ret <4 x i64> %res1
 }
 
+define <4 x i64> @merge_v4i64_i64_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_v4i64_i64_3210:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    retq
+;
+; X86-AVX-LABEL: merge_v4i64_i64_3210:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpinsrd $1, 12(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpinsrd $1, 28(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrd $2, 16(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrd $3, 20(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    retl
+  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 3
+  %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2
+  %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 1
+  %ptr3 = getelementptr inbounds i64, ptr %ptr, i64 0
+  %val0 = load i64, ptr %ptr0, align 4
+  %val1 = load i64, ptr %ptr1, align 4
+  %val2 = load i64, ptr %ptr2, align 4
+  %val3 = load i64, ptr %ptr3, align 4
+  %res0 = insertelement <4 x i64> poison, i64 %val0, i64 0
+  %res1 = insertelement <4 x i64> %res0, i64 %val1, i64 1
+  %res2 = insertelement <4 x i64> %res1, i64 %val2, i64 2
+  %res3 = insertelement <4 x i64> %res2, i64 %val3, i64 3
+  ret <4 x i64> %res3
+}
+
 define <8 x float> @merge_8f32_2f32_23z5(ptr %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_8f32_2f32_23z5:
 ; AVX:       # %bb.0:
@@ -335,6 +409,60 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline
   ret <8 x float> %res7
 }
 
+define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX-LABEL: merge_8f32_f32_76543210:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    retq
+;
+; X86-AVX-LABEL: merge_8f32_f32_76543210:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; X86-AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    retl
+  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 7
+  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 6
+  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 5
+  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 4
+  %ptr4 = getelementptr inbounds float, ptr %ptr, i64 3
+  %ptr5 = getelementptr inbounds float, ptr %ptr, i64 2
+  %ptr6 = getelementptr inbounds float, ptr %ptr, i64 1
+  %ptr7 = getelementptr inbounds float, ptr %ptr, i64 0
+  %val0 = load float, ptr %ptr0
+  %val1 = load float, ptr %ptr1
+  %val2 = load float, ptr %ptr2
+  %val3 = load float, ptr %ptr3
+  %val4 = load float, ptr %ptr4
+  %val5 = load float, ptr %ptr5
+  %val6 = load float, ptr %ptr6
+  %val7 = load float, ptr %ptr7
+  %res0 = insertelement <8 x float> poison, float %val0, i64 0
+  %res1 = insertelement <8 x float> %res0, float %val1, i64 1
+  %res2 = insertelement <8 x float> %res1, float %val2, i64 2
+  %res3 = insertelement <8 x float> %res2, float %val3, i64 3
+  %res4 = insertelement <8 x float> %res3, float %val4, i64 4
+  %res5 = insertelement <8 x float> %res4, float %val5, i64 5
+  %res6 = insertelement <8 x float> %res5, float %val6, i64 6
+  %res7 = insertelement <8 x float> %res6, float %val7, i64 7
+  ret <8 x float> %res7
+}
+
 define <8 x i32> @merge_8i32_4i32_z3(ptr %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_8i32_4i32_z3:
 ; AVX:       # %bb.0:
@@ -414,6 +542,86 @@ define <8 x i32> @merge_8i32_i32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss
   ret <8 x i32> %res7
 }
 
+define <8 x i32> @merge_8i32_i32_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8i32_i32_76543210:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: merge_8i32_i32_76543210:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: merge_8i32_i32_76543210:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; X86-AVX-LABEL: merge_8i32_i32_76543210:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpinsrd $1, 8(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrd $2, 4(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrd $3, (%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    vpinsrd $1, 24(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrd $2, 20(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrd $3, 16(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 7
+  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 6
+  %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 5
+  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 4
+  %ptr4 = getelementptr inbounds i32, ptr %ptr, i64 3
+  %ptr5 = getelementptr inbounds i32, ptr %ptr, i64 2
+  %ptr6 = getelementptr inbounds i32, ptr %ptr, i64 1
+  %ptr7 = getelementptr inbounds i32, ptr %ptr, i64 0
+  %val0 = load i32, ptr %ptr0
+  %val1 = load i32, ptr %ptr1
+  %val2 = load i32, ptr %ptr2
+  %val3 = load i32, ptr %ptr3
+  %val4 = load i32, ptr %ptr4
+  %val5 = load i32, ptr %ptr5
+  %val6 = load i32, ptr %ptr6
+  %val7 = load i32, ptr %ptr7
+  %res0 = insertelement <8 x i32> poison, i32 %val0, i64 0
+  %res1 = insertelement <8 x i32> %res0, i32 %val1, i64 1
+  %res2 = insertelement <8 x i32> %res1, i32 %val2, i64 2
+  %res3 = insertelement <8 x i32> %res2, i32 %val3, i64 3
+  %res4 = insertelement <8 x i32> %res3, i32 %val4, i64 4
+  %res5 = insertelement <8 x i32> %res4, i32 %val5, i64 5
+  %res6 = insertelement <8 x i32> %res5, i32 %val6, i64 6
+  %res7 = insertelement <8 x i32> %res6, i32 %val7, i64 7
+  ret <8 x i32> %res7
+}
+
 define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
 ; AVX:       # %bb.0:
@@ -522,6 +730,150 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n
   ret <16 x i16> %resF
 }
 
+define <16 x i16> @merge_16i16_i16_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl 14(%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    movzwl 30(%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    vpinsrw $1, 28(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $2, 26(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $3, 24(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $4, 22(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $5, 20(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $6, 18(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $7, 16(%rdi), %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl 14(%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    movzwl 30(%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpinsrw $1, 28(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $2, 26(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $3, 24(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $4, 22(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $5, 20(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $6, 18(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $7, 16(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movzwl 14(%rdi), %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm0
+; AVX512F-NEXT:    vpinsrw $1, 12(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $2, 10(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $3, 8(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $4, 6(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $5, 4(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $6, 2(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    movzwl 30(%rdi), %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm1
+; AVX512F-NEXT:    vpinsrw $1, 28(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrw $2, 26(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrw $3, 24(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrw $4, 22(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrw $5, 20(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrw $6, 18(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vpinsrw $7, 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; X86-AVX-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    movzwl 14(%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm0
+; X86-AVX-NEXT:    vpinsrw $1, 12(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrw $2, 10(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrw $3, 8(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrw $4, 6(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrw $5, 4(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrw $6, 2(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    vpinsrw $7, (%eax), %xmm0, %xmm0
+; X86-AVX-NEXT:    movzwl 30(%eax), %ecx
+; X86-AVX-NEXT:    vmovd %ecx, %xmm1
+; X86-AVX-NEXT:    vpinsrw $1, 28(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrw $2, 26(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrw $3, 24(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrw $4, 22(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrw $5, 20(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrw $6, 18(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vpinsrw $7, 16(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT:    retl
+  %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 15
+  %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 14
+  %ptr2 = getelementptr inbounds i16, ptr %ptr, i64 13
+  %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 12
+  %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 11
+  %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 10
+  %ptr6 = getelementptr inbounds i16, ptr %ptr, i64 9
+  %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 8
+  %ptr8 = getelementptr inbounds i16, ptr %ptr, i64 7
+  %ptr9 = getelementptr inbounds i16, ptr %ptr, i64 6
+  %ptrA = getelementptr inbounds i16, ptr %ptr, i64 5
+  %ptrB = getelementptr inbounds i16, ptr %ptr, i64 4
+  %ptrC = getelementptr inbounds i16, ptr %ptr, i64 3
+  %ptrD = getelementptr inbounds i16, ptr %ptr, i64 2
+  %ptrE = getelementptr inbounds i16, ptr %ptr, i64 1
+  %ptrF = getelementptr inbounds i16, ptr %ptr, i64 0
+  %val0 = load i16, ptr %ptr0
+  %val1 = load i16, ptr %ptr1
+  %val2 = load i16, ptr %ptr2
+  %val3 = load i16, ptr %ptr3
+  %val4 = load i16, ptr %ptr4
+  %val5 = load i16, ptr %ptr5
+  %val6 = load i16, ptr %ptr6
+  %val7 = load i16, ptr %ptr7
+  %val8 = load i16, ptr %ptr8
+  %val9 = load i16, ptr %ptr9
+  %valA = load i16, ptr %ptrA
+  %valB = load i16, ptr %ptrB
+  %valC = load i16, ptr %ptrC
+  %valD = load i16, ptr %ptrD
+  %valE = load i16, ptr %ptrE
+  %valF = load i16, ptr %ptrF
+  %res0 = insertelement <16 x i16> poison, i16 %val0, i64 0
+  %res1 = insertelement <16 x i16> %res0, i16 %val1, i64 1
+  %res2 = insertelement <16 x i16> %res1, i16 %val2, i64 2
+  %res3 = insertelement <16 x i16> %res2, i16 %val3, i64 3
+  %res4 = insertelement <16 x i16> %res3, i16 %val4, i64 4
+  %res5 = insertelement <16 x i16> %res4, i16 %val5, i64 5
+  %res6 = insertelement <16 x i16> %res5, i16 %val6, i64 6
+  %res7 = insertelement <16 x i16> %res6, i16 %val7, i64 7
+  %res8 = insertelement <16 x i16> %res7, i16 %val8, i64 8
+  %res9 = insertelement <16 x i16> %res8, i16 %val9, i64 9
+  %resA = insertelement <16 x i16> %res9, i16 %valA, i64 10
+  %resB = insertelement <16 x i16> %resA, i16 %valB, i64 11
+  %resC = insertelement <16 x i16> %resB, i16 %valC, i64 12
+  %resD = insertelement <16 x i16> %resC, i16 %valD, i64 13
+  %resE = insertelement <16 x i16> %resD, i16 %valE, i64 14
+  %resF = insertelement <16 x i16> %resE, i16 %valF, i64 15
+  ret <16 x i16> %resF
+}
+
 define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
index 790bed4188efe..fabca0ea5007e 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -148,6 +148,64 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline
   ret <8 x double> %res7
 }
 
+define <8 x double> @merge_8f64_f64_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_76543210:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; ALL-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1]
+; ALL-NEXT:    vmovhps {{.*#+}} xmm2 = xmm3[0,1],mem[0,1]
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+;
+; X86-AVX512F-LABEL: merge_8f64_f64_76543210:
+; X86-AVX512F:       # %bb.0:
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    retl
+  %ptr0 = getelementptr inbounds double, ptr %ptr, i64 7
+  %ptr1 = getelementptr inbounds double, ptr %ptr, i64 6
+  %ptr2 = getelementptr inbounds double, ptr %ptr, i64 5
+  %ptr3 = getelementptr inbounds double, ptr %ptr, i64 4
+  %ptr4 = getelementptr inbounds double, ptr %ptr, i64 3
+  %ptr5 = getelementptr inbounds double, ptr %ptr, i64 2
+  %ptr6 = getelementptr inbounds double, ptr %ptr, i64 1
+  %ptr7 = getelementptr inbounds double, ptr %ptr, i64 0
+  %val0 = load double, ptr %ptr0
+  %val1 = load double, ptr %ptr1
+  %val2 = load double, ptr %ptr2
+  %val3 = load double, ptr %ptr3
+  %val4 = load double, ptr %ptr4
+  %val5 = load double, ptr %ptr5
+  %val6 = load double, ptr %ptr6
+  %val7 = load double, ptr %ptr7
+  %res0 = insertelement <8 x double> poison, double %val0, i64 0
+  %res1 = insertelement <8 x double> %res0, double %val1, i64 1
+  %res2 = insertelement <8 x double> %res1, double %val2, i64 2
+  %res3 = insertelement <8 x double> %res2, double %val3, i64 3
+  %res4 = insertelement <8 x double> %res3, double %val4, i64 4
+  %res5 = insertelement <8 x double> %res4, double %val5, i64 5
+  %res6 = insertelement <8 x double> %res5, double %val6, i64 6
+  %res7 = insertelement <8 x double> %res6, double %val7, i64 7
+  ret <8 x double> %res7
+}
+
 define <8 x i64> @merge_8i64_4i64_z3(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8i64_4i64_z3:
 ; ALL:       # %bb.0:
@@ -227,6 +285,76 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss
   ret <8 x i64> %res7
 }
 
+define <8 x i64> @merge_8i64_i64_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_i64_76543210:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; ALL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+;
+; X86-AVX512F-LABEL: merge_8i64_i64_76543210:
+; X86-AVX512F:       # %bb.0:
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 12(%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT:    vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 28(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $2, 16(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $3, 20(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 44(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $2, 32(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $3, 36(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 60(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT:    vpinsrd $2, 48(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT:    vpinsrd $3, 52(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    retl
+  %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 7
+  %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 6
+  %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 5
+  %ptr3 = getelementptr inbounds i64, ptr %ptr, i64 4
+  %ptr4 = getelementptr inbounds i64, ptr %ptr, i64 3
+  %ptr5 = getelementptr inbounds i64, ptr %ptr, i64 2
+  %ptr6 = getelementptr inbounds i64, ptr %ptr, i64 1
+  %ptr7 = getelementptr inbounds i64, ptr %ptr, i64 0
+  %val0 = load i64, ptr %ptr0
+  %val1 = load i64, ptr %ptr1
+  %val2 = load i64, ptr %ptr2
+  %val3 = load i64, ptr %ptr3
+  %val4 = load i64, ptr %ptr4
+  %val5 = load i64, ptr %ptr5
+  %val6 = load i64, ptr %ptr6
+  %val7 = load i64, ptr %ptr7
+  %res0 = insertelement <8 x i64> poison, i64 %val0, i64 0
+  %res1 = insertelement <8 x i64> %res0, i64 %val1, i64 1
+  %res2 = insertelement <8 x i64> %res1, i64 %val2, i64 2
+  %res3 = insertelement <8 x i64> %res2, i64 %val3, i64 3
+  %res4 = insertelement <8 x i64> %res3, i64 %val4, i64 4
+  %res5 = insertelement <8 x i64> %res4, i64 %val5, i64 5
+  %res6 = insertelement <8 x i64> %res5, i64 %val6, i64 6
+  %res7 = insertelement <8 x i64> %res6, i64 %val7, i64 7
+  ret <8 x i64> %res7
+}
+
 define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
 ; ALL:       # %bb.0:
@@ -335,6 +463,104 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable
   ret <16 x float> %resF
 }
 
+define <16 x float> @merge_16f32_f32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_FEDCBA9876543210:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+;
+; X86-AVX512F-LABEL: merge_16f32_f32_FEDCBA9876543210:
+; X86-AVX512F:       # %bb.0:
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; X86-AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; X86-AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    retl
+  %ptr0 = getelementptr inbounds float, ptr %ptr, i64 15
+  %ptr1 = getelementptr inbounds float, ptr %ptr, i64 14
+  %ptr2 = getelementptr inbounds float, ptr %ptr, i64 13
+  %ptr3 = getelementptr inbounds float, ptr %ptr, i64 12
+  %ptr4 = getelementptr inbounds float, ptr %ptr, i64 11
+  %ptr5 = getelementptr inbounds float, ptr %ptr, i64 10
+  %ptr6 = getelementptr inbounds float, ptr %ptr, i64 9
+  %ptr7 = getelementptr inbounds float, ptr %ptr, i64 8
+  %ptr8 = getelementptr inbounds float, ptr %ptr, i64 7
+  %ptr9 = getelementptr inbounds float, ptr %ptr, i64 6
+  %ptrA = getelementptr inbounds float, ptr %ptr, i64 5
+  %ptrB = getelementptr inbounds float, ptr %ptr, i64 4
+  %ptrC = getelementptr inbounds float, ptr %ptr, i64 3
+  %ptrD = getelementptr inbounds float, ptr %ptr, i64 2
+  %ptrE = getelementptr inbounds float, ptr %ptr, i64 1
+  %ptrF = getelementptr inbounds float, ptr %ptr, i64 0
+  %val0 = load float, ptr %ptr0
+  %val1 = load float, ptr %ptr1
+  %val2 = load float, ptr %ptr2
+  %val3 = load float, ptr %ptr3
+  %val4 = load float, ptr %ptr4
+  %val5 = load float, ptr %ptr5
+  %val6 = load float, ptr %ptr6
+  %val7 = load float, ptr %ptr7
+  %val8 = load float, ptr %ptr8
+  %val9 = load float, ptr %ptr9
+  %valA = load float, ptr %ptrA
+  %valB = load float, ptr %ptrB
+  %valC = load float, ptr %ptrC
+  %valD = load float, ptr %ptrD
+  %valE = load float, ptr %ptrE
+  %valF = load float, ptr %ptrF
+  %res0 = insertelement <16 x float> poison, float %val0, i64 0
+  %res1 = insertelement <16 x float> %res0, float %val1, i64 1
+  %res2 = insertelement <16 x float> %res1, float %val2, i64 2
+  %res3 = insertelement <16 x float> %res2, float %val3, i64 3
+  %res4 = insertelement <16 x float> %res3, float %val4, i64 4
+  %res5 = insertelement <16 x float> %res4, float %val5, i64 5
+  %res6 = insertelement <16 x float> %res5, float %val6, i64 6
+  %res7 = insertelement <16 x float> %res6, float %val7, i64 7
+  %res8 = insertelement <16 x float> %res7, float %val8, i64 8
+  %res9 = insertelement <16 x float> %res8, float %val9, i64 9
+  %resA = insertelement <16 x float> %res9, float %valA, i64 10
+  %resB = insertelement <16 x float> %resA, float %valB, i64 11
+  %resC = insertelement <16 x float> %resB, float %valC, i64 12
+  %resD = insertelement <16 x float> %resC, float %valD, i64 13
+  %resE = insertelement <16 x float> %resD, float %valE, i64 14
+  %resF = insertelement <16 x float> %resE, float %valF, i64 15
+  ret <16 x float> %resF
+}
+
 define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
 ; ALL:       # %bb.0:
@@ -443,6 +669,104 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n
   ret <16 x i32> %resF
 }
 
+define <16 x i32> @merge_16i32_i32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_FEDCBA9876543210:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT:    vpinsrd $1, 8(%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vpinsrd $2, 4(%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vpinsrd $3, (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT:    vpinsrd $1, 24(%rdi), %xmm1, %xmm1
+; ALL-NEXT:    vpinsrd $2, 20(%rdi), %xmm1, %xmm1
+; ALL-NEXT:    vpinsrd $3, 16(%rdi), %xmm1, %xmm1
+; ALL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT:    vpinsrd $1, 40(%rdi), %xmm1, %xmm1
+; ALL-NEXT:    vpinsrd $2, 36(%rdi), %xmm1, %xmm1
+; ALL-NEXT:    vpinsrd $3, 32(%rdi), %xmm1, %xmm1
+; ALL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; ALL-NEXT:    vpinsrd $1, 56(%rdi), %xmm2, %xmm2
+; ALL-NEXT:    vpinsrd $2, 52(%rdi), %xmm2, %xmm2
+; ALL-NEXT:    vpinsrd $3, 48(%rdi), %xmm2, %xmm2
+; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; ALL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT:    retq
+;
+; X86-AVX512F-LABEL: merge_16i32_i32_FEDCBA9876543210:
+; X86-AVX512F:       # %bb.0:
+; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 8(%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT:    vpinsrd $2, 4(%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT:    vpinsrd $3, (%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 24(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $2, 20(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $3, 16(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 40(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $2, 36(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vpinsrd $3, 32(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT:    vpinsrd $1, 56(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT:    vpinsrd $2, 52(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT:    vpinsrd $3, 48(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT:    retl
+  %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 15
+  %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 14
+  %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 13
+  %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 12
+  %ptr4 = getelementptr inbounds i32, ptr %ptr, i64 11
+  %ptr5 = getelementptr inbounds i32, ptr %ptr, i64 10
+  %ptr6 = getelementptr inbounds i32, ptr %ptr, i64 9
+  %ptr7 = getelementptr inbounds i32, ptr %ptr, i64 8
+  %ptr8 = getelementptr inbounds i32, ptr %ptr, i64 7
+  %ptr9 = getelementptr inbounds i32, ptr %ptr, i64 6
+  %ptrA = getelementptr inbounds i32, ptr %ptr, i64 5
+  %ptrB = getelementptr inbounds i32, ptr %ptr, i64 4
+  %ptrC = getelementptr inbounds i32, ptr %ptr, i64 3
+  %ptrD = getelementptr inbounds i32, ptr %ptr, i64 2
+  %ptrE = getelementptr inbounds i32, ptr %ptr, i64 1
+  %ptrF = getelementptr inbounds i32, ptr %ptr, i64 0
+  %val0 = load i32, ptr %ptr0
+  %val1 = load i32, ptr %ptr1
+  %val2 = load i32, ptr %ptr2
+  %val3 = load i32, ptr %ptr3
+  %val4 = load i32, ptr %ptr4
+  %val5 = load i32, ptr %ptr5
+  %val6 = load i32, ptr %ptr6
+  %val7 = load i32, ptr %ptr7
+  %val8 = load i32, ptr %ptr8
+  %val9 = load i32, ptr %ptr9
+  %valA = load i32, ptr %ptrA
+  %valB = load i32, ptr %ptrB
+  %valC = load i32, ptr %ptrC
+  %valD = load i32, ptr %ptrD
+  %valE = load i32, ptr %ptrE
+  %valF = load i32, ptr %ptrF
+  %res0 = insertelement <16 x i32> poison, i32 %val0, i64 0
+  %res1 = insertelement <16 x i32> %res0, i32 %val1, i64 1
+  %res2 = insertelement <16 x i32> %res1, i32 %val2, i64 2
+  %res3 = insertelement <16 x i32> %res2, i32 %val3, i64 3
+  %res4 = insertelement <16 x i32> %res3, i32 %val4, i64 4
+  %res5 = insertelement <16 x i32> %res4, i32 %val5, i64 5
+  %res6 = insertelement <16 x i32> %res5, i32 %val6, i64 6
+  %res7 = insertelement <16 x i32> %res6, i32 %val7, i64 7
+  %res8 = insertelement <16 x i32> %res7, i32 %val8, i64 8
+  %res9 = insertelement <16 x i32> %res8, i32 %val9, i64 9
+  %resA = insertelement <16 x i32> %res9, i32 %valA, i64 10
+  %resB = insertelement <16 x i32> %resA, i32 %valB, i64 11
+  %resC = insertelement <16 x i32> %resB, i32 %valC, i64 12
+  %resD = insertelement <16 x i32> %resC, i32 %valD, i64 13
+  %resE = insertelement <16 x i32> %resD, i32 %valE, i64 14
+  %resF = insertelement <16 x i32> %resE, i32 %valF, i64 15
+  ret <16 x i32> %resF
+}
+
 define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(ptr %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
 ; ALL:       # %bb.0: