@@ -858,7 +858,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
@@ -879,7 +879,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
@@ -900,7 +900,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
@@ -921,7 +921,7 @@ define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
-; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0))
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
@@ -1412,7 +1412,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
@@ -1440,7 +1440,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
@@ -1468,7 +1468,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
@@ -1496,7 +1496,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
@@ -2436,7 +2436,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2))
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
@@ -2483,7 +2483,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2))
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
@@ -2530,7 +2530,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2))
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
@@ -2577,7 +2577,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm5 & (zmm8 ^ zmm2))
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]