[X86] Add isel patterns for X86VBroadcast with i16 truncates from i16->i64 zextload/extload.

We can form vpbroadcastw with a folded load.

We had patterns for i16->i32 zextload/extload, but nothing prevents
i64 from occurring.

I'd like to move this all to DAG combine to fix more cases, but
this is a trivial fix to minimize test diffs when moving to a combine.
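
For context, here is a minimal IR sketch of the shape these patterns catch. It
is a reduced form of the insert_dup_mem_v8i16_sext_i16_i64 test updated below;
the function name is made up for illustration:

    define <8 x i16> @splat_low_word_of_i64(i16* %p) {
      %w = load i16, i16* %p, align 2
      %e = sext i16 %w to i64
      %v = insertelement <2 x i64> zeroinitializer, i64 %e, i32 0
      %b = bitcast <2 x i64> %v to <8 x i16>
      ; splat lane 0, i.e. the low 16 bits of the loaded i64
      %s = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
      ret <8 x i16> %s
    }

Only the low 16 bits of the extended value survive the shuffle, so SelectionDAG
turns the sext into an i16->i64 extload/zextload feeding an i16 truncate, which
the patterns below now fold into a single "vpbroadcastw (%rdi), %xmm0".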
topperc committed Mar 13, 2020
1 parent 51a4c61 commit 09c8f38
Showing 5 changed files with 79 additions and 115 deletions.
llvm/lib/Target/X86/X86InstrAVX512.td (26 additions, 6 deletions)
@@ -1427,26 +1427,46 @@ let Predicates = [HasVLX, HasBWI] in {
   // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
   // This means we'll encounter truncated i32 loads; match that here.
   def : Pat<(v8i16 (X86VBroadcast
-                    (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+                    (i16 (trunc (extloadi32i16 addr:$src))))),
             (VPBROADCASTWZ128rm addr:$src)>;
   def : Pat<(v8i16 (X86VBroadcast
-                    (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+                    (i16 (trunc (zextloadi32i16 addr:$src))))),
             (VPBROADCASTWZ128rm addr:$src)>;
   def : Pat<(v16i16 (X86VBroadcast
-                     (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+                     (i16 (trunc (extloadi32i16 addr:$src))))),
             (VPBROADCASTWZ256rm addr:$src)>;
   def : Pat<(v16i16 (X86VBroadcast
-                     (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+                     (i16 (trunc (zextloadi32i16 addr:$src))))),
             (VPBROADCASTWZ256rm addr:$src)>;
+
+  def : Pat<(v8i16 (X86VBroadcast
+                    (i16 (trunc (extloadi64i16 addr:$src))))),
+            (VPBROADCASTWZ128rm addr:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast
+                    (i16 (trunc (zextloadi64i16 addr:$src))))),
+            (VPBROADCASTWZ128rm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast
+                     (i16 (trunc (extloadi64i16 addr:$src))))),
+            (VPBROADCASTWZ256rm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast
+                     (i16 (trunc (zextloadi64i16 addr:$src))))),
+            (VPBROADCASTWZ256rm addr:$src)>;
 }
 let Predicates = [HasBWI] in {
   // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
   // This means we'll encounter truncated i32 loads; match that here.
   def : Pat<(v32i16 (X86VBroadcast
-                     (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+                     (i16 (trunc (extloadi32i16 addr:$src))))),
             (VPBROADCASTWZrm addr:$src)>;
   def : Pat<(v32i16 (X86VBroadcast
-                     (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+                     (i16 (trunc (zextloadi32i16 addr:$src))))),
             (VPBROADCASTWZrm addr:$src)>;
+
+  def : Pat<(v32i16 (X86VBroadcast
+                     (i16 (trunc (extloadi64i16 addr:$src))))),
+            (VPBROADCASTWZrm addr:$src)>;
+  def : Pat<(v32i16 (X86VBroadcast
+                     (i16 (trunc (zextloadi64i16 addr:$src))))),
+            (VPBROADCASTWZrm addr:$src)>;
 }

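(For reference: the extload/zextload fragments used above are the standard X86
scalar-load PatFrags. They are defined in the backend roughly as in this
sketch, reproduced here only so the patterns read on their own:

    // A 16-bit load whose result is any-extended / zero-extended to i64.
    def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
    def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;

The i32 variants match the same 16-bit loads extended to i32 instead.)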
llvm/lib/Target/X86/X86InstrSSE.td (17 additions, 4 deletions)
@@ -7518,16 +7518,29 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
   // This means we'll encounter truncated i32 loads; match that here.
   def : Pat<(v8i16 (X86VBroadcast
-                    (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+                    (i16 (trunc (extloadi32i16 addr:$src))))),
             (VPBROADCASTWrm addr:$src)>;
   def : Pat<(v8i16 (X86VBroadcast
-                    (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+                    (i16 (trunc (zextloadi32i16 addr:$src))))),
             (VPBROADCASTWrm addr:$src)>;
   def : Pat<(v16i16 (X86VBroadcast
-                     (i16 (trunc (i32 (extloadi16 addr:$src)))))),
+                     (i16 (trunc (extloadi32i16 addr:$src))))),
             (VPBROADCASTWYrm addr:$src)>;
   def : Pat<(v16i16 (X86VBroadcast
-                     (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
+                     (i16 (trunc (zextloadi32i16 addr:$src))))),
             (VPBROADCASTWYrm addr:$src)>;
+
+  def : Pat<(v8i16 (X86VBroadcast
+                    (i16 (trunc (extloadi64i16 addr:$src))))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast
+                    (i16 (trunc (zextloadi64i16 addr:$src))))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast
+                     (i16 (trunc (extloadi64i16 addr:$src))))),
+            (VPBROADCASTWYrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast
+                     (i16 (trunc (zextloadi64i16 addr:$src))))),
+            (VPBROADCASTWYrm addr:$src)>;
 }

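In terms of generated code, the change collapses the scalar load-then-broadcast
sequence into one instruction with a folded load, as every test update below
shows; illustrative AVX2 output for the xmm case:

    # before: go through a GPR and an xmm register
    movzwl 6(%rdi), %eax
    vmovd %eax, %xmm0
    vpbroadcastw %xmm0, %xmm0
    # after: broadcast directly from memory
    vpbroadcastw 6(%rdi), %xmm0

The ymm and zmm variants fold the load the same way.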
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll (15 additions, 45 deletions)
@@ -3331,18 +3331,10 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl 6(%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %xmm0
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
 ; XOPAVX1:       # %bb.0:
@@ -3353,9 +3345,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
 ;
 ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
-; XOPAVX2-NEXT:    vmovd %eax, %xmm0
-; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %xmm0
 ; XOPAVX2-NEXT:    retq
   %tmp = load i64, i64* %ptr, align 4
   %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -3392,18 +3382,10 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl 6(%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %xmm0
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: insert_dup_elt7_mem_v8i16_i64:
 ; XOPAVX1:       # %bb.0:
@@ -3414,9 +3396,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
 ;
 ; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
-; XOPAVX2-NEXT:    vmovd %eax, %xmm0
-; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %xmm0
 ; XOPAVX2-NEXT:    retq
   %tmp = load i64, i64* %ptr, align 4
   %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -3442,18 +3422,10 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movzwl (%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
-; AVX512VL-NEXT:    retq
+; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw (%rdi), %xmm0
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
 ; XOPAVX1:       # %bb.0:
@@ -3465,9 +3437,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
 ;
 ; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    movzwl (%rdi), %eax
-; XOPAVX2-NEXT:    vmovd %eax, %xmm0
-; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
 ; XOPAVX2-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i64
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll (15 additions, 45 deletions)
@@ -7546,18 +7546,10 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl 6(%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
-; AVX512VL-NEXT:    retq
+; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i64:
 ; XOPAVX1:       # %bb.0:
@@ -7569,9 +7561,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
 ;
 ; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
-; XOPAVX2-NEXT:    vmovd %eax, %xmm0
-; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %ymm0
 ; XOPAVX2-NEXT:    retq
   %tmp = load i64, i64* %ptr, align 4
   %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -7588,18 +7578,10 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl 6(%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
-; AVX512VL-NEXT:    retq
+; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: insert_dup_elt7_mem_v16i16_i64:
 ; XOPAVX1:       # %bb.0:
@@ -7610,9 +7592,7 @@ define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
 ;
 ; XOPAVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
-; XOPAVX2-NEXT:    vmovd %eax, %xmm0
-; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT:    vpbroadcastw 6(%rdi), %ymm0
 ; XOPAVX2-NEXT:    retq
   %tmp = load i64, i64* %ptr, align 4
   %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -7631,18 +7611,10 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movzwl (%rdi), %eax
-; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
-; AVX512VL-NEXT:    retq
+; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw (%rdi), %ymm0
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
 ; XOPAVX1:       # %bb.0:
@@ -7655,9 +7627,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
 ;
 ; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
 ; XOPAVX2:       # %bb.0:
-; XOPAVX2-NEXT:    movzwl (%rdi), %eax
-; XOPAVX2-NEXT:    vmovd %eax, %xmm0
-; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; XOPAVX2-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i64
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll (6 additions, 15 deletions)
@@ -333,16 +333,13 @@ define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
 define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
 ; KNL-LABEL: insert_dup_elt3_mem_v16i16_i64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movzwl 6(%rdi), %eax
-; KNL-NEXT:    vmovd %eax, %xmm0
-; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
 ; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_dup_elt3_mem_v16i16_i64:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    movzwl 6(%rdi), %eax
-; SKX-NEXT:    vpbroadcastw %eax, %zmm0
+; SKX-NEXT:    vpbroadcastw 6(%rdi), %zmm0
 ; SKX-NEXT:    retq
   %tmp = load i64, i64* %ptr, align 4
   %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
@@ -354,16 +351,13 @@ define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
 define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
 ; KNL-LABEL: insert_dup_elt7_mem_v16i16_i64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movzwl 6(%rdi), %eax
-; KNL-NEXT:    vmovd %eax, %xmm0
-; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vpbroadcastw 6(%rdi), %ymm0
 ; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_dup_elt7_mem_v16i16_i64:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    movzwl 6(%rdi), %eax
-; SKX-NEXT:    vpbroadcastw %eax, %zmm0
+; SKX-NEXT:    vpbroadcastw 6(%rdi), %zmm0
 ; SKX-NEXT:    retq
   %tmp = load i64, i64* %ptr, align 4
   %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
@@ -375,16 +369,13 @@ define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
 define <32 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
 ; KNL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
 ; KNL:       ## %bb.0:
-; KNL-NEXT:    movzwl (%rdi), %eax
-; KNL-NEXT:    vmovd %eax, %xmm0
-; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    movzwl (%rdi), %eax
-; SKX-NEXT:    vpbroadcastw %eax, %zmm0
+; SKX-NEXT:    vpbroadcastw (%rdi), %zmm0
 ; SKX-NEXT:    retq
   %tmp = load i16, i16* %ptr, align 2
   %tmp1 = sext i16 %tmp to i64
