Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LegalizeTypes] Teach DAGTypeLegalizer::GenWidenVectorLoads to pad wi…
…th undef if needed when concatenating smaller loads to match a larger load. In the included test case the align 16 allowed the v23f32 load to be handled as load v16f32, load v4f32, and load v4f32 (one element not used). These loads all need to be concatenated together into a final vector. In this case we tried to concatenate the two v4f32 loads to match the type of the v16f32 load so we could do a second concat_vectors, but those loads alone only add up to v8f32. So we need two v4f32 undefs to pad it. It appears we've tried to hack around a similar issue in this code before by adding undef padding to loads in one of the earlier loops in this function: originally in r147964 by padding all loads narrower than previous loads to the same size, later modified to pad only the last load in r293088. This patch removes that earlier code and just handles it on demand where we know we need it. Fixes PR46820. Differential Revision: https://reviews.llvm.org/D84463 (cherry picked from commit 8131e19)
- Loading branch information
Showing
2 changed files
with
59 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s | ||
|
||
; The alignment of 16 causes type legalization to split this as 3 loads, | ||
; v16f32, v4f32, and v4f32. This loads 24 elements, but the load is aligned | ||
; to 16 bytes so this is safe. There was an issue with type legalization building | ||
; the proper concat_vectors for this because the two v4f32s don't add up to | ||
; v16f32 and require padding. | ||
|
||
; load23: loads a <23 x float> through %p with 16-byte alignment and returns it. | ||
; The assertion lines inside the function are the expected llc output for the | ||
; RUN line of this file. They are autogenerated, so regenerate them with | ||
; utils/update_llc_test_checks.py instead of editing them by hand. | ||
; In the expected code the source vector is read via %rsi and the result is | ||
; stored via %rdi, which is also copied to %rax (presumably a hidden | ||
; return-slot pointer; confirm against the x86-64 calling convention). | ||
define <23 x float> @load23(<23 x float>* %p) { | ||
; CHECK-LABEL: load23: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: movq %rdi, %rax | ||
; CHECK-NEXT: vmovups 64(%rsi), %ymm0 | ||
; CHECK-NEXT: vmovups (%rsi), %zmm1 | ||
; CHECK-NEXT: vmovaps 64(%rsi), %xmm2 | ||
; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero | ||
; CHECK-NEXT: vmovss %xmm3, 88(%rdi) | ||
; CHECK-NEXT: vmovaps %xmm2, 64(%rdi) | ||
; CHECK-NEXT: vmovaps %zmm1, (%rdi) | ||
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 | ||
; CHECK-NEXT: vmovlps %xmm0, 80(%rdi) | ||
; CHECK-NEXT: vzeroupper | ||
; CHECK-NEXT: retq | ||
%t0 = load <23 x float>, <23 x float>* %p, align 16 | ||
ret <23 x float> %t0 | ||
} | ||
|
||
; Same test as above with minimal alignment just to demonstrate the different | ||
; codegen. | ||
; load23_align_1: loads a <23 x float> through %p with 1-byte alignment and | ||
; returns it. The only difference from the align-16 variant in the IR is the | ||
; alignment on the load. | ||
; The assertion lines inside the function are autogenerated expected llc | ||
; output; regenerate them with utils/update_llc_test_checks.py instead of | ||
; editing them by hand. | ||
; Here the expected code reads the source with a 64-byte load at offset 0, a | ||
; 16-byte load at offset 64, an 8-byte integer load at offset 80, and a scalar | ||
; load for the last element, which is stored at offset 88. | ||
define <23 x float> @load23_align_1(<23 x float>* %p) { | ||
; CHECK-LABEL: load23_align_1: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: movq %rdi, %rax | ||
; CHECK-NEXT: vmovups (%rsi), %zmm0 | ||
; CHECK-NEXT: vmovups 64(%rsi), %xmm1 | ||
; CHECK-NEXT: movq 80(%rsi), %rcx | ||
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero | ||
; CHECK-NEXT: vmovss %xmm2, 88(%rdi) | ||
; CHECK-NEXT: movq %rcx, 80(%rdi) | ||
; CHECK-NEXT: vmovaps %xmm1, 64(%rdi) | ||
; CHECK-NEXT: vmovaps %zmm0, (%rdi) | ||
; CHECK-NEXT: vzeroupper | ||
; CHECK-NEXT: retq | ||
%t0 = load <23 x float>, <23 x float>* %p, align 1 | ||
ret <23 x float> %t0 | ||
} |