-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LegalizeDAG][X86][AArch64][LoongArch] Freeze index when converting extract_elt/extract_subvector to load/store on stack. #88985
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64 Author: Craig Topper (topperc) Changes: We try to clamp the index to be within the bounds of the stack object, but if the index is not frozen first, poison can reach the clamping code we emit. We have other instances of this issue in type legalization and extract_elt/subvector. Fixes #88959. Patch is 42.79 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/88985.diff 9 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 24f69ea1b742a6..8ef9e857888ba0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1405,6 +1405,9 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, StoreMMO);
}
+ // Freeze the index so we don't poison the clamping code we're about to emit.
+ Idx = DAG.getFreeze(Idx);
+
SDValue NewLoad;
Align ElementAlignment =
std::min(cast<StoreSDNode>(Ch)->getAlign(),
diff --git a/llvm/test/CodeGen/AArch64/pr88959.ll b/llvm/test/CodeGen/AArch64/pr88959.ll
new file mode 100644
index 00000000000000..8fb80dc6670dd6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr88959.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define i8 @f(<16 x i8> %0, i32 %1) {
+; CHECK-LABEL: f:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: sub w8, w8, w0
+; CHECK-NEXT: bfxil x9, x8, #0, #4
+; CHECK-NEXT: ldrb w0, [x9]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %3 = sub nuw i32 1, %1
+ %4 = extractelement <16 x i8> %0, i32 %3
+ ret i8 %4
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
index fc2929d8e6db33..619439e9a06fa4 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
@@ -91,9 +91,10 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 0
-; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 0
+; CHECK-NEXT: ld.b $a0, $a2, 0
; CHECK-NEXT: st.b $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -116,9 +117,10 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 1
-; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 1
+; CHECK-NEXT: ld.h $a0, $a2, 0
; CHECK-NEXT: st.h $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -141,9 +143,10 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2
+; CHECK-NEXT: ld.w $a0, $a2, 0
; CHECK-NEXT: st.w $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -166,9 +169,10 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
-; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3
+; CHECK-NEXT: ld.d $a0, $a2, 0
; CHECK-NEXT: st.d $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -191,9 +195,10 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT: fld.s $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2
+; CHECK-NEXT: fld.s $fa0, $a2, 0
; CHECK-NEXT: fst.s $fa0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -216,9 +221,10 @@ define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
-; CHECK-NEXT: fld.d $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3
+; CHECK-NEXT: fld.d $fa0, $a2, 0
; CHECK-NEXT: fst.d $fa0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 0e0cfc64af9ee4..c33eb6bd433f61 100644
--- a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -10,26 +10,37 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, ptr nocapture %x, ptr nocapture readonly %y, i32 %i) #0 {
; CHECK-LABEL: test_extractelement_legalization_storereuse:
; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: paddd (%edx), %xmm0
; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 4(%edx), %edi
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
+; CHECK-NEXT: leal (,%eax,4), %ecx
+; CHECK-NEXT: movl %ecx, %esi
+; CHECK-NEXT: andl $3, %esi
+; CHECK-NEXT: leal 1(,%eax,4), %edi
+; CHECK-NEXT: andl $3, %edi
+; CHECK-NEXT: leal 2(,%eax,4), %ebx
+; CHECK-NEXT: andl $3, %ebx
+; CHECK-NEXT: leal 3(,%eax,4), %ebp
+; CHECK-NEXT: andl $3, %ebp
+; CHECK-NEXT: movl (%edx,%esi,4), %esi
+; CHECK-NEXT: movl (%edx,%edi,4), %edi
+; CHECK-NEXT: movl (%edx,%ebx,4), %ebx
+; CHECK-NEXT: movl (%edx,%ebp,4), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT: movl %esi, 12(%ebp,%ecx,4)
+; CHECK-NEXT: shll $4, %eax
+; CHECK-NEXT: movl %edi, (%ebp,%eax)
+; CHECK-NEXT: movl %ebx, 8(%ebp,%ecx,4)
+; CHECK-NEXT: movl %edx, 4(%ebp,%ecx,4)
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
%0 = load <4 x i32>, ptr %y, align 16
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 870912bb6bb1be..d8dfde8b5a76c0 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -315,11 +315,10 @@ define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs,
; X86-NEXT: jmp .LBB8_3
; X86-NEXT: .LBB8_2: # %compare
; X86-NEXT: movdqa %xmm0, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB8_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
@@ -452,11 +451,10 @@ define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
; X86-NEXT: jmp .LBB11_3
; X86-NEXT: .LBB11_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB11_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: leal -4(%ebp), %esp
@@ -772,11 +770,10 @@ define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movdqa %xmm0, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: movzwl %ax, %eax
@@ -889,11 +886,10 @@ define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
; X86-NEXT: jmp .LBB23_3
; X86-NEXT: .LBB23_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB23_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index f2240a94684427..2a4d3053ce228f 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -226,69 +226,92 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3: # %bb.0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl %al, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r13b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r12b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r15b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r14b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bpl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r11b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r10b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r9b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r8b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %sil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %cl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
@@ -307,6 +330,12 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: popq %r12
+; SSE3-NEXT: popq %r13
+; SSE3-NEXT: popq %r14
+; SSE3-NEXT: popq %r15
+; SSE3-NEXT: popq %rbp
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v16i8:
@@ -490,69 +519,92 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE3: # %bb.0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl %al, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r13b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r12b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r15b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r14b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bpl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r11b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r10b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r9b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r8b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %sil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %cl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
@@ -571,6 +623,12 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm...
[truncated]
|
@llvm/pr-subscribers-backend-x86 Author: Craig Topper (topperc) Changes: We try to clamp the index to be within the bounds of the stack object, but if the index is not frozen first, poison can reach the clamping code we emit. We have other instances of this issue in type legalization and extract_elt/subvector. Fixes #88959. Patch is 42.79 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/88985.diff 9 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 24f69ea1b742a6..8ef9e857888ba0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1405,6 +1405,9 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, StoreMMO);
}
+ // Freeze the index so we don't poison the clamping code we're about to emit.
+ Idx = DAG.getFreeze(Idx);
+
SDValue NewLoad;
Align ElementAlignment =
std::min(cast<StoreSDNode>(Ch)->getAlign(),
diff --git a/llvm/test/CodeGen/AArch64/pr88959.ll b/llvm/test/CodeGen/AArch64/pr88959.ll
new file mode 100644
index 00000000000000..8fb80dc6670dd6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/pr88959.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+define i8 @f(<16 x i8> %0, i32 %1) {
+; CHECK-LABEL: f:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: sub w8, w8, w0
+; CHECK-NEXT: bfxil x9, x8, #0, #4
+; CHECK-NEXT: ldrb w0, [x9]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+ %3 = sub nuw i32 1, %1
+ %4 = extractelement <16 x i8> %0, i32 %3
+ ret i8 %4
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
index fc2929d8e6db33..619439e9a06fa4 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
@@ -91,9 +91,10 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 0
-; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 0
+; CHECK-NEXT: ld.b $a0, $a2, 0
; CHECK-NEXT: st.b $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -116,9 +117,10 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 1
-; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 1
+; CHECK-NEXT: ld.h $a0, $a2, 0
; CHECK-NEXT: st.h $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -141,9 +143,10 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2
+; CHECK-NEXT: ld.w $a0, $a2, 0
; CHECK-NEXT: st.w $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -166,9 +169,10 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
-; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3
+; CHECK-NEXT: ld.d $a0, $a2, 0
; CHECK-NEXT: st.d $a0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -191,9 +195,10 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2
-; CHECK-NEXT: fld.s $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 2
+; CHECK-NEXT: fld.s $fa0, $a2, 0
; CHECK-NEXT: fst.s $fa0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
@@ -216,9 +221,10 @@ define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvst $xr0, $sp, 0
-; CHECK-NEXT: addi.d $a0, $sp, 0
-; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3
-; CHECK-NEXT: fld.d $fa0, $a0, 0
+; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: bstrins.d $a2, $a0, 4, 3
+; CHECK-NEXT: fld.d $fa0, $a2, 0
; CHECK-NEXT: fst.d $fa0, $a1, 0
; CHECK-NEXT: addi.d $sp, $fp, -64
; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 0e0cfc64af9ee4..c33eb6bd433f61 100644
--- a/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/llvm/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -10,26 +10,37 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, ptr nocapture %x, ptr nocapture readonly %y, i32 %i) #0 {
; CHECK-LABEL: test_extractelement_legalization_storereuse:
; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: paddd (%edx), %xmm0
; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 4(%edx), %edi
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
+; CHECK-NEXT: leal (,%eax,4), %ecx
+; CHECK-NEXT: movl %ecx, %esi
+; CHECK-NEXT: andl $3, %esi
+; CHECK-NEXT: leal 1(,%eax,4), %edi
+; CHECK-NEXT: andl $3, %edi
+; CHECK-NEXT: leal 2(,%eax,4), %ebx
+; CHECK-NEXT: andl $3, %ebx
+; CHECK-NEXT: leal 3(,%eax,4), %ebp
+; CHECK-NEXT: andl $3, %ebp
+; CHECK-NEXT: movl (%edx,%esi,4), %esi
+; CHECK-NEXT: movl (%edx,%edi,4), %edi
+; CHECK-NEXT: movl (%edx,%ebx,4), %ebx
+; CHECK-NEXT: movl (%edx,%ebp,4), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT: movl %esi, 12(%ebp,%ecx,4)
+; CHECK-NEXT: shll $4, %eax
+; CHECK-NEXT: movl %edi, (%ebp,%eax)
+; CHECK-NEXT: movl %ebx, 8(%ebp,%ecx,4)
+; CHECK-NEXT: movl %edx, 4(%ebp,%ecx,4)
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
%0 = load <4 x i32>, ptr %y, align 16
diff --git a/llvm/test/CodeGen/X86/sttni.ll b/llvm/test/CodeGen/X86/sttni.ll
index 870912bb6bb1be..d8dfde8b5a76c0 100644
--- a/llvm/test/CodeGen/X86/sttni.ll
+++ b/llvm/test/CodeGen/X86/sttni.ll
@@ -315,11 +315,10 @@ define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs,
; X86-NEXT: jmp .LBB8_3
; X86-NEXT: .LBB8_2: # %compare
; X86-NEXT: movdqa %xmm0, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB8_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
@@ -452,11 +451,10 @@ define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32
; X86-NEXT: jmp .LBB11_3
; X86-NEXT: .LBB11_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB11_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: leal -4(%ebp), %esp
@@ -772,11 +770,10 @@ define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $48, %esp
; X86-NEXT: movdqa %xmm0, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: movzwl %ax, %eax
@@ -889,11 +886,10 @@ define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
; X86-NEXT: jmp .LBB23_3
; X86-NEXT: .LBB23_2: # %compare
; X86-NEXT: movdqa %xmm1, (%esp)
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: andl $14, %ecx
-; X86-NEXT: movzwl (%esp,%ecx), %eax
+; X86-NEXT: andl $7, %ecx
+; X86-NEXT: movzwl (%esp,%ecx,2), %eax
; X86-NEXT: movdqa %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: subw 16(%esp,%ecx), %ax
+; X86-NEXT: subw 16(%esp,%ecx,2), %ax
; X86-NEXT: .LBB23_3: # %exit
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll
index f2240a94684427..2a4d3053ce228f 100644
--- a/llvm/test/CodeGen/X86/var-permute-128.ll
+++ b/llvm/test/CodeGen/X86/var-permute-128.ll
@@ -226,69 +226,92 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3: # %bb.0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl %al, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r13b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r12b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r15b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r14b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bpl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r11b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r10b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r9b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r8b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %sil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %cl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
@@ -307,6 +330,12 @@ define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: popq %r12
+; SSE3-NEXT: popq %r13
+; SSE3-NEXT: popq %r14
+; SSE3-NEXT: popq %r15
+; SSE3-NEXT: popq %rbp
; SSE3-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v16i8:
@@ -490,69 +519,92 @@ define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwi
define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
; SSE3: # %bb.0:
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: pushq %r15
+; SSE3-NEXT: pushq %r14
+; SSE3-NEXT: pushq %r13
+; SSE3-NEXT: pushq %r12
+; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r8d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r9d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r10d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r11d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebx
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ebp
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r14d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r15d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r12d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %r13d
+; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE3-NEXT: movzbl %al, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm1
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r13b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r12b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm4
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r15b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r14b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm6
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bpl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm7
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %bl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm8
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r11b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm5
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r10b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm9
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r9b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm10
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %r8b, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm12
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm11
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %sil, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm13
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %dl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm14
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl %cl, %eax
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm15
-; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE3-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SSE3-NEXT: andl $15, %eax
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT: movd %eax, %xmm0
@@ -571,6 +623,12 @@ define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %in
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm...
[truncated]
|
; SSE3-NEXT: andl $15, %eax | ||
; SSE3-NEXT: movzbl -24(%rsp,%rax), %eax | ||
; SSE3-NEXT: movd %eax, %xmm1 | ||
; SSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax | ||
; SSE3-NEXT: movzbl %r13b, %eax |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is (any_extend (freeze (load))). Can we form an anyext load with a freeze in the middle?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the difference between that and freeze(any_extend(load))? I guess if the freeze is in the middle, transforms can treat the high bits as UNDEF, as opposed to FREEZE(UNDEF)? But I doubt that has much practical effect.
(For zext and sext, it makes more of a difference.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The (ext (freeze (load))) regressions are starting to appear more and more now that DAG is embracing freeze, it'd be useful to get this sorted even if we have to always fold aext to zext
llvm/test/CodeGen/AArch64/pr88959.ll
Outdated
; CHECK-NEXT: ldrb w0, [x9] | ||
; CHECK-NEXT: add sp, sp, #16 | ||
; CHECK-NEXT: ret | ||
%3 = sub nuw i32 1, %1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use named values
@@ -1405,6 +1405,9 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { | |||
Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, StoreMMO); | |||
} | |||
|
|||
// Freeze the index so we don't poison the clamping code we're about to emit. | |||
Idx = DAG.getFreeze(Idx); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe GlobalISel has the same issue.
…xtract_elt/extract_subvector to load/store on stack. We try to clamp the index to be within the bounds of the stack object we create, but if we don't freeze it, poison can propagate into the clamp code. This can cause the access to leave the bounds of the stack object. We have other instances of this issue in type legalization and extract_elt/subvector, but posting this patch first for direction check. Fixes llvm#88959
6f450d9
to
e58194e
Compare
We try to clamp the index to be within the bounds of the stack object
we create, but if we don't freeze it, poison can propagate into the
clamp code. This can cause the access to leave the bounds of the
stack object.
We have other instances of this issue in type legalization and extract_elt/subvector,
but posting this patch first for direction check.
Fixes #88959