[X86] Extend alignedstore PatFrag to cover atomic_store #197861
Conversation
|
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-x86 Author: jofrn. Changes: This commit casts floats to ints in an atomic store during AtomicExpand. Smaller FP vectors (`<N x half>`, `<N x bfloat>`) are left to the DAG widen path on subtargets without native FP16/BF16 support. Store-side counterpart to #148899. Stacked on top of #197860. Full diff: https://github.com/llvm/llvm-project/pull/197861.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 35848f76897b3..60ed4ed1a410f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -2393,6 +2393,20 @@ def atomic_store_128 :
let MemoryVT = i128;
}
+def atomic_store_128_v2i64 :
+ PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store node:$val, node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v2i64;
+}
+
+def atomic_store_128_v4i32 :
+ PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store node:$val, node:$ptr)> {
+ let IsAtomic = true;
+ let MemoryVT = v4i32;
+}
+
//===----------------------------------------------------------------------===//
// Selection DAG Pattern Support.
//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fea1caf0854f5..064a1e7c138fc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32967,6 +32967,19 @@ X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
return AtomicExpansionKind::None;
}
+TargetLowering::AtomicExpansionKind
+X86TargetLowering::shouldCastAtomicStoreInIR(StoreInst *SI) const {
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!Ty->getScalarType()->isFloatingPointTy())
+ return AtomicExpansionKind::None;
+ // Sub-128-bit FP vectors codegen better when DAG widening folds the value
+ // into an extractelt-from-XMM pattern, instead of an IR-level bitcast to a
+ // scalar integer (which the type legalizer scalarizes).
+ if (Ty->isVectorTy() && Ty->getPrimitiveSizeInBits() < 128)
+ return AtomicExpansionKind::None;
+ return AtomicExpansionKind::CastToInteger;
+}
+
LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 9a958525057b6..b26f95ddea388 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -894,6 +894,8 @@ namespace llvm {
shouldExpandLogicAtomicRMWInIR(const AtomicRMWInst *AI) const;
TargetLoweringBase::AtomicExpansionKind
shouldCastAtomicLoadInIR(LoadInst *LI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldCastAtomicStoreInIR(StoreInst *SI) const override;
void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index b2a7bce8d7571..f40edbf911e6f 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1242,6 +1242,21 @@ def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)),
(VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>;
+// store atomic <2 x i64>
+def : Pat<(atomic_store_128_v2i64 (v2i64 VR128:$src), addr:$dst),
+ (MOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_128_v2i64 (v2i64 VR128:$src), addr:$dst),
+ (VMOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_128_v2i64 (v2i64 VR128X:$src), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+// store atomic <4 x i32>
+def : Pat<(atomic_store_128_v4i32 (v4i32 VR128:$src), addr:$dst),
+ (MOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_128_v4i32 (v4i32 VR128:$src), addr:$dst),
+ (VMOVAPDmr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_128_v4i32 (v4i32 VR128X:$src), addr:$dst),
+ (VMOVAPDZ128mr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+
// store atomic <2 x i8>
def : Pat<(atomic_store_16
(i16 (trunc (i32 (extractelt
@@ -1293,6 +1308,37 @@ def : Pat<(atomic_store_32
(v4i32 (bitconvert (v16i8 VR128X:$src))), (iPTR 0))),
addr:$dst),
(VMOVPDI2DIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+// store atomic <2 x half>, <2 x bfloat> (via widened v8f16, v8bf16)
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8f16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+def : Pat<(atomic_store_32
+ (i32 (extractelt
+ (v4i32 (bitconvert (v8bf16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPDI2DIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
// store atomic <2 x i32,float>, <4 x i16>, <2 x ptr addrspace(270)>
def : Pat<(atomic_store_64
@@ -1340,6 +1386,37 @@ def : Pat<(atomic_store_64
(v2i64 (bitconvert (v8i16 VR128X:$src))), (iPTR 0))),
addr:$dst),
(VMOVPQI2QIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+// store atomic <4 x half>, <4 x bfloat> (via widened v8f16, v8bf16)
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (MOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8f16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8bf16 VR128:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QImr addr:$dst, VR128:$src)>, Requires<[UseAVX]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8f16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
+def : Pat<(atomic_store_64
+ (i64 (extractelt
+ (v2i64 (bitconvert (v8bf16 VR128X:$src))), (iPTR 0))),
+ addr:$dst),
+ (VMOVPQI2QIZmr addr:$dst, VR128X:$src)>, Requires<[HasAVX512]>;
// Floating point loads/stores.
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst),
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 066842739fb61..1d0d0d4dc5c6b 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -765,8 +765,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec2_half:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX-O3-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: store_atomic_vec2_half:
@@ -788,8 +787,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) {
;
; CHECK-AVX-O0-LABEL: store_atomic_vec2_half:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX-O0-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O0-NEXT: vmovd %xmm0, (%rdi)
; CHECK-AVX-O0-NEXT: retq
store atomic <2 x half> %v, ptr %x release, align 4
ret void
@@ -809,8 +807,7 @@ define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX-O3-NEXT: movl %eax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat:
@@ -932,8 +929,7 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec4_half:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax
-; CHECK-AVX-O3-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half:
@@ -1007,8 +1003,7 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind {
;
; CHECK-AVX-O0-LABEL: store_atomic_vec4_half:
; CHECK-AVX-O0: # %bb.0:
-; CHECK-AVX-O0-NEXT: vmovq %xmm0, %rax
-; CHECK-AVX-O0-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi)
; CHECK-AVX-O0-NEXT: retq
store atomic <4 x half> %v, ptr %x release, align 8
ret void
@@ -1060,8 +1055,7 @@ define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind {
;
; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat:
; CHECK-AVX-O3: # %bb.0:
-; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax
-; CHECK-AVX-O3-NEXT: movq %rax, (%rdi)
+; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi)
; CHECK-AVX-O3-NEXT: retq
;
; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat:
@@ -1201,6 +1195,87 @@ define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind {
ret void
}
+define void @store_atomic_vec4_float_align(ptr %x, <4 x float> %v) nounwind {
+; CHECK-SSE2-O3-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE2-O3: # %bb.0:
+; CHECK-SSE2-O3-NEXT: pushq %rax
+; CHECK-SSE2-O3-NEXT: movq %xmm0, %rsi
+; CHECK-SSE2-O3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-SSE2-O3-NEXT: movq %xmm0, %rdx
+; CHECK-SSE2-O3-NEXT: movl $3, %ecx
+; CHECK-SSE2-O3-NEXT: callq __atomic_store_16@PLT
+; CHECK-SSE2-O3-NEXT: popq %rax
+; CHECK-SSE2-O3-NEXT: retq
+;
+; CHECK-SSE4-O3-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE4-O3: # %bb.0:
+; CHECK-SSE4-O3-NEXT: pushq %rbx
+; CHECK-SSE4-O3-NEXT: pextrq $1, %xmm0, %rcx
+; CHECK-SSE4-O3-NEXT: movq %xmm0, %rbx
+; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O3-NEXT: movq 8(%rdi), %rdx
+; CHECK-SSE4-O3-NEXT: .p2align 4
+; CHECK-SSE4-O3-NEXT: .LBB39_1: # %atomicrmw.start
+; CHECK-SSE4-O3-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-SSE4-O3-NEXT: lock cmpxchg16b (%rdi)
+; CHECK-SSE4-O3-NEXT: jne .LBB39_1
+; CHECK-SSE4-O3-NEXT: # %bb.2: # %atomicrmw.end
+; CHECK-SSE4-O3-NEXT: popq %rbx
+; CHECK-SSE4-O3-NEXT: retq
+;
+; CHECK-AVX-O3-LABEL: store_atomic_vec4_float_align:
+; CHECK-AVX-O3: # %bb.0:
+; CHECK-AVX-O3-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-AVX-O3-NEXT: retq
+;
+; CHECK-SSE2-O0-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE2-O0: # %bb.0:
+; CHECK-SSE2-O0-NEXT: pushq %rax
+; CHECK-SSE2-O0-NEXT: movq %xmm0, %rsi
+; CHECK-SSE2-O0-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-SSE2-O0-NEXT: movq %xmm0, %rdx
+; CHECK-SSE2-O0-NEXT: movl $3, %ecx
+; CHECK-SSE2-O0-NEXT: callq __atomic_store_16@PLT
+; CHECK-SSE2-O0-NEXT: popq %rax
+; CHECK-SSE2-O0-NEXT: retq
+;
+; CHECK-SSE4-O0-LABEL: store_atomic_vec4_float_align:
+; CHECK-SSE4-O0: # %bb.0:
+; CHECK-SSE4-O0-NEXT: pushq %rbx
+; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: pextrq $1, %xmm0, %rax
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq %xmm0, %rax
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax
+; CHECK-SSE4-O0-NEXT: movq 8(%rdi), %rdx
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: jmp .LBB39_1
+; CHECK-SSE4-O0-NEXT: .LBB39_1: # %atomicrmw.start
+; CHECK-SSE4-O0-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; CHECK-SSE4-O0-NEXT: lock cmpxchg16b (%rsi)
+; CHECK-SSE4-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-SSE4-O0-NEXT: jne .LBB39_1
+; CHECK-SSE4-O0-NEXT: jmp .LBB39_2
+; CHECK-SSE4-O0-NEXT: .LBB39_2: # %atomicrmw.end
+; CHECK-SSE4-O0-NEXT: popq %rbx
+; CHECK-SSE4-O0-NEXT: retq
+;
+; CHECK-AVX-O0-LABEL: store_atomic_vec4_float_align:
+; CHECK-AVX-O0: # %bb.0:
+; CHECK-AVX-O0-NEXT: vmovaps %xmm0, (%rdi)
+; CHECK-AVX-O0-NEXT: retq
+ store atomic <4 x float> %v, ptr %x release, align 16
+ ret void
+}
+
define <2 x half> @atomic_vec2_half(ptr %x) {
; CHECK-SSE-O3-LABEL: atomic_vec2_half:
; CHECK-SSE-O3: # %bb.0:
|
|
|
| def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), | ||
| (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>; | ||
|
|
||
| // store atomic <2 x i64> |
There was a problem hiding this comment.
This shouldn't require duplicating all of these patterns. There surely must be existing store patterns that can be shared?
There was a problem hiding this comment.
Right, I was wondering the same. I'll take a closer look. Thanks.
There was a problem hiding this comment.
I expect you to need a PatFrag common to store + atomic_store, and update the patterns to use that
There was a problem hiding this comment.
Did this one. Thanks.
| // Sub-128-bit FP vectors codegen better when DAG widening folds the value | ||
| // into an extractelt-from-XMM pattern, instead of an IR-level bitcast to a | ||
| // scalar integer (which the type legalizer scalarizes). |
There was a problem hiding this comment.
I think this is thinking too much about this. The goal is to fully remove shouldCastAtomicStoreInIR
There was a problem hiding this comment.
Mm hm, I can't help but utilize it for now. Let me look at this one more closely as well. Thank you.
01c3c98 to
e0dca2b
Compare
6b8f49c to
48ba9b2
Compare
e0dca2b to
e4c9611
Compare
48ba9b2 to
b110a11
Compare
RKSimon
left a comment
There was a problem hiding this comment.
Do you have ALIGNED xmm/ymm/zmm atomic stores? I can't see any in this version of atomic-load-store.ll
e.g. all I can find is store atomic <8 x double> %v, ptr %x release, align 4
|
|
||
| multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName, | ||
| X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, | ||
| X86VectorVTInfo _, PatFrags st_frag, PatFrag mstore, |
There was a problem hiding this comment.
Usually this would be SDPatternOperator
b110a11 to
e65b07f
Compare
e4c9611 to
a839f91
Compare
e65b07f to
4314cab
Compare
4314cab to
07d1319
Compare
9a15cc0 to
9f6ca39
Compare
07d1319 to
1d7267a
Compare
Extend the X86 `alignedstore` PatFrag to also match `atomic_store` with vector-size alignment, so existing MOVAPS/MOVAPD/MOVDQA-family aligned-store patterns cover 128-bit aligned vector atomic stores on SSE/AVX/AVX-512 without per-type duplicates. `<4 x float>`, `<2 x double>`, `<2 x i64>`, `<4 x i32>`, `<8 x half>`, `<8 x bfloat>` all codegen to a single `movaps`/`movapd` on AVX+ via this. Adds v8f16/v8bf16 bitconvert variants to the widen-path `atomic_store_32` / `atomic_store_64` patterns so `<2 x half>`, `<2 x bfloat>`, `<4 x half>`, `<4 x bfloat>` atomic stores reaching the PR4 widen path also collapse to a single instruction on AVX+ targets. Vectors whose `getTypeAction` is split rather than widen still rely on PR6's `SplitVecOp_ATOMIC_STORE` — that path bitcasts the vector to a scalar integer and issues an integer `atomic_store_N`, picked up by the pre-existing scalar atomic-store patterns. The two legalization paths together cover the full vector-atomic-store matrix. Store-side counterpart to #148899.
9f6ca39 to
98275c5
Compare
1d7267a to
0d958f9
Compare
Smaller FP vectors (`<N x half>`, `<N x bfloat>`) are left to the DAG widen path on subtargets without native FP16/BF16 support; the
v8f16/v8bf16 bitconvert variants added to the Atomic Store Split commit's patterns let the
widened path collapse to a single instruction on AVX+ targets.
Store-side counterpart to #148899. Stacked on top of #197860, and below #197862.