[msan][NFCI] Generalize handlePairwiseShadowOrIntrinsic to have shards #167954
This will allow fixing up the handling of AVX2 phadd/phsub instructions in a future patch, by setting Shards = 2. Currently, the extra functionality is not used.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-compiler-rt-sanitizer

Author: Thurston Dang (thurstond)

Changes

This will allow fixing up the handling of AVX2 phadd/phsub instructions in a future patch, by setting Shards = 2. Currently, the extra functionality is not used.

Full diff: https://github.com/llvm/llvm-project/pull/167954.diff

1 Files Affected:
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index ceeece41782f4..d04cae018a79d 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2720,34 +2720,55 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// of elements.
//
// For example, suppose we have:
- // VectorA: <a1, a2, a3, a4, a5, a6>
- // VectorB: <b1, b2, b3, b4, b5, b6>
- // ReductionFactor: 3.
+ // VectorA: <a0, a1, a2, a3, a4, a5>
+ // VectorB: <b0, b1, b2, b3, b4, b5>
+ // ReductionFactor: 3
+ // Shards: 1
// The output would be:
- // <a1|a2|a3, a4|a5|a6, b1|b2|b3, b4|b5|b6>
+ // <a0|a1|a2, a3|a4|a5, b0|b1|b2, b3|b4|b5>
+ //
+ // If we have:
+ // VectorA: <a0, a1, a2, a3, a4, a5, a6, a7>
+ // VectorB: <b0, b1, b2, b3, b4, b5, b6, b7>
+ // ReductionFactor: 2
+ // Shards: 2
+ // then a and be each have 2 "shards", resulting in the output being
+ // interleaved:
+ // <a0|a1, a2|a3, b0|b1, b2|b3, a4|a5, a6|a7, b4|b5, b6|b7>
//
// This is convenient for instrumenting horizontal add/sub.
// For bitwise OR on "vertical" pairs, see maybeHandleSimpleNomemIntrinsic().
Value *horizontalReduce(IntrinsicInst &I, unsigned ReductionFactor,
- Value *VectorA, Value *VectorB) {
+ unsigned Shards, Value *VectorA, Value *VectorB) {
assert(isa<FixedVectorType>(VectorA->getType()));
- unsigned TotalNumElems =
+ unsigned NumElems =
cast<FixedVectorType>(VectorA->getType())->getNumElements();
+ [[maybe_unused]] unsigned TotalNumElems = NumElems;
if (VectorB) {
assert(VectorA->getType() == VectorB->getType());
- TotalNumElems = TotalNumElems * 2;
+ TotalNumElems *= 2;
}
- assert(TotalNumElems % ReductionFactor == 0);
+ assert(NumElems % (ReductionFactor * Shards) == 0);
Value *Or = nullptr;
IRBuilder<> IRB(&I);
for (unsigned i = 0; i < ReductionFactor; i++) {
SmallVector<int, 16> Mask;
- for (unsigned X = 0; X < TotalNumElems; X += ReductionFactor)
- Mask.push_back(X + i);
+
+ for (unsigned j = 0; j < Shards; j++) {
+ unsigned Offset = NumElems / Shards * j;
+
+ for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
+ Mask.push_back(Offset + X + i);
+
+ if (VectorB) {
+ for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
+ Mask.push_back(NumElems + Offset + X + i);
+ }
+ }
Value *Masked;
if (VectorB)
@@ -2769,7 +2790,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
///
/// e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>)
/// <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
- void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I) {
+ void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I, unsigned Shards) {
assert(I.arg_size() == 1 || I.arg_size() == 2);
assert(I.getType()->isVectorTy());
@@ -2792,8 +2813,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (I.arg_size() == 2)
SecondArgShadow = getShadow(&I, 1);
- Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow,
- SecondArgShadow);
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, Shards,
+ FirstArgShadow, SecondArgShadow);
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
@@ -2808,7 +2829,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// conceptually operates on
/// (<4 x i16> [[VAR1]], <4 x i16> [[VAR2]])
/// and can be handled with ReinterpretElemWidth == 16.
- void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I,
+ void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I, unsigned Shards,
int ReinterpretElemWidth) {
assert(I.arg_size() == 1 || I.arg_size() == 2);
@@ -2852,8 +2873,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
SecondArgShadow = IRB.CreateBitCast(SecondArgShadow, ReinterpretShadowTy);
}
- Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, FirstArgShadow,
- SecondArgShadow);
+ Value *OrShadow = horizontalReduce(I, /*ReductionFactor=*/2, Shards,
+ FirstArgShadow, SecondArgShadow);
OrShadow = CreateShadowCast(IRB, OrShadow, getShadowTy(&I));
@@ -6031,48 +6052,66 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Packed Horizontal Add/Subtract
case Intrinsic::x86_ssse3_phadd_w:
case Intrinsic::x86_ssse3_phadd_w_128:
- case Intrinsic::x86_avx2_phadd_w:
case Intrinsic::x86_ssse3_phsub_w:
case Intrinsic::x86_ssse3_phsub_w_128:
- case Intrinsic::x86_avx2_phsub_w: {
- handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/16);
+ break;
+
+ case Intrinsic::x86_avx2_phadd_w:
+ case Intrinsic::x86_avx2_phsub_w:
+ // TODO: Shards = 2
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/16);
break;
- }
// Packed Horizontal Add/Subtract
case Intrinsic::x86_ssse3_phadd_d:
case Intrinsic::x86_ssse3_phadd_d_128:
- case Intrinsic::x86_avx2_phadd_d:
case Intrinsic::x86_ssse3_phsub_d:
case Intrinsic::x86_ssse3_phsub_d_128:
- case Intrinsic::x86_avx2_phsub_d: {
- handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/32);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/32);
+ break;
+
+ case Intrinsic::x86_avx2_phadd_d:
+ case Intrinsic::x86_avx2_phsub_d:
+ // TODO: Shards = 2
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/32);
break;
- }
// Packed Horizontal Add/Subtract and Saturate
case Intrinsic::x86_ssse3_phadd_sw:
case Intrinsic::x86_ssse3_phadd_sw_128:
- case Intrinsic::x86_avx2_phadd_sw:
case Intrinsic::x86_ssse3_phsub_sw:
case Intrinsic::x86_ssse3_phsub_sw_128:
- case Intrinsic::x86_avx2_phsub_sw: {
- handlePairwiseShadowOrIntrinsic(I, /*ReinterpretElemWidth=*/16);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/16);
+ break;
+
+ case Intrinsic::x86_avx2_phadd_sw:
+ case Intrinsic::x86_avx2_phsub_sw:
+ // TODO: Shards = 2
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1,
+ /*ReinterpretElemWidth=*/16);
break;
- }
// Packed Single/Double Precision Floating-Point Horizontal Add
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
- case Intrinsic::x86_avx_hadd_pd_256:
- case Intrinsic::x86_avx_hadd_ps_256:
case Intrinsic::x86_sse3_hsub_ps:
case Intrinsic::x86_sse3_hsub_pd:
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1);
+ break;
+
+ case Intrinsic::x86_avx_hadd_pd_256:
+ case Intrinsic::x86_avx_hadd_ps_256:
case Intrinsic::x86_avx_hsub_pd_256:
- case Intrinsic::x86_avx_hsub_ps_256: {
- handlePairwiseShadowOrIntrinsic(I);
+ case Intrinsic::x86_avx_hsub_ps_256:
+ // TODO: Shards = 2
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1);
break;
- }
case Intrinsic::x86_avx_maskstore_ps:
case Intrinsic::x86_avx_maskstore_pd:
@@ -6455,7 +6494,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Add Long Pairwise
case Intrinsic::aarch64_neon_saddlp:
case Intrinsic::aarch64_neon_uaddlp: {
- handlePairwiseShadowOrIntrinsic(I);
+ handlePairwiseShadowOrIntrinsic(I, /*Shards=*/1);
break;
}
Pull Request Overview
This PR generalizes the handlePairwiseShadowOrIntrinsic function to support a Shards parameter, preparing for future improvements to AVX2 phadd/phsub instruction handling. The change is currently NFCI (No Functional Change Intended) as all call sites use Shards=1.
Key Changes
- Added a `Shards` parameter to `horizontalReduce` and both overloads of `handlePairwiseShadowOrIntrinsic`
- Refactored the mask generation algorithm in `horizontalReduce` to support shard-based processing (see the standalone sketch below)
- Separated the AVX2 intrinsics from the SSE/SSSE3 intrinsics in the switch cases, with TODO markers for the future `Shards = 2` implementation
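Not part of the patch: a standalone sketch intended to mirror the sharded mask-generation loop from the diff, to make the interleaving concrete for an 8-element operand pair with `ReductionFactor = 2` and `Shards = 2`. The `buildMask` helper and the driver are illustrative names only.

```c++
#include <cstdio>
#include <vector>

// Mirrors the loop structure in horizontalReduce: for each shard, take the
// i-th lane of every reduction group from operand A, then (if present) the
// matching lanes from operand B, which start at index NumElems.
std::vector<int> buildMask(unsigned NumElems, unsigned ReductionFactor,
                           unsigned Shards, bool HasVectorB, unsigned i) {
  std::vector<int> Mask;
  for (unsigned j = 0; j < Shards; j++) {
    unsigned Offset = NumElems / Shards * j;
    for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
      Mask.push_back(Offset + X + i);
    if (HasVectorB)
      for (unsigned X = 0; X < NumElems / Shards; X += ReductionFactor)
        Mask.push_back(NumElems + Offset + X + i);
  }
  return Mask;
}

int main() {
  // Prints:
  //   i=0: 0 2 8 10 4 6 12 14
  //   i=1: 1 3 9 11 5 7 13 15
  // i.e. the two masks whose element-wise OR yields the interleaved result
  //   <a0|a1, a2|a3, b0|b1, b2|b3, a4|a5, a6|a7, b4|b5, b6|b7>
  for (unsigned i = 0; i < 2; i++) {
    std::vector<int> Mask = buildMask(/*NumElems=*/8, /*ReductionFactor=*/2,
                                      /*Shards=*/2, /*HasVectorB=*/true, i);
    printf("i=%u:", i);
    for (int Idx : Mask)
      printf(" %d", Idx);
    printf("\n");
  }
  return 0;
}
```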
[[maybe_unused]] unsigned TotalNumElems = NumElems;
if (VectorB) {
  assert(VectorA->getType() == VectorB->getType());
-  TotalNumElems = TotalNumElems * 2;
+  TotalNumElems *= 2;
}
Copilot AI, Nov 14, 2025
The [[maybe_unused]] attribute on TotalNumElems is a code smell. This variable is computed but only used in an assertion. Consider either:
- Removing the variable entirely and computing the value directly in the assertion if needed
- Removing the `[[maybe_unused]]` attribute if the variable serves a documentation purpose
Since TotalNumElems is only referenced in the assertion on line 2753 (which was changed to use NumElems instead), this variable appears to be genuinely unused and should be removed.
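A minimal sketch of the first option the reviewer describes, dropping the variable entirely; this is an illustrative fragment of the patched function, not code from the PR:

```c++
// Hypothetical simplification: TotalNumElems is gone, and the VectorB branch
// keeps only the type assertion. The divisibility assert already uses
// NumElems, so nothing else changes.
assert(isa<FixedVectorType>(VectorA->getType()));
unsigned NumElems =
    cast<FixedVectorType>(VectorA->getType())->getNumElements();
if (VectorB)
  assert(VectorA->getType() == VectorB->getType());
assert(NumElems % (ReductionFactor * Shards) == 0);
```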
// VectorB: <b0, b1, b2, b3, b4, b5, b6, b7>
// ReductionFactor: 2
// Shards: 2
// then a and be each have 2 "shards", resulting in the output being
Copilot AI, Nov 14, 2025
There's a typo in the comment: "be each have 2 'shards'" should be "b each have 2 'shards'" or more clearly "a and b each have 2 'shards'".
Suggested change:
- // then a and be each have 2 "shards", resulting in the output being
+ // then a and b each have 2 "shards", resulting in the output being
These horizontal add/sub instructions are currently handled by adding/subtracting tuples of the first operand, followed by tuples of the second operand. This is not the correct semantics for the 256-bit instructions: they process the first half of the first operand, then the first half of the second operand, then the second half of the first operand, and finally the second half of the second operand (trust me bro [*]). This patch fixes the issue by applying the "shards" functionality that was added in #167954, to handle the top and bottom 128-bit "shards" in turn.

[*] clang/test/CodeGen/X86/avx2-builtins.c:
```
TEST_CONSTEXPR(match_v8si(_mm256_hadd_epi32(
    (__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80},
    (__m256i)(__v8si){5, 15, 25, 35, 45, 55, 65, 75}),
    30, 70, 20, 60, 110, 150, 100, 140));
```
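For reference, a small standalone check of that half-ordering using the public `_mm256_hadd_epi32` intrinsic; this assumes an AVX2-capable host and compilation with `-mavx2`, and is not part of either patch.

```c++
#include <cstdio>
#include <immintrin.h>

int main() {
  // Same inputs as the constexpr test quoted above.
  __m256i A = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
  __m256i B = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
  __m256i R = _mm256_hadd_epi32(A, B);

  int Out[8];
  _mm256_storeu_si256(reinterpret_cast<__m256i *>(Out), R);

  // Expected: 30 70 20 60 110 150 100 140
  //   = first half of A, first half of B, second half of A, second half of B.
  for (int V : Out)
    printf("%d ", V);
  printf("\n");
  return 0;
}
```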