Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions compiler-rt/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,12 @@ if(NOT COMPILER_RT_HAS_FUNC_SYMBOL)
add_definitions(-D__func__=__FUNCTION__)
endif()

# Surface the LLVM_MSAN_SHADOW_OFFSET_2MB CMake option to the sanitizer
# runtime sources as an explicit 0/1 preprocessor macro, so msan.h can select
# the shifted shadow layout with a plain `#elif ... == 1` check.
if (NOT LLVM_MSAN_SHADOW_OFFSET_2MB)
  add_definitions(-DLLVM_MSAN_SHADOW_OFFSET_2MB=0)
else()
  add_definitions(-DLLVM_MSAN_SHADOW_OFFSET_2MB=1)
endif()

# Provide some common commandline flags for Sanitizer runtimes.
if("${ANDROID_API_LEVEL}" GREATER_EQUAL 29)
list(APPEND SANITIZER_COMMON_CFLAGS -fno-emulated-tls)
Expand Down
71 changes: 71 additions & 0 deletions compiler-rt/lib/msan/msan.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,77 @@ const MappingDesc kMemoryLayout[] = {
#define MEM_TO_SHADOW(mem) (LINEARIZE_MEM((mem)) + 0x100000000000ULL)
#define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x280000000000)

#elif LLVM_MSAN_SHADOW_OFFSET_2MB == 1

# if SANITIZER_NETBSD || (SANITIZER_LINUX && SANITIZER_WORDSIZE == 64)

// Offset applied to shadow addresses to avoid cache line conflicts on AMD Zen
// (on Intel it is not required, but does not harm).
//
// Problem: AMD Zen's 32 KiB L1D cache is 8-way associative with 64-byte lines
// (64 sets) [1]. Addresses are partitioned as:
//
//   | tag  |  set_index (bits 6-11)  |  offset (bits 0-5) |
//            ^^^^^^^^^^^^^^^^^^^^^      ^^^^^^^^^^^^^^^^^
//             6 bits set index          6 bits index in set
//
// Without offset, app memory and its shadow share the same set_index (bits
// 6-11) but have different tags. Normally, 8-way associativity accommodates
// both lines in the same set without conflict.
//
// However, AMD Zen uses a Linear address μtag/way-predictor (derived from
// addr[12:27], see [2]) to save power by predicting which way to check instead
// of searching all 8 ways. Since XOR preserves addr[0:43], shadow and app share
// identical μtags, causing them to predict the same way, effectively degrading
// the 8-way cache to direct-mapped (20x slowdown).
//
// Solution: Adding 2MB offset (bit 21) changes the μtag while maintaining 2MB
// page alignment for mmap.
//
// [1]: https://lsferreira.net/public/knowledge-base/x86/upos/amd_zen+.PDF
// [2]: https://inria.hal.science/hal-02866777/document
//
// NOTE: must stay in sync with ShadowBase in the compiler's
// Linux_X86_64_MemoryMapParams / NetBSD_X86_64_MemoryMapParams
// (llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp), which also
// hardcode 0x200000 when LLVM_MSAN_SHADOW_OFFSET_2MB == 1.
const unsigned long kShadowOffset = 0x200000ULL;
// All of the following configurations are supported.
// ASLR disabled: main executable and DSOs at 0x555550000000
// PIE and ASLR: main executable and DSOs at 0x7f0000000000
// non-PIE: main executable below 0x100000000, DSOs at 0x7f0000000000
// Heap at 0x700000000000.
//
// This is the default x86_64 Linux/NetBSD layout with every SHADOW/ORIGIN
// region slid up by kShadowOffset; app regions that border a shadow region
// shrink by kShadowOffset on the affected side, and the reclaimed slots are
// marked INVALID ("gap") so the app can never own an address whose shadow
// would fall outside a SHADOW region.
const MappingDesc kMemoryLayout[] = {
    {0x000000000000ULL, 0x010000000000ULL - kShadowOffset, MappingDesc::APP,
     "app-1"},
    {0x010000000000ULL - kShadowOffset, 0x010000000000ULL + kShadowOffset,
     MappingDesc::INVALID, "gap"},
    {0x010000000000ULL + kShadowOffset, 0x100000000000ULL + kShadowOffset,
     MappingDesc::SHADOW, "shadow-2"},
    {0x100000000000ULL + kShadowOffset, 0x110000000000ULL + kShadowOffset,
     MappingDesc::INVALID, "invalid"},
    {0x110000000000ULL + kShadowOffset, 0x200000000000ULL + kShadowOffset,
     MappingDesc::ORIGIN, "origin-2"},
    {0x200000000000ULL + kShadowOffset, 0x300000000000ULL + kShadowOffset,
     MappingDesc::SHADOW, "shadow-3"},
    {0x300000000000ULL + kShadowOffset, 0x400000000000ULL + kShadowOffset,
     MappingDesc::ORIGIN, "origin-3"},
    {0x400000000000ULL + kShadowOffset, 0x500000000000ULL + kShadowOffset,
     MappingDesc::INVALID, "invalid"},
    {0x500000000000ULL + kShadowOffset, 0x510000000000ULL, MappingDesc::SHADOW,
     "shadow-1"},
    {0x510000000000ULL, 0x600000000000ULL - kShadowOffset, MappingDesc::APP,
     "app-2"},
    {0x600000000000ULL - kShadowOffset, 0x600000000000ULL + kShadowOffset,
     MappingDesc::INVALID, "gap"},
    {0x600000000000ULL + kShadowOffset, 0x610000000000ULL, MappingDesc::ORIGIN,
     "origin-1"},
    {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"},
    {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
    {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
// Same XOR as the default layout, then slide by kShadowOffset. The XOR leaves
// addr[0:43] intact (which is why app and shadow would otherwise share a
// μtag); the added bit 21 is what differentiates them.
# define MEM_TO_SHADOW(mem) \
    ((((uptr)(mem)) ^ 0x500000000000ULL) + kShadowOffset)
// Origin is a fixed +0x100000000000 above shadow; since the input already
// carries kShadowOffset, origin regions end up shifted by it as well (matching
// the origin-* entries in kMemoryLayout above).
# define SHADOW_TO_ORIGIN(mem) (((uptr)(mem)) + 0x100000000000ULL)

# else
#  error LLVM_MSAN_SHADOW_OFFSET_2MB is applicable only for x86_64 NetBSD/Linux
# endif

#elif SANITIZER_NETBSD || (SANITIZER_LINUX && SANITIZER_WORDSIZE == 64)

// All of the following configurations are supported.
Expand Down
2 changes: 2 additions & 0 deletions llvm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,8 @@ foreach(proj ${LLVM_ENABLE_PROJECTS})
endif()
endforeach()

# Opt-in, compile-time switch: shifts the MSan shadow mapping by 2 MiB to
# defeat the L1D way-predictor aliasing on AMD Zen. The value is forwarded to
# external-project builds (see LLVMExternalProjectUtils.cmake) and turned into
# the LLVM_MSAN_SHADOW_OFFSET_2MB macro in both the compiler-rt runtime and
# LLVMInstrumentation — the two must be built with the same setting.
option(LLVM_MSAN_SHADOW_OFFSET_2MB "Add 2MB offset to MSan shadow to avoid AMD Zen cache conflicts" OFF)

# Select the runtimes to build
#
# As we migrate runtimes to using the bootstrapping build, the set of default runtimes
Expand Down
1 change: 1 addition & 0 deletions llvm/cmake/modules/LLVMExternalProjectUtils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,7 @@ function(llvm_ExternalProject_Add name source_dir)
-DLLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO=${LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO}
-DLLVM_USE_RELATIVE_PATHS_IN_FILES=${LLVM_USE_RELATIVE_PATHS_IN_FILES}
-DLLVM_LIT_ARGS=${LLVM_LIT_ARGS}
-DLLVM_MSAN_SHADOW_OFFSET_2MB=${LLVM_MSAN_SHADOW_OFFSET_2MB}
-DLLVM_SOURCE_PREFIX=${LLVM_SOURCE_PREFIX}
-DPACKAGE_VERSION=${PACKAGE_VERSION}
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Instrumentation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,9 @@ add_llvm_component_library(LLVMInstrumentation
TransformUtils
ProfileData
)

# Propagate the LLVM_MSAN_SHADOW_OFFSET_2MB option to MemorySanitizer.cpp as a
# 0/1 preprocessor macro. Scoped to the LLVMInstrumentation target (defined
# above in this file) instead of directory-wide add_definitions(), since only
# this component consumes the macro; $<BOOL:...> normalizes any truthy cache
# value (ON/TRUE/1) to 1 and anything falsy to 0.
target_compile_definitions(LLVMInstrumentation
  PRIVATE LLVM_MSAN_SHADOW_OFFSET_2MB=$<BOOL:${LLVM_MSAN_SHADOW_OFFSET_2MB}>)
10 changes: 10 additions & 0 deletions llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,8 +443,13 @@ static const MemoryMapParams Linux_I386_MemoryMapParams = {
static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
0, // AndMask (not used)
0x500000000000, // XorMask
#if LLVM_MSAN_SHADOW_OFFSET_2MB == 1
0x200000, // ShadowBase (== kShadowOffset)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to encode the change using only XorMask, without modifying ShadowBase?

When ShadowBase == 0, the compiler is able to optimize it out. A non-zero shadow base requires an extra instruction for every shadow calculation e.g, https://github.com/llvm/llvm-project/pull/171993/files#diff-91f63a475808bb3cc923763edeb8a60749a879ea775ce1ebcc31af6c867f91e7:

  ; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080

- ; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+ ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 2097152
+ ; CHECK-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP5]] to ptr

I suspect if the end addresses in kMemoryLayout were closed (e.g., 0x010000000000ULL - 1) instead of open, then it would be easy to do it entirely with XOR (though there might be other cleanup needed in MSan to handle closed end addresses).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don’t think this is something that can be resolved simply by using a closed end (i.e. mem2shadow(end - 1)).

If we directly embed kShadowOffset into the XOR mask, as follows:

#  define MEM_TO_SHADOW(mem)  (((uptr)(mem)) ^ (0x500000000000ULL + kShadowOffset))

Currently, kShadowOffset is 2 MiB, so this change would cause the lower 2 MiB and the upper 2 MiB within each 4 MiB shadow region to be swapped.

This may already be sufficient to mitigate the linear mapping issue (the mapping is linear within each 2 MiB region, but it is still NOT linear overall).

This kind of non-linearity would require special handling for every contiguous copy, which is quite tricky.
For example, with memcpy(dst, src, n), we would no longer be able to directly apply the same operation to the shadow as memcpy(mem2shadow(dst), mem2shadow(src), n), because the shadow corresponding to src may not be the start of the shadow range for the entire [src, src + n) region.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also AFAICS it is free (or almost free), since the real asm looks like (with -O3)

movq   $0x0, 0x200000(%rax)

Instead of

movq   $0x0, (%rax)

And I've also tested the performance on Intel, it looks the same - https://pastila.nl/?00b2e9d6/e11d914e0d9309f3444c4b244a9fa469#Ydv8nM20XBa2PJAPq+I6mQ==

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Camsyn Thanks, good point!

@azat The machine code is longer if a non-zero immediate is needed:

48 c7 80 00 00 20 00 00 00 00 00 mov    QWORD PTR [rax+0x200000],0x0
vs.
48 c7 00 00 00 00 00             mov    QWORD PTR [rax],0x0

c6 80 00 00 20 00 00             mov    BYTE PTR [rax+0x200000],0x0
vs.
19: c6 00 00                     mov    BYTE PTR [rax],0x0

Even if the CPU could execute both forms at the same speed, on the same execution ports, it is still increasing code size, icache pressure, etc.

Although it's probably not a huge impact, it would still be hard to justify making codegen worse for all x86 targets when the upside is only for a subset of Zen processors.

I do want MSan to work well for Zen as well, so how about a compile-time macro (when compiling MSan itself, not the target app) that enables the 2MB offset?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was also worried about the overhead for other CPUs (since I've tested only a few), so making it a compile-time switch makes sense.

It would be nice to have a test for this, but since it is a compile-time switch it will require a separate CI job (and separate tests — at least one test, maybe). What do you think about this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean I can make tests depend on this compile-time switch, but that is unlikely to be desired, and one simple test will be sufficient for now.

0x100000200000, // OriginBase
#else
0, // ShadowBase (not used)
0x100000000000, // OriginBase
#endif
};

// mips32 Linux
Expand Down Expand Up @@ -531,8 +536,13 @@ static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
// x86_64 NetBSD mapping parameters. When the 2MB shadow offset is enabled,
// ShadowBase must equal the runtime's kShadowOffset (0x200000, declared in
// compiler-rt/lib/msan/msan.h) and OriginBase shifts by the same amount;
// compiler-emitted shadow/origin addresses and the runtime's kMemoryLayout
// must agree, so keep these literals in sync with msan.h.
static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = {
    0,              // AndMask
    0x500000000000, // XorMask
#if LLVM_MSAN_SHADOW_OFFSET_2MB == 1
    0x200000,       // ShadowBase (== kShadowOffset)
    0x100000200000, // OriginBase
#else
    0,              // ShadowBase
    0x100000000000, // OriginBase
#endif
};

static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
Expand Down