-
Notifications
You must be signed in to change notification settings - Fork 15.3k
Description
Given the following code:
// Number of ints held by `array`; the report varies this value to show how the generated code changes.
constexpr auto size = 45;
// Aggregate wrapping a fixed-size int array; the functions below take it by value.
struct array {
int m[size];
};
// Element-wise equality of two `array` values taken by value.
// Returns false at the first index whose elements differ, true when all `size` elements match.
auto equal_impl(
array const lhs,
array const rhs
) -> bool {
for (int n = 0; n != size; ++n) {
if (lhs.m[n] != rhs.m[n]) {
return false;
}
}
return true;
}
auto equal(
array const lhs,
array const rhs
) -> bool {
return equal_impl(lhs, rhs);
}

Then everything seems fine when compiled with -O3:
# size == 45: the whole comparison loop is emitted as one memcmp call.
equal_impl(array, array):
push rax
# rdi / rsi = stack addresses of the two blocks handed to memcmp
lea rdi, [rsp + 16]
lea rsi, [rsp + 200]
# edx = 180 bytes to compare (45 four-byte elements)
mov edx, 180
call memcmp@PLT
# al = 1 exactly when memcmp returned 0
test eax, eax
sete al
pop rcx
ret
equal(array, array):
jmp equal_impl(array, array)

But when 5 <= size <= 44, everything does not seem fine. For instance, size == 5 generates:
# size == 5: each operand is compared as a 16-byte vector plus a 4-byte tail.
equal_impl(array, array):
# 16-byte head of one operand, then the 4-byte tails of both operands
movdqu xmm0, xmmword ptr [rsp + 32]
movd xmm1, dword ptr [rsp + 24]
movd xmm2, dword ptr [rsp + 48]
# byte-wise equality masks: tails in xmm2, heads in xmm0
pcmpeqb xmm2, xmm1
pcmpeqb xmm0, xmmword ptr [rsp + 8]
# combine both masks; a byte stays 0xFF only if it matched in both compares
pand xmm0, xmm2
# 65535 = all 16 mask bits set, i.e. every compared byte was equal
pmovmskb eax, xmm0
cmp eax, 65535
sete al
ret
equal(array, array):
movdqu xmm0, xmmword ptr [rsp + 32]
pcmpeqd xmm0, xmmword ptr [rsp + 8]
movmskps eax, xmm0
xor eax, 15
sete cl
mov eax, dword ptr [rsp + 48]
cmp dword ptr [rsp + 24], eax
sete al
and al, cl
ret

equal inlines equal_impl but then generates different code for the same comparison, which is weird.
size == 8:
# size == 8: two 16-byte byte-wise compares, merged into a single mask test.
equal_impl(array, array):
# compare the 16 bytes at rsp+24 with those at rsp+56
movdqa xmm0, xmmword ptr [rsp + 24]
pcmpeqb xmm0, xmmword ptr [rsp + 56]
# compare the 16 bytes at rsp+8 with those at rsp+40
movdqa xmm1, xmmword ptr [rsp + 8]
pcmpeqb xmm1, xmmword ptr [rsp + 40]
# combine both equality masks
pand xmm1, xmm0
# 65535 = all 16 mask bits set, i.e. every compared byte was equal
pmovmskb eax, xmm1
cmp eax, 65535
sete al
ret
equal(array, array):
movdqa xmm0, xmmword ptr [rsp + 40]
pcmpeqd xmm0, xmmword ptr [rsp + 8]
movmskps ecx, xmm0
xor eax, eax
xor ecx, 15
jne .LBB1_5
movq xmm0, qword ptr [rsp + 24]
movq xmm1, qword ptr [rsp + 56]
pcmpeqd xmm1, xmm0
pshufd xmm0, xmm1, 80
movmskpd ecx, xmm0
test cl, 1
je .LBB1_5
shr cl
je .LBB1_5
mov ecx, dword ptr [rsp + 64]
cmp dword ptr [rsp + 32], ecx
jne .LBB1_5
lea rax, [rsp + 40]
lea rcx, [rsp + 8]
mov ecx, dword ptr [rcx + 28]
cmp ecx, dword ptr [rax + 28]
sete al
.LBB1_5:
ret

size == 44:
# size == 44: the standalone equal_impl is emitted as one memcmp call.
equal_impl(array, array):
push rax
# rdi / rsi = stack addresses of the two blocks handed to memcmp
lea rdi, [rsp + 16]
lea rsi, [rsp + 192]
# edx = 176 bytes to compare (44 four-byte elements)
mov edx, 176
call memcmp@PLT
# al = 1 exactly when memcmp returned 0
test eax, eax
sete al
pop rcx
ret
equal(array, array):
movdqa xmm2, xmmword ptr [rsp + 296]
movdqa xmm3, xmmword ptr [rsp + 280]
movdqa xmm4, xmmword ptr [rsp + 264]
movdqa xmm0, xmmword ptr [rsp + 248]
movdqa xmm1, xmmword ptr [rsp + 184]
movdqa xmm5, xmmword ptr [rsp + 200]
movdqa xmm6, xmmword ptr [rsp + 216]
movdqa xmm7, xmmword ptr [rsp + 232]
pcmpeqd xmm2, xmmword ptr [rsp + 120]
pcmpeqd xmm3, xmmword ptr [rsp + 104]
pcmpeqd xmm4, xmmword ptr [rsp + 88]
packssdw xmm3, xmm2
pcmpeqd xmm0, xmmword ptr [rsp + 72]
packssdw xmm0, xmm4
pcmpeqd xmm7, xmmword ptr [rsp + 56]
pcmpeqd xmm6, xmmword ptr [rsp + 40]
packsswb xmm0, xmm3
packssdw xmm6, xmm7
pcmpeqd xmm5, xmmword ptr [rsp + 24]
packsswb xmm2, xmm6
pcmpeqd xmm1, xmmword ptr [rsp + 8]
packssdw xmm1, xmm5
movdqa xmm3, xmmword ptr [rsp + 328]
movdqa xmm4, xmmword ptr [rsp + 312]
pcmpeqd xmm3, xmmword ptr [rsp + 152]
pcmpeqd xmm4, xmmword ptr [rsp + 136]
packssdw xmm4, xmm3
pand xmm4, xmm1
psrlw xmm4, 8
packuswb xmm1, xmm4
punpckhqdq xmm1, xmm2
psllw xmm1, 7
pand xmm1, xmm0
pmovmskb ecx, xmm1
xor eax, eax
cmp ecx, 65535
jne .LBB1_5
movq xmm0, qword ptr [rsp + 168]
movq xmm1, qword ptr [rsp + 344]
pcmpeqd xmm1, xmm0
pshufd xmm0, xmm1, 80
movmskpd ecx, xmm0
test cl, 1
je .LBB1_5
shr cl
je .LBB1_5
mov ecx, dword ptr [rsp + 352]
cmp dword ptr [rsp + 176], ecx
jne .LBB1_5
lea rax, [rsp + 184]
lea rcx, [rsp + 8]
mov ecx, dword ptr [rcx + 172]
cmp ecx, dword ptr [rax + 172]
sete al
.LBB1_5:
ret

If I declare equal_impl as static, then we stop emitting code for equal_impl (as expected), and the generated code for equal does not change until we reach size >= 45 (the cut-off point where, without static, equal would be just a jmp). From there, it continues its pattern of adding more and more instructions up to size == 59 (related to #167389 for this exact code pattern in the implementation). However, at size == 60 both versions start generating the same code again (and it's really bad: equal has calls to memcpy):
# size == 60: scalar loop comparing one 4-byte element per iteration.
equal_impl(array, array):
# rax / rcx = base addresses of the two blocks being compared
lea rax, [rsp + 248]
lea rcx, [rsp + 8]
# rdx = element index, starting at 0
xor edx, edx
.LBB0_1:
mov esi, dword ptr [rcx + 4*rdx]
mov edi, dword ptr [rax + 4*rdx]
cmp esi, edi
# leave the loop on the first mismatch
jne .LBB0_3
# lea leaves the flags untouched, so the jne below still tests the compare against 59 (the last index)
cmp rdx, 59
lea rdx, [rdx + 1]
jne .LBB0_1
.LBB0_3:
# al = 1 when the last pair loaded compared equal
cmp esi, edi
sete al
ret
equal(array, array):
sub rsp, 488
lea rdi, [rsp + 248]
lea rsi, [rsp + 496]
mov edx, 240
call memcpy@PLT
lea rdi, [rsp + 8]
lea rsi, [rsp + 736]
mov edx, 240
call memcpy@PLT
xor eax, eax
.LBB1_1:
mov ecx, dword ptr [rsp + 4*rax + 248]
mov edx, dword ptr [rsp + 4*rax + 8]
cmp ecx, edx
jne .LBB1_3
cmp rax, 59
lea rax, [rax + 1]
jne .LBB1_1
.LBB1_3:
cmp ecx, edx
sete al
add rsp, 488
ret

See it live: https://godbolt.org/z/nPndKvdPn
Note that the examples used here are similar to those in my other recent bug reports around memcmp, but other code patterns in the impl function cause the same behavior, with varying complexity of the code and size of the data. The underlying issue here is that inlined code is not optimized properly.