Skip to content

[clang] Different code generated for function vs inlined version of same function, including memcpy into inlined parameter #167412

@davidstone

Description

@davidstone

Given the following code:

constexpr auto size = 45;

struct array {
	int m[size];
};

auto equal_impl(
	array const lhs,
	array const rhs
) -> bool {
	for (int n = 0; n != size; ++n) {
		if (lhs.m[n] != rhs.m[n]) {
			return false;
		}
	}
	return true;
}

auto equal(
	array const lhs,
	array const rhs
) -> bool {
	return equal_impl(lhs, rhs);
}

then everything seems fine when compiled with -O3:

equal_impl(array, array):
        push    rax
        lea     rdi, [rsp + 16]
        lea     rsi, [rsp + 200]
        mov     edx, 180
        call    memcmp@PLT
        test    eax, eax
        sete    al
        pop     rcx
        ret

equal(array, array):
        jmp     equal_impl(array, array)

But when 5 <= size <= 44, the output is not fine. For instance, size == 5 generates:

equal_impl(array, array):
        movdqu  xmm0, xmmword ptr [rsp + 32]
        movd    xmm1, dword ptr [rsp + 24]
        movd    xmm2, dword ptr [rsp + 48]
        pcmpeqb xmm2, xmm1
        pcmpeqb xmm0, xmmword ptr [rsp + 8]
        pand    xmm0, xmm2
        pmovmskb        eax, xmm0
        cmp     eax, 65535
        sete    al
        ret

equal(array, array):
        movdqu  xmm0, xmmword ptr [rsp + 32]
        pcmpeqd xmm0, xmmword ptr [rsp + 8]
        movmskps        eax, xmm0
        xor     eax, 15
        sete    cl
        mov     eax, dword ptr [rsp + 48]
        cmp     dword ptr [rsp + 24], eax
        sete    al
        and     al, cl
        ret

The compiler inlines equal_impl into equal, but the inlined copy is compiled to different code than the standalone equal_impl, which is unexpected.

size == 8:

equal_impl(array, array):
        movdqa  xmm0, xmmword ptr [rsp + 24]
        pcmpeqb xmm0, xmmword ptr [rsp + 56]
        movdqa  xmm1, xmmword ptr [rsp + 8]
        pcmpeqb xmm1, xmmword ptr [rsp + 40]
        pand    xmm1, xmm0
        pmovmskb        eax, xmm1
        cmp     eax, 65535
        sete    al
        ret

equal(array, array):
        movdqa  xmm0, xmmword ptr [rsp + 40]
        pcmpeqd xmm0, xmmword ptr [rsp + 8]
        movmskps        ecx, xmm0
        xor     eax, eax
        xor     ecx, 15
        jne     .LBB1_5
        movq    xmm0, qword ptr [rsp + 24]
        movq    xmm1, qword ptr [rsp + 56]
        pcmpeqd xmm1, xmm0
        pshufd  xmm0, xmm1, 80
        movmskpd        ecx, xmm0
        test    cl, 1
        je      .LBB1_5
        shr     cl
        je      .LBB1_5
        mov     ecx, dword ptr [rsp + 64]
        cmp     dword ptr [rsp + 32], ecx
        jne     .LBB1_5
        lea     rax, [rsp + 40]
        lea     rcx, [rsp + 8]
        mov     ecx, dword ptr [rcx + 28]
        cmp     ecx, dword ptr [rax + 28]
        sete    al
.LBB1_5:
        ret

size == 44:

equal_impl(array, array):
        push    rax
        lea     rdi, [rsp + 16]
        lea     rsi, [rsp + 192]
        mov     edx, 176
        call    memcmp@PLT
        test    eax, eax
        sete    al
        pop     rcx
        ret

equal(array, array):
        movdqa  xmm2, xmmword ptr [rsp + 296]
        movdqa  xmm3, xmmword ptr [rsp + 280]
        movdqa  xmm4, xmmword ptr [rsp + 264]
        movdqa  xmm0, xmmword ptr [rsp + 248]
        movdqa  xmm1, xmmword ptr [rsp + 184]
        movdqa  xmm5, xmmword ptr [rsp + 200]
        movdqa  xmm6, xmmword ptr [rsp + 216]
        movdqa  xmm7, xmmword ptr [rsp + 232]
        pcmpeqd xmm2, xmmword ptr [rsp + 120]
        pcmpeqd xmm3, xmmword ptr [rsp + 104]
        pcmpeqd xmm4, xmmword ptr [rsp + 88]
        packssdw        xmm3, xmm2
        pcmpeqd xmm0, xmmword ptr [rsp + 72]
        packssdw        xmm0, xmm4
        pcmpeqd xmm7, xmmword ptr [rsp + 56]
        pcmpeqd xmm6, xmmword ptr [rsp + 40]
        packsswb        xmm0, xmm3
        packssdw        xmm6, xmm7
        pcmpeqd xmm5, xmmword ptr [rsp + 24]
        packsswb        xmm2, xmm6
        pcmpeqd xmm1, xmmword ptr [rsp + 8]
        packssdw        xmm1, xmm5
        movdqa  xmm3, xmmword ptr [rsp + 328]
        movdqa  xmm4, xmmword ptr [rsp + 312]
        pcmpeqd xmm3, xmmword ptr [rsp + 152]
        pcmpeqd xmm4, xmmword ptr [rsp + 136]
        packssdw        xmm4, xmm3
        pand    xmm4, xmm1
        psrlw   xmm4, 8
        packuswb        xmm1, xmm4
        punpckhqdq      xmm1, xmm2
        psllw   xmm1, 7
        pand    xmm1, xmm0
        pmovmskb        ecx, xmm1
        xor     eax, eax
        cmp     ecx, 65535
        jne     .LBB1_5
        movq    xmm0, qword ptr [rsp + 168]
        movq    xmm1, qword ptr [rsp + 344]
        pcmpeqd xmm1, xmm0
        pshufd  xmm0, xmm1, 80
        movmskpd        ecx, xmm0
        test    cl, 1
        je      .LBB1_5
        shr     cl
        je      .LBB1_5
        mov     ecx, dword ptr [rsp + 352]
        cmp     dword ptr [rsp + 176], ecx
        jne     .LBB1_5
        lea     rax, [rsp + 184]
        lea     rcx, [rsp + 8]
        mov     ecx, dword ptr [rcx + 172]
        cmp     ecx, dword ptr [rax + 172]
        sete    al
.LBB1_5:
        ret

If I declare equal_impl as static, then we stop emitting code for equal_impl (as expected), and the generated code for equal does not change until we get to size >= 45 (the cut-off point where, without static, equal would be just a jmp). From there, equal continues its pattern of adding more and more instructions up to size == 59 (related to #167389 for this exact code pattern in the implementation). However, at size == 60 both versions (with and without static) start generating the same code for equal again, and it's really bad: it has calls to memcpy.

equal_impl(array, array):
        lea     rax, [rsp + 248]
        lea     rcx, [rsp + 8]
        xor     edx, edx
.LBB0_1:
        mov     esi, dword ptr [rcx + 4*rdx]
        mov     edi, dword ptr [rax + 4*rdx]
        cmp     esi, edi
        jne     .LBB0_3
        cmp     rdx, 59
        lea     rdx, [rdx + 1]
        jne     .LBB0_1
.LBB0_3:
        cmp     esi, edi
        sete    al
        ret

equal(array, array):
        sub     rsp, 488
        lea     rdi, [rsp + 248]
        lea     rsi, [rsp + 496]
        mov     edx, 240
        call    memcpy@PLT
        lea     rdi, [rsp + 8]
        lea     rsi, [rsp + 736]
        mov     edx, 240
        call    memcpy@PLT
        xor     eax, eax
.LBB1_1:
        mov     ecx, dword ptr [rsp + 4*rax + 248]
        mov     edx, dword ptr [rsp + 4*rax + 8]
        cmp     ecx, edx
        jne     .LBB1_3
        cmp     rax, 59
        lea     rax, [rax + 1]
        jne     .LBB1_1
.LBB1_3:
        cmp     ecx, edx
        sete    al
        add     rsp, 488
        ret

See it live: https://godbolt.org/z/nPndKvdPn

Note that the examples used in here are similar to my other recent bug reports around memcmp, but other code patterns in the impl function cause the same behavior with varying complexity of the code and size of the data. The underlying issue here is that inlined code is not optimized properly.

Metadata

Metadata

Assignees

No one assigned

    Labels

    clang (Clang issues not falling into any other category), llvm:optimizations

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions