aarch64 unconditionally spills when x86-64 does not #82252

Open
jrmuizel opened this issue Feb 19, 2024 · 2 comments

@jrmuizel

Given the following code:

extern void foo();
struct Vertex {
    float x;
    float y;
    float c;
};

class Foo {
    void write_vertex(float x1, float y1, float c1, float x2, float y2, float c2, float x3, float y3, float c3);
    Vertex *ptr;
};

int blue;

void Foo::write_vertex(float x1, float y1, float c1, float x2, float y2, float c2, float x3, float y3, float c3) {
    Vertex v1 = {x1, y1, c1};
    Vertex v2 = {x2, y2, c2};
    Vertex v3 = {x3, y3, c3};
    if (blue) {
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    } else {
        foo();
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    }
}
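
For reference, output like the one below can presumably be reproduced with a recent clang at -O2; the exact invocation (and the placeholder file name) is an assumption on my part, not something stated in the report:

clang++ -O2 -S --target=aarch64-linux-gnu write_vertex.cpp -o -
clang++ -O2 -S --target=x86_64-linux-gnu write_vertex.cpp -o -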

The generated x86-64 code only stores to the stack on the path that calls foo(); the aarch64 code, however, always spills:

Foo::write_vertex(float, float, float, float, float, float, float, float, float):        // @Foo::write_vertex(float, float, float, float, float, float, float, float, float)
        stp     d15, d14, [sp, #-96]!           // 16-byte Folded Spill
        stp     d13, d12, [sp, #16]             // 16-byte Folded Spill
        stp     d11, d10, [sp, #32]             // 16-byte Folded Spill
        stp     d9, d8, [sp, #48]               // 16-byte Folded Spill
        stp     x29, x30, [sp, #64]             // 16-byte Folded Spill
        str     x19, [sp, #80]                  // 8-byte Folded Spill
        add     x29, sp, #64
        fmov    s9, s6
        fmov    s10, s5
        adrp    x8, blue
        fmov    s11, s4
        fmov    s12, s3
        ldr     w8, [x8, :lo12:blue]
        fmov    s13, s2
        fmov    s14, s1
        ldr     s8, [x29, #32]
        fmov    s15, s0
        mov     x19, x0
        cbnz    w8, .LBB0_2
        str     s7, [x29, #28]                  // 4-byte Folded Spill
        bl      foo()
        ldr     s7, [x29, #28]                  // 4-byte Folded Reload
.LBB0_2:
        ldr     x8, [x19]
        stp     s15, s14, [x8]
        stp     s13, s12, [x8, #8]
        stp     s11, s10, [x8, #16]
        stp     s9, s7, [x8, #36]
        str     s8, [x8, #44]
        ldp     x29, x30, [sp, #64]             // 16-byte Folded Reload
        ldr     x19, [sp, #80]                  // 8-byte Folded Reload
        ldp     d9, d8, [sp, #48]               // 16-byte Folded Reload
        ldp     d11, d10, [sp, #32]             // 16-byte Folded Reload
        ldp     d13, d12, [sp, #16]             // 16-byte Folded Reload
        ldp     d15, d14, [sp], #96             // 16-byte Folded Reload
        ret
blue:
        .word   0                
@llvmbot

llvmbot commented Feb 19, 2024

@llvm/issue-subscribers-backend-aarch64

Author: Jeff Muizelaar (jrmuizel)


@efriedma-quic

efriedma-quic commented Feb 20, 2024

In general, the register allocator doesn't model the cost of the prologue/epilogue, or the cost of shrink-wrapping. It just minimizes the number of stores it generates... and the way to do that is to use registers that aren't clobbered by the call, i.e. callee-save registers. (Most targets tailor the register allocation order to prefer caller-save registers, which covers the simplest cases.)
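
As a minimal illustration of that trade-off (hypothetical names, not taken from the testcase above): a value that is live across a call on only one path can either sit in a callee-saved register, which costs a save/restore in the prologue/epilogue on every path, or sit in a caller-saved register, which costs a spill/reload around the call on the calling path only.

extern void bar();      // hypothetical callee, stands in for foo()
extern void use(int);   // hypothetical consumer of the value

int cond;

void keep_alive(int v) {
    if (!cond)
        bar();  // clobbers every caller-saved register, so 'v' must survive it:
                // either in a callee-saved register (prologue/epilogue cost on
                // both paths) or via a spill/reload around this call (cost on
                // this path only)
    use(v);     // 'v' is still needed after the join point on both paths
}

Which of the two is cheaper depends on how often the calling path actually runs, which is exactly the cost the allocator doesn't model.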

The only reason your testcase gets the desired code on x86-64 is that the x86-64 calling convention doesn't have any floating-point callee-saved registers, so the register allocator is forced to spill around the call instead. If you replace "float" with "int", it does the same thing as AArch64.
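
For concreteness, here is that substitution written out (types renamed only so the sketch can stand on its own; the exact codegen of course depends on compiler version and flags):

// The same testcase with every float replaced by int. x86-64 does have
// callee-saved integer registers (rbx, rbp, r12-r15), so the allocator can
// keep the values in them across the call, and both paths then pay for the
// prologue/epilogue saves, just like the AArch64 float version above.
extern void foo();

struct VertexInt {
    int x;
    int y;
    int c;
};

class FooInt {
    void write_vertex(int x1, int y1, int c1, int x2, int y2, int c2, int x3, int y3, int c3);
    VertexInt *ptr;
};

int blue_int;

void FooInt::write_vertex(int x1, int y1, int c1, int x2, int y2, int c2, int x3, int y3, int c3) {
    VertexInt v1 = {x1, y1, c1};
    VertexInt v2 = {x2, y2, c2};
    VertexInt v3 = {x3, y3, c3};
    if (blue_int) {
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    } else {
        foo();
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    }
}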

It's probably not worth modifying the register allocator for this; modifications are hard (both in terms of the actual code, and the difficulty of review/testing/benchmarking/etc.). The only other solution I can see is some form of tail-dup; if you duplicate the stores, the register allocator should do the right thing.
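
A rough source-level model of what that duplication buys (hypothetical names; this only illustrates the shape of the backend transform, not a source-level fix, since the original testcase already writes the stores twice and they get merged back together):

extern void bar();      // hypothetical call, stands in for foo()
extern void use(int);   // hypothetical consumer of the value

int cond2;

// Shape the register allocator effectively sees today: the use is merged
// after the branches join, so 'v' must survive the call even on the path
// that never makes one, and a callee-saved register (with its prologue/
// epilogue cost on both paths) looks like the cheapest way to achieve that.
void merged(int v) {
    if (!cond2)
        bar();
    use(v);
}

// Shape after tail duplication: each path keeps its own copy of the use, so
// the no-call path can read 'v' straight out of its argument register and
// only the path that calls bar() has any reason to spill.
void duplicated(int v) {
    if (cond2) {
        use(v);
    } else {
        bar();
        use(v);
    }
}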
