aarch64 unconditionally spills when x86-64 does not #82252

Open
jrmuizel opened this issue Feb 19, 2024 · 2 comments

@jrmuizel

Given the following code:

extern void foo();
struct Vertex {
    float x;
    float y;
    float c;
};

class Foo {
    void write_vertex(float x1, float y1, float c1, float x2, float y2, float c2, float x3, float y3, float c3);
    Vertex *ptr;
};

int blue;

void Foo::write_vertex(float x1, float y1, float c1, float x2, float y2, float c2, float x3, float y3, float c3) {
    Vertex v1 = {x1, y1, c1};
    Vertex v2 = {x2, y2, c2};
    Vertex v3 = {x3, y3, c3};
    if (blue) {
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    } else {
        foo();
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    }
}
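
For reference, output like the one below can presumably be reproduced with a recent clang at -O2; the exact invocation (and the placeholder file name) is an assumption on my part, not something stated in the report:

clang++ -O2 -S --target=aarch64-linux-gnu write_vertex.cpp -o -
clang++ -O2 -S --target=x86_64-linux-gnu write_vertex.cpp -o -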

The generated x86-64 code only stores to the stack on the path that calls foo(); the aarch64 code, however, always spills:

Foo::write_vertex(float, float, float, float, float, float, float, float, float):        // @Foo::write_vertex(float, float, float, float, float, float, float, float, float)
        stp     d15, d14, [sp, #-96]!           // 16-byte Folded Spill
        stp     d13, d12, [sp, #16]             // 16-byte Folded Spill
        stp     d11, d10, [sp, #32]             // 16-byte Folded Spill
        stp     d9, d8, [sp, #48]               // 16-byte Folded Spill
        stp     x29, x30, [sp, #64]             // 16-byte Folded Spill
        str     x19, [sp, #80]                  // 8-byte Folded Spill
        add     x29, sp, #64
        fmov    s9, s6
        fmov    s10, s5
        adrp    x8, blue
        fmov    s11, s4
        fmov    s12, s3
        ldr     w8, [x8, :lo12:blue]
        fmov    s13, s2
        fmov    s14, s1
        ldr     s8, [x29, #32]
        fmov    s15, s0
        mov     x19, x0
        cbnz    w8, .LBB0_2
        str     s7, [x29, #28]                  // 4-byte Folded Spill
        bl      foo()
        ldr     s7, [x29, #28]                  // 4-byte Folded Reload
.LBB0_2:
        ldr     x8, [x19]
        stp     s15, s14, [x8]
        stp     s13, s12, [x8, #8]
        stp     s11, s10, [x8, #16]
        stp     s9, s7, [x8, #36]
        str     s8, [x8, #44]
        ldp     x29, x30, [sp, #64]             // 16-byte Folded Reload
        ldr     x19, [sp, #80]                  // 8-byte Folded Reload
        ldp     d9, d8, [sp, #48]               // 16-byte Folded Reload
        ldp     d11, d10, [sp, #32]             // 16-byte Folded Reload
        ldp     d13, d12, [sp, #16]             // 16-byte Folded Reload
        ldp     d15, d14, [sp], #96             // 16-byte Folded Reload
        ret
blue:
        .word   0                
@llvmbot

llvmbot commented Feb 19, 2024

@llvm/issue-subscribers-backend-aarch64

Author: Jeff Muizelaar (jrmuizel)


@efriedma-quic

efriedma-quic commented Feb 20, 2024

In general, the register allocator doesn't model the cost of the prologue/epilogue, or the cost of shrink-wrapping. It just minimizes the number of stores it generates... and the way to do that is to use registers that aren't clobbered by the call, i.e. callee-save registers. (Most targets tailor the register allocation order to prefer caller-save registers, which covers the simplest cases.)
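
As a minimal illustration of that trade-off (hypothetical names, not taken from the testcase above): a value that is live across a call on only one path can either sit in a callee-saved register, which costs a save/restore in the prologue/epilogue on every path, or sit in a caller-saved register, which costs a spill/reload around the call on the calling path only.

extern void bar();      // hypothetical callee, stands in for foo()
extern void use(int);   // hypothetical consumer of the value

int cond;

void keep_alive(int v) {
    if (!cond)
        bar();  // clobbers every caller-saved register, so 'v' must survive it:
                // either in a callee-saved register (prologue/epilogue cost on
                // both paths) or via a spill/reload around this call (cost on
                // this path only)
    use(v);     // 'v' is still needed after the join point on both paths
}

Which of the two is cheaper depends on how often the calling path actually runs, which is exactly the cost the allocator doesn't model.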

The only reason your testcase gets the desired code on x86-64 is that the x86-64 calling convention doesn't have any floating-point callee-saved registers, so the register allocator is forced to spill around the call instead. If you replace "float" with "int", it does the same thing as AArch64.
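
For concreteness, here is that substitution written out (types renamed only so the sketch can stand on its own; the exact codegen of course depends on compiler version and flags):

// The same testcase with every float replaced by int. x86-64 does have
// callee-saved integer registers (rbx, rbp, r12-r15), so the allocator can
// keep the values in them across the call, and both paths then pay for the
// prologue/epilogue saves, just like the AArch64 float version above.
extern void foo();

struct VertexInt {
    int x;
    int y;
    int c;
};

class FooInt {
    void write_vertex(int x1, int y1, int c1, int x2, int y2, int c2, int x3, int y3, int c3);
    VertexInt *ptr;
};

int blue_int;

void FooInt::write_vertex(int x1, int y1, int c1, int x2, int y2, int c2, int x3, int y3, int c3) {
    VertexInt v1 = {x1, y1, c1};
    VertexInt v2 = {x2, y2, c2};
    VertexInt v3 = {x3, y3, c3};
    if (blue_int) {
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    } else {
        foo();
        ptr[0] = v1;
        ptr[1] = v2;
        ptr[3] = v3;
    }
}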

It's probably not worth modifying the register allocator for this; modifications are hard (both in terms of the actual code, and the difficulty of review/testing/benchmarking/etc.). The only other solution I can see is some form of tail-dup; if you duplicate the stores, the register allocator should do the right thing.
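
A rough source-level model of what that duplication buys (hypothetical names; this only illustrates the shape of the backend transform, not a source-level fix, since the original testcase already writes the stores twice and they get merged back together):

extern void bar();      // hypothetical call, stands in for foo()
extern void use(int);   // hypothetical consumer of the value

int cond2;

// Shape the register allocator effectively sees today: the use is merged
// after the branches join, so 'v' must survive the call even on the path
// that never makes one, and a callee-saved register (with its prologue/
// epilogue cost on both paths) looks like the cheapest way to achieve that.
void merged(int v) {
    if (!cond2)
        bar();
    use(v);
}

// Shape after tail duplication: each path keeps its own copy of the use, so
// the no-call path can read 'v' straight out of its argument register and
// only the path that calls bar() has any reason to spill.
void duplicated(int v) {
    if (cond2) {
        use(v);
    } else {
        bar();
        use(v);
    }
}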
