Skip to content

Commit

Permalink
CodeGen: Improve lowering of NUM_TO_VEC on A64 for constants (#1194)
Browse files Browse the repository at this point in the history
When the input is a constant, we currently use a fairly inefficient
sequence of fmov+fcvt+dup or, when the double isn't encodable in fmov,
adr+ldr+fcvt+dup.

Instead, we can use the same lowering as X64 when the input is a
constant, and load the vector from memory. However, if the constant is
encodable via fmov, we can use a vector fmov instead (which is just one
instruction and doesn't need constant space).

Fortunately the bit encoding of fmov for 32-bit floating point numbers
matches that of 64-bit: the decoding algorithm is a little different
because it expands into a larger exponent, but the values are
compatible, so if a double can be encoded into a scalar fmov with a
given abcdefgh pattern, the same pattern should encode the same float;
due to the very limited number of mantissa and exponent bits, all values
that are encodable are also exact in both 32-bit and 64-bit floats.

This strategy is roughly the same as what gcc uses. For complex vectors, we
previously used 4 instructions and 8 bytes of constant storage, and now
we use 2 instructions and 16 bytes of constant storage, so the memory
footprint is the same; for simple vectors we just need 1 instruction (4
bytes).

clang lowers vector constants a little differently, opting to synthesize
a 64-bit integer using 4 instructions (mov/movk) and then move it to the
vector register - this requires 5 instructions and 20 bytes, vs 2
instructions and 8+16=24 bytes (code + constant storage) for ours/gcc. I
tried a simpler version of this that would be more compact - synthesize
a 32-bit integer constant with mov+movk, and move it to the vector
register via dup.4s - but this was a little slower on M2, so for now we
prefer the slightly larger version as it's not a regression vs the
current implementation.

On the vector approximation benchmark we get:

- Before this PR (flag=false): ~7.85 ns/op
- After this PR (flag=true): ~7.74 ns/op
- After this PR, with 0.125 instead of 0.123 in the benchmark code (to
use fmov): ~7.52 ns/op
- Not part of this PR, but the mov/dup strategy described above: ~8.00
ns/op
  • Loading branch information
zeux authored Mar 13, 2024
1 parent 209fd50 commit 9aa82c6
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 15 deletions.
5 changes: 3 additions & 2 deletions CodeGen/include/Luau/AssemblyBuilderA64.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,12 @@ class AssemblyBuilderA64
// Address of code (label)
void adr(RegisterA64 dst, Label& label);

// Floating-point scalar moves
// Floating-point scalar/vector moves
// Note: constant must be compatible with immediate floating point moves (see isFmovSupported)
void fmov(RegisterA64 dst, RegisterA64 src);
void fmov(RegisterA64 dst, double src);

// Floating-point scalar math
// Floating-point scalar/vector math
void fabs(RegisterA64 dst, RegisterA64 src);
void fadd(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
void fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
Expand All @@ -139,6 +139,7 @@ class AssemblyBuilderA64
void fsqrt(RegisterA64 dst, RegisterA64 src);
void fsub(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);

// Vector component manipulation
void ins_4s(RegisterA64 dst, RegisterA64 src, uint8_t index);
void ins_4s(RegisterA64 dst, uint8_t dstIndex, RegisterA64 src, uint8_t srcIndex);
void dup_4s(RegisterA64 dst, RegisterA64 src, uint8_t index);
Expand Down
20 changes: 15 additions & 5 deletions CodeGen/src/AssemblyBuilderA64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -557,16 +557,26 @@ void AssemblyBuilderA64::fmov(RegisterA64 dst, RegisterA64 src)

void AssemblyBuilderA64::fmov(RegisterA64 dst, double src)
{
CODEGEN_ASSERT(dst.kind == KindA64::d);
CODEGEN_ASSERT(dst.kind == KindA64::d || dst.kind == KindA64::q);

int imm = getFmovImm(src);
CODEGEN_ASSERT(imm >= 0 && imm <= 256);

// fmov can't encode 0, but movi can; movi is otherwise not useful for 64-bit fp immediates because it encodes repeating patterns
if (imm == 256)
placeFMOV("movi", dst, src, 0b001'0111100000'000'1110'01'00000);
// fmov can't encode 0, but movi can; movi is otherwise not useful for fp immediates because it encodes repeating patterns
if (dst.kind == KindA64::d)
{
if (imm == 256)
placeFMOV("movi", dst, src, 0b001'0111100000'000'1110'01'00000);
else
placeFMOV("fmov", dst, src, 0b000'11110'01'1'00000000'100'00000 | (imm << 8));
}
else
placeFMOV("fmov", dst, src, 0b000'11110'01'1'00000000'100'00000 | (imm << 8));
{
if (imm == 256)
placeFMOV("movi.4s", dst, src, 0b010'0111100000'000'0000'01'00000);
else
placeFMOV("fmov.4s", dst, src, 0b010'0111100000'000'1111'0'1'00000 | ((imm >> 5) << 11) | (imm & 31));
}
}

void AssemblyBuilderA64::fabs(RegisterA64 dst, RegisterA64 src)
Expand Down
40 changes: 32 additions & 8 deletions CodeGen/src/IrLoweringA64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "lgc.h"

LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
LUAU_FASTFLAGVARIABLE(LuauCodeGenOptVecA64, false)

LUAU_FASTFLAG(LuauCodegenVectorTag2)

Expand Down Expand Up @@ -1176,17 +1177,40 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
{
inst.regA64 = regs.allocReg(KindA64::q, index);

RegisterA64 tempd = tempDouble(inst.a);
RegisterA64 temps = castReg(KindA64::s, tempd);
RegisterA64 tempw = regs.allocTemp(KindA64::w);
if (FFlag::LuauCodeGenOptVecA64 && FFlag::LuauCodegenVectorTag2 && inst.a.kind == IrOpKind::Constant)
{
float value = float(doubleOp(inst.a));
uint32_t asU32;
static_assert(sizeof(asU32) == sizeof(value), "Expecting float to be 32-bit");
memcpy(&asU32, &value, sizeof(value));

build.fcvt(temps, tempd);
build.dup_4s(inst.regA64, castReg(KindA64::q, temps), 0);
if (AssemblyBuilderA64::isFmovSupported(value))
{
build.fmov(inst.regA64, value);
}
else
{
RegisterA64 temp = regs.allocTemp(KindA64::x);

if (!FFlag::LuauCodegenVectorTag2)
uint32_t vec[4] = { asU32, asU32, asU32, 0 };
build.adr(temp, vec, sizeof(vec));
build.ldr(inst.regA64, temp);
}
}
else
{
build.mov(tempw, LUA_TVECTOR);
build.ins_4s(inst.regA64, tempw, 3);
RegisterA64 tempd = tempDouble(inst.a);
RegisterA64 temps = castReg(KindA64::s, tempd);
RegisterA64 tempw = regs.allocTemp(KindA64::w);

build.fcvt(temps, tempd);
build.dup_4s(inst.regA64, castReg(KindA64::q, temps), 0);

if (!FFlag::LuauCodegenVectorTag2)
{
build.mov(tempw, LUA_TVECTOR);
build.ins_4s(inst.regA64, tempw, 3);
}
}
break;
}
Expand Down
6 changes: 6 additions & 0 deletions tests/AssemblyBuilderA64.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,12 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPImm")
SINGLE_COMPARE(fmov(d0, 0), 0x2F00E400);
SINGLE_COMPARE(fmov(d0, 0.125), 0x1E681000);
SINGLE_COMPARE(fmov(d0, -0.125), 0x1E781000);
SINGLE_COMPARE(fmov(d0, 1.9375), 0x1E6FF000);

SINGLE_COMPARE(fmov(q0, 0), 0x4F000400);
SINGLE_COMPARE(fmov(q0, 0.125), 0x4F02F400);
SINGLE_COMPARE(fmov(q0, -0.125), 0x4F06F400);
SINGLE_COMPARE(fmov(q0, 1.9375), 0x4F03F7E0);

CHECK(!AssemblyBuilderA64::isFmovSupported(-0.0));
CHECK(!AssemblyBuilderA64::isFmovSupported(0.12389));
Expand Down
6 changes: 6 additions & 0 deletions tests/conformance/vector.lua
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ assert(8 * vector(8, 16, 24) == vector(64, 128, 192));
assert(vector(1, 2, 4) * '8' == vector(8, 16, 32));
assert('8' * vector(8, 16, 24) == vector(64, 128, 192));

assert(vector(1, 2, 4) * -0.125 == vector(-0.125, -0.25, -0.5))
assert(-0.125 * vector(1, 2, 4) == vector(-0.125, -0.25, -0.5))

assert(vector(1, 2, 4) * 100 == vector(100, 200, 400))
assert(100 * vector(1, 2, 4) == vector(100, 200, 400))

if vector_size == 4 then
assert(vector(1, 2, 4, 8) / vector(8, 16, 24, 32) == vector(1/8, 2/16, 4/24, 8/32));
assert(8 / vector(8, 16, 24, 32) == vector(1, 1/2, 1/3, 1/4));
Expand Down

0 comments on commit 9aa82c6

Please sign in to comment.