diff --git a/lldb/test/API/linux/aarch64/sme_only_registers/Makefile b/lldb/test/API/linux/aarch64/sme_only_registers/Makefile
new file mode 100644
index 0000000000000..f3b210b80b7e9
--- /dev/null
+++ b/lldb/test/API/linux/aarch64/sme_only_registers/Makefile
@@ -0,0 +1,6 @@
+C_SOURCES := main.c
+# The memset provided by glibc may use instructions we cannot use in
+# streaming mode.
+CFLAGS_EXTRAS := -march=armv8-a+sve+sme+sme2 -fno-builtin-memset
+
+include Makefile.rules
diff --git a/lldb/test/API/linux/aarch64/sme_only_registers/TestSMEOnlyRegisters.py b/lldb/test/API/linux/aarch64/sme_only_registers/TestSMEOnlyRegisters.py
new file mode 100644
index 0000000000000..03d564d53dfaf
--- /dev/null
+++ b/lldb/test/API/linux/aarch64/sme_only_registers/TestSMEOnlyRegisters.py
@@ -0,0 +1,553 @@
+"""
+Check handling of registers on an AArch64 Linux system that only has SME. As
+opposed to SVE and SME. Check register access and restoration after expression
+evaluation.
+"""
+
+from enum import Enum
+from pprint import pprint
+from functools import lru_cache
+from itertools import permutations
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+from itertools import cycle
+
+
+class Mode(Enum):
+    SIMD = 0
+    SSVE = 2
+
+    def __str__(self):
+        return "streaming" if self == Mode.SSVE else "simd"
+
+
+class ZA(Enum):
+    ON = 1
+    OFF = 2
+
+    def __str__(self):
+        return "on" if self == ZA.ON else "off"
+
+
+class ByteVector(object):
+    def __init__(self, data):
+        self.data = data
+
+    def __repr__(self):
+        return "{" + " ".join([f"0x{b:02x}" for b in self.data]) + "}"
+
+    def as_bytes(self):
+        return self.data
+
+
+class HexValue(object):
+    # Assume all values are 64-bit, but some display as 32-bit, like fpcr.
+    def __init__(self, value, repr_size=8):
+        self.value = value
+        # In bytes
+        self.repr_size = repr_size
+
+    def __repr__(self):
+        return f"0x{self.value:0{self.repr_size*2}x}"
+
+    def as_bytes(self):
+        data = []
+        v = self.value
+        # Little endian order.
+        for i in range(8):
+            data.append(v & 0xFF)
+            v >>= 8
+
+        return data
+
+
+class SVESIMDRegistersTestCase(TestBase):
+    def reg_names(self, prefix, count):
+        return [f"{prefix}{n}" for n in range(count)]
+
+    def expected_registers(self, svl_b, mode, za):
+        register_values = []
+
+        if mode == Mode.SIMD:
+            # In SIMD mode we have real V registers and no Z registers.
+
+            # V regs are {N <7 0s> N <7 0s>} because we set the bottom element to N
+            # where N is 1 + the register index.
+            v_values = [
+                ByteVector([n + 1] + [0] * 7 + [n + 1] + [0] * 7) for n in range(32)
+            ]
+
+            # Z regs are {N <7 0s> N <7 0s> <svl_b - 16 more 0s>}. First half overlaps
+            # a V register, the second half we fake 0s for as there is no real Z
+            # register in non-streaming mode.
+            z_values = [
+                ByteVector([n + 1] + [0] * 7 + [n + 1] + [0] * 7 + [0] * (svl_b - 16))
+                for n in range(32)
+            ]
+
+            # P regs are all 0s (svl_b / 8 bytes each), we fake the value.
+            p_values = [ByteVector([0] * (svl_b // 8)) for _ in range(16)]
+        else:
+            # In streaming mode, Z registers are real and V are the bottom 128
+            # bits of the Z registers.
+
+            # Streaming SVE registers have their elements set to their number plus 1.
+            # So z0 has elements of 0x01, z1 is 0x02 and so on.
+            v_values = [ByteVector([n + 1] * 16) for n in range(32)]
+
+            z_values = [ByteVector([n + 1] * svl_b) for n in range(32)]
+
+            # P registers have all elements set to the same value and that value
+            # cycles between 0xff, 0x55, 0x11, 0x01 and 0x00.
+            p_values = []
+            for _, v in zip(range(16), cycle([0xFF, 0x55, 0x11, 0x01, 0x00])):
+                p_values.append(ByteVector([v] * (svl_b // 8)))
+
+        # Would use strict=True here but it requires Python 3.10.
+        register_values += list(zip(self.reg_names("v", 32), v_values))
+        register_values += [
+            ("fpsr", HexValue(0x50000015, repr_size=4)),
+            ("fpcr", HexValue(0x05551505, repr_size=4)),
+        ]
+        register_values += list(zip(self.reg_names("z", 32), z_values))
+        register_values += list(zip(self.reg_names("p", 16), p_values))
+
+        # ffr is all 0s. In SIMD mode we're faking the value, in streaming mode,
+        # use of ffr is illegal so the kernel tells us it's 0s.
+        register_values += [("ffr", ByteVector([0] * (svl_b // 8)))]
+
+        svcr_value = 1 if mode == Mode.SSVE else 0
+        if za == ZA.ON:
+            svcr_value += 2
+
+        register_values += [
+            ("svcr", HexValue(svcr_value)),
+            # SVG is the streaming vector length in granules.
+            ("svg", HexValue(svl_b // 8)),
+        ]
+
+        # ZA and ZT0 may be enabled or disabled regardless of streaming mode.
+        # ZA is a square of vector length * vector length.
+        # ZT0 is 512 bits regardless of streaming vector length.
+        if za == ZA.ON:
+            register_values += [("za", ByteVector(list(range(1, svl_b + 1)) * svl_b))]
+            if self.isAArch64SME2():
+                register_values += [("zt0", ByteVector(list(range(1, (512 // 8) + 1))))]
+        else:
+            # ZA is fake.
+            register_values += [("za", ByteVector([0x0] * (svl_b * svl_b)))]
+            # ZT0 is also fake.
+            if self.isAArch64SME2():
+                register_values += [("zt0", ByteVector([0x00] * (512 // 8)))]
+
+        return dict(register_values)
+
+    def check_expected_regs_fn(self, expected_registers):
+        def check_expected_regs():
+            self.expect(
+                f'register read {" ".join(expected_registers.keys())}',
+                substrs=[f"{n} = {v}" for n, v in expected_registers.items()],
+            )
+
+        return check_expected_regs
+
+    def skip_if_not_sme_only(self):
+        if self.isAArch64SVE():
+            self.skipTest("SVE must not be present outside of streaming mode.")
+
+        if not self.isAArch64SME():
+            self.skipTest("SSVE registers must be supported.")
+
+    def setup_test(self, mode, za, svl):
+        self.build()
+        self.line = line_number("main.c", "// Set a break point here.")
+
+        exe = self.getBuildArtifact("a.out")
+        self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET)
+
+        self.runCmd(f"settings set target.run-args {mode} {za} {svl}")
+
+        lldbutil.run_break_set_by_file_and_line(
+            self, "main.c", self.line, num_expected_locations=1
+        )
+        self.runCmd("run", RUN_SUCCEEDED)
+
+        self.expect(
+            "thread backtrace",
+            STOPPED_DUE_TO_BREAKPOINT,
+            substrs=["stop reason = breakpoint 1."],
+        )
+
+    def write_expected_reg_data(self, reg_data):
+        # Write expected register values into program memory so it can be
+        # verified in-process.
+        # This must be done via memory writes instead of expressions because
+        # the latter may try to save/restore registers, which is part of what
+        # this file tests so we can't rely on it here.
+        # We will always write Z and ZA/ZT0, it's up to the program whether it
+        # checks them.
+
+        for reg, value in reg_data.items():
+            sym_name = None
+            # Since we cannot expression evaluate, we have to manually offset
+            # arrays.
+            offset = 0
+
+            # Offsets for scalable registers assume that the expected register
+            # data length matches the current svl.
+            if reg == "fpcr":
+                sym_name = "expected_fpcr"
+            elif reg == "fpsr":
+                sym_name = "expected_fpsr"
+            elif reg == "ffr":
+                sym_name = "expected_sve_ffr"
+            elif reg == "za":
+                sym_name = "expected_za"
+            elif reg == "zt0":
+                sym_name = "expected_zt0"
+            elif reg == "svcr":
+                sym_name = "expected_svcr"
+            elif reg == "svg":
+                sym_name = "expected_svg"
+            elif reg.startswith("v"):
+                num = int(reg.split("v")[1])
+                offset = 16 * num
+                sym_name = "expected_v_regs"
+            elif reg.startswith("z"):
+                num = int(reg.split("z")[1])
+                offset = len(value.as_bytes()) * num
+                sym_name = "expected_sve_z"
+            elif reg.startswith("p"):
+                num = int(reg.split("p")[1])
+                offset = len(value.as_bytes()) * num
+                sym_name = "expected_sve_p"
+
+            if sym_name is None:
+                raise RuntimeError(
+                    f"Do not know how to write expected value for register {reg}."
+                )
+
+            address = self.lookup_address(sym_name) + offset
+            process = self.dbg.GetSelectedTarget().GetProcess()
+            err = lldb.SBError()
+            wrote = process.WriteMemory(address, bytearray(value.as_bytes()), err)
+            self.assertTrue(err.Success())
+            self.assertEqual(len(value.as_bytes()), wrote)
+
+    # This is safe to cache because each test will be its own instance of this
+    # class.
+    @lru_cache
+    def lookup_address(self, sym_name):
+        target = self.dbg.GetSelectedTarget()
+        sym = target.module[0].FindSymbol(sym_name)
+        self.assertTrue(sym.IsValid())
+        address = sym.GetStartAddress().GetLoadAddress(target)
+
+        # Dereference the pointer.
+        err = lldb.SBError()
+        ptr = target.GetProcess().ReadPointerFromMemory(address, err)
+        self.assertTrue(err.Success())
+
+        return ptr
+
+    def get_svls(self):
+        # This proc file contains the default streaming vector length (in bytes).
+        # It defaults to 32, or the largest vector length. Whichever is smaller.
+        err, retcode, output = self.run_platform_command(
+            "cat /proc/sys/abi/sme_default_vector_length"
+        )
+        if err.Fail() or retcode != 0:
+            self.skipTest(f"Failed to read sme_default_vector_length: {output}")
+
+        # Content should be a single decimal number.
+        try:
+            default_svl = int(output)
+        except ValueError:
+            # File contained unexpected data.
+            self.skipTest(
+                f"sme_default_vector_length contained unexpected data: {output}"
+            )
+
+        # A valid svl is a multiple of 128 bits and a power of 2. We need to find
+        # 2 of them that will work. We could try to set this file to a higher value,
+        # but likely we need root permissions on some machines.
+        # So if the default is 128 bit, assume that's the max, in which case
+        # there is no second svl we can use.
+        if default_svl <= 16:
+            self.skipTest(
+                f"Did not find 2 supported streaming vector lengths, default is {default_svl}"
+            )
+
+        # The default is something greater than 16. Use it and the next lowest.
+        return (default_svl, default_svl // 2)
+
+    @no_debug_info_test
+    @skipIf(archs=no_match(["aarch64"]))
+    @skipIf(oslist=no_match(["linux"]))
+    def test_simd_registers_ssve(self):
+        self.skip_if_not_sme_only()
+
+        svl_b = self.get_svls()[0]
+        self.setup_test(Mode.SSVE, ZA.ON, svl_b)
+
+        expected_registers = self.expected_registers(svl_b, Mode.SSVE, ZA.ON)
+        check_expected_regs = self.check_expected_regs_fn(expected_registers)
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+        check_expected_regs()
+
+        # Write via Z0
+        z_value = ByteVector([0x12] * svl_b)
+        self.runCmd(f'register write z0 "{z_value}"')
+
+        # z0 and v0 should change but nothing else.
+        expected_registers["z0"] = z_value
+        expected_registers["v0"] = ByteVector([0x12] * 16)
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+        check_expected_regs()
+
+        # We can do the same via a V register, the value will be extended and sent as
+        # a Z write.
+        v_value = ByteVector([0x34] * 16)
+        self.runCmd(f'register write v1 "{v_value}"')
+
+        # The lower 16 bytes of z1 are the v value, the rest is the 0x2 that was previously in there.
+        expected_registers["z1"] = ByteVector([0x34] * 16 + [0x02] * (svl_b - 16))
+        expected_registers["v1"] = v_value
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+        check_expected_regs()
+
+        # FPSR is written here as a normal register write, so the value must
+        # survive a step and match what the program reads back in-process.
+        # This arbitrary value is 0xaa...aa reduced to the bits that we can
+        # actually set in the real register (same value as the SIMD test).
+        fpsr = HexValue(0xA800008A, repr_size=4)
+
+        self.runCmd(f"register write fpsr {fpsr}")
+        expected_registers["fpsr"] = fpsr
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+        check_expected_regs()
+
+        # Again 0xaa...aa with bits we cannot set removed, so it differs from the initial value.
+        fpcr = HexValue(0x02AAAA02, repr_size=4)
+        self.runCmd(f"register write fpcr {fpcr}")
+        expected_registers["fpcr"] = fpcr
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+        check_expected_regs()
+
+        p_value = ByteVector([0x65] * (svl_b // 8))
+        self.expect(f'register write p0 "{p_value}"')
+        expected_registers["p0"] = p_value
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+        check_expected_regs()
+
+        # We cannot interact with ffr in streaming mode while in process. So this
+        # will be verified by ptrace only.
+        ffr_value = ByteVector([0x78] * (svl_b // 8))
+        self.expect(f'register write ffr "{ffr_value}"')
+        expected_registers["ffr"] = ffr_value
+
+        # It will appear as if we wrote ffr, but in streaming mode without
+        # SME_FA64+SVE, it essentially does not exist.
+        check_expected_regs()
+
+        # At least make sure we didn't disturb anything else.
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+
+        # The kernel will always return 0s for ffr.
+        expected_registers["ffr"] = ByteVector([0x0] * (svl_b // 8))
+        check_expected_regs()
+
+        za_value = ByteVector(list(range(2, svl_b + 2)) * svl_b)
+        self.expect(f'register write za "{za_value}"')
+        expected_registers["za"] = za_value
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+        check_expected_regs()
+
+        # ZT0 is 512 bit regardless of vector length.
+        if self.isAArch64SME2():
+            zt0_value = ByteVector(list(range(2, (512 // 8) + 2)))
+            self.expect(f'register write zt0 "{zt0_value}"')
+            expected_registers["zt0"] = zt0_value
+
+            self.write_expected_reg_data(expected_registers)
+            self.expect("next", substrs=["stop reason = step over"])
+            check_expected_regs()
+
+    @no_debug_info_test
+    @skipIf(archs=no_match(["aarch64"]))
+    @skipIf(oslist=no_match(["linux"]))
+    def test_simd_registers_simd(self):
+        self.skip_if_not_sme_only()
+
+        svl_b = self.get_svls()[0]
+        self.setup_test(Mode.SIMD, ZA.OFF, svl_b)
+
+        # Check for the values the program should have set.
+        expected_registers = self.expected_registers(svl_b, Mode.SIMD, ZA.OFF)
+        check_expected_regs = self.check_expected_regs_fn(expected_registers)
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+
+        check_expected_regs()
+
+        # In SIMD mode if you write Z0, only the parts that overlap V0 will
+        # change.
+        z_value = ByteVector([0x12] * svl_b)
+        self.runCmd(f'register write z0 "{z_value}"')
+
+        # z0 and v0 should change but nothing else. We check the rest because
+        # we are faking Z register data in this mode, and any offset mistake
+        # could lead to modifying other registers.
+        expected_registers["z0"] = ByteVector([0x12] * 16 + [0x00] * (svl_b - 16))
+        expected_registers["v0"] = ByteVector([0x12] * 16)
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+
+        check_expected_regs()
+
+        # We can do the same via a V register, the value will be extended and sent as
+        # a Z write.
+        v_value = ByteVector([0x34] * 16)
+        self.runCmd(f'register write v1 "{v_value}"')
+
+        expected_registers["z1"] = ByteVector([0x34] * 16 + [0x00] * (svl_b - 16))
+        expected_registers["v1"] = v_value
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+
+        check_expected_regs()
+
+        # FPSR and FPCR are still described as real registers, so they are
+        # sent as normal writes.
+        # This is the value 0xaaaaaaaa but only the bits that we can actually
+        # set in reality.
+        fpcontrol = 0xA800008A
+
+        # First FPSR on its own.
+        self.runCmd(f"register write fpsr 0x{fpcontrol:08x}")
+        expected_registers["fpsr"] = HexValue(fpcontrol, repr_size=4)
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+
+        check_expected_regs()
+
+        # Then FPCR. This value is 0xaaaaaaaa reduced to the bits we can actually
+        # set.
+        fpcontrol = 0x02AAAA02
+        self.runCmd(f"register write fpcr 0x{fpcontrol:08x}")
+        expected_registers["fpcr"] = HexValue(fpcontrol, repr_size=4)
+
+        self.write_expected_reg_data(expected_registers)
+        self.expect("next", substrs=["stop reason = step over"])
+
+        check_expected_regs()
+
+        # We are faking SVE registers while outside of streaming mode, and
+        # predicate registers and ffr have no real register to overlay.
+        # We chose to make this an error instead of eating the write silently.
+
+        value = ByteVector([0x98] * (svl_b // 8))
+        self.expect(f'register write p0 "{value}"', error=True)
+        check_expected_regs()
+        self.expect(f'register write ffr "{value}"', error=True)
+        check_expected_regs()
+
+        # In theory we could test writing to ZA and ZT0, however this would
+        # enable streaming mode. In streaming mode, their handling is the same
+        # as on an SVE+SME system, and so is covered in other tests.
+
+    @no_debug_info_test
+    @skipIf(archs=no_match(["aarch64"]))
+    @skipIf(oslist=no_match(["linux"]))
+    def test_expr_restore(self):
+        """
+        Check that we can restore an initial state after expression evaluation
+        leaves us in a different state.
+        """
+        self.skip_if_not_sme_only()
+
+        # Each test goes from a start state to an expression state, and
+        # back to the start state. Each state contains:
+        # * Streaming mode on or off
+        # * ZA on or off
+        # * Streaming vector length
+        #
+        # Rather than trying to be clever choosing which transitions to test,
+        # test every ordered pair of states, including start == end.
+        #
+        # In theory a CPU can support many vector lengths, but the key difficulty
+        # with vector length is resizing buffers in LLDB. So we will test 1 "large"
+        # length (the default length) and one "small" length (the next smallest
+        # than the default). This will cover increasing and decreasing register
+        # size.
+        #
+        # Note that vector length applies to Z and to ZA (ZT0 is always 512 bits).
+        # Even if streaming mode is not enabled, ZA can change size.
+        #
+        # These tests take a very long time and in theory we could do them not
+        # by re-running the program but by changing the state via register
+        # writes. However, some states cannot be accessed by writes done by LLDB
+        # so to keep things simple we handle all states the same way.
+        #
+        # Doing it this way also gives us extra register reading coverage in all
+        # the possible states.
+        states = []
+        svls = self.get_svls()
+        for m in Mode:
+            for za in ZA:
+                for vl in svls:
+                    states.append((m, za, vl))
+
+        # Start with not changing state.
+        expr_tests = [(state, state) for state in states]
+        # Then all combinations of different states.
+        expr_tests.extend(permutations(states, 2))
+
+        if self.TraceOn():
+            print("Expression tests:")
+            pprint(expr_tests)
+
+        for (sm, sz, svl), (em, ez, evl) in expr_tests:
+            if self.TraceOn():
+                print(
+                    f"Testing restore to [mode:{sm} za:{sz} svl:{svl}] from expression state [mode:{em} za:{ez} svl:{evl}]"
+                )
+
+            self.setup_test(sm, sz, svl)
+
+            expected_registers = self.expected_registers(svl, sm, sz)
+            check_expected_regs = self.check_expected_regs_fn(expected_registers)
+
+            # The program sets up the initial state by running code in process.
+            # In theory we could skip this, but it does give us coverage of
+            # reading registers in all modes.
+            check_expected_regs()
+            # This expression will change modes and set different values.
+            self.expect(
+                f"expression expr_function({str(em == Mode.SSVE).lower()}, {str(ez == ZA.ON).lower()}, {evl})"
+            )
+            # LLDB should restore the process to the previous values and modes.
+            check_expected_regs()
+
+            self.runCmd("process kill")
diff --git a/lldb/test/API/linux/aarch64/sme_only_registers/main.c b/lldb/test/API/linux/aarch64/sme_only_registers/main.c
new file mode 100644
index 0000000000000..e6df75a69dd20
--- /dev/null
+++ b/lldb/test/API/linux/aarch64/sme_only_registers/main.c
@@ -0,0 +1,495 @@
+#include <asm/hwcap.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+
+#ifndef PR_SME_SET_VL
+#define PR_SME_SET_VL 63
+#endif
+
+#ifndef PR_SME_GET_VL
+#define PR_SME_GET_VL 64
+#endif
+
+#define SM_INST(c) asm volatile("msr s0_3_c4_c" #c "_3, xzr")
+#define SMSTART SM_INST(7)
+#define SMSTART_SM SM_INST(3)
+#define SMSTART_ZA SM_INST(5)
+#define SMSTOP SM_INST(6)
+#define SMSTOP_SM SM_INST(2)
+#define SMSTOP_ZA SM_INST(4)
+
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+
+// Doing a syscall exits streaming mode, but we need to use these during an
+// expression in streaming mode. So it's set by main and we reference these
+// later.
+int svl_b = 0;
+bool has_sme2 = 0;
+
+#define VREG_NUM 32
+#define VREG_SIZE 16
+
+// Some of these could be statically allocated, but malloc-ing them all makes
+// it simpler to write the Python side of this test.
+uint8_t *expected_v_regs = NULL;
+// These are treated as 32-bit but msr/mrs uses 64-bit values.
+uint64_t *expected_fpcr = NULL;
+uint64_t *expected_fpsr = NULL;
+uint8_t *expected_sve_z = NULL;
+uint8_t *expected_sve_p = NULL;
+uint8_t *expected_sve_ffr = NULL;
+uint8_t *expected_za = NULL;
+uint8_t *expected_zt0 = NULL;
+uint64_t *expected_svcr = NULL;
+uint64_t *expected_svg = NULL;
+
+static void *checked_malloc(size_t size) {
+  void *ptr = malloc(size);
+  if (ptr == NULL)
+    exit(1);
+
+  return ptr;
+}
+
+static int gpr_only_memcmp(uint8_t *lhs, uint8_t *rhs, size_t len) {
+  // Hand written memcmp so we don't have to use
+  // the compiler or library version, which would use SIMD registers
+  // and corrupt registers before we can read them.
+  int ret = 0;
+  for (; len; ++lhs, ++rhs, --len) {
+    asm volatile("cmp %w1, %w2\n\t"
+                 "cset %w0, ne \n\t"
+                 : "=r"(ret)
+                 : "r"(*lhs), "r"(*rhs)
+                 : "cc");
+    if (ret)
+      return ret;
+  }
+  return ret;
+}
+
+// I would return bool here and check that from inside LLDB, but we cannot
+// assume that expression evaluation works. So instead we exit, which is
+// harder to track down but doesn't need expression evaluation to check for.
+void check_register_values(bool streaming, bool za) {
+  // In streaming mode, SIMD instructions are illegal. We will read the SIMD
+  // portion of the streaming SVE registers later.
+  if (!streaming) {
+    uint64_t v_got[2];
+#define VERIFY_V(NUM)                                                          \
+  do {                                                                         \
+    asm volatile("MOV %0, v" #NUM ".d[0]\n\t"                                  \
+                 "MOV %1, v" #NUM ".d[1]\n\t"                                  \
+                 : "=r"(v_got[0]), "=r"(v_got[1]));                            \
+    if (gpr_only_memcmp((void *)(expected_v_regs + (NUM * VREG_SIZE)),         \
+                        (void *)&v_got[0], VREG_SIZE) != 0)                    \
+      exit(1);                                                                 \
+  } while (0)
+
+    VERIFY_V(0);
+    VERIFY_V(1);
+    VERIFY_V(2);
+    VERIFY_V(3);
+    VERIFY_V(4);
+    VERIFY_V(5);
+    VERIFY_V(6);
+    VERIFY_V(7);
+    VERIFY_V(8);
+    VERIFY_V(9);
+    VERIFY_V(10);
+    VERIFY_V(11);
+    VERIFY_V(12);
+    VERIFY_V(13);
+    VERIFY_V(14);
+    VERIFY_V(15);
+    VERIFY_V(16);
+    VERIFY_V(17);
+    VERIFY_V(18);
+    VERIFY_V(19);
+    VERIFY_V(20);
+    VERIFY_V(21);
+    VERIFY_V(22);
+    VERIFY_V(23);
+    VERIFY_V(24);
+    VERIFY_V(25);
+    VERIFY_V(26);
+    VERIFY_V(27);
+    VERIFY_V(28);
+    VERIFY_V(29);
+    VERIFY_V(30);
+    VERIFY_V(31);
+
+#undef VERIFY_V
+  }
+
+  uint64_t val = 0;
+  asm volatile("mrs %0, fpcr" : "=r"(val));
+  if (val != *expected_fpcr)
+    exit(1);
+
+  asm volatile("mrs %0, fpsr" : "=r"(val));
+  if (val != *expected_fpsr)
+    exit(1);
+
+  // Can't read SVE registers outside of streaming mode.
+  if (streaming) {
+    // Not 0 init because 0 is a valid register value here.
+    uint64_t got_svcr = 0xFFFFFFFFFFFFFFFFull;
+    asm volatile("mrs %0, svcr" : "=r"(got_svcr));
+    if (got_svcr != *expected_svcr)
+      exit(1);
+
+    // svg's unit is 8 byte granules.
+    if (*expected_svg != svl_b / 8)
+      exit(1);
+
+    // We do not check FFR because we have no way to read or write it while in
+    // streaming mode. Both wrffr and store value of ffr require SME_FA64, which
+    // requires that you have SVE, which we don't have.
+
+    size_t preg_size = svl_b / 8;
+    uint8_t *got_sve_p = checked_malloc(preg_size);
+#define VERIFY_P(NUM)                                                          \
+  do {                                                                         \
+    asm volatile("str p" #NUM ", [%0]" ::"r"(&got_sve_p[0]) : "memory");       \
+    if (gpr_only_memcmp((void *)(expected_sve_p + (NUM * preg_size)),          \
+                        (void *)got_sve_p, preg_size) != 0)                    \
+      exit(1);                                                                 \
+  } while (0)
+
+    VERIFY_P(0);
+    VERIFY_P(1);
+    VERIFY_P(2);
+    VERIFY_P(3);
+    VERIFY_P(4);
+    VERIFY_P(5);
+    VERIFY_P(6);
+    VERIFY_P(7);
+    VERIFY_P(8);
+    VERIFY_P(9);
+    VERIFY_P(10);
+    VERIFY_P(11);
+    VERIFY_P(12);
+    VERIFY_P(13);
+    VERIFY_P(14);
+    VERIFY_P(15);
+
+#undef VERIFY_P
+
+    uint8_t *got_sve_z = checked_malloc(svl_b);
+    // Note that we are not using a p0 clobber below. We will manually restore
+    // it once we have checked all the Z registers.
+    // If we relied on the clobber, the compiler would only save and restore if
+    // it was already using p0, which it usually is not.
+#define VERIFY_Z(NUM)                                                          \
+  do {                                                                         \
+    asm volatile("ptrue p0.d\n\t"                                              \
+                 "st1d z" #NUM ".d, p0, [%0]\n\t" ::"r"(got_sve_z)             \
+                 : "memory");                                                  \
+    if (gpr_only_memcmp((void *)(expected_sve_z + (NUM * svl_b)),              \
+                        (void *)got_sve_z, svl_b) != 0)                        \
+      exit(1);                                                                 \
+  } while (0)
+
+    VERIFY_Z(0);
+
+    // Put back p0 value. LLDB will expect to see this.
+    asm volatile("ldr p0, [%0]" ::"r"(expected_sve_p));
+
+#undef VERIFY_Z
+  }
+
+  if (za) {
+    uint8_t *got_za = checked_malloc(svl_b * svl_b);
+
+    // Store one row of ZA at a time.
+    uint8_t *za_row = got_za;
+    for (int i = 0; i < svl_b; ++i, za_row += svl_b) {
+      asm volatile("mov w12, %w0\n\t"
+                   "str za[w12, 0], [%1]\n\t" ::"r"(i),
+                   "r"(za_row)
+                   : "w12", "memory");
+    }
+
+    if (gpr_only_memcmp(expected_za, got_za, svl_b * svl_b) != 0)
+      exit(1);
+
+    if (has_sme2) {
+      uint8_t *got_zt0 = checked_malloc(512 / 8); // ZT0 is always 512 bits.
+      asm volatile("str zt0, [%0]" ::"r"(got_zt0) : "memory");
+
+      if (gpr_only_memcmp((void *)expected_zt0, (void *)got_zt0, 512 / 8) !=
+          0)
+        exit(1);
+    }
+  }
+}
+
+static void write_fp_control() {
+  // Some of these bits won't get set, this is fine. Just needs to be
+  // recognisable from inside the debugger.
+  uint64_t val = 0x5555555555555555ULL;
+  asm volatile("msr fpcr, %0" ::"r"(val));
+  asm volatile("msr fpsr, %0" ::"r"(val));
+}
+
+static void write_sve_regs() {
+  // We do not explicitly set ffr here because doing so requires the smefa64
+  // extension (both for load to ffr, and the specific wrffr instruction).
+  // To have that extension you have to have SVE outside of streaming
+  // mode which we do not have.
+
+  asm volatile("ptrue p0.b\n\t");
+  asm volatile("ptrue p1.h\n\t");
+  asm volatile("ptrue p2.s\n\t");
+  asm volatile("ptrue p3.d\n\t");
+  asm volatile("pfalse p4.b\n\t");
+  asm volatile("ptrue p5.b\n\t");
+  asm volatile("ptrue p6.h\n\t");
+  asm volatile("ptrue p7.s\n\t");
+  asm volatile("ptrue p8.d\n\t");
+  asm volatile("pfalse p9.b\n\t");
+  asm volatile("ptrue p10.b\n\t");
+  asm volatile("ptrue p11.h\n\t");
+  asm volatile("ptrue p12.s\n\t");
+  asm volatile("ptrue p13.d\n\t");
+  asm volatile("pfalse p14.b\n\t");
+  asm volatile("ptrue p15.b\n\t");
+
+  asm volatile("cpy z0.b, p0/z, #1\n\t");
+  asm volatile("cpy z1.b, p5/z, #2\n\t");
+  asm volatile("cpy z2.b, p10/z, #3\n\t");
+  asm volatile("cpy z3.b, p15/z, #4\n\t");
+  asm volatile("cpy z4.b, p0/z, #5\n\t");
+  asm volatile("cpy z5.b, p5/z, #6\n\t");
+  asm volatile("cpy z6.b, p10/z, #7\n\t");
+  asm volatile("cpy z7.b, p15/z, #8\n\t");
+  asm volatile("cpy z8.b, p0/z, #9\n\t");
+  asm volatile("cpy z9.b, p5/z, #10\n\t");
+  asm volatile("cpy z10.b, p10/z, #11\n\t");
+  asm volatile("cpy z11.b, p15/z, #12\n\t");
+  asm volatile("cpy z12.b, p0/z, #13\n\t");
+  asm volatile("cpy z13.b, p5/z, #14\n\t");
+  asm volatile("cpy z14.b, p10/z, #15\n\t");
+  asm volatile("cpy z15.b, p15/z, #16\n\t");
+  asm volatile("cpy z16.b, p0/z, #17\n\t");
+  asm volatile("cpy z17.b, p5/z, #18\n\t");
+  asm volatile("cpy z18.b, p10/z, #19\n\t");
+  asm volatile("cpy z19.b, p15/z, #20\n\t");
+  asm volatile("cpy z20.b, p0/z, #21\n\t");
+  asm volatile("cpy z21.b, p5/z, #22\n\t");
+  asm volatile("cpy z22.b, p10/z, #23\n\t");
+  asm volatile("cpy z23.b, p15/z, #24\n\t");
+  asm volatile("cpy z24.b, p0/z, #25\n\t");
+  asm volatile("cpy z25.b, p5/z, #26\n\t");
+  asm volatile("cpy z26.b, p10/z, #27\n\t");
+  asm volatile("cpy z27.b, p15/z, #28\n\t");
+  asm volatile("cpy z28.b, p0/z, #29\n\t");
+  asm volatile("cpy z29.b, p5/z, #30\n\t");
+  asm volatile("cpy z30.b, p10/z, #31\n\t");
+  asm volatile("cpy z31.b, p15/z, #32\n\t");
+
+  write_fp_control();
+}
+
+static void write_sme_regs(int svl_b) {
+  uint8_t value_offset = 1;
+
+#define MAX_VL_BYTES 256
+  uint8_t data[MAX_VL_BYTES];
+
+  // ldr za will actually wrap the selected vector row, by the number of rows
+  // you have. So setting one that didn't exist would actually set one that did.
+  // That's why we need the streaming vector length here.
+  for (int i = 0; i < svl_b; ++i) {
+    // Glibc's memset uses instructions not allowed in streaming mode, so we
+    // do this, and make sure it's not optimised into memset.
+    for (unsigned j = 0; j < MAX_VL_BYTES; ++j)
+      data[j] = j + value_offset;
+
+    // Each one of these loads a VL sized row of ZA.
+    asm volatile("mov w12, %w0\n\t"
+                 "ldr za[w12, 0], [%1]\n\t" ::"r"(i),
+                 "r"(&data)
+                 : "w12", "memory");
+  }
+#undef MAX_VL_BYTES
+
+  if (has_sme2) {
+#define ZT0_LEN (512 / 8)
+    uint8_t data[ZT0_LEN];
+    for (unsigned i = 0; i < ZT0_LEN; ++i)
+      data[i] = i + value_offset;
+
+    asm volatile("ldr zt0, [%0]" ::"r"(&data) : "memory");
+#undef ZT0_LEN
+  }
+}
+
+static void write_simd_regs() {
+  // base is added to each value. If base = 1, then v0 = 1, v1 = 2, etc.
+  unsigned base = 1;
+
+#define WRITE_SIMD(NUM)                                                        \
+  asm volatile("MOV v" #NUM ".d[0], %0\n\t"                                    \
+               "MOV v" #NUM ".d[1], %0\n\t" ::"r"((uint64_t)(base + NUM)))
+
+  WRITE_SIMD(0);
+  WRITE_SIMD(1);
+  WRITE_SIMD(2);
+  WRITE_SIMD(3);
+  WRITE_SIMD(4);
+  WRITE_SIMD(5);
+  WRITE_SIMD(6);
+  WRITE_SIMD(7);
+  WRITE_SIMD(8);
+  WRITE_SIMD(9);
+  WRITE_SIMD(10);
+  WRITE_SIMD(11);
+  WRITE_SIMD(12);
+  WRITE_SIMD(13);
+  WRITE_SIMD(14);
+  WRITE_SIMD(15);
+  WRITE_SIMD(16);
+  WRITE_SIMD(17);
+  WRITE_SIMD(18);
+  WRITE_SIMD(19);
+  WRITE_SIMD(20);
+  WRITE_SIMD(21);
+  WRITE_SIMD(22);
+  WRITE_SIMD(23);
+  WRITE_SIMD(24);
+  WRITE_SIMD(25);
+  WRITE_SIMD(26);
+  WRITE_SIMD(27);
+  WRITE_SIMD(28);
+  WRITE_SIMD(29);
+  WRITE_SIMD(30);
+  WRITE_SIMD(31);
+
+  write_fp_control();
+}
+
+void expr_function(bool streaming, bool za, unsigned svl) {
+  // Making this call exits streaming mode so it must be done first.
+  prctl(PR_SME_SET_VL, svl);
+
+  if (streaming) {
+    SMSTART_SM;
+    write_sve_regs();
+  } else {
+    SMSTOP_SM;
+    write_simd_regs();
+  }
+
+  if (za) {
+    SMSTART_ZA;
+    write_sme_regs(svl); // Use the length just set, not the initial svl_b.
+  } else {
+    SMSTOP_ZA;
+  }
+}
+
+typedef struct {
+  bool streaming;
+  bool za;
+  int svl;
+} ProcessState;
+
+ProcessState get_initial_state(const char *mode, const char *za,
+                               const char *svl) {
+  ProcessState ret;
+
+  if (strcmp("streaming", mode) == 0)
+    ret.streaming = true;
+  else if (strcmp("simd", mode) == 0)
+    ret.streaming = false;
+  else {
+    printf("Unexpected value \"%s\" for mode option.\n", mode);
+    exit(1);
+  }
+
+  if (strcmp("on", za) == 0)
+    ret.za = true;
+  else if (strcmp("off", za) == 0)
+    ret.za = false;
+  else {
+    printf("Unexpected value \"%s\" for za option.\n", za);
+    exit(1);
+  }
+
+  ret.svl = atoi(svl);
+  if (ret.svl <= 0) {
+    printf("Unexpected svl \"%s\"\n", svl);
+    exit(1);
+  }
+
+  return ret;
+}
+
+int main(int argc, char *argv[]) {
+  if (argc != 4) {
+    printf("Expected 3 arguments, process mode, za on or off, streaming vector "
+           "length\n");
+    exit(1);
+  }
+
+  ProcessState initial_state = get_initial_state(argv[1], argv[2], argv[3]);
+  svl_b = initial_state.svl;
+
+  // Making a syscall exits streaming mode, so we have to set this now.
+  prctl(PR_SME_SET_VL, svl_b);
+
+  if ((getauxval(AT_HWCAP2) & HWCAP2_SME2))
+    has_sme2 = true;
+
+  expected_v_regs = checked_malloc(VREG_NUM * VREG_SIZE);
+  expected_fpcr = checked_malloc(sizeof(uint64_t));
+  expected_fpsr = checked_malloc(sizeof(uint64_t));
+  expected_sve_z = checked_malloc(svl_b * 32);
+  expected_sve_p = checked_malloc((svl_b / 8) * 16);
+  expected_sve_ffr = checked_malloc(svl_b / 8);
+  expected_za = checked_malloc(svl_b * svl_b);
+  expected_zt0 = checked_malloc(512 / 8); // ZT0 is always 512 bits.
+  expected_svcr = checked_malloc(sizeof(uint64_t));
+  expected_svg = checked_malloc(sizeof(uint64_t));
+
+  if (initial_state.streaming) {
+    SMSTART_SM;
+    write_sve_regs();
+  } else {
+    write_simd_regs();
+  }
+
+  if (initial_state.za) {
+    SMSTART_ZA;
+    write_sme_regs(svl_b);
+  }
+
+  // The number of these is greater than or equal to the number of "next"
+  // each lldb test issues. The
+  // idea is that lldb will write register values in, update the expected
+  // values, then step over. This should cause the values written via ptrace to
+  // appear in this process and match the expected values in memory.
+  check_register_values(initial_state.streaming,
+                        initial_state.za); // Set a break point here.
+  check_register_values(initial_state.streaming, initial_state.za);
+  check_register_values(initial_state.streaming, initial_state.za);
+  check_register_values(initial_state.streaming, initial_state.za);
+  check_register_values(initial_state.streaming, initial_state.za);
+  check_register_values(initial_state.streaming, initial_state.za);
+  check_register_values(initial_state.streaming, initial_state.za);
+  check_register_values(initial_state.streaming, initial_state.za);
+  check_register_values(initial_state.streaming, initial_state.za);
+  // To catch us in case there are not enough above.
+  exit(2);
+
+  return 0;
+}