Skip to content

[ARM] CostPerUse for high registers should only apply in instructions with 16-bit variants #168570

@john-brawn-arm

Description

@john-brawn-arm

ARMRegisterInfo.td defines the integer registers like this:

 // Integer registers
// R0-R7 are defined outside any CostPerUse override.
def R0  : ARMReg< 0, "r0">,  DwarfRegNum<[0]>;
def R1  : ARMReg< 1, "r1">,  DwarfRegNum<[1]>;
def R2  : ARMReg< 2, "r2">,  DwarfRegNum<[2]>;
def R3  : ARMReg< 3, "r3">,  DwarfRegNum<[3]>;
def R4  : ARMReg< 4, "r4">,  DwarfRegNum<[4]>;
def R5  : ARMReg< 5, "r5">,  DwarfRegNum<[5]>;
def R6  : ARMReg< 6, "r6">,  DwarfRegNum<[6]>;
def R7  : ARMReg< 7, "r7">,  DwarfRegNum<[7]>;
// These require 32-bit instructions.
let CostPerUse = [1] in {
def R8  : ARMReg< 8, "r8">,  DwarfRegNum<[8]>;
def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
// SP, LR and PC additionally carry the raw names r13-r15 as alternate names.
let RegAltNameIndices = [RegNamesRaw] in {
def SP  : ARMReg<13, "sp", [], ["r13"]>,  DwarfRegNum<[13]>;
def LR  : ARMReg<14, "lr", [], ["r14"]>,  DwarfRegNum<[14]>;
def PC  : ARMReg<15, "pc", [], ["r15"]>,  DwarfRegNum<[15]>;
} // RegAltNameIndices = [RegNamesRaw]
} // CostPerUse = [1]

The additional CostPerUse on high registers only makes sense for instructions that have 16-bit variants, since only those become larger when given a high register. Because the cost is applied to every use, the register allocator can end up inserting extra MOV instructions to/from high registers whenever that results in more instructions using low registers. An example is the following, reduced from https://github.com/ARM-software/CMSIS-DSP/blob/main/Source/FilteringFunctions/arm_fir_q7.c:

#include <arm_mve.h>
/*
 * Reduced q7 FIR kernel using MVE intrinsics; serves as the reproducer for
 * the register-allocation behaviour discussed in the surrounding text.
 *
 * pState    - state buffer; new samples are written starting at
 *             pState[numTaps - 1] and filter windows are read from its start
 * pCoeffs   - coefficients; a single 16-byte vector is loaded from here
 * numTaps   - selects the write offset into pState and the length
 *             (numTaps - 1) of the final state copy
 * pSrc      - input samples
 * pDst      - output samples, one per input sample
 * blockSize - number of samples to process
 */
void arm_fir_q7_1_16_mve(signed char *pState,
                         const signed char *pCoeffs,
                         unsigned int numTaps,
                         const signed char * __restrict pSrc,
                         signed char * __restrict pDst,
                         unsigned int blockSize)
{
  /* Load the coefficient vector once; the state write cursor starts at
     pState[numTaps - 1]. */
  int8x16_t vecCoeff = vldrbq_s8(pCoeffs);  signed char *pStateCur = &(pState[(numTaps - 1u)]);
  const signed char *pTempSrc = pSrc;
  const signed char *pSamples = pState;
  /* blkCnt counts groups of four samples. */
  signed char *pOutput = pDst;  signed int blkCnt = blockSize >> 2;
  while (blkCnt > 0) {
    /* Append the next four input samples to the state buffer. */
    vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc));
    pStateCur += 4;
    pTempSrc += 4;
    /* Four outputs per group: each is the across-lane multiply-accumulate of
       a 16-byte state window with the coefficients, shifted right by 7 and
       saturated to 8 bits. */
    for (int j = 0; j < 4; j++) {
      int8x16_t vecIn0 = vld1q(pSamples + j);
      signed int acc = vmladavaq(0, vecIn0, vecCoeff);
      *pOutput++ = __builtin_arm_ssat((acc >> 7U), 8);
    }
    pSamples += 4;
    blkCnt--;
  }  signed int residual = blockSize & 3;
  /* Tail: the remaining blockSize % 4 samples are first copied into the
     state one byte at a time, then filtered as above. */
  for (int i = 0; i < residual; i++)
    *pStateCur++ = *pTempSrc++;  for (int j = 0; j < residual; j++) {
    int8x16_t vecIn0 = vld1q(pSamples + j);
    signed int acc = vmladavaq(0, vecIn0, vecCoeff);
    *pOutput++ = __builtin_arm_ssat((acc >> 7U), 8);
  }  pTempSrc = &pState[blockSize];
  /* Move the numTaps - 1 bytes starting at pState[blockSize] to the start of
     pState, 16 bytes per iteration. */
  signed char *pTempDest = pState;
  blkCnt = numTaps - 1;
  /* do/while: the body runs at least once, even when numTaps - 1 is 0. */
  do {
    /* Predicate enabling only the lanes still covered by blkCnt. */
    mve_pred16_t p = vctp8q(blkCnt);
    vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p);
    pTempSrc += 16;
    pTempDest += 16;
    blkCnt -= 16;
  }
  while (blkCnt > 0);
} 

When compiled with clang --target=arm-none-eabi -mcpu=cortex-m55 -O3 -ffast-math -mfloat-abi=hard the main loop looks like this:

.LBB0_2:                                @ =>This Inner Loop Header: Depth=1
        @ Copy four bytes from [r3] to [r5], post-incrementing both pointers.
        vldrb.u32       q1, [r3], #4
        vstrb.32        q1, [r5], #4
        @ r5 is copied to high register r9 because r5 is overwritten by an
        @ ssat result further down.
        mov     r9, r5
        @ Four loads from [r4] (post-incremented by 4 on the first), each
        @ followed by a vmlav.s8 against q0 into r2, r10, r8 and r12.
        vldrb.u8        q1, [r4], #4
        vmlav.s8        r2, q1, q0
        vldrb.u8        q1, [r4, #-3]
        vmlav.s8        r10, q1, q0
        vldrb.u8        q1, [r4, #-2]
        vmlav.s8        r8, q1, q0
        vldrb.u8        q1, [r4, #-1]
        vmlav.s8        r12, q1, q0
        @ Saturate each accumulator (asr #7, 8 bits) and store the four bytes
        @ through r1. Destinations are low registers r7, r5, r2; r5 is used
        @ here as a temporary.
        ssat    r7, #8, r2, asr #7
        ssat    r5, #8, r10, asr #7
        ssat    r2, #8, r8, asr #7
        strb    r7, [r1], #4
        ssat    r7, #8, r12, asr #7
        strb    r5, [r1, #-3]
        @ Restore r5 from r9 once the temporary value has been stored.
        mov     r5, r9
        strb    r2, [r1, #-2]
        strb    r7, [r1, #-1]
        le      lr, .LBB0_2

The register allocator has preferred to have a mov from r5 to r9 and back instead of using r9 in an ssat and a strb instruction. But these instructions don't have 16-bit variants, so they are 32-bit instructions no matter which registers they use; all we've done is pointlessly add two extra mov instructions to the loop.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions