[ARM] CostPerUse for high registers should only apply in instructions with 16-bit variants

ARMRegisterInfo.td defines the integer registers like this:
```
 // Integer registers
def R0  : ARMReg< 0, "r0">,  DwarfRegNum<[0]>;
def R1  : ARMReg< 1, "r1">,  DwarfRegNum<[1]>;
def R2  : ARMReg< 2, "r2">,  DwarfRegNum<[2]>;
def R3  : ARMReg< 3, "r3">,  DwarfRegNum<[3]>;
def R4  : ARMReg< 4, "r4">,  DwarfRegNum<[4]>;
def R5  : ARMReg< 5, "r5">,  DwarfRegNum<[5]>;
def R6  : ARMReg< 6, "r6">,  DwarfRegNum<[6]>;
def R7  : ARMReg< 7, "r7">,  DwarfRegNum<[7]>;
// These require 32-bit instructions.
let CostPerUse = [1] in {
def R8  : ARMReg< 8, "r8">,  DwarfRegNum<[8]>;
def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
let RegAltNameIndices = [RegNamesRaw] in {
def SP  : ARMReg<13, "sp", [], ["r13"]>,  DwarfRegNum<[13]>;
def LR  : ARMReg<14, "lr", [], ["r14"]>,  DwarfRegNum<[14]>;
def PC  : ARMReg<15, "pc", [], ["r15"]>,  DwarfRegNum<[15]>;
}
}
```
The additional CostPerUse on high registers only makes sense for instructions that have 16-bit variants. For instructions that only have 32-bit encodings, the extra cost can cause the register allocator to insert extra MOV instructions to/from high registers just so that more instructions use low registers, even though nothing is gained by doing so. An example is the following, reduced from https://github.com/ARM-software/CMSIS-DSP/blob/main/Source/FilteringFunctions/arm_fir_q7.c:
```
#include <arm_mve.h>
void arm_fir_q7_1_16_mve(signed char *pState,
                         const signed char *pCoeffs,
                         unsigned int numTaps,
                         const signed char * __restrict pSrc,
                         signed char * __restrict pDst,
                         unsigned int blockSize)
{
  int8x16_t vecCoeff = vldrbq_s8(pCoeffs);

  signed char *pStateCur = &(pState[(numTaps - 1u)]);
  const signed char *pTempSrc = pSrc;
  const signed char *pSamples = pState;
  signed char *pOutput = pDst;

  signed int blkCnt = blockSize >> 2;
  while (blkCnt > 0) {
    vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc));
    pStateCur += 4;
    pTempSrc += 4;
    for (int j = 0; j < 4; j++) {
      int8x16_t vecIn0 = vld1q(pSamples + j);
      signed int acc = vmladavaq(0, vecIn0, vecCoeff);
      *pOutput++ = __builtin_arm_ssat((acc >> 7U), 8);
    }
    pSamples += 4;
    blkCnt--;
  }

  signed int residual = blockSize & 3;
  for (int i = 0; i < residual; i++)
    *pStateCur++ = *pTempSrc++;

  for (int j = 0; j < residual; j++) {
    int8x16_t vecIn0 = vld1q(pSamples + j);
    signed int acc = vmladavaq(0, vecIn0, vecCoeff);
    *pOutput++ = __builtin_arm_ssat((acc >> 7U), 8);
  }

  pTempSrc = &pState[blockSize];
  signed char *pTempDest = pState;
  blkCnt = numTaps - 1;
  do {
    mve_pred16_t p = vctp8q(blkCnt);
    vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p);
    pTempSrc += 16;
    pTempDest += 16;
    blkCnt -= 16;
  }
  while (blkCnt > 0);
} 
```
When compiled with `clang --target=arm-none-eabi -mcpu=cortex-m55 -O3 -ffast-math -mfloat-abi=hard` the main loop looks like this:
```
.LBB0_2:                                @ =>This Inner Loop Header: Depth=1
        vldrb.u32       q1, [r3], #4
        vstrb.32        q1, [r5], #4
        mov     r9, r5
        vldrb.u8        q1, [r4], #4
        vmlav.s8        r2, q1, q0
        vldrb.u8        q1, [r4, #-3]
        vmlav.s8        r10, q1, q0
        vldrb.u8        q1, [r4, #-2]
        vmlav.s8        r8, q1, q0
        vldrb.u8        q1, [r4, #-1]
        vmlav.s8        r12, q1, q0
        ssat    r7, #8, r2, asr #7
        ssat    r5, #8, r10, asr #7
        ssat    r2, #8, r8, asr #7
        strb    r7, [r1], #4
        ssat    r7, #8, r12, asr #7
        strb    r5, [r1, #-3]
        mov     r5, r9
        strb    r2, [r1, #-2]
        strb    r7, [r1, #-1]
        le      lr, .LBB0_2
```
The register allocator has preferred to insert a mov from r5 to r9 and back rather than use r9 in an ssat and strb instruction. But these instructions don't have 16-bit variants, so we get 32-bit instructions no matter which registers are used; all we've done is pointlessly add two extra mov instructions to the loop.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[ARM] CostPerUse for high registers should only apply in instructions with 16-bit variants #168570

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[ARM] CostPerUse for high registers should only apply in instructions with 16-bit variants #168570

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions