Skip to content

Conversation

@davemgreen
Copy link
Collaborator

In selecting a G_STORE with i8 memty and i16 type, we would try to generate a %6:gpr32 = COPY %3.sub_32:gpr32all instruction. Make sure we only add the subreg if the Ty is larger than 32bits and the subreg is needed.

In selecting a G_STORE with i8 memty and i16 type, we would try to generate a
%6:gpr32 = COPY %3.sub_32:gpr32all instruction. Make sure we only add the
subreg if the Ty is larger than 32bits and the subreg is needed.
@llvmbot
Copy link
Member

llvmbot commented Dec 8, 2025

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

In selecting a G_STORE with i8 memty and i16 type, we would try to generate a %6:gpr32 = COPY %3.sub_32:gpr32all instruction. Make sure we only add the subreg if the Ty is larger than 32bits and the subreg is needed.


Full diff: https://github.com/llvm/llvm-project/pull/171100.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (+2)
  • (modified) llvm/test/CodeGen/AArch64/load-store-forwarding.ll (+80-23)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index f9db39e5f8622..88f1778680efe 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -3015,6 +3015,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
         return false;
 
       // Generate a subreg copy.
+      if (RB.getID() == AArch64::GPRRegBankID && ValTy.getSizeInBits() < 32)
+        SubReg = 0;
       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
                       .addReg(ValReg, 0, SubReg)
                       .getReg(0);
diff --git a/llvm/test/CodeGen/AArch64/load-store-forwarding.ll b/llvm/test/CodeGen/AArch64/load-store-forwarding.ll
index 02efbe9b409de..f1635239b146b 100644
--- a/llvm/test/CodeGen/AArch64/load-store-forwarding.ll
+++ b/llvm/test/CodeGen/AArch64/load-store-forwarding.ll
@@ -1,8 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefix CHECK-BE
-; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefix CHECK-LE
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefixes CHECK-LE,CHECK-SD
+; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefixes CHECK-BE
+; RUN: llc -mtriple=aarch64 -global-isel -o - %s | FileCheck %s --check-prefixes CHECK-LE,CHECK-GI
 
 define i8 @test1(i32 %a, ptr %pa) {
+; CHECK-SD-LABEL: test1:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    str w0, [x1]
+; CHECK-SD-NEXT:    ret
+;
 ; CHECK-BE-LABEL: test1:
 ; CHECK-BE:       // %bb.0:
 ; CHECK-BE-NEXT:    mov w8, w0
@@ -10,27 +16,28 @@ define i8 @test1(i32 %a, ptr %pa) {
 ; CHECK-BE-NEXT:    str w8, [x1]
 ; CHECK-BE-NEXT:    ret
 ;
-; CHECK-LE-LABEL: test1:
-; CHECK-LE:       // %bb.0:
-; CHECK-LE-NEXT:    str w0, [x1]
-; CHECK-LE-NEXT:    ret
+; CHECK-GI-LABEL: test1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    str w0, [x1]
+; CHECK-GI-NEXT:    and w0, w0, #0xff
+; CHECK-GI-NEXT:    ret
   store i32 %a, ptr %pa
   %res = load i8, ptr %pa
   ret i8 %res
 }
 
 define i8 @test2(i32 %a, ptr %pa) {
-; CHECK-BE-LABEL: test2:
-; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    str w0, [x1]
-; CHECK-BE-NEXT:    ldrb w0, [x1, #1]
-; CHECK-BE-NEXT:    ret
-;
 ; CHECK-LE-LABEL: test2:
 ; CHECK-LE:       // %bb.0:
 ; CHECK-LE-NEXT:    str w0, [x1]
 ; CHECK-LE-NEXT:    ubfx w0, w0, #8, #8
 ; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test2:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #1]
+; CHECK-BE-NEXT:    ret
   %p8 = getelementptr i8, ptr %pa, i32 1
   store i32 %a, ptr %pa
   %res = load i8, ptr %p8
@@ -38,17 +45,17 @@ define i8 @test2(i32 %a, ptr %pa) {
 }
 
 define i8 @test3(i32 %a, ptr %pa) {
-; CHECK-BE-LABEL: test3:
-; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    str w0, [x1]
-; CHECK-BE-NEXT:    ldrb w0, [x1, #2]
-; CHECK-BE-NEXT:    ret
-;
 ; CHECK-LE-LABEL: test3:
 ; CHECK-LE:       // %bb.0:
 ; CHECK-LE-NEXT:    str w0, [x1]
 ; CHECK-LE-NEXT:    ubfx w0, w0, #16, #8
 ; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test3:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #2]
+; CHECK-BE-NEXT:    ret
   %p8 = getelementptr i8, ptr %pa, i32 2
   store i32 %a, ptr %pa
   %res = load i8, ptr %p8
@@ -56,18 +63,68 @@ define i8 @test3(i32 %a, ptr %pa) {
 }
 
 define i8 @test4(i32 %a, ptr %pa) {
-; CHECK-BE-LABEL: test4:
-; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    str w0, [x1]
-; CHECK-BE-NEXT:    ret
-;
 ; CHECK-LE-LABEL: test4:
 ; CHECK-LE:       // %bb.0:
 ; CHECK-LE-NEXT:    str w0, [x1]
 ; CHECK-LE-NEXT:    lsr w0, w0, #24
 ; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test4:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ret
   %p8 = getelementptr i8, ptr %pa, i32 3
   store i32 %a, ptr %pa
   %res = load i8, ptr %p8
   ret i8 %res
 }
+
+define i32 @load_i16_store_i8(ptr %p, ptr %q) {
+; CHECK-SD-LABEL: load_i16_store_i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldrb w8, [x0]
+; CHECK-SD-NEXT:    mov w0, wzr
+; CHECK-SD-NEXT:    strb w8, [x1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-BE-LABEL: load_i16_store_i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldrb w8, [x0, #1]
+; CHECK-BE-NEXT:    mov w0, wzr
+; CHECK-BE-NEXT:    strb w8, [x1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_i16_store_i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldrh w8, [x0]
+; CHECK-GI-NEXT:    mov w0, wzr
+; CHECK-GI-NEXT:    strb w8, [x1]
+; CHECK-GI-NEXT:    ret
+entry:
+  %l = load i16, ptr %p, align 4
+  %tr = trunc i16 %l to i8
+  store i8 %tr, ptr %q, align 1
+  ret i32 0
+}
+
+define i32 @load_i16_store_i8_freeze(ptr %p, ptr %q) {
+; CHECK-LE-LABEL: load_i16_store_i8_freeze:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ldrh w8, [x0]
+; CHECK-LE-NEXT:    mov w0, wzr
+; CHECK-LE-NEXT:    strb w8, [x1]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: load_i16_store_i8_freeze:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ldrh w8, [x0]
+; CHECK-BE-NEXT:    mov w0, wzr
+; CHECK-BE-NEXT:    strb w8, [x1]
+; CHECK-BE-NEXT:    ret
+entry:
+  %l = load i16, ptr %p, align 4
+  %fr = freeze i16 %l
+  %tr = trunc i16 %fr to i8
+  store i8 %tr, ptr %q, align 1
+  ret i32 0
+}

Comment on lines +2 to +3
; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefixes CHECK-LE,CHECK-SD
; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefixes CHECK-BE
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefixes CHECK-LE,CHECK-SD
; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefixes CHECK-BE
; RUN: llc -global-isel=0 -mtriple=aarch64 -o - %s | FileCheck %s --check-prefixes CHECK-LE,CHECK-SD
; RUN: llc -global-isel=0 -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefixes CHECK-BE


// Generate a subreg copy.
if (RB.getID() == AArch64::GPRRegBankID && ValTy.getSizeInBits() < 32)
SubReg = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
SubReg = 0;
SubReg = AArch64::NoSubRegister;

return false;

// Generate a subreg copy.
if (RB.getID() == AArch64::GPRRegBankID && ValTy.getSizeInBits() < 32)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logical flow seems off; it's correcting a wrong result above for a specific case. I'd expect getSubRegForClass to work in terms of reporting an accurate subregister, based on the register class of the value type not memory

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants