Skip to content

Conversation

@nhaehnle
Copy link
Collaborator

@nhaehnle nhaehnle commented Dec 3, 2025

The second pass of promotion to vector can be quite simple. Reflect that
simplicity in the code for better maintainability.

commit-id:cbc1e9ae
Create more canonical code that may even lead to slightly better
codegen.

commit-id:b8d6fdbb
@llvmbot
Copy link
Member

llvmbot commented Dec 3, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

Changes

Create more canonical code that may even lead to slightly better
codegen.


Stack:

  • [5/5] #170512
  • [4/5] #170511 ⬅
  • [3/5] #170510
  • [2/5] #170509
  • [1/5] #170508

⚠️ Part of a stack created by spr. Merging this PR using the GitHub UI may have unexpected results.


Full diff: https://github.com/llvm/llvm-project/pull/170511.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+7-6)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll (+15-13)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll (+8-4)
  • (modified) llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll (+1-1)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 73ec607014d31..efd3664266dee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -461,13 +461,15 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return nullptr;
 
   Value *Offset = VarOffset.first;
-  auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
-  if (!OffsetType)
+  if (!isa<IntegerType>(Offset->getType()))
     return nullptr;
 
+  Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));
+  if (Offset != VarOffset.first)
+    NewInsts.push_back(cast<Instruction>(Offset));
+
   if (!OffsetQuot.isOne()) {
-    ConstantInt *ConstMul =
-        ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth()));
+    ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
     Offset = Builder.CreateMul(Offset, ConstMul);
     if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
       NewInsts.push_back(NewInst);
@@ -475,8 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (ConstOffset.isZero())
     return Offset;
 
-  ConstantInt *ConstIndex =
-      ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth()));
+  ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
   Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
   if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
     NewInsts.push_back(NewInst);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index 63622e67e7d0b..7b64d8728cc24 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -262,14 +262,15 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(ptr %out) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 3, i32 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 5, i32 5
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[SEL3]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP7]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP16]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP17]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP16]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
 ; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
@@ -311,15 +312,16 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 3, i32 3
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 6
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[SEL3]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP17]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP8]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP19]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP18]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP20]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
 ; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
index a865bf5058d6a..7da441f2e79d2 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
@@ -11,8 +11,10 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
@@ -39,8 +41,10 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 9fb73963153a2..aaec725f85890 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4

if (!isa<IntegerType>(Offset->getType()))
return nullptr;

Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this using signed anything? The indexes should be treated as unsigned?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This patch changed it to signed: #157682
Before that, the unsigned treatment caused a bug: #155415 (comment)
GEPs with negative 32-bit indices were promoted into broken 64-bit extract-element indices.

Base automatically changed from users/nhaehnle/spr/main/cbc1e9ae to main December 5, 2025 20:54
@nhaehnle nhaehnle merged commit f558c30 into main Dec 5, 2025
12 checks passed
@nhaehnle nhaehnle deleted the users/nhaehnle/spr/main/b8d6fdbb branch December 5, 2025 20:55
nhaehnle added a commit that referenced this pull request Dec 6, 2025
Create more canonical code that may even lead to slightly better
codegen.

commit-id:a3832fee
nhaehnle added a commit that referenced this pull request Dec 6, 2025
Create more canonical code that may even lead to slightly better
codegen.

commit-id:a3832fee
nhaehnle added a commit that referenced this pull request Dec 6, 2025
…170956)

Create more canonical code that may even lead to slightly better
codegen.
honeygoyal pushed a commit to honeygoyal/llvm-project that referenced this pull request Dec 9, 2025
Create more canonical code that may even lead to slightly better
codegen.
honeygoyal pushed a commit to honeygoyal/llvm-project that referenced this pull request Dec 9, 2025
honeygoyal pushed a commit to honeygoyal/llvm-project that referenced this pull request Dec 9, 2025
…" (llvm#170956)

Create more canonical code that may even lead to slightly better
codegen.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants