AMDGPU/PromoteAlloca: Always use i32 for indexing #170511

nhaehnle · 2025-12-03T16:45:13Z

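Convert the variable GEP offset to i32 (sign-extending or truncating as
needed) before computing the vector index. This creates more canonical
code that may even lead to slightly better codegen.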

Stack:

⚠️ Part of a stack created by spr. Merging this PR using the GitHub UI may have unexpected results.

commit-id:c5c3b1e8

commit-id:97c65f02

The second pass of promotion to vector can be quite simple. Reflect that simplicity in the code for better maintainability. commit-id:cbc1e9ae

Create more canonical code that may even lead to slightly better codegen. commit-id:b8d6fdbb

llvmbot · 2025-12-03T16:45:50Z

@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

Changes

Create more canonical code that may even lead to slightly better
codegen.

Stack:

[5/5] #170512
[4/5] #170511 ⬅
[3/5] #170510
[2/5] #170509
[1/5] #170508

⚠️ Part of a stack created by spr. Merging this PR using the GitHub UI may have unexpected results.

Full diff: https://github.com/llvm/llvm-project/pull/170511.diff

4 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+7-6)
(modified) llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll (+15-13)
(modified) llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll (+8-4)
(modified) llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll (+1-1)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 73ec607014d31..efd3664266dee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -461,13 +461,15 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return nullptr;
 
   Value *Offset = VarOffset.first;
-  auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
-  if (!OffsetType)
+  if (!isa<IntegerType>(Offset->getType()))
     return nullptr;
 
+  Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));
+  if (Offset != VarOffset.first)
+    NewInsts.push_back(cast<Instruction>(Offset));
+
   if (!OffsetQuot.isOne()) {
-    ConstantInt *ConstMul =
-        ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth()));
+    ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
     Offset = Builder.CreateMul(Offset, ConstMul);
     if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
       NewInsts.push_back(NewInst);
@@ -475,8 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (ConstOffset.isZero())
     return Offset;
 
-  ConstantInt *ConstIndex =
-      ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth()));
+  ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
   Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
   if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
     NewInsts.push_back(NewInst);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index 63622e67e7d0b..7b64d8728cc24 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -262,14 +262,15 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(ptr %out) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 3, i32 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 5, i32 5
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[SEL3]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP7]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP16]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP17]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP16]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
 ; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
@@ -311,15 +312,16 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 3, i32 3
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 6
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[SEL3]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP17]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP8]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP19]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP18]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP20]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
 ; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
index a865bf5058d6a..7da441f2e79d2 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
@@ -11,8 +11,10 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
@@ -39,8 +41,10 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 9fb73963153a2..aaec725f85890 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4

arsenm · 2025-12-03T20:52:39Z

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

+  if (!isa<IntegerType>(Offset->getType()))
    return nullptr;

+  Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));


Why is this using a signed conversion at all? Shouldn't the indexes be treated as unsigned?

An earlier patch, #157682, changed the index handling to signed.
Before that patch, the unsigned treatment caused a bug (see #155415 (comment)):
GEPs with negative 32-bit indices were promoted into broken 64-bit extract-element indices.

This reverts commit f558c30. Failure on clang-hip-vega20: https://lab.llvm.org/buildbot/#/builders/123/builds/31779

Create more canonical code that may even lead to slightly better codegen. commit-id:a3832fee

Reland "AMDGPU/PromoteAlloca: Always use i32 for indexing (#170511)" (#170956) Create more canonical code that may even lead to slightly better codegen.

Create more canonical code that may even lead to slightly better codegen.

This reverts commit f558c30. Failure on clang-hip-vega20: https://lab.llvm.org/buildbot/#/builders/123/builds/31779

…" (llvm#170956) Create more canonical code that may even lead to slightly better codegen.

nhaehnle added 4 commits December 3, 2025 08:08

AMDGPU: Generalize and normalize some tests to avoid future churn

90f9f54

commit-id:c5c3b1e8

AMDGPU/PromoteAlloca: Extract getVectorTypeForAlloca helper

a443460

commit-id:97c65f02

AMDGPU/PromoteAlloca: Simplify how deferred loads work

c0f7873

The second pass of promotion to vector can be quite simple. Reflect that simplicity in the code for better maintainability. commit-id:cbc1e9ae

AMDGPU/PromoteAlloca: Always use i32 for indexing

0576459

Create more canonical code that may even lead to slightly better codegen. commit-id:b8d6fdbb

llvmbot added the backend:AMDGPU label Dec 3, 2025

nhaehnle requested review from ritter-x2a and zGoldthorpe December 3, 2025 16:48

arsenm approved these changes Dec 3, 2025

View reviewed changes

Base automatically changed from users/nhaehnle/spr/main/cbc1e9ae to main December 5, 2025 20:54

nhaehnle merged commit f558c30 into main Dec 5, 2025
12 checks passed

nhaehnle deleted the users/nhaehnle/spr/main/b8d6fdbb branch December 5, 2025 20:55

nhaehnle added a commit that referenced this pull request Dec 5, 2025

Revert "AMDGPU/PromoteAlloca: Always use i32 for indexing (#170511)"

de86696

This reverts commit f558c30. Failure on clang-hip-vega20: https://lab.llvm.org/buildbot/#/builders/123/builds/31779

nhaehnle added a commit that referenced this pull request Dec 6, 2025

Reland "AMDGPU/PromoteAlloca: Always use i32 for indexing (#170511)"

06702f0

Create more canonical code that may even lead to slightly better codegen. commit-id:a3832fee

nhaehnle added a commit that referenced this pull request Dec 6, 2025

Reland "AMDGPU/PromoteAlloca: Always use i32 for indexing (#170511)"

67b208e

Create more canonical code that may even lead to slightly better codegen. commit-id:a3832fee

nhaehnle added a commit that referenced this pull request Dec 6, 2025

Reland "AMDGPU/PromoteAlloca: Always use i32 for indexing (#170511)" (#…

8dee997

…170956) Create more canonical code that may even lead to slightly better codegen.

honeygoyal pushed a commit to honeygoyal/llvm-project that referenced this pull request Dec 9, 2025

AMDGPU/PromoteAlloca: Always use i32 for indexing (llvm#170511)

0259668

Create more canonical code that may even lead to slightly better codegen.

honeygoyal pushed a commit to honeygoyal/llvm-project that referenced this pull request Dec 9, 2025

Reland "AMDGPU/PromoteAlloca: Always use i32 for indexing (llvm#170511)…

63e6c00

…" (llvm#170956) Create more canonical code that may even lead to slightly better codegen.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU/PromoteAlloca: Always use i32 for indexing #170511

AMDGPU/PromoteAlloca: Always use i32 for indexing #170511

Uh oh!

nhaehnle commented Dec 3, 2025 •

edited

Loading

Uh oh!

llvmbot commented Dec 3, 2025

Uh oh!

arsenm Dec 3, 2025

Uh oh!

ritter-x2a Dec 4, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

AMDGPU/PromoteAlloca: Always use i32 for indexing #170511

AMDGPU/PromoteAlloca: Always use i32 for indexing #170511

Uh oh!

Conversation

nhaehnle commented Dec 3, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Dec 3, 2025

Uh oh!

arsenm Dec 3, 2025

Choose a reason for hiding this comment

Uh oh!

ritter-x2a Dec 4, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants

nhaehnle commented Dec 3, 2025 •

edited

Loading