[InstCombine] Try to fold add into GEP x, C #85090
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

Changes: This attempts to replace GEP p, (x + C1), C2 with GEP p, x, C2+C1*S, removing the need for the add by folding it into the following constant offset in the GEP.

Example proof using a much reduced pointer address space: https://alive2.llvm.org/ce/z/hG8xfF

Full diff: https://github.com/llvm/llvm-project/pull/85090.diff

2 Files Affected:
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1688005de2104d..7932822c0f7464 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2893,6 +2893,37 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
}
+
+    // Try to replace GEP p, (x + C1), C2 with GEP p, x, C2+C1*S
+    if (GEP.getNumIndices() > 1) {
+      gep_type_iterator GTI = gep_type_begin(GEP);
+      for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end() - 1;
+           I != E; ++I, ++GTI) {
+        if (!GTI.isSequential())
+          break;
+        Value *X;
+        const APInt *C1, *C2;
+        User::op_iterator Next = std::next(I);
+        if (match(I->get(), m_Add(m_Value(X), m_APInt(C1))) &&
+            match(Next->get(), m_APInt(C2))) {
+          TypeSize Scale1 = GTI.getSequentialElementStride(DL);
+          if (Scale1.isScalable() || !(++GTI).isSequential())
+            break;
+          TypeSize Scale2 = GTI.getSequentialElementStride(DL);
+          if (Scale2.isScalable())
+            break;
+
+          // Update the GEP instruction indices, and add Add to the worklist
+          // so that it can be DCEd.
+          Instruction *Add = cast<Instruction>(*I);
+          *I = X;
+          *Next = ConstantInt::get((*Next)->getType(),
+                                   *C2 + *C1 * (Scale1 / Scale2));
+          addToWorklist(Add);
+          return &GEP;
+        }
+      }
+    }
}
if (Instruction *R = foldSelectGEP(GEP, Builder))
diff --git a/llvm/test/Transforms/InstCombine/gepadd.ll b/llvm/test/Transforms/InstCombine/gepadd.ll
new file mode 100644
index 00000000000000..895bb94ca99576
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/gepadd.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -S -passes=instcombine | FileCheck %s
+
+target datalayout = "e-p:64:64:64"
+
+define ptr @add1(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @add1(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[G:%.*]] = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr [[P]], i64 0, i64 [[O]], i64 11, i64 -6, i64 [[N]]
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, 1
+ %g = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr %p, i64 0, i64 %a, i64 1, i64 -6, i64 %n
+ ret ptr %g
+}
+
+define ptr @sub1(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @sub1(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[G:%.*]] = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr [[P]], i64 0, i64 [[O]], i64 -9, i64 -6, i64 [[N]]
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = sub i64 %o, 1
+ %g = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr %p, i64 0, i64 %a, i64 1, i64 -6, i64 %n
+ ret ptr %g
+}
+
+define ptr @add10(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @add10(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[G:%.*]] = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr [[P]], i64 0, i64 [[O]], i64 101, i64 -6, i64 [[N]]
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, 10
+ %g = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr %p, i64 0, i64 %a, i64 1, i64 -6, i64 %n
+ ret ptr %g
+}
+
+define ptr @nooff(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @nooff(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = add i64 [[O]], 10
+; CHECK-NEXT: [[G:%.*]] = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr [[P]], i64 0, i64 [[A]], i64 [[N]]
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, 10
+ %g = getelementptr [10 x [10 x [10 x [10 x i32]]]], ptr %p, i64 0, i64 %a, i64 %n
+ ret ptr %g
+}
+
+define ptr @inbounds(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @inbounds(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = add i64 [[O]], 1
+; CHECK-NEXT: [[G:%.*]] = getelementptr inbounds [10 x [10 x [10 x [10 x i32]]]], ptr [[P]], i64 0, i64 [[A]], i64 1, i64 -6, i64 [[N]]
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, 1
+ %g = getelementptr inbounds [10 x [10 x [10 x [10 x i32]]]], ptr %p, i64 0, i64 %a, i64 1, i64 -6, i64 %n
+ ret ptr %g
+}
+
+%struct.Struct = type { i32, i32, ptr }
+define ptr @struct(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @struct(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[A:%.*]] = add i64 [[O]], -1
+; CHECK-NEXT: [[G:%.*]] = getelementptr [[STRUCT_STRUCT:%.*]], ptr [[P]], i64 [[A]], i32 1
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, -1
+ %g = getelementptr %struct.Struct, ptr %p, i64 %a, i32 1
+ ret ptr %g
+}
+
+define ptr @lessargs(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @lessargs(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[G:%.*]] = getelementptr [10 x [10 x [10 x i32]]], ptr [[P]], i64 0, i64 [[O]], i64 11
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, 1
+ %g = getelementptr [10 x [10 x [10 x i32]]], ptr %p, i64 0, i64 %a, i64 1
+ ret ptr %g
+}
+
+define ptr @twice(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @twice(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[G:%.*]] = getelementptr [10 x [10 x [10 x [10 x [10 x i32]]]]], ptr [[P]], i64 0, i64 [[O]], i64 11, i64 [[O]], i64 -13
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, 1
+ %b = sub i64 %o, 1
+ %g = getelementptr [10 x [10 x [10 x [10 x [10 x i32]]]]], ptr %p, i64 0, i64 %a, i64 1, i64 %b, i64 -3
+ ret ptr %g
+}
+
+define ptr @simpler(ptr %p, i64 %o, i64 %n) {
+; CHECK-LABEL: define ptr @simpler(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[O:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[G:%.*]] = getelementptr [16 x [8 x i32]], ptr [[P]], i64 0, i64 [[O]], i64 65
+; CHECK-NEXT: ret ptr [[G]]
+;
+ %a = add i64 %o, 8
+ %g = getelementptr [16 x [8 x i32]], ptr %p, i64 0, i64 %a, i64 1
+ ret ptr %g
+}
✅ With the latest revision this PR passed the C/C++ code formatter.
*Next = ConstantInt::get((*Next)->getType(),
                         *C2 + *C1 * (Scale1 / Scale2));
addToWorklist(Add);
return &GEP;
Is it safe to propagate the inbounds flag?
Alive2: https://alive2.llvm.org/ce/z/QD6chN
Yeah, it would need to be dropped. This is inside an if (!GEP.isInBounds()) block, so we already know it is not inbounds. I wasn't sure which is better for optimization: a = add x, c1; gep inbounds p, a, c2 versus gep p, x, c3 without the inbounds. As the motivating case I had didn't have inbounds (even though it is from a global), I opted for the simpler route that hopefully wouldn't make anything worse.
I believe it should be OK to change it around though; I don't think this comes up a huge amount either way.
I'm not really willing to take patches for the formation of "big GEPs" anymore at this point. Our canonicalization direction should start to go in the opposite direction, breaking up these GEPs into smaller parts. At that point, the existing gep+add to gep+gep canonicalization will also handle this.
I'm also not willing to take folds that apply only if flags are not present. That goes contrary to our general philosophy on handling of poison-generating flags.
> I'm not really willing to take patches for the formation of "big GEPs" anymore at this point. Our canonicalization direction should start to go in the opposite direction, breaking up these GEPs into smaller parts. At that point, the existing gep+add to gep+gep canonicalization will also handle this.
Yeah I thought you might bring that up :) I was hoping we could be reasonable and take optimizations like this as a short-term improvement until the canonicalisation is ready, if the patch was simple enough. (This wasn't attempting to make bigger geps, just change the ones that already existed). I had seen that with smaller geps it was already handled better, this just reduces some of the differences between the current larger geps and the future smaller ones.
Force-pushed from adeb287 to 6c8c91e.
This attempts to replace GEP p, (x + C1), C2 with GEP p, x, C2+C1*S, removing the need for the add by folding it into the following constant offset in the GEP. It does not attempt to alter inbounds geps, to avoid the need to drop the inbounds. Example proof using a much reduced pointer address space: https://alive2.llvm.org/ce/z/hG8xfF
Force-pushed from 6c8c91e to 67c1d52.
I'd like to try this again if I can. This doesn't seem like an unreasonable patch to take in the short term, and as far as I understand it is a (little) step towards where we expect codegen to end up. That is, it performs an optimization that will happen anyway if geps are canonicalized to individual offsets, bringing the final codegen closer to what it will later be. And it doesn't create larger geps, just transforms existing large geps to be a little different. I've given the patch a rebase. Let me know what you think.
I think we should at least try to solve this by breaking up the GEPs first. If that is not feasible without a lot of effort right now (because too many other opts depend on the status quo), then we can consider this direction. |
Unfortunately it looks like this might not be exactly true, and canonicalizing the geps (at least in the way I have tried it) does not produce the same final output I was hoping for, I think partly because of multiple uses. |
Can you share the motivating example? |
This attempts to replace GEP p, (x + C1), C2 with GEP p, x, C2+C1*S, removing the need for the add by folding it into the following constant offset in the GEP. It drops inbounds from geps, as they may no longer be valid.
Example proof using a much reduced pointer address space: https://alive2.llvm.org/ce/z/hG8xfF