[Passes] add a tail-call-elim pass near the end of the opt pipeline
We run tail-call-elim near the beginning of the pipeline,
but that is too early to annotate calls that are added by later passes.

In the motivating case from issue #47852, the missing 'tail'
on memset leads to sub-optimal codegen.
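
A minimal sketch of the kind of IR involved (function name and sizes
invented here for illustration): a memset materialized by a later pass,
e.g. loop-idiom recognition, carries no 'tail' marker, and before this
patch nothing ran late enough to add one. The new late tail-call-elim
instance can annotate it:

  declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)

  define void @zero64(i8* %p) {
  entry:
    ; Before this patch, this call stayed unannotated; a late
    ; 'opt -passes=tailcallelim' run can rewrite it to
    ;   tail call void @llvm.memset.p0i8.i64(...)
    call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 64, i1 false)
    ret void
  }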

I experimented with removing the early instance of
tail-call-elim instead of just adding another pass, but that
appears to be slightly worse for compile-time:
+0.15% vs. +0.08% time.
"tailcall" shows adding the pass; "tailcall2" shows moving
the pass to later, then adding the original early pass back
(so 1596886 is functionally equivalent to 180b043):
https://llvm-compile-time-tracker.com/index.php?config=NewPM-O3&stat=instructions&remote=rotateright

Note that there was an effort to split the tail-call functionality
into 2 passes (D60031); that could help reduce compile time if this
change turns out to cost more than the preliminary testing suggests.

Differential Revision: https://reviews.llvm.org/D130374
rotateright committed Jul 25, 2022
1 parent 95f4ca7 commit bfb9b8e
Showing 455 changed files with 20,939 additions and 20,952 deletions.
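
Most of the churn below is autogenerated FileCheck lines flipping from
'call' to 'tail call'. As a hedged reminder of why the marker matters
(invented example; actual lowering depends on the target): a marked
call that ends a function can be emitted as a jump instead of a full
call sequence.

  declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)

  define void @clear(i8* %p) {
  entry:
    ; With 'tail', the backend may lower the memset libcall as a tail
    ; jump (e.g. 'b memset' on AArch64) rather than 'bl memset' plus
    ; frame setup and 'ret'.
    tail call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 256, i1 false)
    ret void
  }
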
clang/test/CodeGen/aarch64-ls64-inline-asm.c (6 changes: 3 additions & 3 deletions)
@@ -5,7 +5,7 @@ struct foo { unsigned long long x[8]; };

// CHECK-LABEL: @load(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
// CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.foo* [[OUTPUT:%.*]] to i512*
// CHECK-NEXT: store i512 [[TMP0]], i512* [[TMP1]], align 8
// CHECK-NEXT: ret void
@@ -19,7 +19,7 @@ void load(struct foo *output, void *addr)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.foo* [[INPUT:%.*]] to i512*
// CHECK-NEXT: [[TMP1:%.*]] = load i512, i512* [[TMP0]], align 8
-// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
+// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
// CHECK-NEXT: ret void
//
void store(const struct foo *input, void *addr)
@@ -74,7 +74,7 @@ void store(const struct foo *input, void *addr)
// CHECK-NEXT: [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512
// CHECK-NEXT: [[S_SROA_0_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]]
// CHECK-NEXT: [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], [[S_SROA_0_0_INSERT_EXT]]
-// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
+// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
// CHECK-NEXT: ret void
//
void store2(int *in, void *addr)
clang/test/CodeGen/aarch64-neon-vcmla.c (104 changes: 52 additions & 52 deletions)

Large diffs are not rendered by default.

@@ -52,20 +52,20 @@ vec2048 x2048 = {0, 1, 2, 3, 3 , 2 , 1, 0, 0, 1, 2, 3, 3 , 2 , 1, 0,
typedef int8_t vec_int8 __attribute__((vector_size(N / 8)));
// CHECK128-LABEL: define{{.*}} <16 x i8> @f2(<16 x i8> noundef %x)
// CHECK128-NEXT: entry:
-// CHECK128-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-// CHECK128-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> [[X:%.*]], i64 0)
-// CHECK128-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP0]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
-// CHECK128-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK128-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+// CHECK128-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> [[X:%.*]], i64 0)
+// CHECK128-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP0]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
+// CHECK128-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP1]], i64 0)
// CHECK128-NEXT: ret <16 x i8> [[CASTFIXEDSVE]]

// CHECK-LABEL: define{{.*}} void @f2(
// CHECK-SAME: <[[#div(VBITS,8)]] x i8>* noalias nocapture writeonly sret(<[[#div(VBITS,8)]] x i8>) align 16 %agg.result, <[[#div(VBITS,8)]] x i8>* nocapture noundef readonly %0)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]]
-// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8(<vscale x 16 x i8> undef, <[[#div(VBITS,8)]] x i8> [[X]], i64 0)
-// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
-// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8(<vscale x 16 x i8> undef, <[[#div(VBITS,8)]] x i8> [[X]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
+// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 0)
// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[CASTFIXEDSVE]], <[[#div(VBITS,8)]] x i8>* [[AGG_RESULT:%.*]], align 16, [[TBAA6]]
// CHECK-NEXT: ret void
vec_int8 f2(vec_int8 x) { return svasrd_x(svptrue_b8(), x, 1); }
Expand All @@ -80,14 +80,14 @@ typedef svint8_t vec2 __attribute__((arm_sve_vector_bits(N)));

// CHECK128-LABEL: define{{.*}} void @g(<vscale x 16 x i8> noundef %x.coerce)
// CHECK128-NEXT: entry:
-// CHECK128-NEXT: [[X:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
+// CHECK128-NEXT: [[X:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
// CHECK128-NEXT: call void @f3(<16 x i8> noundef [[X]]) [[ATTR5:#.*]]
// CHECK128-NEXT: ret void

// CHECK-LABEL: define{{.*}} void @g(<vscale x 16 x i8> noundef %x.coerce)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS,8)]] x i8>, align 16
-// CHECK-NEXT: [[X:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
+// CHECK-NEXT: [[X:%.*]] = tail call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[X]], <[[#div(VBITS,8)]] x i8>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6]]
// CHECK-NEXT: call void @f3(<[[#div(VBITS,8)]] x i8>* noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]]
// CHECK-NEXT: ret void
@@ -49,10 +49,10 @@ void test02() {
// CHECK-SAME: [[#VBITS]]
// CHECK-SAME: EES_(<vscale x 4 x i32> noundef %x.coerce, <vscale x 4 x i32> noundef %y.coerce)
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[X:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
-// CHECK-NEXT: [[Y:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE1:%.*]], i64 0)
+// CHECK-NEXT: [[X:%.*]] = tail call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
+// CHECK-NEXT: [[Y:%.*]] = tail call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE1:%.*]], i64 0)
// CHECK-NEXT: [[ADD:%.*]] = add <[[#div(VBITS, 32)]] x i32> [[Y]], [[X]]
-// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v[[#div(VBITS, 32)]]i32(<vscale x 4 x i32> undef, <[[#div(VBITS, 32)]] x i32> [[ADD]], i64 0)
+// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v[[#div(VBITS, 32)]]i32(<vscale x 4 x i32> undef, <[[#div(VBITS, 32)]] x i32> [[ADD]], i64 0)
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
typedef svint32_t vec __attribute__((arm_sve_vector_bits(N)));
auto f(vec x, vec y) { return x + y; } // Returns a vec.
Expand All @@ -68,11 +68,11 @@ typedef svint16_t vec2 __attribute__((arm_sve_vector_bits(N)));
// CHECK-SAME: [[#VBITS]]
// CHECK-SAME: EE(<vscale x 8 x i16> noundef %x.coerce)
// CHECK-NEXT: entry:
-// CHECK128-NEXT: [[X:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
+// CHECK128-NEXT: [[X:%.*]] = tail call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
// CHECK128-NEXT: call void @_Z1fDv8_s(<8 x i16> noundef [[X]]) [[ATTR5:#.*]]
// CHECK128-NEXT: ret void
// CHECKWIDE-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS, 16)]] x i16>, align 16
-// CHECKWIDE-NEXT: [[X:%.*]] = call <[[#div(VBITS, 16)]] x i16> @llvm.vector.extract.v[[#div(VBITS, 16)]]i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
+// CHECKWIDE-NEXT: [[X:%.*]] = tail call <[[#div(VBITS, 16)]] x i16> @llvm.vector.extract.v[[#div(VBITS, 16)]]i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
// CHECKWIDE-NEXT: store <[[#div(VBITS, 16)]] x i16> [[X]], <[[#div(VBITS, 16)]] x i16>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6:!tbaa !.*]]
// CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS, 16)]]_s(<[[#div(VBITS, 16)]] x i16>* noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]]
// CHECKWIDE-NEXT: ret void