Skip to content

Commit

Permalink
[clang][AArch64][SVE] Avoid going through memory for coerced VLST arg…
Browse files Browse the repository at this point in the history
…uments

VLST arguments are coerced to VLATs at the function boundary for
consistency with the VLAT ABI. They are then bitcast back to VLSTs in
the function prolog. Previously, this conversion is done through memory.
With the introduction of the llvm.vector.{insert,extract} intrinsic, we
can avoid going through memory here.

Depends on D92761

Differential Revision: https://reviews.llvm.org/D92762
  • Loading branch information
Joe Ellis committed Jan 5, 2021
1 parent c005518 commit 3d5b18a
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 130 deletions.
21 changes: 21 additions & 0 deletions clang/lib/CodeGen/CGCall.cpp
Expand Up @@ -2688,6 +2688,27 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
break;
}

// VLST arguments are coerced to VLATs at the function boundary for
// ABI consistency. If this is a VLST that was coerced to
// a VLAT at the function boundary and the types match up, use
// llvm.experimental.vector.extract to convert back to the original
// VLST.
if (auto *VecTyTo = dyn_cast<llvm::FixedVectorType>(ConvertType(Ty))) {
auto *Coerced = Fn->getArg(FirstIRArg);
if (auto *VecTyFrom =
dyn_cast<llvm::ScalableVectorType>(Coerced->getType())) {
if (VecTyFrom->getElementType() == VecTyTo->getElementType()) {
llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);

assert(NumIRArgs == 1);
Coerced->setName(Arg->getName() + ".coerce");
ArgVals.push_back(ParamValue::forDirect(Builder.CreateExtractVector(
VecTyTo, Coerced, Zero, "castFixedSve")));
break;
}
}
}

Address Alloca = CreateMemTemp(Ty, getContext().getDeclAlign(Arg),
Arg->getName());

Expand Down
Expand Up @@ -59,14 +59,14 @@ typedef int8_t vec_int8 __attribute__((vector_size(N / 8)));

// CHECK-LABEL: define{{.*}} void @f2(
// CHECK-SAME: <[[#div(VBITS,8)]] x i8>* noalias nocapture sret(<[[#div(VBITS,8)]] x i8>) align 16 %agg.result, <[[#div(VBITS,8)]] x i8>* nocapture readonly %0)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]]
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8(<vscale x 16 x i8> undef, <[[#div(VBITS,8)]] x i8> [[X]], i64 0)
// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.experimental.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 0)
// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[CASTFIXEDSVE]], <[[#div(VBITS,8)]] x i8>* [[AGG_RESULT:%.*]], align 16, [[TBAA6]]
// CHECK-NEXT: ret void
// CHECK-NEXT: entry:
// CHECK-NEXT: [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]]
// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8(<vscale x 16 x i8> undef, <[[#div(VBITS,8)]] x i8> [[X]], i64 0)
// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.experimental.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 0)
// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[CASTFIXEDSVE]], <[[#div(VBITS,8)]] x i8>* [[AGG_RESULT:%.*]], align 16, [[TBAA6]]
// CHECK-NEXT: ret void
vec_int8 f2(vec_int8 x) { return svasrd_x(svptrue_b8(), x, 1); }
#endif

Expand All @@ -78,24 +78,18 @@ void f3(vec1);
typedef svint8_t vec2 __attribute__((arm_sve_vector_bits(N)));

// CHECK128-LABEL: define{{.*}} void @g(<vscale x 16 x i8> %x.coerce)
// CHECK128-NEXT: entry:
// CHECK128-NEXT: [[X:%.*]] = alloca <16 x i8>, align 16
// CHECK128-NEXT: [[TMP0:%.*]] = bitcast <16 x i8>* [[X]] to <vscale x 16 x i8>*
// CHECK128-NEXT: store <vscale x 16 x i8> [[X_COERCE:%.*]], <vscale x 16 x i8>* [[TMP0]], align 16
// CHECK128-NEXT: [[X1:%.*]] = load <16 x i8>, <16 x i8>* [[X]], align 16, [[TBAA6:!tbaa !.*]]
// CHECK128-NEXT: call void @f3(<16 x i8> [[X1]]) [[ATTR5:#.*]]
// CHECK128-NEXT: entry:
// CHECK128-NEXT: [[X:%.*]] = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
// CHECK128-NEXT: call void @f3(<16 x i8> [[X]]) [[ATTR5:#.*]]
// CHECK128-NEXT: ret void

// CHECK-LABEL: define{{.*}} void @g(<vscale x 16 x i8> %x.coerce)
// CHECK-NEXT: entry:
// CHECK-NEXT: [[X:%.*]] = alloca <[[#div(VBITS,8)]] x i8>, align 16
// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS,8)]] x i8>, align 16
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <[[#div(VBITS,8)]] x i8>* [[X]] to <vscale x 16 x i8>*
// CHECK-NEXT: store <vscale x 16 x i8> [[X_COERCE:%.*]], <vscale x 16 x i8>* [[TMP0]], align 16
// CHECK-NEXT: [[X1:%.*]] = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* [[X]], align 16, [[TBAA6]]
// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[X1]], <[[#div(VBITS,8)]] x i8>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6]]
// CHECK-NEXT: call void @f3(<[[#div(VBITS,8)]] x i8>* nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]]
// CHECK-NEXT: ret void
// CHECK-NEXT: entry:
// CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS,8)]] x i8>, align 16
// CHECK-NEXT: [[X:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.experimental.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
// CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[X]], <[[#div(VBITS,8)]] x i8>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6]]
// CHECK-NEXT: call void @f3(<[[#div(VBITS,8)]] x i8>* nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]]
// CHECK-NEXT: ret void

// CHECK128-LABEL: declare void @f3(<16 x i8>)

Expand Down
Expand Up @@ -48,20 +48,14 @@ void test02() {
// CHECK-SAME: [[#VBITS]]
// CHECK-SAME: EES_(<vscale x 4 x i32> %x.coerce, <vscale x 4 x i32> %y.coerce)
// CHECK-NEXT: entry:
// CHECK-NEXT: %x = alloca <[[#div(VBITS,32)]] x i32>, align 16
// CHECK-NEXT: %y = alloca <[[#div(VBITS,32)]] x i32>, align 16
// CHECK-NEXT: %retval.coerce = alloca <vscale x 4 x i32>, align 16
// CHECK-NEXT: %0 = bitcast <[[#div(VBITS,32)]] x i32>* %x to <vscale x 4 x i32>*
// CHECK-NEXT: store <vscale x 4 x i32> %x.coerce, <vscale x 4 x i32>* %0, align 16
// CHECK-NEXT: %x1 = load <[[#div(VBITS,32)]] x i32>, <[[#div(VBITS,32)]] x i32>* %x, align 16
// CHECK-NEXT: %1 = bitcast <[[#div(VBITS,32)]] x i32>* %y to <vscale x 4 x i32>*
// CHECK-NEXT: store <vscale x 4 x i32> %y.coerce, <vscale x 4 x i32>* %1, align 16
// CHECK-NEXT: %y2 = load <[[#div(VBITS,32)]] x i32>, <[[#div(VBITS,32)]] x i32>* %y, align 16
// CHECK-NEXT: %add = add <[[#div(VBITS,32)]] x i32> %y2, %x1
// CHECK-NEXT: %retval.0..sroa_cast = bitcast <vscale x 4 x i32>* %retval.coerce to <[[#div(VBITS,32)]] x i32>*
// CHECK-NEXT: store <[[#div(VBITS,32)]] x i32> %add, <[[#div(VBITS,32)]] x i32>* %retval.0..sroa_cast, align 16
// CHECK-NEXT: %2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %retval.coerce, align 16
// CHECK-NEXT: ret <vscale x 4 x i32> %2
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
// CHECK-NEXT: [[X:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.experimental.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
// CHECK-NEXT: [[Y:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.experimental.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE1:%.*]], i64 0)
// CHECK-NEXT: [[ADD:%.*]] = add <[[#div(VBITS, 32)]] x i32> [[Y]], [[X]]
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 4 x i32>* [[RETVAL_COERCE]] to <[[#div(VBITS, 32)]] x i32>*
// CHECK-NEXT: store <[[#div(VBITS, 32)]] x i32> [[ADD]], <[[#div(VBITS, 32)]] x i32>* [[RETVAL_0__SROA_CAST]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[RETVAL_COERCE]], align 16
// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
typedef svint32_t vec __attribute__((arm_sve_vector_bits(N)));
auto f(vec x, vec y) { return x + y; } // Returns a vec.
#endif
Expand All @@ -76,19 +70,13 @@ typedef svint16_t vec2 __attribute__((arm_sve_vector_bits(N)));
// CHECK-SAME: [[#VBITS]]
// CHECK-SAME: EE(<vscale x 8 x i16> %x.coerce)
// CHECK-NEXT: entry:
// CHECK128-NEXT: %x = alloca <[[#div(VBITS,16)]] x i16>, align 16
// CHECK128-NEXT: %0 = bitcast <[[#div(VBITS,16)]] x i16>* %x to <vscale x 8 x i16>*
// CHECK128-NEXT: store <vscale x 8 x i16> %x.coerce, <vscale x 8 x i16>* %0, align 16
// CHECK128-NEXT: %x1 = load <[[#div(VBITS,16)]] x i16>, <[[#div(VBITS,16)]] x i16>* %x, align 16
// CHECK128-NEXT: call void @_Z1fDv[[#div(VBITS,16)]]_s(<[[#div(VBITS,16)]] x i16> %x1)
// CHECK128-NEXT: [[X:%.*]] = call <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
// CHECK128-NEXT: call void @_Z1fDv8_s(<8 x i16> [[X]]) [[ATTR5:#.*]]
// CHECK128-NEXT: ret void
// CHECKWIDE-NEXT: %x = alloca <[[#div(VBITS,16)]] x i16>, align 16
// CHECKWIDE-NEXT: %indirect-arg-temp = alloca <[[#div(VBITS,16)]] x i16>, align 16
// CHECKWIDE-NEXT: %0 = bitcast <[[#div(VBITS,16)]] x i16>* %x to <vscale x 8 x i16>*
// CHECKWIDE-NEXT: store <vscale x 8 x i16> %x.coerce, <vscale x 8 x i16>* %0, align 16
// CHECKWIDE-NEXT: %x1 = load <[[#div(VBITS,16)]] x i16>, <[[#div(VBITS,16)]] x i16>* %x, align 16
// CHECKWIDE-NEXT: store <[[#div(VBITS,16)]] x i16> %x1, <[[#div(VBITS,16)]] x i16>* %indirect-arg-temp, align 16
// CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS,16)]]_s(<[[#div(VBITS,16)]] x i16>* nonnull %indirect-arg-temp)
// CHECKWIDE-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS, 16)]] x i16>, align 16
// CHECKWIDE-NEXT: [[X:%.*]] = call <[[#div(VBITS, 16)]] x i16> @llvm.experimental.vector.extract.v[[#div(VBITS, 16)]]i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
// CHECKWIDE-NEXT: store <[[#div(VBITS, 16)]] x i16> [[X]], <[[#div(VBITS, 16)]] x i16>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6:!tbaa !.*]]
// CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS, 16)]]_s(<[[#div(VBITS, 16)]] x i16>* nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]]
// CHECKWIDE-NEXT: ret void
void g(vec2 x) { f(x); } // OK
#endif

0 comments on commit 3d5b18a

Please sign in to comment.