From d412f7467a9b4dfc8ab8d969b45704d058fe6e6f Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Fri, 24 Nov 2023 13:44:45 +0000 Subject: [PATCH 1/8] [NFC][TLI] Improve tests for ArmPL and SLEEF Intrinsics. Auto-generate test `armpl-intrinsics.ll`, and use active lane mask to have shorter `shufflevector` check lines. Update scripts now add `@llvm.compiler.used` instead of using the regex: `@[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]]` --- .../AArch64/sleef-intrinsic-calls-aarch64.ll | 190 +++++++++++------- 1 file changed, 114 insertions(+), 76 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll index 2300ce74996e3..83898374c1c6c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll @@ -139,8 +139,9 @@ define void @llvm_cos_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cos( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cos( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV:%.*]]) #[[ATTR4:[0-9]+]] ; SVE: ret void ; entry: @@ -168,8 +169,9 @@ define void @llvm_cos_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cosf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cosf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV:%.*]]) #[[ATTR5:[0-9]+]] ; SVE: ret void ; entry: @@ -200,8 +202,9 @@ define void @llvm_exp_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV:%.*]]) #[[ATTR6:[0-9]+]] ; SVE: ret void ; entry: @@ -229,8 +232,9 @@ define void @llvm_exp_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_expf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_expf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV:%.*]]) #[[ATTR7:[0-9]+]] ; SVE: ret void ; entry: @@ -261,8 +265,9 @@ define void @llvm_exp2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: 
[[TMP12:%.*]] = call @_ZGVsMxv_exp2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV:%.*]]) #[[ATTR8:[0-9]+]] ; SVE: ret void ; entry: @@ -290,8 +295,9 @@ define void @llvm_exp2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV:%.*]]) #[[ATTR9:[0-9]+]] ; SVE: ret void ; entry: @@ -322,8 +328,9 @@ define void @llvm_exp10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[CONV:%.*]]) #[[ATTR10:[0-9]+]] ; SVE: ret void ; entry: @@ -351,8 +358,9 @@ define void @llvm_exp10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[CONV:%.*]]) #[[ATTR11:[0-9]+]] ; SVE: ret void ; entry: @@ -383,8 +391,9 @@ define void @llvm_fabs_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.fabs.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -413,8 +422,9 @@ define void @llvm_fabs_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.fabs.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -445,8 +455,9 @@ define void @llvm_floor_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.floor.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.floor.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -474,8 +485,9 @@ define void @llvm_floor_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: 
[[TMP18:%.*]] = call @llvm.floor.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.floor.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -506,8 +518,9 @@ define void @llvm_fma_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv2f64( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv2f64( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[CONV:%.*]], double [[CONV]], double [[CONV]]) ; SVE: ret void ; entry: @@ -535,8 +548,9 @@ define void @llvm_fma_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv4f32( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv4f32( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[CONV:%.*]], float [[CONV]], float [[CONV]]) ; SVE: ret void ; entry: @@ -567,8 +581,9 @@ define void @llvm_log_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV:%.*]]) #[[ATTR12:[0-9]+]] ; SVE: ret void ; entry: @@ -596,8 +611,9 @@ define void @llvm_log_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_logf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_logf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV:%.*]]) #[[ATTR13:[0-9]+]] ; SVE: ret void ; entry: @@ -628,8 +644,9 @@ define void @llvm_log10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV:%.*]]) #[[ATTR14:[0-9]+]] ; SVE: ret void ; entry: @@ -657,8 +674,9 @@ define void @llvm_log10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail 
call float @llvm.log10.f32(float [[CONV:%.*]]) #[[ATTR15:[0-9]+]] ; SVE: ret void ; entry: @@ -689,8 +707,9 @@ define void @llvm_log2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[CONV:%.*]]) #[[ATTR16:[0-9]+]] ; SVE: ret void ; entry: @@ -718,8 +737,9 @@ define void @llvm_log2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV:%.*]]) #[[ATTR17:[0-9]+]] ; SVE: ret void ; entry: @@ -750,8 +770,9 @@ define void @llvm_maxnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.maxnum.f64(double [[CONV:%.*]], double [[CONV]]) ; SVE: ret void ; entry: @@ -779,8 +800,9 @@ define void @llvm_maxnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.maxnum.f32(float [[CONV:%.*]], float [[CONV]]) ; SVE: ret void ; entry: @@ -811,8 +833,9 @@ define void @llvm_minnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.minnum.f64(double [[CONV:%.*]], double [[CONV]]) ; SVE: ret void ; entry: @@ -840,8 +863,9 @@ define void @llvm_minnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.minnum.f32(float [[CONV:%.*]], float [[CONV]]) ; SVE: ret void ; entry: @@ -872,8 +896,9 @@ define void @llvm_nearbyint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture 
[[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.nearbyint.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -901,8 +926,9 @@ define void @llvm_nearbyint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.nearbyint.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -933,8 +959,9 @@ define void @llvm_pow_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_pow( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_pow( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[CONV:%.*]], double [[CONV]]) #[[ATTR18:[0-9]+]] ; SVE: ret void ; entry: @@ -962,8 +989,9 @@ define void @llvm_pow_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_powf( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_powf( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[CONV:%.*]], float [[CONV]]) #[[ATTR19:[0-9]+]] ; SVE: ret void ; entry: @@ -994,8 +1022,9 @@ define void @llvm_rint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.rint.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1023,8 +1052,9 @@ define void @llvm_rint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.rint.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1055,8 +1085,9 @@ define void @llvm_round_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.round.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.round.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.round.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1084,8 +1115,9 @@ define void @llvm_round_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.round.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr 
nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.round.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.round.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1116,8 +1148,9 @@ define void @llvm_sin_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sin( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sin( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV:%.*]]) #[[ATTR20:[0-9]+]] ; SVE: ret void ; entry: @@ -1145,8 +1178,9 @@ define void @llvm_sin_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sinf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sinf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV:%.*]]) #[[ATTR21:[0-9]+]] ; SVE: ret void ; entry: @@ -1177,8 +1211,9 @@ define void @llvm_sqrt_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1206,8 +1241,9 @@ define void @llvm_sqrt_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1238,8 +1274,9 @@ define void @llvm_trunc_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.trunc.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1267,8 +1304,9 @@ define void @llvm_trunc_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.trunc.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: From fe84777007eb8bb0359429e47f715390d2d44bd6 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Mon, 27 Nov 2023 17:36:29 +0000 Subject: [PATCH 2/8] Add `simplifycfg` pass and `noalias` to ensure tail folding. `noalias` attribute was added only to the `%in.ptr` parameter of the ArmPL Intrinsics. 
--- .../AArch64/sleef-intrinsic-calls-aarch64.ll | 190 +++++++----------- 1 file changed, 76 insertions(+), 114 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll index 83898374c1c6c..2300ce74996e3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll @@ -139,9 +139,8 @@ define void @llvm_cos_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cos( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV:%.*]]) #[[ATTR4:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cos( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -169,9 +168,8 @@ define void @llvm_cos_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cosf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV:%.*]]) #[[ATTR5:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cosf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -202,9 +200,8 @@ define void @llvm_exp_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV:%.*]]) #[[ATTR6:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -232,9 +229,8 @@ define void @llvm_exp_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_expf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV:%.*]]) #[[ATTR7:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_expf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -265,9 +261,8 @@ define void @llvm_exp2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV:%.*]]) #[[ATTR8:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -295,9 +290,8 @@ define void @llvm_exp2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: 
[[TMP12:%.*]] = call @_ZGVsMxv_exp2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV:%.*]]) #[[ATTR9:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -328,9 +322,8 @@ define void @llvm_exp10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[CONV:%.*]]) #[[ATTR10:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -358,9 +351,8 @@ define void @llvm_exp10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[CONV:%.*]]) #[[ATTR11:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -391,9 +383,8 @@ define void @llvm_fabs_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.fabs.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -422,9 +413,8 @@ define void @llvm_fabs_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.fabs.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -455,9 +445,8 @@ define void @llvm_floor_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.floor.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.floor.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -485,9 +474,8 @@ define void @llvm_floor_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.floor.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.floor.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -518,9 +506,8 @@ define void @llvm_fma_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) 
#[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv2f64( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[CONV:%.*]], double [[CONV]], double [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv2f64( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) ; SVE: ret void ; entry: @@ -548,9 +535,8 @@ define void @llvm_fma_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv4f32( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[CONV:%.*]], float [[CONV]], float [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv4f32( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) ; SVE: ret void ; entry: @@ -581,9 +567,8 @@ define void @llvm_log_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV:%.*]]) #[[ATTR12:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -611,9 +596,8 @@ define void @llvm_log_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_logf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV:%.*]]) #[[ATTR13:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_logf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -644,9 +628,8 @@ define void @llvm_log10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV:%.*]]) #[[ATTR14:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -674,9 +657,8 @@ define void @llvm_log10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[CONV:%.*]]) #[[ATTR15:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -707,9 +689,8 @@ define void @llvm_log2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail 
call double @llvm.log2.f64(double [[CONV:%.*]]) #[[ATTR16:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -737,9 +718,8 @@ define void @llvm_log2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV:%.*]]) #[[ATTR17:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -770,9 +750,8 @@ define void @llvm_maxnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.maxnum.f64(double [[CONV:%.*]], double [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -800,9 +779,8 @@ define void @llvm_maxnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.maxnum.f32(float [[CONV:%.*]], float [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -833,9 +811,8 @@ define void @llvm_minnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.minnum.f64(double [[CONV:%.*]], double [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -863,9 +840,8 @@ define void @llvm_minnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.minnum.f32(float [[CONV:%.*]], float [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -896,9 +872,8 @@ define void @llvm_nearbyint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.nearbyint.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -926,9 +901,8 @@ define void @llvm_nearbyint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv4f32( 
[[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.nearbyint.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -959,9 +933,8 @@ define void @llvm_pow_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_pow( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[CONV:%.*]], double [[CONV]]) #[[ATTR18:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_pow( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -989,9 +962,8 @@ define void @llvm_pow_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_powf( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[CONV:%.*]], float [[CONV]]) #[[ATTR19:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_powf( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -1022,9 +994,8 @@ define void @llvm_rint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.rint.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1052,9 +1023,8 @@ define void @llvm_rint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.rint.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1085,9 +1055,8 @@ define void @llvm_round_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.round.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.round.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.round.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1115,9 +1084,8 @@ define void @llvm_round_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.round.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.round.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.round.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1148,9 +1116,8 @@ define void @llvm_sin_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sin( [[TMP11:%.*]], 
shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV:%.*]]) #[[ATTR20:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sin( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -1178,9 +1145,8 @@ define void @llvm_sin_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sinf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV:%.*]]) #[[ATTR21:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sinf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -1211,9 +1177,8 @@ define void @llvm_sqrt_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1241,9 +1206,8 @@ define void @llvm_sqrt_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1274,9 +1238,8 @@ define void @llvm_trunc_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.trunc.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1304,9 +1267,8 @@ define void @llvm_trunc_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.trunc.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: From 52ce4f2d655a60cc2eb5dd5b0b01fc925a38c273 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Mon, 27 Nov 2023 16:55:17 +0000 Subject: [PATCH 3/8] [TLI] Pass replace-with-veclib works with Scalable Vectors. The pass uses the Masked variant of TLI method when the Intrinsic operates on Scalable Vectors and it fails to find a non-Masked variant. 
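In other words, the pass still prefers an unmasked TLI mapping; it only assumes a mask for scalable operands when no unmasked variant is registered for that scalar name and VF. A condensed, illustrative sketch of that lookup (the wrapper name `lookupVectorVariant` is invented here for exposition; the two TLI calls are the ones used in the ReplaceWithVeclib.cpp hunk below):

    // Sketch only: prefer the unmasked mapping, and fall back to the masked
    // variant for scalable operands when no unmasked mapping exists.
    static StringRef lookupVectorVariant(const TargetLibraryInfo &TLI,
                                         StringRef ScalarName, ElementCount VF,
                                         bool MayBeMasked) {
      const bool IsMasked =
          MayBeMasked &&
          !TLI.getVectorMappingInfo(ScalarName, VF, /*Masked=*/false);
      return TLI.getVectorizedFunction(ScalarName, VF, IsMasked);
    }
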
--- llvm/lib/Analysis/VFABIDemangling.cpp | 2 +- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 24 ++++++------- .../replace-intrinsics-with-veclib-armpl.ll | 36 +++++++++---------- ...e-intrinsics-with-veclib-sleef-scalable.ll | 35 +++++++++--------- 4 files changed, 50 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp index 22fc52070015c..426f98c0c6284 100644 --- a/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/llvm/lib/Analysis/VFABIDemangling.cpp @@ -126,7 +126,7 @@ static ParseRet tryParseLinearTokenWithRuntimeStep(StringRef &ParseString, return ParseRet::None; } -/// The function looks for the following stringt at the beginning of +/// The function looks for the following string at the beginning of /// the input string `ParseString`: /// /// diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 36c91b7fa97e4..d31a793556dfd 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -105,6 +105,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // all vector operands have identical vector width. ElementCount VF = ElementCount::getFixed(0); SmallVector ScalarTypes; + bool MayBeMasked = false; for (auto Arg : enumerate(CI.args())) { auto *ArgType = Arg.value()->getType(); // Vector calls to intrinsics can still have @@ -121,17 +122,13 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, return false; } ElementCount NumElements = VectorArgTy->getElementCount(); - if (NumElements.isScalable()) { - // The current implementation does not support - // scalable vectors. - return false; - } - if (VF.isNonZero() && VF != NumElements) { - // The different arguments differ in vector size. + if (NumElements.isScalable()) + MayBeMasked = true; + + // The different arguments differ in vector size. + if (VF.isNonZero() && VF != NumElements) return false; - } else { - VF = NumElements; - } + VF = NumElements; ScalarTypes.push_back(VectorArgTy->getElementType()); } } @@ -152,11 +149,14 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, return false; } + // Assume it has a mask when that is a possibility and has no mapping for + // a Non-Masked variant. + const bool IsMasked = + MayBeMasked && !TLI.getVectorMappingInfo(ScalarName, VF, false); // Try to find the mapping for the scalar version of this intrinsic // and the exact vector width of the call operands in the // TargetLibraryInfo. - StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF); - + StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, IsMasked); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" << ScalarName << "` and vector width " << VF << ".\n"); diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll index 18431ae021f97..633cb220f5246 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll @@ -15,7 +15,7 @@ declare @llvm.cos.nxv2f64() declare @llvm.cos.nxv4f32() ;. 
-; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending global [32 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x], section "llvm.metadata" ;. define <2 x double> @llvm_cos_f64(<2 x double> %in) { ; CHECK-LABEL: define <2 x double> @llvm_cos_f64 @@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) { define @llvm_cos_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -50,7 +50,7 @@ define @llvm_cos_vscale_f64( %in) #0 define @llvm_cos_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) { define @llvm_sin_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -95,7 +95,7 @@ define @llvm_sin_vscale_f64( %in) #0 define @llvm_sin_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) @@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) { define @llvm_exp_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -140,7 +140,7 @@ define @llvm_exp_vscale_f64( %in) #0 define @llvm_exp_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; 
CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) { define @llvm_exp2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -185,7 +185,7 @@ define @llvm_exp2_vscale_f64( %in) #0 define @llvm_exp2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) { define @llvm_exp10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -230,7 +230,7 @@ define @llvm_exp10_vscale_f64( %in) # define @llvm_exp10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) { define @llvm_log_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -275,7 +275,7 @@ define @llvm_log_vscale_f64( %in) #0 define @llvm_log_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) { define @llvm_log2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -320,7 +320,7 @@ define @llvm_log2_vscale_f64( %in) #0 define @llvm_log2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) { define @llvm_log10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f64 ; CHECK-SAME: ( 
[[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -365,7 +365,7 @@ define @llvm_log10_vscale_f64( %in) # define @llvm_log10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) @@ -380,7 +380,7 @@ declare @llvm.pow.nxv4f32(, @llvm_ceil_vscale_f64( %in) { ; CHECK-LABEL: @llvm_ceil_vscale_f64( ; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.ceil.nxv2f64( [[IN:%.*]]) @@ -43,7 +46,7 @@ define @llvm_copysign_vscale_f32( %mag, define @llvm_cos_vscale_f64( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cos( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -52,7 +55,7 @@ define @llvm_cos_vscale_f64( %in) { define @llvm_cos_vscale_f32( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -61,7 +64,7 @@ define @llvm_cos_vscale_f32( %in) { define @llvm_exp_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -70,7 +73,7 @@ define @llvm_exp_vscale_f64( %in) { define @llvm_exp_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_expf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -79,7 +82,7 @@ define @llvm_exp_vscale_f32( %in) { define @llvm_exp2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -88,7 +91,7 @@ define @llvm_exp2_vscale_f64( %in) { define @llvm_exp2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -97,7 +100,7 @@ define @llvm_exp2_vscale_f32( %in) { define @llvm_exp10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -106,7 +109,7 @@ define @llvm_exp10_vscale_f64( %in) { define @llvm_exp10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -169,7 +172,7 @@ define 
@llvm_fma_vscale_f32( %a, @llvm_log_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -178,7 +181,7 @@ define @llvm_log_vscale_f64( %in) { define @llvm_log_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_logf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -187,7 +190,7 @@ define @llvm_log_vscale_f32( %in) { define @llvm_log10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -196,7 +199,7 @@ define @llvm_log10_vscale_f64( %in) { define @llvm_log10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) @@ -205,7 +208,7 @@ define @llvm_log10_vscale_f32( %in) { define @llvm_log2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -214,7 +217,7 @@ define @llvm_log2_vscale_f64( %in) { define @llvm_log2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -331,7 +334,7 @@ define @llvm_round_vscale_f32( %in) { define @llvm_sin_vscale_f64( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sin( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -340,7 +343,7 @@ define @llvm_sin_vscale_f64( %in) { define @llvm_sin_vscale_f32( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) From 690d36dc8a97d8ac82f51d6726eba439bb0c7b3f Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Tue, 12 Dec 2023 15:48:35 +0000 Subject: [PATCH 4/8] Use createFunctionType to correctly replace veclib calls. Split replaceWithTLIFunction method into two methods. 
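Background for the split: the previous `replaceWithTLIFunction` declared the TLI function with `OldFunc->getFunctionType()`, i.e. the intrinsic's own type, which cannot describe masked variants that take an extra predicate argument. The rough shape after this patch, as a hedged sketch (not the patch's exact code: the manual widening below merely stands in for the `createFunctionType` helper named above, and the two helper calls mirror the declarations in the hunk that follows):

    // Illustrative only: widen the scalar types to VF, append an i1 mask when
    // needed, then declare and call the TLI function via the two new helpers.
    SmallVector<Type *, 4> VecArgTys;
    for (Type *ScalarTy : ScalarTypes)
      VecArgTys.push_back(VectorType::get(ScalarTy, VF));
    if (IsMasked)
      VecArgTys.push_back(VectorType::get(Type::getInt1Ty(M->getContext()), VF));
    auto *VecRetTy = VectorType::get(CI.getType()->getScalarType(), VF);
    auto *VectorFTy = FunctionType::get(VecRetTy, VecArgTys, /*isVarArg=*/false);
    Function *TLIFunc =
        getTLIFunction(M, VectorFTy, CI.getCalledFunction(), TLIName);
    replaceWithTLIFunction(M, CI, VF, CI.getCalledFunction(), TLIFunc, VectorFTy,
                           IsMasked);
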
--- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 203 ++++++++++-------- .../replace-intrinsics-with-veclib-armpl.ll | 32 +-- ...e-intrinsics-with-veclib-sleef-scalable.ll | 32 +-- 3 files changed, 149 insertions(+), 118 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index d31a793556dfd..ddcc55a8e52c4 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -15,15 +15,19 @@ #include "llvm/CodeGen/ReplaceWithVeclib.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include using namespace llvm; @@ -38,138 +42,166 @@ STATISTIC(NumTLIFuncDeclAdded, STATISTIC(NumFuncUsedAdded, "Number of functions added to `llvm.compiler.used`"); -static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) { - Module *M = CI.getModule(); - - Function *OldFunc = CI.getCalledFunction(); - - // Check if the vector library function is already declared in this module, - // otherwise insert it. +/// Returns a vector Function that it adds to the Module \p M. When an \p +/// OptOldFunc is given, it copies its attributes to the newly created Function. +Function *getTLIFunction(Module *M, FunctionType *VectorFTy, + std::optional OptOldFunc, + const StringRef TLIName) { Function *TLIFunc = M->getFunction(TLIName); if (!TLIFunc) { - TLIFunc = Function::Create(OldFunc->getFunctionType(), - Function::ExternalLinkage, TLIName, *M); - TLIFunc->copyAttributesFrom(OldFunc); + TLIFunc = + Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M); + if (OptOldFunc) + TLIFunc->copyAttributesFrom(*OptOldFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `" << TLIName << "` of type `" << *(TLIFunc->getType()) << "` to module.\n"); ++NumTLIFuncDeclAdded; - - // Add the freshly created function to llvm.compiler.used, - // similar to as it is done in InjectTLIMappings + // Add the freshly created function to llvm.compiler.used, similar to as it + // is done in InjectTLIMappings appendToCompilerUsed(*M, {TLIFunc}); - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName << "` to `@llvm.compiler.used`.\n"); ++NumFuncUsedAdded; } + return TLIFunc; +} - // Replace the call to the vector intrinsic with a call - // to the corresponding function from the vector library. +/// Replace the call to the vector intrinsic ( \p OldFunc ) with a call to the +/// corresponding function from the vector library ( \p TLIFunc ). +static bool replaceWithTLIFunction(const Module *M, CallInst &CI, + const ElementCount &VecVF, Function *OldFunc, + Function *TLIFunc, FunctionType *VecFTy, + bool IsMasked) { IRBuilder<> IRBuilder(&CI); SmallVector Args(CI.args()); + if (IsMasked) { + if (Args.size() == VecFTy->getNumParams()) + static_assert(true && "mask was already in place"); + + auto *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VecVF); + Args.push_back(Constant::getAllOnesValue(MaskTy)); + } + // Preserve the operand bundles. 
SmallVector OpBundles; CI.getOperandBundlesAsDefs(OpBundles); CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles); - assert(OldFunc->getFunctionType() == TLIFunc->getFunctionType() && + assert(VecFTy == TLIFunc->getFunctionType() && "Expecting function types to be identical"); CI.replaceAllUsesWith(Replacement); - if (isa(Replacement)) { - // Preserve fast math flags for FP math. + // Preserve fast math flags for FP math. + if (isa(Replacement)) Replacement->copyFastMathFlags(&CI); - } LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" - << OldFunc->getName() << "` with call to `" << TLIName - << "`.\n"); + << OldFunc->getName() << "` with call to `" + << TLIFunc->getName() << "`.\n"); ++NumCallsReplaced; return true; } +/// Utility method to get the VecDesc, depending on whether there is a TLI +/// mapping, either with or without a mask. +static std::optional getVecDesc(const TargetLibraryInfo &TLI, + const StringRef &ScalarName, + const ElementCount &VF) { + const VecDesc *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true); + const VecDesc *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false); + // Invalid when there are both variants (ie masked and unmasked), or none + if ((VDMasked == nullptr) == (VDNoMask == nullptr)) + return std::nullopt; + + return {VDMasked != nullptr ? VDMasked : VDNoMask}; +} + +/// Returns whether it is able to replace a call to the intrinsic \p CI with a +/// TLI mapped call. static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, CallInst &CI) { - if (!CI.getCalledFunction()) { + if (!CI.getCalledFunction()) return false; - } auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); - if (IntrinsicID == Intrinsic::not_intrinsic) { - // Replacement is only performed for intrinsic functions + // Replacement is only performed for intrinsic functions + if (IntrinsicID == Intrinsic::not_intrinsic) return false; - } - // Convert vector arguments to scalar type and check that - // all vector operands have identical vector width. + // Convert vector arguments to scalar type and check that all vector operands + // have identical vector width. ElementCount VF = ElementCount::getFixed(0); SmallVector ScalarTypes; - bool MayBeMasked = false; for (auto Arg : enumerate(CI.args())) { - auto *ArgType = Arg.value()->getType(); - // Vector calls to intrinsics can still have - // scalar operands for specific arguments. + auto *ArgTy = Arg.value()->getType(); if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { - ScalarTypes.push_back(ArgType); - } else { - // The argument in this place should be a vector if - // this is a call to a vector intrinsic. - auto *VectorArgTy = dyn_cast(ArgType); - if (!VectorArgTy) { - // The argument is not a vector, do not perform - // the replacement. - return false; - } - ElementCount NumElements = VectorArgTy->getElementCount(); - if (NumElements.isScalable()) - MayBeMasked = true; - - // The different arguments differ in vector size. - if (VF.isNonZero() && VF != NumElements) + ScalarTypes.push_back(ArgTy); + } else if (auto *VectorArgTy = dyn_cast(ArgTy)) { + ScalarTypes.push_back(ArgTy->getScalarType()); + // Disallow vector arguments with different VFs. When processing the first + // vector argument, store it's VF, and for the rest ensure that they match + // it. 
+ if (VF.isZero()) + VF = VectorArgTy->getElementCount(); + else if (VF != VectorArgTy->getElementCount()) return false; - VF = NumElements; - ScalarTypes.push_back(VectorArgTy->getElementType()); + } else { + // enters when it is supposed to be a vector argument but it isn't. + return false; } } - // Try to reconstruct the name for the scalar version of this - // intrinsic using the intrinsic ID and the argument types - // converted to scalar above. - std::string ScalarName; - if (Intrinsic::isOverloaded(IntrinsicID)) { - ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule()); - } else { - ScalarName = Intrinsic::getName(IntrinsicID).str(); - } + // Try to reconstruct the name for the scalar version of this intrinsic using + // the intrinsic ID and the argument types converted to scalar above. + std::string ScalarName = + (Intrinsic::isOverloaded(IntrinsicID) + ? Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule()) + : Intrinsic::getName(IntrinsicID).str()); - if (!TLI.isFunctionVectorizable(ScalarName)) { - // The TargetLibraryInfo does not contain a vectorized version of - // the scalar function. + // The TargetLibraryInfo does not contain a vectorized version of the scalar + // function. + if (!TLI.isFunctionVectorizable(ScalarName)) return false; - } - // Assume it has a mask when that is a possibility and has no mapping for - // a Non-Masked variant. - const bool IsMasked = - MayBeMasked && !TLI.getVectorMappingInfo(ScalarName, VF, false); - // Try to find the mapping for the scalar version of this intrinsic - // and the exact vector width of the call operands in the - // TargetLibraryInfo. - StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, IsMasked); + auto OptVD = getVecDesc(TLI, ScalarName, VF); + if (!OptVD) + return false; + + const VecDesc *VD = *OptVD; + // Try to find the mapping for the scalar version of this intrinsic and the + // exact vector width of the call operands in the TargetLibraryInfo. + StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, VD->isMasked()); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" << ScalarName << "` and vector width " << VF << ".\n"); - if (!TLIName.empty()) { - // Found the correct mapping in the TargetLibraryInfo, - // replace the call to the intrinsic with a call to - // the vector library function. - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName - << "`.\n"); - return replaceWithTLIFunction(CI, TLIName); - } + // TLI failed to find a correct mapping. + if (TLIName.empty()) + return false; - return false; + // Find the vector Function and replace the call to the intrinsic with a call + // to the vector library function. 
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName + << "`.\n"); + + Type *ScalarRetTy = CI.getType()->getScalarType(); + FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false); + const std::string MangledName = VD->getVectorFunctionABIVariantString(); + auto OptInfo = VFABI::tryDemangleForVFABI(MangledName, ScalarFTy); + if (!OptInfo) + return false; + + // get the vector FunctionType + Module *M = CI.getModule(); + auto OptFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy); + if (!OptFTy) + return false; + + Function *OldFunc = CI.getCalledFunction(); + FunctionType *VectorFTy = *OptFTy; + Function *TLIFunc = getTLIFunction(M, VectorFTy, OldFunc, TLIName); + return replaceWithTLIFunction(M, CI, OptInfo->Shape.VF, OldFunc, TLIFunc, + VectorFTy, VD->isMasked()); } static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { @@ -185,9 +217,8 @@ static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { } // Erase the calls to the intrinsics that have been replaced // with calls to the vector library. - for (auto *CI : ReplacedCalls) { + for (auto *CI : ReplacedCalls) CI->eraseFromParent(); - } return Changed; } @@ -207,10 +238,10 @@ PreservedAnalyses ReplaceWithVeclib::run(Function &F, PA.preserve(); PA.preserve(); return PA; - } else { - // The pass did not replace any calls, hence it preserves all analyses. - return PreservedAnalyses::all(); } + + // The pass did not replace any calls, hence it preserves all analyses. + return PreservedAnalyses::all(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll index 633cb220f5246..d41870ec6e791 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll @@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) { define @llvm_cos_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -50,7 +50,7 @@ define @llvm_cos_vscale_f64( %in) #0 define @llvm_cos_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) { define @llvm_sin_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -95,7 +95,7 @@ define @llvm_sin_vscale_f64( %in) #0 define @llvm_sin_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) 
#[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) @@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) { define @llvm_exp_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -140,7 +140,7 @@ define @llvm_exp_vscale_f64( %in) #0 define @llvm_exp_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) { define @llvm_exp2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -185,7 +185,7 @@ define @llvm_exp2_vscale_f64( %in) #0 define @llvm_exp2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) { define @llvm_exp10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -230,7 +230,7 @@ define @llvm_exp10_vscale_f64( %in) # define @llvm_exp10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) { define @llvm_log_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f64_x( [[IN]], shufflevector ( insertelement ( poison, 
i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -275,7 +275,7 @@ define @llvm_log_vscale_f64( %in) #0 define @llvm_log_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) { define @llvm_log2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -320,7 +320,7 @@ define @llvm_log2_vscale_f64( %in) #0 define @llvm_log2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) { define @llvm_log10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -365,7 +365,7 @@ define @llvm_log10_vscale_f64( %in) # define @llvm_log10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll index 969945590a0a1..baf16f83a3e24 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll @@ -46,7 +46,7 @@ define @llvm_copysign_vscale_f32( %mag, define @llvm_cos_vscale_f64( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cos( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cos( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -55,7 +55,7 @@ define @llvm_cos_vscale_f64( %in) { define @llvm_cos_vscale_f32( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosf( 
[[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -64,7 +64,7 @@ define @llvm_cos_vscale_f32( %in) { define @llvm_exp_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -73,7 +73,7 @@ define @llvm_exp_vscale_f64( %in) { define @llvm_exp_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_expf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_expf( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -82,7 +82,7 @@ define @llvm_exp_vscale_f32( %in) { define @llvm_exp2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -91,7 +91,7 @@ define @llvm_exp2_vscale_f64( %in) { define @llvm_exp2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -100,7 +100,7 @@ define @llvm_exp2_vscale_f32( %in) { define @llvm_exp10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -109,7 +109,7 @@ define @llvm_exp10_vscale_f64( %in) { define @llvm_exp10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -172,7 +172,7 @@ define @llvm_fma_vscale_f32( %a, @llvm_log_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -181,7 +181,7 @@ define @llvm_log_vscale_f64( %in) { define @llvm_log_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_logf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_logf( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -190,7 +190,7 @@ define @llvm_log_vscale_f32( %in) { 
define @llvm_log10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -199,7 +199,7 @@ define @llvm_log10_vscale_f64( %in) { define @llvm_log10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) @@ -208,7 +208,7 @@ define @llvm_log10_vscale_f32( %in) { define @llvm_log2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -217,7 +217,7 @@ define @llvm_log2_vscale_f64( %in) { define @llvm_log2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -334,7 +334,7 @@ define @llvm_round_vscale_f32( %in) { define @llvm_sin_vscale_f64( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sin( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sin( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -343,7 +343,7 @@ define @llvm_sin_vscale_f64( %in) { define @llvm_sin_vscale_f32( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinf( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) From 02b0e21aa97ae72834438223e68edc65ae8a23f1 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Wed, 13 Dec 2023 18:08:46 +0000 Subject: [PATCH 5/8] getVecDesc now prioritizes masked variant Also further cleanup to address reviewers. --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 85 ++++++++----------- ...e-intrinsics-with-veclib-sleef-scalable.ll | 2 - 2 files changed, 36 insertions(+), 51 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index ddcc55a8e52c4..4ea163e4eaafb 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -69,20 +69,20 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, return TLIFunc; } -/// Replace the call to the vector intrinsic ( \p OldFunc ) with a call to the -/// corresponding function from the vector library ( \p TLIFunc ). 
-static bool replaceWithTLIFunction(const Module *M, CallInst &CI, - const ElementCount &VecVF, Function *OldFunc, - Function *TLIFunc, FunctionType *VecFTy, - bool IsMasked) { +/// Replace the call to the vector intrinsic ( \p FuncToReplace ) with a call to +/// the corresponding function from the vector library ( \p TLIFunc ). +static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, + Function *TLIFunc, FunctionType *VecFTy) { IRBuilder<> IRBuilder(&CI); SmallVector Args(CI.args()); - if (IsMasked) { + if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) { if (Args.size() == VecFTy->getNumParams()) static_assert(true && "mask was already in place"); - auto *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VecVF); - Args.push_back(Constant::getAllOnesValue(MaskTy)); + auto *MaskTy = + VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF); + Args.insert(Args.begin() + OptMaskpos.value(), + Constant::getAllOnesValue(MaskTy)); } // Preserve the operand bundles. @@ -95,26 +95,18 @@ static bool replaceWithTLIFunction(const Module *M, CallInst &CI, // Preserve fast math flags for FP math. if (isa(Replacement)) Replacement->copyFastMathFlags(&CI); - - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" - << OldFunc->getName() << "` with call to `" - << TLIFunc->getName() << "`.\n"); - ++NumCallsReplaced; - return true; } -/// Utility method to get the VecDesc, depending on whether there is a TLI -/// mapping, either with or without a mask. +/// Utility method to get the VecDesc, depending on whether there is such a TLI +/// mapping, prioritizing a masked version. static std::optional getVecDesc(const TargetLibraryInfo &TLI, const StringRef &ScalarName, const ElementCount &VF) { - const VecDesc *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true); - const VecDesc *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false); - // Invalid when there are both variants (ie masked and unmasked), or none - if ((VDMasked == nullptr) == (VDNoMask == nullptr)) - return std::nullopt; - - return {VDMasked != nullptr ? VDMasked : VDNoMask}; + if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) + return VDMasked; + if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false)) + return VDNoMask; + return std::nullopt; } /// Returns whether it is able to replace a call to the intrinsic \p CI with a @@ -146,10 +138,9 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, VF = VectorArgTy->getElementCount(); else if (VF != VectorArgTy->getElementCount()) return false; - } else { + } else // enters when it is supposed to be a vector argument but it isn't. return false; - } } // Try to reconstruct the name for the scalar version of this intrinsic using @@ -164,26 +155,19 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, if (!TLI.isFunctionVectorizable(ScalarName)) return false; + // Try to find the mapping for the scalar version of this intrinsic and the + // exact vector width of the call operands in the TargetLibraryInfo. auto OptVD = getVecDesc(TLI, ScalarName, VF); if (!OptVD) return false; const VecDesc *VD = *OptVD; - // Try to find the mapping for the scalar version of this intrinsic and the - // exact vector width of the call operands in the TargetLibraryInfo. 
- StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, VD->isMasked()); - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" - << ScalarName << "` and vector width " << VF << ".\n"); - - // TLI failed to find a correct mapping. - if (TLIName.empty()) - return false; - - // Find the vector Function and replace the call to the intrinsic with a call - // to the vector library function. - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName - << "`.\n"); + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName + << "` and vector width " << VF << " to: `" + << VD->getVectorFnName() << "`.\n"); + // Replace the call to the intrinsic with a call to the vector library + // function. Type *ScalarRetTy = CI.getType()->getScalarType(); FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false); const std::string MangledName = VD->getVectorFunctionABIVariantString(); @@ -191,17 +175,20 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, if (!OptInfo) return false; - // get the vector FunctionType - Module *M = CI.getModule(); - auto OptFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy); - if (!OptFTy) + FunctionType *VectorFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy); + if (!VectorFTy) return false; - Function *OldFunc = CI.getCalledFunction(); - FunctionType *VectorFTy = *OptFTy; - Function *TLIFunc = getTLIFunction(M, VectorFTy, OldFunc, TLIName); - return replaceWithTLIFunction(M, CI, OptInfo->Shape.VF, OldFunc, TLIFunc, - VectorFTy, VD->isMasked()); + Function *FuncToReplace = CI.getCalledFunction(); + Function *TLIFunc = getTLIFunction(CI.getModule(), VectorFTy, FuncToReplace, + VD->getVectorFnName()); + replaceWithTLIFunction(CI, *OptInfo, TLIFunc, VectorFTy); + + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" + << FuncToReplace->getName() << "` with call to `" + << TLIFunc->getName() << "`.\n"); + ++NumCallsReplaced; + return true; } static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll index baf16f83a3e24..c2ff6014bc694 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll @@ -3,8 +3,6 @@ target triple = "aarch64-unknown-linux-gnu" -; NOTE: The existing TLI mappings are not used since the -replace-with-veclib pass is broken for scalable vectors. - ;. ; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf], section "llvm.metadata" ;. From 5b2e317962d0239d46fc0be2c76556b04cfcd0eb Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Fri, 15 Dec 2023 12:08:37 +0000 Subject: [PATCH 6/8] Addressing review. getTLIFunction is no longer an optional. 
It accepts a pointer for ScalarFunc --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 41 +++++++++++++------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 4ea163e4eaafb..05ceb6d01bc98 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -43,16 +43,16 @@ STATISTIC(NumFuncUsedAdded, "Number of functions added to `llvm.compiler.used`"); /// Returns a vector Function that it adds to the Module \p M. When an \p -/// OptOldFunc is given, it copies its attributes to the newly created Function. +/// ScalarFunc is not null, it copies its attributes to the newly created +/// Function. Function *getTLIFunction(Module *M, FunctionType *VectorFTy, - std::optional OptOldFunc, - const StringRef TLIName) { + Function *ScalarFunc, const StringRef TLIName) { Function *TLIFunc = M->getFunction(TLIName); if (!TLIFunc) { TLIFunc = Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M); - if (OptOldFunc) - TLIFunc->copyAttributesFrom(*OptOldFunc); + if (ScalarFunc) + TLIFunc->copyAttributesFrom(ScalarFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `" << TLIName << "` of type `" << *(TLIFunc->getType()) @@ -60,7 +60,7 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, ++NumTLIFuncDeclAdded; // Add the freshly created function to llvm.compiler.used, similar to as it - // is done in InjectTLIMappings + // is done in InjectTLIMappings. appendToCompilerUsed(*M, {TLIFunc}); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName << "` to `@llvm.compiler.used`.\n"); @@ -72,11 +72,11 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, /// Replace the call to the vector intrinsic ( \p FuncToReplace ) with a call to /// the corresponding function from the vector library ( \p TLIFunc ). static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, - Function *TLIFunc, FunctionType *VecFTy) { + Function *TLIVecFunc) { IRBuilder<> IRBuilder(&CI); SmallVector Args(CI.args()); if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) { - if (Args.size() == VecFTy->getNumParams()) + if (Args.size() == TLIVecFunc->getFunctionType()->getNumParams()) static_assert(true && "mask was already in place"); auto *MaskTy = @@ -88,9 +88,7 @@ static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, // Preserve the operand bundles. SmallVector OpBundles; CI.getOperandBundlesAsDefs(OpBundles); - CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles); - assert(VecFTy == TLIFunc->getFunctionType() && - "Expecting function types to be identical"); + CallInst *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles); CI.replaceAllUsesWith(Replacement); // Preserve fast math flags for FP math. 
if (isa(Replacement)) @@ -102,10 +100,10 @@ static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, static std::optional getVecDesc(const TargetLibraryInfo &TLI, const StringRef &ScalarName, const ElementCount &VF) { - if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) - return VDMasked; if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false)) return VDNoMask; + if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) + return VDMasked; return std::nullopt; } @@ -117,20 +115,20 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, return false; auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); - // Replacement is only performed for intrinsic functions + // Replacement is only performed for intrinsic functions. if (IntrinsicID == Intrinsic::not_intrinsic) return false; // Convert vector arguments to scalar type and check that all vector operands // have identical vector width. ElementCount VF = ElementCount::getFixed(0); - SmallVector ScalarTypes; + SmallVector ScalarArgTypes; for (auto Arg : enumerate(CI.args())) { auto *ArgTy = Arg.value()->getType(); if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { - ScalarTypes.push_back(ArgTy); + ScalarArgTypes.push_back(ArgTy); } else if (auto *VectorArgTy = dyn_cast(ArgTy)) { - ScalarTypes.push_back(ArgTy->getScalarType()); + ScalarArgTypes.push_back(ArgTy->getScalarType()); // Disallow vector arguments with different VFs. When processing the first // vector argument, store it's VF, and for the rest ensure that they match // it. @@ -139,7 +137,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, else if (VF != VectorArgTy->getElementCount()) return false; } else - // enters when it is supposed to be a vector argument but it isn't. + // Exit when it is supposed to be a vector argument but it isn't. return false; } @@ -147,7 +145,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // the intrinsic ID and the argument types converted to scalar above. std::string ScalarName = (Intrinsic::isOverloaded(IntrinsicID) - ? Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule()) + ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, CI.getModule()) : Intrinsic::getName(IntrinsicID).str()); // The TargetLibraryInfo does not contain a vectorized version of the scalar @@ -169,7 +167,8 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // Replace the call to the intrinsic with a call to the vector library // function. 
Type *ScalarRetTy = CI.getType()->getScalarType(); - FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false); + FunctionType *ScalarFTy = + FunctionType::get(ScalarRetTy, ScalarArgTypes, /*isVarArg*/ false); const std::string MangledName = VD->getVectorFunctionABIVariantString(); auto OptInfo = VFABI::tryDemangleForVFABI(MangledName, ScalarFTy); if (!OptInfo) @@ -182,7 +181,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, Function *FuncToReplace = CI.getCalledFunction(); Function *TLIFunc = getTLIFunction(CI.getModule(), VectorFTy, FuncToReplace, VD->getVectorFnName()); - replaceWithTLIFunction(CI, *OptInfo, TLIFunc, VectorFTy); + replaceWithTLIFunction(CI, *OptInfo, TLIFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" << FuncToReplace->getName() << "` with call to `" From b2f81eb066c1d945e4a465a94af3ae2892bc8bed Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Wed, 20 Dec 2023 08:51:03 +0000 Subject: [PATCH 7/8] Improved comments and minor code cleanup. --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 79 ++++++++++---------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 05ceb6d01bc98..6e308abc6990a 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -69,61 +69,48 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, return TLIFunc; } -/// Replace the call to the vector intrinsic ( \p FuncToReplace ) with a call to -/// the corresponding function from the vector library ( \p TLIFunc ). -static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, +/// Replace the call to the vector intrinsic ( \p CalltoReplace ) with a call to +/// the corresponding function from the vector library ( \p TLIVecFunc ). +static void replaceWithTLIFunction(CallInst &CalltoReplace, VFInfo &Info, Function *TLIVecFunc) { - IRBuilder<> IRBuilder(&CI); - SmallVector Args(CI.args()); + IRBuilder<> IRBuilder(&CalltoReplace); + SmallVector Args(CalltoReplace.args()); if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) { - if (Args.size() == TLIVecFunc->getFunctionType()->getNumParams()) - static_assert(true && "mask was already in place"); - - auto *MaskTy = - VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF); + auto *MaskTy = VectorType::get(Type::getInt1Ty(CalltoReplace.getContext()), + Info.Shape.VF); Args.insert(Args.begin() + OptMaskpos.value(), Constant::getAllOnesValue(MaskTy)); } // Preserve the operand bundles. SmallVector OpBundles; - CI.getOperandBundlesAsDefs(OpBundles); + CalltoReplace.getOperandBundlesAsDefs(OpBundles); CallInst *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles); - CI.replaceAllUsesWith(Replacement); + CalltoReplace.replaceAllUsesWith(Replacement); // Preserve fast math flags for FP math. if (isa(Replacement)) - Replacement->copyFastMathFlags(&CI); -} - -/// Utility method to get the VecDesc, depending on whether there is such a TLI -/// mapping, prioritizing a masked version. 
-static std::optional getVecDesc(const TargetLibraryInfo &TLI, - const StringRef &ScalarName, - const ElementCount &VF) { - if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false)) - return VDNoMask; - if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) - return VDMasked; - return std::nullopt; + Replacement->copyFastMathFlags(&CalltoReplace); } -/// Returns whether it is able to replace a call to the intrinsic \p CI with a -/// TLI mapped call. +/// Returns true when successfully replaced \p CallToReplace with a suitable +/// function taking vector arguments, based on available mappings in the \p TLI. +/// Currently only works when \p CallToReplace is a call to vectorized +/// intrinsic. static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, - CallInst &CI) { - if (!CI.getCalledFunction()) + CallInst &CallToReplace) { + if (!CallToReplace.getCalledFunction()) return false; - auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); + auto IntrinsicID = CallToReplace.getCalledFunction()->getIntrinsicID(); // Replacement is only performed for intrinsic functions. if (IntrinsicID == Intrinsic::not_intrinsic) return false; - // Convert vector arguments to scalar type and check that all vector operands - // have identical vector width. + // Compute arguments types of the corresponding scalar call. Additionally + // checks if in the vector call, all vector operands have the same EC. ElementCount VF = ElementCount::getFixed(0); SmallVector ScalarArgTypes; - for (auto Arg : enumerate(CI.args())) { + for (auto Arg : enumerate(CallToReplace.args())) { auto *ArgTy = Arg.value()->getType(); if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { ScalarArgTypes.push_back(ArgTy); @@ -145,28 +132,24 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // the intrinsic ID and the argument types converted to scalar above. std::string ScalarName = (Intrinsic::isOverloaded(IntrinsicID) - ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, CI.getModule()) + ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, + CallToReplace.getModule()) : Intrinsic::getName(IntrinsicID).str()); - // The TargetLibraryInfo does not contain a vectorized version of the scalar - // function. - if (!TLI.isFunctionVectorizable(ScalarName)) - return false; - // Try to find the mapping for the scalar version of this intrinsic and the - // exact vector width of the call operands in the TargetLibraryInfo. - auto OptVD = getVecDesc(TLI, ScalarName, VF); - if (!OptVD) + // exact vector width of the call operands in the TargetLibraryInfo. First, + // check with a non-masked variant, and if that fails try with a masked one. + const VecDesc *VD = TLI.getVectorMappingInfo(ScalarName, VF, false); + if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, true))) return false; - const VecDesc *VD = *OptVD; LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName << "` and vector width " << VF << " to: `" << VD->getVectorFnName() << "`.\n"); // Replace the call to the intrinsic with a call to the vector library // function. 
- Type *ScalarRetTy = CI.getType()->getScalarType(); + Type *ScalarRetTy = CallToReplace.getType()->getScalarType(); FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarArgTypes, /*isVarArg*/ false); const std::string MangledName = VD->getVectorFunctionABIVariantString(); @@ -178,10 +161,10 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, if (!VectorFTy) return false; - Function *FuncToReplace = CI.getCalledFunction(); - Function *TLIFunc = getTLIFunction(CI.getModule(), VectorFTy, FuncToReplace, - VD->getVectorFnName()); - replaceWithTLIFunction(CI, *OptInfo, TLIFunc); + Function *FuncToReplace = CallToReplace.getCalledFunction(); + Function *TLIFunc = getTLIFunction(CallToReplace.getModule(), VectorFTy, + FuncToReplace, VD->getVectorFnName()); + replaceWithTLIFunction(CallToReplace, *OptInfo, TLIFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" << FuncToReplace->getName() << "` with call to `" From 787baf7aee1ad0df2b33596a0a3f45e829b580c9 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Wed, 20 Dec 2023 15:24:01 +0000 Subject: [PATCH 8/8] Addressing review. --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 6e308abc6990a..893aa4a91828d 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -27,7 +27,6 @@ #include "llvm/IR/InstIterator.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include using namespace llvm; @@ -46,7 +45,8 @@ STATISTIC(NumFuncUsedAdded, /// ScalarFunc is not null, it copies its attributes to the newly created /// Function. Function *getTLIFunction(Module *M, FunctionType *VectorFTy, - Function *ScalarFunc, const StringRef TLIName) { + const StringRef TLIName, + Function *ScalarFunc = nullptr) { Function *TLIFunc = M->getFunction(TLIName); if (!TLIFunc) { TLIFunc = @@ -139,8 +139,9 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // Try to find the mapping for the scalar version of this intrinsic and the // exact vector width of the call operands in the TargetLibraryInfo. First, // check with a non-masked variant, and if that fails try with a masked one. - const VecDesc *VD = TLI.getVectorMappingInfo(ScalarName, VF, false); - if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, true))) + const VecDesc *VD = + TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ false); + if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ true))) return false; LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName @@ -163,7 +164,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, Function *FuncToReplace = CallToReplace.getCalledFunction(); Function *TLIFunc = getTLIFunction(CallToReplace.getModule(), VectorFTy, - FuncToReplace, VD->getVectorFnName()); + VD->getVectorFnName(), FuncToReplace); replaceWithTLIFunction(CallToReplace, *OptInfo, TLIFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"