From d412f7467a9b4dfc8ab8d969b45704d058fe6e6f Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Fri, 24 Nov 2023 13:44:45 +0000 Subject: [PATCH 1/8] [NFC][TLI] Improve tests for ArmPL and SLEEF Intrinsics. Auto-generate test `armpl-intrinsics.ll`, and use active lane mask to have shorter `shufflevector` check lines. Update scripts now add `@llvm.compiler.used` instead of using the regex: `@[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]]` --- .../AArch64/sleef-intrinsic-calls-aarch64.ll | 190 +++++++++++------- 1 file changed, 114 insertions(+), 76 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll index 2300ce74996e3..83898374c1c6c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll @@ -139,8 +139,9 @@ define void @llvm_cos_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cos( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cos( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV:%.*]]) #[[ATTR4:[0-9]+]] ; SVE: ret void ; entry: @@ -168,8 +169,9 @@ define void @llvm_cos_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cosf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cosf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV:%.*]]) #[[ATTR5:[0-9]+]] ; SVE: ret void ; entry: @@ -200,8 +202,9 @@ define void @llvm_exp_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV:%.*]]) #[[ATTR6:[0-9]+]] ; SVE: ret void ; entry: @@ -229,8 +232,9 @@ define void @llvm_exp_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_expf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_expf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV:%.*]]) #[[ATTR7:[0-9]+]] ; SVE: ret void ; entry: @@ -261,8 +265,9 @@ define void @llvm_exp2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: 
[[TMP12:%.*]] = call @_ZGVsMxv_exp2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV:%.*]]) #[[ATTR8:[0-9]+]] ; SVE: ret void ; entry: @@ -290,8 +295,9 @@ define void @llvm_exp2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV:%.*]]) #[[ATTR9:[0-9]+]] ; SVE: ret void ; entry: @@ -322,8 +328,9 @@ define void @llvm_exp10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[CONV:%.*]]) #[[ATTR10:[0-9]+]] ; SVE: ret void ; entry: @@ -351,8 +358,9 @@ define void @llvm_exp10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[CONV:%.*]]) #[[ATTR11:[0-9]+]] ; SVE: ret void ; entry: @@ -383,8 +391,9 @@ define void @llvm_fabs_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.fabs.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -413,8 +422,9 @@ define void @llvm_fabs_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.fabs.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -445,8 +455,9 @@ define void @llvm_floor_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.floor.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.floor.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -474,8 +485,9 @@ define void @llvm_floor_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: 
[[TMP18:%.*]] = call @llvm.floor.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.floor.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -506,8 +518,9 @@ define void @llvm_fma_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv2f64( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv2f64( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[CONV:%.*]], double [[CONV]], double [[CONV]]) ; SVE: ret void ; entry: @@ -535,8 +548,9 @@ define void @llvm_fma_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv4f32( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv4f32( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[CONV:%.*]], float [[CONV]], float [[CONV]]) ; SVE: ret void ; entry: @@ -567,8 +581,9 @@ define void @llvm_log_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV:%.*]]) #[[ATTR12:[0-9]+]] ; SVE: ret void ; entry: @@ -596,8 +611,9 @@ define void @llvm_log_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_logf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_logf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV:%.*]]) #[[ATTR13:[0-9]+]] ; SVE: ret void ; entry: @@ -628,8 +644,9 @@ define void @llvm_log10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV:%.*]]) #[[ATTR14:[0-9]+]] ; SVE: ret void ; entry: @@ -657,8 +674,9 @@ define void @llvm_log10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail 
call float @llvm.log10.f32(float [[CONV:%.*]]) #[[ATTR15:[0-9]+]] ; SVE: ret void ; entry: @@ -689,8 +707,9 @@ define void @llvm_log2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[CONV:%.*]]) #[[ATTR16:[0-9]+]] ; SVE: ret void ; entry: @@ -718,8 +737,9 @@ define void @llvm_log2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV:%.*]]) #[[ATTR17:[0-9]+]] ; SVE: ret void ; entry: @@ -750,8 +770,9 @@ define void @llvm_maxnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.maxnum.f64(double [[CONV:%.*]], double [[CONV]]) ; SVE: ret void ; entry: @@ -779,8 +800,9 @@ define void @llvm_maxnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.maxnum.f32(float [[CONV:%.*]], float [[CONV]]) ; SVE: ret void ; entry: @@ -811,8 +833,9 @@ define void @llvm_minnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.minnum.f64(double [[CONV:%.*]], double [[CONV]]) ; SVE: ret void ; entry: @@ -840,8 +863,9 @@ define void @llvm_minnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.minnum.f32(float [[CONV:%.*]], float [[CONV]]) ; SVE: ret void ; entry: @@ -872,8 +896,9 @@ define void @llvm_nearbyint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture 
[[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.nearbyint.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -901,8 +926,9 @@ define void @llvm_nearbyint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.nearbyint.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -933,8 +959,9 @@ define void @llvm_pow_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_pow( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_pow( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[CONV:%.*]], double [[CONV]]) #[[ATTR18:[0-9]+]] ; SVE: ret void ; entry: @@ -962,8 +989,9 @@ define void @llvm_pow_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_powf( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_powf( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[CONV:%.*]], float [[CONV]]) #[[ATTR19:[0-9]+]] ; SVE: ret void ; entry: @@ -994,8 +1022,9 @@ define void @llvm_rint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.rint.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1023,8 +1052,9 @@ define void @llvm_rint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.rint.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1055,8 +1085,9 @@ define void @llvm_round_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.round.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.round.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.round.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1084,8 +1115,9 @@ define void @llvm_round_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.round.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr 
nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.round.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.round.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1116,8 +1148,9 @@ define void @llvm_sin_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sin( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sin( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV:%.*]]) #[[ATTR20:[0-9]+]] ; SVE: ret void ; entry: @@ -1145,8 +1178,9 @@ define void @llvm_sin_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sinf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sinf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV:%.*]]) #[[ATTR21:[0-9]+]] ; SVE: ret void ; entry: @@ -1177,8 +1211,9 @@ define void @llvm_sqrt_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1206,8 +1241,9 @@ define void @llvm_sqrt_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1238,8 +1274,9 @@ define void @llvm_trunc_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv2f64( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv2f64( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call double @llvm.trunc.f64(double [[CONV:%.*]]) ; SVE: ret void ; entry: @@ -1267,8 +1304,9 @@ define void @llvm_trunc_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv4f32( [[TMP17:%.*]]) +; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv4f32( [[TMP11:%.*]]) +; SVE: [[CALL:%.*]] = tail call float @llvm.trunc.f32(float [[CONV:%.*]]) ; SVE: ret void ; entry: From fe84777007eb8bb0359429e47f715390d2d44bd6 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Mon, 27 Nov 2023 17:36:29 +0000 Subject: [PATCH 2/8] Add `simplifycfg` pass and `noalias` to ensure tail folding. `noalias` attribute was added only to the `%in.ptr` parameter of the ArmPL Intrinsics. 
--- .../AArch64/sleef-intrinsic-calls-aarch64.ll | 190 +++++++----------- 1 file changed, 76 insertions(+), 114 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll index 83898374c1c6c..2300ce74996e3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll @@ -139,9 +139,8 @@ define void @llvm_cos_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cos( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV:%.*]]) #[[ATTR4:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cos( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -169,9 +168,8 @@ define void @llvm_cos_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_cos_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_cosf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV:%.*]]) #[[ATTR5:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_cosf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -202,9 +200,8 @@ define void @llvm_exp_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV:%.*]]) #[[ATTR6:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -232,9 +229,8 @@ define void @llvm_exp_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_expf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV:%.*]]) #[[ATTR7:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_expf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -265,9 +261,8 @@ define void @llvm_exp2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV:%.*]]) #[[ATTR8:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -295,9 +290,8 @@ define void @llvm_exp2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp2_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: 
[[TMP12:%.*]] = call @_ZGVsMxv_exp2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV:%.*]]) #[[ATTR9:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -328,9 +322,8 @@ define void @llvm_exp10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[CONV:%.*]]) #[[ATTR10:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -358,9 +351,8 @@ define void @llvm_exp10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_exp10_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_exp10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[CONV:%.*]]) #[[ATTR11:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_exp10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -391,9 +383,8 @@ define void @llvm_fabs_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.fabs.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -422,9 +413,8 @@ define void @llvm_fabs_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fabs_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fabs.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.fabs.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fabs.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -455,9 +445,8 @@ define void @llvm_floor_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.floor.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.floor.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -485,9 +474,8 @@ define void @llvm_floor_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_floor_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.floor.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.floor.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.floor.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -518,9 +506,8 @@ define void @llvm_fma_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) 
#[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv2f64( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.fma.f64(double [[CONV:%.*]], double [[CONV]], double [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv2f64( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) ; SVE: ret void ; entry: @@ -548,9 +535,8 @@ define void @llvm_fma_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_fma_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.fma.nxv4f32( [[TMP11:%.*]], [[TMP11]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.fma.f32(float [[CONV:%.*]], float [[CONV]], float [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.fma.nxv4f32( [[TMP17:%.*]], [[TMP17]], [[TMP17]]) ; SVE: ret void ; entry: @@ -581,9 +567,8 @@ define void @llvm_log_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV:%.*]]) #[[ATTR12:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -611,9 +596,8 @@ define void @llvm_log_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_logf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV:%.*]]) #[[ATTR13:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_logf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -644,9 +628,8 @@ define void @llvm_log10_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV:%.*]]) #[[ATTR14:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -674,9 +657,8 @@ define void @llvm_log10_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log10_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log10f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[CONV:%.*]]) #[[ATTR15:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log10f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -707,9 +689,8 @@ define void @llvm_log2_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail 
call double @llvm.log2.f64(double [[CONV:%.*]]) #[[ATTR16:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -737,9 +718,8 @@ define void @llvm_log2_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_log2_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_log2f( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV:%.*]]) #[[ATTR17:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_log2f( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -770,9 +750,8 @@ define void @llvm_maxnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.maxnum.f64(double [[CONV:%.*]], double [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -800,9 +779,8 @@ define void @llvm_maxnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_maxnum_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.maxnum.f32(float [[CONV:%.*]], float [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.maxnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -833,9 +811,8 @@ define void @llvm_minnum_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv2f64( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.minnum.f64(double [[CONV:%.*]], double [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv2f64( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -863,9 +840,8 @@ define void @llvm_minnum_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_minnum_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.minnum.nxv4f32( [[TMP11:%.*]], [[TMP11]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.minnum.f32(float [[CONV:%.*]], float [[CONV]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.minnum.nxv4f32( [[TMP17:%.*]], [[TMP17]]) ; SVE: ret void ; entry: @@ -896,9 +872,8 @@ define void @llvm_nearbyint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.nearbyint.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -926,9 +901,8 @@ define void @llvm_nearbyint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_nearbyint_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.nearbyint.nxv4f32( 
[[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.nearbyint.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.nearbyint.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -959,9 +933,8 @@ define void @llvm_pow_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_pow( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[CONV:%.*]], double [[CONV]]) #[[ATTR18:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_pow( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -989,9 +962,8 @@ define void @llvm_pow_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_pow_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxvv_powf( [[TMP11:%.*]], [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[CONV:%.*]], float [[CONV]]) #[[ATTR19:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxvv_powf( [[TMP17:%.*]], [[TMP17]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -1022,9 +994,8 @@ define void @llvm_rint_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.rint.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1052,9 +1023,8 @@ define void @llvm_rint_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_rint_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.rint.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.rint.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.rint.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1085,9 +1055,8 @@ define void @llvm_round_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.round.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.round.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.round.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1115,9 +1084,8 @@ define void @llvm_round_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_round_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.round.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.round.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.round.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1148,9 +1116,8 @@ define void @llvm_sin_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sin( [[TMP11:%.*]], 
shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV:%.*]]) #[[ATTR20:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sin( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -1178,9 +1145,8 @@ define void @llvm_sin_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sin_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @_ZGVsMxv_sinf( [[TMP11:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; SVE: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV:%.*]]) #[[ATTR21:[0-9]+]] +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @_ZGVsMxv_sinf( [[TMP17:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; SVE: ret void ; entry: @@ -1211,9 +1177,8 @@ define void @llvm_sqrt_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1241,9 +1206,8 @@ define void @llvm_sqrt_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_sqrt_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.sqrt.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1274,9 +1238,8 @@ define void @llvm_trunc_f64(double* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv2f64( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call double @llvm.trunc.f64(double [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv2f64( [[TMP17:%.*]]) ; SVE: ret void ; entry: @@ -1304,9 +1267,8 @@ define void @llvm_trunc_f32(float* %varray) { ; NEON: ret void ; ; SVE-LABEL: define void @llvm_trunc_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP12:%.*]] = call @llvm.trunc.nxv4f32( [[TMP11:%.*]]) -; SVE: [[CALL:%.*]] = tail call float @llvm.trunc.f32(float [[CONV:%.*]]) +; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { +; SVE: [[TMP18:%.*]] = call @llvm.trunc.nxv4f32( [[TMP17:%.*]]) ; SVE: ret void ; entry: From 52ce4f2d655a60cc2eb5dd5b0b01fc925a38c273 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Mon, 27 Nov 2023 16:55:17 +0000 Subject: [PATCH 3/8] [TLI] Pass replace-with-veclib works with Scalable Vectors. The pass uses the Masked variant of TLI method when the Intrinsic operates on Scalable Vectors and it fails to find a non-Masked variant. 
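In other words, the pass still prefers an unmasked TLI mapping; it only assumes a mask for scalable operands when no unmasked variant is registered for that scalar name and VF. A condensed, illustrative sketch of that lookup (the wrapper name `lookupVectorVariant` is invented here for exposition; the two TLI calls are the ones used in the ReplaceWithVeclib.cpp hunk below):

    // Sketch only: prefer the unmasked mapping, and fall back to the masked
    // variant for scalable operands when no unmasked mapping exists.
    static StringRef lookupVectorVariant(const TargetLibraryInfo &TLI,
                                         StringRef ScalarName, ElementCount VF,
                                         bool MayBeMasked) {
      const bool IsMasked =
          MayBeMasked &&
          !TLI.getVectorMappingInfo(ScalarName, VF, /*Masked=*/false);
      return TLI.getVectorizedFunction(ScalarName, VF, IsMasked);
    }
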
--- llvm/lib/Analysis/VFABIDemangling.cpp | 2 +- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 24 ++++++------- .../replace-intrinsics-with-veclib-armpl.ll | 36 +++++++++---------- ...e-intrinsics-with-veclib-sleef-scalable.ll | 35 +++++++++--------- 4 files changed, 50 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp index 22fc52070015c..426f98c0c6284 100644 --- a/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/llvm/lib/Analysis/VFABIDemangling.cpp @@ -126,7 +126,7 @@ static ParseRet tryParseLinearTokenWithRuntimeStep(StringRef &ParseString, return ParseRet::None; } -/// The function looks for the following stringt at the beginning of +/// The function looks for the following string at the beginning of /// the input string `ParseString`: /// /// diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 36c91b7fa97e4..d31a793556dfd 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -105,6 +105,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // all vector operands have identical vector width. ElementCount VF = ElementCount::getFixed(0); SmallVector ScalarTypes; + bool MayBeMasked = false; for (auto Arg : enumerate(CI.args())) { auto *ArgType = Arg.value()->getType(); // Vector calls to intrinsics can still have @@ -121,17 +122,13 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, return false; } ElementCount NumElements = VectorArgTy->getElementCount(); - if (NumElements.isScalable()) { - // The current implementation does not support - // scalable vectors. - return false; - } - if (VF.isNonZero() && VF != NumElements) { - // The different arguments differ in vector size. + if (NumElements.isScalable()) + MayBeMasked = true; + + // The different arguments differ in vector size. + if (VF.isNonZero() && VF != NumElements) return false; - } else { - VF = NumElements; - } + VF = NumElements; ScalarTypes.push_back(VectorArgTy->getElementType()); } } @@ -152,11 +149,14 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, return false; } + // Assume it has a mask when that is a possibility and has no mapping for + // a Non-Masked variant. + const bool IsMasked = + MayBeMasked && !TLI.getVectorMappingInfo(ScalarName, VF, false); // Try to find the mapping for the scalar version of this intrinsic // and the exact vector width of the call operands in the // TargetLibraryInfo. - StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF); - + StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, IsMasked); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" << ScalarName << "` and vector width " << VF << ".\n"); diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll index 18431ae021f97..633cb220f5246 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll @@ -15,7 +15,7 @@ declare @llvm.cos.nxv2f64() declare @llvm.cos.nxv4f32() ;. 
-; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending global [32 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x], section "llvm.metadata" ;. define <2 x double> @llvm_cos_f64(<2 x double> %in) { ; CHECK-LABEL: define <2 x double> @llvm_cos_f64 @@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) { define @llvm_cos_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -50,7 +50,7 @@ define @llvm_cos_vscale_f64( %in) #0 define @llvm_cos_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) { define @llvm_sin_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -95,7 +95,7 @@ define @llvm_sin_vscale_f64( %in) #0 define @llvm_sin_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) @@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) { define @llvm_exp_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -140,7 +140,7 @@ define @llvm_exp_vscale_f64( %in) #0 define @llvm_exp_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; 
CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) { define @llvm_exp2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -185,7 +185,7 @@ define @llvm_exp2_vscale_f64( %in) #0 define @llvm_exp2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) { define @llvm_exp10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -230,7 +230,7 @@ define @llvm_exp10_vscale_f64( %in) # define @llvm_exp10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) { define @llvm_log_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -275,7 +275,7 @@ define @llvm_log_vscale_f64( %in) #0 define @llvm_log_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) { define @llvm_log2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -320,7 +320,7 @@ define @llvm_log2_vscale_f64( %in) #0 define @llvm_log2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) { define @llvm_log10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f64 ; CHECK-SAME: ( 
[[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv2f64( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f64_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -365,7 +365,7 @@ define @llvm_log10_vscale_f64( %in) # define @llvm_log10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv4f32( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f32_x( [[IN]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) @@ -380,7 +380,7 @@ declare @llvm.pow.nxv4f32(, @llvm_ceil_vscale_f64( %in) { ; CHECK-LABEL: @llvm_ceil_vscale_f64( ; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.ceil.nxv2f64( [[IN:%.*]]) @@ -43,7 +46,7 @@ define @llvm_copysign_vscale_f32( %mag, define @llvm_cos_vscale_f64( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cos( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -52,7 +55,7 @@ define @llvm_cos_vscale_f64( %in) { define @llvm_cos_vscale_f32( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.cos.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -61,7 +64,7 @@ define @llvm_cos_vscale_f32( %in) { define @llvm_exp_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -70,7 +73,7 @@ define @llvm_exp_vscale_f64( %in) { define @llvm_exp_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_expf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -79,7 +82,7 @@ define @llvm_exp_vscale_f32( %in) { define @llvm_exp2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -88,7 +91,7 @@ define @llvm_exp2_vscale_f64( %in) { define @llvm_exp2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp2.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -97,7 +100,7 @@ define @llvm_exp2_vscale_f32( %in) { define @llvm_exp10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -106,7 +109,7 @@ define @llvm_exp10_vscale_f64( %in) { define @llvm_exp10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.exp10.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -169,7 +172,7 @@ define 
@llvm_fma_vscale_f32( %a, @llvm_log_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -178,7 +181,7 @@ define @llvm_log_vscale_f64( %in) { define @llvm_log_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_logf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -187,7 +190,7 @@ define @llvm_log_vscale_f32( %in) { define @llvm_log10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -196,7 +199,7 @@ define @llvm_log10_vscale_f64( %in) { define @llvm_log10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log10.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) @@ -205,7 +208,7 @@ define @llvm_log10_vscale_f32( %in) { define @llvm_log2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -214,7 +217,7 @@ define @llvm_log2_vscale_f64( %in) { define @llvm_log2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.log2.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2f( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -331,7 +334,7 @@ define @llvm_round_vscale_f32( %in) { define @llvm_sin_vscale_f64( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv2f64( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sin( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -340,7 +343,7 @@ define @llvm_sin_vscale_f64( %in) { define @llvm_sin_vscale_f32( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @llvm.sin.nxv4f32( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinf( [[IN:%.*]]) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) From 690d36dc8a97d8ac82f51d6726eba439bb0c7b3f Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Tue, 12 Dec 2023 15:48:35 +0000 Subject: [PATCH 4/8] Use createFunctionType to correctly replace veclib calls. Split replaceWithTLIFunction method into two methods. 
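Background for the split: the previous `replaceWithTLIFunction` declared the TLI function with `OldFunc->getFunctionType()`, i.e. the intrinsic's own type, which cannot describe masked variants that take an extra predicate argument. The rough shape after this patch, as a hedged sketch (not the patch's exact code: the manual widening below merely stands in for the `createFunctionType` helper named above, and the two helper calls mirror the declarations in the hunk that follows):

    // Illustrative only: widen the scalar types to VF, append an i1 mask when
    // needed, then declare and call the TLI function via the two new helpers.
    SmallVector<Type *, 4> VecArgTys;
    for (Type *ScalarTy : ScalarTypes)
      VecArgTys.push_back(VectorType::get(ScalarTy, VF));
    if (IsMasked)
      VecArgTys.push_back(VectorType::get(Type::getInt1Ty(M->getContext()), VF));
    auto *VecRetTy = VectorType::get(CI.getType()->getScalarType(), VF);
    auto *VectorFTy = FunctionType::get(VecRetTy, VecArgTys, /*isVarArg=*/false);
    Function *TLIFunc =
        getTLIFunction(M, VectorFTy, CI.getCalledFunction(), TLIName);
    replaceWithTLIFunction(M, CI, VF, CI.getCalledFunction(), TLIFunc, VectorFTy,
                           IsMasked);
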
--- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 203 ++++++++++-------- .../replace-intrinsics-with-veclib-armpl.ll | 32 +-- ...e-intrinsics-with-veclib-sleef-scalable.ll | 32 +-- 3 files changed, 149 insertions(+), 118 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index d31a793556dfd..ddcc55a8e52c4 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -15,15 +15,19 @@ #include "llvm/CodeGen/ReplaceWithVeclib.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include using namespace llvm; @@ -38,138 +42,166 @@ STATISTIC(NumTLIFuncDeclAdded, STATISTIC(NumFuncUsedAdded, "Number of functions added to `llvm.compiler.used`"); -static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) { - Module *M = CI.getModule(); - - Function *OldFunc = CI.getCalledFunction(); - - // Check if the vector library function is already declared in this module, - // otherwise insert it. +/// Returns a vector Function that it adds to the Module \p M. When an \p +/// OptOldFunc is given, it copies its attributes to the newly created Function. +Function *getTLIFunction(Module *M, FunctionType *VectorFTy, + std::optional OptOldFunc, + const StringRef TLIName) { Function *TLIFunc = M->getFunction(TLIName); if (!TLIFunc) { - TLIFunc = Function::Create(OldFunc->getFunctionType(), - Function::ExternalLinkage, TLIName, *M); - TLIFunc->copyAttributesFrom(OldFunc); + TLIFunc = + Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M); + if (OptOldFunc) + TLIFunc->copyAttributesFrom(*OptOldFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `" << TLIName << "` of type `" << *(TLIFunc->getType()) << "` to module.\n"); ++NumTLIFuncDeclAdded; - - // Add the freshly created function to llvm.compiler.used, - // similar to as it is done in InjectTLIMappings + // Add the freshly created function to llvm.compiler.used, similar to as it + // is done in InjectTLIMappings appendToCompilerUsed(*M, {TLIFunc}); - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName << "` to `@llvm.compiler.used`.\n"); ++NumFuncUsedAdded; } + return TLIFunc; +} - // Replace the call to the vector intrinsic with a call - // to the corresponding function from the vector library. +/// Replace the call to the vector intrinsic ( \p OldFunc ) with a call to the +/// corresponding function from the vector library ( \p TLIFunc ). +static bool replaceWithTLIFunction(const Module *M, CallInst &CI, + const ElementCount &VecVF, Function *OldFunc, + Function *TLIFunc, FunctionType *VecFTy, + bool IsMasked) { IRBuilder<> IRBuilder(&CI); SmallVector Args(CI.args()); + if (IsMasked) { + if (Args.size() == VecFTy->getNumParams()) + static_assert(true && "mask was already in place"); + + auto *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VecVF); + Args.push_back(Constant::getAllOnesValue(MaskTy)); + } + // Preserve the operand bundles. 
SmallVector OpBundles; CI.getOperandBundlesAsDefs(OpBundles); CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles); - assert(OldFunc->getFunctionType() == TLIFunc->getFunctionType() && + assert(VecFTy == TLIFunc->getFunctionType() && "Expecting function types to be identical"); CI.replaceAllUsesWith(Replacement); - if (isa(Replacement)) { - // Preserve fast math flags for FP math. + // Preserve fast math flags for FP math. + if (isa(Replacement)) Replacement->copyFastMathFlags(&CI); - } LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" - << OldFunc->getName() << "` with call to `" << TLIName - << "`.\n"); + << OldFunc->getName() << "` with call to `" + << TLIFunc->getName() << "`.\n"); ++NumCallsReplaced; return true; } +/// Utility method to get the VecDesc, depending on whether there is a TLI +/// mapping, either with or without a mask. +static std::optional getVecDesc(const TargetLibraryInfo &TLI, + const StringRef &ScalarName, + const ElementCount &VF) { + const VecDesc *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true); + const VecDesc *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false); + // Invalid when there are both variants (ie masked and unmasked), or none + if ((VDMasked == nullptr) == (VDNoMask == nullptr)) + return std::nullopt; + + return {VDMasked != nullptr ? VDMasked : VDNoMask}; +} + +/// Returns whether it is able to replace a call to the intrinsic \p CI with a +/// TLI mapped call. static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, CallInst &CI) { - if (!CI.getCalledFunction()) { + if (!CI.getCalledFunction()) return false; - } auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); - if (IntrinsicID == Intrinsic::not_intrinsic) { - // Replacement is only performed for intrinsic functions + // Replacement is only performed for intrinsic functions + if (IntrinsicID == Intrinsic::not_intrinsic) return false; - } - // Convert vector arguments to scalar type and check that - // all vector operands have identical vector width. + // Convert vector arguments to scalar type and check that all vector operands + // have identical vector width. ElementCount VF = ElementCount::getFixed(0); SmallVector ScalarTypes; - bool MayBeMasked = false; for (auto Arg : enumerate(CI.args())) { - auto *ArgType = Arg.value()->getType(); - // Vector calls to intrinsics can still have - // scalar operands for specific arguments. + auto *ArgTy = Arg.value()->getType(); if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { - ScalarTypes.push_back(ArgType); - } else { - // The argument in this place should be a vector if - // this is a call to a vector intrinsic. - auto *VectorArgTy = dyn_cast(ArgType); - if (!VectorArgTy) { - // The argument is not a vector, do not perform - // the replacement. - return false; - } - ElementCount NumElements = VectorArgTy->getElementCount(); - if (NumElements.isScalable()) - MayBeMasked = true; - - // The different arguments differ in vector size. - if (VF.isNonZero() && VF != NumElements) + ScalarTypes.push_back(ArgTy); + } else if (auto *VectorArgTy = dyn_cast(ArgTy)) { + ScalarTypes.push_back(ArgTy->getScalarType()); + // Disallow vector arguments with different VFs. When processing the first + // vector argument, store it's VF, and for the rest ensure that they match + // it. 
+ if (VF.isZero()) + VF = VectorArgTy->getElementCount(); + else if (VF != VectorArgTy->getElementCount()) return false; - VF = NumElements; - ScalarTypes.push_back(VectorArgTy->getElementType()); + } else { + // enters when it is supposed to be a vector argument but it isn't. + return false; } } - // Try to reconstruct the name for the scalar version of this - // intrinsic using the intrinsic ID and the argument types - // converted to scalar above. - std::string ScalarName; - if (Intrinsic::isOverloaded(IntrinsicID)) { - ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule()); - } else { - ScalarName = Intrinsic::getName(IntrinsicID).str(); - } + // Try to reconstruct the name for the scalar version of this intrinsic using + // the intrinsic ID and the argument types converted to scalar above. + std::string ScalarName = + (Intrinsic::isOverloaded(IntrinsicID) + ? Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule()) + : Intrinsic::getName(IntrinsicID).str()); - if (!TLI.isFunctionVectorizable(ScalarName)) { - // The TargetLibraryInfo does not contain a vectorized version of - // the scalar function. + // The TargetLibraryInfo does not contain a vectorized version of the scalar + // function. + if (!TLI.isFunctionVectorizable(ScalarName)) return false; - } - // Assume it has a mask when that is a possibility and has no mapping for - // a Non-Masked variant. - const bool IsMasked = - MayBeMasked && !TLI.getVectorMappingInfo(ScalarName, VF, false); - // Try to find the mapping for the scalar version of this intrinsic - // and the exact vector width of the call operands in the - // TargetLibraryInfo. - StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, IsMasked); + auto OptVD = getVecDesc(TLI, ScalarName, VF); + if (!OptVD) + return false; + + const VecDesc *VD = *OptVD; + // Try to find the mapping for the scalar version of this intrinsic and the + // exact vector width of the call operands in the TargetLibraryInfo. + StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, VD->isMasked()); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" << ScalarName << "` and vector width " << VF << ".\n"); - if (!TLIName.empty()) { - // Found the correct mapping in the TargetLibraryInfo, - // replace the call to the intrinsic with a call to - // the vector library function. - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName - << "`.\n"); - return replaceWithTLIFunction(CI, TLIName); - } + // TLI failed to find a correct mapping. + if (TLIName.empty()) + return false; - return false; + // Find the vector Function and replace the call to the intrinsic with a call + // to the vector library function. 
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName + << "`.\n"); + + Type *ScalarRetTy = CI.getType()->getScalarType(); + FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false); + const std::string MangledName = VD->getVectorFunctionABIVariantString(); + auto OptInfo = VFABI::tryDemangleForVFABI(MangledName, ScalarFTy); + if (!OptInfo) + return false; + + // get the vector FunctionType + Module *M = CI.getModule(); + auto OptFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy); + if (!OptFTy) + return false; + + Function *OldFunc = CI.getCalledFunction(); + FunctionType *VectorFTy = *OptFTy; + Function *TLIFunc = getTLIFunction(M, VectorFTy, OldFunc, TLIName); + return replaceWithTLIFunction(M, CI, OptInfo->Shape.VF, OldFunc, TLIFunc, + VectorFTy, VD->isMasked()); } static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { @@ -185,9 +217,8 @@ static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { } // Erase the calls to the intrinsics that have been replaced // with calls to the vector library. - for (auto *CI : ReplacedCalls) { + for (auto *CI : ReplacedCalls) CI->eraseFromParent(); - } return Changed; } @@ -207,10 +238,10 @@ PreservedAnalyses ReplaceWithVeclib::run(Function &F, PA.preserve(); PA.preserve(); return PA; - } else { - // The pass did not replace any calls, hence it preserves all analyses. - return PreservedAnalyses::all(); } + + // The pass did not replace any calls, hence it preserves all analyses. + return PreservedAnalyses::all(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll index 633cb220f5246..d41870ec6e791 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll @@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) { define @llvm_cos_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -50,7 +50,7 @@ define @llvm_cos_vscale_f64( %in) #0 define @llvm_cos_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_cos_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svcos_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) { define @llvm_sin_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -95,7 +95,7 @@ define @llvm_sin_vscale_f64( %in) #0 define @llvm_sin_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_sin_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) 
#[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svsin_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) @@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) { define @llvm_exp_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -140,7 +140,7 @@ define @llvm_exp_vscale_f64( %in) #0 define @llvm_exp_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) { define @llvm_exp2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -185,7 +185,7 @@ define @llvm_exp2_vscale_f64( %in) #0 define @llvm_exp2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp2_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) { define @llvm_exp10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -230,7 +230,7 @@ define @llvm_exp10_vscale_f64( %in) # define @llvm_exp10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_exp10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svexp10_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) { define @llvm_log_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f64_x( [[IN]], shufflevector ( insertelement ( poison, 
i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -275,7 +275,7 @@ define @llvm_log_vscale_f64( %in) #0 define @llvm_log_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) { define @llvm_log2_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -320,7 +320,7 @@ define @llvm_log2_vscale_f64( %in) #0 define @llvm_log2_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log2_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog2_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) { define @llvm_log10_vscale_f64( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f64 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f64_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f64_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -365,7 +365,7 @@ define @llvm_log10_vscale_f64( %in) # define @llvm_log10_vscale_f32( %in) #0 { ; CHECK-LABEL: define @llvm_log10_vscale_f32 ; CHECK-SAME: ( [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f32_x( [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @armpl_svlog10_f32_x( [[IN]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll index 969945590a0a1..baf16f83a3e24 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll @@ -46,7 +46,7 @@ define @llvm_copysign_vscale_f32( %mag, define @llvm_cos_vscale_f64( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cos( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cos( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv2f64( %in) @@ -55,7 +55,7 @@ define @llvm_cos_vscale_f64( %in) { define @llvm_cos_vscale_f32( %in) { ; CHECK-LABEL: @llvm_cos_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_cosf( 
[[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.cos.nxv4f32( %in) @@ -64,7 +64,7 @@ define @llvm_cos_vscale_f32( %in) { define @llvm_exp_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv2f64( %in) @@ -73,7 +73,7 @@ define @llvm_exp_vscale_f64( %in) { define @llvm_exp_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_expf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_expf( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp.nxv4f32( %in) @@ -82,7 +82,7 @@ define @llvm_exp_vscale_f32( %in) { define @llvm_exp2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv2f64( %in) @@ -91,7 +91,7 @@ define @llvm_exp2_vscale_f64( %in) { define @llvm_exp2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp2f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp2.nxv4f32( %in) @@ -100,7 +100,7 @@ define @llvm_exp2_vscale_f32( %in) { define @llvm_exp10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv2f64( %in) @@ -109,7 +109,7 @@ define @llvm_exp10_vscale_f64( %in) { define @llvm_exp10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_exp10f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.exp10.nxv4f32( %in) @@ -172,7 +172,7 @@ define @llvm_fma_vscale_f32( %a, @llvm_log_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv2f64( %in) @@ -181,7 +181,7 @@ define @llvm_log_vscale_f64( %in) { define @llvm_log_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_logf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_logf( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log.nxv4f32( %in) @@ -190,7 +190,7 @@ define @llvm_log_vscale_f32( %in) { 
define @llvm_log10_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv2f64( %in) @@ -199,7 +199,7 @@ define @llvm_log10_vscale_f64( %in) { define @llvm_log10_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log10f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log10.nxv4f32( %in) @@ -208,7 +208,7 @@ define @llvm_log10_vscale_f32( %in) { define @llvm_log2_vscale_f64( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv2f64( %in) @@ -217,7 +217,7 @@ define @llvm_log2_vscale_f64( %in) { define @llvm_log2_vscale_f32( %in) { ; CHECK-LABEL: @llvm_log2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2f( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_log2f( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.log2.nxv4f32( %in) @@ -334,7 +334,7 @@ define @llvm_round_vscale_f32( %in) { define @llvm_sin_vscale_f64( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sin( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sin( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv2f64( %in) @@ -343,7 +343,7 @@ define @llvm_sin_vscale_f64( %in) { define @llvm_sin_vscale_f32( %in) { ; CHECK-LABEL: @llvm_sin_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinf( [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast @_ZGVsMxv_sinf( [[IN:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call fast @llvm.sin.nxv4f32( %in) From 02b0e21aa97ae72834438223e68edc65ae8a23f1 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Wed, 13 Dec 2023 18:08:46 +0000 Subject: [PATCH 5/8] getVecDesc now prioritizes masked variant Also further cleanup to address reviewers. --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 85 ++++++++----------- ...e-intrinsics-with-veclib-sleef-scalable.ll | 2 - 2 files changed, 36 insertions(+), 51 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index ddcc55a8e52c4..4ea163e4eaafb 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -69,20 +69,20 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, return TLIFunc; } -/// Replace the call to the vector intrinsic ( \p OldFunc ) with a call to the -/// corresponding function from the vector library ( \p TLIFunc ). 
-static bool replaceWithTLIFunction(const Module *M, CallInst &CI, - const ElementCount &VecVF, Function *OldFunc, - Function *TLIFunc, FunctionType *VecFTy, - bool IsMasked) { +/// Replace the call to the vector intrinsic ( \p FuncToReplace ) with a call to +/// the corresponding function from the vector library ( \p TLIFunc ). +static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, + Function *TLIFunc, FunctionType *VecFTy) { IRBuilder<> IRBuilder(&CI); SmallVector Args(CI.args()); - if (IsMasked) { + if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) { if (Args.size() == VecFTy->getNumParams()) static_assert(true && "mask was already in place"); - auto *MaskTy = VectorType::get(Type::getInt1Ty(M->getContext()), VecVF); - Args.push_back(Constant::getAllOnesValue(MaskTy)); + auto *MaskTy = + VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF); + Args.insert(Args.begin() + OptMaskpos.value(), + Constant::getAllOnesValue(MaskTy)); } // Preserve the operand bundles. @@ -95,26 +95,18 @@ static bool replaceWithTLIFunction(const Module *M, CallInst &CI, // Preserve fast math flags for FP math. if (isa(Replacement)) Replacement->copyFastMathFlags(&CI); - - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" - << OldFunc->getName() << "` with call to `" - << TLIFunc->getName() << "`.\n"); - ++NumCallsReplaced; - return true; } -/// Utility method to get the VecDesc, depending on whether there is a TLI -/// mapping, either with or without a mask. +/// Utility method to get the VecDesc, depending on whether there is such a TLI +/// mapping, prioritizing a masked version. static std::optional getVecDesc(const TargetLibraryInfo &TLI, const StringRef &ScalarName, const ElementCount &VF) { - const VecDesc *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true); - const VecDesc *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false); - // Invalid when there are both variants (ie masked and unmasked), or none - if ((VDMasked == nullptr) == (VDNoMask == nullptr)) - return std::nullopt; - - return {VDMasked != nullptr ? VDMasked : VDNoMask}; + if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) + return VDMasked; + if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false)) + return VDNoMask; + return std::nullopt; } /// Returns whether it is able to replace a call to the intrinsic \p CI with a @@ -146,10 +138,9 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, VF = VectorArgTy->getElementCount(); else if (VF != VectorArgTy->getElementCount()) return false; - } else { + } else // enters when it is supposed to be a vector argument but it isn't. return false; - } } // Try to reconstruct the name for the scalar version of this intrinsic using @@ -164,26 +155,19 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, if (!TLI.isFunctionVectorizable(ScalarName)) return false; + // Try to find the mapping for the scalar version of this intrinsic and the + // exact vector width of the call operands in the TargetLibraryInfo. auto OptVD = getVecDesc(TLI, ScalarName, VF); if (!OptVD) return false; const VecDesc *VD = *OptVD; - // Try to find the mapping for the scalar version of this intrinsic and the - // exact vector width of the call operands in the TargetLibraryInfo. 
- StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF, VD->isMasked()); - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" - << ScalarName << "` and vector width " << VF << ".\n"); - - // TLI failed to find a correct mapping. - if (TLIName.empty()) - return false; - - // Find the vector Function and replace the call to the intrinsic with a call - // to the vector library function. - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName - << "`.\n"); + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName + << "` and vector width " << VF << " to: `" + << VD->getVectorFnName() << "`.\n"); + // Replace the call to the intrinsic with a call to the vector library + // function. Type *ScalarRetTy = CI.getType()->getScalarType(); FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false); const std::string MangledName = VD->getVectorFunctionABIVariantString(); @@ -191,17 +175,20 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, if (!OptInfo) return false; - // get the vector FunctionType - Module *M = CI.getModule(); - auto OptFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy); - if (!OptFTy) + FunctionType *VectorFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy); + if (!VectorFTy) return false; - Function *OldFunc = CI.getCalledFunction(); - FunctionType *VectorFTy = *OptFTy; - Function *TLIFunc = getTLIFunction(M, VectorFTy, OldFunc, TLIName); - return replaceWithTLIFunction(M, CI, OptInfo->Shape.VF, OldFunc, TLIFunc, - VectorFTy, VD->isMasked()); + Function *FuncToReplace = CI.getCalledFunction(); + Function *TLIFunc = getTLIFunction(CI.getModule(), VectorFTy, FuncToReplace, + VD->getVectorFnName()); + replaceWithTLIFunction(CI, *OptInfo, TLIFunc, VectorFTy); + + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" + << FuncToReplace->getName() << "` with call to `" + << TLIFunc->getName() << "`.\n"); + ++NumCallsReplaced; + return true; } static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll index baf16f83a3e24..c2ff6014bc694 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll @@ -3,8 +3,6 @@ target triple = "aarch64-unknown-linux-gnu" -; NOTE: The existing TLI mappings are not used since the -replace-with-veclib pass is broken for scalable vectors. - ;. ; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf], section "llvm.metadata" ;. From 5b2e317962d0239d46fc0be2c76556b04cfcd0eb Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Fri, 15 Dec 2023 12:08:37 +0000 Subject: [PATCH 6/8] Addressing review. getTLIFunction is no longer an optional. 
It accepts a pointer for ScalarFunc --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 41 +++++++++++++------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 4ea163e4eaafb..05ceb6d01bc98 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -43,16 +43,16 @@ STATISTIC(NumFuncUsedAdded, "Number of functions added to `llvm.compiler.used`"); /// Returns a vector Function that it adds to the Module \p M. When an \p -/// OptOldFunc is given, it copies its attributes to the newly created Function. +/// ScalarFunc is not null, it copies its attributes to the newly created +/// Function. Function *getTLIFunction(Module *M, FunctionType *VectorFTy, - std::optional OptOldFunc, - const StringRef TLIName) { + Function *ScalarFunc, const StringRef TLIName) { Function *TLIFunc = M->getFunction(TLIName); if (!TLIFunc) { TLIFunc = Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M); - if (OptOldFunc) - TLIFunc->copyAttributesFrom(*OptOldFunc); + if (ScalarFunc) + TLIFunc->copyAttributesFrom(ScalarFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `" << TLIName << "` of type `" << *(TLIFunc->getType()) @@ -60,7 +60,7 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, ++NumTLIFuncDeclAdded; // Add the freshly created function to llvm.compiler.used, similar to as it - // is done in InjectTLIMappings + // is done in InjectTLIMappings. appendToCompilerUsed(*M, {TLIFunc}); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName << "` to `@llvm.compiler.used`.\n"); @@ -72,11 +72,11 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, /// Replace the call to the vector intrinsic ( \p FuncToReplace ) with a call to /// the corresponding function from the vector library ( \p TLIFunc ). static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, - Function *TLIFunc, FunctionType *VecFTy) { + Function *TLIVecFunc) { IRBuilder<> IRBuilder(&CI); SmallVector Args(CI.args()); if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) { - if (Args.size() == VecFTy->getNumParams()) + if (Args.size() == TLIVecFunc->getFunctionType()->getNumParams()) static_assert(true && "mask was already in place"); auto *MaskTy = @@ -88,9 +88,7 @@ static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, // Preserve the operand bundles. SmallVector OpBundles; CI.getOperandBundlesAsDefs(OpBundles); - CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles); - assert(VecFTy == TLIFunc->getFunctionType() && - "Expecting function types to be identical"); + CallInst *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles); CI.replaceAllUsesWith(Replacement); // Preserve fast math flags for FP math. 
if (isa(Replacement)) @@ -102,10 +100,10 @@ static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, static std::optional getVecDesc(const TargetLibraryInfo &TLI, const StringRef &ScalarName, const ElementCount &VF) { - if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) - return VDMasked; if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false)) return VDNoMask; + if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) + return VDMasked; return std::nullopt; } @@ -117,20 +115,20 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, return false; auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); - // Replacement is only performed for intrinsic functions + // Replacement is only performed for intrinsic functions. if (IntrinsicID == Intrinsic::not_intrinsic) return false; // Convert vector arguments to scalar type and check that all vector operands // have identical vector width. ElementCount VF = ElementCount::getFixed(0); - SmallVector ScalarTypes; + SmallVector ScalarArgTypes; for (auto Arg : enumerate(CI.args())) { auto *ArgTy = Arg.value()->getType(); if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { - ScalarTypes.push_back(ArgTy); + ScalarArgTypes.push_back(ArgTy); } else if (auto *VectorArgTy = dyn_cast(ArgTy)) { - ScalarTypes.push_back(ArgTy->getScalarType()); + ScalarArgTypes.push_back(ArgTy->getScalarType()); // Disallow vector arguments with different VFs. When processing the first // vector argument, store it's VF, and for the rest ensure that they match // it. @@ -139,7 +137,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, else if (VF != VectorArgTy->getElementCount()) return false; } else - // enters when it is supposed to be a vector argument but it isn't. + // Exit when it is supposed to be a vector argument but it isn't. return false; } @@ -147,7 +145,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // the intrinsic ID and the argument types converted to scalar above. std::string ScalarName = (Intrinsic::isOverloaded(IntrinsicID) - ? Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule()) + ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, CI.getModule()) : Intrinsic::getName(IntrinsicID).str()); // The TargetLibraryInfo does not contain a vectorized version of the scalar @@ -169,7 +167,8 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // Replace the call to the intrinsic with a call to the vector library // function. 
Type *ScalarRetTy = CI.getType()->getScalarType(); - FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarTypes, false); + FunctionType *ScalarFTy = + FunctionType::get(ScalarRetTy, ScalarArgTypes, /*isVarArg*/ false); const std::string MangledName = VD->getVectorFunctionABIVariantString(); auto OptInfo = VFABI::tryDemangleForVFABI(MangledName, ScalarFTy); if (!OptInfo) @@ -182,7 +181,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, Function *FuncToReplace = CI.getCalledFunction(); Function *TLIFunc = getTLIFunction(CI.getModule(), VectorFTy, FuncToReplace, VD->getVectorFnName()); - replaceWithTLIFunction(CI, *OptInfo, TLIFunc, VectorFTy); + replaceWithTLIFunction(CI, *OptInfo, TLIFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" << FuncToReplace->getName() << "` with call to `" From b2f81eb066c1d945e4a465a94af3ae2892bc8bed Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Wed, 20 Dec 2023 08:51:03 +0000 Subject: [PATCH 7/8] Improved comments and minor code cleanup. --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 79 ++++++++++---------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 05ceb6d01bc98..6e308abc6990a 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -69,61 +69,48 @@ Function *getTLIFunction(Module *M, FunctionType *VectorFTy, return TLIFunc; } -/// Replace the call to the vector intrinsic ( \p FuncToReplace ) with a call to -/// the corresponding function from the vector library ( \p TLIFunc ). -static void replaceWithTLIFunction(CallInst &CI, VFInfo &Info, +/// Replace the call to the vector intrinsic ( \p CalltoReplace ) with a call to +/// the corresponding function from the vector library ( \p TLIVecFunc ). +static void replaceWithTLIFunction(CallInst &CalltoReplace, VFInfo &Info, Function *TLIVecFunc) { - IRBuilder<> IRBuilder(&CI); - SmallVector Args(CI.args()); + IRBuilder<> IRBuilder(&CalltoReplace); + SmallVector Args(CalltoReplace.args()); if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) { - if (Args.size() == TLIVecFunc->getFunctionType()->getNumParams()) - static_assert(true && "mask was already in place"); - - auto *MaskTy = - VectorType::get(Type::getInt1Ty(CI.getContext()), Info.Shape.VF); + auto *MaskTy = VectorType::get(Type::getInt1Ty(CalltoReplace.getContext()), + Info.Shape.VF); Args.insert(Args.begin() + OptMaskpos.value(), Constant::getAllOnesValue(MaskTy)); } // Preserve the operand bundles. SmallVector OpBundles; - CI.getOperandBundlesAsDefs(OpBundles); + CalltoReplace.getOperandBundlesAsDefs(OpBundles); CallInst *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles); - CI.replaceAllUsesWith(Replacement); + CalltoReplace.replaceAllUsesWith(Replacement); // Preserve fast math flags for FP math. if (isa(Replacement)) - Replacement->copyFastMathFlags(&CI); -} - -/// Utility method to get the VecDesc, depending on whether there is such a TLI -/// mapping, prioritizing a masked version. 
-static std::optional getVecDesc(const TargetLibraryInfo &TLI, - const StringRef &ScalarName, - const ElementCount &VF) { - if (auto *VDNoMask = TLI.getVectorMappingInfo(ScalarName, VF, false)) - return VDNoMask; - if (auto *VDMasked = TLI.getVectorMappingInfo(ScalarName, VF, true)) - return VDMasked; - return std::nullopt; + Replacement->copyFastMathFlags(&CalltoReplace); } -/// Returns whether it is able to replace a call to the intrinsic \p CI with a -/// TLI mapped call. +/// Returns true when successfully replaced \p CallToReplace with a suitable +/// function taking vector arguments, based on available mappings in the \p TLI. +/// Currently only works when \p CallToReplace is a call to vectorized +/// intrinsic. static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, - CallInst &CI) { - if (!CI.getCalledFunction()) + CallInst &CallToReplace) { + if (!CallToReplace.getCalledFunction()) return false; - auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); + auto IntrinsicID = CallToReplace.getCalledFunction()->getIntrinsicID(); // Replacement is only performed for intrinsic functions. if (IntrinsicID == Intrinsic::not_intrinsic) return false; - // Convert vector arguments to scalar type and check that all vector operands - // have identical vector width. + // Compute arguments types of the corresponding scalar call. Additionally + // checks if in the vector call, all vector operands have the same EC. ElementCount VF = ElementCount::getFixed(0); SmallVector ScalarArgTypes; - for (auto Arg : enumerate(CI.args())) { + for (auto Arg : enumerate(CallToReplace.args())) { auto *ArgTy = Arg.value()->getType(); if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { ScalarArgTypes.push_back(ArgTy); @@ -145,28 +132,24 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // the intrinsic ID and the argument types converted to scalar above. std::string ScalarName = (Intrinsic::isOverloaded(IntrinsicID) - ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, CI.getModule()) + ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, + CallToReplace.getModule()) : Intrinsic::getName(IntrinsicID).str()); - // The TargetLibraryInfo does not contain a vectorized version of the scalar - // function. - if (!TLI.isFunctionVectorizable(ScalarName)) - return false; - // Try to find the mapping for the scalar version of this intrinsic and the - // exact vector width of the call operands in the TargetLibraryInfo. - auto OptVD = getVecDesc(TLI, ScalarName, VF); - if (!OptVD) + // exact vector width of the call operands in the TargetLibraryInfo. First, + // check with a non-masked variant, and if that fails try with a masked one. + const VecDesc *VD = TLI.getVectorMappingInfo(ScalarName, VF, false); + if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, true))) return false; - const VecDesc *VD = *OptVD; LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName << "` and vector width " << VF << " to: `" << VD->getVectorFnName() << "`.\n"); // Replace the call to the intrinsic with a call to the vector library // function. 
- Type *ScalarRetTy = CI.getType()->getScalarType(); + Type *ScalarRetTy = CallToReplace.getType()->getScalarType(); FunctionType *ScalarFTy = FunctionType::get(ScalarRetTy, ScalarArgTypes, /*isVarArg*/ false); const std::string MangledName = VD->getVectorFunctionABIVariantString(); @@ -178,10 +161,10 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, if (!VectorFTy) return false; - Function *FuncToReplace = CI.getCalledFunction(); - Function *TLIFunc = getTLIFunction(CI.getModule(), VectorFTy, FuncToReplace, - VD->getVectorFnName()); - replaceWithTLIFunction(CI, *OptInfo, TLIFunc); + Function *FuncToReplace = CallToReplace.getCalledFunction(); + Function *TLIFunc = getTLIFunction(CallToReplace.getModule(), VectorFTy, + FuncToReplace, VD->getVectorFnName()); + replaceWithTLIFunction(CallToReplace, *OptInfo, TLIFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" << FuncToReplace->getName() << "` with call to `" From 787baf7aee1ad0df2b33596a0a3f45e829b580c9 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Wed, 20 Dec 2023 15:24:01 +0000 Subject: [PATCH 8/8] Addressing review. --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 6e308abc6990a..893aa4a91828d 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -27,7 +27,6 @@ #include "llvm/IR/InstIterator.h" #include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include using namespace llvm; @@ -46,7 +45,8 @@ STATISTIC(NumFuncUsedAdded, /// ScalarFunc is not null, it copies its attributes to the newly created /// Function. Function *getTLIFunction(Module *M, FunctionType *VectorFTy, - Function *ScalarFunc, const StringRef TLIName) { + const StringRef TLIName, + Function *ScalarFunc = nullptr) { Function *TLIFunc = M->getFunction(TLIName); if (!TLIFunc) { TLIFunc = @@ -139,8 +139,9 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // Try to find the mapping for the scalar version of this intrinsic and the // exact vector width of the call operands in the TargetLibraryInfo. First, // check with a non-masked variant, and if that fails try with a masked one. - const VecDesc *VD = TLI.getVectorMappingInfo(ScalarName, VF, false); - if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, true))) + const VecDesc *VD = + TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ false); + if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ true))) return false; LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName @@ -163,7 +164,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, Function *FuncToReplace = CallToReplace.getCalledFunction(); Function *TLIFunc = getTLIFunction(CallToReplace.getModule(), VectorFTy, - FuncToReplace, VD->getVectorFnName()); + VD->getVectorFnName(), FuncToReplace); replaceWithTLIFunction(CallToReplace, *OptInfo, TLIFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"