Skip to content

Commit

Permalink
DAGCombiner optimization for pow(x,0.75) and pow(x,0.25) on double an…
Browse files Browse the repository at this point in the history
…d single precision even in case massv function is asked

Here, I am proposing to add a special case for the MASSV powf4/powd2 functions (the SIMD counterparts of powf/pow in the MASSV library) to the MASSV pass, so that later optimizations can still apply — in particular the DAGCombiner conversion of pow(x,0.75) and pow(x,0.25), for double and single precision, into a sequence of sqrt's in the vector case. My reason for doing this is that the sqrt sequence produced for pow(x,0.75) and pow(x,0.25) is faster than powf4/powd2 on P8 and P9.

When a MASSV function is called and the exponent of pow is 0.75 or 0.25, we will get the sequence of sqrt's; if the exponent is not 0.75 or 0.25, we will get the appropriate MASSV function.

Reviewed By: steven.zhang

Tags: #LLVM #PowerPC

Differential Revision: https://reviews.llvm.org/D80744
  • Loading branch information
msdataei committed Jun 12, 2020
1 parent 5509e2c commit 2d03837
Show file tree
Hide file tree
Showing 3 changed files with 365 additions and 0 deletions.
33 changes: 33 additions & 0 deletions llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
Expand Up @@ -54,6 +54,7 @@ class PPCLowerMASSVEntries : public ModulePass {
static StringRef getCPUSuffix(const PPCSubtarget *Subtarget);
static std::string createMASSVFuncName(Function &Func,
const PPCSubtarget *Subtarget);
bool handlePowSpecialCases(CallInst *CI, Function &Func, Module &M);
bool lowerMASSVCall(CallInst *CI, Function &Func, Module &M,
const PPCSubtarget *Subtarget);
};
Expand Down Expand Up @@ -96,6 +97,34 @@ PPCLowerMASSVEntries::createMASSVFuncName(Function &Func,
return MASSVEntryName;
}

/// Redirects a generic MASSV vector pow call to the llvm.pow intrinsic when
/// the exponent is a constant splat of 0.25 or 0.75 and the call carries the
/// required fast-math flags.
///
/// Only calls whose callee \p Func is named __powf4_massv or __powd2_massv are
/// considered. The call must have the ninf and afn flags; an exponent of 0.25
/// additionally requires nsz.
///
/// \param CI   The call instruction to inspect; its callee is replaced in place
///             on success.
/// \param Func The function being called by \p CI.
/// \param M    The module in which the llvm.pow declaration is looked up or
///             created.
/// \returns true if the callee of \p CI was replaced with llvm.pow, false if
///          the call was left untouched.
bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func,
                                                 Module &M) {
  if (Func.getName() != "__powf4_massv" && Func.getName() != "__powd2_massv")
    return false;

  if (Constant *Exp = dyn_cast<Constant>(CI->getArgOperand(1)))
    // getSplatValue() yields a null pointer when the constant is not a splat
    // (e.g. <0.25, 0.75>), so the null-tolerant cast must be used here; a
    // plain dyn_cast asserts on a null argument.
    if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(Exp->getSplatValue())) {
      // If the argument is 0.75 or 0.25 it is cheaper to turn it into pow
      // intrinsic so that it could be optimized as sequence of sqrt's.
      if (!CI->hasNoInfs() || !CI->hasApproxFunc())
        return false;

      if (!CFP->isExactlyValue(0.75) && !CFP->isExactlyValue(0.25))
        return false;

      // The 0.25 case is only taken when signed zeros may be ignored.
      if (CFP->isExactlyValue(0.25) && !CI->hasNoSignedZeros())
        return false;

      CI->setCalledFunction(
          Intrinsic::getDeclaration(&M, Intrinsic::pow, CI->getType()));
      return true;
    }

  return false;
}

/// Lowers generic MASSV entries to PowerPC subtarget-specific MASSV entries.
/// e.g.: __sind2_massv --> __sind2_P9 for a Power9 subtarget.
/// Both function prototypes and their callsites are updated during lowering.
Expand All @@ -105,6 +134,10 @@ bool PPCLowerMASSVEntries::lowerMASSVCall(CallInst *CI, Function &Func,
if (CI->use_empty())
return false;

// Handling pow(x, 0.25), pow(x, 0.75), powf(x, 0.25), powf(x, 0.75)
if (handlePowSpecialCases(CI, Func, M))
return true;

std::string MASSVEntryName = createMASSVFuncName(Func, Subtarget);
FunctionCallee FCache = M.getOrInsertFunction(
MASSVEntryName, Func.getFunctionType(), Func.getAttributes());
Expand Down
166 changes: 166 additions & 0 deletions llvm/test/CodeGen/PowerPC/pow_massv_075_025exp.ll
@@ -0,0 +1,166 @@
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s

; Exponent is a variable
define void @my_vpow_var(double* nocapture %z, double* nocapture readonly %y, double* nocapture readonly %x) {
; CHECK-LABEL: @my_vpow_var
; CHECK-PWR9: bl __powd2_P9
; CHECK-PWR8: bl __powd2_P8
; CHECK: blr
; The exponent operand is a vector loaded from %y, not a constant, so the
; call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 2, computing pow(x[i], y[i]) into z[i]
; two doubles at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr double, double* %z, i64 %index
%next.gep31 = getelementptr double, double* %y, i64 %index
%next.gep32 = getelementptr double, double* %x, i64 %index
%0 = bitcast double* %next.gep32 to <2 x double>*
%wide.load = load <2 x double>, <2 x double>* %0, align 8
%1 = bitcast double* %next.gep31 to <2 x double>*
%wide.load33 = load <2 x double>, <2 x double>* %1, align 8
%2 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> %wide.load33)
%3 = bitcast double* %next.gep to <2 x double>*
store <2 x double> %2, <2 x double>* %3, align 8
%index.next = add i64 %index, 2
%4 = icmp eq i64 %index.next, 1024
br i1 %4, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is a constant != 0.75 and !=0.25
define void @my_vpow_const(double* nocapture %y, double* nocapture readonly %x) {
; CHECK-LABEL: @my_vpow_const
; CHECK-PWR9: bl __powd2_P9
; CHECK-PWR8: bl __powd2_P8
; CHECK: blr
; The exponent is the splat constant 0.76, which is neither 0.25 nor 0.75, so
; the call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 2, computing pow(x[i], 0.76) into y[i]
; two doubles at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr double, double* %y, i64 %index
%next.gep19 = getelementptr double, double* %x, i64 %index
%0 = bitcast double* %next.gep19 to <2 x double>*
%wide.load = load <2 x double>, <2 x double>* %0, align 8
%1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.600000e-01, double 7.600000e-01>)
%2 = bitcast double* %next.gep to <2 x double>*
store <2 x double> %1, <2 x double>* %2, align 8
%index.next = add i64 %index, 2
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.75
define void @my_vpow_075(double* nocapture %y, double* nocapture readonly %x) {
; CHECK-LABEL: @my_vpow_075
; CHECK-NOT: bl __powd2_P{{[8,9]}}
; CHECK: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.75 and the call carries ninf and afn,
; so no MASSV call is expected in the output.
; NOTE(review): the RUN lines of this file enable only the PWR8/PWR9-suffixed
; prefixes, so the unsuffixed directives above look unchecked - confirm.
; NOTE(review): xvrsqrtesp is presumably the single-precision estimate; for a
; <2 x double> input the double-precision form may be what is emitted - verify.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 2, computing pow(x[i], 0.75) into y[i]
; two doubles at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr double, double* %y, i64 %index
%next.gep19 = getelementptr double, double* %x, i64 %index
%0 = bitcast double* %next.gep19 to <2 x double>*
%wide.load = load <2 x double>, <2 x double>* %0, align 8
%1 = call ninf afn <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.500000e-01, double 7.500000e-01>)
%2 = bitcast double* %next.gep to <2 x double>*
store <2 x double> %1, <2 x double>* %2, align 8
%index.next = add i64 %index, 2
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.25
define void @my_vpow_025(double* nocapture %y, double* nocapture readonly %x) {
; CHECK-LABEL: @my_vpow_025
; CHECK-NOT: bl __powd2_P{{[8,9]}}
; CHECK: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.25 and the call carries ninf, afn and
; nsz, so no MASSV call is expected in the output.
; NOTE(review): the RUN lines of this file enable only the PWR8/PWR9-suffixed
; prefixes, so the unsuffixed directives above look unchecked - confirm.
; NOTE(review): xvrsqrtesp is presumably the single-precision estimate; for a
; <2 x double> input the double-precision form may be what is emitted - verify.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 2, computing pow(x[i], 0.25) into y[i]
; two doubles at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr double, double* %y, i64 %index
%next.gep19 = getelementptr double, double* %x, i64 %index
%0 = bitcast double* %next.gep19 to <2 x double>*
%wide.load = load <2 x double>, <2 x double>* %0, align 8
%1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 2.500000e-01, double 2.500000e-01>)
%2 = bitcast double* %next.gep to <2 x double>*
store <2 x double> %1, <2 x double>* %2, align 8
%index.next = add i64 %index, 2
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.75 but no proper fast-math flags
define void @my_vpow_075_nofast(double* nocapture %y, double* nocapture readonly %x) {
; CHECK-LABEL: @my_vpow_075_nofast
; CHECK-PWR9: bl __powd2_P9
; CHECK-PWR8: bl __powd2_P8
; CHECK-NOT: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.75 but the call has no fast-math flags,
; so the call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 2, computing pow(x[i], 0.75) into y[i]
; two doubles at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr double, double* %y, i64 %index
%next.gep19 = getelementptr double, double* %x, i64 %index
%0 = bitcast double* %next.gep19 to <2 x double>*
%wide.load = load <2 x double>, <2 x double>* %0, align 8
%1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.500000e-01, double 7.500000e-01>)
%2 = bitcast double* %next.gep to <2 x double>*
store <2 x double> %1, <2 x double>* %2, align 8
%index.next = add i64 %index, 2
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.25 but no proper fast-math flags
define void @my_vpow_025_nofast(double* nocapture %y, double* nocapture readonly %x) {
; CHECK-LABEL: @my_vpow_025_nofast
; CHECK-PWR9: bl __powd2_P9
; CHECK-PWR8: bl __powd2_P8
; CHECK-NOT: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.25 but the call has no fast-math flags,
; so the call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 2, computing pow(x[i], 0.25) into y[i]
; two doubles at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr double, double* %y, i64 %index
%next.gep19 = getelementptr double, double* %x, i64 %index
%0 = bitcast double* %next.gep19 to <2 x double>*
%wide.load = load <2 x double>, <2 x double>* %0, align 8
%1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 2.500000e-01, double 2.500000e-01>)
%2 = bitcast double* %next.gep to <2 x double>*
store <2 x double> %1, <2 x double>* %2, align 8
%index.next = add i64 %index, 2
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <2 x double> @__powd2_massv(<2 x double>, <2 x double>) #1
166 changes: 166 additions & 0 deletions llvm/test/CodeGen/PowerPC/powf_massv_075_025exp.ll
@@ -0,0 +1,166 @@
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s

; Exponent is a variable
define void @vspow_var(float* nocapture %z, float* nocapture readonly %y, float* nocapture readonly %x) {
; CHECK-LABEL: @vspow_var
; CHECK-PWR9: bl __powf4_P9
; CHECK-PWR8: bl __powf4_P8
; CHECK: blr
; The exponent operand is a vector loaded from %y, not a constant, so the
; call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 4, computing pow(x[i], y[i]) into z[i]
; four floats at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr float, float* %z, i64 %index
%next.gep31 = getelementptr float, float* %y, i64 %index
%next.gep32 = getelementptr float, float* %x, i64 %index
%0 = bitcast float* %next.gep32 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %0, align 4
%1 = bitcast float* %next.gep31 to <4 x float>*
%wide.load33 = load <4 x float>, <4 x float>* %1, align 4
%2 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> %wide.load33)
%3 = bitcast float* %next.gep to <4 x float>*
store <4 x float> %2, <4 x float>* %3, align 4
%index.next = add i64 %index, 4
%4 = icmp eq i64 %index.next, 1024
br i1 %4, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is a constant != 0.75 and !=0.25
define void @vspow_const(float* nocapture %y, float* nocapture readonly %x) {
; CHECK-LABEL: @vspow_const
; CHECK-PWR9: bl __powf4_P9
; CHECK-PWR8: bl __powf4_P8
; CHECK: blr
; The exponent is a splat constant that is neither 0.25 nor 0.75 (presumably
; 0.76 rounded to float, matching the double-precision test - confirm), so the
; call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 4, storing pow(x[i], C) into y[i]
; four floats at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr float, float* %y, i64 %index
%next.gep19 = getelementptr float, float* %x, i64 %index
%0 = bitcast float* %next.gep19 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %0, align 4
%1 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 0x3FE851EB80000000, float 0x3FE851EB80000000, float 0x3FE851EB80000000, float 0x3FE851EB80000000>)
%2 = bitcast float* %next.gep to <4 x float>*
store <4 x float> %1, <4 x float>* %2, align 4
%index.next = add i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.75
define void @vspow_075(float* nocapture %y, float* nocapture readonly %x) {
; CHECK-LABEL: @vspow_075
; CHECK-NOT: bl __powf4_P{{[8,9]}}
; CHECK: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.75 and the call carries ninf and afn
; (nsz is not set), so no MASSV call is expected in the output.
; NOTE(review): the RUN lines of this file enable only the PWR8/PWR9-suffixed
; prefixes, so the unsuffixed directives above look unchecked - confirm.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 4, computing pow(x[i], 0.75) into y[i]
; four floats at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr float, float* %y, i64 %index
%next.gep19 = getelementptr float, float* %x, i64 %index
%0 = bitcast float* %next.gep19 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %0, align 4
%1 = call ninf afn <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>)
%2 = bitcast float* %next.gep to <4 x float>*
store <4 x float> %1, <4 x float>* %2, align 4
%index.next = add i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.25
define void @vspow_025(float* nocapture %y, float* nocapture readonly %x) {
; CHECK-LABEL: @vspow_025
; CHECK-NOT: bl __powf4_P{{[8,9]}}
; CHECK: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.25 and the call carries ninf, afn and
; nsz, so no MASSV call is expected in the output.
; NOTE(review): the RUN lines of this file enable only the PWR8/PWR9-suffixed
; prefixes, so the unsuffixed directives above look unchecked - confirm.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 4, computing pow(x[i], 0.25) into y[i]
; four floats at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr float, float* %y, i64 %index
%next.gep19 = getelementptr float, float* %x, i64 %index
%0 = bitcast float* %next.gep19 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %0, align 4
%1 = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>)
%2 = bitcast float* %next.gep to <4 x float>*
store <4 x float> %1, <4 x float>* %2, align 4
%index.next = add i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.75 but no proper fast-math flags
define void @vspow_075_nofast(float* nocapture %y, float* nocapture readonly %x) {
; CHECK-LABEL: @vspow_075_nofast
; CHECK-PWR9: bl __powf4_P9
; CHECK-PWR8: bl __powf4_P8
; CHECK-NOT: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.75 but the call has no fast-math flags,
; so the call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 4, computing pow(x[i], 0.75) into y[i]
; four floats at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr float, float* %y, i64 %index
%next.gep19 = getelementptr float, float* %x, i64 %index
%0 = bitcast float* %next.gep19 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %0, align 4
%1 = call <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>)
%2 = bitcast float* %next.gep to <4 x float>*
store <4 x float> %1, <4 x float>* %2, align 4
%index.next = add i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Exponent is 0.25 but no proper fast-math flags
define void @vspow_025_nofast(float* nocapture %y, float* nocapture readonly %x) {
; CHECK-LABEL: @vspow_025_nofast
; CHECK-PWR9: bl __powf4_P9
; CHECK-PWR8: bl __powf4_P8
; CHECK-NOT: xvrsqrtesp
; CHECK: blr
; The exponent is the splat constant 0.25 but the call has no fast-math flags,
; so the call is expected to remain a subtarget-specific MASSV call.
entry:
br label %vector.body

; Walks indices 0 up to 1024 in steps of 4, computing pow(x[i], 0.25) into y[i]
; four floats at a time.
vector.body:
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
%next.gep = getelementptr float, float* %y, i64 %index
%next.gep19 = getelementptr float, float* %x, i64 %index
%0 = bitcast float* %next.gep19 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %0, align 4
%1 = call <4 x float> @__powf4_massv(<4 x float> %wide.load, <4 x float> <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>)
%2 = bitcast float* %next.gep to <4 x float>*
store <4 x float> %1, <4 x float>* %2, align 4
%index.next = add i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %for.end, label %vector.body

for.end:
ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @__powf4_massv(<4 x float>, <4 x float>)

0 comments on commit 2d03837

Please sign in to comment.