Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
DAGCombiner optimization for pow(x,0.75) and pow(x,0.25) on double and single precision even in case massv function is asked
Here, I am proposing to add a special case for the MASSV powf4/powd2 functions (the SIMD counterparts of the powf/pow functions in the MASSV library) in the MASSV pass, so that later optimizations still apply, such as the DAGCombiner conversion of pow(x,0.75) and pow(x,0.25), for double and single precision, into a sequence of sqrt's in the vector float case. My reason for doing this is that the optimized sequence of sqrt's for pow(x,0.75) and pow(x,0.25) is faster than powf4/powd2 on P8 and P9. When MASSV functions are requested and the exponent of pow is 0.75 or 0.25, we will get the sequence of sqrt's; if the exponent is not 0.75 or 0.25, we will get the appropriate MASSV function. Reviewed By: steven.zhang Tags: #LLVM #PowerPC Differential Revision: https://reviews.llvm.org/D80744
- Loading branch information
Showing
3 changed files
with
365 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
; Compile this file with llc using the MASSV vector library, once for pwr9 and | ||
; once for pwr8, and verify the assembly with FileCheck. | ||
; NOTE(review): each invocation enables only its CPU-specific check prefix, so | ||
; the directives below that use the plain prefix (function labels, negative | ||
; checks, blr checks) are presumably never evaluated -- confirm this is intended. | ||
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s | ||
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s | ||
|
||
; Exponent is a variable: the exponent vector is loaded from %y, and the checks | ||
; expect the call to be emitted as the CPU-specific MASSV routine. | ||
; The label directive now names this function (@my_vpow_var); it previously | ||
; named @vspow_var, which is not defined in this file. | ||
define void @my_vpow_var(double* nocapture %z, double* nocapture readonly %y, double* nocapture readonly %x) { | ||
; CHECK-LABEL: @my_vpow_var | ||
; CHECK-PWR9: bl __powd2_P9 | ||
; CHECK-PWR8: bl __powd2_P8 | ||
; CHECK: blr | ||
entry: | ||
br label %vector.body | ||
 |
||
; Each iteration processes two doubles; %index runs from 0 to 1024 in steps of 2. | ||
vector.body: | ||
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] | ||
%next.gep = getelementptr double, double* %z, i64 %index | ||
%next.gep31 = getelementptr double, double* %y, i64 %index | ||
%next.gep32 = getelementptr double, double* %x, i64 %index | ||
; Load the base vector from %x and the exponent vector from %y. | ||
%0 = bitcast double* %next.gep32 to <2 x double>* | ||
%wide.load = load <2 x double>, <2 x double>* %0, align 8 | ||
%1 = bitcast double* %next.gep31 to <2 x double>* | ||
%wide.load33 = load <2 x double>, <2 x double>* %1, align 8 | ||
%2 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> %wide.load33) | ||
; Store the result vector to %z. | ||
%3 = bitcast double* %next.gep to <2 x double>* | ||
store <2 x double> %2, <2 x double>* %3, align 8 | ||
%index.next = add i64 %index, 2 | ||
%4 = icmp eq i64 %index.next, 1024 | ||
br i1 %4, label %for.end, label %vector.body | ||
 |
||
for.end: | ||
ret void | ||
} | ||
|
||
; Exponent is a constant != 0.75 and != 0.25 (a splat of 0.76); the checks expect | ||
; the call to be emitted as the CPU-specific MASSV routine. | ||
; The label directive now names this function (@my_vpow_const); it previously | ||
; named @vspow_const, which is not defined in this file. | ||
define void @my_vpow_const(double* nocapture %y, double* nocapture readonly %x) { | ||
; CHECK-LABEL: @my_vpow_const | ||
; CHECK-PWR9: bl __powd2_P9 | ||
; CHECK-PWR8: bl __powd2_P8 | ||
; CHECK: blr | ||
entry: | ||
br label %vector.body | ||
 |
||
; Each iteration processes two doubles; %index runs from 0 to 1024 in steps of 2. | ||
vector.body: | ||
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] | ||
%next.gep = getelementptr double, double* %y, i64 %index | ||
%next.gep19 = getelementptr double, double* %x, i64 %index | ||
; Load the base vector from %x; the exponent is the constant splat in the call. | ||
%0 = bitcast double* %next.gep19 to <2 x double>* | ||
%wide.load = load <2 x double>, <2 x double>* %0, align 8 | ||
%1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.600000e-01, double 7.600000e-01>) | ||
; Store the result vector to %y. | ||
%2 = bitcast double* %next.gep to <2 x double>* | ||
store <2 x double> %1, <2 x double>* %2, align 8 | ||
%index.next = add i64 %index, 2 | ||
%3 = icmp eq i64 %index.next, 1024 | ||
br i1 %3, label %for.end, label %vector.body | ||
 |
||
for.end: | ||
ret void | ||
} | ||
|
||
; Exponent is 0.75 and the call carries the ninf and afn flags; the checks expect | ||
; no MASSV call and a reciprocal-square-root estimate instruction instead. | ||
; The label directive now names this function (@my_vpow_075); it previously | ||
; named @vspow_075, which is not defined in this file. | ||
; NOTE(review): every directive here uses the plain prefix, while the llc/FileCheck | ||
; invocations at the top of the file enable only the CPU-specific prefixes, so | ||
; these checks are presumably never evaluated -- confirm. | ||
; NOTE(review): xvrsqrtesp looks like the single-precision mnemonic although this | ||
; function operates on <2 x double> -- confirm the expected instruction. | ||
define void @my_vpow_075(double* nocapture %y, double* nocapture readonly %x) { | ||
; CHECK-LABEL: @my_vpow_075 | ||
; CHECK-NOT: bl __powd2_P{{[8,9]}} | ||
; CHECK: xvrsqrtesp | ||
; CHECK: blr | ||
entry: | ||
br label %vector.body | ||
 |
||
; Each iteration processes two doubles; %index runs from 0 to 1024 in steps of 2. | ||
vector.body: | ||
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] | ||
%next.gep = getelementptr double, double* %y, i64 %index | ||
%next.gep19 = getelementptr double, double* %x, i64 %index | ||
; Load the base vector from %x; the exponent is the constant 0.75 splat. | ||
%0 = bitcast double* %next.gep19 to <2 x double>* | ||
%wide.load = load <2 x double>, <2 x double>* %0, align 8 | ||
%1 = call ninf afn <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.500000e-01, double 7.500000e-01>) | ||
; Store the result vector to %y. | ||
%2 = bitcast double* %next.gep to <2 x double>* | ||
store <2 x double> %1, <2 x double>* %2, align 8 | ||
%index.next = add i64 %index, 2 | ||
%3 = icmp eq i64 %index.next, 1024 | ||
br i1 %3, label %for.end, label %vector.body | ||
 |
||
for.end: | ||
ret void | ||
} | ||
|
||
; Exponent is 0.25 and the call carries the ninf, afn and nsz flags; the checks | ||
; expect no MASSV call and a reciprocal-square-root estimate instruction instead. | ||
; The label directive now names this function (@my_vpow_025); it previously | ||
; named @vspow_025, which is not defined in this file. | ||
; NOTE(review): every directive here uses the plain prefix, while the llc/FileCheck | ||
; invocations at the top of the file enable only the CPU-specific prefixes, so | ||
; these checks are presumably never evaluated -- confirm. | ||
; NOTE(review): xvrsqrtesp looks like the single-precision mnemonic although this | ||
; function operates on <2 x double> -- confirm the expected instruction. | ||
define void @my_vpow_025(double* nocapture %y, double* nocapture readonly %x) { | ||
; CHECK-LABEL: @my_vpow_025 | ||
; CHECK-NOT: bl __powd2_P{{[8,9]}} | ||
; CHECK: xvrsqrtesp | ||
; CHECK: blr | ||
entry: | ||
br label %vector.body | ||
 |
||
; Each iteration processes two doubles; %index runs from 0 to 1024 in steps of 2. | ||
vector.body: | ||
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] | ||
%next.gep = getelementptr double, double* %y, i64 %index | ||
%next.gep19 = getelementptr double, double* %x, i64 %index | ||
; Load the base vector from %x; the exponent is the constant 0.25 splat. | ||
%0 = bitcast double* %next.gep19 to <2 x double>* | ||
%wide.load = load <2 x double>, <2 x double>* %0, align 8 | ||
%1 = call ninf afn nsz <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 2.500000e-01, double 2.500000e-01>) | ||
; Store the result vector to %y. | ||
%2 = bitcast double* %next.gep to <2 x double>* | ||
store <2 x double> %1, <2 x double>* %2, align 8 | ||
%index.next = add i64 %index, 2 | ||
%3 = icmp eq i64 %index.next, 1024 | ||
br i1 %3, label %for.end, label %vector.body | ||
 |
||
for.end: | ||
ret void | ||
} | ||
|
||
; Exponent is 0.75 but the call carries no fast-math flags; the checks expect the | ||
; CPU-specific MASSV routine to be called and no estimate instruction before blr. | ||
; The label directive now names this function (@my_vpow_075_nofast); it | ||
; previously named @vspow_075_nofast, which is not defined in this file. | ||
define void @my_vpow_075_nofast(double* nocapture %y, double* nocapture readonly %x) { | ||
; CHECK-LABEL: @my_vpow_075_nofast | ||
; CHECK-PWR9: bl __powd2_P9 | ||
; CHECK-PWR8: bl __powd2_P8 | ||
; CHECK-NOT: xvrsqrtesp | ||
; CHECK: blr | ||
entry: | ||
br label %vector.body | ||
 |
||
; Each iteration processes two doubles; %index runs from 0 to 1024 in steps of 2. | ||
vector.body: | ||
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] | ||
%next.gep = getelementptr double, double* %y, i64 %index | ||
%next.gep19 = getelementptr double, double* %x, i64 %index | ||
; Load the base vector from %x; the exponent is the constant 0.75 splat. | ||
%0 = bitcast double* %next.gep19 to <2 x double>* | ||
%wide.load = load <2 x double>, <2 x double>* %0, align 8 | ||
%1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 7.500000e-01, double 7.500000e-01>) | ||
; Store the result vector to %y. | ||
%2 = bitcast double* %next.gep to <2 x double>* | ||
store <2 x double> %1, <2 x double>* %2, align 8 | ||
%index.next = add i64 %index, 2 | ||
%3 = icmp eq i64 %index.next, 1024 | ||
br i1 %3, label %for.end, label %vector.body | ||
 |
||
for.end: | ||
ret void | ||
} | ||
|
||
; Exponent is 0.25 but the call carries no fast-math flags; the checks expect the | ||
; CPU-specific MASSV routine to be called and no estimate instruction before blr. | ||
; The label directive now names this function (@my_vpow_025_nofast); it | ||
; previously named @vspow_025_nofast, which is not defined in this file. | ||
define void @my_vpow_025_nofast(double* nocapture %y, double* nocapture readonly %x) { | ||
; CHECK-LABEL: @my_vpow_025_nofast | ||
; CHECK-PWR9: bl __powd2_P9 | ||
; CHECK-PWR8: bl __powd2_P8 | ||
; CHECK-NOT: xvrsqrtesp | ||
; CHECK: blr | ||
entry: | ||
br label %vector.body | ||
 |
||
; Each iteration processes two doubles; %index runs from 0 to 1024 in steps of 2. | ||
vector.body: | ||
%index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] | ||
%next.gep = getelementptr double, double* %y, i64 %index | ||
%next.gep19 = getelementptr double, double* %x, i64 %index | ||
; Load the base vector from %x; the exponent is the constant 0.25 splat. | ||
%0 = bitcast double* %next.gep19 to <2 x double>* | ||
%wide.load = load <2 x double>, <2 x double>* %0, align 8 | ||
%1 = call <2 x double> @__powd2_massv(<2 x double> %wide.load, <2 x double> <double 2.500000e-01, double 2.500000e-01>) | ||
; Store the result vector to %y. | ||
%2 = bitcast double* %next.gep to <2 x double>* | ||
store <2 x double> %1, <2 x double>* %2, align 8 | ||
%index.next = add i64 %index, 2 | ||
%3 = icmp eq i64 %index.next, 1024 | ||
br i1 %3, label %for.end, label %vector.body | ||
 |
||
for.end: | ||
ret void | ||
} | ||
|
||
; Function Attrs: nounwind readnone speculatable willreturn | ||
; External declaration of the vector pow routine called by every test function | ||
; in this file: two <2 x double> operands in, one <2 x double> result out. | ||
; NOTE(review): attribute group #1 is referenced here, but no definition of it is | ||
; visible in this file -- confirm the attributes listed above are actually applied. | ||
declare <2 x double> @__powd2_massv(<2 x double>, <2 x double>) #1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
; Compile this file with llc using the MASSV vector library, once for pwr9 and | ||
; once for pwr8, and verify the assembly with FileCheck. | ||
; NOTE(review): each invocation enables only its CPU-specific check prefix, so | ||
; the directives below that use the plain prefix (function labels, negative | ||
; checks, blr checks) are presumably never evaluated -- confirm this is intended. | ||
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 | FileCheck -check-prefixes=CHECK-PWR9 %s | ||
; RUN: llc -vector-library=MASSV < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr8 | FileCheck -check-prefixes=CHECK-PWR8 %s | ||
|
||
; Variable exponent: both pow operands are loaded from memory (%x is the base, | ||
; %y the exponent), and the checks expect the CPU-specific MASSV routine. | ||
define void @vspow_var(float* nocapture %z, float* nocapture readonly %y, float* nocapture readonly %x) { | ||
; CHECK-LABEL: @vspow_var | ||
; CHECK-PWR9: bl __powf4_P9 | ||
; CHECK-PWR8: bl __powf4_P8 | ||
; CHECK: blr | ||
preheader: | ||
br label %loop | ||
 |
||
; One iteration handles four floats; %iv counts elements from 0 up to 1024. | ||
loop: | ||
%iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ] | ||
%dst.elt = getelementptr float, float* %z, i64 %iv | ||
%exp.elt = getelementptr float, float* %y, i64 %iv | ||
%src.elt = getelementptr float, float* %x, i64 %iv | ||
%src.vec = bitcast float* %src.elt to <4 x float>* | ||
%base = load <4 x float>, <4 x float>* %src.vec, align 4 | ||
%exp.vec = bitcast float* %exp.elt to <4 x float>* | ||
%exponent = load <4 x float>, <4 x float>* %exp.vec, align 4 | ||
%result = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %base, <4 x float> %exponent) | ||
%dst.vec = bitcast float* %dst.elt to <4 x float>* | ||
store <4 x float> %result, <4 x float>* %dst.vec, align 4 | ||
%iv.next = add i64 %iv, 4 | ||
%done = icmp eq i64 %iv.next, 1024 | ||
br i1 %done, label %exit, label %loop | ||
 |
||
exit: | ||
ret void | ||
} | ||
|
||
; Constant exponent that is neither 0.75 nor 0.25 (the splat constant passed to | ||
; the call below); the checks expect the CPU-specific MASSV routine. | ||
define void @vspow_const(float* nocapture %y, float* nocapture readonly %x) { | ||
; CHECK-LABEL: @vspow_const | ||
; CHECK-PWR9: bl __powf4_P9 | ||
; CHECK-PWR8: bl __powf4_P8 | ||
; CHECK: blr | ||
preheader: | ||
br label %loop | ||
 |
||
; One iteration handles four floats; %iv counts elements from 0 up to 1024. | ||
loop: | ||
%iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ] | ||
%dst.elt = getelementptr float, float* %y, i64 %iv | ||
%src.elt = getelementptr float, float* %x, i64 %iv | ||
%src.vec = bitcast float* %src.elt to <4 x float>* | ||
%base = load <4 x float>, <4 x float>* %src.vec, align 4 | ||
%result = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %base, <4 x float> <float 0x3FE851EB80000000, float 0x3FE851EB80000000, float 0x3FE851EB80000000, float 0x3FE851EB80000000>) | ||
%dst.vec = bitcast float* %dst.elt to <4 x float>* | ||
store <4 x float> %result, <4 x float>* %dst.vec, align 4 | ||
%iv.next = add i64 %iv, 4 | ||
%done = icmp eq i64 %iv.next, 1024 | ||
br i1 %done, label %exit, label %loop | ||
 |
||
exit: | ||
ret void | ||
} | ||
|
||
; Exponent splat of 0.75 with the ninf and afn flags on the call; the checks | ||
; expect no MASSV call and an xvrsqrtesp instruction instead. | ||
; NOTE(review): every directive here uses the plain prefix, while the llc/FileCheck | ||
; invocations at the top of the file enable only the CPU-specific prefixes, so | ||
; these checks are presumably never evaluated -- confirm. | ||
define void @vspow_075(float* nocapture %y, float* nocapture readonly %x) { | ||
; CHECK-LABEL: @vspow_075 | ||
; CHECK-NOT: bl __powf4_P{{[8,9]}} | ||
; CHECK: xvrsqrtesp | ||
; CHECK: blr | ||
preheader: | ||
br label %loop | ||
 |
||
; One iteration handles four floats; %iv counts elements from 0 up to 1024. | ||
loop: | ||
%iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ] | ||
%dst.elt = getelementptr float, float* %y, i64 %iv | ||
%src.elt = getelementptr float, float* %x, i64 %iv | ||
%src.vec = bitcast float* %src.elt to <4 x float>* | ||
%base = load <4 x float>, <4 x float>* %src.vec, align 4 | ||
%result = call ninf afn <4 x float> @__powf4_massv(<4 x float> %base, <4 x float> <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>) | ||
%dst.vec = bitcast float* %dst.elt to <4 x float>* | ||
store <4 x float> %result, <4 x float>* %dst.vec, align 4 | ||
%iv.next = add i64 %iv, 4 | ||
%done = icmp eq i64 %iv.next, 1024 | ||
br i1 %done, label %exit, label %loop | ||
 |
||
exit: | ||
ret void | ||
} | ||
|
||
; Exponent splat of 0.25 with the ninf, afn and nsz flags on the call; the checks | ||
; expect no MASSV call and an xvrsqrtesp instruction instead. | ||
; NOTE(review): every directive here uses the plain prefix, while the llc/FileCheck | ||
; invocations at the top of the file enable only the CPU-specific prefixes, so | ||
; these checks are presumably never evaluated -- confirm. | ||
define void @vspow_025(float* nocapture %y, float* nocapture readonly %x) { | ||
; CHECK-LABEL: @vspow_025 | ||
; CHECK-NOT: bl __powf4_P{{[8,9]}} | ||
; CHECK: xvrsqrtesp | ||
; CHECK: blr | ||
preheader: | ||
br label %loop | ||
 |
||
; One iteration handles four floats; %iv counts elements from 0 up to 1024. | ||
loop: | ||
%iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ] | ||
%dst.elt = getelementptr float, float* %y, i64 %iv | ||
%src.elt = getelementptr float, float* %x, i64 %iv | ||
%src.vec = bitcast float* %src.elt to <4 x float>* | ||
%base = load <4 x float>, <4 x float>* %src.vec, align 4 | ||
%result = call ninf afn nsz <4 x float> @__powf4_massv(<4 x float> %base, <4 x float> <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>) | ||
%dst.vec = bitcast float* %dst.elt to <4 x float>* | ||
store <4 x float> %result, <4 x float>* %dst.vec, align 4 | ||
%iv.next = add i64 %iv, 4 | ||
%done = icmp eq i64 %iv.next, 1024 | ||
br i1 %done, label %exit, label %loop | ||
 |
||
exit: | ||
ret void | ||
} | ||
|
||
; Exponent splat of 0.75, but the call carries no fast-math flags; the checks | ||
; expect the CPU-specific MASSV routine and no xvrsqrtesp before blr. | ||
define void @vspow_075_nofast(float* nocapture %y, float* nocapture readonly %x) { | ||
; CHECK-LABEL: @vspow_075_nofast | ||
; CHECK-PWR9: bl __powf4_P9 | ||
; CHECK-PWR8: bl __powf4_P8 | ||
; CHECK-NOT: xvrsqrtesp | ||
; CHECK: blr | ||
preheader: | ||
br label %loop | ||
 |
||
; One iteration handles four floats; %iv counts elements from 0 up to 1024. | ||
loop: | ||
%iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ] | ||
%dst.elt = getelementptr float, float* %y, i64 %iv | ||
%src.elt = getelementptr float, float* %x, i64 %iv | ||
%src.vec = bitcast float* %src.elt to <4 x float>* | ||
%base = load <4 x float>, <4 x float>* %src.vec, align 4 | ||
%result = call <4 x float> @__powf4_massv(<4 x float> %base, <4 x float> <float 7.500000e-01, float 7.500000e-01, float 7.500000e-01, float 7.500000e-01>) | ||
%dst.vec = bitcast float* %dst.elt to <4 x float>* | ||
store <4 x float> %result, <4 x float>* %dst.vec, align 4 | ||
%iv.next = add i64 %iv, 4 | ||
%done = icmp eq i64 %iv.next, 1024 | ||
br i1 %done, label %exit, label %loop | ||
 |
||
exit: | ||
ret void | ||
} | ||
|
||
; Exponent splat of 0.25, but the call carries no fast-math flags; the checks | ||
; expect the CPU-specific MASSV routine and no xvrsqrtesp before blr. | ||
define void @vspow_025_nofast(float* nocapture %y, float* nocapture readonly %x) { | ||
; CHECK-LABEL: @vspow_025_nofast | ||
; CHECK-PWR9: bl __powf4_P9 | ||
; CHECK-PWR8: bl __powf4_P8 | ||
; CHECK-NOT: xvrsqrtesp | ||
; CHECK: blr | ||
preheader: | ||
br label %loop | ||
 |
||
; One iteration handles four floats; %iv counts elements from 0 up to 1024. | ||
loop: | ||
%iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ] | ||
%dst.elt = getelementptr float, float* %y, i64 %iv | ||
%src.elt = getelementptr float, float* %x, i64 %iv | ||
%src.vec = bitcast float* %src.elt to <4 x float>* | ||
%base = load <4 x float>, <4 x float>* %src.vec, align 4 | ||
%result = call <4 x float> @__powf4_massv(<4 x float> %base, <4 x float> <float 2.500000e-01, float 2.500000e-01, float 2.500000e-01, float 2.500000e-01>) | ||
%dst.vec = bitcast float* %dst.elt to <4 x float>* | ||
store <4 x float> %result, <4 x float>* %dst.vec, align 4 | ||
%iv.next = add i64 %iv, 4 | ||
%done = icmp eq i64 %iv.next, 1024 | ||
br i1 %done, label %exit, label %loop | ||
 |
||
exit: | ||
ret void | ||
} | ||
|
||
; Function Attrs: nounwind readnone speculatable willreturn | ||
; External declaration of the vector pow routine called by every test function | ||
; in this file: two <4 x float> operands in, one <4 x float> result out. | ||
; NOTE(review): the attribute comment above is not backed by any attribute list | ||
; or attribute-group reference on this declaration -- confirm whether one is needed. | ||
declare <4 x float> @__powf4_massv(<4 x float>, <4 x float>) |