Skip to content

Commit

Permalink
prevent statement reordering
Browse files Browse the repository at this point in the history
  • Loading branch information
baruxu committed Jun 5, 2024
1 parent 9085daa commit 90a8bbf
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 53 deletions.
47 changes: 2 additions & 45 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,29 +46,8 @@ fixed64 is designed to find a balance between precision and performance. It is s
- supports accelerated multiplication and division by integers
### Performance
- forceinline
- no overflow
- no hardware int128
Intel Core i9-12900K 3.2GHz
|Arithmetic|Fixed64|Hardware Float|
|-|:-:|:-:|
|Addition/Subtraction|0.027 ns|0.433 ns|
|Multiplication|2.621 ns|0.837 ns|
|Division|1.316 ns|2.784 ns|
Apple M1 pro
|Arithmetic|Fixed64|Hardware Float|
|-|:-:|:-:|
|Addition/Subtraction|0.000001 ns*|0.953 ns|
|Multiplication|4.057 ns|1.246 ns|
|Division|1.102 ns|3.144 ns|
* The result is computed with random operands, so it cannot be evaluated at compile time.
see more in ``benchmark.cpp``
see more in [benchmark](https://github.com/nustxujun/FixedPoint64/blob/main/benchmark/benchmark.md)
### Supported Switcher
```c++
#define FIXED_64_ENABLE_ROUNDING // apply rounding
Expand Down Expand Up @@ -131,31 +110,9 @@ fixed64综合考虑了精度与性能的问题,使用了int64存储。个人
- 支持与整型的乘除法加速
### Performance
- 开启强制内联
- 无溢出检测
- 无硬件int128支持
Intel Core i9-12900K 3.2GHz
|算数操作|定点数|系统浮点数|
|-|:-:|:-:|
|加/减|0.027 ns|0.433 ns|
|乘|2.621 ns|0.837 ns|
|除|1.316 ns|2.784 ns|
Apple M1 pro
|算数操作|定点数|系统浮点数|
|-|:-:|:-:|
|加/减|0.000001 ns*|0.953 ns|
|乘|4.057 ns|1.246 ns|
|除|1.102 ns|3.144 ns|
* 计算数值是随机的,不可能是编译期计算出来的
具体参考[benchmark](https://github.com/nustxujun/FixedPoint64/blob/main/benchmark/benchmark.md)
具体请参考``benchmark.cpp``
### 开关
```c++
#define FIXED_64_ENABLE_ROUNDING // 使用四舍五入
Expand Down
36 changes: 28 additions & 8 deletions benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ struct Operand
fixed fa;
fixed fb;

inline Operand(float Min, float Max)
inline Operand(fp Min, fp Max)
{
std::uniform_real_distribution<fp> u(Min, Max);
std::uniform_real_distribution<double> u(Min, Max);
a = u(e) ;
b = u(e);
fa = a;
Expand Down Expand Up @@ -84,13 +84,27 @@ FIXED_64_FORCEINLINE void PreventOptimizedAway(fixed val)
EXPR1;\
}

#pragma optimize("",off) // prevent statement reordering
// prevent statement reordering
#ifdef _MSC_VER
#pragma optimize("",off)
#elif defined(__clang__)
#pragma clang optimize off
#else
#pragma GCC push_options
#pragma GCC optimize ("O0")
#endif
// Calls `f` with the operand pair (a, b), both passed by reference.
// This wrapper is defined between the optimize-off / optimize-on pragmas
// around it, whose stated purpose is to prevent statement reordering.
template<class T>
void run_test(T& a, T& b, std::function<void(T&, T&)>&& f)
{
// The single call is the whole body; keep it as-is.
f(a,b);
}
#pragma optimize("",on)
#ifdef _MSC_VER
#pragma optimize("",on)
#elif defined(__clang__)
#pragma clang optimize on
#else
#pragma GCC pop_options
#endif


#define RUN_TEST(EXPR1, EXPR2, COUNT, Min, Max) \
Expand All @@ -114,21 +128,25 @@ struct TestGroup
std::string name;
uint64_t num_batch;
uint64_t count;
fp min;
fp max;
// Initializes a test group: stores the name, num_batch, count and the
// [min, max] operand range, zeroes both entries of totals, and prints the
// group name with its range.
TestGroup(std::string n, uint64_t num, uint64_t c, fp min, fp max)
{
// The parameters min/max shadow the members of the same name, hence this->.
this->min = min;
this->max = max;
name = n;
num_batch = num;
count = c;
// Start both totals slots from zero.
totals[0] = 0;
totals[1] = 0;
printf("%s [%f, %f]\n", name.c_str(), min, max);
}

~TestGroup()
{
printf("hard float: %lf ns, fixed point: %lf ns\n\n",
double(totals[0]) /count / num_batch
,double(totals[1]) / count / num_batch
printf("%16s[%6.1f, %6.1f]| %3.4lf ns | %3.4lf ns |\n",
name.c_str(),(float)min, (float)max,
double(totals[1]) /count / num_batch
,double(totals[0]) / count / num_batch
);
}
};
Expand Down Expand Up @@ -167,6 +185,8 @@ auto benchmark = [](){
const uint64_t count1 = 0xffff'ff;
const uint64_t count2 = 0xffff'f;

printf(" arithmetic[ min, max]|fixed point| hard float|\n");

RUN_BASIC_TEST_GROUP("add/sub", +, -, 0xff, count1, -100, 100);

#if FIXED_64_ENABLE_INT128_ACCELERATION
Expand Down
69 changes: 69 additions & 0 deletions benchmark/benchmark.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
## Intel Core i9-12900K 3.2GHz windows11 clang c++20

|Arithmetic|Fixed64|Hardware Float|
|:-|-:|-:|
add/sub[-100.0, 100.0]|0.0038 ns|0.4295 ns|
mul[-100.0, 100.0]|2.6078 ns|0.9425 ns|
mul[ 0.0, 0.5]|2.9100 ns|0.9226 ns|
mul[ 0.5, 1.0]|2.6604 ns|27.1776 ns|
mul[ 1.0, 2.0]|2.6227 ns|0.8468 ns|
mul[ 2.0, 100.0]|2.6320 ns|0.8465 ns|
div[-100.0, 100.0]|1.2967 ns|2.7318 ns|
div[ 0.0, 0.5]|12.0082 ns|2.3092 ns|
div[ 0.5, 1.0]|11.9341 ns|2.3041 ns|
div[ 1.0, 2.0]|5.8836 ns|29.7369 ns|
div[ 2.0, 100.0]|1.1417 ns|2.2860 ns|
ceil[ -2.0, 2.0]|0.3916 ns|1.9973 ns|
floor[ -2.0, 2.0]|0.3870 ns|1.7243 ns|
round[ -2.0, 2.0]|0.6668 ns|6.5249 ns|
abs[ -2.0, 2.0]|0.0538 ns|0.4180 ns|
exp[ 0.0, 1.0]|3.6085 ns|2.1249 ns|
exp2[ 0.0, 1.0]|2.7666 ns|30.3101 ns|
sqrt[ 0.0, 100.0]|13.7207 ns|1.6524 ns|
sin[ -10.0, 10.0]|1.0053 ns|3.6804 ns|
cos[ -10.0, 10.0]|1.3183 ns|3.4292 ns|
tan[ -10.0, 10.0]|7.3048 ns|3.5210 ns|
asin[ -1.0, 1.0]|23.3574 ns|2.7947 ns|
acos[ -1.0, 1.0]|22.9430 ns|2.8789 ns|
atan[ 1.0, 100.0]|6.9723 ns|3.0697 ns|
atan[ 1.0, 100.0]|6.9567 ns|3.4708 ns|


## Intel Core i9-12900K 3.2GHz windows11 MSVC c++20

|Arithmetic|Fixed64|Hardware Float|
|:-|-:|-:|
add/sub[-100.0, 100.0]| 0.2107 ns | 0.4187 ns |
mul[-100.0, 100.0]| 2.7356 ns | 0.8556 ns |
mul[ 0.0, 0.5]| 2.6802 ns | 0.9478 ns |
mul[ 0.5, 1.0]| 2.6930 ns | 0.8574 ns |
mul[ 1.0, 2.0]| 2.6725 ns | 0.8455 ns |
mul[ 2.0, 100.0]| 2.6829 ns | 0.8512 ns |
div[-100.0, 100.0]| 1.6081 ns | 3.1777 ns |
div[ 0.0, 0.5]| 13.7667 ns | 28.6761 ns |
div[ 0.5, 1.0]| 13.7861 ns | 29.7010 ns |
div[ 1.0, 2.0]| 6.2241 ns | 29.6876 ns |
div[ 2.0, 100.0]| 1.5079 ns | 2.3074 ns |
ceil[ -2.0, 2.0]| 0.4330 ns | 2.0436 ns |
floor[ -2.0, 2.0]| 0.4341 ns | 2.0402 ns |
round[ -2.0, 2.0]| 2.0854 ns | 6.4961 ns |
abs[ -2.0, 2.0]| 0.2149 ns | 0.4175 ns |
exp[ 0.0, 1.0]| 10.8654 ns | 2.0519 ns |
exp2[ 0.0, 1.0]| 9.4355 ns | 30.2611 ns |
sqrt[ 0.0, 100.0]| 10.9269 ns | 0.6271 ns |
sin[ -10.0, 10.0]| 2.3128 ns | 3.9287 ns |
cos[ -10.0, 10.0]| 2.1791 ns | 3.6846 ns |
tan[ -10.0, 10.0]| 12.0050 ns | 3.6412 ns |
asin[ -1.0, 1.0]| 19.5066 ns | 2.7914 ns |
acos[ -1.0, 1.0]| 20.2079 ns | 2.9695 ns |
atan[ 1.0, 100.0]| 17.5488 ns | 3.1872 ns |
atan[ 1.0, 100.0]| 17.1658 ns | 3.0822 ns |


## Apple M1 Pro

|Arithmetic|Fixed64|Hardware Float|
|-|-:|-:|
|Addition/Subtraction|0.215 ns|0.953 ns|
|Multiplication|4.057 ns|1.246 ns|
|Division|1.102 ns|3.144 ns|

0 comments on commit 90a8bbf

Please sign in to comment.