-
Notifications
You must be signed in to change notification settings - Fork 6
/
Common.cpp
134 lines (120 loc) · 4.79 KB
/
Common.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include "Common.h"
#include "Timer.h"
#include "Core/Vector.h"
#include <stdio.h>
void parse_args(Config *config, int argc, char **argv)
{
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
config->verbose = true;
} else if (strcmp(arg, "-s") == 0) {
config->data_size = atoi(argv[++i]);
}
}
}
void print_results(Slice<const uint32_t> bits, const Config &config)
{
if (!config.verbose)
return;
const int size = config.data_size;
const int mid = size / 2;
for (int z = 0; z < size; z++) {
for (int x = 0; x < size; x++) {
const int i = offset_3d(Vec3i(x, mid, z), Vec3i(size));
const int ri = i / 32;
const int shift = i % 32;
if (bits[ri] & (1U << shift)) {
printf(". ");
} else {
printf("# ");
}
}
printf("\n");
}
}
void measure(Func<void()> f, int warmup, int runs, const char *name, const Config &config)
{
Vector<double> results(runs);
for (int i = 0; i < warmup; i++) {
f();
}
double average = get_time_milliseconds();
for (int i = 0; i < runs; i++) {
const double begin = get_time_milliseconds();
f();
results[i] = get_time_milliseconds() - begin;
}
average = (get_time_milliseconds() - average) / runs;
printf("'%s' done in %d runs, average: %fms\n", name, runs, average);
if (config.verbose) {
printf("per-run info:\n");
for (int i = 0; i < runs; i++)
printf(" [%d] %fms\n", i, results[i]);
}
}
void naive_cull(Slice<uint32_t> results, Slice<const Sphere> spheres, const Frustum &f)
{
for (int i = 0, n = spheres.length; i < n; i++) {
const Sphere &s = spheres.data[i];
const uint32_t result = f.cull(s) & 1;
const int ri = i / 32;
const int shift = i % 32;
results.data[ri] |= result << shift;
}
}
void sse_cull(Slice<uint32_t> results, Slice<const Sphere> spheres, const Frustum &f)
{
// we negate everything because we use this formula to cull:
// dot(-p.n, s.center) - p.d > s.radius
// it's equivalent to:
// dot(p.n, s.center) + p.d < -s.radius
// but no need to negate sphere radius
const __m128 plane_components[8] = {
simd_set(-f.planes[0].n.x, -f.planes[1].n.x, -f.planes[2].n.x, -f.planes[3].n.x),
simd_set(-f.planes[0].n.y, -f.planes[1].n.y, -f.planes[2].n.y, -f.planes[3].n.y),
simd_set(-f.planes[0].n.z, -f.planes[1].n.z, -f.planes[2].n.z, -f.planes[3].n.z),
simd_set(-f.planes[0].d, -f.planes[1].d, -f.planes[2].d, -f.planes[3].d),
simd_set(-f.planes[4].n.x, -f.planes[5].n.x, -f.planes[4].n.x, -f.planes[5].n.x),
simd_set(-f.planes[4].n.y, -f.planes[5].n.y, -f.planes[4].n.y, -f.planes[5].n.y),
simd_set(-f.planes[4].n.z, -f.planes[5].n.z, -f.planes[4].n.z, -f.planes[5].n.z),
simd_set(-f.planes[4].d, -f.planes[5].d, -f.planes[4].d, -f.planes[5].d),
};
for (int i = 0, n = spheres.length; i < n; i++) {
// Load sphere into SSE register.
const __m128 s = _mm_load_ps(reinterpret_cast<const float*>(spheres.data+i));
const __m128 xxxx = simd_splat_x(s);
const __m128 yyyy = simd_splat_y(s);
const __m128 zzzz = simd_splat_z(s);
const __m128 rrrr = simd_splat_w(s);
__m128 v, r;
// Move sphere center to plane normal space and make it relative to plane.
// dot(p.n, s) + p.d
v = simd_madd(xxxx, plane_components[0], plane_components[3]);
v = simd_madd(yyyy, plane_components[1], v);
v = simd_madd(zzzz, plane_components[2], v);
// One of r floats will be set to 0xFFFFFFFF if sphere is outside of the frustum.
r = _mm_cmpgt_ps(v, rrrr);
// Same for second set of planes.
v = simd_madd(xxxx, plane_components[4], plane_components[7]);
v = simd_madd(yyyy, plane_components[5], v);
v = simd_madd(zzzz, plane_components[6], v);
r = _mm_or_ps(r, _mm_cmpgt_ps(v, rrrr));
// Shuffle and extract the result:
// 1. movehl(r, r) does this (we're interested in 2 lower floats):
// a b c d -> c d c d
// 2. then we OR it with the existing value (ignoring 2 upper floats)
// a b | c d = A B
// 3. and then we OR it again ignoring all but 1 lowest float:
// A | B = R
// Result is written in the lowest float.
r = _mm_or_ps(r, _mm_movehl_ps(r, r));
r = _mm_or_ps(r, simd_splat_y(r));
uint32_t result;
_mm_store_ss((float*)&result, r);
// And write the result back to bit buffer.
const int ri = i / 32;
const int shift = i % 32;
results.data[ri] |= (result & 1) << shift;
}
}