Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 516 lines (473 sloc) 16.528 kb
1709d7b @mhroth Add a central implementation of all ArrayArithmetic operations.
authored
1 /*
88f800c @mhroth Allocate all dsp buffers on page boundaries.
authored
2 * Copyright 2010,2011,2012 Reality Jockey, Ltd.
1709d7b @mhroth Add a central implementation of all ArrayArithmetic operations.
authored
3 * info@rjdj.me
4 * http://rjdj.me/
5 *
6 * This file is part of ZenGarden.
7 *
8 * ZenGarden is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as published by
10 * the Free Software Foundation, either version 3 of the License, or
11 * (at your option) any later version.
12 *
13 * ZenGarden is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with ZenGarden. If not, see <http://www.gnu.org/licenses/>.
20 *
21 */
22
23 #ifndef _ARRAY_ARITHMETIC_H_
24 #define _ARRAY_ARITHMETIC_H_
25
3cceb15 @mhroth Adapt vector adds to check for Acceleration framework and break dependin...
authored
26 #if __APPLE__
f4a2d77 @mhroth Add Acclerate-specific implementations to ArrayArithmetic.
authored
27 // The Accelerate framework is a library of tuned vector operations
28 #include <Accelerate/Accelerate.h>
7abaebe @mhroth Use __ARM_NEON__ parser flag instead of _ARM_ARCH_7. This is more correc...
authored
29 #endif
30 #if __SSE__
7ce1183 @mhroth Add SSE option to ArrayArithmetic::add(float *, float*). Works as advert...
authored
31 #include <xmmintrin.h>
7abaebe @mhroth Use __ARM_NEON__ parser flag instead of _ARM_ARCH_7. This is more correc...
authored
32 #elif __ARM_NEON__
f244d3c @mhroth Add comment to ArrayArithmetic.h
authored
33 // __ARM_NEON__ is defined by the compiler if the arguments "-mfloat-abi=softfp -mfpu=neon" are passed.
1709d7b @mhroth Add a central implementation of all ArrayArithmetic operations.
authored
34 #include <arm_neon.h>
7ce1183 @mhroth Add SSE option to ArrayArithmetic::add(float *, float*). Works as advert...
authored
35 #endif
1709d7b @mhroth Add a central implementation of all ArrayArithmetic operations.
authored
36
/**
 * This class offers static inline functions for computing basic arithmetic with float arrays.
 * It offers a central place for optimised implementations of common compute-intensive operations.
 *
 * Every function operates on the half-open index range [startIndex, endIndex) of its arrays and
 * does nothing if that range is empty (endIndex <= startIndex).
 *
 * Depending on the platform the work is done by Apple's Accelerate framework (vDSP), by SSE or
 * NEON intrinsics, or by a plain scalar loop.
 *
 * In all SSE cases, input vectors can be (16-byte) unaligned, but output vectors must be aligned:
 * the SSE code derives the alignment of (output + startIndex) from startIndex alone, so element 0
 * of the output array must lie on a 16-byte boundary.
 */
class ArrayArithmetic {

  public:

    /** output[i] = input0[i] + input1[i] */
    static inline void add(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return; // empty range; never hand a negative length to vDSP/SIMD
#if __APPLE__
      vDSP_vadd(input0+startIndex, 1, input1+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__ || __ARM_NEON__
      simdCombine<AddOp>(input0, input1, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] + input1[i];
      }
#endif
    }

    /** output[i] = input[i] + constant */
    static inline void add(float *input, float constant, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      vDSP_vsadd(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__ || __ARM_NEON__
      simdWithConstant<AddOp>(input, constant, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] + constant;
      }
#endif
    }

    /** output[i] = input0[i] - input1[i] */
    static inline void subtract(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      // vDSP_vsub computes (second operand - first operand), hence input1 is passed first
      vDSP_vsub(input1+startIndex, 1, input0+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__ || __ARM_NEON__
      simdCombine<SubtractOp>(input0, input1, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] - input1[i];
      }
#endif
    }

    /** output[i] = input[i] - constant */
    static inline void subtract(float *input, float constant, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      // subtraction is expressed as the addition of the negated constant
      float negation = -1.0f * constant;
      vDSP_vsadd(input+startIndex, 1, &negation, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__ || __ARM_NEON__
      simdWithConstant<SubtractOp>(input, constant, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] - constant;
      }
#endif
    }

    /** output[i] = input0[i] * input1[i] */
    static inline void multiply(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      vDSP_vmul(input0+startIndex, 1, input1+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__ || __ARM_NEON__
      simdCombine<MultiplyOp>(input0, input1, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] * input1[i];
      }
#endif
    }

    /** output[i] = input[i] * constant */
    static inline void multiply(float *input, float constant, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      vDSP_vsmul(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__ || __ARM_NEON__
      simdWithConstant<MultiplyOp>(input, constant, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] * constant;
      }
#endif
    }

    /**
     * output[i] = input0[i] / input1[i]
     * There is no NEON implementation; a reciprocal estimate (vrecpeq_f32) would be the
     * starting point for one. NEON builds use the scalar loop.
     */
    static inline void divide(float *input0, float *input1, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      // vDSP_vdiv computes (second operand / first operand), hence input1 is passed first
      vDSP_vdiv(input1+startIndex, 1, input0+startIndex, 1, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
      simdCombine<DivideOp>(input0, input1, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input0[i] / input1[i];
      }
#endif
    }

    /** output[i] = input[i] / constant (NEON builds use the scalar loop) */
    static inline void divide(float *input, float constant, float *output, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      vDSP_vsdiv(input+startIndex, 1, &constant, output+startIndex, 1, endIndex-startIndex);
#elif __SSE__
      simdWithConstant<DivideOp>(input, constant, output, startIndex, endIndex);
#else
      for (int i = startIndex; i < endIndex; i++) {
        output[i] = input[i] / constant;
      }
#endif
    }

    /** input[i] = constant */
    static inline void fill(float *input, float constant, int startIndex, int endIndex) {
      if (endIndex <= startIndex) return;
#if __APPLE__
      vDSP_vfill(&constant, input+startIndex, 1, endIndex-startIndex);
#elif __SSE__ || __ARM_NEON__
      input += startIndex;
      int n = endIndex - startIndex;

      // scalar stores until the destination sits on a vector boundary
      int head = headLength(startIndex, n);
      n -= head;
      while (head-- > 0) { *input++ = constant; }

      // whole groups of four
      const Vec4 constVec = splatVec(constant);
      for (int n4 = n & ~0x3; n4 > 0; n4 -= 4) {
        storeVec(input, constVec);
        input += 4;
      }

      // whatever is left over (fewer than four elements)
      for (int tail = n & 0x3; tail > 0; --tail) { *input++ = constant; }
#else
      for (int i = startIndex; i < endIndex; i++) {
        input[i] = constant;
      }
#endif
    }

  private:
    ArrayArithmetic(); // no instances of this object are allowed
    ~ArrayArithmetic();

#if !__APPLE__ && (__SSE__ || __ARM_NEON__)

#if __SSE__
    /** Four packed floats. */
    typedef __m128 Vec4;

    /** Loads four floats; the source may be unaligned. */
    static inline Vec4 loadVec(const float *p) { return _mm_loadu_ps(p); }

    /** Stores four floats; the destination must be 16-byte aligned. */
    static inline void storeVec(float *p, Vec4 v) { _mm_store_ps(p, v); }

    /** Returns a vector with all four lanes set to c. */
    static inline Vec4 splatVec(float c) { return _mm_set1_ps(c); }

    /**
     * Number of leading elements to process one at a time so that the following vector stores
     * are 16-byte aligned. Never more than n, so a short range is not overrun.
     */
    static inline int headLength(int startIndex, int n) {
      const int toBoundary = (4 - (startIndex & 0x3)) & 0x3;
      return (toBoundary < n) ? toBoundary : n;
    }
#else
    /** Four packed floats. */
    typedef float32x4_t Vec4;

    /** Loads four floats. */
    static inline Vec4 loadVec(const float *p) { return vld1q_f32(p); }

    /** Stores four floats. */
    static inline void storeVec(float *p, Vec4 v) { vst1q_f32(p, v); }

    /** Returns a vector with all four lanes set to c. */
    static inline Vec4 splatVec(float c) { return vdupq_n_f32(c); }

    /** The NEON code path does no scalar alignment pass before its vector loop. */
    static inline int headLength(int, int) { return 0; }
#endif

    /** a + b, for one float or for four packed floats. */
    struct AddOp {
      static inline float apply(float a, float b) { return a + b; }
      static inline Vec4 apply(Vec4 a, Vec4 b) {
#if __SSE__
        return _mm_add_ps(a, b);
#else
        return vaddq_f32(a, b);
#endif
      }
    };

    /** a - b, for one float or for four packed floats. */
    struct SubtractOp {
      static inline float apply(float a, float b) { return a - b; }
      static inline Vec4 apply(Vec4 a, Vec4 b) {
#if __SSE__
        return _mm_sub_ps(a, b);
#else
        return vsubq_f32(a, b);
#endif
      }
    };

    /** a * b, for one float or for four packed floats. */
    struct MultiplyOp {
      static inline float apply(float a, float b) { return a * b; }
      static inline Vec4 apply(Vec4 a, Vec4 b) {
#if __SSE__
        return _mm_mul_ps(a, b);
#else
        return vmulq_f32(a, b);
#endif
      }
    };

    /** a / b; the packed form exists for SSE only. */
    struct DivideOp {
      static inline float apply(float a, float b) { return a / b; }
#if __SSE__
      static inline Vec4 apply(Vec4 a, Vec4 b) { return _mm_div_ps(a, b); }
#endif
    };

    /**
     * out[i] = Op(in0[i], in1[i]) for i in [startIndex, endIndex).
     * The caller guarantees that the range is not empty.
     */
    template <typename Op>
    static inline void simdCombine(const float *in0, const float *in1, float *out,
        int startIndex, int endIndex) {
      in0 += startIndex;
      in1 += startIndex;
      out += startIndex;
      int n = endIndex - startIndex;

      // scalar pass until the output sits on a vector boundary
      int head = headLength(startIndex, n);
      n -= head;
      while (head-- > 0) { *out++ = Op::apply(*in0++, *in1++); }

      // whole groups of four
      for (int n4 = n & ~0x3; n4 > 0; n4 -= 4) {
        storeVec(out, Op::apply(loadVec(in0), loadVec(in1)));
        in0 += 4; in1 += 4; out += 4;
      }

      // whatever is left over (fewer than four elements)
      for (int tail = n & 0x3; tail > 0; --tail) { *out++ = Op::apply(*in0++, *in1++); }
    }

    /**
     * out[i] = Op(in[i], constant) for i in [startIndex, endIndex).
     * The caller guarantees that the range is not empty.
     */
    template <typename Op>
    static inline void simdWithConstant(const float *in, float constant, float *out,
        int startIndex, int endIndex) {
      in += startIndex;
      out += startIndex;
      int n = endIndex - startIndex;

      // scalar pass until the output sits on a vector boundary
      int head = headLength(startIndex, n);
      n -= head;
      while (head-- > 0) { *out++ = Op::apply(*in++, constant); }

      // whole groups of four
      const Vec4 constVec = splatVec(constant);
      for (int n4 = n & ~0x3; n4 > 0; n4 -= 4) {
        storeVec(out, Op::apply(loadVec(in), constVec));
        in += 4; out += 4;
      }

      // whatever is left over (fewer than four elements)
      for (int tail = n & 0x3; tail > 0; --tail) { *out++ = Op::apply(*in++, constant); }
    }

#endif // !__APPLE__ && (__SSE__ || __ARM_NEON__)
};
514
515 #endif // _ARRAY_ARITHMETIC_H_
Something went wrong with that request. Please try again.