Skip to content

Commit

Permalink
Update error in which highest multiple of 4 in SIMD optimisations was…
Browse files Browse the repository at this point in the history
… not being calculated correctly.
  • Loading branch information
Martin Roth committed Dec 1, 2010
1 parent bed34c1 commit 6376cd1
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 23 deletions.
31 changes: 20 additions & 11 deletions src/ArrayArithmetic.h
Original file line number Diff line number Diff line change
Expand Up @@ -257,16 +257,25 @@ class ArrayArithmetic {
#endif
} else {
#if __SSE__
input += startIndex;
output += startIndex;
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFF4;
__m128 inVec, res;
const __m128 constVec = _mm_load1_ps(&constant);
const int numFours = (endIndex - startIndex) >> 2;
for (int i = startIndex, j = 0; j < numFours; i+=4, j++) {
inVec = _mm_loadu_ps(input + i);
__m128 constVec = _mm_set1_ps(constant);
while (n4) {
inVec = _mm_loadu_ps(input);
res = _mm_mul_ps(inVec, constVec);
_mm_store_ps(output + i, res);
_mm_storeu_ps(output, res);
n4 -= 4;
input += 4;
output += 4;
}
for (int i = startIndex + numFours<<2; i < endIndex; i++) {
output[i] = input[i] * constant;
switch (n & 0x3) {
case 3: *input++ *= constant;
case 2: *input++ *= constant;
case 1: *input++ *= constant;
default: break;
}
#elif __ARM_NEON__
const int numFours = (endIndex - startIndex) >> 2;
Expand Down Expand Up @@ -344,8 +353,8 @@ class ArrayArithmetic {
vDSP_vfill(&constant, input, 1, endIndex-startIndex);
#elif __ARM_NEON__
input += startIndex;
const int n = endIndex - startIndex;
const int n4 = n & 0xFFFFFFF3; // force n to be a multiple of 4
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFF4; // force n to be a multiple of 4
float32x4_t constVec = vdupq_n_f32(constant);
while (n4) {
vst1q_f32((float32_t *) input, constVec);
Expand All @@ -360,8 +369,8 @@ class ArrayArithmetic {
}
#elif __SSE__
input += startIndex;
const int n = endIndex - startIndex;
const int n4 = n & 0xFFFFFFF3; // force n to be a multiple of 4
int n = endIndex - startIndex;
int n4 = n & 0xFFFFFFF4; // force n to be a multiple of 4
const __m128 constVec = _mm_set1_ps(constant);
while (n4) {
_mm_storeu_ps(input, constVec); // _mm_store_ps or _mm_storeu_ps?
Expand Down
35 changes: 24 additions & 11 deletions src/DspReciprocalSqrt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,37 +39,50 @@ const char *DspReciprocalSqrt::getObjectLabel() {
void DspReciprocalSqrt::processDspWithIndex(int fromIndex, int toIndex) {
// [rsqrt~] takes no messages, so the full block will be computed every time
#if __ARM_NEON__
float *inBuff = dspBufferAtInlet0 + fromIndex;
float *outBuff = dspBufferAtOutlet0 + fromIndex;
float *inBuff = dspBufferAtInlet0;
float *outBuff = dspBufferAtOutlet0;
float32x4_t inVec, outVec;
float32x4_t zeroVec = vdupq_n_f32(FLT_MIN);
int n = toIndex - fromIndex;
for (int i = 0; i < n; i+=4, inBuff+=4, outBuff+=4) {
inVec = vld1q_f32((const float32_t *) inBuff);
int n4 = n & 0xFFFFFFF4;
while (n4) {
inVec = vld1q_f32(inBuff);
inVec = vmaxq_f32(inVec, zeroVec);
outVec = vrsqrteq_f32(inVec);
vst1q_f32((float32_t *) outBuff, outVec);
n4 -= 4;
inBuff += 4;
outBuff += 4;
}
switch (n & 0x3) {
case 3: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
case 2: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
case 1: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
default: break;
}
#elif __SSE__
// NOTE: for all non-positive numbers, this routine will output a very large number (not Inf) == 1/sqrt(FLT_MIN)
float *inBuff = dspBufferAtInlet0 + fromIndex;
float *outBuff = dspBufferAtOutlet0 + fromIndex;
float *inBuff = dspBufferAtInlet0;
float *outBuff = dspBufferAtOutlet0;
__m128 inVec, outVec;
__m128 zeroVec = _mm_set1_ps(FLT_MIN);
int n = toIndex - fromIndex;
int n4 = n & 0xFFFFFFF3;
int n4 = n & 0xFFFFFFF4;
while (n4) {
inVec = _mm_loadu_ps(inBuff);
inVec = _mm_loadu_ps(inBuff); // unaligned load must be used because inBuff could point anywhere
// ensure that all inputs are positive, max(FLT_MIN, inVec), preventing divide-by-zero
inVec = _mm_max_ps(inVec, zeroVec);
outVec = _mm_rsqrt_ps(inVec);
// aligned store may be used because outBuff always points to the beginning of the output buffer
_mm_store_ps(outBuff, outVec);
n4 -= 4;
inBuff += 4;
outBuff += 4;
}
switch (n & 0x3) {
case 3: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff++ : FLT_MIN);
case 2: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff++ : FLT_MIN);
case 1: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff++ : FLT_MIN);
case 3: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
case 2: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
case 1: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
default: break;
}
#else
Expand Down
2 changes: 1 addition & 1 deletion src/DspRifft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ void DspRifft::processDspWithIndex(int fromIndex, int toIndex) {
inputVector.imagp = dspBufferAtInlet1;
DSPSplitComplex outputVector;
outputVector.realp = dspBufferAtOutlet0;
outputVector.imagp = (float *) alloca(numBytesInBlock);
outputVector.imagp = (float *) alloca(numBytesInBlock); // this buffer will not contain any useful data
vDSP_fft_zop(fftSetup, &inputVector, 1, &outputVector, 1, log2n, kFFTDirection_Inverse);
#endif // __APPLE__
}

0 comments on commit 6376cd1

Please sign in to comment.