Update error in which highest multiple of 4 in SIMD optimisations was…

… not being calculated correctly.
mhroth · Dec 1, 2010 · 6376cd1 · 6376cd1
1 parent bed34c1
commit 6376cd1
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 23 deletions.
diff --git a/src/ArrayArithmetic.h b/src/ArrayArithmetic.h
@@ -257,16 +257,25 @@ class ArrayArithmetic {
         #endif
       } else {
         #if __SSE__
+        input += startIndex;
+        output += startIndex;
+        int n = endIndex - startIndex;
+        int n4 = n & 0xFFFFFFF4;
         __m128 inVec, res;
-        const __m128 constVec = _mm_load1_ps(&constant);
-        const int numFours = (endIndex - startIndex) >> 2;
-        for (int i = startIndex, j = 0; j < numFours; i+=4, j++) {
-          inVec = _mm_loadu_ps(input + i);
+        __m128 constVec = _mm_set1_ps(constant);
+        while (n4) {
+          inVec = _mm_loadu_ps(input);
           res = _mm_mul_ps(inVec, constVec);
-          _mm_store_ps(output + i, res);
+          _mm_storeu_ps(output, res);
+          n4 -= 4;
+          input += 4;
+          output += 4;
         }
-        for (int i = startIndex + numFours<<2; i < endIndex; i++) {
-          output[i] = input[i] * constant;
+        switch (n & 0x3) {
+          case 3: *input++ *= constant;
+          case 2: *input++ *= constant;
+          case 1: *input++ *= constant;
+          default: break;
         }
         #elif __ARM_NEON__
         const int numFours = (endIndex - startIndex) >> 2;
@@ -344,8 +353,8 @@ class ArrayArithmetic {
       vDSP_vfill(&constant, input, 1, endIndex-startIndex);
       #elif __ARM_NEON__
       input += startIndex;
-      const int n = endIndex - startIndex;
-      const int n4 = n & 0xFFFFFFF3; // force n to be a multiple of 4
+      int n = endIndex - startIndex;
+      int n4 = n & 0xFFFFFFF4; // force n to be a multiple of 4
       float32x4_t constVec = vdupq_n_f32(constant);
       while (n4) {
         vst1q_f32((float32_t *) input, constVec);
@@ -360,8 +369,8 @@ class ArrayArithmetic {
       }
       #elif __SSE__
       input += startIndex;
-      const int n = endIndex - startIndex;
-      const int n4 = n & 0xFFFFFFF3; // force n to be a multiple of 4
+      int n = endIndex - startIndex;
+      int n4 = n & 0xFFFFFFF4; // force n to be a multiple of 4
       const __m128 constVec = _mm_set1_ps(constant);
       while (n4) {
         _mm_storeu_ps(input, constVec); // _mm_store_ps or _mm_storeu_ps?

diff --git a/src/DspReciprocalSqrt.cpp b/src/DspReciprocalSqrt.cpp
@@ -39,37 +39,50 @@ const char *DspReciprocalSqrt::getObjectLabel() {
 void DspReciprocalSqrt::processDspWithIndex(int fromIndex, int toIndex) {
   // [rsqrt~] takes no messages, so the full block will be computed every time
   #if __ARM_NEON__
-  float *inBuff = dspBufferAtInlet0 + fromIndex;
-  float *outBuff = dspBufferAtOutlet0 + fromIndex;
+  float *inBuff = dspBufferAtInlet0;
+  float *outBuff = dspBufferAtOutlet0;
   float32x4_t inVec, outVec;
+  float32x4_t zeroVec = vdupq_n_f32(FLT_MIN);
   int n = toIndex - fromIndex;
-  for (int i = 0; i < n; i+=4, inBuff+=4, outBuff+=4) {
-    inVec = vld1q_f32((const float32_t *) inBuff);
+  int n4 = n & 0xFFFFFFF4;
+  while (n4) {
+    inVec = vld1q_f32(inBuff);
+    inVec = vmaxq_f32(inVec, zeroVec);
     outVec = vrsqrteq_f32(inVec);
     vst1q_f32((float32_t *) outBuff, outVec);
+    n4 -= 4;
+    inBuff += 4;
+    outBuff += 4;
+  }
+  switch (n & 0x3) {
+    case 3: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
+    case 2: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
+    case 1: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
+    default: break;
   }
   #elif __SSE__
   // NOTE: for all non-positive numbers, this routine will output a very large number (not Inf) == 1/sqrt(FLT_MIN)
-  float *inBuff = dspBufferAtInlet0 + fromIndex;
-  float *outBuff = dspBufferAtOutlet0 + fromIndex;
+  float *inBuff = dspBufferAtInlet0;
+  float *outBuff = dspBufferAtOutlet0;
   __m128 inVec, outVec;
   __m128 zeroVec = _mm_set1_ps(FLT_MIN);
   int n = toIndex - fromIndex;
-  int n4 = n & 0xFFFFFFF3;
+  int n4 = n & 0xFFFFFFF4;
   while (n4) {
-    inVec = _mm_loadu_ps(inBuff);
+    inVec = _mm_loadu_ps(inBuff); // unaligned load must be used because inBuff could point anywhere
     // ensure that all inputs are positive, max(FLT_MIN, inVec), preventing divide-by-zero
     inVec = _mm_max_ps(inVec, zeroVec);
     outVec = _mm_rsqrt_ps(inVec);
+    // aligned store may be used because outBuff always points to the beginning of the output buffer
     _mm_store_ps(outBuff, outVec);
     n4 -= 4;
     inBuff += 4;
     outBuff += 4;
   }
   switch (n & 0x3) {
-    case 3: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff++ : FLT_MIN);
-    case 2: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff++ : FLT_MIN);
-    case 1: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff++ : FLT_MIN);
+    case 3: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
+    case 2: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
+    case 1: *outBuff++ = 1.0f / sqrtf((*inBuff >= 0.0f) ? *inBuff : FLT_MIN); ++inBuff;
     default: break;
   }
   #else

diff --git a/src/DspRifft.cpp b/src/DspRifft.cpp
@@ -50,7 +50,7 @@ void DspRifft::processDspWithIndex(int fromIndex, int toIndex) {
   inputVector.imagp = dspBufferAtInlet1;
   DSPSplitComplex outputVector;
   outputVector.realp = dspBufferAtOutlet0;
-  outputVector.imagp = (float *) alloca(numBytesInBlock);
+  outputVector.imagp = (float *) alloca(numBytesInBlock); // this buffer will not contain any useful data
   vDSP_fft_zop(fftSetup, &inputVector, 1, &outputVector, 1, log2n, kFFTDirection_Inverse);
   #endif // __APPLE__
 }