In [1]:
#include <assert.h>
#include <iostream>
#include <chrono>

In [2]:
// Benchmark fixture.
// `in` is a flat buffer of `n_cubes` floats made of 30 consecutive slabs,
// each slab holding `n_classes` rows of `rstride` values.
// `out_1` / `out_2` each receive one value per (slab, column) pair,
// i.e. `n_cubes / n_classes` floats.
const size_t n_classes = 10;
const size_t rstride = 20;
const size_t n_cubes = 30 * n_classes * rstride;
float in[n_cubes];
// The reduce functions accumulate with `+=`, so the outputs must start at
// zero; initialize them explicitly instead of relying on storage class.
float out_1[n_cubes / n_classes] = {};
float out_2[n_cubes / n_classes] = {};
// Fill the input with pseudo-random values in {0.00, 0.01, ..., 0.99}.
// `rand()` is left unseeded, so the sequence is the same on every fresh run.
// The index is `size_t` to match the type of the bound.
for (size_t i = 0; i < n_cubes; ++i)
    in[i] = (float) (rand() % 100) / 100;

In [3]:
inline void reduce_counters_readable(size_t n_classes, size_t n_cubes, float *in, float *out, size_t rstride) {
    // Readable (and presumably slower) counterpart of `reduce_counters`.
    // The two are intended to agree whenever `n_cubes` is a multiple of
    // `n_classes * rstride`.
    //
    // Explanation:
    // Treat `in` as a row-major 3D array of shape (`n` x `n_classes` x `rstride`),
    // where `n` = `n_cubes` / (`n_classes` * `rstride`) using integer division
    // (trailing elements that do not fill a whole slab are ignored),
    // and reduce it to a 2D `out` array of shape (`n` x `rstride`)
    // by summing along the second (class) dimension.
    //
    // The sums are ADDED to whatever `out` already holds (`+=`); `out` is not
    // cleared here, so the caller must zero it to obtain plain sums.
    //
    // If `n_classes` or `rstride` is zero there is nothing to reduce and the
    // function returns without touching `out`.

    const size_t K = rstride;
    const size_t J = n_classes;

    // Guard the divisions below against a zero divisor.
    if (J == 0 || K == 0)
        return;

    const size_t I = n_cubes / J / K;  // number of complete slabs
    const size_t JK = J * K;           // elements per slab

    for (size_t i = 0; i < I; ++i)
        for (size_t j = 0; j < J; ++j)
            for (size_t k = 0; k < K; ++k)
                out[i * K + k] += in[i * JK + j * K + k];
}
        

In [4]:
inline void reduce_counters(size_t n_classes, size_t n_cubes, float *in, float *out, size_t rstride) {
    // Sum `in` across classes and accumulate the result into `out`.
    //
    // `in` holds consecutive slabs of `n_classes * rstride` floats; inside a
    // slab, the value for class `d` and column `s` sits at `s + d * rstride`.
    // For every complete slab and every column, the `n_classes` values of that
    // column are added (with `+=`) to the next element of `out`, so `out`
    // receives `rstride` values per slab. `out` is not cleared here.
    //
    // Only complete slabs are processed: a trailing partial slab is ignored
    // instead of being read past `n_cubes`. If `n_classes` or `rstride` is
    // zero the function returns immediately (a zero slab size would otherwise
    // never advance the outer loop).
    const size_t slab = rstride * n_classes;
    if (slab == 0)
        return;

    const size_t n_slabs = n_cubes / slab;
    size_t v = 0;  // running index into `out`
    for (size_t b = 0; b < n_slabs; ++b) {
        const float *base = in + b * slab;
        for (size_t s = 0; s < rstride; ++s, ++v) {
            // Classes are added in ascending order, directly into out[v].
            for (size_t d = 0; d < n_classes; ++d) {
                out[v] += base[s + (d * rstride)];
            }
        }
    }
}

In [5]:
// Timer variables reused by the benchmark cells that follow.
// They are declared once here so later cells can simply assign to them;
// the initial now() values are placeholders that are overwritten before use.
auto start = std::chrono::high_resolution_clock::now();
auto finish = std::chrono::high_resolution_clock::now();
// Elapsed wall time as a double; the default period of duration is seconds.
std::chrono::duration<double> elapsed;

In [6]:
// Benchmark: time 100000 back-to-back calls of `reduce_counters` on the
// same input buffer.
// NOTE: `reduce_counters` adds into its output and `out_1` is not reset
// between iterations, so after this cell `out_1` holds the accumulation of
// all 100000 passes rather than a single reduction.
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100000; ++i) {
    reduce_counters(n_classes, n_cubes, in, out_1, rstride);
}
finish = std::chrono::high_resolution_clock::now();
// Total time for all iterations, in seconds.
elapsed = finish - start;
std::cout << "Elapsed time: " << elapsed.count() << " s\n";

Elapsed time: 1.65867 s


In [7]:
// Benchmark: time 100000 back-to-back calls of `reduce_counters_readable`
// on the same input buffer, mirroring the `reduce_counters` benchmark.
// NOTE: `reduce_counters_readable` adds into its output and `out_2` is not
// reset between iterations, so after this cell `out_2` holds the
// accumulation of all 100000 passes rather than a single reduction.
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100000; ++i) {
    reduce_counters_readable(n_classes, n_cubes, in, out_2, rstride);
}
finish = std::chrono::high_resolution_clock::now();
// Total time for all iterations, in seconds.
elapsed = finish - start;
std::cout << "Elapsed time: " << elapsed.count() << " s\n";

Elapsed time: 1.73905 s


In [8]:
// Relative slowdown of the readable version versus `reduce_counters`:
// (t_readable - t_optimized) / t_optimized, using the two elapsed times
// printed above rounded to two decimals (1.74 s and 1.66 s).
(1.74 - 1.66) / 1.66

0.0481928

In [8]:
for (int i = 0; i < n_cubes / n_classes; ++i)
    if (out_1[i] != out_2[i]) {
        std::cout << i << " ";
    }