Moderately increase speed of compute_cost in sabre

This commit modifies the heuristic scoring function compute_cost in the sabre swap Rust code to use a faster sum implementation. This is based on the fast_sum() used in the pauli expectation value module but modified to deal with the different data structures used in sabre. This new implementation enables the compiler to more easily use SIMD when available. This is about as fast as we can make a sum of values of indeterminate length without dropping down to SIMD intrinsics directly, either for x86/x86_64 with unsafe calls (via the simdeez library) or by requiring nightly Rust and using packed_simd_2 or stdsimd, which are cross-platform. While this isn't a huge performance boost, it does provide a 1-10% speed-up to compute_cost in local testing.
mtreinish · Sep 22, 2022 · b26b590 · b26b590
1 parent 0f688eb
commit b26b590
Showing 1 changed file with 20 additions and 2 deletions.
diff --git a/src/sabre_swap/mod.rs b/src/sabre_swap/mod.rs
@@ -16,6 +16,8 @@ pub mod neighbor_table;
 pub mod sabre_dag;
 pub mod swap_map;
 
+use std::convert::TryInto;
+
 use std::cmp::Ordering;
 
 use hashbrown::{HashMap, HashSet};
@@ -41,6 +43,7 @@ use neighbor_table::NeighborTable;
 use sabre_dag::SabreDAG;
 use swap_map::SwapMap;
 
+const LANES: usize = 8;
 const EXTENDED_SET_SIZE: usize = 20; // Size of lookahead window.
 const DECAY_RATE: f64 = 0.001; // Decay coefficient for penalizing serial swaps.
 const DECAY_RESET_INTERVAL: u8 = 5; // How often to reset all decay rates to 1.
@@ -453,10 +456,25 @@ fn sabre_score_heuristic(
 
 #[inline]
 fn compute_cost(layer: &[[usize; 2]], layout: &NLayout, dist: &ArrayView2<f64>) -> f64 {
-    layer
+    let chunks = layer.chunks_exact(LANES);
+    let remainder = chunks.remainder();
+    let sum = chunks.fold([0.; LANES], |mut acc, chunk| {
+        let chunk: [[usize; 2]; LANES] = chunk.try_into().unwrap();
+        for i in 0..LANES {
+            let gate = chunk[i];
+            acc[i] += dist[[layout.logic_to_phys[gate[0]], layout.logic_to_phys[gate[1]]]]
+        }
+        acc
+    });
+    let remainder: f64 = remainder
         .iter()
         .map(|gate| dist[[layout.logic_to_phys[gate[0]], layout.logic_to_phys[gate[1]]]])
-        .sum()
+        .sum();
+    let mut reduced = 0.;
+    for val in sum {
+        reduced += val;
+    }
+    reduced + remainder
 }
 
 fn score_lookahead(