x64: swap loop order for xf16/int8 using AMX

Swapping the loop order improves performance for shapes with a very large mb if compute intensity increases. This is similar to what was done for f32 on avx512 and avx2.
oneapi-src · Mar 6, 2024 · 67012a5 · 67012a5
1 parent 1d3d502
commit 67012a5
Showing 1 changed file with 8 additions and 6 deletions.
diff --git a/src/cpu/x64/jit_brgemm_inner_product_utils.cpp b/src/cpu/x64/jit_brgemm_inner_product_utils.cpp
@@ -1638,9 +1638,8 @@ void jit_brgemm_ip_fwd_conf_t::choose_loop_order() {
     const bool is_f32 = everyone_is(f32, src_dt, wei_dt, dst_dt);
     const bool is_f32_compute = is_f32 && !is_bf32;
 
-    // Optimize loop order for f32, if buffer is not required.
-    const bool ocb_inner_most = is_f32_compute;
-    if (ocb_inner_most) {
+    // Optimize loop order for f32
+    if (is_f32_compute) {
         loop_order = osc_occ_icc_osb_ocb;
 
         // Use icc loop as outer-most to save bandwidth when os is small.
@@ -1683,10 +1682,13 @@ void jit_brgemm_ip_fwd_conf_t::choose_loop_order() {
     float eff_occ_osc = eff(os_span_occ_osc, oc_span_occ_osc, ic_span);
     bool do_occ_osc = eff_occ_osc > 1.15 * eff_osc_occ;
 
-    // Enable occ_osc_... for f32 and with small os-blocks.
-    // TODO: Expand to other precisions and other blocks sizes.
     const bool is_avx2 = is_superset(isa, avx2);
-    if ((os_block < 32 || do_occ_osc) && is_f32_compute && is_avx2)
+    const bool is_f32_avx2 = is_f32_compute && is_avx2;
+    const bool is_xf16 = one_of(wei_dt, bf16, f16) || is_bf32;
+    const bool is_int8 = one_of(src_dt, u8, s8) && wei_dt == s8;
+    const bool is_compute_amx = (is_xf16 || is_int8) && is_amx;
+
+    if ((os_block < 32 || do_occ_osc) && (is_compute_amx || is_f32_avx2))
         loop_order = icc_occ_osc_ocb_osb;
 }