Skip to content

Commit

Permalink
x64: swap loop order for xf16/int8 using AMX
Browse files Browse the repository at this point in the history
Swapping loop order improve performance of shapes with very large mb if
compute intensity increases.  This is similar to what was done for f32
for avx512 and avx2.
  • Loading branch information
aaraujom committed Mar 6, 2024
1 parent 1d3d502 commit 67012a5
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions src/cpu/x64/jit_brgemm_inner_product_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1638,9 +1638,8 @@ void jit_brgemm_ip_fwd_conf_t::choose_loop_order() {
const bool is_f32 = everyone_is(f32, src_dt, wei_dt, dst_dt);
const bool is_f32_compute = is_f32 && !is_bf32;

// Optimize loop order for f32, if buffer is not required.
const bool ocb_inner_most = is_f32_compute;
if (ocb_inner_most) {
// Optimize loop order for f32
if (is_f32_compute) {
loop_order = osc_occ_icc_osb_ocb;

// Use icc loop as outer-most to save bandwidth when os is small.
Expand Down Expand Up @@ -1683,10 +1682,13 @@ void jit_brgemm_ip_fwd_conf_t::choose_loop_order() {
float eff_occ_osc = eff(os_span_occ_osc, oc_span_occ_osc, ic_span);
bool do_occ_osc = eff_occ_osc > 1.15 * eff_osc_occ;

// Enable occ_osc_... for f32 and with small os-blocks.
// TODO: Expand to other precisions and other blocks sizes.
const bool is_avx2 = is_superset(isa, avx2);
if ((os_block < 32 || do_occ_osc) && is_f32_compute && is_avx2)
const bool is_f32_avx2 = is_f32_compute && is_avx2;
const bool is_xf16 = one_of(wei_dt, bf16, f16) || is_bf32;
const bool is_int8 = one_of(src_dt, u8, s8) && wei_dt == s8;
const bool is_compute_amx = (is_xf16 || is_int8) && is_amx;

if ((os_block < 32 || do_occ_osc) && (is_compute_amx || is_f32_avx2))
loop_order = icc_occ_osc_ocb_osb;
}

Expand Down

0 comments on commit 67012a5

Please sign in to comment.