Skip to content

Commit

Permalink
Prepacked gemm (#26)
Browse files Browse the repository at this point in the history
* Small tiling refactoring to reuse procs between GEMM and prepacked GEMM

* initial template of prepacked GEMM

* fix dispatch return from function

* Prepack for A

* Packed version compiles (but segfaults)

* Working for Generic but not for SIMD

* debug info

* fix kc vs KC bug in packB

* partitionMNK was returning its results in the wrong order

* Pass all GEMM tests with prepacked GEMM
  • Loading branch information
mratsim committed Feb 9, 2019
1 parent 27e4a77 commit e898f02
Show file tree
Hide file tree
Showing 5 changed files with 578 additions and 35 deletions.
22 changes: 10 additions & 12 deletions laser/primitives/matrix_multiplication/gemm.nim
Expand Up @@ -45,12 +45,11 @@ withCompilerOptimHints()
#
# ############################################################

proc gebp_mkernel[T; ukernel: static MicroKernel](
proc gebp_mkernel*[T; ukernel: static MicroKernel](
mc, nc, kc: int,
alpha: T, packA: ptr UncheckedArray[T],
alpha: T, packA, packB: ptr UncheckedArray[T],
beta: T,
mcncC: MatrixView[T],
tiles: Tiles[T]
mcncC: MatrixView[T]
) =
## Macro kernel, multiply:
## - a block A[mc, kc] * panel B[kc, N]
Expand All @@ -73,7 +72,7 @@ proc gebp_mkernel[T; ukernel: static MicroKernel](

# #####################################
# 4. for jr = 0,...,nc−1 in steps of nr
for jr in countup(0, tiles.nc-1, NR):
for jr in countup(0, nc-1, NR):
omp_task("firstprivate(`jr`)"):
let nr = min(nc - jr, NR) # C[ic:ic+mc, jc+jr:jc+jr+nr]

Expand All @@ -83,7 +82,7 @@ proc gebp_mkernel[T; ukernel: static MicroKernel](
let mr = min(mc - ir, MR)
let c_aux = mcncC.stride(ir, jr) # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr]

let upanel_b = tiles.b + jr*kc
let upanel_b = packB + jr*kc
prefetch(upanel_b, Read, ModerateTemporalLocality)
let upanel_a = packA + ir*kc
prefetch(upanel_a, Read, ModerateTemporalLocality)
Expand Down Expand Up @@ -163,20 +162,19 @@ proc gemm_impl[T; ukernel: static MicroKernel](
omp_parallel_if(parallelize):
# ####################################
# 3. for ic = 0,...,m−1 in steps of mc
omp_for(ict, tiles.ic_num_tasks, use_simd=false, nowait=true):
let packA = tiles.a + ict * tiles.upanelA_size
omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true):
let packA = tiles.a + icb * tiles.upanelA_size
prefetch(packA, Write, LowTemporalLocality)
let ic = ict * tiles.mc
let ic = icb * tiles.mc
let mc = min(M-ic, tiles.mc) # C[ic:ic+mc, jc:jc+nc]

let mckcA = vA.stride(ic, pc) # A[ic:ic+mc, pc:pc+kc]
pack_A_mc_kc[T, ukernel](packA, mc, kc, mckcA) # PackA block [mc, kc]

gebp_mkernel[T, ukernel]( # GEBP macrokernel:
mc, nc, kc, # C[ic:ic+mc, jc:jc+nc] =
alpha, packA, # αA[ic:ic+mc, pc:pc+kc] * B[pc:pc+kc, jc:jc+nc] +
beta, vC.stride(ic, 0), # βC[ic:ic+mc, jc:jc+nc]
tiles
alpha, packA, tiles.b, # αA[ic:ic+mc, pc:pc+kc] * B[pc:pc+kc, jc:jc+nc] +
beta, vC.stride(ic, 0) # βC[ic:ic+mc, jc:jc+nc]
)

# ############################################################
Expand Down

0 comments on commit e898f02

Please sign in to comment.