ugly hack around #27 (breaks #26)
mratsim committed Mar 23, 2019
1 parent c4ac93d, commit 0805771
Showing 1 changed file with 19 additions and 31 deletions.
50 changes: 19 additions & 31 deletions in laser/primitives/matrix_multiplication/gemm_tiling.nim
@@ -109,29 +109,29 @@ const X86_vecwidth_int: X86_FeatureMap = [
]

# Registers constraints and micro-kernel tuning
- #  - To issue 2xFMAs in parallel we need to use 2x SIMD registers
+ # - To issue 2xFMAs in parallel we need to use 2x SIMD registers
# - We want to hold C of size MR * NR completely in SIMD registers as well
# as each value is reused k times during accumulation C[i, j] += A[i, k] * B[k, j]
# - We should have enough SIMD registers left to hold
- #  the corresponding sections of A and B (at least 4, 2xA and 2xB for FMAs)
+ # the corresponding sections of A and B (at least 4, 2xA and 2xB for FMAs)
#
# On x86-64 X SIMD registers that can issue 2xFMAs per cycle:
# - NbVecs is 2 minimum
# - RegsPerVec = 2 * NbVecs => 4 minimum (for A and for B)
# - NR = NbVecs * NbScalarsPerSIMD
- #  - C: MR*NR and uses MR*NbVecs SIMD registers
+ # - C: MR*NR and uses MR*NbVecs SIMD registers
# - MR*NbVecs + RegsPerVec <= X
# -> MR*NbVecs + 2 * NbVecs <= X
# -> (MR+2) * NbVecs <= X
#
# Some solutions:
# - AVX with 16 registers:
# - MR = 6, NbVecs = 2
- #  FP32: 8xFP32 per SIMD --> NR = 2x8
+ # FP32: 8xFP32 per SIMD --> NR = 2x8
# ukernel = 6x16
# FP64, ukernel = 6x8
# - MR = 2, NbVecs = 4
- #  FP32: 8xFP32 per SIMD --> NR = 4x8
+ # FP32: 8xFP32 per SIMD --> NR = 4x8
# ukernel = 2x32
# FP64, ukernel = 2x16
# - AVX512 with 32 registers
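
The constraint spelled out in the comment block above reduces to one inequality: the MR*NbVecs accumulators holding the C micro-tile plus the 2*NbVecs vectors staging A and B must fit in the X architectural SIMD registers, i.e. (MR+2)*NbVecs <= X, with NR = NbVecs times the scalars per vector. A minimal standalone Nim sketch of that arithmetic, assuming AVX FP32 figures (16 registers, 8 lanes); the names ukernelDims, NumSimdRegs and ScalarsPerSimd are illustrative, not identifiers from this module:

# Illustrative sketch, not code from gemm_tiling.nim.
const
  NumSimdRegs    = 16   # "X" above: architectural SIMD registers with AVX
  ScalarsPerSimd = 8    # 8 x FP32 lanes per 256-bit register

func ukernelDims(mr, nbVecs: int): tuple[mr, nr: int] =
  ## Micro-kernel size (MR, NR), provided the register budget holds.
  assert (mr + 2) * nbVecs <= NumSimdRegs, "C tile + A/B vectors overflow the register file"
  (mr: mr, nr: nbVecs * ScalarsPerSimd)

when isMainModule:
  echo ukernelDims(6, 2)   # (mr: 6, nr: 16): the 6x16 FP32 ukernel listed above
  echo ukernelDims(2, 4)   # (mr: 2, nr: 32): the 2x32 FP32 ukernel listed above
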
@@ -269,11 +269,19 @@ proc deallocTiles[T](tiles: Tiles[T]) =
  if not tiles.b_alloc_mem.isNil:
    deallocShared tiles.b_alloc_mem

- func partitionMNK*(
-   ukernel: static MicroKernel,
-   T: typedesc,
-   M, N, K: Natural,
- ): tuple[mc, nc, kc: int] =
+ proc newTiles*(
+   ukernel: static MicroKernel,
+   T: typedesc,
+   M, N, K: Natural,
+ ): Tiles[T] =
+   # BLIS paper [2] section II Figure 2:
+   # - kc * nr in L1 cache µkernel
+   # - mc * kc in L2 cache Ã
+   # - kc * nc in L3 cache ~B (no L3 in Xeon Phi ¯\_(ツ)_/¯)
+   new result, deallocTiles[T]
+   const
+     nr = ukernel.nr
+     mr = ukernel.mr

  result.nc = N # We don't partition over N
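
The added comment above summarizes the BLIS blocking scheme: a kc*nr panel stays in L1 for the micro-kernel, an mc*kc block of Ã in L2, and a kc*nc panel of ~B in L3. After this change the block sizes are chosen inline in newTiles: N is not partitioned, while M and K are capped by fixed per-element-type budgets (the min( 768 div T.sizeof, M) and min(2048 div T.sizeof, K) lines kept as context in the next hunk). A standalone Nim sketch of that selection; the partition name is hypothetical and only the constants come from the diff:

# Illustrative sketch, not the module's API.
func partition[T](M, N, K: Natural): tuple[mc, nc, kc: int] =
  result.nc = N                           # no partitioning over N
  result.mc = min( 768 div T.sizeof, M)   # M-block capped by a fixed element budget
  result.kc = min(2048 div T.sizeof, K)   # packing depth capped likewise

when isMainModule:
  # For float32 (4 bytes) and a large GEMM: mc = 192, kc = 512.
  echo partition[float32](2000, 2000, 2000)   # (mc: 192, nc: 2000, kc: 512)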

@@ -304,29 +312,9 @@ func partitionMNK*(
  result.mc = min( 768 div T.sizeof, M)
  result.kc = min(2048 div T.sizeof, K)

- func get_num_tiles*(dim_size, tile_size: int): int {.inline.} =
-   ## Get the number of tiles along a dimension depending on the tile size
-   (dim_size + tile_size - 1) div tile_size

- proc newTiles*(
-   ukernel: static MicroKernel,
-   T: typedesc,
-   M, N, K: Natural,
- ): Tiles[T] =
-   # BLIS paper [2] section II Figure 2:
-   # - kc * nr in L1 cache µkernel
-   # - mc * kc in L2 cache Ã
-   # - kc * nc in L3 cache ~B (no L3 in Xeon Phi ¯\_(ツ)_/¯)
-   new result, deallocTiles[T]
-   const
-     nr = ukernel.nr
-     mr = ukernel.mr

-   (result.mc, result.kc, result.nc) = ukernel.partitionMNK(T, M, N, K)

  # Parallel config
  # Ic loop parallel means that each thread will share a panel B and pack a different A
-   result.ic_num_tasks = get_num_tiles(M, result.mc)
+   result.ic_num_tasks = (M+result.mc-1) div result.mc

  # Packing
  # During packing the max size is unroll_stop*kc+kc*LR, LR = MR or NR
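
The last changed line replaces the deleted get_num_tiles helper with its body, a plain integer ceiling division: the number of mc-sized row blocks needed to cover M, counting a final partial block. A quick illustrative check (numTiles is a made-up name, not repo code):

# Illustrative check: (dimSize + tileSize - 1) div tileSize == ceil(dimSize / tileSize).
func numTiles(dimSize, tileSize: int): int =
  (dimSize + tileSize - 1) div tileSize

when isMainModule:
  doAssert numTiles(2000, 192) == 11   # 10 full tiles of 192 rows + 1 partial tile
  doAssert numTiles(1920, 192) == 10   # exact multiple: no extra tile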
