Conversation

@LeiWang1999 (Contributor)
Clean up the interface.

With GEMM_SS, a GEMM kernel can be written as:

    ptx_macro_generator = TensorCorePTXMacroGenerator(
        a_dtype=dtypeAB, b_dtype=dtypeAB, accum_dtype=accum_dtype,
        a_transposed=False, b_transposed=True, block_row_warps=block_row_warps,
        block_col_warps=block_col_warps, warp_row_tiles=warp_row_tiles,
        warp_col_tiles=warp_col_tiles, chunk=chunk, threads=threads
    )
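
    # The generator above emits warp-level Tensor Core (PTX mma) macros,
    # specialized for the tile shape, dtypes, and thread count configured here.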
    @T.prim_func
    def main(
        A: T.Buffer(A_shape, dtypeAB),
        B: T.Buffer(B_shape, dtypeAB),
        C: T.Buffer((M, N), dtypeC),
    ):
        # Launch a 2D grid: bx tiles the N dimension, by tiles the M dimension.
        with T.Kernel(
            T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads
        ) as (bx, by):

            # Shared-memory tiles for the A/B operands and the output epilogue.
            A_shared = T.alloc_shared(A_shared_shape, dtypeAB, scope=shared_scope)
            B_shared = T.alloc_shared(B_shared_shape, dtypeAB, scope=shared_scope)
            C_shared = T.alloc_shared(C_shared_shape, dtypeC, scope=shared_scope)
            # Per-thread accumulator fragments for the Tensor Core tiles.
            C_local = T.alloc_fragment(warp_rows * warp_cols * local_size, accum_dtype, scope="local")
            # Flat thread index; the PTX macros derive warp/lane offsets from it.
            thread_bindings = T.thread_binding(0, threads, "threadIdx.x")

            # Swizzle the shared-memory layouts to avoid bank conflicts.
            T.annotate_layout(
                {
                    A_shared: make_swizzle_layout(A_shared),
                    B_shared: make_swizzle_layout(B_shared),
                }
            )

            # Zero-initialize the accumulator fragments.
            for i in T.serial(warp_rows * warp_cols * local_size):
                C_local[i] = 0

            # Software-pipelined main loop over K tiles.
            for ko in T.Pipelined(K // block_K, num_stages=stage - 1):
                # TODO(lei): storage sync should be injected automatically by TVM Pass
                T.tvm_storage_sync("shared")

                # Load A into shared memory
                for i, k in T.Parallel(block_M, block_K):
                    A_shared[i, k] = A[by * block_M + i, ko * block_K + k]

                # Load B into shared memory
                for j, k in T.Parallel(block_N, block_K):
                    B_shared[j, k] = B[bx * block_N + j, ko * block_K + k]

                # TODO(lei): storage sync should be injected automatically by TVM Pass
                T.tvm_storage_sync("shared")

                # Perform the Tensor Core GEMM; GEMM_SS reads both operands from
                # shared memory (the macro takes the generator as its first argument).
                ptx_macro_generator.GEMM_SS(
                    ptx_macro_generator,
                    A_shared,
                    B_shared,
                    C_local,
                    thread_bindings=thread_bindings,
                )

            # Store the accumulator fragments back to shared memory.
            ptx_macro_generator.STMATRIX(
                ptx_macro_generator,
                C_local,
                C_shared,
                thread_bindings=thread_bindings,
            )

            # Write C back to global memory, unpacking C_shared's
            # (micro-tile row, micro-tile col, intra-tile row, intra-tile col) layout.
            for i, j in T.Parallel(block_M, block_N):
                C[by * block_M + i, bx * block_N + j] = C_shared[
                    i // micro_size_x, j // micro_size_y,
                    i % micro_size_x, j % micro_size_y,
                ]
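
For context, a kernel like this is typically lowered and validated through the TL profiler. The snippet below is a minimal sketch, not part of this PR: it assumes the tvm.tl toolchain bundled with BitBLAS (TL.lower, TL.Profiler, TL.TensorSupplyType) and a hypothetical matmul(M, N, K) factory that returns the main prim_func above; treat those entry points as assumptions.

    import torch
    from tvm import tl as TL  # TL toolchain bundled with BitBLAS; import path is an assumption

    # Hypothetical factory that builds the prim_func above for a given problem size.
    program = matmul(1024, 1024, 1024)

    # Lower the TL prim_func to a runnable module.
    mod, params = TL.lower(program)

    # Validate with integer inputs; buffer index 2 (C) is treated as the output.
    profiler = TL.Profiler(mod, params, [2], TL.TensorSupplyType.Integer)

    # B is stored transposed (b_transposed=True), so the reference is A @ B.T
    # (assuming dtypeC is float16 here).
    def ref_program(A, B):
        return (A.to(torch.float32) @ B.to(torch.float32).T).to(torch.float16)

    profiler.assert_allclose(ref_program)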

LeiWang1999 merged commit b9fab25 into microsoft:main on Sep 6, 2024.