# Matrix Acceleration Engine

In [3]:
from hardware_accelerators.rtllib import (
    SystolicArrayDiP,
    BufferMemory,
    AccumulatorMemoryBank,
    float_adder,
    float_multiplier,
    lmul_fast,
)
from hardware_accelerators.dtypes import *
from pyrtl import WireVector, Register, Input, Output, conditional_assignment, otherwise
import numpy as np
from dataclasses import dataclass
from typing import Type, Callable

## Putting everything together

In [20]:
@dataclass
class AcceleratorConfig:
    """Configuration for a systolic array accelerator.

    Bundles the array dimensions, the floating point formats, the hardware
    generator functions for the arithmetic units, and the accumulator memory
    sizing that together describe one accelerator instance.
    """

    array_size: int
    """Dimension of the systolic array (always square)"""

    data_type: Type[BaseFloat]
    """Floating point format of input data to the systolic array"""

    weight_type: Type[BaseFloat]
    """Floating point format of weight inputs"""

    accum_type: Type[BaseFloat]
    """Floating point format to accumulate values in"""

    pe_adder: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
    """Function to generate adder hardware for the processing elements"""

    accum_adder: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
    """Function to generate adder hardware for the accumulator buffer"""

    pe_multiplier: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector]
    """Function to generate multiplier hardware for the processing elements"""

    pipeline: bool
    """Whether to add a pipeline stage in processing elements between multiplier and adder"""

    accumulator_tiles: int
    """Number of tiles in the accumulator memory; each tile is the size of the systolic array"""

    @property
    def accum_addr_width(self) -> int:
        """Width of the accumulator tile address bus in bits.

        This is the number of bits needed to represent the largest tile
        index, ``accumulator_tiles - 1`` (e.g. 4 tiles -> 2 bits, 5 tiles ->
        3 bits).

        Returns:
            The address width, never less than 1.
        """
        # (0).bit_length() is 0, so a single-tile configuration would yield a
        # zero-width address bus; PyRTL requires a positive bit width, so
        # clamp to one bit.
        return max(1, (self.accumulator_tiles - 1).bit_length())

In [17]:
# Worked example of the address-width formula: with 5 tiles the largest
# index is 4 (0b100), which takes 3 bits to represent.
x = 5
int.bit_length(x - 1)

3

In [None]:
# Example configuration: a 4x4 array using BF16 for data, weights and
# accumulation, `lmul_fast` as the PE multiplier, `float_adder` for both the
# PE and accumulator adders, no PE pipeline stage, and 4 accumulator tiles.
config = AcceleratorConfig(
    array_size=4,
    accumulator_tiles=4,
    pipeline=False,
    data_type=BF16,
    weight_type=BF16,
    accum_type=BF16,
    pe_multiplier=lmul_fast,
    pe_adder=float_adder,
    accum_adder=float_adder,
)

In [None]:
# Buffer control inputs, each 1 bit wide and created in this order:
#   data_start   - start data streaming
#   data_bank    - select data memory bank
#   weight_start - start weight streaming
#   weight_bank  - select weight memory bank
data_start, data_bank, weight_start, weight_bank = (
    Input(1, signal_name)
    for signal_name in ("data_start", "data_bank", "weight_start", "weight_bank")
)

# Buffer memory sized and typed from the accelerator configuration.
buffer = BufferMemory(
    weight_type=config.weight_type,
    data_type=config.data_type,
    array_size=config.array_size,
)

# Drive the buffer's control ports from the top-level inputs.
buffer.connect_inputs(
    weight_bank=weight_bank,
    weight_start=weight_start,
    data_bank=data_bank,
    data_start=data_start,
)

# Output bundle of the buffer, consumed when wiring the systolic array.
buffer_outputs = buffer.get_outputs()

# Systolic array built from the same configuration: array dimension, number
# formats, PE arithmetic generators and the pipeline option.
systolic_array = SystolicArrayDiP(
    pipeline=config.pipeline,
    adder=config.pe_adder,
    multiplier=config.pe_multiplier,
    accum_type=config.accum_type,
    data_type=config.data_type,
    size=config.array_size,
)

# Feed the array from the buffer outputs: the data/weight streams plus their
# valid flags, which act as the array's enable signals.
systolic_array.connect_inputs(
    data_inputs=buffer_outputs.datas_out,
    enable_input=buffer_outputs.data_valid,
    weight_inputs=buffer_outputs.weights_out,
    weight_enable=buffer_outputs.weight_valid,
)


# Accumulator memory bank: tile address width, tile size, number format and
# adder all come from the accelerator configuration.
# NOTE(review): `acumulator` is misspelled ("accumulator"); the name is kept
# here because code outside this view may reference it - rename everywhere
# together.
acumulator = AccumulatorMemoryBank(
    adder=config.accum_adder,
    data_type=config.accum_type,
    array_size=config.array_size,
    tile_addr_width=config.accum_addr_width,
)