# Data and Weight Buffer

Reads an entire array from the simulation code into the buffer at once. The inputs include a start signal, and the buffer emits a done signal once all data/weights have been read.

In [2]:
from hardware_accelerators.rtllib import (
    SystolicArrayDiP,
    AccumulatorMemoryBank,
    float_adder,
    float_multiplier,
    lmul_fast,
)
from hardware_accelerators.dtypes import *

from typing import Type
import numpy as np
from pyrtl import *

## Main Class

In [3]:
class BufferMemory:
    """Double-banked data and weight buffers with row-streaming controllers.

    Two memory banks are created for data and two for weights. Every memory
    word holds one full row: ``array_size`` values packed side by side. The
    data path and the weight path each have an independent controller built
    from an ``active`` flag register and an address register:

    * A start signal seen while the controller is idle sets ``active`` and
      resets the address to 0 for the following cycle. A start signal seen
      while the controller is already active is ignored.
    * While ``active`` is set, the valid output is driven to 1 and the word at
      the current address of the selected bank is split into ``array_size``
      fields that drive the output wires.
    * The address increments each active cycle; when it equals
      ``array_size - 1`` the ``active`` flag is cleared instead.

    Attributes:
        array_size: Number of rows per bank and of values per row.
        addr_width: Bit width of the row address (at least 1).
        d_width / w_width: Bit width of one data / weight value.
        data_mems / weight_mems: Two ``MemBlock`` banks each.
        data_start, data_bank, weight_start, weight_bank: 1-bit control wires,
            driven via :meth:`connect_inputs`.
        data_valid / weight_valid: 1-bit wires set while streaming is active.
        datas_out / weights_out: Lists of ``array_size`` output wires.
    """

    def __init__(
        self, array_size: int, data_type: Type[BaseFloat], weight_type: Type[BaseFloat]
    ):
        """Create the memories, registers and wires and build the control logic.

        Args:
            array_size: Number of rows stored per bank and values per row.
            data_type: Number format of the data values; only its
                ``bitwidth()`` is used here.
            weight_type: Number format of the weight values; only its
                ``bitwidth()`` is used here.

        Raises:
            ValueError: If ``array_size`` is smaller than 1.
        """
        if array_size < 1:
            raise ValueError("array_size must be at least 1")

        # Configuration parameters
        self.array_size = array_size
        # (array_size - 1).bit_length() is 0 for a single-row buffer; clamp to
        # one bit so the memories and address registers keep a valid width.
        self.addr_width = max(1, (array_size - 1).bit_length())
        self.d_width = data_type.bitwidth()
        self.w_width = weight_type.bitwidth()
        self.data_mem_width = self.d_width * array_size
        self.weight_mem_width = self.w_width * array_size

        # Memory Banks
        self.data_mems = [
            MemBlock(bitwidth=self.data_mem_width, addrwidth=self.addr_width)
            for _ in range(2)
        ]
        self.weight_mems = [
            MemBlock(bitwidth=self.weight_mem_width, addrwidth=self.addr_width)
            for _ in range(2)
        ]

        # Control Inputs
        self.data_start = WireVector(1)  # Start data streaming
        self.data_bank = WireVector(1)  # Select data memory bank
        self.weight_start = WireVector(1)  # Start weight streaming
        self.weight_bank = WireVector(1)  # Select weight memory bank

        # State Registers
        self.data_active = Register(1)  # Data streaming in progress
        self.data_addr = Register(self.addr_width)
        self.weight_active = Register(1)  # Weight streaming in progress
        self.weight_addr = Register(self.addr_width)

        # Status Outputs
        self.data_valid = WireVector(1)  # Data output is valid
        self.weight_valid = WireVector(1)  # Weight output is valid

        # Data Outputs
        self.datas_out = [WireVector(self.d_width) for _ in range(array_size)]
        self.weights_out = [WireVector(self.w_width) for _ in range(array_size)]

        # Control Logic
        self._implement_control_logic()

    def _implement_control_logic(self):
        """Implement the control and datapath logic.

        The data and weight paths are structurally identical, so both are
        produced by the same builder with their own signals.
        """
        self._build_stream_controller(
            start=self.data_start,
            active=self.data_active,
            addr=self.data_addr,
            valid=self.data_valid,
            bank=self.data_bank,
            mems=self.data_mems,
            outs=self.datas_out,
            width=self.d_width,
        )
        self._build_stream_controller(
            start=self.weight_start,
            active=self.weight_active,
            addr=self.weight_addr,
            valid=self.weight_valid,
            bank=self.weight_bank,
            mems=self.weight_mems,
            outs=self.weights_out,
            width=self.w_width,
        )

    def _build_stream_controller(
        self, start, active, addr, valid, bank, mems, outs, width
    ):
        """Build one start/active/address streaming controller.

        Args:
            start: 1-bit wire requesting a new streaming pass.
            active: 1-bit register, set while a pass is in progress.
            addr: Address register selecting the row being streamed.
            valid: 1-bit wire driven to 1 while ``active`` is set.
            bank: 1-bit wire; 0 selects ``mems[0]``, anything else ``mems[1]``.
            mems: The two memory banks of this path.
            outs: Output wires, one per value in a row.
            width: Bit width of a single value.
        """
        field_widths = [width] * self.array_size
        last_addr = self.array_size - 1

        with conditional_assignment:
            # Idle -> active: only honoured when no pass is in progress
            with start & ~active:
                active.next |= 1
                addr.next |= 0

            with active:
                # Generate valid signal
                valid |= 1

                # Stream the current row from the selected memory bank
                with bank == 0:
                    for out, field in zip(outs, chop(mems[0][addr], *field_widths)):
                        out |= field
                with otherwise:
                    for out, field in zip(outs, chop(mems[1][addr], *field_widths)):
                        out |= field

                # Address counter and completion logic
                with addr == last_addr:
                    active.next |= 0
                with otherwise:
                    addr.next |= addr + 1

    def connect_inputs(self, data_start, data_bank, weight_start, weight_bank):
        """Connect control signals for the buffer memory.

        Any argument may be ``None``, in which case the corresponding control
        wire is left unconnected by this call.

        Args:
            data_start: Start signal for data streaming (1 bit)
            data_bank: Data memory bank selection (1 bit)
            weight_start: Start signal for weight streaming (1 bit)
            weight_bank: Weight memory bank selection (1 bit)
        """
        if data_start is not None:
            assert len(data_start) == 1, "data_start must be 1 bit wide"
            self.data_start <<= data_start

        if data_bank is not None:
            assert len(data_bank) == 1, "data_bank must be 1 bit wide"
            self.data_bank <<= data_bank

        if weight_start is not None:
            assert len(weight_start) == 1, "weight_start must be 1 bit wide"
            self.weight_start <<= weight_start

        if weight_bank is not None:
            assert len(weight_bank) == 1, "weight_bank must be 1 bit wide"
            self.weight_bank <<= weight_bank

## Simulation Class

In [4]:
from typing import List


class BufferMemorySimulator:
    """Simulator for BufferMemory with dual banks for data and weights.

    Call :meth:`setup` first; it builds the hardware and the PyRTL
    ``Simulation``. Then use :meth:`load_memories` to fill a bank and
    :meth:`stream_data` / :meth:`stream_weights` to read it back row by row.
    """

    def __init__(
        self,
        array_size: int,
        data_type: Type[BaseFloat] = BF16,
        weight_type: Type[BaseFloat] = BF16,
    ):
        """Initialize simulator configuration

        Args:
            array_size: Dimension of systolic array (NxN)
            data_type: Number format for data (default: BF16)
            weight_type: Number format for weights (default: BF16)
        """
        self.array_size = array_size
        self.data_type = data_type
        self.weight_type = weight_type

        # Store configuration (passed verbatim to BufferMemory in setup())
        self.config = {
            "array_size": array_size,
            "data_type": data_type,
            "weight_type": weight_type,
        }
        self.sim = None

    def setup(self):
        """Initialize PyRTL simulation environment

        Resets the PyRTL working block, creates the four control inputs,
        instantiates the buffer and creates the simulation.

        Returns:
            This simulator, so the call can be chained after construction.
        """
        reset_working_block()

        # Input control ports
        self._data_start = Input(1, "data_start")
        self._data_select = Input(1, "data_select")
        self._weight_start = Input(1, "weight_start")
        self._weight_select = Input(1, "weight_select")

        # Create buffer memory
        self.buffer = BufferMemory(**self.config)
        self.buffer.connect_inputs(
            self._data_start, self._data_select, self._weight_start, self._weight_select
        )

        # Create simulation
        self.sim = Simulation()
        return self

    def _get_default_inputs(self, updates: dict | None = None) -> dict:
        """Get dictionary of default input values with optional updates

        Args:
            updates: Input values overriding the all-zero defaults.

        Returns:
            A fresh dictionary holding a value for every simulation input.
        """
        defaults = {
            "data_start": 0,
            "data_select": 0,
            "weight_start": 0,
            "weight_select": 0,
        }
        if updates:
            defaults.update(updates)
        return defaults

    def vec_to_binary(self, vec, dtype: Type[BaseFloat]):
        """Convert vector to concatenated binary representation

        The first element of ``vec`` ends up in the most-significant field and
        the last element in the least-significant field.

        Args:
            vec: Sequence of values to encode.
            dtype: Number format used to encode each value.

        Returns:
            Integer holding all encoded values packed side by side.
        """
        field_width = dtype.bitwidth()
        concatenated = 0
        for position, value in enumerate(vec[::-1]):
            concatenated += dtype(value).binint << (position * field_width)
        return concatenated

    def _require_sim(self):
        """Return the active simulation or raise if setup() was not called."""
        if self.sim is None:
            raise RuntimeError("Simulator not initialized. Call setup() first")
        return self.sim

    def _load_bank(self, banks, bank, matrix, dtype, label, check_bounds) -> None:
        """Write ``matrix`` into ``banks[bank]``, one packed row per address.

        Rows are written in reverse order, so address 0 holds the last row.
        """
        matrix = np.asarray(matrix)
        if check_bounds and matrix.shape != (self.array_size, self.array_size):
            raise ValueError(f"{label} must be {self.array_size}x{self.array_size}")
        contents = self._require_sim().inspect_mem(banks[bank])
        for address, row in enumerate(matrix[::-1]):
            contents[address] = self.vec_to_binary(row, dtype)

    def load_memories(
        self,
        data_bank: int | None = None,
        weight_bank: int | None = None,
        data: np.ndarray | None = None,
        weights: np.ndarray | None = None,
        check_bounds: bool = True,
    ) -> None:
        """Load data and weights into specified memory banks

        Args:
            data_bank: Bank index for ``data``; required when ``data`` is given.
            weight_bank: Bank index for ``weights``; required when ``weights``
                is given.
            data: Matrix to store in the data bank, or None to skip.
            weights: Matrix to store in the weight bank, or None to skip.
            check_bounds: When True, reject matrices that are not
                ``array_size`` x ``array_size``.

        Raises:
            RuntimeError: If setup() has not been called.
            ValueError: If a matrix is given without its bank index, or fails
                the shape check.
        """
        self._require_sim()

        if data is not None:
            if data_bank is None:
                raise ValueError("data_bank is required when data is provided")
            self._load_bank(
                self.buffer.data_mems,
                data_bank,
                data,
                self.data_type,
                "Data",
                check_bounds,
            )

        if weights is not None:
            if weight_bank is None:
                raise ValueError("weight_bank is required when weights is provided")
            self._load_bank(
                self.buffer.weight_mems,
                weight_bank,
                weights,
                self.weight_type,
                "Weights",
                check_bounds,
            )

    def _stream(self, start_input, select_input, bank, wires, dtype):
        """Pulse a start input, then capture the outputs for array_size cycles.

        Captured vectors are returned in the reverse of the order in which
        they were read, which undoes the row reversal done when loading.
        """
        sim = self._require_sim()

        # Start streaming
        sim.step(self._get_default_inputs({start_input: 1, select_input: bank}))

        # Stream for array_size cycles, keeping the bank selection stable
        captured = []
        for _ in range(self.array_size):
            sim.step(self._get_default_inputs({select_input: bank}))
            captured.append(
                [float(dtype(binint=sim.inspect(wire.name))) for wire in wires]
            )

        captured.reverse()
        return captured

    def stream_data(
        self,
        data_bank: int,
    ) -> List[List[float]]:
        """Stream data from memory to systolic array

        Args:
            data_bank: Data memory bank to read from

        Returns:
            List of data vectors streamed to systolic array

        Raises:
            RuntimeError: If setup() has not been called.
        """
        self._require_sim()
        return self._stream(
            "data_start",
            "data_select",
            data_bank,
            self.buffer.datas_out,
            self.data_type,
        )

    def stream_weights(
        self,
        weight_bank: int,
    ) -> List[List[float]]:
        """Stream weights from memory to systolic array

        Args:
            weight_bank: Weight memory bank to read from

        Returns:
            List of weight vectors streamed to systolic array

        Raises:
            RuntimeError: If setup() has not been called.
        """
        self._require_sim()
        return self._stream(
            "weight_start",
            "weight_select",
            weight_bank,
            self.buffer.weights_out,
            self.weight_type,
        )

## Testing the simulation class

In [5]:
def test_buffer_memory():
    """Load a 3x3 data and weight matrix into bank 0 and stream both back.

    Prints every streamed vector and asserts that the streamed matrices match
    the loaded ones within a relative tolerance of 0.01.
    """
    dim = 3

    def show(vectors):
        # One line per streamed vector, in the order they were returned
        for step, vector in enumerate(vectors):
            print(f"Step {step}: {np.array2string(np.array(vector), precision=4)}")

    # Simulator under test
    simulator = BufferMemorySimulator(array_size=dim)
    simulator = simulator.setup()

    # Stimulus matrices
    data = np.array(
        [
            [1.0, 2.0, 3.0],
            [4.0, 5.0, 6.0],
            [7.0, 8.0, 9.0],
        ]
    )
    weights = np.array(
        [
            [0.1, 0.2, 0.3],
            [0.4, 0.5, 0.6],
            [0.7, 0.8, 0.9],
        ]
    )

    print("\nLoading memories...")
    simulator.load_memories(data_bank=0, weight_bank=0, data=data, weights=weights)

    print("\nStreaming data...")
    streamed_data = simulator.stream_data(data_bank=0)
    print("\nData vectors streamed:")
    show(streamed_data)

    print("\nStreaming weights...")
    streamed_weights = simulator.stream_weights(weight_bank=0)
    print("\nWeight vectors streamed:")
    show(streamed_weights)

    # The streamed rows are expected back in the same order they were loaded
    print("\nVerifying results...")
    np.testing.assert_allclose(np.array(streamed_data), data, rtol=0.01)
    np.testing.assert_allclose(np.array(streamed_weights), weights, rtol=0.01)
    print("All tests passed!")


# Run the buffer memory self-test when this code is executed as the main module.
if __name__ == "__main__":
    test_buffer_memory()


Loading memories...

Streaming data...

Data vectors streamed:
Step 0: [1. 2. 3.]
Step 1: [4. 5. 6.]
Step 2: [7. 8. 9.]

Streaming weights...

Weight vectors streamed:
Step 0: [0.0996 0.1992 0.2988]
Step 1: [0.3984 0.5    0.5977]
Step 2: [0.6992 0.7969 0.8984]

Verifying results...
All tests passed!


## Control Unit

The control unit will contain an instruction memory and a decoder that sends the appropriate signals to the hardware units (data/weight buffers, systolic array, accumulator, and activation module).

Design a simple instruction set that supports the following operations:

- load weight
- load data into buffer
- multiply by data buffer (X) and write the result to accumulator tile (Y) in the selected mode (overwrite or accumulate)
- apply the activation to accumulator tile (Y)

VLIW instructions might be a better fit, since they would allow better pipelining of the hardware resources.
Weights must be loaded separately from data.

Dataflow path:

`Simulation (python) -> data buffer -> systolic array -> accumulator -> activation -> Simulation (python)`