# Generating Hardware Blocks for Analysis


## Setup and helpers


In [1]:
import pyrtl
from pyrtl import *
from dataclasses import dataclass
from typing import Callable, Type, Literal, Optional
from hardware_accelerators.dtypes import *
from hardware_accelerators.rtllib import *
from hardware_accelerators.rtllib.processing_element import ProcessingElement
from hardware_accelerators.rtllib.adders import *
from hardware_accelerators.rtllib.multipliers import *
from hardware_accelerators.rtllib.lmul import *
from hardware_accelerators.rtllib.utils.common import *
from hardware_accelerators.simulation.utils import *

In [2]:
# Supported float formats, keyed by their total bitwidth.
dtype_map = {8: Float8, 16: BF16, 32: Float32}
dtype_list = list(dtype_map.values())

# (weight bits, activation bits) combinations: every pairing in which the
# activation format is at least as wide as the weight format.
w_a_pairs = [(w, a) for w in dtype_map for a in dtype_map if w <= a]
w_a_dtypes = [(dtype_map[w], dtype_map[a]) for w, a in w_a_pairs]

In [3]:
def create_inputs(*bitwidths, **named_bitwidths):
    """
    Create PyRTL Input wires with specified bitwidths.

    Args:
        *bitwidths: Variable number of bitwidths for unnamed inputs
        **named_bitwidths: Named bitwidths where the key is used as the wire name

    Returns:
        Generator of PyRTL Input wires. Wires are created lazily, in argument
        order, as the generator is consumed (e.g. by ``*`` unpacking).

    Raises:
        ValueError: If positional and keyword arguments are mixed. The check is
            performed at call time rather than on first iteration, so a bad
            call fails even if the result is never consumed.

    Note:
        You must use either all positional arguments or all keyword arguments, not a mix.
    """
    # Validate eagerly: this is a plain function returning a generator, not a
    # generator function, so the error cannot be deferred or silently skipped.
    if bitwidths and named_bitwidths:
        raise ValueError(
            "Please use either all positional arguments or all keyword arguments, not a mix."
        )

    if named_bitwidths:
        return (
            pyrtl.Input(bitwidth, name=name)
            for name, bitwidth in named_bitwidths.items()
        )

    # Covers both the positional case and the empty call (yields nothing).
    return (pyrtl.Input(bitwidth) for bitwidth in bitwidths)


def create_outputs(*args, **named_wires):
    """
    Drive a freshly created PyRTL Output from each of the given wires.

    Every Output is sized with ``len(wire)`` of the wire that feeds it.

    Args:
        *args: Wires to expose through unnamed outputs
        **named_wires: Wires to expose through outputs named after the keyword

    Raises:
        ValueError: If positional and keyword arguments are mixed.

    Note:
        You must use either all positional arguments or all keyword arguments, not a mix.
    """
    if args and named_wires:
        raise ValueError(
            "Please use either all positional arguments or all keyword arguments, not a mix."
        )

    # Keyword form: the keyword becomes the output's name.
    if named_wires:
        for label, source in named_wires.items():
            port = pyrtl.Output(len(source), name=label)
            port <<= source
        return

    # Positional form: outputs are left unnamed.
    for source in args:
        port = pyrtl.Output(len(source))
        port <<= source


@dataclass
class RTLAnalysis:
    """Timing and area figures for one analyzed block, with an optional label.

    The repr prints the delay in ps, the frequency in MHz and both areas in
    um², each with two decimals. A labelled result is rendered on several
    lines; an unlabelled one on a single line.
    """

    max_delay: float
    max_freq: float
    logic_area: float
    mem_area: float
    name: Optional[str] = None

    def __repr__(self):
        # Both layouts show the same four fields; only the framing differs.
        fields = (
            f"max_delay={self.max_delay:.2f} ps",
            f"max_freq={self.max_freq:.2f} MHz",
            f"logic_area={self.logic_area:.2f}um²",
            f"mem_area={self.mem_area:.2f}um²",
        )
        if self.name is not None:
            return f"RTLAnalysisResults for {self.name}:\n\t" + "\n\t".join(fields)
        return "RTLAnalysisResults(" + ", ".join(fields) + ")"


def analyze(
    block: Block | None = None, synth: bool = True, opt: bool = True, name=None
):
    """Run PyRTL timing analysis and area estimation and package the results.

    Args:
        block: Block to analyze. When given it is installed as the global
            working block (and stays installed after the call); when None the
            current working block is analyzed.
        synth: Call ``pyrtl.synthesize()`` before measuring.
        opt: Call ``pyrtl.optimize()`` before measuring.
        name: Optional label stored on the returned result.

    Returns:
        RTLAnalysis holding the timing figures as reported by PyRTL and both
        areas multiplied by 1e6.
    """
    if block is not None:
        pyrtl.set_working_block(block)

    if synth:
        pyrtl.synthesize()
    if opt:
        pyrtl.optimize()

    timing_report = pyrtl.TimingAnalysis()
    critical_path = timing_report.max_length()
    clock_limit = timing_report.max_freq()

    # NOTE(review): the 1e6 factor presumably converts mm² (PyRTL's unit for
    # area_estimation) into the um² printed by RTLAnalysis — confirm in the docs.
    area_scale = 1e6
    raw_logic_area, raw_mem_area = pyrtl.area_estimation()

    return RTLAnalysis(
        max_delay=critical_path,
        max_freq=clock_limit,
        logic_area=raw_logic_area * area_scale,
        mem_area=raw_mem_area * area_scale,
        name=name,
    )

# Adders


In [4]:
def create_adder_blocks(dtype: Type[BaseFloat], fast: bool) -> dict[str, Block]:
    """Build one PyRTL block per floating-point adder variant and pipeline stage.

    Each block gets fresh Input wires feeding the unit under test and Output
    wires on everything it returns, so it can be analyzed on its own.

    Args:
        dtype: Float format whose bitwidth, exponent bits and mantissa bits
            size the wires.
        fast: Forwarded to the adder functions that accept a ``fast`` flag.

    Returns:
        Mapping from block label to the populated block, in build order.
    """
    width = dtype.bitwidth()
    exp_width, man_width = dtype.exponent_bits(), dtype.mantissa_bits()

    # Allocate every block up front; each is filled in below, in this order.
    blocks = {
        label: pyrtl.Block()
        for label in (
            "adder_combinational",
            "adder_pipelined",
            "adder_stage_2",
            "adder_stage_3",
            "adder_stage_4",
            "adder_stage_5",
        )
    }

    # Combinational design
    with set_working_block(blocks["adder_combinational"]):
        operands = create_inputs(width, width)
        create_outputs(*float_adder(*operands, dtype=dtype, fast=fast))

    # Complete pipelined design
    with set_working_block(blocks["adder_pipelined"]):
        operands = create_inputs(width, width)
        create_outputs(float_adder_pipelined(*operands, dtype=dtype, fast=fast))

    # Stages 1 & 2
    with set_working_block(blocks["adder_stage_2"]):
        components = extract_float_components(
            *create_inputs(width, width), e_bits=exp_width, m_bits=man_width
        )
        create_outputs(*adder_stage_2(*components, exp_width, man_width, fast))

    # Stage 3: alignment and SGR bit generation
    with set_working_block(blocks["adder_stage_3"]):
        stage_3_inputs = create_inputs(man_width + 1, exp_width)
        create_outputs(
            *adder_stage_3(*stage_3_inputs, e_bits=exp_width, m_bits=man_width)
        )

    # Stage 4: mantissa addition and leading zero detection
    with set_working_block(blocks["adder_stage_4"]):
        stage_4_inputs = create_inputs(man_width + 1, man_width + 1, 1)
        create_outputs(*adder_stage_4(*stage_4_inputs, m_bits=man_width, fast=fast))

    # Stage 5: normalization, rounding, and final assembly
    with set_working_block(blocks["adder_stage_5"]):
        stage_5_widths = (
            man_width + 2,  # abs_mantissa
            1,  # sticky_bit
            1,  # guard_bit
            1,  # round_bit
            4,  # lzc
            exp_width,  # exp_larger
            1,  # sign_a
            1,  # sign_b
            exp_width + 1,  # exp_diff
            1,  # is_neg
        )
        create_outputs(
            *adder_stage_5(
                *create_inputs(*stage_5_widths), e_bits=exp_width, m_bits=man_width
            )
        )

    return blocks

In [5]:
# Build every Float8 adder variant (fast=True) and print the timing/area
# figures that analyze() returns for each block.
adder_blocks = create_adder_blocks(Float8, fast=True)

for name, block in adder_blocks.items():
    results = analyze(block, name=name)
    print(results, "\n")

RTLAnalysisResults for adder_combinational:
	max_delay=4106.58 ps
	max_freq=222.74 MHz
	logic_area=3568.44um²
	mem_area=0.00um² 

RTLAnalysisResults for adder_pipelined:
	max_delay=1908.24 ps
	max_freq=436.44 MHz
	logic_area=6453.85um²
	mem_area=0.00um² 

RTLAnalysisResults for adder_stage_2:
	max_delay=1230.84 ps
	max_freq=619.64 MHz
	logic_area=818.93um²
	mem_area=0.00um² 

RTLAnalysisResults for adder_stage_3:
	max_delay=1565.30 ps
	max_freq=513.27 MHz
	logic_area=496.58um²
	mem_area=0.00um² 

RTLAnalysisResults for adder_stage_4:
	max_delay=1445.68 ps
	max_freq=546.84 MHz
	logic_area=1190.06um²
	mem_area=0.00um² 

RTLAnalysisResults for adder_stage_5:
	max_delay=1711.24 ps
	max_freq=477.50 MHz
	logic_area=1080.29um²
	mem_area=0.00um² 



# Multipliers


In [6]:
def create_multiplier_blocks(dtype: Type[BaseFloat], fast: bool) -> dict[str, Block]:
    """Build one PyRTL block per floating-point multiplier variant and stage.

    Each block gets fresh Input wires feeding the unit under test and Output
    wires on everything it returns, so it can be analyzed on its own.

    Args:
        dtype: Float format whose bitwidth, exponent bits and mantissa bits
            size the wires.
        fast: Forwarded to every multiplier function.

    Returns:
        Mapping from block label to the populated block, in build order.
    """
    width = dtype.bitwidth()
    exp_width, man_width = dtype.exponent_bits(), dtype.mantissa_bits()
    product_width = 2 * man_width + 2  # width used for the mantissa product wires

    # Allocate every block up front; each is filled in below, in this order.
    blocks = {
        label: pyrtl.Block()
        for label in ("combinational", "multiplier", "stage_2", "stage_3", "stage_4")
    }

    # Combinational design
    with set_working_block(blocks["combinational"]):
        operands = create_inputs(width, width)
        create_outputs(float_multiplier(*operands, dtype=dtype, fast=fast))

    # Complete pipelined design; only its result wire is exposed
    with set_working_block(blocks["multiplier"]):
        pipelined_unit = FloatMultiplierPipelined(
            *create_inputs(width, width), dtype=dtype, fast=fast
        )
        create_outputs(pipelined_unit._result)

    # Stage 1 & 2: Extract components and calculate sign, exponent sum, mantissa product
    with set_working_block(blocks["stage_2"]):
        components = extract_float_components(
            *create_inputs(width, width), e_bits=exp_width, m_bits=man_width
        )
        create_outputs(*multiplier_stage_2(*components, man_width, fast))

    # Stage 3: Leading zero detection and exponent adjustment
    with set_working_block(blocks["stage_3"]):
        # inputs: exp_sum, mantissa_product
        stage_3_inputs = create_inputs(exp_width + 1, product_width)
        create_outputs(
            *multiplier_stage_3(
                *stage_3_inputs, e_bits=exp_width, m_bits=man_width, fast=fast
            )
        )

    # Stage 4: Normalization, rounding, and final assembly
    with set_working_block(blocks["stage_4"]):
        # inputs: unbiased_exp, leading_zeros, mantissa_product
        stage_4_inputs = create_inputs(exp_width, exp_width, product_width)
        create_outputs(
            *multiplier_stage_4(
                *stage_4_inputs, m_bits=man_width, e_bits=exp_width, fast=fast
            )
        )

    return blocks

In [7]:
# Build every Float8 multiplier variant (fast=True) and print the timing/area
# figures that analyze() returns for each block.
multiplier_blocks = create_multiplier_blocks(Float8, fast=True)

for name, block in multiplier_blocks.items():
    results = analyze(block, name=name)
    print(results, "\n")

RTLAnalysisResults for combinational:
	max_delay=4906.51 ps
	max_freq=189.05 MHz
	logic_area=3023.06um²
	mem_area=0.00um² 

RTLAnalysisResults for multiplier:
	max_delay=1828.47 ps
	max_freq=452.19 MHz
	logic_area=3800.17um²
	mem_area=0.00um² 

RTLAnalysisResults for stage_2:
	max_delay=1394.44 ps
	max_freq=562.61 MHz
	logic_area=1062.86um²
	mem_area=0.00um² 

RTLAnalysisResults for stage_3:
	max_delay=1585.10 ps
	max_freq=508.10 MHz
	logic_area=670.82um²
	mem_area=0.00um² 

RTLAnalysisResults for stage_4:
	max_delay=1828.47 ps
	max_freq=452.19 MHz
	logic_area=1210.97um²
	mem_area=0.00um² 



# L-mul


In [8]:
def create_lmul_blocks(dtype: Type[BaseFloat]) -> dict[str, Block]:
    """Build one PyRTL block per L-mul variant (combinational/pipelined x simple/fast).

    Args:
        dtype: Float format; its bitwidth sizes the two operand inputs.

    Returns:
        Mapping from block label to the populated block, in build order.
    """
    width = dtype.bitwidth()

    # Allocate every block up front; each is filled in below, in this order.
    blocks = {
        label: pyrtl.Block()
        for label in (
            "combinational_simple",
            "combinational_fast",
            "pipelined_simple",
            "pipelined_fast",
        )
    }

    # Combinational designs: the function's return value is exposed directly.
    for label, combinational_fn in (
        ("combinational_simple", lmul_simple),
        ("combinational_fast", lmul_fast),
    ):
        with set_working_block(blocks[label]):
            create_outputs(combinational_fn(*create_inputs(width, width), dtype=dtype))

    # Pipelined designs: the unit's output register is exposed.
    for label, use_fast in (("pipelined_simple", False), ("pipelined_fast", True)):
        with set_working_block(blocks[label]):
            unit = LmulPipelined(
                *create_inputs(width, width), dtype=dtype, fast=use_fast
            )
            create_outputs(unit.output_reg)

    return blocks

In [9]:
# Build every Float8 L-mul variant and print the timing/area figures that
# analyze() returns for each block.
lmul_blocks = create_lmul_blocks(Float8)

for name, block in lmul_blocks.items():
    results = analyze(block, name=name)
    print(results, "\n")

RTLAnalysisResults for combinational_simple:
	max_delay=1962.64 ps
	max_freq=426.32 MHz
	logic_area=635.98um²
	mem_area=0.00um² 

RTLAnalysisResults for combinational_fast:
	max_delay=1406.37 ps
	max_freq=558.86 MHz
	logic_area=1036.73um²
	mem_area=0.00um² 

RTLAnalysisResults for pipelined_simple:
	max_delay=2223.24 ps
	max_freq=383.69 MHz
	logic_area=1998.53um²
	mem_area=0.00um² 

RTLAnalysisResults for pipelined_fast:
	max_delay=1085.34 ps
	max_freq=681.04 MHz
	logic_area=2103.08um²
	mem_area=0.00um² 



# Processing Element


In [10]:
def connect_pe_io(pe: ProcessingElement):
    """Attach named top-level I/O wires to a processing element.

    Creates the ``weight_in``, ``data_in`` and ``accum_in`` inputs (the
    accumulator input uses the data type's bitwidth), four 1-bit enable inputs,
    and one unnamed output per attribute stored on ``pe.outputs``.

    Args:
        pe: Processing element to wire up inside the current working block.
    """
    weight_width = pe.weight_type.bitwidth()
    act_width = pe.data_type.bitwidth()

    weight_port, data_port, accum_port = create_inputs(
        weight_in=weight_width, data_in=act_width, accum_in=act_width
    )
    pe.connect_weight(weight_port)
    pe.connect_data(data_port)
    pe.connect_accum(accum_port)

    # Enable signals are passed positionally in this order.
    enables = create_inputs(weight_en=1, data_en=1, mul_en=1, adder_en=1)
    pe.connect_control_signals(*enables)

    create_outputs(*vars(pe.outputs).values())


def create_pe_blocks(
    dtypes: tuple[Type[BaseFloat], Type[BaseFloat]]
) -> dict[str, Block]:
    """Build one block per processing-element variant for a dtype pair.

    Five variants use the standard float multipliers and five use the L-mul
    multipliers. In every variant the activation dtype is used for both the
    data and accumulator types.

    Args:
        dtypes: ``(weight_dtype, activation_dtype)``.

    Returns:
        Mapping from variant label to the populated block, in build order.
    """
    weight_dtype, act_dtype = dtypes

    # (label, multiplier, adder, pipeline_mult)
    variants = (
        # Standard IEEE multiplier versions
        ("combinational", float_multiplier, float_adder, False),
        ("standard", float_multiplier, float_adder, True),
        ("fast", float_multiplier_fast_unstable, float_adder_fast_unstable, True),
        ("pipelined", float_multiplier_pipelined, float_adder_pipelined, True),
        (
            "fast_pipelined",
            float_multiplier_pipelined_fast_unstable,
            float_adder_pipelined_fast_unstable,
            True,
        ),
        # L-mul versions
        ("combinational_lmul", lmul_simple, float_adder, False),
        ("standard_lmul", lmul_simple, float_adder, True),
        ("fast_lmul", lmul_fast, float_adder_fast_unstable, True),
        ("pipelined_lmul", lmul_pipelined, float_adder_pipelined, True),
        (
            "fast_pipelined_lmul",
            lmul_pipelined_fast,
            float_adder_pipelined_fast_unstable,
            True,
        ),
    )

    # Allocate all blocks first, then populate them one by one in table order.
    blocks = {label: Block() for label, *_ in variants}

    for label, multiplier_fn, adder_fn, pipeline_mult in variants:
        with set_working_block(blocks[label]):
            element = ProcessingElement(
                data_type=act_dtype,
                weight_type=weight_dtype,
                accum_type=act_dtype,
                multiplier=multiplier_fn,
                adder=adder_fn,
                pipeline_mult=pipeline_mult,
            )
            connect_pe_io(element)

    return blocks

In [11]:
# Reset the global working block (analyze() leaves the last analyzed block
# installed), then build every Float8/Float8 processing-element variant and
# print the timing/area figures that analyze() returns for each block.
reset_working_block()
pe_blocks = create_pe_blocks((Float8, Float8))

for name, block in pe_blocks.items():
    results = analyze(block, name=name)
    print(results, "\n")

Pipelining disabled, no product register to enable. Deleting wire.
Pipelining disabled, no product register to enable. Deleting wire.
RTLAnalysisResults for combinational:
	max_delay=12953.76 ps
	max_freq=74.98 MHz
	logic_area=7992.39um²
	mem_area=0.00um² 

RTLAnalysisResults for standard:
	max_delay=7848.58 ps
	max_freq=121.48 MHz
	logic_area=8452.38um²
	mem_area=0.00um² 

RTLAnalysisResults for fast:
	max_delay=5110.31 ps
	max_freq=182.04 MHz
	logic_area=8431.47um²
	mem_area=0.00um² 

RTLAnalysisResults for pipelined:
	max_delay=3226.71 ps
	max_freq=277.03 MHz
	logic_area=12114.91um²
	mem_area=0.00um² 

RTLAnalysisResults for fast_pipelined:
	max_delay=2112.04 ps
	max_freq=400.80 MHz
	logic_area=12094.00um²
	mem_area=0.00um² 

RTLAnalysisResults for combinational_lmul:
	max_delay=8835.42 ps
	max_freq=108.48 MHz
	logic_area=5596.59um²
	mem_area=0.00um² 

RTLAnalysisResults for standard_lmul:
	max_delay=7848.58 ps
	max_freq=121.48 MHz
	logic_area=6056.58um²
	mem_area=0.00um² 

RTLAnaly

# Accelerator


## New top level class


In [12]:
@dataclass
class AcceleratorAnalysisConfig:
    """Describes one accelerator variant to generate for analysis.

    After construction the instance also carries three derived attributes set
    by ``__post_init__``: ``pipeline_pe``, ``multiplier_func`` and
    ``adder_func``.
    """

    array_size: int
    """
    Side length N of the N x N systolic array.
    """

    weight_type: Type[BaseFloat]
    """
    Floating-point format for weights.
    Must be a subclass of BaseFloat (e.g., Float8, BF16, Float32).
    """

    activation_type: Type[BaseFloat]
    """
    Floating-point format for activations/inputs.
    Must be a subclass of BaseFloat (e.g., Float8, BF16, Float32) and at least
    as wide as ``weight_type``.
    """

    lmul: bool
    """
    Selects the multiplier family: True picks one of the ``lmul_*`` functions,
    False picks one of the ``float_multiplier*`` functions.
    """

    pipeline_level: Literal["low", "high"] | None
    """
    The level of pipelining in the accelerator:
    - None: No pipelining (fully combinational design)
    - 'low': Basic pipelining between multiplier and adder in each PE
    - 'high': Full pipelining with pipelined arithmetic units
    """

    use_fast_internals: bool
    """
    Whether to use faster basic arithmetic implementations with more complex low-level RTL.
    - True: uses optimized arithmetic units from PyRTL's rtllib
    - False: prioritize simplicity over speed

    WARNING: Setting to True could potentially make final synthesis on the Verilog output worse as the synthesis tools will not be able to infer optimal circuits from the complex low-level RTL.
    """

    accum_addr_width: int = 12
    """
    Bit width of the accumulator address (2^width entries).
    Defaults to 12 bits (4096 entries).
    """

    def __post_init__(self):
        """Validate the dtype pair and derive the arithmetic functions to use.

        Raises:
            ValueError: If the activation dtype is narrower than the weight dtype.
        """
        activation_bits = self.activation_type.bitwidth()
        weight_bits = self.weight_type.bitwidth()
        if activation_bits < weight_bits:
            raise ValueError(
                f"Activation dtype bitwidth ({activation_bits}) must be greater than or equal to "
                f"weight dtype bitwidth ({weight_bits})"
            )

        # Any pipeline level (low or high) turns on the PE-level pipeline flag.
        self.pipeline_pe = self.pipeline_level is not None

        # Only the "high" level swaps in the pipelined arithmetic functions.
        fully_pipelined = self.pipeline_level == "high"

        # Keyed by (lmul, fully_pipelined, use_fast_internals).
        multipliers = {
            (False, False, False): float_multiplier,
            (False, False, True): float_multiplier_fast_unstable,
            (False, True, False): float_multiplier_pipelined,
            (False, True, True): float_multiplier_pipelined_fast_unstable,
            (True, False, False): lmul_simple,
            (True, False, True): lmul_fast,
            (True, True, False): lmul_pipelined,
            (True, True, True): lmul_pipelined_fast,
        }

        # Keyed by (fully_pipelined, use_fast_internals).
        adders = {
            (False, False): float_adder,
            (False, True): float_adder_fast_unstable,
            (True, False): float_adder_pipelined,
            (True, True): float_adder_pipelined_fast_unstable,
        }

        multiplier_key = (self.lmul, fully_pipelined, self.use_fast_internals)
        adder_key = (fully_pipelined, self.use_fast_internals)
        self.multiplier_func = multipliers[multiplier_key]
        self.adder_func = adders[adder_key]

    @property
    def name(self):
        """Short identifier for this configuration, e.g. ``w8a8-2x2-ieee-fast-pipePE``.

        The address-width segment (``-m<width>``) only appears when the width
        differs from 12, and no pipeline suffix is added when ``pipeline_level``
        is None.
        """

        def dtype_tag(dtype):
            # BF16 is spelled "b16"; every other dtype is tagged by its bitwidth.
            return dtype.bitwidth() if dtype != BF16 else "b16"

        pipeline_suffixes = {"low": "-pipePE", "high": "-pipeALL"}
        segments = [
            f"w{dtype_tag(self.weight_type)}",
            f"a{dtype_tag(self.activation_type)}",
            f"-{self.array_size}x{self.array_size}",
            f"-m{self.accum_addr_width}" if self.accum_addr_width != 12 else "",
            "-lmul" if self.lmul else "-ieee",
            "-fast" if self.use_fast_internals else "",
            pipeline_suffixes.get(self.pipeline_level, ""),  # type: ignore
        ]
        return "".join(segments)

In [13]:
from hardware_accelerators.rtllib.accumulators import Accumulator
from hardware_accelerators.rtllib.activations import ReluUnit


class AcceleratorTopLevel(CompiledAccelerator):
    """Top-level accelerator built from an AcceleratorAnalysisConfig.

    Instantiates a systolic array, an accumulator and a ReLU unit, creates the
    named ``out_<i>`` and ``valid_out`` Output wires, and connects the
    components together.

    NOTE(review): ``_connect_components`` is not defined in this class, so it is
    presumably inherited from CompiledAccelerator, and
    ``CompiledAccelerator.__init__`` is deliberately not called here — confirm
    against the base class.
    """

    def __init__(self, config: AcceleratorAnalysisConfig):
        """Build all hardware for ``config`` in the current working block.

        Args:
            config: Array size, dtypes, arithmetic functions, pipeline flag and
                accumulator address width for this accelerator.
        """
        self.config = config

        # Instantiate hardware components
        self.systolic_array = SystolicArrayDiP(
            size=config.array_size,
            data_type=config.activation_type,
            weight_type=config.weight_type,
            accum_type=config.activation_type,
            multiplier=config.multiplier_func,
            adder=config.adder_func,
            pipeline=config.pipeline_pe,
        )
        self.accumulator = Accumulator(
            # Use the configured width (previously hard-coded to 12) so the
            # accumulator agrees with the accum_addr_in port and with the
            # "-m<width>" segment of config.name.
            addr_width=config.accum_addr_width,
            array_size=config.array_size,
            data_type=config.activation_type,
            adder=config.adder_func,
        )
        self.activation = ReluUnit(
            size=config.array_size,
            dtype=config.activation_type,
        )
        # One named output per array column, each as wide as the activation dtype.
        self.outputs = [
            Output(config.activation_type.bitwidth(), f"out_{i}")
            for i in range(config.array_size)
        ]

        # Connect everything together and create io ports
        self._connect_components()
        self.valid_out = Output(1, "valid_out")
        self.valid_out <<= self.activation.outputs_valid

    def _create_control_wires(self):
        """Create named Input wires for control signals.

        Data inputs use the activation dtype's bitwidth, weight inputs the
        weight dtype's bitwidth, and the accumulator address input uses
        ``config.accum_addr_width``; all remaining controls are 1 bit wide.
        """
        self.data_enable = Input(1, "data_enable")
        self.data_ins = [
            Input(self.config.activation_type.bitwidth(), f"data_in_{i}")
            for i in range(self.config.array_size)
        ]
        self.weight_enable = Input(1, "weight_enable")
        self.weights_in = [
            Input(self.config.weight_type.bitwidth(), f"weight_in_{i}")
            for i in range(self.config.array_size)
        ]
        self.accum_addr_in = Input(self.config.accum_addr_width, "accum_addr_in")
        self.accum_mode_in = Input(1, "accum_mode_in")
        self.act_start_in = Input(1, "act_start_in")
        self.act_func_in = Input(1, "act_func_in")

## Hardware generation function


In [14]:
from itertools import product


def create_accelerator_blocks(
    dtypes: tuple[Type[BaseFloat], Type[BaseFloat]],
    array_size: int = 4,
    addr_bits: int | None = 12,
) -> dict[str, Block]:
    """
    Create accelerator blocks for all valid configurations based on the given inputs.

    Every combination of pipeline level (None, "low", "high"), multiplier
    family (IEEE, L-mul) and fast internals (off, on) is generated, except
    that fast internals are skipped when there is no pipelining.

    Args:
        dtypes: Tuple of (weight_type, activation_type) data types
        array_size: Size of the systolic array (N x N)
        addr_bits: Bit width for accumulator address. If None, the default of
            AcceleratorAnalysisConfig.accum_addr_width is used.

    Returns:
        Dictionary mapping configuration names to PyRTL blocks

    Raises:
        ValueError: Propagated from AcceleratorAnalysisConfig when the
            activation type is narrower than the weight type.
    """
    weight_type, activation_type = dtypes

    # Honor the documented "None means default": omit the argument so the
    # config's own default applies instead of storing None as the width.
    addr_kwargs = {} if addr_bits is None else {"accum_addr_width": addr_bits}

    # Define all valid configurations to test
    pipeline_options = [None, "low", "high"]
    lmul_options = [False, True]
    fast_options = [False, True]

    # Create configs and blocks
    blocks = {}
    for pipeline, lmul, fast in product(pipeline_options, lmul_options, fast_options):
        # Fast internals are only generated for pipelined designs.
        if pipeline is None and fast is True:
            continue

        # Create the configuration
        config = AcceleratorAnalysisConfig(
            array_size=array_size,
            activation_type=activation_type,
            weight_type=weight_type,
            lmul=lmul,
            pipeline_level=pipeline,
            use_fast_internals=fast,
            **addr_kwargs,
        )

        # Build the accelerator inside its own block; only the block is kept.
        block = pyrtl.Block()
        with set_working_block(block):
            AcceleratorTopLevel(config)

        blocks[config.name] = block

    return blocks

In [16]:
# Build every Float8/Float8 accelerator configuration with array_size=2 and
# addr_bits=12, then print the timing/area figures that analyze() returns
# for each block.
accelerator_blocks = create_accelerator_blocks((Float8, Float8), 2, 12)

for name, block in accelerator_blocks.items():
    results = analyze(block, name=name)
    print(results, "\n")

RTLAnalysisResults for w8a8-2x2-ieee:
	max_delay=12953.76 ps
	max_freq=74.98 MHz
	logic_area=40216.33um²
	mem_area=561065.91um² 

RTLAnalysisResults for w8a8-2x2-lmul:
	max_delay=8835.42 ps
	max_freq=108.48 MHz
	logic_area=30633.13um²
	mem_area=561065.91um² 

RTLAnalysisResults for w8a8-2x2-ieee-pipePE:
	max_delay=8385.19 ps
	max_freq=114.05 MHz
	logic_area=42589.48um²
	mem_area=561065.91um² 

RTLAnalysisResults for w8a8-2x2-ieee-fast-pipePE:
	max_delay=5110.31 ps
	max_freq=182.04 MHz
	logic_area=44432.94um²
	mem_area=561065.91um² 

RTLAnalysisResults for w8a8-2x2-lmul-pipePE:
	max_delay=8385.19 ps
	max_freq=114.05 MHz
	logic_area=33006.28um²
	mem_area=561065.91um² 

RTLAnalysisResults for w8a8-2x2-lmul-fast-pipePE:
	max_delay=4846.99 ps
	max_freq=191.21 MHz
	logic_area=36487.60um²
	mem_area=561065.91um² 

RTLAnalysisResults for w8a8-2x2-ieee-pipeALL:
	max_delay=3226.71 ps
	max_freq=277.03 MHz
	logic_area=59400.16um²
	mem_area=561065.91um² 

RTLAnalysisResults for w8a8-2x2-ieee-fast-pi

# Extra stuff


In [None]:
# Scratch parameters for sizing the accumulator memory.
unit = "MB"
addr_bits = 12
dtype = Float8
array_size = 256

desired_mem = 2.1  # presumably expressed in `unit` — TODO confirm

# NOTE(review): the calculate_* helpers come from the star imports at the top of
# the notebook; the meaning of the trailing positional True is not visible
# here — confirm against their signatures.
mem = calculate_accumulator_memory(addr_bits, array_size, dtype, unit, True)
min_addr_bits_mem = calculate_accum_addr_width_for_min_mem(
    desired_mem, array_size, dtype, unit, True
)
min_addr_bits_slots = calculate_accum_addr_width_for_min_slots(10000, True)