# Generating Hardware Blocks for Analysis


## Setup and helpers


In [None]:
import os
import pyrtl
from pyrtl import *
from typing import Callable, Type, Literal
from hardware_accelerators.dtypes import *
from hardware_accelerators.rtllib import *
from hardware_accelerators.rtllib.accelerator import (
    CompiledAccelerator,
    CompiledAcceleratorConfig,
)
from hardware_accelerators.rtllib.processing_element import ProcessingElement
from hardware_accelerators.rtllib.adders import *
from hardware_accelerators.rtllib.multipliers import *
from hardware_accelerators.rtllib.lmul import *
from hardware_accelerators.rtllib.utils.common import *

In [67]:
# Float formats exercised in this notebook.
dtype_list = [Float8, BF16, Float32]

# Lookup from a bit width to the float format used for that width.
dtype_map = {
    8: Float8,
    16: BF16,
    32: Float32,
}

# Width pairs to evaluate — presumably (weight width, activation width),
# judging by the variable name; confirm against the consumers.
w_a_pairs = [(8, 8), (8, 16), (8, 32), (16, 16), (16, 32), (32, 32)]

# The same pairs with each width replaced by its float format.
w_a_dtypes = [tuple(dtype_map[width] for width in pair) for pair in w_a_pairs]

In [68]:
def create_basic_hardware_block(
    fn: Callable[[WireVector, WireVector, Type[BaseFloat]], WireVector],
    dtype: Type[BaseFloat],
    **kwargs,
):
    """Wrap a two-operand hardware function with named PyRTL I/O wires.

    Creates inputs ``a`` and ``b`` and an output ``out``, each
    ``dtype.bitwidth()`` bits wide, and drives ``out`` with
    ``fn(a, b, dtype, **kwargs)``.

    Args:
        fn: Builder called with the two input wires and ``dtype``.
        dtype: Float format; only its ``bitwidth()`` is read here.
        **kwargs: Forwarded unchanged to ``fn``.
    """
    width = dtype.bitwidth()
    operand_a = pyrtl.Input(width, "a")
    operand_b = pyrtl.Input(width, "b")
    result = pyrtl.Output(width, "out")
    result <<= fn(operand_a, operand_b, dtype, **kwargs)


def create_inputs(*bitwidths, **named_bitwidths):
    """
    Create PyRTL Input wires with specified bitwidths.

    The argument check runs immediately when this function is called; the
    wires themselves are still created lazily, one per iteration step of the
    returned generator.

    Args:
        *bitwidths: Variable number of bitwidths for unnamed inputs
        **named_bitwidths: Named bitwidths where the key is used as the wire name

    Returns:
        Generator of PyRTL Input wires

    Raises:
        ValueError: If positional and keyword arguments are mixed.

    Note:
        You must use either all positional arguments or all keyword arguments, not a mix.
    """
    # Validate here rather than inside the generator body: a generator
    # function does not run any of its code until it is first iterated, so
    # an in-generator check would let a bad call go unnoticed.
    if bitwidths and named_bitwidths:
        raise ValueError(
            "Please use either all positional arguments or all keyword arguments, not a mix."
        )

    def _generate_wires():
        # If using positional arguments
        for bitwidth in bitwidths:
            yield pyrtl.Input(bitwidth)

        # If using keyword arguments
        for name, bitwidth in named_bitwidths.items():
            yield pyrtl.Input(bitwidth, name=name)

    return _generate_wires()


def create_outputs(*args, **named_wires):
    """
    Create one PyRTL Output wire per given wire and drive it from that wire.

    Each output is as wide as ``len(wire)`` of the wire it is connected to.

    Args:
        *args: Wires to connect to unnamed outputs
        **named_wires: Wires to connect to outputs named after their keyword

    Raises:
        ValueError: If positional and keyword arguments are mixed.

    Note:
        You must use either all positional arguments or all keyword arguments, not a mix.
    """
    if named_wires:
        if args:
            raise ValueError(
                "Please use either all positional arguments or all keyword arguments, not a mix."
            )
        # Keyword form: the keyword becomes the output's name.
        for label, source in named_wires.items():
            sink = pyrtl.Output(len(source), name=label)
            sink <<= source
        return

    # Positional form: outputs are created without an explicit name.
    for source in args:
        sink = pyrtl.Output(len(source))
        sink <<= source


def analyze(block: Block | None = None):
    """Synthesize and optimize a design, then print timing and area estimates.

    Args:
        block: Block to analyze. When given it is installed as the PyRTL
            working block first; this function does not switch back
            afterwards. When ``None`` the current working block is used.
    """
    if block is not None:
        pyrtl.set_working_block(block)

    pyrtl.synthesize()
    pyrtl.optimize()

    analysis = pyrtl.TimingAnalysis()
    critical_path = analysis.max_length()
    print(f"\nest. max delay: {critical_path:.2f} ps")
    max_frequency = analysis.max_freq()
    print(f"est. max freq: {max_frequency:.2f} MHz")
    estimated_area = pyrtl.area_estimation()
    print(f"est. area: {estimated_area}\n\n")

# Adders


In [62]:
def create_adder_blocks(dtype: Type[BaseFloat], fast: bool) -> dict[str, Block]:
    """Build a separate PyRTL block for each float adder variant and stage.

    Every design gets its own block, with fresh inputs created by
    ``create_inputs`` and its results exposed through ``create_outputs``.

    Args:
        dtype: Float format; supplies the total, exponent and mantissa widths.
        fast: Forwarded to the adder builders that accept it.

    Returns:
        Mapping from design name to its block: the combinational adder, the
        pipelined adder, and stages 2 through 5 on their own.
    """
    bits = dtype.bitwidth()
    e_bits, m_bits = dtype.exponent_bits(), dtype.mantissa_bits()

    combinational_block = pyrtl.Block()
    adder_pipelined_block = pyrtl.Block()
    stage_2_block = pyrtl.Block()
    stage_3_block = pyrtl.Block()
    stage_4_block = pyrtl.Block()
    stage_5_block = pyrtl.Block()

    # Combinational design
    with set_working_block(combinational_block):
        # The result is passed as one wire (no star-unpacking), matching how
        # the pipelined adder below is exposed; unpacking it would create a
        # separate 1-bit output per result bit.
        create_outputs(float_adder(*create_inputs(bits, bits), dtype=dtype, fast=fast))

    # Complete pipelined design
    with set_working_block(adder_pipelined_block):
        create_outputs(
            float_adder_pipelined(
                *create_inputs(bits, bits),
                dtype=dtype,
                fast=fast,
            )
        )

    # Stages 1 & 2
    with set_working_block(stage_2_block):
        float_components = extract_float_components(
            *create_inputs(bits, bits),
            e_bits=e_bits,
            m_bits=m_bits,
        )
        stage_2_outputs = adder_stage_2(
            *float_components,
            e_bits,
            m_bits,
            fast,
        )
        create_outputs(*stage_2_outputs)

    # Stage 3
    with set_working_block(stage_3_block):
        # Perform alignment and generate SGR bits
        stage_3_outputs = adder_stage_3(
            *create_inputs(m_bits + 1, e_bits),
            e_bits=e_bits,
            m_bits=m_bits,
        )
        create_outputs(*stage_3_outputs)

    # Stage 4
    with set_working_block(stage_4_block):
        # Perform mantissa addition and leading zero detection
        stage_4_outputs = adder_stage_4(
            *create_inputs(m_bits + 1, m_bits + 1, 1), m_bits=m_bits, fast=fast
        )
        create_outputs(*stage_4_outputs)

    # Stage 5
    with set_working_block(stage_5_block):
        # Perform normalization, rounding, and final assembly
        stage_5_outputs = adder_stage_5(
            *create_inputs(
                m_bits + 2,  # abs_mantissa: m_bits + 2 wide
                1,  # sticky_bit: 1 bit
                1,  # guard_bit: 1 bit
                1,  # round_bit: 1 bit
                # NOTE(review): width is fixed at 4 regardless of m_bits —
                # confirm this is wide enough for the wider dtypes.
                4,  # lzc: 4 bits wide
                e_bits,  # exp_larger: e_bits wide
                1,  # sign_a: 1 bit
                1,  # sign_b: 1 bit
                e_bits + 1,  # exp_diff: e_bits + 1 wide
                1,  # is_neg: 1 bit
            ),
            e_bits=e_bits,
            m_bits=m_bits,
        )
        create_outputs(*stage_5_outputs)

    # Return all the generated blocks for analysis
    return {
        "adder_combinational": combinational_block,
        "adder_pipelined": adder_pipelined_block,
        "adder_stage_2": stage_2_block,
        "adder_stage_3": stage_3_block,
        "adder_stage_4": stage_4_block,
        "adder_stage_5": stage_5_block,
    }

In [63]:
# Build every Float8 adder design with fast=True, then print the timing and
# area estimates for each one in turn.
adder_blocks = create_adder_blocks(Float8, fast=True)

for name, block in adder_blocks.items():
    print(f"Analyzing {name} block:")
    analyze(block)

Analyzing adder_combinational block:

est. max delay: 4106.58 ps
est. max freq: 222.74 MHz
est. area: (0.0035684352, 0)


Analyzing adder_pipelined block:

est. max delay: 1908.24 ps
est. max freq: 436.44 MHz
est. area: (0.0064538496, 0)


Analyzing adder_stage_2 block:

est. max delay: 1230.84 ps
est. max freq: 619.64 MHz
est. area: (0.000818928, 0)


Analyzing adder_stage_3 block:

est. max delay: 1565.30 ps
est. max freq: 513.27 MHz
est. area: (0.000496584, 0)


Analyzing adder_stage_4 block:

est. max delay: 1445.68 ps
est. max freq: 546.84 MHz
est. area: (0.0011900592, 0)


Analyzing adder_stage_5 block:

est. max delay: 1711.24 ps
est. max freq: 477.50 MHz
est. area: (0.001080288, 0)




# Multipliers


In [None]:
def create_multiplier_blocks(dtype: Type[BaseFloat], fast: bool) -> dict[str, Block]:
    """Build a separate PyRTL block for each float multiplier variant and stage.

    Args:
        dtype: Float format; supplies the total, exponent and mantissa widths.
        fast: Forwarded to the multiplier builders.

    Returns:
        Mapping from design name to its block: the combinational multiplier,
        the pipelined multiplier, and stages 2 through 4 on their own.
    """
    width = dtype.bitwidth()
    exp_width = dtype.exponent_bits()
    man_width = dtype.mantissa_bits()

    # One empty block per design, keyed by the name reported to the caller.
    blocks = {
        label: pyrtl.Block()
        for label in ("combinational", "multiplier", "stage_2", "stage_3", "stage_4")
    }

    # Combinational design
    with set_working_block(blocks["combinational"]):
        lhs, rhs = create_inputs(width, width)
        product = float_multiplier(lhs, rhs, dtype=dtype, fast=fast)
        create_outputs(product)

    # Complete pipelined design
    with set_working_block(blocks["multiplier"]):
        lhs, rhs = create_inputs(width, width)
        pipelined = FloatMultiplierPipelined(lhs, rhs, dtype=dtype, fast=fast)
        create_outputs(pipelined._result)

    # Stage 1 & 2: Extract components and calculate sign, exponent sum, mantissa product
    with set_working_block(blocks["stage_2"]):
        lhs, rhs = create_inputs(width, width)
        components = extract_float_components(
            lhs, rhs, e_bits=exp_width, m_bits=man_width
        )
        create_outputs(*multiplier_stage_2(*components, man_width, fast))

    # Stage 3: Leading zero detection and exponent adjustment
    with set_working_block(blocks["stage_3"]):
        exp_sum, mantissa_product = create_inputs(exp_width + 1, 2 * man_width + 2)
        stage_3_results = multiplier_stage_3(
            exp_sum,
            mantissa_product,
            e_bits=exp_width,
            m_bits=man_width,
            fast=fast,
        )
        create_outputs(*stage_3_results)

    # Stage 4: Normalization, rounding, and final assembly
    with set_working_block(blocks["stage_4"]):
        unbiased_exp, leading_zeros, mantissa_product = create_inputs(
            exp_width, exp_width, 2 * man_width + 2
        )
        stage_4_results = multiplier_stage_4(
            unbiased_exp,
            leading_zeros,
            mantissa_product,
            m_bits=man_width,
            e_bits=exp_width,
            fast=fast,
        )
        create_outputs(*stage_4_results)

    return blocks

In [65]:
# Build every Float8 multiplier design with fast=True, then print the timing
# and area estimates for each one in turn.
multiplier_blocks = create_multiplier_blocks(Float8, fast=True)

for name, block in multiplier_blocks.items():
    print(f"Analyzing {name} block:")
    analyze(block)

Analyzing combinational block:

est. max delay: 4906.51 ps
est. max freq: 189.05 MHz
est. area: (0.003023064, 0)


Analyzing multiplier block:

est. max delay: 1828.47 ps
est. max freq: 452.19 MHz
est. area: (0.0038001744, 0)


Analyzing stage_2 block:

est. max delay: 1394.44 ps
est. max freq: 562.61 MHz
est. area: (0.001062864, 0)


Analyzing stage_3 block:

est. max delay: 1585.10 ps
est. max freq: 508.10 MHz
est. area: (0.000670824, 0)


Analyzing stage_4 block:

est. max delay: 1828.47 ps
est. max freq: 452.19 MHz
est. area: (0.001210968, 0)




# L-mul


In [52]:
def create_lmul_blocks(dtype: Type[BaseFloat]) -> dict[str, Block]:
    """Build a separate PyRTL block for each L-mul variant.

    Args:
        dtype: Float format; its ``bitwidth()`` sizes both operand inputs.

    Returns:
        Mapping from design name to its block: combinational and pipelined
        L-mul, each in its simple and fast form.
    """
    width = dtype.bitwidth()

    # One empty block per design, keyed by the name reported to the caller.
    blocks = {
        label: pyrtl.Block()
        for label in (
            "combinational_simple",
            "combinational_fast",
            "pipelined_simple",
            "pipelined_fast",
        )
    }

    # Combinational design (simple)
    with set_working_block(blocks["combinational_simple"]):
        lhs, rhs = create_inputs(width, width)
        create_outputs(lmul_simple(lhs, rhs, dtype=dtype))

    # Combinational design (fast)
    with set_working_block(blocks["combinational_fast"]):
        lhs, rhs = create_inputs(width, width)
        create_outputs(lmul_fast(lhs, rhs, dtype=dtype))

    # Pipelined design (simple)
    with set_working_block(blocks["pipelined_simple"]):
        lhs, rhs = create_inputs(width, width)
        pipelined = LmulPipelined(lhs, rhs, dtype=dtype, fast=False)
        create_outputs(pipelined.output_reg)

    # Pipelined design (fast)
    with set_working_block(blocks["pipelined_fast"]):
        lhs, rhs = create_inputs(width, width)
        pipelined = LmulPipelined(lhs, rhs, dtype=dtype, fast=True)
        create_outputs(pipelined.output_reg)

    return blocks

In [54]:
# Build every Float8 L-mul design, then print the timing and area estimates
# for each one in turn.
lmul_blocks = create_lmul_blocks(Float8)

for name, block in lmul_blocks.items():
    print(f"Analyzing {name} block:")
    analyze(block)

Analyzing combinational_simple block:

est. max delay: 1962.64 ps
est. max freq: 426.32 MHz
est. area: (0.000635976, 0)


Analyzing combinational_fast block:

est. max delay: 1406.37 ps
est. max freq: 558.86 MHz
est. area: (0.001036728, 0)


Analyzing pipelined_simple block:

est. max delay: 2223.24 ps
est. max freq: 383.69 MHz
est. area: (0.0019985328, 0)


Analyzing pipelined_fast block:

est. max delay: 1085.34 ps
est. max freq: 681.04 MHz
est. area: (0.0021030768, 0)




# Processing Element


In [None]:
def connect_pe_io(pe: ProcessingElement):
    """Attach fresh input and output wires to a processing element.

    Creates named inputs for the weight, data and accumulator ports and for
    the four enable signals, connects them to ``pe``, and exposes every
    attribute of ``pe.outputs`` as an output wire.

    Args:
        pe: Processing element to wire up.
    """
    weight_width = pe.weight_type.bitwidth()
    data_width = pe.data_type.bitwidth()

    # The accumulator input is sized with the data width.
    weight_in, data_in, accum_in = create_inputs(
        weight_in=weight_width, data_in=data_width, accum_in=data_width
    )
    pe.connect_weight(weight_in)
    pe.connect_data(data_in)
    pe.connect_accum(accum_in)

    # One 1-bit enable per control signal, passed in this positional order.
    weight_en, data_en, mul_en, adder_en = create_inputs(
        weight_en=1, data_en=1, mul_en=1, adder_en=1
    )
    pe.connect_control_signals(weight_en, data_en, mul_en, adder_en)

    create_outputs(*vars(pe.outputs).values())


def create_pe_blocks(
    dtypes: tuple[Type[BaseFloat], Type[BaseFloat]]
) -> dict[str, Block]:
    """Create one processing element block per multiplier/adder variant.

    Each variant is built in its own block with the same data types and is
    wired up with ``connect_pe_io``. The variants differ only in the
    multiplier, the adder and the ``pipeline_mult`` flag.

    Args:
        dtypes: ``(weight dtype, activation dtype)``. The activation dtype is
            also used as the accumulator type.

    Returns:
        Mapping from variant name to its block, in the order the variants are
        listed in the table below.
    """
    weight_dtype, act_dtype = dtypes

    # (result key, multiplier, adder, pipeline_mult) for every variant.
    variants = (
        # Standard IEEE multiplier versions
        ("combinational", float_multiplier, float_adder, False),
        ("standard", float_multiplier, float_adder, True),
        ("fast", float_multiplier_fast_unstable, float_adder_fast_unstable, True),
        ("pipelined", float_multiplier_pipelined, float_adder_pipelined, True),
        (
            "fast_pipelined",
            float_multiplier_pipelined_fast_unstable,
            float_adder_pipelined_fast_unstable,
            True,
        ),
        # L-mul versions
        ("combinational_lmul", lmul_simple, float_adder, False),
        ("standard_lmul", lmul_simple, float_adder, True),
        ("fast_lmul", lmul_fast, float_adder_fast_unstable, True),
        ("pipelined_lmul", lmul_pipelined, float_adder_pipelined, True),
        (
            "fast_pipelined_lmul",
            lmul_pipelined_fast,
            float_adder_pipelined_fast_unstable,
            True,
        ),
    )

    blocks = {}
    for key, multiplier, adder, pipeline_mult in variants:
        # A dedicated block keeps each variant's hardware separate.
        variant_block = Block()
        with set_working_block(variant_block):
            pe = ProcessingElement(
                data_type=act_dtype,
                weight_type=weight_dtype,
                accum_type=act_dtype,
                multiplier=multiplier,
                adder=adder,
                pipeline_mult=pipeline_mult,
            )
            connect_pe_io(pe)
        blocks[key] = variant_block

    return blocks

In [51]:
# Reset the working block, build every Float8/Float8 processing element
# variant, then print the timing and area estimates for each one in turn.
reset_working_block()
pe_blocks = create_pe_blocks((Float8, Float8))

for name, block in pe_blocks.items():
    print(f"Analyzing {name} block:")
    analyze(block)

Pipelining disabled, no product register to enable. Deleting wire.
Pipelining disabled, no product register to enable. Deleting wire.
Analyzing combinational block:

est. max delay: 12953.76 ps
est. max freq: 74.98 MHz
est. area: (0.0079923888, 0)


Analyzing standard block:

est. max delay: 7848.58 ps
est. max freq: 121.48 MHz
est. area: (0.0084523824, 0)


Analyzing fast block:

est. max delay: 5110.31 ps
est. max freq: 182.04 MHz
est. area: (0.0084314736, 0)


Analyzing pipelined block:

est. max delay: 3226.71 ps
est. max freq: 277.03 MHz
est. area: (0.0121149072, 0)


Analyzing fast_pipelined block:

est. max delay: 2112.04 ps
est. max freq: 400.80 MHz
est. area: (0.0120939984, 0)


Analyzing combinational_lmul block:

est. max delay: 8835.42 ps
est. max freq: 108.48 MHz
est. area: (0.0055965888, 0)


Analyzing standard_lmul block:

est. max delay: 7848.58 ps
est. max freq: 121.48 MHz
est. area: (0.0060565824, 0)


Analyzing fast_lmul block:

est. max delay: 4310.38 ps
est. max fr

# Accelerator


In [None]:
import math


class CompiledAcceleratorSimulator:
    """Simulator for the accelerator that uses compiled simulation for speed."""

    def __init__(self, config: CompiledAcceleratorConfig):
        """Store ``config`` and build the accelerator hardware right away."""
        self.config = config
        self.construct_hardware()

    def construct_hardware(self):
        """Instantiate the accelerator and attach its input and output wires.

        Sets ``self.accelerator`` and ``self._output_wires``.
        """
        cfg = self.config
        print(f"Constructing hardware for config {cfg.name}...")
        self.accelerator = CompiledAccelerator(cfg)

        # Input wires: one data and one weight input per index in
        # range(array_size), plus the scalar control inputs.
        data_enable = Input(1, "data_enable")
        data_inputs = [
            Input(cfg.activation_type.bitwidth(), f"data_in_{i}")
            for i in range(cfg.array_size)
        ]
        weight_enable = Input(1, "weight_enable")
        weights_in = [
            Input(cfg.weight_type.bitwidth(), f"weight_in_{i}")
            for i in range(cfg.array_size)
        ]
        accum_addr = Input(cfg.accum_addr_width, "accum_addr")
        accum_mode = Input(1, "accum_mode")
        act_start = Input(1, "act_start")
        act_func = Input(1, "act_func")

        self.accelerator.connect_inputs(
            data_enable=data_enable,
            data_inputs=data_inputs,
            weight_enable=weight_enable,
            weights_in=weights_in,
            accum_addr=accum_addr,
            accum_mode=accum_mode,
            act_start=act_start,
            act_func=act_func,
        )

        # Output wires: one activation-width output per index, plus a 1-bit
        # "output_valid" wire.
        self._output_wires = [
            Output(cfg.activation_type.bitwidth(), f"out_{i}")
            for i in range(cfg.array_size)
        ]
        output_valid = Output(1, "output_valid")
        self.accelerator.connect_outputs(self._output_wires, output_valid)


def calculate_accumulator_memory(
    accum_addr_width: int,
    array_size: int,
    dtype: Type[BaseFloat],
    unit: Literal["B", "KB", "MB"] = "KB",
) -> float:
    """Compute and print the accumulator memory for a given address width.

    The total is ``array_size * 2**accum_addr_width * dtype.bitwidth()`` bits,
    converted to ``unit``.

    Args:
        accum_addr_width: Number of address bits; there are 2**width slots.
        array_size: Multiplier applied to the slot count.
        dtype: Float format; only its ``bitwidth()`` is read here.
        unit: Unit for the result: "B", "KB" or "MB".

    Returns:
        The memory size expressed in ``unit``.
    """
    slots = 2**accum_addr_width
    total_bits = array_size * slots * dtype.bitwidth()

    # Number of bits in one unit of each supported size.
    bits_per_unit = {
        "B": 8.0,
        "KB": 8.0 * 1024,
        "MB": 8.0 * 1024 * 1024,
    }
    mem = total_bits / bits_per_unit[unit]

    capacity_text = (
        f"{mem} {unit} ({slots} slots) avaialable for {accum_addr_width} address bits"
    )
    layout_text = f"with {array_size}x{array_size} array in {dtype.__name__}"
    print(capacity_text, layout_text)
    return mem


def calculate_min_accum_addr_width(
    required_mem: float,
    array_size: int,
    dtype: Type[BaseFloat],
    unit: Literal["B", "KB", "MB"] = "KB",
) -> int:
    """Compute and print the smallest address width that fits a memory size.

    One slot holds ``array_size * dtype.bitwidth()`` bits; the result is the
    smallest non-negative ``w`` such that ``2**w`` slots hold at least
    ``required_mem`` in ``unit``.

    Args:
        required_mem: Memory to provide, expressed in ``unit``. Must be
            positive.
        array_size: Multiplier applied to the per-slot element width.
        dtype: Float format; only its ``bitwidth()`` is read here.
        unit: Unit of ``required_mem``: "B", "KB" or "MB".

    Returns:
        The minimum number of address bits, never less than 0.

    Raises:
        ValueError: If ``required_mem`` is not positive.
    """
    if required_mem <= 0:
        raise ValueError(f"required_mem must be positive, got {required_mem}")

    conversions = {
        "B": 8.0,
        "KB": 8.0 * 1024,
        "MB": 8.0 * 1024 * 1024,
    }
    bits = required_mem * conversions[unit]
    bits_per_slot = array_size * dtype.bitwidth()
    # A request smaller than one slot has a negative log2; clamp to 0 because
    # a single slot (zero address bits) already covers it.
    req_width = max(0, math.ceil(math.log2(bits / bits_per_slot)))
    slots = 2**req_width
    print(
        f"{req_width} address bits ({slots} slots) required for {required_mem} {unit}",
        f"with {array_size}x{array_size} array in {dtype.__name__}",
    )
    return req_width


# Example sizing scenario for the two accumulator helpers defined above.
unit = "MB"
addr_bits = 12
dtype = Float16
array_size = 256

# Memory target, expressed in `unit`, used for the address-width query.
desired_mem = 2.1

# Memory provided by `addr_bits` address bits in this configuration.
mem = calculate_accumulator_memory(addr_bits, array_size, dtype, unit)
# Minimum address width needed to provide `desired_mem`.
min_addr_bits = calculate_min_accum_addr_width(desired_mem, array_size, dtype, unit)

SyntaxError: '(' was never closed (825542768.py, line 75)