In [237]:
import os
import pyrtl
from pyrtl import WireVector, Input, Output, CompiledSimulation, reset_working_block
import torch
from typing import Literal
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from hardware_accelerators.dtypes import *
from hardware_accelerators.simulation.compile import (
    ReusableCompiledSimulation,
    CompiledAccelerator,
)
from hardware_accelerators.simulation.accelerator import CompiledAcceleratorSimulator
from hardware_accelerators.rtllib.accelerator import CompiledAcceleratorConfig
from hardware_accelerators.rtllib.multipliers import (
    float_multiplier,
    float_multiplier_simple,
)
from hardware_accelerators.nn import load_model, MLP

model = load_model("models/mlp_mnist.pth")

In [33]:
config_w8a32 = CompiledAcceleratorConfig(
    array_size=2,
    activation_type=Float32,
    weight_type=Float8,
    multiplier=float_multiplier,
)

# Testing adders


In [28]:
from pyrtl.rtllib.adders import carrysave_adder, cla_adder, kogge_stone, ripple_add
from pyrtl.rtllib.libutils import twos_comp_repr, rev_twos_comp_repr

In [None]:
def inspect_wire(sim, wire, two_comp=False):
    bits = len(wire)
    raw = sim.inspect(wire)
    val = rev_twos_comp_repr(raw, bits) if two_comp else raw
    print(f"{wire.name} ({bits} bits) = {val}, 0b{format(raw, f'0{bits}b')}")


def analyze():
    pyrtl.synthesize()
    pyrtl.optimize()
    timing = pyrtl.TimingAnalysis()
    delay = timing.max_length()
    print(f"\nest. max delay: {delay:.2f} ps")
    print(f"est. max freq: {timing.max_freq():.2f} MHz")
    print(f"est. area: {pyrtl.area_estimation()}\n\n")


pyrtl.set_debug_mode(False)

dtype = BF16
e_bits = dtype.exponent_bits()
bias = dtype.bias()

In [None]:
reset_working_block()
exp_a = pyrtl.Input(e_bits, "exp_a")  # type: ignore
exp_b = pyrtl.Input(e_bits, "exp_b")  # type: ignore
exp_diff = pyrtl.Output(e_bits + 1, "exp_diff")  # type: ignore
exp_diff <<= exp_a - exp_b
pyrtl.output_to_verilog(open("simple_sub.v", "w"))

In [None]:
reset_working_block()

slow_block = pyrtl.Block()

A = 10
B = 40
# A = 60
# B = 40

with pyrtl.set_working_block(slow_block):
    exp_a = pyrtl.Input(e_bits, "exp_a")  # type: ignore
    exp_b = pyrtl.Input(e_bits, "exp_b")  # type: ignore
    exp_diff = pyrtl.WireVector(e_bits + 1, "exp_diff")  # type: ignore
    signed_shift = WireVector(e_bits + 1, "signed_shift")  # type: ignore
    exp_larger = WireVector(e_bits, "exp_larger")  # type: ignore
    abs_shift = Output(e_bits, "abs_shift")  # type: ignore

    exp_diff <<= exp_a - exp_b  # This can be negative, indicating which is larger
    exp_larger <<= pyrtl.mux(exp_diff[e_bits], exp_a, exp_b)
    signed_shift <<= pyrtl.mux(
        exp_diff[e_bits],
        exp_diff[:e_bits],
        pyrtl.concat(exp_diff[e_bits], (~exp_diff[:e_bits] + 1)[:e_bits]),
    )
    abs_shift <<= signed_shift[:e_bits]

    slow_sim = pyrtl.Simulation(
        tracer=pyrtl.SimulationTrace("all")
    )  # , block=slow_block)
    slow_sim.step({exp_a: A, exp_b: B})  # type: ignore

    inspect_wire(slow_sim, exp_a)
    inspect_wire(slow_sim, exp_b)
    inspect_wire(slow_sim, exp_diff, True)
    inspect_wire(slow_sim, signed_shift)
    inspect_wire(slow_sim, abs_shift)
    inspect_wire(slow_sim, exp_larger)
    analyze()

fast_block = pyrtl.Block()

with pyrtl.set_working_block(fast_block):
    exp_a = pyrtl.Input(e_bits, "exp_a")  # type: ignore
    exp_b = pyrtl.Input(e_bits, "exp_b")  # type: ignore
    exp_diff = pyrtl.WireVector(e_bits + 1, "exp_diff")  # type: ignore
    abs_shift = Output(e_bits, "abs_shift")  # type: ignore

    neg_b = (~exp_b + 1)[:e_bits]
    exp_diff <<= carrysave_adder(exp_a, ~exp_b, pyrtl.Const(1), final_adder=kogge_stone)

    is_neg = ~exp_diff[e_bits]

    abs_shift <<= pyrtl.select(
        is_neg,
        pyrtl.concat(exp_diff[e_bits], (~exp_diff[:e_bits] + 1)[:e_bits]),
        exp_diff[:e_bits],
    )

    fast_sim = pyrtl.Simulation(tracer=pyrtl.SimulationTrace("all"), block=fast_block)
    fast_sim.step({exp_a: A, exp_b: B})  # type: ignore

    # neg_b.name = "neg_b"
    exp_diff.name = "exp_diff_twos"
    is_neg.name = "is_neg"

    # inspect_wire(fast_sim, neg_b, True)
    inspect_wire(fast_sim, exp_diff)
    inspect_wire(fast_sim, is_neg)
    inspect_wire(
        fast_sim,
        abs_shift,
    )

    analyze()

exp_a (8 bits) = 10, 0b00001010
exp_b (8 bits) = 40, 0b00101000
exp_diff (9 bits) = -30, 0b111100010
signed_shift (9 bits) = 286, 0b100011110
abs_shift (8 bits) = 30, 0b00011110
exp_larger (8 bits) = 40, 0b00101000

est. max delay: 2773.60 ps
est. max freq: 316.80 MHz
est. area: (0.000975744, 0)




PyrtlError: Input "tmp256151" has no input value specified

In [None]:
def analyze_adder(adder, bitwidth, **kwargs):
    print(f"Analyzing {adder.__name__} with bitwidth {bitwidth}")
    a, b = pyrtl.Input(bitwidth, "a"), pyrtl.Input(bitwidth, "b")
    out = pyrtl.Output(bitwidth + 1, "out")
    sum = adder(a, b, **kwargs)
    print("addition bitwidth: ", len(sum))
    out <<= sum
    pyrtl.synthesize()
    pyrtl.optimize()
    timing = pyrtl.TimingAnalysis()
    timing.print_max_length()
    print("est. max frequency: ", timing.max_freq())
    print(f"est. area: {pyrtl.area_estimation()}\n\n")


adder_funcs = [
    pyrtl.signed_add,
    cla_adder,
    kogge_stone,
    ripple_add,
    lambda a, b: float_multiplier(a, b, BF16),
]
for exp_a in adder_funcs:
    reset_working_block()
    analyze_adder(exp_a, 16)

reset_working_block()
analyze_adder(carrysave_adder, 16, c=pyrtl.Input(16, "c"), final_adder=cla_adder)
reset_working_block()
analyze_adder(carrysave_adder, 16, c=pyrtl.Input(16, "c"), final_adder=kogge_stone)

Analyzing signed_add with bitwidth 16
addition bitwidth:  17
The total block timing delay is  4870.070000000002
est. max frequency:  190.36487235083476
est. area: (0.00121968, 0)


Analyzing cla_adder with bitwidth 16
addition bitwidth:  17
The total block timing delay is  1900.5399999999997
est. max frequency:  437.91656813543887
est. area: (0.00117612, 0)


Analyzing kogge_stone with bitwidth 16
addition bitwidth:  17
The total block timing delay is  986.8399999999999
est. max frequency:  730.0122642060387
est. area: (0.00182952, 0)


Analyzing ripple_add with bitwidth 16
addition bitwidth:  17
The total block timing delay is  4735.000000000003
est. max frequency:  195.38882375928085
est. area: (0.001202256, 0)


Analyzing <lambda> with bitwidth 16
using faster multiplier
addition bitwidth:  16
The total block timing delay is  7632.510000000002
est. max frequency:  124.75812518479795
est. area: (0.010001376, 0)


Analyzing carrysave_adder with bitwidth 16
addition bitwidth:  18
The t

# Configs for power, area, delay analysis


### Baselines


In [None]:
SIZE = 4
# Baselines
config_w8a16 = CompiledAcceleratorConfig(
    array_size=SIZE,
    activation_type=BF16,
    weight_type=Float8,
    multiplier=float_multiplier,
    pipeline=True,
)
config_w8a16 = CompiledAcceleratorConfig(
    array_size=SIZE,
    activation_type=BF16,
    weight_type=Float8,
    multiplier=float_multiplier,
    pipeline=True,
)
config_w8a32 = CompiledAcceleratorConfig(
    array_size=8,
    activation_type=Float32,
    weight_type=Float8,
    multiplier=float_multiplier_simple,
)
config_w16a16 = CompiledAcceleratorConfig(
    array_size=SIZE,
    activation_type=BF16,
    weight_type=BF16,
    multiplier=float_multiplier,
    pipeline=True,
)
config_w16a32 = CompiledAcceleratorConfig(
    array_size=SIZE,
    activation_type=BF16,
    weight_type=BF16,
    multiplier=float_multiplier_simple,
    pipeline=True,
)
config_w32a32 = CompiledAcceleratorConfig(
    array_size=SIZE,
    activation_type=BF16,
    weight_type=BF16,
    multiplier=float_multiplier_simple,
    pipeline=True,
)

### Slow, power efficient


In [None]:
def _get_multipliers_for_config(
    w_a: tuple[int, int],
    mode: Literal["basic", "standard", "fast", "fastest"],
):
    """Get the standard and lmul multipliers based on mode, and the weight and activation types.

    Basic mode uses no pipelining, simplest internal components. Will be the slowest but also the most power efficient.
    Standard mode uses no pipelining, but with faster internal components. Will be slightly faster than basic mode.
    Fast mode uses pipelining with simpler internal components. Will be significantly faster than standard mode.
    Fastest mode uses pipelining with the fastest internal components. Will be the fastest but also the most power hungry.
    
    Args:
        w_a (tuple[int, int]): The weight and activation types.
        mode (Literal["basic", "standard", "fast", "fastest"]): The mode to use."
        
    Returns:
        tuple[float_multiplier, l_mul]: The appropriate pair of IEEE standard float multiplier and l-mul implementation.
    """
    mode_map = {
        "basic": (float_multiplier_simple, lmul_simple),
        "standard": (float_multiplier, lmul_fast),
        "fast": (float_multiplier_pipeline, lmul_fast),
        "fastest": (float_multiplier_pipeline, lmul_fastest),
    }
    return mode_map.get(mode, (float_multiplier_simple, lmul_simple)
    if mode == "basic":
        efficient_map = { 
            (8, 8): (float_multiplier_simple, lmul_simple),
            (8, 16): (float_multiplier_simple, lmul_8x16),

        }



def generate_configs():
    w_a_dtypes = [(8, 8), (8, 16), (8, 32), (16, 16), (16, 32), (32, 32)]

NameError: name 'Literal' is not defined

In [None]:
config_w8a8_e = CompiledAcceleratorConfig(
    array_size=SIZE,
    weight_type=Float8,
    activation_type=Float8,
    multiplier=float_multiplier_simple,
    pipeline=True,
)
config_w8a16_e = CompiledAcceleratorConfig(
    array_size=SIZE,
    weight_type=Float8,
    activation_type=BF16,
    multiplier=float_multiplier_simple,
    pipeline=True,
)
config_w8a16_e = CompiledAcceleratorConfig(
    array_size=SIZE,
    weight_type=Float8,
    activation_type=BF16,
    multiplier=float_multiplier_simple,
    pipeline=True,
)
config_w8a32_e = CompiledAcceleratorConfig(
    array_size=SIZE,
    weight_type=Float8,
    activation_type=Float32,
    multiplier=float_multiplier_simple,
)
config_w16a16_e = CompiledAcceleratorConfig(
    array_size=SIZE,
    weight_type=BF16,
    activation_type=BF16,
    multiplier=float_multiplier_simple,
    pipeline=True,
)
config_w16a32_e = CompiledAcceleratorConfig(
    array_size=SIZE,
    weight_type=BF16,
    activation_type=BF16,
    multiplier=float_multiplier_simple,
    pipeline=True,
)
config_w32a32_e = CompiledAcceleratorConfig(
    array_size=SIZE,
    weight_type=BF16,
    activation_type=BF16,
    multiplier=float_multiplier_simple,
    pipeline=True,
)

## Test loading from saved sim


In [7]:
# Data transformation: convert images to tensor and normalize them
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ]
)
# Download MNIST test data
test_dataset = datasets.MNIST(
    root="./data", train=False, download=True, transform=transform
)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)


def get_batch(batch_size):
    loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    batch, labels = next(iter(loader))
    return batch.reshape(batch_size, -1).numpy(), labels.numpy()


def get_activation():
    image, _ = next(iter(test_loader))
    image = image.detach().numpy().reshape(-1)
    return image

# Testing compiled sims


In [51]:
reset_working_block()

In [None]:
sim_w8a16 = CompiledAcceleratorSimulator(config_w8a16, model=model)
sim_w16a16 = CompiledAcceleratorSimulator(config_w16a16, model=model)

Using precompiled library: /Users/kaibreese/UCSD/dsc180b/hardware-accelerators/hardware_accelerators/bin/w8ab16s8/pyrtlsim.so
Using precompiled library: /Users/kaibreese/UCSD/dsc180b/hardware-accelerators/hardware_accelerators/bin/wb16ab16s8/pyrtlsim.so


In [55]:
pyrtl.set_debug_mode(False)

In [8]:
reset_working_block()

config_w8a32 = CompiledAcceleratorConfig(
    array_size=2,
    activation_type=Float32,
    weight_type=Float8,
    multiplier=float_multiplier,
)

sim_w8a32 = CompiledAcceleratorSimulator(config_w8a32, model=model, recompile=True)

Constructing hardware for config w8a32-2x2...
Saved compiled binary for config w8a32-2x2 to /Users/kaibreese/.hardware_accelerators/sim_cache/77203ff37f1343bf


# Running batch inference


In [12]:
batch, labels = get_batch(10)

results = sim_w8a32.predict_batch(batch)
preds = np.argmax(results, axis=1)

print()
print(preds)
print(labels)
print(preds == labels)

Completed 11266/11438 tiles
[7 2 1 0 4 1 4 9 5 9]
[7 2 1 0 4 1 4 9 5 9]
[ True  True  True  True  True  True  True  True  True  True]


In [33]:
pred = sim_w8a16.predict(get_activation())
np.argmax(pred).item(), len(pred)

(7, 10)

In [32]:
pred = sim_w16a16.predict(get_activation())
np.argmax(pred).item(), len(pred)

(7, 10)

In [34]:
batch, labels = get_batch(10)

results = sim_w8a16.predict_batch(batch)
preds = np.argmax(results, axis=1)

print()
print(preds)
print(labels)
print(preds == labels)

Completed 1584/1618 tiles
[7 2 1 0 4 1 4 9 5 9]
[7 2 1 0 4 1 4 9 5 9]
[ True  True  True  True  True  True  True  True  True  True]


In [28]:
batch, labels = get_batch(10)

results = sim_fp8.predict_batch(batch)
preds = np.argmax(results, axis=1)

print()
print(preds)
print(labels)
print(preds == labels)

Completed 1584/1618 tiles
[7 0 7 0 3 0 2 0 9 0]
[7 2 1 0 4 1 4 9 5 9]
[ True False False  True False False False False False False]


In [None]:
print(sim_bf16.config)

CompiledAcceleratorConfig(
        array_size: 8
        activation_type: BF16
        weight_type: BF16
        multiplier: IEEE 754
        accum_addr_width: 12
        pipeline: False
        name: wb16ab16s8
    )


In [79]:
def get_batch(batch_size):
    loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    batch, labels = next(iter(loader))
    return batch.reshape(batch_size, -1).numpy(), labels


batch, labels = get_batch(10)
preds == labels

tensor([True, True, True, True, True, True, True, True, True, True])

In [106]:
isinstance(model, torch.nn.Module)

True

In [84]:
torch.tensor(results, dtype=torch.bfloat16)

tensor([[7.4215e-09, 7.4465e-12, 4.8801e-07, 6.7055e-06, 3.6637e-14, 3.8199e-10,
         9.2704e-15, 1.0000e+00, 1.1350e-09, 4.7730e-08],
        [1.3853e-08, 3.4332e-05, 1.0000e+00, 1.1176e-06, 4.6629e-15, 1.6917e-10,
         3.8417e-09, 4.4587e-13, 9.1735e-08, 2.2471e-13],
        [1.8394e-08, 1.0000e+00, 1.3351e-03, 6.3896e-05, 6.5804e-05, 4.1246e-05,
         2.9951e-06, 2.9206e-05, 7.4863e-05, 8.3074e-07],
        [1.0000e+00, 1.1539e-11, 3.4051e-09, 7.5033e-11, 2.0140e-08, 3.4226e-08,
         2.2352e-07, 2.0373e-10, 4.6020e-10, 3.5912e-06],
        [1.5061e-09, 7.0486e-11, 6.4261e-08, 5.4797e-11, 1.0000e+00, 6.1933e-08,
         1.7812e-08, 5.1036e-07, 4.2201e-09, 5.3406e-04],
        [1.6080e-09, 1.0000e+00, 1.0967e-05, 3.5018e-06, 1.8954e-05, 7.1886e-09,
         4.3656e-09, 3.2425e-05, 1.0058e-06, 4.1444e-08],
        [7.7716e-14, 2.7753e-07, 3.8445e-06, 1.5497e-06, 1.0000e+00, 1.3828e-05,
         2.0838e-08, 5.8711e-06, 8.8120e-04, 1.2207e-04],
        [9.5952e-11, 3.7812

In [92]:
np.sum(results, axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
from torch.nn import CrossEntropyLoss

CrossEntropyLoss()(torch.tensor(results), labels)

1.4622700768294972

In [108]:
type(test_dataset)

torchvision.datasets.mnist.MNIST

In [None]:
# Evaluation function
def evaluate(
    sim: CompiledAcceleratorSimulator,
    model: MLP,
    dataset,
    batch_size,
    criterion=CrossEntropyLoss(),
):
    correct = 0
    total = 0
    running_loss = 0.0
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for batch, labels in data_loader:
        batch = batch.reshape(batch_size, -1).numpy()
        outputs = sim.predict_batch(model, batch)
        loss = criterion(torch.tensor(outputs), labels)
        running_loss += loss.item()

        # Get predictions from the maximum value
        predicted = np.argmax(outputs, axis=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        break
    avg_loss = running_loss / len(data_loader)
    accuracy = 100.0 * correct / total
    return avg_loss, accuracy


evaluate(sim_bf16, model, test_dataset, 10)

[array([[0.      , 0.      , 3.109375, 0.      ],
       [0.      , 9.9375  , 0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      ],
       [0.      , 8.0625  , 3.6875  , 3.078125],
       [0.      , 7.625   , 0.546875, 0.      ],
       [0.      , 0.      , 0.      , 0.      ],
       [0.      , 2.609375, 1.15625 , 0.      ],
       [0.      , 0.      , 0.      , 0.      ],
       [0.      , 9.5625  , 0.      , 0.      ],
       [0.      , 7.375   , 0.      , 3.78125 ]]), array([[ 0.      ,  6.8125  ,  6.625   ,  0.      ],
       [ 1.9375  ,  0.      ,  0.      ,  4.9375  ],
       [ 0.      ,  0.      ,  0.      ,  6.8125  ],
       [ 0.      ,  0.      ,  6.9375  ,  0.      ],
       [ 0.      ,  5.3125  , 11.8125  ,  0.      ],
       [ 0.      ,  1.6875  ,  0.      ,  6.0625  ],
       [ 0.      ,  1.078125,  8.5625  ,  0.      ],
       [ 0.      ,  3.96875 ,  3.96875 ,  4.0625  ],
       [ 0.      ,  0.      , 10.6875  ,  0.      ],
       [ 0.      ,  7.6875  , 

(0.001462270076829497, 100.0)

In [None]:
from hardware_accelerators.simulation.matrix_utils import bias_trick


weights_1 = model.fc1.weight.numpy(force=True)
bias_1 = model.fc1.bias.numpy(force=True)
weights_2 = model.fc2.weight.numpy(force=True)
bias_2 = model.fc2.bias.numpy(force=True)

# Apply the bias trick
W1_aug = bias_trick(weights_1, bias_1)
W2_aug = bias_trick(weights_2, bias_2)

W1_aug.shape[0]

128

In [None]:
10000 / 10

1000.0

# Compiling all configurations


In [None]:
from typing import Iterator, Type, List, Callable
from itertools import product

from hardware_accelerators.dtypes import *


def generate_accelerator_configs(
    array_size: int = 16,
    dtypes: List[Type[BaseFloat]] = None,
    multipliers: List[Callable] = None,
) -> Iterator[CompiledAcceleratorConfig]:
    """
    Generate all valid CompiledAcceleratorConfig combinations.

    Args:
        array_size: Size of the systolic array
        dtypes: List of data types to consider. Defaults to [Float8, BF16, FP16, FP32]
        multipliers: List of multiplier functions. Defaults to [float_multiplier, lmul]

    Yields:
        Valid CompiledAcceleratorConfig objects

    Restrictions:
        1. The activation_type must be greater than or equal to the weight_type in terms of bitwidth.
        2. 16-bit float types (BF16, FP16) should not be combined with each other.
           They should only pair with themselves or with FP32.
    """
    if dtypes is None:
        dtypes = [Float8, BF16, Float16, Float32]

    if multipliers is None:
        multipliers = [float_multiplier, lmul_fast]

    # Sort dtypes by bitwidth for easier comparison
    dtype_bitwidths = {dtype: dtype.bitwidth() for dtype in dtypes}
    sorted_dtypes = sorted(dtypes, key=lambda d: dtype_bitwidths[d])

    # Identify 16-bit float types
    bit16_float_types = [dtype for dtype in dtypes if dtype_bitwidths[dtype] == 16]

    # Generate all combinations
    for multiplier in multipliers:
        for weight_type in sorted_dtypes:
            # Find valid activation types based on bitwidth
            valid_activation_types = [
                dtype
                for dtype in sorted_dtypes
                if dtype_bitwidths[dtype] >= dtype_bitwidths[weight_type]
            ]

            for activation_type in valid_activation_types:
                # Skip invalid combinations of 16-bit float types
                if (
                    weight_type in bit16_float_types
                    and activation_type in bit16_float_types
                    and weight_type != activation_type
                ):
                    continue

                yield CompiledAcceleratorConfig(
                    array_size=array_size,
                    activation_type=activation_type,
                    weight_type=weight_type,
                    multiplier=multiplier,
                )


# Example usage:
def print_all_configs():
    for i, config in enumerate(generate_accelerator_configs()):
        print(f"Config {i+1}:")
        print(f"  Array Size: {config.array_size}")
        print(f"  Activation Type: {config.activation_type.__name__}")
        print(f"  Weight Type: {config.weight_type.__name__}")
        print(f"  Multiplier: {config.multiplier.__name__}")
        print()

In [32]:
print_all_configs()

Config 1:
  Array Size: 16
  Activation Type: Float8
  Weight Type: Float8
  Multiplier: float_multiplier

Config 2:
  Array Size: 16
  Activation Type: BF16
  Weight Type: Float8
  Multiplier: float_multiplier

Config 3:
  Array Size: 16
  Activation Type: Float16
  Weight Type: Float8
  Multiplier: float_multiplier

Config 4:
  Array Size: 16
  Activation Type: Float32
  Weight Type: Float8
  Multiplier: float_multiplier

Config 5:
  Array Size: 16
  Activation Type: BF16
  Weight Type: BF16
  Multiplier: float_multiplier

Config 6:
  Array Size: 16
  Activation Type: Float32
  Weight Type: BF16
  Multiplier: float_multiplier

Config 7:
  Array Size: 16
  Activation Type: Float16
  Weight Type: Float16
  Multiplier: float_multiplier

Config 8:
  Array Size: 16
  Activation Type: Float32
  Weight Type: Float16
  Multiplier: float_multiplier

Config 9:
  Array Size: 16
  Activation Type: Float32
  Weight Type: Float32
  Multiplier: float_multiplier

Config 10:
  Array Size: 16
  Activa