In [186]:
from typing import Type
import pyrtl
from pyrtl import *
import numpy as np
import onnx
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from enum import IntEnum
from hardware_accelerators.nn import load_model, MLP
from hardware_accelerators.dtypes import *
from hardware_accelerators.rtllib import *
from hardware_accelerators.simulation import AcceleratorSimulator
from hardware_accelerators.simulation.matrix_utils import *
from hardware_accelerators.simulation.buffer import WeightFIFOSimulator

# New Accelerator Sim Testing


## GEMM


In [None]:
# GEMM smoke test: multiply a 3x3 activation matrix by an all-ones 3x3 weight
# matrix on the simulated accelerator with ReLU enabled, then compare the
# collected outputs against the same computation done in numpy.
simulator = AcceleratorSimulator.default_config(array_size=3, num_weight_tiles=2)

simulator.setup()

weights = np.ones((3, 3))
activations = np.array([[1, 2, 3], [-4, -5, -6], [7, 8, 9]])

# Store the weight matrix at tile address 0.
simulator.load_weights(weights, 0)

# Feed one activation row per instruction. The first instruction also sets
# load_new_weights=True and leaves accum_addr at its default; rows 1 and 2 are
# directed to accumulator addresses 1 and 2.
simulator.execute_instruction(
    data_vec=activations[0],
    load_new_weights=True,
    flush_pipeline=False,
    activation_enable=True,
    activation_func="relu",
)
simulator.execute_instruction(
    data_vec=activations[1],
    accum_addr=1,
    flush_pipeline=False,
    activation_enable=True,
    activation_func="relu",
)
# Only the final row sets flush_pipeline=True.
simulator.execute_instruction(
    data_vec=activations[2],
    accum_addr=2,
    activation_enable=True,
    activation_func="relu",
    flush_pipeline=True,
)

# One output row per activation row.
results = np.zeros((activations.shape[0], weights.shape[1]))

# Read a row of outputs, then issue a nop before reading the next one.
# NOTE(review): presumably each nop advances the simulation so the next row
# reaches the output -- confirm against AcceleratorSimulator.
for i in range(3):
    results[i] = simulator._get_outputs()
    simulator.execute_instruction(nop=True)

# Ground truth: ReLU(activations @ weights) computed in numpy.
gt = np.maximum(0, (activations @ weights))

assert np.isclose(results, gt).all()

# Display the recorded per-step simulation history.
simulator.history

[
 Simulation Step 0
 Input Signals:
 --------------------------------------------------------------------------------
   data_enable: 0
   data_inputs: None
   weight_start: 1
   weight_tile_addr: 0
   accum_addr: 0
   accum_mode: 0
   act_start: 0
   act_func: 0
 
 Systolic Array State:
 --------------------------------------------------------------------------------
 Inputs:
   w_en: 0
   enable: 0
   weights: [0. 0. 0.]
   data: [0. 0. 0.]
 
 Weights Matrix:
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Data Matrix:
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Accumulators:
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Control Registers:
 data_controls: [0, [0, 0]]
 accum_controls: [0, 0, 0]
 control_out: 0
 
 Outputs:
 [0. 0. 0.]
 ----------------------------------------
 
 
 Accumulator State:
 --------------------------------------------------------------------------------
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Outputs

## GEMV


In [None]:
# GEMV smoke test: push a single activation vector through a 3x3 weight tile
# with ReLU enabled and read back the resulting output vector.
simulator = AcceleratorSimulator.default_config(array_size=3, num_weight_tiles=2)
simulator.setup()

# After the transpose every row of w is [1, 2, 3].
w = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]]).T
activations = np.array([0, 0.5, 1])

# Store the weight matrix at tile address 0.
simulator.load_weights(w, 0)

# A single instruction both requests the weight load and supplies the vector.
simulator.execute_instruction(
    data_vec=activations,
    load_new_weights=True,
    flush_pipeline=True,
    activation_enable=True,
    activation_func="relu",
)

# Show the reference weights next to the systolic array state reported by the simulator.
print(f"Weights:\n{w}")
print(
    f"Systolic Array (weights loaded):\n{simulator.accelerator.inspect_systolic_array_state(simulator.sim)}"
)

# Two nops are issued before the outputs are read.
# NOTE(review): presumably these give the result time to reach the output -- confirm.
simulator.execute_instruction(nop=True)
simulator.execute_instruction(nop=True)

results = simulator._get_outputs()
results

Weights:
[[1 2 3]
 [1 2 3]
 [1 2 3]]
Systolic Array (weights loaded):
Inputs:
  w_en: 0
  enable: 0
  weights: [0. 0. 0.]
  data: [0. 0. 0.]

Weights Matrix:
[[1. 2. 3.]
 [1. 2. 3.]
 [1. 2. 3.]]

Data Matrix:
[[0.  0.5 1. ]
 [0.5 1.  0. ]
 [1.  0.  0.5]]

Accumulators:
[[0.  1.  3. ]
 [0.5 3.  3. ]
 [0.  0.  0. ]]

Control Registers:
data_controls: [0, [0, 0]]
accum_controls: [0, 0, 1]
control_out: 0

Outputs:
[0. 0. 0.]
----------------------------------------



array([1.5, 3. , 4.5])

In [None]:
simulator.history

[
 Simulation Step 0
 Input Signals:
 --------------------------------------------------------------------------------
   data_enable: 0
   data_inputs: None
   weight_start: 1
   weight_tile_addr: 0
   accum_addr: 0
   accum_mode: 0
   act_start: 0
   act_func: 0
 
 Systolic Array State:
 --------------------------------------------------------------------------------
 Inputs:
   w_en: 0
   enable: 0
   weights: [0. 0. 0.]
   data: [0. 0. 0.]
 
 Weights Matrix:
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Data Matrix:
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Accumulators:
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Control Registers:
 data_controls: [0, [0, 0]]
 accum_controls: [0, 0, 0]
 control_out: 0
 
 Outputs:
 [0. 0. 0.]
 ----------------------------------------
 
 
 Accumulator State:
 --------------------------------------------------------------------------------
 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]
 
 Outputs

## Weight Loading


In [None]:
# Weight-loading trace: request two different weight tiles back to back and
# print the systolic array's weight matrix at every recorded step.
#
# NOTE: this cell reuses the `simulator` object created by an earlier cell; the
# constructor call below is commented out, so the cell fails on its own.
# simulator = AcceleratorSimulator.default_config(array_size=3, num_weight_tiles=2)
simulator.setup()

w = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
w_neg = w * -1

# Tile address 0 holds w, tile address 1 holds its negation.
simulator.load_weights(w, 0)
simulator.load_weights(w_neg, 1)

# Request tile 0, then tile 1, then issue one nop.
simulator.execute_instruction(load_new_weights=True, weight_tile_addr=0)
simulator.execute_instruction(load_new_weights=True, weight_tile_addr=1)
simulator.execute_instruction(nop=True)

# Print the weight matrix captured in each history entry.
for step in simulator.history:
    print(f"Step {step.step}\n{step.systolic_state.weights}\n")

Step 0
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

Step 1
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

Step 2
[[3. 1. 2.]
 [0. 0. 0.]
 [0. 0. 0.]]

Step 3
[[2. 3. 1.]
 [3. 1. 2.]
 [0. 0. 0.]]

Step 4
[[1. 2. 3.]
 [2. 3. 1.]
 [3. 1. 2.]]

Step 5
[[1. 2. 3.]
 [2. 3. 1.]
 [3. 1. 2.]]

Step 6
[[-3. -1. -2.]
 [ 1.  2.  3.]
 [ 2.  3.  1.]]

Step 7
[[-2. -3. -1.]
 [-3. -1. -2.]
 [ 1.  2.  3.]]



# PyTorch Utilities


## Load MNIST Test Data


In [2]:
# Preprocessing pipeline: convert each image to a tensor, then normalize it
# with mean 0.1307 and standard deviation 0.3081.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.1307,), (0.3081,))
transform = transforms.Compose([to_tensor, normalize])

# MNIST test split stored under ./data (download=True lets torchvision fetch it).
test_dataset = datasets.MNIST(
    root="./data",
    train=False,
    download=True,
    transform=transform,
)

# Serve the samples one at a time, without shuffling.
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

In [3]:
image, label = next(iter(test_loader))
image = image.numpy().reshape(-1)
image.shape, label

((784,), tensor([7]))

## Initializing the trained PyTorch model


In [190]:
model = load_model("models/mlp_mnist.pth")
model.eval()

MLP(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=784, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

In [191]:
for name, param in model.named_parameters():
    print(name)

fc1.weight
fc1.bias
fc2.weight
fc2.bias


### Numpy only inference

In [192]:
inputs = image  # numpy vector representing the image


# numpy softmax
def softmax(x: np.ndarray):
    """Return the softmax of ``x``, normalized along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so that
    large inputs do not overflow ``np.exp``.

    Args:
        x: Array of scores.

    Returns:
        Array with the same shape as ``x`` whose entries sum to 1 along axis 0.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0)


# Export the trained parameters of both linear layers as numpy arrays.
fc1_weight, fc1_bias = model.fc1.weight.data.numpy(), model.fc1.bias.data.numpy()
fc2_weight, fc2_bias = model.fc2.weight.data.numpy(), model.fc2.bias.data.numpy()

# Forward pass in plain numpy: fc1 -> ReLU -> fc2 -> softmax.
# The weights are transposed because the input vector multiplies from the left.
x = np.maximum(0, inputs @ fc1_weight.T + fc1_bias)
x = softmax(x @ fc2_weight.T + fc2_bias)

# The predicted digit is the position of the largest probability.
predicted_class = np.argmax(x)
print(f"Predicted class: {predicted_class}")

Predicted class: 7


### Generalized parameter/layer extraction


In [51]:
def get_layer_info(model):
    """Describe the supported layers of ``model`` as a list of plain dicts.

    Walks ``model.named_modules()`` (skipping the root module) and records one
    entry per ``nn.Linear``, ``nn.ReLU`` or ``nn.Flatten`` module, in traversal
    order. Modules of any other type are ignored.

    Args:
        model: The ``torch.nn.Module`` to inspect.

    Returns:
        A list of dicts, each with a ``"type"`` key (``"linear"``, ``"relu"``
        or ``"flatten"``). Linear entries also carry ``"weight"`` and
        ``"bias"`` as numpy arrays; ``"bias"`` is ``None`` for a bias-free layer.
    """

    def _to_numpy(param):
        # force=True detaches the parameter and moves it to the CPU if needed,
        # so extraction also works for models that live on another device.
        return param.numpy(force=True)

    layers = []
    for name, module in model.named_modules():
        # Skip the root module itself
        if name == "":
            continue

        if isinstance(module, nn.Linear):
            layers.append(
                {
                    "type": "linear",
                    "weight": _to_numpy(module.weight),
                    "bias": (
                        _to_numpy(module.bias) if module.bias is not None else None
                    ),
                }
            )
        elif isinstance(module, nn.ReLU):
            layers.append({"type": "relu"})
        elif isinstance(module, nn.Flatten):
            layers.append({"type": "flatten"})
        # Add more layer types as needed
        # elif isinstance(module, nn.Conv2d):
        #     layers.append(...)

    return layers


def numpy_inference(image, layers):
    """Run ``image`` through the layer descriptions using numpy only.

    Args:
        image: Input array.
        layers: Sequence of layer dicts with a ``"type"`` key. ``"linear"``
            entries need ``"weight"`` and ``"bias"`` (``None`` for no bias).

    Returns:
        The array produced after the last layer. Entries whose type is not
        ``"linear"``, ``"relu"`` or ``"flatten"`` leave the value unchanged.
    """
    out = image

    for spec in layers:
        kind = spec["type"]
        if kind == "flatten":
            # Collapses to 1D; a no-op when the value is already 1D.
            out = out.flatten()
        elif kind == "relu":
            out = np.maximum(0, out)
        elif kind == "linear":
            out = np.dot(spec["weight"], out)
            bias = spec["bias"]
            if bias is not None:
                out = out + bias

    return out


# Extract model structure
layers = get_layer_info(model)

# Print model structure
for i, layer in enumerate(layers):
    if layer["type"] == "linear":
        # get_layer_info stores bias=None for a bias-free Linear layer, so the
        # shape is looked up defensively instead of calling .shape on None.
        bias = layer["bias"]
        bias_shape = None if bias is None else bias.shape
        print(
            f"Layer {i}: Linear - Weight shape: {layer['weight'].shape}, Bias shape: {bias_shape}"
        )
    else:
        print(f"Layer {i}: {layer['type']}")

# Run inference; the predicted class is the index of the largest output value
prediction = numpy_inference(image, layers)
predicted_class = np.argmax(prediction)
predicted_class

Layer 0: flatten
Layer 1: Linear - Weight shape: (128, 784), Bias shape: (128,)
Layer 2: relu
Layer 3: Linear - Weight shape: (10, 128), Bias shape: (10,)


np.int64(7)

# Running model on the accelerator sim


In [194]:
# Simulator configuration for the MNIST MLP run: array_size 4, 8 weight tiles,
# BF16 for data, weights and accumulators, floating-point adder/multiplier
# units, pipelining disabled, and an 8-bit accumulator address.
config = AcceleratorConfig(
    array_size=4,
    num_weight_tiles=8,
    data_type=BF16,
    weight_type=BF16,
    accum_type=BF16,
    pe_adder=float_adder,
    pe_multiplier=float_multiplier,
    pipeline=False,
    accum_adder=float_adder,
    accum_addr_width=8,
)

# Replaces the `simulator` used by the earlier test cells.
simulator = AcceleratorSimulator(config)

In [195]:
# Take the first test sample as the activation vector.
activations, label = next(iter(test_loader))

# Flatten the image tensor into a 1D numpy vector
activations = activations.numpy().reshape(-1)

# fc1 parameters as numpy arrays. No transpose is applied here: the weights stay
# in the layer's own layout, which prints below as (128, 784).
weights = model.fc1.weight.numpy(force=True)
bias = model.fc1.bias.numpy(force=True)

print(f"{activations.shape=}, {weights.shape=}, {bias.shape=}")

# Fold the bias into the matmul: append the bias as an extra weight column and
# append a constant 1 to the activation vector.
weights_bias = np.concatenate([weights, bias.reshape(-1, 1)], axis=1)
activations_bias = np.concatenate([activations, np.ones(1)])
print(f"{activations_bias.shape=}, {weights_bias.shape=}")

# Check that the fused product matches the explicit "matmul + bias" form
result = weights @ activations + bias
fused_result = weights_bias @ activations_bias

np.isclose(result, fused_result).all()

activations.shape=(784,), weights.shape=(128, 784), bias.shape=(128,)
activations_bias.shape=(785,), weights_bias.shape=(128, 785)


np.True_

In [199]:
activations.shape

(784,)

In [197]:
sim_logprobs = simulator.simulate_pytorch_model(model, activations)
sim_logprobs, np.argmax(sim_logprobs)

fc1_weight: (128, 784)
fc1_bias: (128,)
fc2_weight: (10, 128)
fc2_bias: (10,)
inputs: (784,)
(785,) (128, 785)
fc1_out: (128,)
(129,) (10, 129)


(array([3.33605200e-03, 3.91608960e-02, 1.95792162e-01, 6.66149281e-02,
        6.45654022e-02, 2.24409809e-03, 2.24409809e-03, 2.24409809e-03,
        1.38795987e-03, 1.78539341e-04, 6.22226204e-01, 5.56256302e-06]),
 np.int64(10))

In [None]:
# Tiled GEMV: split the fc1 product into array_size-wide tiles, run every tile
# on the simulator, and build a numpy reference from the same tiles.
simulator.setup()
# chunk_size = 16
chunk_size = config.array_size
tile_generator = generate_gemv_tiles(activations, weights, chunk_size)

# Numpy reference: one accumulated partial result per tile.index.
result = []

for tile in tile_generator:
    # partial = tile.matrix @ tile.vector
    partial = tile.vector @ tile.matrix
    # The first tile seen for an index starts a new entry; later tiles with the
    # same index are added onto it.
    if tile.index >= len(result):
        result.append(partial)
    else:
        result[tile.index] += partial

    # Every tile is staged through weight tile address 0 and executed at once.
    simulator.load_weights(weights=tile.matrix, tile_addr=0)
    # accum_mode is off for the first tile of an index and on afterwards;
    # activation is only enabled for the tile flagged as last.
    # NOTE(review): presumably accum_mode=False overwrites the accumulator row
    # and True adds to it -- confirm against AcceleratorSimulator.
    simulator.execute_instruction(
        load_new_weights=True,
        weight_tile_addr=0,
        data_vec=tile.vector,
        accum_addr=tile.index,
        accum_mode=not tile.first,
        activation_func="relu",
        activation_enable=tile.last,
        flush_pipeline=True,
    )

    # simulator.step()
    # sim_partial = simulator.accelerator.inspect_accumulator_state(
    #     simulator.sim
    # )[tile.index]

    # print(tile)
    # print(f"Calculated partial:\n{result[tile.index]}\n")
    # print(f"Simulation Partial Result:\n{sim_partial}\n\n")

    # if tile.last:
    #     break

In [None]:
result = np.array(result).flatten()

# simulator.execute_instruction(nop=True)
# simulator.execute_instruction(nop=True)
sim_result = simulator.output_trace

print(f"{sim_result=}")
print(f"{result=}")

sim_result=[array([ 2.546875, -3.078125,  0.140625,  4.03125 ]), array([-2.046875, -1.84375 , -4.28125 , -1.6875  ])]
result=array([ 2.55966587, -3.09237437,  0.13641229,  4.04874201, -2.07218613,
       -2.25486009, -4.17895806, -1.71206021])


In [155]:
sim_fc1 = simulator.accelerator.inspect_accumulator_state(simulator.sim).flatten()[:128]

sim_fc1

array([ -1.453125  ,  -2.265625  ,  -4.        ,  -2.75      ,
         1.140625  ,   0.12792969,   1.09375   ,  -0.84765625,
        -0.5390625 ,  -1.2109375 ,  -4.53125   ,  -3.59375   ,
        -1.78125   ,  -0.80859375,  -2.96875   ,  -3.4375    ,
        -3.890625  ,  -2.9375    ,  -3.640625  ,  -3.875     ,
        -3.984375  ,  -2.5       ,  -2.96875   ,  -3.859375  ,
        -1.6015625 ,   2.953125  ,   0.953125  ,   1.1171875 ,
        -7.96875   , -10.        ,  -7.75      ,  -8.3125    ,
         1.8984375 ,   0.8203125 ,  -1.484375  ,   0.66796875,
        -2.671875  ,  -4.71875   ,  -6.15625   ,  -8.        ,
        -7.03125   ,  -5.875     ,  -4.25      ,  -4.375     ,
        -3.703125  ,  -6.90625   ,  -2.59375   ,  -3.859375  ,
        -2.765625  ,  -1.953125  ,   1.046875  ,   3.125     ,
         0.171875  ,   0.80859375,   1.21875   ,  -2.875     ,
        -1.765625  ,   0.84375   ,   0.11474609,  -0.27539062,
        -5.40625   ,  -0.88671875,   0.83984375,   0.65

In [None]:
sim_fc1 = np.maximum(0, sim_result)

ValueError: operands could not be broadcast together with shapes (128,) (4,) 

In [None]:
def simulate_linear_layer(
    self,
    inputs: np.ndarray,
    weights: np.ndarray,
    bias: np.ndarray,
    activation_func=None,
) -> np.ndarray:
    """Run one linear layer on this simulator as a sequence of GEMV tiles.

    The bias is folded into the matrix product: it is appended to ``weights``
    as an extra column and a constant 1 is appended to the flattened inputs.
    The fused problem is split by ``generate_gemv_tiles`` using
    ``self.config.array_size``, and each tile is loaded into weight tile
    address 0 and executed immediately.

    Args:
        inputs: Layer input; flattened before use.
        weights: Weight matrix whose rows line up with ``bias``.
        bias: Bias vector, reshaped to a column before concatenation.
        activation_func: Passed through to ``execute_instruction``; only
            enabled on tiles flagged as ``last``.

    Returns:
        The flattened contents of ``self.output_trace`` collected during the
        run. ``self.output_trace`` is reset to an empty list afterwards.
    """
    weights_bias = np.concatenate([weights, bias.reshape(-1, 1)], axis=1)
    activations = np.concatenate([inputs.flatten(), np.ones(1)])

    tile_generator = generate_gemv_tiles(
        activations, weights_bias, self.config.array_size
    )

    # Drive the instance this method was called on. The previous body used the
    # notebook-global `simulator` here while reading `self.config` and
    # `self.output_trace`, so the two could refer to different simulators.
    for tile in tile_generator:
        self.load_weights(weights=tile.matrix, tile_addr=0)
        self.execute_instruction(
            load_new_weights=True,
            weight_tile_addr=0,
            data_vec=tile.vector,
            accum_addr=tile.index,
            # Do not accumulate on the first tile of an output index.
            accum_mode=not tile.first,
            activation_func=activation_func,
            activation_enable=tile.last,
            flush_pipeline=True,
        )

    # One trailing nop before the trace is collected.
    self.execute_instruction(nop=True)

    # NOTE(review): a recorded run returned 12 values for a 10-feature layer at
    # array_size=4, so the trace may include tile padding -- confirm whether the
    # result should be trimmed to weights.shape[0].
    result = np.array(self.output_trace).flatten()
    self.output_trace = []
    return result


def simulate_pytorch_model(self, model: MLP, inputs: np.ndarray) -> np.ndarray:
    """Run a two-layer MLP (``fc1`` -> ReLU -> ``fc2``) on the accelerator simulator.

    Args:
        model: Model exposing ``fc1`` and ``fc2`` linear layers
        inputs: Input data as numpy array

    Returns:
        Softmax of the simulated ``fc2`` outputs as numpy array
    """

    def _weight_and_bias(layer):
        # Export a linear layer's parameters as numpy arrays (weight first).
        return layer.weight.data.numpy(force=True), layer.bias.data.numpy(force=True)

    hidden_weight, hidden_bias = _weight_and_bias(model.fc1)
    output_weight, output_bias = _weight_and_bias(model.fc2)

    # Hidden layer with ReLU applied by the simulator.
    hidden = self.simulate_linear_layer(
        inputs=inputs,
        weights=hidden_weight,
        bias=hidden_bias,
        activation_func="relu",
    )
    # Output layer without an activation function.
    logits = self.simulate_linear_layer(
        inputs=hidden,
        weights=output_weight,
        bias=output_bias,
    )

    return softmax(logits)

linear
relu


  x = inputs @ weights.T + bias
  x = np.maximum(0, x)


TypeError: max() received an invalid combination of arguments - got (out=NoneType, axis=NoneType, ), but expected one of:
 * ()
 * (Tensor other)
 * (int dim, bool keepdim = False)
      didn't match because some of the keywords were incorrect: out, axis
 * (name dim, bool keepdim = False)
      didn't match because some of the keywords were incorrect: out, axis


In [182]:
next(iter(test_loader))[0]

tensor([[[[-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.424