In [4]:
!ls /workspace/results/

df_model.onnx		    single_node_models	syn_v0	syn_v3	syn_v6
quartznet_hls.onnx	    syn_end2end		syn_v1	syn_v4	syn_v7
quartznet_hls_cleaned.onnx  syn_low_folding	syn_v2	syn_v5


In [5]:
from finn.util.visualization import showInNetron

showInNetron("/workspace/results/quartznet_hls_cleaned.onnx")

Stopping http://0.0.0.0:8081
Serving '/workspace/results/quartznet_hls_cleaned.onnx' at http://0.0.0.0:8081


In [6]:
from finn.core.modelwrapper import ModelWrapper
from finn.custom_op.registry import getCustomOp
from finn.transformation.fpgadataflow.set_folding import SetFolding
from finn.transformation.create_generic_partitions import PartitionFromDict
from finn.transformation.extend_partition import ExtendPartition
from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles
from finn.util.fpgadataflow import is_fpgadataflow_node

model = ModelWrapper("/workspace/results/quartznet_hls_cleaned.onnx")

model = model.transform(PartitionFromDict({0: range(1,371)}))
for n in model.graph.node:
    if n.op_type=="GenericPartition":
        inst = getCustomOp(n)
        model_path = inst.get_nodeattr("model")
        model_partition = ModelWrapper(model_path)
        model_partition = model_partition.transform(SetFolding(target_cycles_per_frame=4300000)) 
        model_partition = model_partition.transform(AnnotateCycles())
        model_partition.save(model_path)
        
model = model.transform(ExtendPartition([1]))

for n in model.graph.node:
    if is_fpgadataflow_node(n):
        inst = getCustomOp(n)
        if n.op_type=="FMPadding_Batch":
            continue
        elif n.op_type=="DuplicateStreams_Batch":
            continue
        elif n.op_type=="AddStreams_Batch":
            continue
            
        elif n.op_type=="ConvolutionInputGenerator1D":
            #Why? Because percentage-wise the least amount of LUTs are used to implement the ConvInpGen (after BRAM).
            # For BRAM, you are using ~1%, for URAM ~2%, for LUTs ~1.10%
            # However, we want to keep the BRAMs for the thresholds of the VVAU/StreamingFCLayer
            inst.set_nodeattr("ram_style", "distributed")
            
        elif n.op_type=="Vector_Vector_Activate_Batch":
            inst.set_nodeattr("resType", "dsp")
            
        elif n.op_type=="StreamingFCLayer_Batch":
            inst.set_nodeattr("resType", "dsp")
            inst.set_nodeattr("ram_style", "ultra")
            inst.set_nodeattr("mem_mode", "decoupled")
            if inst.get_nodeattr("ram_style")=="ultra":
                inst.set_nodeattr("runtime_writeable_weights", 1)
            
        elif n.op_type=="Thresholding_Batch":
            inst.set_nodeattr("ram_style", "distributed")
            inst.set_nodeattr("mem_mode", "const")
            
        else:
            print("Missed: {}".format(n.op_type))
            break

model.save("/workspace/results/quartznet_hls_cleaned_partitioned.onnx")

In [7]:
from finn.util.visualization import showInNetron
showInNetron("/workspace/results/quartznet_hls_cleaned_partitioned.onnx")

Stopping http://0.0.0.0:8081
Serving '/workspace/results/quartznet_hls_cleaned_partitioned.onnx' at http://0.0.0.0:8081


# Compare model

In [49]:
import numpy as np
from finn.core.modelwrapper import ModelWrapper
from finn.util.basic import gen_finn_dt_tensor
import finn.core.onnx_exec as oxe
import time

t1 = time.perf_counter()

################################################################################################
####
#### MODEL 1
####
model_1 = ModelWrapper("/workspace/finn/end2end_quartznet_export_dev.onnx")

#### MODEL 1
# Create input data
input0_tensor_name = model_1.graph.input[0].name

## Change input...
input_val = np.load("brevitas_reference/end2end_quartznet_input.npy")
input_val = input_val[:,:,0:256]

# Quantize input data
input_val = quantize_tensor(input_val, num_of_bits=8)

input_dict = {}
input_dict[input0_tensor_name] = input_val
output0_tensor_name = model_1.graph.output[0].name

expected_m1_dict = oxe.execute_onnx(model_1, input_dict, return_full_exec_context = False)
expected_m1 = expected_m1_dict[output0_tensor_name]
################################################################################################

t2 = time.perf_counter() - t1
print("Elapsed time: {}".format(t2))

Elapsed time: 338.4672104109777


In [71]:
!rm brevitas_reference/end2end_quartznet_input_quantized.npy

In [74]:
import numpy as np

def quantize_tensor_v2(x, num_of_bits=8):
    # https://leimao.github.io/article/Neural-Networks-Quantization/
    alpha = x.min()
    beta = x.max()
    #alpha = -4.743273735046387
    #beta = 29.181787490844727

    b = num_of_bits
    alpha_q = -2 ** (b-1)
    beta_q = 2 ** (b-1) - 1

    s = (beta - alpha) / (beta_q - alpha_q)
    z = int((beta * alpha_q - alpha * beta_q) / (beta - alpha))

    x_q = np.round(1 / s * x + z, decimals=0)
    #x_q = torch.round(1 / s * x + z)
    x_q = np.clip(x_q, a_min=alpha_q, a_max=beta_q)
    #x_q = torch.clamp(x_q, min=alpha_q, max=beta_q)

    return x_q

input_val = np.load("brevitas_reference/end2end_quartznet_input.npy")

input_val_quantized = np.zeros(np.shape(input_val))
channels = np.shape(input_val)[1]
for c in range(channels):
    input_val_quantized[0, c, :] = quantize_tensor_v2(input_val[0, c, :], 8)

np.save("brevitas_reference/end2end_quartznet_input_quantized.npy")
print(np.shape(input_val))
print(np.shape(input_val_quantized))

TypeError: _save_dispatcher() missing 1 required positional argument: 'arr'

In [61]:
labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
         "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
    
def __ctc_decoder_predictions_tensor(tensor, labels):
    """
    Decodes a sequence of labels to words
    """
    blank_id = len(labels)
    hypotheses = []
    labels_map = dict([(i, labels[i]) for i in range(len(labels))])
    decoded_prediction = []
    previous = len(labels)  # id of a blank symbol
    prediction = tensor
    for p in prediction:
        if (p != previous or previous == blank_id) and p != blank_id:
            decoded_prediction.append(p)
        previous = p
    hypothesis = ''.join([labels_map[c] for c in decoded_prediction])
    hypotheses.append(hypothesis)
    return hypotheses

In [63]:
previous = [28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  22, 28, 28,  1, 28, 28, 14, 28, 28,  9, 28, 28, 20, 28, 28, 25, 28, 28, 28,  0,  0, 28,  1, 14,
   4,  4,  0,  0, 28, 22, 28, 28,  5, 28, 28, 28, 28, 24, 28, 28, 28, 28,  1, 28, 28, 28, 20,  9,
   9, 28, 15, 14, 28, 28,  0,  0,  0, 28, 28, 15,  6, 28, 28, 28,  0, 28, 28, 19, 19, 28, 16, 28,
  28, 28, 28,  1, 28, 28,  4, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
  28, 28, 28, 28, 28, 28, 28, 28]
print(expected_m1)

print(__ctc_decoder_predictions_tensor(previous, labels))
print(__ctc_decoder_predictions_tensor(expected_m1[0], labels))

[[28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
  28 28 28 28 28  5 14 28 28 28 28 28 20 28 28 25 28 28 28  0 28 28 28  1
  28 28 28 28  2 28 28  5 28 28 28 28 24 28 28 28 28 28  9 28 28 19 28 28
  28 28  0 28 28 28 28  3 18 18 28 28  9 19 19 28 28 11 28 28 28 28 28  1
  28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
  28 28 28 28 28 28 28 28]]
['vanity and vexation of spad']
['enty abexis criska']


In [12]:
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode

t1 = time.perf_counter()

################################################################################################
####
#### MODEL 2
####
model_2 = ModelWrapper("/workspace/results/quartznet_hls_cleaned_partitioned.onnx")
exec_mode="cppsim"

if exec_mode=="cppsim":
    model_2 = model_2.transform(PrepareCppSim())
    model_2 = model_2.transform(CompileCppSim())
    model_2 = model_2.transform(SetExecMode("cppsim"))

#### MODEL 2
m1_input_val = input_val

input0_tensor_name = model_2.graph.input[0].name
input_dict = {}
m2_input_val = np.reshape(m1_input_val, np.shape(m1_input_val)+(1,)) #extend to 4D
input_dict[input0_tensor_name] = m2_input_val
output0_tensor_name = model_2.graph.output[0].name

expected_m2_dict = oxe.execute_onnx(model_2, input_dict, return_full_exec_context = False)
expected_m2 = expected_m2_dict[output0_tensor_name]

expected_m2 = np.reshape(expected_m2, np.shape(expected_m1))
m2_input_val = np.reshape(m2_input_val, np.shape(m1_input_val))

#assert(m1_input_val==m2_input_val).all()
#assert(expected_m1==expected_m2).all()
################################################################################################

t2 = time.perf_counter() - t1
print("Elapsed time: {}".format(t2))

  "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"

KeyboardInterrupt



In [None]:
assert(m1_input_val==m2_input_val).all()
assert(expected_m1==expected_m2).all()


In [None]:
for idx, el in expected_m1:
    print("{}\t{}".formated(el, expected_m2[idx]))