In [23]:
import numpy as np
from onnx import TensorProto, helper

import finn.core.onnx_exec as oxe
from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
from finn.core.datatype import DataType
from finn.core.modelwrapper import ModelWrapper
from finn.custom_op.general.multithreshold import multithreshold
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.general import GiveUniqueNodeNames
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.util.basic import gen_finn_dt_tensor
from finn.custom_op.registry import getCustomOp
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
import os
from finn.util.pyverilator import axilite_read, axilite_write
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.core.rtlsim_exec import rtlsim_exec
import finn.core.data_layout as DataLayout
import finn.transformation.fpgadataflow .convert_to_hls_layers as to_hls
from finn.transformation.infer_shapes import InferShapes
from finn.transformation.infer_data_layouts import InferDataLayouts

# FPGA part string handed to PrepareIP / CreateStitchedIP by the tests below.
test_fpga_part = "xc7z020clg400-1"
# Second argument to PrepareIP / CreateStitchedIP; presumably the target clock
# period in nanoseconds (going by the name) — TODO confirm.
target_clk_ns = 5


def make_single_thresholding_onnx(T, idt, odt, wdt):
    """Build a one-node ONNX model containing a FINN ``MultiThreshold`` op.

    Args:
        T: threshold array; ``T.shape[0]`` is taken as the number of channels
            and the array itself becomes the ``thresh`` initializer.
        idt: FINN DataType annotated on the ``inp`` and ``thresh`` tensors.
        odt: FINN DataType annotated on ``outp``; ``odt.min()`` is used as the
            node's ``out_bias`` and ``odt.name`` as its ``out_dtype``.
        wdt: currently unused; kept so the signature stays compatible.

    Returns:
        A ModelWrapper (after ``InferShapes``) whose ``inp``/``outp`` tensors are
        declared as ``[1, NumChannels, 128, 1]`` with NCHW layout on ``inp``.
    """
    NumChannels = T.shape[0]
    out_bias = float(odt.min())

    inp = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, [1, NumChannels, 128, 1]
    )
    outp = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, [1, NumChannels, 128, 1]
    )

    mt_node = helper.make_node(
        "MultiThreshold",
        ["inp", "thresh"],
        ["outp"],
        domain="finn.custom_op.general",
        out_bias=out_bias,
        # derive the node attribute from odt so it always agrees with the
        # datatype annotated on "outp" below (was hard-coded to "INT8")
        out_dtype=odt.name,
        data_layout="NCHW",
    )
    graph = helper.make_graph(
        nodes=[mt_node], name="mt_graph", inputs=[inp], outputs=[outp]
    )

    model = helper.make_model(graph, producer_name="mtnode-model")
    model = ModelWrapper(model)

    model.set_tensor_datatype("inp", idt)
    model.set_tensor_datatype("outp", odt)
    # the threshold tensor is named "thresh"; the previous "thres" spelling
    # annotated a tensor that does not exist in the graph
    model.set_tensor_datatype("thresh", idt)

    model.set_initializer("thresh", T)
    model.set_tensor_shape("thresh", T.shape)
    model.set_tensor_layout("inp", DataLayout.NCHW)

    model = model.transform(InferShapes())

    return model

def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode):
    """Wrap a single ``Thresholding_Batch`` fpgadataflow node in a ModelWrapper.

    Args:
        T: threshold array; ``T.shape[0]`` gives ``NumChannels`` and
            ``T.shape[1]`` gives ``numSteps``. Stored as the ``thresh`` initializer.
        pe: value for the node's ``PE`` attribute.
        idt: FINN DataType for ``inp`` and ``thresh`` (its name is also used for
            the ``inputDataType`` and ``weightDataType`` attributes).
        odt: FINN DataType for ``outp`` (name used for ``outputDataType``).
        actval: value for the node's ``ActVal`` attribute.
        mem_mode: value for the node's ``mem_mode`` attribute.

    Returns:
        ModelWrapper whose ``inp``/``outp`` are declared as
        ``[1, 128, 1, NumChannels]``.
    """
    n_channels = T.shape[0]
    n_steps = T.shape[1]

    # input and output share the same declared shape
    io_shape = [1, 128, 1, n_channels]
    in_info = helper.make_tensor_value_info("inp", TensorProto.FLOAT, io_shape)
    out_info = helper.make_tensor_value_info("outp", TensorProto.FLOAT, io_shape)

    node_attrs = dict(
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        NumChannels=n_channels,
        PE=pe,
        numSteps=n_steps,
        inputDataType=idt.name,
        # original note: will be set by MinimizeAccumulatorWidth
        weightDataType=idt.name,
        outputDataType=odt.name,
        ActVal=actval,
        mem_mode=mem_mode,
        numInputVectors=(1, 128, 1),
    )
    thresholding_node = helper.make_node(
        "Thresholding_Batch", ["inp", "thresh"], ["outp"], **node_attrs
    )

    graph = helper.make_graph(
        nodes=[thresholding_node],
        name="thresholding_graph",
        inputs=[in_info],
        outputs=[out_info],
    )
    wrapped = ModelWrapper(
        helper.make_model(graph, producer_name="thresholding-model")
    )

    # annotate FINN datatypes, then attach the thresholds as an initializer
    for tensor_name, dtype in (("inp", idt), ("outp", odt), ("thresh", idt)):
        wrapped.set_tensor_datatype(tensor_name, dtype)
    wrapped.set_initializer("thresh", T)

    return wrapped


def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
    """Run a single Thresholding_Batch layer and compare against ``multithreshold``.

    Args:
        idt: FINN DataType of the input (also used to draw random thresholds).
        act: FINN DataType of the output; determines the number of thresholds.
        nf: folding factor; ``-1`` is replaced by ``ich``. ``PE = ich // nf``.
        ich: number of input channels.
        exec_mode: ``"cppsim"`` or ``"rtlsim"``.
        mem_mode: forwarded to the node's ``mem_mode`` attribute.

    Raises:
        AssertionError: on invalid folding, on output mismatch, or (rtlsim only)
            when resource estimates / cycle counts do not match expectations.
        Exception: if ``exec_mode`` is neither ``"cppsim"`` nor ``"rtlsim"``.

    Side effects:
        Saves the model to ``/tmp/fpgadataflow_threshold.onnx`` before and to
        ``/tmp/threshold_model_compiled.onnx`` after the preparation transforms.
    """
    if nf == -1:
        nf = ich
    pe = ich // nf
    # nf > ich would give pe == 0 and an opaque ZeroDivisionError below
    assert pe > 0, "nf must not exceed ich"
    assert ich % pe == 0

    # generate input data
    x = gen_finn_dt_tensor(idt, (1, 128, 1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
    # threshold of first channel is zero, while using BIPOLAR output)
    if act == DataType.BIPOLAR:
        T[0][0] = 0
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType.BIPOLAR:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
    model.save("/tmp/fpgadataflow_threshold.onnx")

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    model.save("/tmp/threshold_model_compiled.onnx")

    # package input data as dictionary
    input_dict = {"inp": x}

    # ONNX assumes NCHW format
    # HLS-lib assumes NHWC format
    x_onnx = np.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
    y = multithreshold(x_onnx, T)
    if act == DataType.BIPOLAR:
        # binary to bipolar
        y = 2 * y - 1
    else:
        # signed offset
        y += act.min()
    y = np.transpose(y, (0, 2, 3, 1))  # NCHW -> NHWC

    oshape = model.get_tensor_shape("outp")
    y_expected = y.reshape(oshape)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]

    y_produced = y_produced.reshape(y_expected.shape)

    # report the mode that actually ran (the message used to say "cppsim"
    # even for rtlsim runs)
    assert (y_produced == y_expected).all(), "%s failed" % exec_mode

    if exec_mode == "rtlsim":
        hls_synt_res_est = model.analysis(hls_synth_res_estimation)
        assert "Thresholding_Batch_0" in hls_synt_res_est

        # measured rtlsim cycles must be within 10 of the analytical estimate
        node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0


def test_runtime_thresholds_single_layer():
    """Check runtime-writeable thresholds of one Thresholding_Batch layer in rtlsim.

    Phase 1 reads the threshold memory back over AXI-lite and checks that it
    equals the weight stream generated from the original thresholds, and that
    the layer output matches ``multithreshold``. Phase 2 writes a freshly drawn
    set of thresholds over AXI-lite and checks the output against them.

    Raises:
        AssertionError: if the read-back stream or either output mismatches.

    Side effects:
        Temporarily creates ``old_weights.dat`` / ``new_weights.dat`` in the
        current working directory (both are removed after being read).
    """
    mem_mode = "decoupled"
    act = DataType.INT4
    idt = DataType.INT16
    nf = 8
    ich = 16
    pe = ich // nf
    assert ich % pe == 0

    # generate input data
    # NOTE(review): make_single_thresholding_modelwrapper declares "inp" as
    # [1, 128, 1, NumChannels] with numInputVectors=(1, 128, 1), while this
    # tensor is (1, ich) — confirm the shapes are compatible for rtlsim.
    in_tensor = gen_finn_dt_tensor(idt, (1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType.BIPOLAR:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
    op_inst = getCustomOp(model.graph.node[0])
    op_inst.set_nodeattr("runtime_writeable_weights", 1)

    def _weight_stream(weights, fname):
        # Dump weights in "decoupled_runtime" format, parse the hex lines into
        # ints, and delete the temporary file.
        op_inst.make_weight_file(weights, "decoupled_runtime", fname)
        with open(fname, "r") as f:
            hex_lines = f.read().strip().split("\n")
        os.remove(fname)
        return [int(line, 16) for line in hex_lines]

    old_weight_stream = _weight_stream(T, "old_weights.dat")

    # need to create stitched IP for runtime weight testing
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model = model.transform(PrepareRTLSim())
    model.set_metadata_prop("exec_mode", "rtlsim")
    # add two copies of the input tensor as the first one is just used to
    # "flush out" the pipeline (as the layer already starts receiving old weights
    # while we read/write new ones and reads seem to cause a disturbance too)
    in_tensor = np.tile(in_tensor, (2, 1))
    exec_ctx = {"inp": in_tensor}
    extracted_weight_stream = []

    def _expected_output(thresholds):
        # Reference result for the second batch element under `thresholds`.
        expected = multithreshold(in_tensor, thresholds)[1]
        if act == DataType.BIPOLAR:
            # binary to bipolar
            expected = 2 * expected - 1
        else:
            # signed offset
            expected += act.min()
        return expected

    def read_weights(sim):
        # one 32-bit AXI-lite word per weight-stream entry
        addr = 0
        for _ in range(len(old_weight_stream)):
            extracted_weight_stream.append(
                axilite_read(sim, addr, basename="s_axilite_0_")
            )
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    y = exec_ctx["outp"][1]
    assert (y == _expected_output(T)).all()

    new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(
        np.float32
    )
    # provide non-decreasing thresholds; sort the NEW thresholds (sorting T here
    # would silently re-test the old weights)
    new_weights = np.sort(new_weights, axis=1)
    new_weight_stream = _weight_stream(new_weights, "new_weights.dat")

    def write_weights(sim):
        addr = 0
        for nw in new_weight_stream:
            axilite_write(sim, addr, nw, basename="s_axilite_0_")
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["outp"][1]
    assert (y == _expected_output(new_weights)).all()

In [24]:
# Parameters for a single run of test_fpgadataflow_thresholding.
act = DataType.INT8    # output (activation) datatype
idt = DataType.INT16   # input datatype
ich = 64               # number of input features (channels)
nf = 64                # folding factor; -1 selects the maximum (nf = ich)
exec_mode = "cppsim"   # execution mode: "cppsim" or "rtlsim"
mem_mode = "const"     # memory mode of the thresholding layer

test_fpgadataflow_thresholding(
    idt=idt,
    act=act,
    nf=nf,
    ich=ich,
    exec_mode=exec_mode,
    mem_mode=mem_mode,
)




In [22]:
# Open the model that test_fpgadataflow_thresholding saves before any
# cppsim/rtlsim preparation transforms are applied.
from finn.util.visualization import showInNetron
showInNetron("/tmp/fpgadataflow_threshold.onnx")

Stopping http://0.0.0.0:8081
Serving '/tmp/fpgadataflow_threshold.onnx' at http://0.0.0.0:8081


In [11]:
# Debugging aid: list /tmp (the captured output shows code_gen_cppsim_* entries).
!ls /tmp

code_gen_cppsim__0803bwbs
code_gen_cppsim__0adxbp2s
code_gen_cppsim__0gpvwazg
code_gen_cppsim__0j37lujb
code_gen_cppsim__0kii_5_g
code_gen_cppsim__0li_bkch
code_gen_cppsim__0qhngowl
code_gen_cppsim__0u8nyiu0
code_gen_cppsim__0un191yw
code_gen_cppsim__0vkn17z5
code_gen_cppsim__0z_0sy6b
code_gen_cppsim__10fhabui
code_gen_cppsim__135lr9vh
code_gen_cppsim__1_j3g77f
code_gen_cppsim__1jppdo4q
code_gen_cppsim__1mjyzkcg
code_gen_cppsim__1ofzcpl1
code_gen_cppsim__1oyrqrfp
code_gen_cppsim__1qm_jopy
code_gen_cppsim__216jy8lp
code_gen_cppsim__23o44vf8
code_gen_cppsim__276z_o1y
code_gen_cppsim__2i0qzfy3
code_gen_cppsim__2kqw03fs
code_gen_cppsim__2rzk90td
code_gen_cppsim__324w97ad
code_gen_cppsim__33tcosio
code_gen_cppsim__35el0yrc
code_gen_cppsim__3hxpyi1p
code_gen_cppsim__3q36wf5c
code_gen_cppsim__3u1dmw3x
code_gen_cppsim__3u75vs2w
code_gen_cppsim__3zcch8ca
code_gen_cppsim__3zer_hxw
code_gen_cppsim__44be2xy1
code_gen_cppsim__4_rt0276
code_gen_cppsim__4k4pse8a
c

In [15]:
# NOTE(review): "/tmp/threshold_model.onnx" is not written by any code visible
# in this notebook; test_fpgadataflow_thresholding saves
# "/tmp/fpgadataflow_threshold.onnx" and "/tmp/threshold_model_compiled.onnx".
# Confirm the intended path — this cell may depend on a leftover file.
from finn.util.visualization import showInNetron

showInNetron("/tmp/threshold_model.onnx")

Stopping http://0.0.0.0:8081
Serving '/tmp/threshold_model.onnx' at http://0.0.0.0:8081
