In [1]:
import onnx
from finn.core.modelwrapper import ModelWrapper
from finn.util.visualization import showInNetron

from finn.custom_op.registry import getCustomOp
from finn.transformation.general import GiveUniqueNodeNames

## Loading the Brevitas network

In [2]:
from finn.util.basic import make_build_dir
from finn.util.visualization import showInNetron

import torch
import torch.nn as nn
import brevitas.nn as qnn
from brevitas.core.quant import QuantType
from brevitas.core.scaling import ScalingImplType

**(!)** Before proceeding, copy the `siam_track/XOH` folder into the `/workspace/finn/notebooks/` directory.

In [3]:
# Working directory that holds the trained checkpoint and every intermediate ONNX
# file written by the cells below. Later paths are built by plain string
# concatenation (build_dir + network_identifier + suffix), so keep the trailing "/".
build_dir = "/workspace/finn/notebooks/siam_track/XOH/"
# File-name stem shared by the checkpoint ("siamfc.pth") and the generated models.
network_identifier = "siamfc"

In [4]:
class _BatchNorm2d(nn.BatchNorm2d):

    def __init__(self, num_features, *args, **kwargs):
        super(_BatchNorm2d, self).__init__(
            num_features, *args, eps=1e-6, momentum=0.05, **kwargs)


class _AlexNet(nn.Module):
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        return x
    
# DPR: AlexNet-derived backbone using only 3x3 filters (closer to VGG in spirit),
# squeezed to fit FINN & ZCU104, and quantized.
class QAlexNetV5(_AlexNet):
    """Quantized five-stage backbone assembled from Brevitas layers.

    The first and the last convolution always use 8-bit weights. All other
    convolutions use ``weights_bitwidth`` and every ``QuantReLU`` uses
    ``activation_bitwidth``. Layer positions inside each ``nn.Sequential`` are
    fixed, so state-dict keys such as ``conv1.0.weight`` stay stable.
    """

    output_stride = 8

    def __init__(self, weights_bitwidth=4, activation_bitwidth=4):
        super().__init__()
        print("Initializing modified and squeezed QUANTIZED AlexNet for SiamTrack backbone.")

        def bn_act(channels):
            # Batch norm followed by a quantized ReLU (return_quant_tensor=True).
            return [
                _BatchNorm2d(channels),
                qnn.QuantReLU(bit_width=activation_bitwidth, return_quant_tensor=True),
            ]

        def inner_stage(in_ch, out_ch, pooled):
            # 3x3 stride-1 convolution + BN + ReLU, optionally followed by 2x2 pooling.
            layers = [qnn.QuantConv2d(in_ch, out_ch, 3, 1, weight_bit_width=weights_bitwidth)]
            layers += bn_act(out_ch)
            if pooled:
                layers.append(nn.MaxPool2d(2, 2))
            return nn.Sequential(*layers)

        # Stage 1: two conv blocks, each followed by 2x2 max pooling.
        stem = [qnn.QuantConv2d(3, 64, 3, weight_bit_width=8)]
        stem += bn_act(64)
        stem.append(nn.MaxPool2d(2, 2))
        stem.append(qnn.QuantConv2d(64, 64, 3, weight_bit_width=weights_bitwidth))
        stem += bn_act(64)
        stem.append(nn.MaxPool2d(2, 2))
        self.conv1 = nn.Sequential(*stem)

        self.conv2 = inner_stage(64, 128, pooled=True)
        self.conv3 = inner_stage(128, 128, pooled=False)
        self.conv4 = inner_stage(128, 128, pooled=False)
        # Final stage: a bare convolution without normalisation or activation.
        self.conv5 = nn.Sequential(
            qnn.QuantConv2d(128, 128, 3, 1, weight_bit_width=8))

In [5]:
# Build the backbone with 4-bit weights and 4-bit activations, then load the trained
# checkpoint "<build_dir><network_identifier>.pth" with all tensors mapped to the CPU.
net = QAlexNetV5(weights_bitwidth=4, activation_bitwidth=4)
net.load_state_dict(torch.load(build_dir + network_identifier + ".pth",  map_location=torch.device('cpu')))

Initializing modified and squeezed QUANTIZED AlexNet for SiamTrack backbone.


<All keys matched successfully>

In [6]:
import brevitas.onnx as bo

# Input shape handed to the ONNX exporter in the next cell.
input_shape = (1, 3, 238, 238)
# eval() puts the network into inference mode and returns the same module object.
cnv = net.eval()

# NOTE: this cell used to end with a dangling "test_in = " assignment. It had no
# right-hand side (a SyntaxError that stopped the whole cell from running) and
# nothing later in the notebook reads test_in, so it has been dropped.

In [7]:
# Export the network to ONNX via Brevitas' FINN exporter. The resulting
# "-Brevitas-Raw.onnx" file is the starting point of the next section.
bo.export_finn_onnx(cnv, input_shape, build_dir + network_identifier + "-Brevitas-Raw.onnx")

ir_version: 6
producer_name: "pytorch"
producer_version: "1.7"
graph {
  node {
    input: "inp.1"
    input: "43"
    output: "44"
    name: "Conv_1"
    op_type: "Conv"
    attribute {
      name: "dilations"
      ints: 1
      ints: 1
      type: INTS
    }
    attribute {
      name: "group"
      i: 1
      type: INT
    }
    attribute {
      name: "kernel_shape"
      ints: 3
      ints: 3
      type: INTS
    }
    attribute {
      name: "pads"
      ints: 0
      ints: 0
      ints: 0
      ints: 0
      type: INTS
    }
    attribute {
      name: "strides"
      ints: 1
      ints: 1
      type: INTS
    }
    domain: ""
  }
  node {
    input: "44"
    input: "45"
    output: "46"
    name: "Mul_3"
    op_type: "Mul"
  }
  node {
    input: "46"
    input: "47"
    output: "48"
    name: "Add_5"
    op_type: "Add"
    domain: ""
  }
  node {
    input: "48"
    input: "conv1.1.weight"
    input: "conv1.1.bias"
    input: "conv1.1.running_mean"
    input: "conv1.1.running

## Generate streamlined model

In [8]:
import onnx
from finn.util.test import get_test_model_trained
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.infer_shapes import InferShapes
from finn.transformation.fold_constants import FoldConstants
from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs

In [9]:
# Load the raw export and run the basic tidy-up passes in the order given below,
# then checkpoint the result as "-tidy.onnx".
model = ModelWrapper(build_dir + network_identifier + "-Brevitas-Raw.onnx")
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(RemoveStaticGraphInputs())
model.save(build_dir + network_identifier + "-tidy.onnx")

In [10]:
# Inspect the tidied graph in Netron.
showInNetron(build_dir + network_identifier + "-tidy.onnx")

Serving '/workspace/finn/notebooks/siam_track/XOH/siamfc-tidy.onnx' at http://0.0.0.0:8081


In [11]:
from finn.util.pytorch import ToTensor
from finn.transformation.merge_onnx_models import MergeONNXModels
from finn.core.datatype import DataType

In [12]:
# Reload the tidied model and annotate its first graph input as UINT8.
# The ToTensor pre-processing merge below is deliberately left disabled
# (commented out), so the network is fed without the divide-by-255 step.
model = ModelWrapper(build_dir + network_identifier + "-tidy.onnx")
global_inp_name = model.graph.input[0].name
# ishape = model.get_tensor_shape(global_inp_name)
# # preprocessing: torchvision's ToTensor divides uint8 inputs by 255
# totensor_pyt = ToTensor()
# chkpt_preproc_name = build_dir + network_identifier + "-preproc.onnx"
# bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)

# # join preprocessing and core model
# pre_model = ModelWrapper(chkpt_preproc_name)
# model = model.transform(MergeONNXModels(pre_model))
# # add input quantization annotation: UINT8 for all BNN-PYNQ models
# global_inp_name = model.graph.input[0].name
model.set_tensor_datatype(global_inp_name, DataType["UINT8"])

In [13]:
from finn.transformation.infer_datatypes import InferDataTypes

In [14]:
# Checkpoint written at the end of this cell.
chkpt_name = build_dir + network_identifier + "-pre_post.onnx"
# tidy-up again, this time also inferring tensor datatypes from the UINT8 input annotation
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model = model.transform(RemoveStaticGraphInputs())
model.save(chkpt_name)

In [15]:
# Inspect the datatype-annotated graph in Netron.
showInNetron(build_dir + network_identifier + "-pre_post.onnx")

Stopping http://0.0.0.0:8081
Serving '/workspace/finn/notebooks/siam_track/XOH/siamfc-pre_post.onnx' at http://0.0.0.0:8081


In [16]:
from finn.transformation.streamline import Streamline
from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from finn.transformation.infer_data_layouts import InferDataLayouts
from finn.transformation.general import RemoveUnusedTensors

In [17]:
# Streamline the network and lower convolutions to matrix multiplications.
# The passes are applied strictly in the order written; the result is saved as
# "-streamlined.onnx".
model = ModelWrapper(build_dir + network_identifier + "-pre_post.onnx")
model = model.transform(MoveScalarLinearPastInvariants())
model = model.transform(Streamline())
model = model.transform(LowerConvsToMatMul())
model = model.transform(MakeMaxPoolNHWC())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
model = model.transform(ConvertBipolarMatMulToXnorPopcount())
model = model.transform(Streamline())
# absorb final add-mul nodes into TopK
# NOTE(review): no TopK node is added anywhere in this notebook, so this pass
# presumably leaves the graph unchanged here -- confirm.
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())
model.save(build_dir + network_identifier + "-streamlined.onnx")



In [18]:
# Inspect the streamlined graph in Netron.
showInNetron(build_dir + network_identifier + "-streamlined.onnx")

Stopping http://0.0.0.0:8081
Serving '/workspace/finn/notebooks/siam_track/XOH/siamfc-streamlined.onnx' at http://0.0.0.0:8081


In [19]:
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from finn.custom_op.registry import getCustomOp
from finn.transformation.infer_data_layouts import InferDataLayouts

In [20]:
# Convert the streamlined graph to HLS layers and split off the dataflow partition.
# Writes "-dataflow_parent.onnx" (the parent graph) and "-dataflow_model.onnx"
# (a copy of the partition referenced by the StreamingDataflowPartition node).

# choose the memory mode for the MVTU units, decoupled or const
mem_mode = "decoupled"

model = ModelWrapper(build_dir + network_identifier + "-streamlined.onnx")
model = model.transform(to_hls.InferBinaryStreamingFCLayer(mem_mode))
model = model.transform(to_hls.InferQuantizedStreamingFCLayer(mem_mode))
# TopK to LabelSelect
model = model.transform(to_hls.InferLabelSelectLayer())
# input quantization (if any) to standalone thresholding
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferConvInpGen())
model = model.transform(to_hls.InferStreamingMaxPool())
# get rid of Reshape(-1, 1) operation between hlslib nodes
model = model.transform(RemoveCNVtoFCFlatten())
# get rid of Transpose -> Transpose identity sequences
model = model.transform(absorb.AbsorbConsecutiveTransposes())
# infer tensor data layouts
model = model.transform(InferDataLayouts())
parent_model = model.transform(CreateDataflowPartition())
parent_model.save(build_dir + network_identifier + "-dataflow_parent.onnx")
# only the first StreamingDataflowPartition node is used
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
# the node's "model" attribute holds the file name of the partition's ONNX model
dataflow_model_filename = sdp_node.get_nodeattr("model")
# save the dataflow partition with a different name for easier access
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save(build_dir + network_identifier + "-dataflow_model.onnx")

In [30]:
# Inspect the parent graph that contains the StreamingDataflowPartition node.
showInNetron(build_dir + network_identifier + "-dataflow_parent.onnx")

Stopping http://0.0.0.0:8081
Serving '/workspace/finn/notebooks/siam_track/XOH/siamfc-dataflow_parent.onnx' at http://0.0.0.0:8081


In [22]:
# Inspect the dataflow partition itself.
showInNetron(build_dir + network_identifier + "-dataflow_model.onnx")

Stopping http://0.0.0.0:8081
Serving '/workspace/finn/notebooks/siam_track/XOH/siamfc-dataflow_model.onnx' at http://0.0.0.0:8081


## Folding (based on FINN auto-generated onnx model)

In [23]:
# model = ModelWrapper(estimates_output_dir + "/intermediate_models/step_target_fps_parallelization.onnx")
# Load the dataflow partition and collect its StreamingFCLayer_Batch nodes;
# the printed count tells how many folding entries the next cell has to provide.
print(build_dir + network_identifier + "-dataflow_model.onnx")
model = ModelWrapper(build_dir + network_identifier + "-dataflow_model.onnx")
fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
print(len(fc_layers))

/workspace/finn/notebooks/siam_track/XOH/siamfc-dataflow_model.onnx
6


In [24]:
# Each tuple is (PE, SIMD, in_fifo_depth) for one StreamingFCLayer_Batch node,
# listed in the same order as fc_layers.
folding = [
    (32, 3, 128),
    (32, 16, 128),
    (16, 16, 128),
    (8, 16, 128),
    (8, 16, 128),
    (16, 8, 128),
]  # V5 VERSION (same values as FOLDING V2 below)

# Other (PE, SIMD) settings that were tried, one column per layer; all of them
# used in_fifo_depth = 128. Copy a row into `folding` above to re-use it.
#   ORIGINAL : (32, 3) (32, 16) (16, 16) ( 8, 16) ( 8, 16) ( 8,  8)
#   FOLDING V2: (32, 3) (32, 16) (16, 16) ( 8, 16) ( 8, 16) (16,  8)
#   FOLDING V3: (32, 3) (32, 16) (16, 16) (16, 16) (16, 16) (16,  8)
#   FOLDING V4: (32, 3) (32, 16) (16, 16) (16, 16) (16, 16) (16, 16)
#   FOLDING V5: (32, 3) (32, 16) (32, 16) (32, 16) (32, 16) (32, 16)
#   FOLDING V6: (32, 3) (32, 16) (32, 16) (32, 32) (32, 32) (32, 32)

# zip() stops at the shorter sequence, which would silently leave the remaining
# layers untouched, so insist on exactly one folding entry per layer.
if len(folding) != len(fc_layers):
    raise ValueError(
        "folding has %d entries but the model has %d StreamingFCLayer_Batch nodes"
        % (len(folding), len(fc_layers))
    )

for fcl, (pe, simd, ififodepth) in zip(fc_layers, folding):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("inFIFODepth", ififodepth)

# use same SIMD values for the sliding window operators
swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator")
if len(swg_layers) > len(folding):
    raise ValueError(
        "folding has %d entries but the model has %d ConvolutionInputGenerator nodes"
        % (len(folding), len(swg_layers))
    )
# the i-th sliding window generator takes the SIMD value of the i-th folding entry
for swg_layer, (_, simd, _) in zip(swg_layers, folding):
    getCustomOp(swg_layer).set_nodeattr("SIMD", simd)

model = model.transform(GiveUniqueNodeNames())
model.save(build_dir + network_identifier + "-folded.onnx")

Below we open the folded model in Netron to inspect the `StreamingDataWidthConverter` and `StreamingFIFO` nodes that have been inserted into the graph, as well as the folding factors stored in the `PE` and `SIMD` attributes of each `StreamingFCLayer_Batch` node.

In [25]:
# Inspect the folded graph in Netron.
showInNetron(build_dir + network_identifier + "-folded.onnx")

Stopping http://0.0.0.0:8081
Serving '/workspace/finn/notebooks/siam_track/XOH/siamfc-folded.onnx' at http://0.0.0.0:8081


Our network is now ready and we can start with the hardware generation.

## Hardware Generation

In [26]:
# Target board and requested clock period (in ns); both are passed to ZynqBuild
# in the next cell.
test_pynq_board = "ZCU104"
target_clk_ns = 3

from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
# Start the hardware build from the folded model saved above.
model = ModelWrapper(build_dir + network_identifier + "-folded.onnx")

In [27]:
%%time
# Run the Zynq build and checkpoint the result as "-synth.onnx".
# This is the slow step: the recorded run below took about 1 h 44 min of wall time.
model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))
model.save(build_dir + network_identifier + "-synth.onnx")



CPU times: user 16.3 s, sys: 662 ms, total: 16.9 s
Wall time: 1h 43min 47s


In [28]:
# Reload the synthesized model and display the metadata the build attached to it
# (bitfile, hardware handoff file, project directory, ...).
model = ModelWrapper(build_dir + network_identifier + "-synth.onnx")
model.model.metadata_props

[key: "floorplan_json"
value: "/tmp/finn_dev_vision/vitis_floorplan_f9odg4r1/floorplan.json"
, key: "vivado_pynq_proj"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i"
, key: "bitfile"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i/resizer.bit"
, key: "hw_handoff"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i/resizer.hwh"
, key: "vivado_synth_rpt"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i/synth_report.xml"
, key: "platform"
value: "zynq-iodma"
]

In [29]:
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver

# Generate the PYNQ driver for the synthesized design and save the model as
# "-driver.onnx". The driver location is recorded under the "pynq_driver_dir"
# metadata key, visible in the output below.
model = ModelWrapper(build_dir + network_identifier + "-synth.onnx")
model = model.transform(MakePYNQDriver(platform="zynq-iodma"))
model.save(build_dir + network_identifier + "-driver.onnx")
model.model.metadata_props

[key: "floorplan_json"
value: "/tmp/finn_dev_vision/vitis_floorplan_f9odg4r1/floorplan.json"
, key: "vivado_pynq_proj"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i"
, key: "bitfile"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i/resizer.bit"
, key: "hw_handoff"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i/resizer.hwh"
, key: "vivado_synth_rpt"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_xejc8f6i/synth_report.xml"
, key: "platform"
value: "zynq-iodma"
, key: "pynq_driver_dir"
value: "/tmp/finn_dev_vision/pynq_driver_ptftyap5"
]

**(!)** The `pynq_driver_dir` key value will be needed later, to find the proper directory on board.

## Deployment and Remote Execution
Change the ip, username, password, port value if needed.

In [5]:
import os
# Set the board address for this kernel session; the next cell reads it back
# through os.getenv("PYNQ_IP", ...).
os.environ["PYNQ_IP"] = "192.168.0.99"
print(os.environ["PYNQ_IP"])

192.168.0.99


In [6]:
import os

# set up the following values according to your own environment
# FINN will use ssh to deploy and run the generated accelerator
# Each value is taken from the named environment variable when it is set and
# falls back to the literal default otherwise.
ip = os.getenv("PYNQ_IP", "192.168.2.99")
username = os.getenv("PYNQ_USERNAME", "xilinx")
password = os.getenv("PYNQ_PASSWORD", "xilinx")
# NOTE: os.getenv returns a str when PYNQ_PORT is set, while the fallback is the int 22.
port = os.getenv("PYNQ_PORT", 22)
target_dir = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn_cnv_end2end_example")
# set up ssh options to only allow publickey authentication
# NOTE(review): `options` is not used by the command below, which logs in with
# the password through sshpass -- confirm whether it is still needed.
options = "-o PreferredAuthentications=publickey -o PasswordAuthentication=no"

# test access to PYNQ board
! sshpass -p {password} ssh {username}@{ip} -p {port} cat /var/run/motd.dynamic

Welcome to PYNQ Linux, based on Ubuntu 18.04 (GNU/Linux 5.4.0-xilinx-v2020.1 aarch64)



In [7]:
# Show the directory on the board that the deployment files will be copied into.
print(target_dir)

/home/xilinx/finn_dev_vision


**(!)** You will need `target_dir` later to locate the directory containing the PYNQ driver on the board.

In [37]:
import os
import subprocess
from distutils.dir_util import copy_tree
from shutil import copy

import finn.transformation.fpgadataflow.templates as templates
from finn.transformation.base import Transformation
from finn.util.basic import make_build_dir


class DeployToPYNQ(Transformation):
    """Collects all necessary files for deployment and copies them to the PYNQ board.

    Expects information about the PYNQ board to make scp possible: IP address and
    ssh port of the board, username and password for the board, and the target
    directory where the files are stored on the board.

    Note: the connection details, including the password, are written to the
    model's metadata properties in clear text, so they end up in any ONNX file
    saved from the returned model.
    """

    def __init__(self, ip, port, username, password, target_dir):
        super().__init__()
        self.ip = ip
        self.port = port
        self.username = username
        self.password = password
        self.target_dir = target_dir

    def _run_on_board(self, description, argv):
        """Run an ssh/scp command given as an argument list; return its exit code.

        The command is started directly, without a local shell, so the password
        and local paths are never re-parsed by bash. When a password is set it is
        handed to sshpass through the SSHPASS environment variable (``sshpass -e``)
        instead of the command line. Failures are reported as warnings rather
        than being dropped silently; ``None`` is returned if the program could
        not be started at all.
        """
        import warnings

        env = None
        if self.password != "":
            argv = ["sshpass", "-e"] + argv
            env = dict(os.environ, SSHPASS=self.password)
        try:
            completed = subprocess.run(argv, stdout=subprocess.PIPE, env=env)
        except OSError as err:
            warnings.warn("%s: could not start %s (%s)" % (description, argv[0], err))
            return None
        if completed.returncode != 0:
            warnings.warn(
                "%s: %s exited with code %d; deployment may be incomplete"
                % (description, argv[0], completed.returncode)
            )
        return completed.returncode

    def apply(self, model):
        """Stage the deployment files locally and copy them to the board.

        Returns:
            ``(model, False)``: the model with the deployment metadata set; the
            ``False`` flag is the second element every call returns.
        """
        # set metadata properties accordingly to user input specifications
        model.set_metadata_prop("pynq_ip", self.ip)
        model.set_metadata_prop("pynq_port", str(self.port))
        model.set_metadata_prop("pynq_username", self.username)
        model.set_metadata_prop("pynq_password", self.password)
        model.set_metadata_prop("pynq_target_dir", self.target_dir)

        # create directory for deployment files
        deployment_dir = make_build_dir(prefix="pynq_deployment_")
        model.set_metadata_prop("pynq_deployment_dir", deployment_dir)

        # get and copy necessary files
        # .bit and .hwh file
        bitfile = model.get_metadata_prop("bitfile")
        hwh_file = model.get_metadata_prop("hw_handoff")
        for dfile in (bitfile, hwh_file):
            if dfile is not None:
                copy(dfile, deployment_dir)

        # helper script for Alveo
        platform = model.get_metadata_prop("platform")
        if platform == "alveo":
            alveo_run_sh = templates.alveo_run_sh_template
            fill_dict = {
                "$REMOTE_DEPLOY_DIR$": self.target_dir
                + "/"
                + os.path.basename(deployment_dir),
                "$CONDA_ENV_NAME$": "finn-pynq-alveo",
                "$REMOTE_XRT$": os.environ["XILINX_XRT"],
                "$REMOTE_PLATFORM_REPO_PATHS$": os.environ["PLATFORM_REPO_PATHS"],
                "$BITFILE$": os.path.basename(bitfile),
            }
            for key, value in fill_dict.items():
                alveo_run_sh = alveo_run_sh.replace(key, value)
            alveo_run_sh_path = deployment_dir + "/alveo_run.sh"
            with open(alveo_run_sh_path, "w") as f:
                f.write(alveo_run_sh)

        # driver.py and python libraries
        pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir")
        copy_tree(pynq_driver_dir, deployment_dir)
        # both keys are written and hold the same path
        model.set_metadata_prop("pynq_deploy_dir", deployment_dir)
        model.set_metadata_prop("exec_mode", "remote_pynq")

        remote_login = "{}@{}".format(self.username, self.ip)

        # create target directory on PYNQ board; the mkdir command line is still
        # interpreted by the shell on the board, exactly as before
        self._run_on_board(
            "creating target directory",
            ["ssh", remote_login, "-p", str(self.port),
             "mkdir -p {}".format(self.target_dir)],
        )
        # copy directory to PYNQ board using scp
        self._run_on_board(
            "copying deployment files",
            ["scp", "-P{}".format(self.port), "-r", deployment_dir,
             "{}:{}".format(remote_login, self.target_dir)],
        )

        return (model, False)

## Updating the PYNQ driver, part 1

Before proceeding, you need to update driver files with tracker code:

* replace `driver.py` and `driver_base.py` from `pynq_driver_dir` (in files generated by FINN on host) with files from `XOH/driver_backup` directory

We do this before transferring the files to the board.

In [38]:
# The stock FINN transformation is replaced by the DeployToPYNQ class defined above:
# from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ

model = ModelWrapper(build_dir + network_identifier + "-driver.onnx")
try:
    model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
except Exception as err:
    # Best-effort, as before: report the failure and carry on. The previous
    # handler called sys.exc_info() without importing sys, so it raised a
    # NameError itself; the exception object is used directly instead.
    print("Unexpected error:", type(err), err)

# NOTE: the model is saved even when deployment failed; in that case it lacks
# the pynq_* deployment metadata that later cells rely on.
model.save(build_dir + network_identifier + "-pynq_deploy.onnx")

In [39]:
# Display the metadata after deployment. Note that the output includes the board
# password ("pynq_password") in clear text.
model.model.metadata_props

[key: "floorplan_json"
value: "/tmp/finn_dev_vision/vitis_floorplan_79fn99h_/floorplan.json"
, key: "vivado_pynq_proj"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_daake8eb"
, key: "bitfile"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_daake8eb/resizer.bit"
, key: "hw_handoff"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_daake8eb/resizer.hwh"
, key: "vivado_synth_rpt"
value: "/tmp/finn_dev_vision/vivado_zynq_proj_daake8eb/synth_report.xml"
, key: "platform"
value: "zynq-iodma"
, key: "pynq_driver_dir"
value: "/tmp/finn_dev_vision/pynq_driver_icgts8ni"
, key: "pynq_ip"
value: "192.168.2.99"
, key: "pynq_port"
value: "22"
, key: "pynq_username"
value: "xilinx"
, key: "pynq_password"
value: "xilinx"
, key: "pynq_target_dir"
value: "/home/xilinx/finn_dev_vision"
, key: "pynq_deployment_dir"
value: "/tmp/finn_dev_vision/pynq_deployment_wqerse2x"
, key: "pynq_deploy_dir"
value: "/tmp/finn_dev_vision/pynq_deployment_wqerse2x"
, key: "exec_mode"
value: "remote_pynq"
]

In [40]:
# The deployment folder keeps its local name on the board, so the on-board path
# is target_dir plus the last component of the local deployment directory.
deployment_dir_name = model.get_metadata_prop("pynq_deployment_dir").rsplit("/", 1)[-1]
target_dir_pynq = target_dir + "/" + deployment_dir_name
target_dir_pynq

'/home/xilinx/finn_dev_vision/pynq_deployment_wqerse2x'

**(!)** This is the directory on the board from which you will later run the tracker.

In [41]:
# List the deployed files on the board to confirm that the copy succeeded.
! sshpass -p {password} ssh {username}@{ip} -p {port} 'ls -l {target_dir_pynq}'

total 19208
-rw-rw-r-- 1 xilinx xilinx    21144 Mar 24 14:07 driver_base.py
-rw-r--r-- 1 xilinx xilinx    15044 Mar 24 14:07 driver.py
drwxr-xr-x 4 xilinx xilinx     4096 Mar 24 14:07 finn
-rw-r--r-- 1 xilinx xilinx 19311208 Mar 24 14:07 resizer.bit
-rw-r--r-- 1 xilinx xilinx   295295 Mar 24 14:07 resizer.hwh
drwxr-xr-x 2 xilinx xilinx     4096 Mar 24 14:07 runtime_weights
-rw-rw-r-- 1 xilinx xilinx     4113 Mar 24 14:07 validate.py


## Updating the PYNQ driver, part 2

Now you need to update the code for fast input/output handling. The best way to do this is directly on the board: either send the file over ssh and replace the existing one, or connect to the board via UART and make the changes there.

* update the `target_dir_pynq/finn/util/data_packing.py` with `XOH/driver_backup/data_packing.py` code

# What next


## OPTION 1 (recommended for start): If you don't care to see time results now, you can easily check how it works running the cells within Tracking system tests section below. This will run tracker and save output in host XOH/output directory

## OPTION 2: If you wish to perform throughput tests of neural network, execute the code in Throughput Test on PYNQ Board section

## OPTION 3: Ditch the jupyter notebook and run tracker directly from board for best performance
* you should connect to ZCU104 board via UART (for example via `gtkterm`)
* you need to copy the `XOH/data` folder from host to `/home/xilinx/data/` on board
* find the proper directory with pynq driver (we printed out the path before)
* to run tracker: `python3 driver.py --exec_mode siamese_tracking` (you may need to use `sudo`)
* the tracker will load images from `/home/xilinx/data/Crossing/` and save the output frames with bounding box in `/home/xilinx/data/output/` (on board)
* if you want to see timing information for each step, uncomment the time measurements in `driver.py`; in any case, the script measures the total processing time of each frame and prints it out at the end.

The tracker uses some sequence-specific parameters for initialization. This demo covers the Crossing sequence from the OTB dataset. If you wish to use other sequences, you can generate the proper parameters using the software part of this demo (Python project - `demo.py` script). Remember to update the code (paths) and copy the new parameters to the board (for Crossing, the path is `data/Crossing/parameters`).

## Tracking system tests

In [46]:
# Install OpenCV for the tracker test below (the recorded run installed 4.6.0.66).
# NOTE(review): `!pip3` installs for whichever pip3 is first on PATH, which is not
# necessarily the kernel's interpreter, and the version is not pinned -- confirm.
!pip3 install opencv-python

Defaulting to user installation because normal site-packages is not writeable
Collecting opencv-python
  Downloading opencv_python-4.6.0.66-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (60.9 MB)
[K     |████████████████████████████████| 60.9 MB 220 kB/s eta 0:00:011
Installing collected packages: opencv-python
Successfully installed opencv-python-4.6.0.66


In [47]:
import cv2

In [123]:
# Tracker configuration. The tracker cells below read out_scale, instance_sz,
# scale_num, scale_penalty, window_influence, total_stride, response_up and
# scale_lr; the remaining entries are kept unchanged.
cfg = dict(
    # basic parameters
    out_scale=0.001,
    exemplar_sz=110,
    instance_sz=238,
    context=0.5,
    # inference parameters
    scale_num=1,
    scale_step=1.0375,
    scale_lr=0.59,
    scale_penalty=0.9745,
    window_influence=0.176,
    response_sz=17,
    response_up=16,
    total_stride=8,
    # train parameters
    epoch_num=50,
    batch_size=8,
    num_workers=32,
    initial_lr=1e-2,
    ultimate_lr=1e-5,
    weight_decay=5e-4,
    momentum=0.9,
    r_pos=16,
    r_neg=0,
)

In [124]:
def read_image(img_file, cvt_code=cv2.COLOR_BGR2RGB):
    """Load a colour image from disk.

    Args:
        img_file: path of the image file.
        cvt_code: OpenCV colour conversion code applied after loading, or None
            to return the image as loaded by OpenCV.

    Returns:
        The loaded (and optionally colour-converted) image array.

    Raises:
        OSError: if OpenCV cannot read or decode ``img_file``.
    """
    img = cv2.imread(img_file, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None; fail here with the file
        # name instead of with an obscure error inside cvtColor.
        raise OSError("Could not read image file: {!r}".format(img_file))
    if cvt_code is not None:
        img = cv2.cvtColor(img, cvt_code)
    return img

def save_image(img, boxes=None, box_fmt='ltwh', colors=None,
               thickness=3, fig_n=1, delay=1, visualize=True,
               cvt_code=cv2.COLOR_RGB2BGR):
    """Draw bounding boxes on an image and optionally write it to disk.

    Args:
        img: image array; converted with ``cvt_code`` first unless that is None.
        boxes: one box (4 values), a sequence of boxes, or None for no boxes.
        box_fmt: 'ltwh' (left, top, width, height) or 'ltrb' (left, top, right,
            bottom).
        colors: one colour or a list of colours, cycled over the boxes; a
            built-in palette is used when None.
        thickness: line thickness passed to ``cv2.rectangle``.
        fig_n: number used in the output file name ``window_<fig_n>.jpg``.
        delay: unused; kept so that existing calls keep working.
        visualize: when true, write the image into
            ``build_dir + '/output/Crossing/'`` (created if missing).
        cvt_code: OpenCV colour conversion code, or None to skip the conversion.

    Returns:
        The (possibly resized) image with the boxes drawn on it.
    """
    if cvt_code is not None:
        img = cv2.cvtColor(img, cvt_code)

    # resize img if necessary
    max_size = 960
    if max(img.shape[:2]) > max_size:
        scale = max_size / max(img.shape[:2])
        out_size = (
            int(img.shape[1] * scale),
            int(img.shape[0] * scale))
        img = cv2.resize(img, out_size)
        if boxes is not None:
            # keep the boxes aligned with the down-scaled image
            boxes = np.array(boxes, dtype=np.float32) * scale

    if boxes is not None:
        assert box_fmt in ['ltwh', 'ltrb']
        boxes = np.array(boxes, dtype=np.int32)
        if boxes.ndim == 1:
            boxes = np.expand_dims(boxes, axis=0)
        if box_fmt == 'ltrb':
            # convert right/bottom to width/height
            boxes[:, 2:] -= boxes[:, :2]

        # clip bounding boxes to the image; bound is (width, height)
        bound = np.array(img.shape[1::-1])[None, :]
        boxes[:, :2] = np.clip(boxes[:, :2], 0, bound)
        boxes[:, 2:] = np.clip(boxes[:, 2:], 0, bound - boxes[:, :2])

        if colors is None:
            colors = [
                (0, 0, 255),
                (0, 255, 0),
                (255, 0, 0),
                (0, 255, 255),
                (255, 0, 255),
                (255, 255, 0),
                (0, 0, 128),
                (0, 128, 0),
                (128, 0, 0),
                (0, 128, 128),
                (128, 0, 128),
                (128, 128, 0)]
        colors = np.array(colors, dtype=np.int32)
        if colors.ndim == 1:
            colors = np.expand_dims(colors, axis=0)

        for i, box in enumerate(boxes):
            color = colors[i % len(colors)]
            # plain Python ints for the corner points
            pt1 = (int(box[0]), int(box[1]))
            pt2 = (int(box[0] + box[2]), int(box[1] + box[3]))
            img = cv2.rectangle(img, pt1, pt2, color.tolist(), thickness)

    if visualize:
        import os
        import warnings

        winname = 'window_{}'.format(fig_n)
        out_dir = build_dir + '/output/Crossing/'
        # cv2.imwrite does not create missing directories, so make sure the
        # output folder exists before writing.
        os.makedirs(out_dir, exist_ok=True)
        out_path = out_dir + winname + '.jpg'
        if not cv2.imwrite(out_path, img):
            # imwrite returns False on failure instead of raising
            warnings.warn("Could not write image to {}".format(out_path))
        # On-screen display is disabled; to re-enable it use:
        #     cv2.imshow(winname, img); cv2.waitKey(delay)

    return img

def crop_and_resize(img, center, size, out_size,
                    border_type=cv2.BORDER_CONSTANT,
                    border_value=(0, 0, 0),
                    interp=cv2.INTER_LINEAR):
    """Cut a square patch of side ``size`` around ``center`` and resize it.

    The image is padded with ``border_value`` on all sides when the patch
    reaches beyond it. ``center`` is indexed in the same (row, column) order
    as the first two axes of ``img``. Returns an ``out_size`` x ``out_size`` patch.
    """
    size = round(size)

    # Box corners (0-indexed) as [start_0, start_1, stop_0, stop_1].
    start = np.round(center - (size - 1) / 2)
    corners = np.round(np.concatenate((start, start + size))).astype(int)

    # Largest distance by which the box sticks out of the image on any side.
    overshoot = np.concatenate((-corners[:2], corners[2:] - img.shape[:2]))
    npad = max(0, int(overshoot.max()))
    if npad > 0:
        img = cv2.copyMakeBorder(
            img, npad, npad, npad, npad,
            border_type, value=border_value)

    # Shift the corners into the padded image and crop.
    row0, col0, row1, col1 = (corners + npad).astype(int)
    patch = img[row0:row1, col0:col1]

    return cv2.resize(patch, (out_size, out_size), interpolation=interp)

In [125]:
# prepare for FINN
import numpy as np
from finn.core.onnx_exec import execute_onnx

# Load the deployed model and look up the names of its first graph input and
# output; SiamTracker.update uses `model`, `iname` and `oname` as globals when it
# runs the network through execute_onnx.
model = ModelWrapper(build_dir + network_identifier + "-pynq_deploy.onnx")
iname = model.graph.input[0].name
oname = model.graph.output[0].name
ishape = model.get_tensor_shape(iname)

# input_dict = {iname: x.astype(np.float32).reshape(ishape)}
# ret = execute_onnx(model, input_dict, True)

In [126]:
import time
import torch.nn.functional as F

class SiamFC(nn.Module):
    """Cross-correlation head: correlates exemplar features ``z`` over search features ``x``."""

    def __init__(self, out_scale=0.001):
        super().__init__()
        self.out_scale = out_scale

    def forward(self, z, x):
        """Return the cross-correlation response multiplied by ``out_scale``."""
        response = self._fast_xcorr(z, x)
        return response * self.out_scale

    def _fast_xcorr(self, z, x):
        """Correlate every kernel in ``z`` with ``x`` using one grouped convolution."""
        num_kernels = z.size(0)
        batch, channels, height, width = x.size()
        # Fold the batch into the channel axis so that each group sees one kernel.
        grouped = x.view(-1, num_kernels * channels, height, width)
        response = F.conv2d(grouped, z, groups=num_kernels)
        return response.view(batch, -1, response.size(-2), response.size(-1))

class SiamTracker():
    """SiamFC-style tracker initialised from parameters saved by the software part.

    The backbone is either the PyTorch network passed as ``net`` or, when ``net``
    is None, the FINN model held in the notebook globals ``model`` / ``iname`` /
    ``oname`` and run through ``execute_onnx``. The class also relies on the
    globals ``build_dir`` and ``cfg``.
    """

    def __init__(self, net=None):
        # These are values read from software part
        load_path = build_dir + '/data/Crossing/parameters/'
        self.center = np.load(load_path + 'center.npy')
        self.x_sz = np.load(load_path + 'x_sz.npy').item(0)
        self.avg_color = np.load(load_path + 'avg_color.npy')
        self.scale_factors = np.load(load_path + 'scale_factors.npy')
        self.device = 'cpu'
        self.head = SiamFC(cfg['out_scale'])
        self.kernel = torch.from_numpy(np.load(load_path + 'kernel.npy'))
        self.upscale_sz = np.load(load_path + 'upscale_sz.npy').item(0)
        self.hann_window = np.load(load_path + 'hann_window.npy')
        self.z_sz = np.load(load_path + 'z_sz.npy').item(0)
        self.target_sz = np.load(load_path + 'target_sz.npy')

        self.net = net

    def update(self, img):
        """Locate the target in ``img`` and update the tracker state.

        Returns:
            A 1-indexed box ``[left, top, width, height]`` as a NumPy array.
        """
        # search images: one crop per scale factor
        x = [crop_and_resize(
            img, self.center, self.x_sz * f,
            out_size=cfg['instance_sz'],
            border_value=self.avg_color) for f in self.scale_factors]
        x = np.stack(x, axis=0)
        x = torch.from_numpy(x).to(
            self.device).permute(0, 3, 1, 2).float()

        # responses
        if self.net is not None:
            x = self.net(x)
        else:
            # FINN path: only the first crop is used, converted from NCHW to NHWC
            # for the model and back to NCHW afterwards
            x = x.numpy()
            x = x[0, :, :, :][np.newaxis, :, :, :].transpose(0, 2, 3, 1)
            input_dict = {iname: x.astype(np.float32)}
            x = execute_onnx(model, input_dict, True)
            x = x[oname]
            x = x.transpose(0, 3, 1, 2)
            # disabled output rescaling:
            # x = x * 0.0002790141152217984 + np.load(build_dir + "/data/Add_0_param0.npy")
            x = torch.from_numpy(x)

        responses = self.head(self.kernel, x)
        responses = responses.squeeze(1).detach().numpy()

        # upsample responses and penalize scale changes
        responses = np.stack([cv2.resize(
            u, (self.upscale_sz, self.upscale_sz),
            interpolation=cv2.INTER_CUBIC)
            for u in responses])
        responses[:cfg['scale_num'] // 2] *= cfg['scale_penalty']
        responses[cfg['scale_num'] // 2 + 1:] *= cfg['scale_penalty']

        # peak scale
        scale_id = np.argmax(np.amax(responses, axis=(1, 2)))

        # peak location: normalise the response, then blend in the Hann window
        response = responses[scale_id]
        response -= response.min()
        response /= response.sum() + 1e-16
        response = (1 - cfg['window_influence']) * response + \
            cfg['window_influence'] * self.hann_window
        loc = np.unravel_index(response.argmax(), response.shape)

        # locate target center: response -> instance -> image coordinates
        disp_in_response = np.array(loc) - (self.upscale_sz - 1) / 2
        disp_in_instance = disp_in_response * \
            cfg['total_stride'] / cfg['response_up']
        disp_in_image = disp_in_instance * self.x_sz * \
            self.scale_factors[scale_id] / cfg['instance_sz']
        self.center += disp_in_image

        # update target size
        scale = (1 - cfg['scale_lr']) * 1.0 + \
            cfg['scale_lr'] * self.scale_factors[scale_id]
        self.target_sz *= scale
        self.z_sz *= scale
        self.x_sz *= scale

        # return 1-indexed and left-top based bounding box; center and target_sz
        # are stored as (row, column), hence the swapped indices
        box = np.array([
            self.center[1] + 1 - (self.target_sz[1] - 1) / 2,
            self.center[0] + 1 - (self.target_sz[0] - 1) / 2,
            self.target_sz[1], self.target_sz[0]])

        return box

    def track(self, img_files, box, save_output=False, max_frames=5):
        """Track the target through a sequence of image files.

        Args:
            img_files: paths of the frames, in order; the first frame only
                provides the initial ``box``.
            box: initial ``[left, top, width, height]`` box, stored for frame 0.
            save_output: when true, write every tracked frame with ``save_image``.
            max_frames: stop after this many frames (counting frame 0). The
                default of 5 keeps the previous quick-check behaviour; pass
                None to process the whole sequence.

        Returns:
            ``(boxes, times)``: per-frame boxes and per-frame update times in
            seconds. Rows of frames that were not processed stay zero.
        """
        frame_num = len(img_files)
        boxes = np.zeros((frame_num, 4))
        boxes[0] = box
        times = np.zeros(frame_num)

        for f, img_file in enumerate(img_files):
            img = read_image(img_file)

            if f == 0:
                continue

            begin = time.time()
            boxes[f, :] = self.update(img)
            times[f] = time.time() - begin

            # The box is [left, top, ...]: left is an x coordinate and must be
            # checked against the image width (shape[1]), top against the
            # height (shape[0]). The previous check had the two axes swapped.
            left, top = boxes[f, 0], boxes[f, 1]
            frame_h, frame_w = img.shape[:2]
            if left < 0 or top < 0 or left > frame_w or top > frame_h:
                print('Object lost. Aborting sequence tracking...')
                break

            if save_output:
                save_image(img, boxes[f, :], fig_n=f)

            if max_frames is not None and f + 1 >= max_frames:
                break

        return boxes, times

In [131]:
import os
import glob

# Software reference network: same architecture and checkpoint as the exported model.
net = QAlexNetV5()
net.load_state_dict(torch.load(build_dir + network_identifier + ".pth", map_location=torch.device('cpu')))
cnv = net.eval()

# Crossing sequence: frame images plus the ground-truth boxes; anno[0] is used as
# the initial box in the next cell.
seq_dir = os.path.expanduser(build_dir + '/data/Crossing/')
img_files = sorted(glob.glob(seq_dir + 'img/*.jpg'))
anno = np.loadtxt(seq_dir + 'groundtruth_rect.txt')

# call with net argument SiamTracker(net) for software output
# (without it, the tracker runs the deployed FINN model through execute_onnx)
tracker = SiamTracker()

Initializing modified and squeezed QUANTIZED AlexNet for SiamTrack backbone.


In [132]:
# Run the tracker on the sequence, starting from the first ground-truth box, and
# save the annotated frames via save_image.
boxes, times = tracker.track(img_files, anno[0], save_output=True)

## Throughput Test on PYNQ Board

Is it real-time?

In [45]:
from finn.core.throughput_test import throughput_test_remote

# Run the remote throughput test with batch size 1 and list every reported metric.
model = ModelWrapper(build_dir + network_identifier + "-pynq_deploy.onnx")
res = throughput_test_remote(model, 1)
print("Network metrics:")
for metric_name, metric_value in res.items():
    print(str(metric_name) + ": " + str(metric_value))

Network metrics:
runtime[ms]: 20.402908325195312
throughput[images/s]: 49.012620360848835
DRAM_in_bandwidth[Mb/s]: 8.328812603159765
DRAM_out_bandwidth[Mb/s]: 9.10928956978592
fclk[mhz]: 99.999
batch_size: 1
fold_input[ms]: 0.07390975952148438
pack_input[ms]: 0.055789947509765625
copy_input_data_to_device[ms]: 1.0845661163330078
copy_output_data_from_device[ms]: 0.3440380096435547
unpack_output[ms]: 7.530689239501953
unfold_output[ms]: 0.10848045349121094
