# Step-By-Step Lab1: FIR

Yuan-teng Chang, Catapult HLS AE yuan-teng.chang@siemens.com



#### **Objectives**

- Learn the Catapult HLS C++ flow
- Understand the loop unrolling and pipelining
- Understand the memory interface
- Learn how to design with multiple blocks



#### **Environment Variables**

To run Catapult HLS and QuestaSim, a few environment variables are required to be defined

- Define them in a shell script like setup-catapult.csh and source it
- Modify the path depending on the tool install path in your workstation

```csh
echo "Catapult setup"
setenv MGC_HOME /usr/local/bin/Siemens_EDA/Catapult_Synthesis_2022.2_1-1019737/Mgc_home

setenv CXX_HOME $MGC_HOME
setenv SYSTEMC_INCDIR $MGC_HOME/shared/include
setenv SYSTEMC_LIBDIR $MGC_HOME/shared/lib
set path = ($path ${MGC_HOME}/bin)

setenv LD_LIBRARY_PATH $SYSTEMC_LIBDIR
setenv LD_LIBRARY_PATH ${MGC_HOME}/lib:$LD_LIBRARY_PATH

echo "QuestaSim setup"
setenv MODEL_TECH /usr/local/bin/Siemens_EDA/Questa_Sim_2022_4/questasim/bin
setenv QUESTA_HOME /usr/local/bin/Siemens_EDA/Questa_Sim_2022_4/questasim
set path = ($path ${MODEL_TECH})
```





#### **Catapult Flow**





## Walkthrough of Catapult C++ Flow

An 8-tap FIR



#### fir\_ref.h (Reference model)

$$
y = \sum_{i=0}^{7} c_i \, r[i] = c_0\, r[0] + c_1\, r[1] + \dots + c_7\, r[7]
$$

```cpp
class fir_ref {
  private:
  int regs[8];
  public:
  fir_ref() {
    for (int i = 7; i >= 0; i--)
    { regs[i] = 0; }
  }

  void run(int input, int coeffs[8], int &output) {
    int temp = 0;
    for (int i = 7; i >= 0; i--) {
      if ( i == 0 ) {
        regs[i] = input;
      } else {
        regs[i] = regs[i-1];
      }
    }
    for (int i = 0; i < 8; i++)
      temp += regs[i] * coeffs[i];
    output = temp >> 11;
  }
};
```

Required

#### fir.h (HLS C)

```cpp
#include <ac_int.h>
#include <ac_channel.h>
#include <mc_scverify.h>

class fir {
  private:
  int8 regs[8];
  public:
  fir() {
    for (int i = 7; i>=0; i--)
      { regs[i] = 0; }
  }

  // Design block "fir": run() has an ac_channel interface on input and output
  // Required
  #pragma hls_design interface
  void CCS_BLOCK(run)(ac_channel<int8> &input,
                      int8 coeffs[8],
                      ac_channel<int8> &output) {
    int19 temp = 0;
    SHIFT: for (int i = 7; i>=0; i--) {   // "SHIFT" is a loop label
      if ( i == 0 ) {
        regs[i] = input.read();
      } else {
        regs[i] = regs[i-1];
      }
    }
    MAC: for (int i = 7; i >= 0; i--) {   // "MAC" is a loop label
      temp += coeffs[i]*regs[i];
    }
    output.write(temp>>11);
  }
};
```

## fir\_tb.cpp (TB)

```cpp
#include "fir_ref.h"
#include "fir.h"

CCS_MAIN(int argc, char *argv[]) {
  int8 coeffs[8];
  ac_channel<int8> in_chn;
  ac_channel<int8> out_chn;
  int8 din, dout;
  int coeffs_ref[8];
  int din_ref;
  int dout_ref;
  fir dut;
  fir_ref ref_model;
  int pass_cnt = 0;
  int fail_cnt = 0;

  // Test Impulse
  for (int i=0; i < 8; i++)
    coeffs_ref[i]=coeffs[i]=-128+rand()%256;

  for (int i = 0; i < 100; i++ ) {
    din_ref = din = -128 + rand()%256;
    in_chn.write(din);
    dut.run(in_chn, coeffs, out_chn);
    ref_model.run(din_ref, coeffs_ref, dout_ref);
    dout = out_chn.read();
    if (dout.to_int()!=dout_ref) {
      printf("fail @ %3d: %4d != %4d \n", i, dout.to_int(), dout_ref);
      fail_cnt ++;
    } else {
      printf("pass @ %3d: %4d == %4d \n", i, dout.to_int(), dout_ref);
      pass_cnt ++;
    }
  }
  printf("\n");
  printf("total pass count %d\n", pass_cnt);
  printf("total fail count %d\n", fail_cnt);
  CCS_RETURN(0);
}
```

## Pre-HLS Verification by gcc (C Sim)

#### Build with gcc

- `g++ -O3 -std=c++11 -I $MGC_HOME/shared/include fir_tb.cpp -o SCVerify_fir.exe`

```
[mentor@RHEL74 01_walkthrough_loops]$ make
/usr/local/bin/Siemens_EDA/Catapult_Synthesis_2023.2_1-1065141/Mgc_home/bin/g++ -g -o3 -std=c++11 -Wall -Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-label -I./ -I/usr/local/bin/Siemens_EDA/Catapult_Synt
hesis_2023.2_1-1065141/Mgc_home/shared/include ./fir_tb.cpp -o bin/SCVerify_fir.exe
```

#### Run the executable

./SCVerify\_fir.exe

```
pass @ 89: 3 == 3
pass @ 90: -18 == -18
pass @ 91: 21 == 21
pass @ 92: -12 == -12
pass @ 93: ...
pass @ 94: -6 == -6
pass @ 95: ...
pass @ 96: -21 == -21
pass @ 97: 18 == 18
pass @ 98: -5 == -5
pass @ 99: ...
```

```
total pass count 100
total fail count 0
```



## **Launch Catapult**

#### catapult &





#### Add input file



## **Enable SCVerify**





## Pre-HLS (C++) Verification by gcc



#### **Build VSCode Environment**





## **Build VSCode Environment (Cont.)**



#### **Build VSCode Environment (Cont.)**





Run CDesignChecker





## Run CDesignChecker (Cont.)





#### **HLS Synthesis – Set Top Design**





## **HLS Synthesis – Library Setup**





## **HLS Synthesis – Mapping**

#### Frequency





## **HLS Synthesis – Mapping**

Sync / async reset





#### **HLS Synthesis – Architecture**





## **HLS Synthesis – Scheduling**



## **HLS Synthesis – RTL**





## **HLS Synthesis – Report**



#### **HLS Synthesis – Report (Cont.)**





#### **Design Analyzer**

Analyze the design in various perspectives



#### **RTL Verification**

Automatically compare the outputs of HLS C and RTL



#### **RTL Verification**

#### Automatically compare the outputs of HLS C and RTL

```
Transcript
 captured 100 values of output
 Info: scverify_top/user_tb: Simulation completed
 Checking results
 'output'
    capture count        = 100
    comparison count     = 100
    ignore count         = 0
    error count          = 0
    stuck in dut fifo    = 0
    stuck in golden fifo = 0
 Info: scverify_top/user_tb: Simulation PASSED @ 10016 ns
 ** Note: (vsim-6574) SystemC simulation stopped by user.
VSIM 3>
```





# **Loop Unrolling and Pipelining**



#### **Branch a New Solution**





#### **HLS Synthesis – Architecture**



## **HLS Synthesis – Scheduling**





## **HLS Synthesis – RTL**



# **HLS Synthesis – Report**





# **Design Analyzer**

#### Analyze the design in various perspectives



#### **RTL Verification**

#### Automatically compare the outputs of HLS C and RTL

```
Project Files
  gcc 4.9.2
    Original Design + Testbench
  CDesignChecker
    Original Check Design
  QuestaSIM
    RTL VHDL output 'rtl.vhdl' vs Untimed C++
    Concat RTL VHDL output 'concat_sim_rtl.vhdl' vs Untimed C++
    RTL Verilog output 'rtl.v' vs Untimed C++
    Concat RTL Verilog output 'concat_sim_rtl.v' vs Untimed C++
```

```
# captured 100 values of output
# Info: scverify_top/user_tb: Simulation completed
# Checking results
# 'output'
#    capture count        = 100
#    comparison count     = 100
#    ignore count         = 0
#    error count          = 0
#    stuck in dut fifo    = 0
#    stuck in golden fifo = 0
# Info: scverify_top/user_tb: Simulation PASSED @ 10016 ns
# ** Note: (vsim-6574) SystemC simulation stopped by user.
VSIM 3>
```



### Reproduce the Solution

#### All commands have been recorded in a Tcl script

```
[mentor@RHEL74 walkthrough]$ ll ./Catapult/fir.v2/directives.tcl
-rw-rw-r-- 1 mentor mentor 3531 Aug 22 11:45 ./Catapult/fir.v2/directives.tcl

go new
go compile
solution library add nangate-45nm_beh -- -rtlsyntool DesignCompiler -vendor Nangate -technology 045nm
go libraries
directive set -CLOCKS {clk {-CLOCK_PERIOD 10.0 -CLOCK_EDGE rising -CLOCK_UNCERTAINTY 0.0 -CLOCK_HIGH_TI
go assembly
directive set /fir/run/main -PIPELINE_INIT_INTERVAL 1
directive set /fir/run/SHIFT -UNROLL yes
directive set /fir/run/MAC -UNROLL yes
go architect
```

The same result can be produced with this .tcl

catapult -f ./Catapult/fir.v2/directives.tcl &



# **Memory Interface**



### Source Code (fir.h)

```cpp
void CCS_BLOCK(run)(ac_channel<ac_int<8>> &input,
                    ac_int<8> coeffs[32][8],
                    ac_channel<ac_int<5,false>> &coeff_addr,
                    ac_channel<ac_int<8>> &output) {
  ac_int<19> temp = 0;
  ac_int<5,false> addr = coeff_addr.read();
  SHIFT: for (int i = 7; i >= 0; i--) {
    if ( i == 0 ) {
      regs[i] = input.read();
    } else {
      regs[i] = regs[i-1];
    }
  }
  MAC: for (int i = 7; i >= 0; i--) {
    temp += coeffs[addr][i]*regs[i];
  }
  output.write(temp>>11);
}
```

### TestBench (fir\_tb.cpp)

```cpp
#include "fir.h"

CCS_MAIN(int argc, char *argv[]) {
  ac_int<8> coeffs[32][8];
  ac_channel<ac_int<8>> input;
  ac_channel<ac_int<5,false>> coeff_addr;
  ac_channel<ac_int<8>> output;
  ac_int<8> data;
  ac_int<5,false> addr;
  fir inst;

  // Test Impulse
  for (int j=0; j < 32; j++)
    for (int i=0; i < 8; i++)
      coeffs[j][i] = rand();

  for (int i = 0; i < 10; i++ ) {
    data = rand();
    input.write(data);
    addr = rand();
    coeff_addr.write(addr);
    inst.run(input, coeffs, coeff_addr, output);
  }
  while (output.available(1))
    printf("Output = %3d\n",output.read().to_int());
  CCS_RETURN(0);
}
```

# Start with running directives.tcl

catapult -f directives.tcl

Check coeffs SRAM: (1) size: 256x8b (2) resource type: 1R1W RAM





#### **Check SRAM Interface in RTL**

\*\_re\_\*: RAM read enable; \*\_radr\_\*: RAM read address; \*\_q\_\*: RAM read data

```verilog
module fir_run (
  clk, rst, input_rsc_dat, input_rsc_vld, input_rsc_rdy, coeffs_rsc_triosy_lz, coeff_addr_rsc_dat,
      coeff_addr_rsc_vld, coeff_addr_rsc_rdy, output_rsc_dat, output_rsc_vld, output_rsc_rdy,
      coeffs_rsci_radr_d, coeffs_rsci_q_d, coeffs_rsci_re_d_pff
);
  input clk;
  input rst;
  input [7:0] input_rsc_dat;
  input input_rsc_vld;
  output input_rsc_rdy;
  output coeffs_rsc_triosy_lz;
  input [4:0] coeff_addr_rsc_dat;
  input coeff_addr_rsc_vld;
  output coeff_addr_rsc_rdy;
  output [7:0] output_rsc_dat;
  output output_rsc_vld;
  input output_rsc_rdy;
  output [7:0] coeffs_rsci_radr_d;
  input [7:0] coeffs_rsci_q_d;
  output coeffs_rsci_re_d_pff;


  // Interconnect Declarations
  wire run_wen;
  wire run_wten;
  wire input_rsci_wen_comp;
  wire [7:0] input_rsci_idat_mxwt;
  wire [7:0] coeffs_rsci_q_d_mxwt;
```

#### **Check SRAM Interface in Waveform**







# How do we improve the throughput

#### Current throughput is 18

Report: General

| Solution            | Latency Cycles | Latency Time | Throughput Cycles | Throughput Time | Slack | Total Area |
|---------------------|----------------|--------------|-------------------|-----------------|-------|------------|
| solution.v1 (new)   |                |              |                   |                 |       |            |
| fir.v1 (extract)    | 17             | 170.00       | 18                | 180.00          | 7.68  | 1461.69    |





#### Make convolution in parallel

```cpp
void CCS_BLOCK(run)(ac_channel<ac_int<8>> &input,
                    ac_int<8> coeffs[32][8],
                    ac_channel<ac_int<5,false>> &coeff_addr,
                    ac_channel<ac_int<8>> &output) {
  ac_int<19> temp = 0;
  ac_int<5,false> addr = coeff_addr.read();
  SHIFT: for (int i = 7; i >= 0; i--) {
    if ( i == 0 ) {
      regs[i] = input.read();
    } else {
      regs[i] = regs[i-1];
    }
  }
  // Make convolution parallel
  MAC: for (int i = 7; i >= 0; i--) {
    temp += coeffs[addr][i]*regs[i];
  }
  output.write(temp>>11);
}
```

### **Change the Architecture**

Unroll 'SHIFT' and 'MAC', and Pipeline 'main'



```
# Prescheduled SEQUENTIAL '/fir/run' (total length 4 c-steps)
insufficient resources 'ccs_sample_mem.ccs_ram_sync_1R1W_rport(2,8,8,256,256,8,5)' to schedule '/fir/run'. 8 are needed, but only 1 instances are available please merge accesses to 'coeffs:rsc' to reduce the number of required resources

# NetList written to file 'schedule.gnt'
Completed transformation 'architect' on solution 'fir.v2': elapsed time 0.25 seconds, memory usage 1364772kB, peak memory usage 1364772kB
Design complexity at end of 'architect': Total ops = 68, Real ops = 16, Vars = 23
Design 'fir' could not schedule partition '/fir/run' - resource competition
```

fir.h(23) SCHD-8 SCHD-4 fir.h(24) SCHD-39 NET-4 SOL-9 SOL-21



### **Increase Memory Data Width**

Word Width = 64 (why?)



- deliciartifd acheintly rearbellett litrea
- Makefile for RTL Verilog output 'rtl.v' vs Untimed C++ written to file './scverify/Verify\_rtl\_v\_msim.mk'
- Makefile for Concat RTL Verilog output 'concat\_sim\_rtl.v' vs Untimed C++ written to file './scverify/Verify\_concat\_sim\_rtl\_v\_msim.mk'
- Completed transformation 'extract' on solution 'fir.v3': elapsed time 3.66 seconds, memory usage 1364772kB, peak memory usage 1364816kB
- Design complexity at end of 'extract': Total ops = 269, Real ops = 84, Vars = 199



SOL-9

SOL-21

#### **Check Waveform**

#### Throughput = 1







# **Multiple Blocks**

A Design with Two 8-Tap FIRs and One Decimator



#### top.h

```cpp
#include "fir.h"
#include "decimator.h"

class top {
  ac_channel<ac_int<8> > connect0;
  ac_channel<ac_int<8> > connect1;
  fir block0;
  fir block1;
  decimator block2;
  public :
  top () {}
  #pragma hls_design interface top
  void CCS_BLOCK(run) (ac_channel<ac_int<8> > &din,
                       ac_int<8> coeffs[8],
                       ac_channel<ac_int<8> > &dout) {
    block0.run(din,coeffs,connect0);
    block1.run(connect0, coeffs, connect1);
    block2.run(connect1,dout);
  }
};
```

#### decimator.h

```cpp
class decimator {
  int count;
  public :
  decimator () {
    count = 0;
  }
  #pragma hls_design interface
  void CCS_BLOCK(run) (ac_channel<ac_int<8> > &din,
                       ac_channel<ac_int<8> > &dout) {
    if (count==4)
      count = 0;
    ac_int<8> temp = din.read();
    if (count==0)
      dout.write(temp);
    count++;
  }
};
```

#### fir.h

```
class fir {
  ac_int<8> regs[8];
  public:
  fir() {
    for (int i = 7; i>=0; i--)
      { regs[i] = 0; }
  }
  ...
```

```cpp
  #pragma hls_design interface
  void CCS_BLOCK(run)(ac_channel< ac_int<8> > &input,
                       ac_int<8> coeffs[8],
                       ac_channel< ac_int<8> > &output) {
    ac_int<19> temp = 0;
    SHIFT: for (int i = 7; i >= 0; i--) {
      if ( i == 0 ) {
        regs[i] = input.read();
      } else {
        regs[i] = regs[i-1];
      }
    }
    MAC: for (int i = 7; i >= 0; i--) {
      temp += coeffs[i]*regs[i];
    }
    output.write(temp>>11);
  }
};
```

# Start from the 'Architecture' Stage

catapult -f directives.tcl &

There are three design blocks: block0, block1, block2

Set 'DirectInput' for the coeffs



### Set FIFO Depth as '0'

Set FIFO depth as '0' for connect0 and connect1





# **Look at the Report**

Latency = ?, Throughput = ?

Area: block0=?, block1=?, block2=?







#### **RTL Simulation**



why idle?



# **Improve Throughput**

Improve the throughput from 10 to 1 by yourself





# RTL Simulation (Throughput=1)

One dout every four din because of the decimation



