In [1]:
#import ipdb # alternative to pdb that works in jupyter notebook (pip3 install ipdb)
import os, subprocess, sys, re, time
from pathlib import Path
from pynq import Overlay
#from pynq import GPIO
from pynq import allocate
import math
import pandas as pd

from dma_receiver import DmaReceiver
from bram_loader import Bram_Loader
from continuous_monitoring_system_controller import ContinuousMonitoringSystemController

# Directory holding the design files used by this notebook (the bitstream below,
# and the program binary loaded later through PATH).
BASE_DIR = Path('/home/xilinx/design_files')
PATH = BASE_DIR 

# Overlay built from the imported Vivado design; every IP block used below is reached through `base`.
base = Overlay(str(BASE_DIR / 'imported_design.bit'))

# Size in bytes of one element of the receive buffers (they are allocated with dtype 'u8' below).
ITEM_BYTE_SIZE = 8
FIFO_SIZE = 32768
# Buffer length (in 8-byte elements) is limited by both the DMA maximum transfer size and FIFO_SIZE.
# A "+ 4" adjustment was tried (and is disabled) because the DMA seems to have its own buffer
# that it fills before dma.recvchannel.transfer is even called.
buffer_length = min( base.continuous_monitoring_system_blocks.axi_dma_0.buffer_max_size // ITEM_BYTE_SIZE, FIFO_SIZE)# + 4) 

print('buffer_length =', buffer_length)
# Receive buffers for DMA transfers. NOTE(review): input_buffer_2 is not used by any cell shown here.
input_buffer = allocate(shape=(buffer_length,), dtype='u8')
input_buffer_2 = allocate(shape=(buffer_length,), dtype='u8')

# Receive channel of the DMA block; used by get_dma_transfer.
dma_rec = base.continuous_monitoring_system_blocks.axi_dma_0.recvchannel

# AXI GPIO slicing reference: https://pynq.readthedocs.io/en/v2.7.0/_modules/pynq/lib/axigpio.html
# Bit 0: reset line (active-low, see reset_cpu); bit 1: line pulsed by reset_cpu after releasing reset.
gpio_rst_n_out = base.axi_gpio_0.channel1[0]
gpio_en_cpu_reset_server_request_put_out = base.axi_gpio_0.channel1[1]
# Only referenced by the commented-out set_pc_stream_tlast_interval helper.
gpio_pc_stream_m_axis_tlast_interval = base.axi_gpio_1.channel1

# FIFO write/read counts: lower and upper 16 bits of channel 2 (printed by print_fifo_data_counts).
gpio_fifo_wr_count = base.axi_gpio_0.channel2[0:16]
gpio_fifo_rd_count = base.axi_gpio_0.channel2[16:32]

def print_dma_channel_status(channel):
    ''' Prints the running/idle/error flags of a DMA channel, followed by
    its raw status word in hex. '''
    flag_rows = (
        ('dma.running =', 'running'),
        ('dma.idle =', 'idle'),
        ('dma.error =', 'error'),
    )
    for label, attribute in flag_rows:
        print(label, getattr(channel, attribute))
    # The status word is read through the channel's MMIO, 4 bytes past the channel's offset.
    status_word = channel._mmio.read(channel._offset + 4)
    print('status =', hex(status_word))

def reset_cpu(delay=0.001):
    ''' Resets the CPU through AXI GPIO lines (the reset line is active-low).

    The request line is cleared first, then the reset line is driven low and high
    again, and finally the request line is pulsed high and back low.
    `delay` is the pause in seconds after each step of that sequence. '''
    gpio_en_cpu_reset_server_request_put_out.write(0)
    timed_steps = (
        (gpio_rst_n_out, 0),                            # assert reset
        (gpio_rst_n_out, 1),                            # release reset
        (gpio_en_cpu_reset_server_request_put_out, 1),  # request pulse: rising edge
        (gpio_en_cpu_reset_server_request_put_out, 0),  # request pulse: falling edge
    )
    for gpio_line, level in timed_steps:
        gpio_line.write(level)
        time.sleep(delay)

def print_fifo_data_counts():
    ''' Prints the FIFO write count and read count as currently read from their GPIO slices. '''
    labelled_counters = (
        ('gpio_fifo_wr_count =', gpio_fifo_wr_count),
        ('gpio_fifo_rd_count =', gpio_fifo_rd_count),
    )
    for label, counter_gpio in labelled_counters:
        print(label, counter_gpio.read())
    
# def set_pc_stream_tlast_interval(items_count):
#     ''' Sets 32-bit value specifying how many items can 
#     arrive by a single dma.recvchannel.transfer(ib) call. '''
#     gpio_pc_stream_m_axis_tlast_interval[0:32].write(items_count)

def console_send(s):
    ''' Sends string `s` to the console one character at a time,
    using AXI GPIO and hardware FIFOs. '''
    console_channel = base.console_io.axi_gpio_3.channel1
    data_bits = console_channel[0:8]
    # An "edge_detector" is used on this line to avoid continuous writing,
    # so every character needs its own on/off pulse.
    write_strobe = console_channel[8]
    write_strobe.off()
    for char_code in map(ord, s):
        data_bits.write(char_code)
        write_strobe.on()
        write_strobe.off()
    
def console_data_available():
    ''' Returns True when the console output "empty" flag (bit 8 of channel 2) reads 0. '''
    return base.console_io.axi_gpio_3.channel2[8].read() == 0

def console_read():
    ''' Reads console output for as long as console_data_available() reports data,
    and returns it as one string. Uses AXI GPIO and hardware FIFOs. '''
    console_gpio = base.console_io.axi_gpio_3
    output_bits = console_gpio.channel2[0:8]
    # An "edge_detector" is used on this line to avoid continuous reading,
    # so every character is acknowledged with its own on/off pulse.
    read_strobe = console_gpio.channel1[9]
    read_strobe.off()
    received = []
    while console_data_available():
        received.append(chr(output_bits.read()))
        read_strobe.on()
        read_strobe.off()
    return ''.join(received)

def instr_to_strings(instructions_integers):
    ''' Decodes instruction words into text using the "riscv-machinsn-decode" command.

    Requires riscv-python-model installed.
    If network connection is available: "python3 -m pip install riscv-model".
    If not, then on separate machine with internet:
        python3 -m pip download riscv-model -d .
    Then copy the downloaded .whl file to pynq and install with:
        python3 -m pip install <file.whl> -f ./ --no-index

    Args:
        instructions_integers: iterable of integer instruction words.
    Returns:
        List with the lines printed by the decoder (split on newlines),
        or an empty list when no instructions are given.
    Raises:
        FileNotFoundError: if the "riscv-machinsn-decode" command is not installed.
    Usage:
        instr_to_strings([0xB60006F, 0xFE0791E3])
    '''
    # Every word gets the same "0x" prefix and 8 hex digits.
    hex_words = [f'0x{ii:08X}' for ii in instructions_integers]
    if not hex_words:
        # Nothing to decode: do not start the external tool at all.
        return []
    # Arguments are passed as a list, so no shell is involved in building the command.
    completed = subprocess.run(
        ['riscv-machinsn-decode', 'hexstring', *hex_words],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    return completed.stdout.strip().split('\n')


####################################################################
# 

def read_performance_event_names(f_name='performance_event_names.txt'):
    ''' Reads event names from a text file, one name per line.

    The names were collected from CHERI-Flute source code by using this script:
    https://github.com/michalmonday/Flute/blob/continuous_monitoring/builds/RV64ACDFIMSUxCHERI_Flute_verilator/vcd/read_vcd.py

    Args:
        f_name: path of the file to read (relative paths are resolved against
            the current working directory).
    Returns:
        List with one entry per line of the file, with surrounding whitespace removed.
    Raises:
        OSError: if the file cannot be opened (e.g. FileNotFoundError).
    '''
    # The file is opened here (and closed by the `with` block) instead of relying
    # on a file object existing outside of the function.
    with open(f_name) as f:
        return [line.strip() for line in f]

def pop_n_bits_value(val, n):
    ''' Splits off the n least-significant bits of val.
    Returns a tuple: (val shifted right by n bits, value of the n removed bits).
    Example: pop_n_bits_value(0xFFFF, 4) returns (0xFFF, 0xF) '''
    low_bits_mask = ~(-1 << n)   # n ones in the lowest positions
    popped = val & low_bits_mask
    remainder = val >> n
    return remainder, popped

def parse_fifo_item(fifo_item):
    ''' Parses the numerical value of a single fifo item (e.g. 1024 bits).

    Fields are taken starting from the least-significant bits, in this order:
        PERFORMANCE_EVENTS_COUNT performance counters, 7 bits each (7bits*115counters = 805 bits),
        pc (64 bits), clk_counter delta (64 bits), instr (32 bits).
    Any higher bits are ignored; they are padding, needed because only powers of 2
    can be used as size in the fifo generator block (or axi in general?).
    NOTE(review): the order above is what this function extracts; confirm it
    against the concatenation order used on the hardware side.

    Returns: (perf_counters list, pc, clk_counter, instr) '''
    counter_bits = 7
    counter_mask = (1 << counter_bits) - 1
    perf_counters = [
        (fifo_item >> (counter_bits * index)) & counter_mask
        for index in range(PERFORMANCE_EVENTS_COUNT)
    ]
    # Everything above the counters: pc, then clk_counter, then instr.
    above_counters = fifo_item >> (counter_bits * PERFORMANCE_EVENTS_COUNT)
    pc = above_counters & ((1 << 64) - 1)
    clk_counter = (above_counters >> 64) & ((1 << 64) - 1)
    instr = (above_counters >> 128) & ((1 << 32) - 1)
    return perf_counters, pc, clk_counter, instr

def get_dma_transfer(input_buffer, dma_rec):
    ''' Starts a DMA receive into input_buffer, waits for it to finish and
    returns the number of transferred items, each having 1024 bits.

    The count is dma_rec.transferred / 128 rounded down, i.e. `transferred` is
    treated as a byte count (1024 bits = 128 bytes). The count is also printed. '''
    bytes_per_item = 1024 // 8
    dma_rec.transfer(input_buffer)
    dma_rec.wait()  # completion depends on tlast
    items_transferred = math.floor(dma_rec.transferred / bytes_per_item)
    print(f'items_transferred = {items_transferred}')
    return items_transferred

def parse_last_dma_transfer(input_buffer, items_transferred):
    ''' Parses the first `items_transferred` fifo items stored in input_buffer.

    Each item occupies 16 consecutive buffer elements (1024 bits / 64 bits), whose
    bytes are combined as one little-endian integer and passed to parse_fifo_item.
    Assumes input_buffer holds 64-bit elements (it is allocated with dtype 'u8'
    in this notebook).

    Returns: (events, pcs, clk_counters, instrs, instr_strings), where the first
    four are lists with one entry per item and instr_strings is the result of
    instr_to_strings(instrs). '''
    elements_per_item = math.ceil(1024/64)
    parsed_items = []
    for item_index in range(items_transferred):
        first_element = elements_per_item * item_index
        item_bytes = bytes(input_buffer[first_element:first_element + elements_per_item])
        parsed_items.append(parse_fifo_item(int.from_bytes(item_bytes, byteorder='little')))
    # parse_fifo_item yields (perf_counters, pc, clk_counter, instr) for each item.
    events = [item[0] for item in parsed_items]
    pcs = [item[1] for item in parsed_items]
    clk_counters = [item[2] for item in parsed_items]
    instrs = [item[3] for item in parsed_items]
    instr_strings = instr_to_strings(instrs)
    return events, pcs, clk_counters, instrs, instr_strings

# Names of the performance events; used later as DataFrame column names.
event_names = read_performance_event_names()
# Number of 7-bit performance counters that parse_fifo_item extracts from each fifo item.
PERFORMANCE_EVENTS_COUNT = 115

# Show the initial DMA channel state and FIFO counts.
print_dma_channel_status(dma_rec)
print_fifo_data_counts()

# set_pc_stream_tlast_interval(1000)


buffer_length = 32768
dma.running = True
dma.idle = False
dma.error = False
status = 0x0
gpio_fifo_wr_count = 0
gpio_fifo_rd_count = 0


In [2]:
def setup_cms(cms_ctrl):
    ''' Applies this notebook's continuous monitoring system settings through cms_ctrl.

    Sets the trace start/end trigger addresses and the monitored address range
    bounds, leaves all four of them disabled, and finally resets the "wfi" wait. '''
    trace_start_address = 0x1000
    trace_end_address = 0x80000106
    monitored_lower_bound = 0x0FFF      # alternative value: 0x80000000
    monitored_upper_bound = 0x800000FF

    # Triggers: the exact address must match to start/stop the trace.
    cms_ctrl.set_trigger_trace_start_address(trace_start_address)
    cms_ctrl.set_trigger_trace_end_address(trace_end_address)
    cms_ctrl.set_trigger_trace_start_address_enabled(False)
    cms_ctrl.set_trigger_trace_end_address_enabled(False)

    # Filter: any address between the lower and the upper bound will be collected.
    cms_ctrl.set_monitored_address_range_lower_bound(monitored_lower_bound)
    cms_ctrl.set_monitored_address_range_upper_bound(monitored_upper_bound)
    cms_ctrl.set_monitored_address_range_lower_bound_enabled(False)
    cms_ctrl.set_monitored_address_range_upper_bound_enabled(False)

    # The "wfi" (wait for interrupt) instruction stops the trace; this allows
    # further trace collection if the last traced program used it.
    cms_ctrl.reset_wfi_wait()

# The attribute path is long because the GPIO block sits inside nested hierarchies
# of the Vivado block design.
cms_ctrl_axi_gpio = base.continuous_monitoring_system_blocks.axi_gpio_to_cms_ctrl_interface.axi_gpio_cms_ctrl.channel1    
# Controller object driving the monitoring system through that GPIO channel.
cms_ctrl = ContinuousMonitoringSystemController(cms_ctrl_axi_gpio)
# Apply the trigger/filter settings defined in setup_cms.
setup_cms(cms_ctrl)

In [85]:
# Drive the reset line low (it is active-low, see reset_cpu).
# NOTE(review): presumably keeps the CPU in reset while the program is loaded below — confirm.
gpio_rst_n_out.write(0)

In [86]:
# Allow trace collection again in case the previously traced program stopped it with "wfi"
# (same call as the last step of setup_cms).
cms_ctrl.reset_wfi_wait()

In [5]:
# Load the example program binary from the design files directory,
# using the Bram_Loader helper on top of axi_gpio_2.
bram_loader = Bram_Loader(base.bram_loader.axi_gpio_2)
bram_loader.load(PATH / 'riscv-example-baremetal-short.bin')

In [87]:
# Show the FIFO counts before the reset, then run the CPU reset sequence.
print_fifo_data_counts()
reset_cpu()

gpio_fifo_wr_count = 0
gpio_fifo_rd_count = 0


In [88]:
# Show the FIFO counts again, now after the reset sequence has run.
print_fifo_data_counts()

gpio_fifo_wr_count = 54
gpio_fifo_rd_count = 0


In [91]:
# Receive the collected items into input_buffer; the call blocks until the DMA transfer completes.
items_transferred = get_dma_transfer(input_buffer, dma_rec)

items_transferred = 57


In [92]:
# Parse the items received by the last DMA transfer into per-item lists.
events, pcs, clk_counters, instrs, instr_strings = parse_last_dma_transfer(input_buffer, items_transferred)

# One line per traced item. zip() stops at the shortest list, so if the decoder
# returned fewer strings than there are instructions, the extra items are not printed.
for pc, instr, instr_str, clk_counter in zip(pcs, instrs, instr_strings, clk_counters):
    print(f'CLK_DELTA={clk_counter:<14}PC={pc:>8X}    INSTR={instr:>08X}    {instr_str}')

CLK_DELTA=546           PC=    1010    INSTR=00028067    jalr x0, x5, 0
CLK_DELTA=58            PC=80000004    INSTR=00029C63    bne x5, x0, .+24
CLK_DELTA=3             PC=80000010    INSTR=0B60006F    jal x0, .+182
CLK_DELTA=47            PC=800000E0    INSTR=0180006F    jal x0, .+24
CLK_DELTA=104           PC=80000100    INSTR=FE0791E3    bne x15, x0, .-30
CLK_DELTA=45            PC=800000F4    INSTR=F35FF0EF    jal x1, .-204
CLK_DELTA=94            PC=8000004E    INSTR=00008067    jalr x0, x1, 0
CLK_DELTA=4             PC=80000100    INSTR=FE0791E3    bne x15, x0, .-30
CLK_DELTA=14            PC=800000F4    INSTR=F35FF0EF    jal x1, .-204
CLK_DELTA=22            PC=8000004E    INSTR=00008067    jalr x0, x1, 0
CLK_DELTA=4             PC=80000100    INSTR=FE0791E3    bne x15, x0, .-30
CLK_DELTA=10            PC=800000F4    INSTR=F35FF0EF    jal x1, .-204
CLK_DELTA=22            PC=8000004E    INSTR=00008067    jalr x0, x1, 0
CLK_DELTA=4             PC=80000100    INSTR=FE0791E3    bn

In [93]:
# Show the performance counters of the first 10 datapoints/instructions from the trace above.
# One row per traced item, one column per event name; the number of names in event_names
# has to match the number of counters per item (PERFORMANCE_EVENTS_COUNT).
df = pd.DataFrame(events, columns=event_names)
df.iloc[:10]

Unnamed: 0,Core__NO_EV,Core__REDIRECT,Core__TRAP,Core__BRANCH,Core__JAL,Core__JALR,Core__AUIPC,Core__LOAD,Core__STORE,Core__LR,...,LL__TLB,LL__TLB_MISS,UNKNOWN_107,LL__TLB_FLUSH,LL__EVICT,UNKNOWN_110,UNKNOWN_111,TransExe__RENAMED_INST,TransExe__WILD_JUMP,TransExe__WILD_EXCEPTION
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,1,0,3,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,1,0,0,0,3,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,1,0,1,2,3,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,1,0,0,0,3,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,1,2,3,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Display the list of event names (these are the column names of df).
event_names

['Core__NO_EV',
 'Core__REDIRECT',
 'Core__TRAP',
 'Core__BRANCH',
 'Core__JAL',
 'Core__JALR',
 'Core__AUIPC',
 'Core__LOAD',
 'Core__STORE',
 'Core__LR',
 'Core__SC',
 'Core__AMO',
 'Core__SERIAL_SHIFT',
 'Core__INT_MUL_DIV_REM',
 'Core__FP',
 'Core__SC_SUCCESS',
 'Core__LOAD_WAIT',
 'Core__STORE_WAIT',
 'Core__FENCE',
 'Core__F_BUSY_NO_CONSUME',
 'Core__D_BUSY_NO_CONSUME',
 'Core__1_BUSY_NO_CONSUME',
 'Core__2_BUSY_NO_CONSUME',
 'Core__3_BUSY_NO_CONSUME',
 'Core__IMPRECISE_SETBOUND',
 'Core__UNREPRESENTABLE_CAP',
 'Core__MEM_CAP_LOAD',
 'Core__MEM_CAP_STORE',
 'Core__MEM_CAP_LOAD_TAG_SET',
 'Core__MEM_CAP_STORE_TAG_SET',
 'Core__INTERRUPT',
 'UNKNOWN_31',
 'L1I__LD',
 'L1I__LD_MISS',
 'L1I__LD_MISS_LAT',
 'UNKNOWN_35',
 'UNKNOWN_36',
 'UNKNOWN_37',
 'UNKNOWN_38',
 'UNKNOWN_39',
 'UNKNOWN_40',
 'L1I__TLB',
 'L1I__TLB_MISS',
 'L1I__TLB_MISS_LAT',
 'L1I__TLB_FLUSH',
 'UNKNOWN_45',
 'UNKNOWN_46',
 'UNKNOWN_47',
 'L1D__LD',
 'L1D__LD_MISS',
 'L1D__LD_MISS_LAT',
 'L1D__ST',
 'L1D__ST_MISS

In [98]:
# Mean value of every event counter over all traced items, sorted from highest to lowest.
# Event name format is "CATEGORY__NAME".
# NOTE(review): the trailing [:-1] drops the last (lowest-mean) entry; the reason is not evident — confirm it is intended.
df.mean(axis=0).sort_values(ascending=False)[:-1]

L1I__TLB                   8.052632
L1I__LD                    7.719298
Core__1_BUSY_NO_CONSUME    5.000000
Core__LOAD_WAIT            4.140351
L1D__LD_MISS_LAT           4.140351
                             ...   
UNKNOWN_45                 0.000000
L1I__TLB_MISS_LAT          0.000000
L1I__TLB_MISS              0.000000
UNKNOWN_40                 0.000000
UNKNOWN_39                 0.000000
Length: 114, dtype: float64

In [95]:
# Check whether the console output has data waiting to be read.
console_data_available()

True

In [96]:
# Read everything currently available from the console output.
console_read()

'New program!\nabc'

In [15]:
# console_send('abc')

In [20]:
# import matplotlib.pyplot as plt
# plt.plot(pcs)
# plt.show()