# Using Trace to Show Functions on Power Traces
This notebook is for the CW305 or the CW610 with the K82 target using the parallel trace interface.

It can be ported to other targets using the SWO interface (see `TraceWhisperer.ipynb` for the differences); however, the lower bandwidth of SWO will reduce the granularity significantly.

In [None]:
# Trace-capture hardware in use; checked by the platform-setup cell and by target_reset().
TRACE_PLATFORM = 'CW610' # AKA PhyWhisperer
#TRACE_PLATFORM = 'CW305' # CW305 FPGA target board

# Target being traced; selects which firmware ELF is handed to objdump further down.
#PLATFORM = 'CW305'
PLATFORM = 'CW308_K82F'

In [None]:
from chipwhisperer.capture.trace.TraceWhisperer import TraceWhisperer

In [None]:
##### TODO: point to standard bitfile and defines ########
# Verilog define files handed to TraceWhisperer through its defines_files argument.
defines = ['../hardware/CW305_DesignStart/hdl/defines_trace.v', '../hardware/phywhisperer/software/phywhisperer/firmware/defines_pw.v']

In [None]:
# platform setup:
# Each branch runs a helper notebook (presumably what creates `scope` and `target` -- TODO confirm),
# builds the TraceWhisperer object, and sets the ADC clock source and sample count.
if TRACE_PLATFORM == 'CW610':
    SCOPETYPE = 'OPENADC'  # presumably read by Setup_Generic.ipynb -- TODO confirm
    %run "Helper_Scripts/Setup_Generic.ipynb"
    trace = TraceWhisperer(target, scope, force_bitfile=False, defines_files=defines)
    # alternative: force a specific bitfile to be loaded
    #trace = TraceWhisperer(target, scope, force_bitfile=True, bs='../hardware/tracewhisperer/vivado/tracewhisperer.runs/impl_no_ilas/tracewhisperer_top.bit', defines_files=defines)
    scope.clock.adc_src = "clkgen_x1"
    scope.adc.samples = 10000
    scope.gain.setGain(20)

else:
    # any other TRACE_PLATFORM value is treated as the CW305 setup
    %run "Helper_Scripts/Setup_CW305_DST.ipynb"
    scope.clock.adc_src = "extclk_x1"
    scope.adc.samples = 10000
    trace = TraceWhisperer(target, scope, defines_files=defines)

In [None]:
# required after programming some targets:
def target_reset():
    """Pulse the target's nRST line low, then high, via ``scope.io.nrst``.

    Only acts when ``TRACE_PLATFORM`` is ``'CW610'``; for any other platform
    the function returns without touching the scope. Each level is held for
    50 ms.
    """
    if TRACE_PLATFORM != 'CW610':
        return
    # Imported locally so this cell does not depend on a helper notebook
    # having leaked `time` into the notebook namespace.
    import time
    for level in ('low', 'high'):
        scope.io.nrst = level
        time.sleep(0.05)

In [None]:
# pulse the target's reset line (only acts when TRACE_PLATFORM is 'CW610')
target_reset()

In [None]:
# print identification / build-time information reported by the trace object
print(trace.phywhisperer_name())
print(trace.get_fw_buildtime())
print(trace.fpga_buildtime)

In [None]:
# capture configuration; the parallel trace port is used, as stated at the top of this notebook
# NOTE(review): 'while_trig' presumably captures only while the trigger is asserted -- confirm
trace.capture.raw = True
trace.capture.trigger_source = 'firmware trigger'
trace.capture.mode = 'while_trig'
trace.trace_mode = 'parallel'

## Set the periodic PC sampling parameters:
This will set the PC sampling frequency to its maximum: every 64 clock cycles.

Use a small POSTINIT delay to ensure that PC samples do not start before the trigger is received.

Note that PC sampling will be enabled by the target only upon triggering; again this is to ensure that PC sampling does not begin before we start capturing traces.

In [None]:
# enable periodic PC sampling with the settings described in the text above
# NOTE(review): which argument selects the 64-cycle rate (presumably cyctap=0) is not visible here -- confirm
trace.set_periodic_pc_sampling(enable=1, cyctap=0, postinit=1, postreset=0)

In [None]:
# arm trace capture before running the target operation in the next cell
trace.arm_trace()

In [None]:
# get a key/plaintext pair and capture one power trace for it
# NOTE(review): trace._ss is a private attribute, passed here as the target argument of cw.capture_trace
ktp = cw.ktp.Basic()
key, text = ktp.next()
powertrace = cw.capture_trace(scope, trace._ss, text, key)

In [None]:
# read back the captured trace data and show how many entries were read
raw = trace.read_capture_data()
len(raw)

In [None]:
# convert the raw capture into a list of frames; later cells use f[0] as the timestamp and f[1] as the frame's bytes
frames = trace.get_raw_trace_packets(raw, removesyncs=True, verbose=False)
len(frames)

In [None]:
# write the frames to raw.bin, the file Orbuculum is run on below
trace.write_raw_capture(frames, 'raw.bin')

In [None]:
# sanity check that first TPI auxiliary byte shows up in the right place, so that Orbuculum can parse:
# the payloads of the first three frames are expected to be 6, 6 and 8 bytes long.
expected_lengths = (6, 6, 8)
# Slicing (rather than indexing) means a capture with fewer than three frames
# reports a failed check instead of raising IndexError.
first_lengths = tuple(len(frame[1]) for frame in frames[:3])
if first_lengths == expected_lengths:
    print("Checks out!")
else:
    print('oh-ho...')

In [None]:
# dump the payload bytes of the first three frames as two-digit hex, one frame per line
for frame in frames[:3]:
    print(''.join('%02x ' % byte for byte in frame[1]))

## Next we run Orbuculum to extract the list of sampled addresses:

(adjust path as needed for your installation)

In [None]:
%%bash
# Run Orbuculum on raw.bin (written by the previous cell), then count the lines/words/bytes
# of the 'hwevent' file that the following cells read back.
# The path below is installation-specific: adjust as needed.
/home/jpnewae/git/orbuculum/ofiles/orbuculum -t -f raw.bin -P -e
wc hwevent

In [None]:
# open Orbuculum's hwevent output; the handle is closed by the address-parsing cell below
hwe = open('hwevent', 'r')

In [None]:
import re
# matches lines containing "2,<digits>,<value>" and captures <value> (everything up to end of line);
# the captured text is parsed as a hexadecimal address by the cells below
event_regex = re.compile(r'2,\d+,(.+)$')

In [None]:
# Collect the sampled PC addresses (parsed as hex) from the hwevent file.
# Lines that do not match event_regex are reported and skipped.
addresses = []
# `with` guarantees the handle opened in the earlier cell is closed even if
# int() raises on an unexpected capture group.
with hwe:
    for line in hwe:
        match = event_regex.search(line)
        if match:
            addresses.append(int(match.group(1), 16))
        else:
            print('\t *** MISMATCH: %s' % line)

## Then we grab the start and size of functions from the binary:

(adjust paths as needed for your installation)

In [None]:
# Run `arm-none-eabi-objdump -t` on the firmware ELF for the selected PLATFORM;
# the IPython `!` syntax stores the command's output lines in `objdump`.
# The ELF paths are installation-specific: adjust as needed.
if PLATFORM == 'CW305':
    objdump = !arm-none-eabi-objdump -t ../hardware/CW305_DesignStart/bram_a7.elf
else:
    objdump = !arm-none-eabi-objdump -t ../../cw_develop/hardware/victims/firmware/simpleserial-trace/simpleserial-trace-CW308_K82F.elf

In [None]:
# Build `funcs`, a list of [start_address, size, name] entries, from the objdump output.
# Group 1 is the 8-character start address, group 2 the 8-character size, and
# group 4 the symbol name (an optional ".hidden " prefix is consumed by group 3).
objdump_regex = re.compile(r'(\w{8})\s+\w\s+F.+?\s(\w{8}) (\.hidden )?(\w+)$')
matches = 0
mismatches = 0
funcs = []
for line in objdump:
    parsed = objdump_regex.search(line)
    if parsed is None:
        mismatches += 1
        continue
    matches += 1
    start, size, name = parsed.group(1, 2, 4)
    size_bytes = int(size, 16)
    # zero-sized entries are counted as matches but not recorded
    if size_bytes != 0:
        funcs.append([int(start, 16), size_bytes, name])

## Now we can add the function name that every sampled PC value belongs to, in our time-stamped frame list:

In [None]:
# For every sampled address, find the first function whose [start, start + size)
# range contains it, append that function's name to the frame at the same index,
# and remember each distinct name in order of first appearance.
found_functions = []
for idx, pc in enumerate(addresses):
    for base, length, fname in funcs:
        if base <= pc < base + length:
            frames[idx].append(fname)
            if fname not in found_functions:
                found_functions.append(fname)
            break
    else:
        # no function range contained this address
        print("\t\t\t*** couldn't find address %08x!" % pc)

In [None]:
# what our frames list looks like now (frames with a matched address carry the function name as a third element):
frames[:5]

In [None]:
# which functions were sampled, in order of first appearance:
found_functions

## We now have everything we need to plot

In [None]:
from bokeh.palettes import Set1
# 9-entry palette; functions beyond the ninth in found_functions get no colour in the plots below
palette = Set1[9]

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.resources import INLINE
from bokeh.models import Span

output_notebook(INLINE)
# NOTE(review): newer Bokeh releases use width/height instead of plot_width/plot_height -- confirm for your version
p = figure(plot_width=1500, plot_height=700)

# plot the captured power trace against its sample index
xrange = range(len(powertrace.wave))
p.line(xrange, powertrace.wave, line_color="black")

In [None]:
# scale factor applied to frame timestamps when plotting:
# 4 when the ADC clock source is one of the x4 settings, otherwise 1
multiplier = 4 if scope.clock.adc_src in ('clkgen_x4', 'extclk_x4') else 1

In [None]:
# vertical extent of the power trace; used as the bottom/top of the coloured regions drawn below
loy = min(powertrace.wave)
hiy = max(powertrace.wave)

In [None]:
# add legend: one invisible rectangle per function, purely to create its legend entry.
# zip() stops at the shorter sequence, so functions beyond the palette size get no entry.
for color, fname in zip(palette, found_functions):
    p.rect(
        x=0,
        y=0,
        width=0.01,
        height=0.01,
        color=color,
        alpha=0.3,
        legend_label=fname,
        visible=False,
    )

In [None]:
# Shade the span from each frame's timestamp up to just before the next frame's
# timestamp, coloured by the function the frame was attributed to.
for idx, frame in enumerate(frames[:-1]):
    if len(frame) != 3:
        # frame has no function name attached; nothing to draw
        continue
    color_idx = found_functions.index(frame[2])
    if color_idx >= len(palette):
        print("Running out of colors! Choose a different palette?")
        continue
    p.quad(
        left=frame[0]*multiplier,
        bottom=loy,
        right=frames[idx+1][0]*multiplier-1,
        top=hiy,
        color=palette[color_idx],
        alpha=0.4,
    )

In [None]:
# enlarge the legend text
p.legend.label_text_font_size = "18pt"

In [None]:
# render the single-capture plot
show(p)

# Can we do better?
If you zoom in, you can clearly see that the PC samples are every 64 cycles. Unfortunately that is the maximum sampling rate possible, even though our 4-bit trace port has much higher bandwidth.

In the example above, you can probably see what appears to be highly varying ratios of time spent in `xtime()` vs time spent in `MixColumns()` from round to round, while in reality the time in each is probably constant; this is an artifact of our low sampling rate.

But we can take advantage of the fact that the timing of the first PC sample (relative to when sampling is enabled) is somewhat random. If you repeat the capture several times, you'll notice the timestamp of the first captured frame varies. Try it!

In [None]:
# timestamp for the first PC sample packet; if you repeat the capture, you'll likely find a different timestamp each time:
# (element 0 of a frame is its timestamp)
frames[0][0]

If the target can be made to execute the exact same code path repeatedly, you can take advantage of this variability to collect and combine multiple traces and thus effectively get finer PC sampling granularity! Let's do just that. We'll repeat the capture 10 times and keep the trace sets whose timestamps are furthest apart.

In [None]:
# number of repeated captures performed by the next cell
runs = 10

In [None]:
# Repeat the capture `runs` times with the same key/text, keeping each run's
# parsed frames in more_frames. Run-local names are used so that `raw` and
# `frames` from the single-capture section are not overwritten.
more_frames = []
for i in range(runs):
    trace.arm_trace()
    # only the trace data matters here; the returned power trace is discarded
    cw.capture_trace(scope, trace._ss, text, key)
    run_raw = trace.read_capture_data()
    run_frames = trace.get_raw_trace_packets(run_raw, removesyncs=True, verbose=False)
    more_frames.append(run_frames)
    print('run %d: %d raw entries, %d frames' % (i, len(run_raw), len(run_frames)))

In [None]:
# print the timestamp of the first frame of every run, to help pick runs that are far apart
for i,frame_set in enumerate(more_frames):
    print('run %2d: first frame offset = %d' % (i, frame_set[0][0]))

Instead of pulling out some fancy algorithm to do this automatically, just manually pick a few frame sets that are far apart:

In [None]:
# edit this list of run indices according to your own results!
chosen_runs = (0, 1, 3, 4, 5)
chosen_sets = [more_frames[run] for run in chosen_runs]

We then follow basically the same steps we did before with a single capture.

Care must be taken to account for the fact that if *x* raw frames were captured, Orbuculum may return only *x-1* or *x-2* parsed frames -- the last one or two can be missing.

The next cells are a bit awkward because there's no easy way to call Orbuculum from a loop inside Jupyter, so you'll have to manually run the following few cells for each element of `chosen_sets`:

In [None]:
# index into chosen_sets of the frame set currently being processed by the manual loop below
set_index = 0

In [None]:
# loop back up to here!
# select the frame set to run through Orbuculum on this pass
frame_set = chosen_sets[set_index]

In [None]:
# overwrite raw.bin with the current frame set, for Orbuculum to parse in the next cell
trace.write_raw_capture(frame_set, 'raw.bin')

In [None]:
%%bash
# Run Orbuculum on raw.bin for the current frame set, then count the lines/words/bytes
# of the resulting 'hwevent' file, which the next cell renames.
# The path below is installation-specific: adjust as needed.
/home/jpnewae/git/orbuculum/ofiles/orbuculum -t -f raw.bin -P -e
wc hwevent

In [None]:
import os
# Keep this pass's Orbuculum output as hwevent<set_index>.
# os.replace overwrites an existing destination, so a file left over from an
# earlier pass needs no separate removal and no blanket exception handler.
os.replace('hwevent', 'hwevent%d' % set_index)

In [None]:
# increment the index then go back up and repeat until all frame sets are processed
set_index += 1
# report whether another manual pass through the cells above is needed
if set_index >= len(chosen_sets):
    print("Done!")
else:
    print("Go back up and repeat for set %d..." % set_index)

In [None]:
# Parse hwevent<i> for every chosen frame set, collecting the sampled addresses
# (as hex) in `addresses` and the matching frames in `framez`.
addresses = []
framez = []
for i, chosen in enumerate(chosen_sets):
    numframes = 0
    # `with` closes the file even if int() raises on an unexpected capture group
    with open('hwevent%d' % i, 'r') as tfile:
        for line in tfile:
            match = event_regex.search(line)
            if match:
                #print("Got %s" % match.group(1))
                addresses.append(int(match.group(1), 16))
                numframes += 1
            else:
                print('\t *** MISMATCH: %s' % line)
    # This is where we have to account for Orbuculum possibly not returning all the frames we fed it:
    # keep only as many frames from this set as addresses were parsed for it.
    framez.extend(chosen[:numframes])

In [None]:
# make sure we didn't screw up: every parsed address must have exactly one frame
assert len(addresses) == len(framez)

From here on, we do almost exactly the same things we did when we had a single trace set:

In [None]:
# For every sampled address, find the first function whose [start, start + size)
# range contains it, append that function's name to the frame at the same index
# in framez, and remember each distinct name in order of first appearance.
found_functions = []
for idx, pc in enumerate(addresses):
    for base, length, fname in funcs:
        if base <= pc < base + length:
            framez[idx].append(fname)
            if fname not in found_functions:
                found_functions.append(fname)
            break
    else:
        # no function range contained this address
        print("\t\t\t*** couldn't find address %08x!" % pc)

In [None]:
# which functions were sampled, in order of first appearance:
found_functions

Before we plot, it will be easier if we sort our frames by timestamp. Before we do that, let's add the base address to each frame, because that would be harder post-sorting, and we'll need this later:

In [None]:
# add the base address to each frame:
# the address becomes the frame's last element; frames that were given a function name
# end up with 4 elements, frames without one end up with 3.
# Note: re-running this cell appends the address again.
for i, addy in enumerate(addresses):
    framez[i].append(addy)

In [None]:
# order the combined frames by timestamp (element 0) so consecutive frames are adjacent in time
framez = sorted(framez, key=lambda x:x[0])

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.resources import INLINE
from bokeh.models import Span

output_notebook(INLINE)
# NOTE(review): newer Bokeh releases use width/height instead of plot_width/plot_height -- confirm for your version
q = figure(plot_width=1500, plot_height=700)

# plot the power trace from the single capture against its sample index
xrange = range(len(powertrace.wave))
q.line(xrange, powertrace.wave, line_color="black")
# same 9-entry palette as the single-capture plot
palette = Set1[9]

In [None]:
# scale factor applied to frame timestamps when plotting:
# 4 when the ADC clock source is one of the x4 settings, otherwise 1
multiplier = 4 if scope.clock.adc_src in ('clkgen_x4', 'extclk_x4') else 1

In [None]:
# add legend: one invisible rectangle per function, purely to create its legend entry.
# zip() stops at the shorter sequence, so functions beyond the palette size get no entry.
for color, fname in zip(palette, found_functions):
    q.rect(
        x=0,
        y=0,
        width=0.01,
        height=0.01,
        color=color,
        alpha=0.3,
        legend_label=fname,
        visible=False,
    )

In [None]:
# Shade the span from each frame's timestamp up to just before the next frame's
# timestamp, coloured by the function the frame was attributed to.
for idx, frame in enumerate(framez[:-1]):
    if len(frame) != 4:
        # only frames carrying both a function name and an address are drawn
        continue
    color_idx = found_functions.index(frame[2])
    if color_idx >= len(palette):
        print("Running out of colors! Choose a different palette?")
        continue
    q.quad(
        left=frame[0]*multiplier,
        bottom=loy,
        right=framez[idx+1][0]*multiplier-1,
        top=hiy,
        color=palette[color_idx],
        alpha=0.4,
    )

In [None]:
# enlarge the legend text
q.legend.label_text_font_size = "18pt"

In [None]:
# render the combined multi-capture plot
show(q)

Pretty, isn't it? Make sure you zoom in to appreciate the finer granularity!

## One more visualization:
Instead of mapping functions to colours, let's map the sampled address space to a larger color palette. This gives a different visualization of when code is repeated.

In [None]:
import numpy as np
from bokeh.palettes import Plasma256
# rebinds `palette` (previously Set1[9]); map_color() below scales addresses to this palette's length
palette = Plasma256

In [None]:
def map_color(addy):
    """Map an address to a palette index by linear interpolation.

    The smallest value in ``addresses`` maps to index 0 and the largest to
    ``len(palette) - 1``; the interpolated position is truncated to an int.

    Raises:
        ValueError: if ``addy`` lies outside the range spanned by ``addresses``.
    """
    # scan the address list once per bound instead of twice
    lowest = min(addresses)
    highest = max(addresses)
    if addy < lowest or addy > highest:
        raise ValueError("Address out of range")
    return int(np.interp(addy, [lowest, highest], [0, len(palette)-1]))

In [None]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.resources import INLINE

output_notebook(INLINE)
# NOTE(review): newer Bokeh releases use width/height instead of plot_width/plot_height -- confirm for your version
r = figure(plot_width=1500, plot_height=700)

# plot the power trace from the single capture against its sample index
xrange = range(len(powertrace.wave))
r.line(xrange, powertrace.wave, line_color="black")

In [None]:
# Shade the span from each frame's timestamp up to just before the next frame's
# timestamp, coloured by the frame's sampled address (its last element).
for i,f in enumerate(framez[:-1]):
    # Only 4-element frames hold a function name at f[2]. A frame whose address
    # matched no function has 3 elements with the integer address at f[2], and
    # looking that up in found_functions would raise ValueError.
    # NOTE(review): the "< 9" limit mirrors the 9-colour palette used by the
    # previous plots -- confirm whether it is still wanted with Plasma256.
    if len(f) == 4 and found_functions.index(f[2]) < 9:
        r.quad(left=f[0]*multiplier, bottom=loy, right=framez[i+1][0]*multiplier-1, top=hiy, color=Plasma256[map_color(f[-1])], alpha=0.4)

In [None]:
# render the address-coloured plot
show(r)