Vertex.AI Interactive Event Analysis
=========================================

This interactive notebook can be used for exploring events generated by Vertex.AI components.

To use it, you'll first need to produce an event log.  How you do this depends on the component, since each one is configured differently.

* With the Tile server, you'll want to add something like the following stanza to your JSON _server.conf_:

      "eventlog": {
        "@type": "type.vertex.ai/vertexai.eventing.file.proto.EventLog",
        "filename": "eventlog.gz"
      },
      "event_tracking_mode": "EVENT_TRACKING_MODE_GLOBAL"
     
  When run with this configuration, the server will write a file, _eventlog.gz_, on shutdown (which you can trigger with a Ctrl-C).

     
* With the PlaidML Python backends, you can typically set an environment variable:

       PLAIDML_EVENTLOG_FILENAME=eventlog.gz
     
  This will cause events to be logged to _eventlog.gz_ on shutdown.

Next, build this notebook's Python dependencies:

    bazel build //base/context/analysis
    
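By default, the notebook reads the file named by the *PLAIDML_EVENTLOG_FILENAME* environment variable if it is set, and otherwise looks for _eventlog.gz_ in your home directory.  If neither is right, you'll want to tweak the *scope.read_eventlog()* call below.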

Once _eventlog.gz_ is in place, and the dependencies have been built, you can run the notebook, and start adding your own cells to explore the output data.

In [None]:
# Setup: Add some useful imports; make sure the Python path is configured correctly, &c.
from __future__ import absolute_import, print_function

# N.B. We use 'inline' since our test infrastructure doesn't support 'notebook'.
# Using 'notebook' provides some interactive controls for the diagrams.
%matplotlib inline

import datetime
import imp
import itertools
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys
import uuid


class ManifestLoader(object):
    """Import-hook loader for a single module matched by ``ManifestFinder``.

    ``path`` is the manifest entry that matched and ``filename`` is the source
    file backing it; ``filename`` may be a false value (e.g. ``None``), in
    which case the module is created (or reused) without executing any source.
    """

    def __init__(self, path, filename):
        self._path = path
        self._filename = filename

    @staticmethod
    def _create_module(fullname):
        """Build an empty module, register it in sys.modules, and return it."""
        module = imp.new_module(fullname)
        module.__name__ = fullname
        # The package is taken to be the top-level component of the dotted name.
        module.__package__ = fullname.partition('.')[0]
        module.__builtins__ = __builtins__
        module.__path__ = [module.__package__]
        sys.modules[fullname] = module
        # Expose a submodule as an attribute of its (already imported) parent.
        parent_name, dot, child_name = fullname.rpartition('.')
        if dot:
            setattr(sys.modules[parent_name], child_name, module)
        return module

    def load_module(self, fullname):
        """Return the module for ``fullname``, reusing sys.modules when possible.

        When this loader has a backing file, the file is loaded with
        ``imp.load_source`` and every attribute of the result is copied onto
        the returned module.
        """
        if fullname in sys.modules:
            module = sys.modules[fullname]
        else:
            module = self._create_module(fullname)
        if not self._filename:
            return module
        module.__file__ = self._filename
        loaded = imp.load_source(fullname, self._filename)
        for attr in dir(loaded):
            setattr(module, attr, getattr(loaded, attr))
        return module


class ManifestFinder(object):
    """Meta-path finder that resolves modules through a MANIFEST file.

    Each non-blank manifest line is split on whitespace: the first field is
    the manifest-relative path and the second field, if present, is the file
    backing it.  Entries with no second field are recorded with ``None``.

    Args:
        prefixes: Path prefixes tried in order when mapping a dotted module
            name onto a manifest path; an empty prefix means "no prefix".
        manifest_filename: Path of the MANIFEST file to read.
    """

    def __init__(self, prefixes, manifest_filename):
        self._known = {}
        self._prefixes = prefixes
        with open(manifest_filename, 'r') as manifest:
            for line in manifest:
                fields = line.split()
                if not fields:
                    # Blank (or whitespace-only) line: nothing to record.
                    # Without this guard the lookup below raises IndexError.
                    continue
                self._known[fields[0]] = fields[1] if len(fields) > 1 else None

    def find_module(self, fullname, path=None):
        """Return a ``ManifestLoader`` for ``fullname``, or ``None`` if unknown.

        For each prefix, a package (``/__init__.py``) is tried before a plain
        module (``.py``); the first manifest hit wins.
        """
        components = fullname.split('.')
        for prefix in self._prefixes:
            base = '/'.join(([prefix] if prefix else []) + components)
            for ext in ('/__init__.py', '.py'):
                fname = base + ext
                if fname in self._known:
                    return ManifestLoader(fname, self._known[fname])
        return None

# One-time import-path setup.  `initialized_notebook` only exists after this
# cell has run once, so re-running the cell does not append duplicate entries
# to sys.path / sys.meta_path.
try:
    initialized_notebook
except NameError:
    runfiles = os.path.join('..', '..', '..', 'bazel-bin', 'base', 'context', 'analysis', 'analysis.runfiles')
    manifest_prefixes = ['vertexai', 'vertexai_plaidml', 'protobuf/python', 'protobuf/src', 'six_archive']
    if os.path.exists(os.path.join(runfiles, 'vertexai')) and os.path.exists(os.path.join(runfiles, 'vertexai_plaidml')):
        # The runfiles tree exists on disk: import straight from it.
        # NOTE(review): only `runfiles` itself is added here, while the MANIFEST
        # branches search under `manifest_prefixes` -- confirm imports resolve
        # in this layout.
        sys.path.append(runfiles)
    elif os.path.exists(os.path.join(runfiles, 'MANIFEST')):
        # No runfiles tree, but a MANIFEST describes where the files live.
        sys.meta_path.append(ManifestFinder(manifest_prefixes, os.path.join(runfiles, 'MANIFEST')))
    elif os.path.exists(os.path.join('..', 'MANIFEST')):
        sys.meta_path.append(ManifestFinder(manifest_prefixes, os.path.join('..', 'MANIFEST')))
    else:
        raise Exception('Please build //base/context/analysis')
    initialized_notebook = True

In [None]:
# With the Python path configured, the Bazel-built code can be loaded.
import base.context.analysis as ca

In [None]:
# Load the raw events into an analysis Scope.  The PLAIDML_EVENTLOG_FILENAME
# environment variable, when present, takes precedence over ~/eventlog.gz.
scope = ca.Scope()
eventlog_path = os.environ.get('PLAIDML_EVENTLOG_FILENAME', os.path.expanduser('~/eventlog.gz'))
scope.read_eventlog(eventlog_path)


In [None]:
# Let's build up some stats on the kernels...
# `times` maps kernel name -> (summed elapsed time, run count, kernel event uuid);
# the uuid kept is the one from the first run seen for that kernel name.
runs = [event for event in scope.events.itervalues()
        if event.verb == 'tile::hal::opencl::Executing']
times = {}
for r in runs:
    kuuid = r.ocl_runinfo.kernel_uuid
    try:
        kname = scope.events[kuuid].ocl_kernelinfo.kname
    except Exception:
        # Best effort: report runs whose kernel can't be resolved and move on.
        # (Not a bare `except:`, so Ctrl-C still interrupts the cell.)
        print('{} -> {}'.format(r, r.ocl_runinfo))
        continue
    prev = times.get(kname, (0., 0, kuuid))
    times[kname] = (r.elapsed_time + prev[0], prev[1] + 1, prev[2])


In [None]:
# Mean OpenCL kernel execution time per kernel name (summed elapsed time
# divided by run count), printed from slowest to fastest.
means = {kname: elapsed / run_count for kname, (elapsed, run_count, _) in times.items()}
knames_by_mean_runtime = sorted(means, key=means.get, reverse=True)
for kname in knames_by_mean_runtime:
    print('{}: {}'.format(kname, datetime.timedelta(seconds=means[kname])))

In [None]:
# Let's turn the top 80 mean kernel execution times into a bar graph.
top_knames = knames_by_mean_runtime[:80]
fig = plt.figure(figsize=(16, 4))
ax = fig.add_subplot(111)
ax.bar(range(len(top_knames)), [means[kname] for kname in top_knames])
plt.xticks(range(len(top_knames)), top_knames, rotation=90)
plt.xlabel('Kernel')
plt.ylabel('Mean Runtime(seconds)')
plt.title('Mean Runtimes by Kernel')
# tight_layout() fits the margins to the artists present when it is called,
# so it has to come after the axis labels and title have been added.
fig.tight_layout(pad=2.)
plt.show()

In [None]:
# Show the source of the kernel with the highest mean runtime.
slowest_mean_kname = knames_by_mean_runtime[0]
kuuid = times[slowest_mean_kname][2]  # third field of a `times` entry is the kernel event uuid
kevt = scope.events[kuuid]
kinfo = kevt.ocl_kernelinfo
kernel_source = kinfo.src.decode('string_escape')
print(kernel_source)

In [None]:
# And the program that kernel was compiled from:
program_code = kevt.hal_compilationinfo.program.code.decode('string_escape')
print('Kernel {}\'s program was:\n{}'.format(kinfo.kname, program_code))

In [None]:
# Total runtime per kernel name (the summed elapsed time from `times`),
# printed from largest to smallest.
totals = {kname: stats[0] for kname, stats in times.items()}
knames_by_total_runtime = sorted(totals, key=totals.get, reverse=True)
for kname in knames_by_total_runtime:
    print('{}: {}'.format(kname, datetime.timedelta(seconds=totals[kname])))

In [None]:
# Let's turn the top 80 total kernel execution times into a bar graph.
top_knames = knames_by_total_runtime[:80]
fig = plt.figure(figsize=(16, 4))
ax = fig.add_subplot(111)
ax.bar(range(len(top_knames)), [totals[kname] for kname in top_knames])
plt.xticks(range(len(top_knames)), top_knames, rotation=90)
plt.xlabel('Kernel')
plt.ylabel('Total Runtime(seconds)')
plt.title('Total Runtimes by Kernel')
# tight_layout() fits the margins to the artists present when it is called,
# so it has to come after the axis labels and title have been added.
fig.tight_layout(pad=2.)
plt.show()

In [None]:
# Show the source of the kernel with the highest total runtime.
slowest_total_kname = knames_by_total_runtime[0]
kuuid = times[slowest_total_kname][2]  # third field of a `times` entry is the kernel event uuid
kevt = scope.events[kuuid]
kinfo = kevt.ocl_kernelinfo
kernel_source = kinfo.src.decode('string_escape')
print(kernel_source)

In [None]:
# And the program that kernel was compiled from:
program_code = kevt.hal_compilationinfo.program.code.decode('string_escape')
print('Kernel {}\'s program was:\n{}'.format(kinfo.kname, program_code))

In [None]:
# How much temporary memory did its program use?  `alloc_sizes` is iterated
# as (size, count) pairs; the total is the sum of size * count.
alloc_sizes = kevt.hal_compilationinfo.alloc_sizes
total = sum(size * count for size, count in alloc_sizes.iteritems())
print('Kernel {}\'s program used {} bytes of temporary memory:'.format(kinfo.kname, total))
for size, count in alloc_sizes.iteritems():
    print('  {} alloc(s) of size {}'.format(count, size))

In [None]:
# Print the program text itself once more, for reference.
program_code = kevt.hal_compilationinfo.program.code.decode('string_escape')
print('Kernel {}\'s program was:\n{}'.format(kinfo.kname, program_code))

In [None]:
# It can also be interesting to look at what happens across an entire batch invocation.
# N.B. The invocation to inspect is hard-coded as index 10; change the index
# to look at a different one.
invocations = [inv for inv in scope.events.itervalues()
               if inv.verb == 'plaidml::invoker::ScheduleInvocation']
evt = invocations[10]

# Find the buffer mapping events that fall between the end of this invocation
# and the start of the next one.  The window closes at the earliest invocation
# start time later than this invocation's end; if no invocation starts later,
# it stays at the latest start time seen (or 0).
mapping_window_end = max([0] + [inv.start_time for inv in invocations])
for e in invocations:
    if evt.end_time < e.start_time < mapping_window_end:
        mapping_window_end = e.start_time
mappings = [m for m in scope.events.itervalues()
            if m.verb == 'tile::MapCurrent'
            and evt.end_time <= m.start_time
            and m.end_time <= mapping_window_end]

def depth_first_iter(event):
    """Yield ``event`` and all of its descendants in depth-first pre-order.

    Siblings are visited in ascending ``start_time`` order: they are pushed
    onto the stack latest-first so that the earliest one is popped next.
    """
    stack = [event]
    while stack:
        current = stack.pop()
        yield current
        stack.extend(sorted(current.children,
                            key=lambda child: child.start_time,
                            reverse=True))
        
def events_iter():
    """Return a fresh iterator over ``evt``'s event tree, then each mapping's tree.

    Reads the notebook-level ``evt`` and ``mappings`` when called; every call
    produces a new iterator, so the events can be walked more than once.
    """
    roots = [evt]
    roots.extend(mappings)
    return itertools.chain(*[depth_first_iter(root) for root in roots])

# First pass over the events: count them (to size the figure), give every
# distinct verb a colour index, and track the longest-running kernel.
verb_indicies = {}
event_count = 0
max_elapsed_time = 0
max_executing_kname = ''
for e in events_iter():
    event_count += 1
    if e.verb not in verb_indicies:
        index = len(verb_indicies)
        verb_indicies[e.verb] = index
    if e.verb == 'tile::hal::opencl::Executing' and e.elapsed_time > max_elapsed_time:
        try:
            executing_kname = scope.events[e.ocl_runinfo.kernel_uuid].ocl_kernelinfo.kname
        except KeyError:
            # No kernel event recorded for this run; skip it rather than abort,
            # matching the second pass below, which also tolerates this.
            continue
        max_elapsed_time = e.elapsed_time
        max_executing_kname = executing_kname

fig = plt.figure(figsize=(10, event_count / 5.))
ax = fig.add_subplot(111)

# Second pass: draw one horizontal bar per event, first event at the top,
# spanning [start_time, start_time + elapsed_time] and coloured by verb.
idx = 1
yticks = []
# plt.get_cmap() is used because matplotlib.cm.get_cmap() is gone from
# current matplotlib releases.
colormap = plt.get_cmap('jet')
norm = mpl.colors.Normalize(vmin=0, vmax=len(verb_indicies))
for e in events_iter():
    try:
        label = scope.events[e.ocl_runinfo.kernel_uuid].ocl_kernelinfo.kname + ' '
    except KeyError:
        label = ''
    ax.barh(event_count-idx,
            width=e.elapsed_time,
            left=e.start_time,
            align='center',
            color=colormap(norm(verb_indicies[e.verb])))
    yticks.append(label + '.'.join(e.verb.split('::')[-2:]))
    idx += 1
# Bars were placed top-down, so reverse the labels to line up with y = 0..n-1.
yticks.reverse()
plt.yticks(range(len(yticks)), yticks)
ax.set_ylim(bottom=-1, top=len(yticks))
# Pass the flag positionally: the `b=` keyword was renamed in newer matplotlib.
ax.grid(True)
fig.tight_layout(pad=2.)
fig.autofmt_xdate()
plt.xlabel('Timestamp (seconds)')
plt.title('Program Run')

plt.show()


In [None]:
# Just for fun, dump the source of the longest-running kernel from the plot above.
# The lookup goes through the kernel name so it is easy to swap in another
# kernel interactively: just change max_executing_kname here.
build_event_uuid = times[max_executing_kname][2]
build_event = scope.events[build_event_uuid]
print(build_event.ocl_kernelinfo.src.decode('string_escape'))

In [None]:
# Also just for fun: dump the info of the device that kernel was built for.
device_uuid = build_event.ocl_buildinfo.device_uuid
device_event = scope.events[device_uuid]
print(device_event.ocl_deviceinfo)