In [None]:
%pip install -q matplotlib==3.8.3

In [None]:
import csv
import os

from itertools import pairwise
from pathlib import Path

import numpy as np

from matplotlib import pyplot as plt
from matplotlib import colors
from matplotlib import cm
from matplotlib.backends.backend_pdf import PdfPages

from mpl_toolkits.axes_grid1 import inset_locator

In [None]:
# Input directory holding the CSV files read by `load`, and the output path of
# the PDF with all charts.  Both can be overridden via environment variables.
RESULTS = Path(os.environ.get('RESULTS', 'results/corpus'))
CHARTS = Path(os.environ.get('CHARTS', 'charts.pdf'))

# Fail early and legibly: files are looked up as `RESULTS / <name>.csv`, so the
# path must be an existing directory, not merely something that exists.
assert RESULTS.is_dir(), f'results directory not found: {RESULTS.resolve()}'

In [None]:
def load(name: str):
    """Lazily yield the records of ``RESULTS/<name>.csv``.

    Args:
        name: Base name of the CSV file inside ``RESULTS``.  The suffix is
            set to ``.csv``, replacing any suffix ``name`` already carries.

    Yields:
        One list of strings per CSV record, as produced by ``csv.reader``.

    The file stays open until the generator is exhausted or closed.
    """
    path = (RESULTS / name).with_suffix('.csv')

    # newline='' is what the csv module asks for: without it, universal
    # newline translation rewrites line endings embedded in quoted fields.
    with path.open(newline='') as file:
        yield from csv.reader(file)


# All three tables are keyed by (language, query).

# partials: every field after the language, stored in reverse file order and
# keyed by the first of them.  (Presumably the file lists the full query first
# and its prefixes after it, so the reversed list grows -- confirm against the
# CSV.)
partials = {
    (lang, prefixes[0]): prefixes[::-1]
    for lang, *prefixes in load('partials')
}

# matches: integer counts per query; rows whose count fields are all empty
# strings are skipped.
matches = {
    (lang, query): [int(count) for count in counts]
    for lang, query, *counts in load('matches')
    if any(counts)
}

# progress: integer counts per query; rows whose last count is zero are
# skipped.
progress = {
    (lang, query): [int(count) for count in counts]
    for lang, query, *counts in load('progress')
    if int(counts[-1]) != 0
}

In [None]:
# `$_ is tokenized by Pygments as three (instead of 2) tokens :/
#
# Zero entries in `progress` are treated as unrecognized tokens: warn about
# them, then remove the entry together with the prefix at the same index in
# `partials`, so that both lists stay aligned.

for k in progress:
    drop = [i for i, t in enumerate(progress[k]) if not t]
    if not drop:
        continue

    # A dropped token starts where the previous prefix ends.  The very first
    # one starts at offset 0; indexing with i - 1 there would wrap around to
    # the last prefix and report a bogus offset.
    offsets = [len(partials[k][i - 1]) if i else 0 for i in drop]
    print('warning: dropping unrecognized tokens at offsets:',
          *offsets, '\tfor', k[1])

    partials[k] = [p for i, p in enumerate(partials[k]) if i not in drop]
    progress[k] = [p for i, p in enumerate(progress[k]) if i not in drop]

In [None]:
# Base figure width and the height/width ratio shared by all figures.
aspect = 9 / 16
width = 8.5


def size(w):
    """Return the ``(width, height)`` figure size for a figure of width ``w``."""
    height = w * aspect
    return w, height


# The two figure sizes used below: full width and just under half of it.
full, small = size(width), size(width * .49)

# Defaults applied to every figure created from here on.
plt.rcParams.update({
    'figure.autolayout': True,  # apply tight layout automatically
    'pdf.fonttype': 42,         # embed fonts as TrueType (Type 42) in PDFs
})

# Figures are appended here as they are drawn and written out at the end.
paper = []

In [None]:
# One row per query from `matches`.  Judging by the axis labels below, the
# columns hold the matches found only by stsearch (0), by both tools (1) and
# only by Semgrep (2) -- TODO confirm against the CSV layout.
diffs = np.array([*matches.values()])

# Per tool: the total of its matches and the percentage of them that the
# other tool does not report.
stsearch = diffs[:, 0] + diffs[:, 1]
included = 100 * diffs[:, 0] / stsearch

semgrep = diffs[:, 1] + diffs[:, 2]
excluded = 100 * diffs[:, 2] / semgrep

# One small scatter chart per tool, one point per query.
for total, rate, tool, name in (
    (semgrep, excluded, 'Semgrep', 'excluded in stsearch'),
    (stsearch, included, 'stsearch', 'not in Semgrep'),
):
    fig = plt.figure(figsize=small)
    ax = fig.add_subplot()

    ax.set(ylabel=f'% {name}', xlabel=f'# {tool} unique matches')
    ax.tick_params(axis='both', which='major')
    ax.set_xscale('log')

    ax.scatter(total, rate, alpha=.5, color='blue', label='query')
    ax.legend()

    paper.append(fig)

In [None]:
# Sanity check: within each query's sequence in `progress`, a count is never
# followed by a larger one.
assert not any(
    earlier < later
    for counts in progress.values()
    for earlier, later in pairwise(counts)
)


def toks(prefixes: list[str], init=''):
    """Split a chain of growing prefixes into the piece added at each step.

    Args:
        prefixes: Strings in which each one starts with the one before it
            (the first must start with ``init``).
        init: Text considered already consumed before the first prefix.

    Yields:
        For every prefix, the suffix it adds to its predecessor.

    Raises:
        AssertionError: If a prefix does not start with the previous one.
    """
    consumed = init
    for prefix in prefixes:
        assert prefix.startswith(consumed)
        start = len(consumed)
        yield prefix[start:]
        consumed = prefix


# Parallel per-query arrays, in the iteration order of `progress`.
queries = np.array([*progress], dtype=object)
lengths = np.array([len(vs) for vs in progress.values()])
final = np.array([vs[-1] for vs in progress.values()])

# Rectangular (query x token position) matrices.  Positions past a query's
# own length keep their fill value: '' for tokens, NaN for results, False for
# complete.
shape = (len(progress), lengths.max())
tokens = np.full(shape, '', dtype=object)
results = np.full(shape, np.nan)
complete = np.zeros(shape, dtype=bool)

for i, (q, vs) in enumerate(progress.items()):
    tokens[i, :lengths[i]] = [*toks(partials[q])]
    results[i, :lengths[i]] = vs
    # Flag the position of the query's last token.
    complete[i, lengths[i] - 1] = True

# lexsort treats its last key as the primary one, i.e. the last token column.
# Shorter queries hold '' there, which sorts first -- so this also implies
# sorting by length!
order = np.lexsort(tokens.T)

# Apply the same row permutation to every per-query array.
queries, lengths, final = queries[order], lengths[order], final[order]
tokens, results, complete = tokens[order], results[order], complete[order]

In [None]:
# "Previous" count for every cell: the count one token earlier and, for the
# first token, the largest count found anywhere in `results`.
firsts = np.full(len(results), np.nanmax(results))
previous = np.hstack((firsts[:, None], results[:, :-1]))

# Fraction of the previous count that is gone after adding this token.
selective = 1 - (results / previous)
empty = np.full_like(selective, np.False_, dtype=bool)

# Ratio of each query's final count to the count at every prefix.
partial = final[:, None] / results

heatmaps = []
for data, progression, mask, color in [
    (selective, ['whitesmoke', 'xkcd:violet'], empty, 'gray'),
    (partial, ['whitesmoke', 'green'], complete, 'blue'),
]:
    # Validate the array drawn in this iteration (not a fixed one), so that
    # both `selective` and `partial` are checked against their mask.
    assert results.shape == data.shape == mask.shape

    colormap = colors.LinearSegmentedColormap.from_list('Quality', progression)

    # `data` is passed to the colormap as-is (no normalization); cells
    # selected by `mask` are then overpainted with a flat color.
    heatmap = colormap(data)
    heatmap[mask] = colors.to_rgba(color)
    heatmaps.append(heatmap)

    fig = plt.figure(figsize=full)
    ax = fig.add_subplot()

    ax.imshow(heatmap, interpolation='nearest', aspect='auto')

    # Major x ticks label the columns 1..n; all tick marks are hidden.
    ax.set_xticks(ticks := np.arange(heatmap.shape[1]))
    ax.set_xticklabels([f'{t+1}' for t in ticks])
    ax.tick_params(left=False, labelleft=False, bottom=False)

    # Minor ticks sit on the cell borders and only carry the white grid that
    # separates the cells.
    ax.set_xticks(np.arange(-.5, heatmap.shape[1]+.5), minor=True)
    ax.set_yticks(np.arange(-.5, heatmap.shape[0]+.5), minor=True)
    ax.tick_params(which='minor', left=False, bottom=False)
    ax.grid(which='minor', color='w', linestyle='-')

    ax.set_ylabel('each query, sorted by tokens')
    ax.set_xlabel('searched prefix token length')

    # Horizontal colorbar inset into the plot.  It is normalized to 0..100 so
    # that its ticks can be labelled as percentages.
    cax = inset_locator.inset_axes(ax, width='40%', height='5%', borderpad=2)
    bar = fig.colorbar(cm.ScalarMappable(colors.Normalize(0, 100), colormap),
                       cax=cax, orientation='horizontal')

    # Pin the automatically chosen ticks before relabelling them.
    bar.set_ticks(ticks := bar.get_ticks())  # type: ignore
    bar.set_ticklabels([f'{t:.0f}%' for t in ticks])

    # No frame around either the heatmap or the colorbar.
    for axis in (ax, cax):
        for spine in axis.spines.values():
            spine.set_visible(False)

    paper.append(fig)

In [None]:
# Write every collected figure to CHARTS, one page per figure, in the order
# the figures were appended to `paper`.

# CHARTS may be overridden to point into a directory that does not exist yet.
CHARTS.parent.mkdir(parents=True, exist_ok=True)

with PdfPages(CHARTS) as pdf:
    for fig in paper:
        pdf.savefig(fig)