In [3]:
import tarfile
import os

# Maps a file extension (without the dot, case-sensitive) to the name of the
# compression format used for that file. Both 'ppm' and 'pgm' map to 'raw'.
format_by_extension = {
    "JP2": "JPEG2000",
    "jxl": "JpegXL",
    "ppg": "PNG",
    "pzp": "DEFLATE",
    "pzs": "ZSTD",
    "pq3": "QB3",
    "qb3": "QB3+",
    "lrc": "LERC",
    "webp": "WEBP",
    "ppm": "raw",
    "pgm": "raw",
}

# Sequence of formats for timing output, in the order they are applied.
# The trailing 'Err' entry is what the timing parser lands on after the last
# real format, so it never names an actual compression.
seq = (
    "JPEG2000",
    "JpegXL",
    "WEBP",
    "QB3",
    "QB3+",
    "LERC",
    "ZSTD",
    "DEFLATE",
    "PNG",
    "Err",
)

def gettime(val: str) -> float:
    """Convert a timing token to seconds.

    Accepted forms:
      * ``"<number>ms"``   -- milliseconds, e.g. ``"250ms"`` -> ``0.25``
      * ``"<min>m<sec>s"`` -- minutes and seconds, e.g. ``"1m2.5s"`` -> ``62.5``
      * ``"<sec>s"``       -- seconds only, e.g. ``"1.5s"`` -> ``1.5``

    The trailing ``s`` of the seconds part is optional.

    Raises:
        ValueError: if a numeric part cannot be parsed as a float.
    """
    if 'ms' in val:
        return float(val.split('ms')[0]) / 1000

    minutes, has_minutes, seconds = val.partition('m')
    if not has_minutes:
        # No minutes field at all: the whole token is the seconds part.
        minutes, seconds = '0', val
    # Only strip the unit when it is really there, so a bare number does not
    # lose its last digit.
    if seconds.endswith('s'):
        seconds = seconds[:-1]
    return float(minutes) * 60 + float(seconds)

def _parse_sizes(lines, images: dict) -> None:
    """Record file sizes from a listing into ``images[imgname][format]['size']``.

    A usable line has exactly nine whitespace-separated fields, with the size
    in the fifth field and the file name in the last one (this looks like
    ``ls -l`` output -- TODO confirm). Lines of any other shape, or whose size
    is not an integer, are skipped. Files whose extension is not listed in
    ``format_by_extension`` are ignored. 'raw' entries also get a 'time' of 0.0.
    """
    for line in lines:
        try:
            _, _, _, _, size, _, _, _, fn = line.decode('utf-8').strip().split()
            size = int(size)
        except ValueError:
            # Wrong field count, non-numeric size or undecodable bytes
            # (UnicodeDecodeError is a ValueError): not a data line.
            continue
        imgname, ext = os.path.splitext(fn)
        comp = format_by_extension.get(ext[1:])
        if comp is None:
            continue
        entry = images.setdefault(imgname, {}).setdefault(comp, {})
        entry['size'] = size
        if comp == "raw":
            entry['time'] = 0.0


def _parse_timing(lines, images: dict) -> None:
    """Record timings from a timing log into ``images[imgname][format]['time']``.

    A line containing '.ppm' or '.pgm' starts a new image; the image name is
    that line without its extension. Each following line containing 'user' or
    'MRF_Timing' carries one timing in its last whitespace-separated token.
    Timings are assigned to the formats of ``seq`` in order, skipping formats
    that have no entry for the image (WEBP could be missing). Timing lines
    seen before any image line, or after every known format of the image has
    been consumed, are ignored.
    """
    imgname = None
    pos = 0
    for line in lines:
        ln = line.decode("utf-8").strip()
        if ".ppm" in ln or '.pgm' in ln:
            imgname = os.path.splitext(ln)[0]
            # Make sure the image has an entry even if the listing lacked it
            images.setdefault(imgname, {})
            # Restart at the first format of the sequence ("JPEG2000")
            pos = 0
        if "user" in ln or 'MRF_Timing' in ln:
            if imgname is None:
                continue
            formats = images[imgname]
            # Skip formats without a size entry, e.g. a missing WEBP
            while pos < len(seq) and seq[pos] not in formats:
                pos += 1
            if pos >= len(seq):
                continue
            formats[seq[pos]]['time'] = gettime(ln.split()[-1])
            pos += 1


def get_data(filename: str) -> dict:
    """Load sizes and timings from a gzip-compressed results tarball.

    Only regular members whose name contains "convert_" are read. The first
    path component of the member name is used as the image type. Members with
    "sizes" in the name provide file sizes, members with "timing" in the name
    provide timings; timings are only attached to formats that already have
    an entry, so the sizes member has to be read before the timing member.

    Args:
        filename: path of the ``.tgz`` archive.

    Returns:
        Nested dict ``data[img_type][imgname][format]`` holding a 'size'
        (int) and, when a timing was found, a 'time' in seconds (float).
    """
    data = {}
    with tarfile.open(filename, 'r:gz') as tar:
        for member in tar.getmembers():
            if not (member.isfile() and "convert_" in member.name):
                continue
            images = data.setdefault(member.name.split("/")[0], {})

            # Sizes come first
            if "sizes" in member.name:
                with tar.extractfile(member) as f:
                    _parse_sizes(f, images)

            if "timing" in member.name:
                with tar.extractfile(member) as f:
                    _parse_timing(f, images)
    return data

# Compression by format

In [None]:
# Relative compression by format
import matplotlib.pyplot as plt

# One grouped bar chart per image type: compressed size relative to the raw
# size, one group per image and one bar per compression format.
# NOTE(review): `data` is not assigned in the visible cells; presumably it is
# the result of a get_data() call -- confirm before Restart & Run All.
for tp, images in data.items():
    plt.figure(figsize=(20, 10))
    names = list(images)
    # The formats are taken from the first image only; 'raw' is the reference
    comps = [c for c in images[names[0]] if c != 'raw']
    rawsize = [images[name]['raw']['size'] for name in names]
    bar_width = 1 / (3 + len(comps))
    # Offset of the first bar, so that each group is centered on its tick
    bwo = - bar_width * (len(comps) - 1) / 2

    for slot, comp in enumerate(comps):
        sizes = [images[name][comp]['size'] / raw for name, raw in zip(names, rawsize)]
        positions = [bwo + idx + slot * bar_width for idx in range(len(names))]
        plt.bar(positions, sizes, width=bar_width, label=comp)

    plt.xticks(list(range(len(names))), names, rotation=45)

    plt.title(f"{tp} compression")
    plt.legend()

# Compression speed in MB/sec

In [None]:
# Compression speed in MB/s
import matplotlib.pyplot as plt
import math

# One grouped bar chart per image type: raw megabytes processed per second,
# one group per image and one bar per compression format.
# NOTE(review): `data` is not assigned in the visible cells; presumably it is
# the result of a get_data() call -- confirm before Restart & Run All.
for tp, images in data.items():
    fig, ax = plt.subplots(figsize=(20, 10))
    # A logarithmic scale makes it look balanced but gives the wrong
    # impression of the speedup, so the y axis stays linear:
    # ax.set_yscale('log', base = 2)
    ax.grid(axis='y', which='major', linestyle='-', linewidth=1, color='gray')
    # ax.grid(axis='y', which='minor', linestyle='--')
    ax.set_axisbelow(True)

    names = list(images)
    # The formats are taken from the first image only; 'raw' is the reference
    comps = [c for c in images[names[0]] if c != 'raw']
    rawsize = [images[name]['raw']['size'] for name in names]
    # Raw size in units of 1,000,000 bytes
    factors = [r / 1000000 for r in rawsize]

    bar_width = 1 / (3 + len(comps))
    # Offset of the first bar, so that each group is centered on its tick
    bwo = - bar_width * (len(comps) - 1) / 2

    for slot, comp in enumerate(comps):
        # Despite the name these are speeds: raw size divided by the time
        times = [mb / images[name][comp]['time'] for name, mb in zip(names, factors)]
        positions = [bwo + idx + slot * bar_width for idx in range(len(names))]
        ax.bar(positions, times, width=bar_width, label=comp)

    ax.set_xticks(list(range(len(names))), names, rotation=45)
    ax.set_title(f"{tp} compression speed, MB/sec")
    ax.legend()

# Scatter plot

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of compression speed vs compression ratio
def plot_comp(data, tp, label = 'x86', skip = ['artificial', 'fireworks']):
    """Scatter plot of compression speed versus compression ratio.

    Draws one point per image and compression format for the image type
    ``tp``, with one color and legend entry per format. Both axes use a
    base 2 logarithmic scale. The x axis is the compressed/raw size ratio and
    is inverted, so smaller ratios are drawn further to the right. The y axis
    is the raw size in units of 1,000,000 bytes divided by the time.

    Args:
        data: nested dict ``data[tp][imgname][format]`` with 'size' and
            'time' entries, as returned by get_data().
        tp: image type to plot, a key of ``data``.
        label: text appended to the figure title.
        skip: image names left out of the plot. It is only read, never
            modified.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    # Without this the figure is transparent outside of the plot itself
    fig.patch.set_alpha(1)
    ax.grid(axis='y', which='major', linestyle='-', linewidth=1, color='gray')
    ax.set_axisbelow(True)

    # Speed axis, ticks at the powers of two from 1 to 512
    ax.set_yscale('log', base=2)
    ax.set_yticks([2**i for i in range(10)], [f"{2**i}" for i in range(10)])
    ax.set_ylabel("Compression speed MB/s")

    # Ratio axis, ticks every quarter power of two, labeled as percentages.
    # Alternative labeling as N:1 ratios:
    # ax.set_xticks([2**-i for i in range(10)], [f"{2**i}:1" for i in range(10)])
    ax.set_xscale('log', base=2)
    ax.set_xticks([2**-(i/4) for i in range(10)], [f"{100/2**(i/4):.2f}%" for i in range(10)])
    ax.set_xlabel("Compression ratio (compressed/raw)")
    ax.invert_xaxis()

    images = data[tp]
    names = list(images)
    # The formats are taken from the first image only; 'raw' is the reference
    comps = [c for c in images[names[0]] if c != 'raw']
    rawsize = [images[name]['raw']['size'] for name in names]
    # Raw size in units of 1,000,000 bytes
    factors = [r / 1000000 for r in rawsize]

    for comp in comps:
        # Foveon compresses well on l16
        speeds = [mb / images[name][comp]['time']
                  for name, mb in zip(names, factors) if name not in skip]
        ratios = [images[name][comp]['size'] / raw
                  for name, raw in zip(names, rawsize) if name not in skip]
        ax.scatter(ratios, speeds, 300, label=comp)

    # ax.set_xticks(list(range(len(names))), names, rotation=45)
    ax.set_title(f"{tp} images, {label}")
    ax.legend()

# Load the result archives, one per machine.
x86_data = get_data('AMD7R32-results.tgz')
g2_data = get_data('G2-results.tgz')
g3_data = get_data('G3-results.tgz')

# Image types to plot from the x86 results, each with the extra keyword
# arguments for plot_comp. The 'g' types also leave out 'flower_foveon'.
plot_jobs = (
    ('8bit', {}),
    ('16bit', {}),
    ('l16bit', {}),
    ('g8bit', {'skip': ['artificial', 'fireworks', 'flower_foveon']}),
    ('g16bit', {'skip': ['artificial', 'fireworks', 'flower_foveon']}),
    ('lg16bit', {'skip': ['artificial', 'fireworks', 'flower_foveon']}),
)
for tname, options in plot_jobs:
    plot_comp(x86_data, tname, **options)
