# Lossy compression

In [None]:
from bokeh.io import output_notebook, show
from bokeh.plotting import output_file
from RootInteractive.InteractiveDrawing.bokeh.bokehDrawSA import bokehDrawSA
from RootInteractive.InteractiveDrawing.bokeh.bokehTools import bokehDrawArray
from RootInteractive.Tools.pandaTools import initMetadata
import pandas as pd
import numpy as np
import math
import logging
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML

# Route Bokeh output to the notebook for the cells that follow.
output_notebook()
# Inject CSS so elements with the `.container` class use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

## Generate data
* A and B from normal distribution
* C from uniform \[0, 1\]
* D Bernoulli distribution

* Add derived variables - two from normal distribution, one approximately exponential depending on A,B,C,D

In [None]:
# Number of rows generated for every column.
npoints = 100000

# Independent inputs: two standard-normal columns, one uniform column on
# [0, 1) and one boolean column that is True when a uniform draw exceeds .47.
A = np.random.randn(npoints)
B = np.random.randn(npoints)
unifC = np.random.random_sample(npoints)
boolD = np.random.random_sample(npoints) > .47

# derivedE: A plus a term that is only active where boolD is True, plus
# Gaussian noise with standard deviation .1.
gatedTermE = boolD*(A*.15-B*.4+.1)
noiseE = .1*np.random.randn(npoints)
derivedE = A+gatedTermE+noiseE

# derivedF: exponential draw whose scale is the reciprocal of a strictly
# positive quantity (derivedE**2 >= 0 and sin(...)+1.4 >= 0.4).
inverseScaleF = (derivedE**2)+(np.sin(2*math.pi*unifC)+1.4)
derivedF = np.random.exponential(1/inverseScaleF)

# derivedG: linear function of A plus Gaussian noise with standard deviation 2.
noiseG = 2*np.random.randn(npoints)
derivedG = 100+15*A+noiseG

# Collect all columns in one DataFrame; column order follows the dict order.
columnData = {
    "A": A,
    "B": B,
    "unifC": unifC,
    "boolD": boolD,
    "derivedE": derivedE,
    "derivedF": derivedF,
    "derivedG": derivedG,
}
df = pd.DataFrame(columnData)

## Make figures and selection widgets

In [None]:
# Interactive parameters. Their "name" values are referenced as strings by
# figureArray, by the parameter widgets in widgetParams, and by the
# nPointRender argument of the fromArray() calls elsewhere in this notebook.
parameterArray = [
    {"name": "size", "value": 7, "range": [0, 30]},
    {
        "name": "legendFontSize",
        "value": "13px",
        "options": ["9px", "11px", "13px", "15px"],
    },
    {"name": "legendVisible", "value": True},
    {"name": "nPointRender", "range": [0, 5000], "value": 1000},
]

# Five figure descriptions (indices 0-4), each a list of column-name lists
# with an optional options dict, followed by one trailing dict of options
# whose values are parameter names from parameterArray.
figureArray = [
    [["derivedG"], ["derivedE"], {"colorZvar": "B"}],
    [["derivedE"], ["A", "B"]],
    [["unifC"], ["derivedF"], {"colorZvar": "derivedG"}],
    [["derivedF"], ["derivedG"], {"colorZvar": "derivedE", "errY": "10*A"}],
    [["A"], ["B"], {"colorZvar": "derivedF"}],
    {
        "size": "size",
        "legend_options": {
            "label_text_font_size": "legendFontSize",
            "visible": "legendVisible",
        },
    },
]

# Two named groups of figures; the integers are indices into figureArray.
# Each group ends with its own options dict.
layout = {
    "A": [
        [0, 1, 2, {"y_visible": 1, "x_visible": 1, "plot_height": 300}],
        {"plot_height": 100, "sizing_mode": "scale_width", "y_visible": 2},
    ],
    "B": [
        [3, 4, {"y_visible": 3, "x_visible": 1, "plot_height": 300}],
        {"plot_height": 100, "sizing_mode": "scale_width", "y_visible": 2},
    ],
}

# Widgets 0-6 act on DataFrame columns; the last four act on entries of
# parameterArray and carry an explicit "name" used by widgetLayoutDesc.
widgetParams = [
    ["range", ["A"]],
    ["range", ["B"]],
    ["range", ["unifC"]],
    ["multiSelect", ["boolD"]],
    ["range", ["derivedE"]],
    ["spinnerRange", ["derivedF"]],
    ["range", ["derivedG"]],
    ["toggle", ["legendVisible"], {"name": "legendVisible"}],
    ["select", ["legendFontSize"], {"name": "legendSize"}],
    ["slider", ["size"], {"name": "markerSize"}],
    ["slider", ["nPointRender"], {"name": "nPoint"}],
]

# Widget placement: "Selection" refers to widgets by index into widgetParams,
# "Graphics" refers to them by the "name" given in widgetParams.
widgetLayoutDesc = {
    "Selection": [[0, 1, 2], [3, 4], [5, 6], {"sizing_mode": "scale_width"}],
    "Graphics": [["legendVisible", "nPoint"], ["legendSize", "markerSize"]],
}

* Optimization
    * Compress the data
        * bokehDrawArray (and bokehDrawSA) take an arrayCompression parameter, which is a list of (regex, pipeline) pairs, where regex is the regular expression used to match column names
          and pipeline is a list of operations to be applied to the column. Supported operations are "relative", "delta", "code", "zip" and "base64"
        * Example: 
            ``arrayCompressionParam = [
            (".conv.Sigma.*",[("relative",7), "code", "zip"]), 
            (".delta.",[("relative",10), "code", "zip"]), 
            (".i2.",[("relative",7), "code", "zip"]), 
            (".*",[("relative",8), "code", "zip"])]``
            * Variables will be compressed in the given order. Once a variable was compressed, it will not be overwritten by another compression.
            * Tuple parameters: `(".conv.Sigma.*",[("relative",7), "code", "zip"])`
                * first parameter is a regex expression to match the column names to be compressed
                * second parameter is a list of operations to be applied to the column
                    * most relevant for the user is the first element of the list, which defines the quantization
                        * "delta": precision to be used in absolute units of the given variable, e.g. 0.0001
                        * "relative": precision to be used in units of bits, e.g. 10
                    * "code", "zip"
                        * lossless compression
                        * code - factor the column into "codes" and "factors" - two columns
                        * at the time of writing this tutorial, "code" (factoring the columns) resulted in suboptimal compression because the factors aren't encoded properly - a bug that will be fixed soon
                        * zip - compress using gzip
                    * "base64"
                        * base64 encoding - as of the current version it's automatically used where appropriate, there should be no need to use this

In [None]:
# (regex, pipeline) pairs. The patterns go from most specific to the catch-all:
# columns matching "unif.*" get "delta" with parameter .01, columns matching
# "bool.*" are only zipped, and ".*" gets "relative" with parameter 16.
unifPipeline = [("delta", .01), "zip"]
boolPipeline = ["zip"]
defaultPipeline = [("relative", 16), "zip"]
arrayCompression = [
    ("unif.*", unifPipeline),
    ("bool.*", boolPipeline),
    (".*", defaultPipeline),
]

In [None]:
# Send Bokeh output to a standalone HTML file, then build the dashboard from
# the DataFrame and the figure/widget/compression configuration defined in
# the preceding cells. The second positional argument is passed as None.
output_file("test_compression.html")
bokehDrawSA.fromArray(
    df,
    None,
    figureArray,
    widgetParams,
    layout=layout,
    widgetLayout=widgetLayoutDesc,
    nPointRender="nPointRender",
    parameterArray=parameterArray,
    arrayCompression=arrayCompression,
    useNotebook=False,
)

## Option "code" in compressArray
* Results in suboptimal compression in most cases as can be seen here

In [None]:
# Same (regex, pipeline) pairs as the previous configuration, except that the
# "unif.*" pipeline inserts a "code" step between "delta" and "zip".
unifCodePipeline = [("delta", .01), "code", "zip"]
boolPipeline = ["zip"]
defaultPipeline = [("relative", 16), "zip"]
arrayCompression = [
    ("unif.*", unifCodePipeline),
    ("bool.*", boolPipeline),
    (".*", defaultPipeline),
]
# Write this variant to its own HTML file so it can be compared with the
# output produced without the "code" step.
output_file("test_compression_code.html")
bokehDrawSA.fromArray(
    df,
    None,
    figureArray,
    widgetParams,
    layout=layout,
    widgetLayout=widgetLayoutDesc,
    nPointRender="nPointRender",
    parameterArray=parameterArray,
    arrayCompression=arrayCompression,
    useNotebook=False,
)