# Table segment annotation
The input to this notebook is the `ETK` output with `table extraction` enabled.

## Reading tables
This part uses `Spark` to read the ETK output (one JSON document per line) and collects all extracted tables into memory.

In [1]:
from pyspark import SparkConf, SparkContext

# Run Spark locally on all available cores. getOrCreate() reuses a context that
# is already running, so re-executing this cell on a live kernel does not fail
# with "Cannot run multiple SparkContexts at once".
config = SparkConf().setAppName("table_node2vec").setMaster('local[*]')
sc = SparkContext.getOrCreate(conf=config)

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
import json
from jsonpath_ng import parse

# JSONPath that selects every table stored under content_extraction.table.tables.
tablepath = parse('$.content_extraction.table.tables[*]')

# Parse each input line as JSON, pull out all matching tables and gather them
# into a plain Python list on the driver.
tables = (
    sc.textFile('/Users/majid/DIG/data/elicit_data/')
    .map(json.loads)
    .flatMap(lambda doc: [found.value for found in tablepath.find(doc)])
    .collect()
)


## Initialize annotations and annotation GUI

In [32]:
# One annotation record per table. Every block starts unset and is filled in
# by the annotation GUI; `fingerprint` is copied from the table itself.
table_annotations = [
    {
        'header': None,
        'data': None,
        'metadata': None,
        'time': None,
        'agg': None,
        'annotated': False,
        'fingerprint': table['fingerprint'],
    }
    for table in tables
]

In [37]:
import pandas as pd
import numpy as np
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
from ipywidgets import widgets, Layout

init_notebook_mode(connected=True)

# --- mutable GUI state, shared with the callbacks below via `global` ---
table_ind = 0          # index of the table currently displayed
colors_text = None     # per-cell fill colours, rebuilt by set_colors()
df = None              # DataFrame parsed from the current table's HTML
indices_array = None   # object array whose [i, j] entry is the tuple (i, j)
fig = None             # figure dict most recently handed to iplot()

# --- navigation widgets ---
next_btn = widgets.Button(description="next")
prev_btn = widgets.Button(description="prev")
table_ind_disp = widgets.IntText(
    value=table_ind,
    description="table index: ",
    layout=Layout(width='150px', height='30px'),
)
navigation = widgets.VBox([table_ind_disp, prev_btn, next_btn])

# --- annotation widgets ---
# NOTE: table_type_radio is created but not placed in any container below.
table_type_radio = widgets.RadioButtons(options=['regular', 'time series'])
header_area_ind = widgets.Text(description='header:')
data_area_ind = widgets.Text(description='data:')
time_area_ind = widgets.Text(description='time:')
agg_area_ind = widgets.Text(description='aggregate:')
metadata_area_ind = widgets.Text(description='metadata:')
apply_btn = widgets.Button(description="apply")
annotation_regular = widgets.HBox([
    widgets.VBox([header_area_ind, data_area_ind, metadata_area_ind, apply_btn]),
    widgets.VBox([time_area_ind, agg_area_ind]),
])

# --- overall layout: toolbox on top, rendered table goes into `ow` ---
ow = widgets.Output()
toolbox = widgets.HBox([navigation, annotation_regular])
layout = go.Layout()

def _paint_blocks(spec, color):
    """Set the fill colour of every cell selected by `spec` to `color`.

    `spec` is one or more numpy-style index expressions separated by ';'
    (e.g. ``'0;2'`` or ``'0:5,1:4'``). Each expression is applied to the global
    `indices_array`, whose entries are ``(row, col)`` tuples, and the matching
    entries of the global `colors_text` are overwritten in place.
    """
    for block in spec.split(';'):
        if not block.strip():
            # An empty segment (e.g. from a trailing ';') would make eval()
            # raise SyntaxError on 'indices_array[]', so skip it.
            continue
        # NOTE(review): eval() executes whatever was typed into the annotation
        # text field. Acceptable for a local, single-user annotation tool, but
        # never feed it annotation strings from an untrusted source.
        indices = eval('indices_array[' + block + ']')
        if isinstance(indices, tuple):
            # A single cell was selected: indexing returned the tuple itself.
            indices = [indices]
        elif indices.ndim > 1:
            # A 2-D block was selected: flatten it to a 1-D run of tuples.
            indices = indices.flatten()
        for row, col in indices:
            colors_text[row][col] = color


def set_colors(update=False):
    """Rebuild the global `colors_text` grid for the current table.

    Every cell starts out 'white'. If the current table has been annotated,
    its blocks are painted in a fixed order (header, data, metadata, time,
    aggregate) so that a later block overrides an earlier one where they
    overlap.

    Parameters
    ----------
    update : bool
        Unused; kept so existing callers keep working.
    """
    global colors_text
    colors_text = [['white'] * df.shape[1] for _ in range(df.shape[0])]

    annotation = table_annotations[table_ind]
    if not annotation['annotated']:
        return

    # Painting order matters: 'time' may overlap 'header', 'agg' may overlap 'data'.
    block_colors = (
        ('header', 'cyan'),
        ('data', 'LightGreen'),
        ('metadata', 'Khaki'),
        ('time', 'blueviolet'),
        ('agg', 'olive'),
    )
    for key, color in block_colors:
        # .get() tolerates records that lack a key; a missing key is treated as unset.
        spec = annotation.get(key)
        if spec is not None:
            _paint_blocks(spec, color)
        
def plot(update=False):
    """Render the current table (`tables[table_ind]`) into the output widget.

    Parameters
    ----------
    update : bool
        False (default): build a fresh figure for the table.
        True: re-colour the existing figure after an annotation change. If no
        figure or index grid exists yet, a fresh figure is built instead.

    Side effects: rebinds the globals `df`, `indices_array` and `fig`, and
    (through set_colors) `colors_text`.
    """
    global df
    global indices_array
    global fig
    from io import StringIO

    # Passing a literal HTML string to read_html is deprecated in recent
    # pandas; wrapping it in StringIO works on old and new versions alike.
    df = pd.read_html(StringIO(tables[table_ind]['html']), match='.*')[0]

    rebuild = (
        not update
        or fig is None
        or indices_array is None
        or indices_array.shape != df.shape
    )
    if rebuild:
        # indices_array[i, j] holds the tuple (i, j), so slicing it with an
        # annotation expression yields the coordinates of the selected cells.
        indices_array = np.zeros(df.shape, dtype=object)
        for i in range(indices_array.shape[0]):
            for j in range(indices_array.shape[1]):
                indices_array[i, j] = (i, j)

    set_colors()
    # colors_text is row-major; it is transposed for the Table fill, which
    # receives its values one column at a time.
    trace = go.Table(cells=dict(values=[df[x] for x in df.columns],
                                fill=dict(color=np.array(colors_text).T)),
                     header=dict(values=list(range(df.shape[1])), line=dict(width=0)))
    with ow:
        if rebuild:
            fig = dict(data=[trace], layout=layout)
            iplot(fig, filename='my plot')
        else:
            fig.update(data=[trace])
            iplot(fig)

def goto_next(b):
    """Button callback: show the next table.

    `b` is the clicked button (unused). Does nothing when the last table is
    already shown; without this guard the index would run past the end of
    `tables` and plot() would raise IndexError.
    """
    global table_ind
    if table_ind >= len(tables) - 1:
        return
    table_ind += 1
    ow.clear_output()
    plot()
    table_ind_disp.value = table_ind
    
def goto_prev(b):
    """Button callback: show the previous table.

    `b` is the clicked button (unused). Does nothing when the first table is
    already shown; without this guard the index would go negative and silently
    wrap around to the end of `tables`.
    """
    global table_ind
    if table_ind <= 0:
        return
    table_ind -= 1
    ow.clear_output()
    plot()
    table_ind_disp.value = table_ind
    
def apply_annotation(b):
    """Button callback: store the text fields as the current table's annotation.

    `b` is the clicked button (unused). Each annotation key is set to the text
    of its field, or to None when the field is empty. The record is flagged as
    annotated when at least one field is non-empty, and the table is redrawn
    in update mode.
    """
    record = table_annotations[table_ind]
    field_widgets = (
        ('header', header_area_ind),
        ('data', data_area_ind),
        ('metadata', metadata_area_ind),
        ('time', time_area_ind),
        ('agg', agg_area_ind),
    )

    any_filled = False
    for key, widget in field_widgets:
        text = widget.value
        if text == '':
            record[key] = None
        else:
            record[key] = text
            any_filled = True
    record['annotated'] = any_filled

    ow.clear_output()
    plot(True)
        
    
# Hook each button up to its click handler.
next_btn.on_click(goto_next)
prev_btn.on_click(goto_prev)
apply_btn.on_click(apply_annotation)


## Annotate Tables
On the left side there are table navigation buttons: 
- `table index`: current table index
- `next`, `prev`: go to the next/previous table

On the right side, there are annotation fields:
- `header`: header block (column or row header cells, can be multiple rows/cols)
- `data`: data block
- `metadata`: metadata block (table information such as title, table notes, etc.)
- `time`: time block (used when the table is a time series). It can overlap the `header` block.
- `aggregate`: aggregate block (e.g. totals). It can overlap the `data` block.

The annotations are in numpy slicing format, some examples shown below:
- first row: `0`
- first column: `:,0`
- cell at third row, second column: `2,1`

Multiple blocks can be given in one field by separating them with `;`:
- first row and third row: `0;2`
- rows 0–4 and rows 8–9, columns 1–3 in both: `0:5,1:4;8:10,1:4`

Pressing `apply` saves the annotation for the current table in the `table_annotations` list. This list must be written to disk once the annotations are finished (see the last section).

In [38]:
# Show the toolbox and the output area, then draw the current table into it.
display(toolbox, ow)
plot()

In [62]:
# Inspect the annotation record stored for the table currently shown in the GUI.
table_annotations[table_ind]

{'header': '1', 'data': '2:', 'metadata': '0,0', 'annotated': True}

### Write the annotations to file

In [18]:
# Write one JSON object per line. The `with` block closes the file even if
# serialising or writing one of the records raises part-way through.
with open('/Users/majid/Desktop/elicit_annotations.jl', 'w') as outfile:
    for annotation in table_annotations:
        outfile.write(json.dumps(annotation) + '\n')