In [None]:
%load_ext autoreload

%autoreload 2

In [None]:
import sys
sys.path.append('../')

### DataTable

In [None]:
import pandas as pd
import random

In [None]:
# toy data: 100 random integers, each drawn from 0..100 (both ends inclusive)
df = pd.DataFrame(
    {"value": [random.randint(0, 100) for _ in range(100)]}
)

In [None]:
# Row count, used below to scale ranks relative to the size of the frame.
num_rows = df.shape[0]

# Rank the values once and reuse the result for `alpha`
# (previously the column was ranked a second time).
df['rank'] = df['value'].rank()
# Linear map from rank to 0.1 + 0.8 * rank / num_rows, written as vectorized
# Series arithmetic instead of a per-element apply(lambda ...); the values are
# unchanged. Presumably `alpha` is meant as an opacity -- confirm with its consumer.
df['alpha'] = 0.1 + 0.8 * (df['rank'] / num_rows)

In [None]:
# Preview the first rows, including the derived `rank` and `alpha` columns.
df.head()

In [None]:
from bokeh.models import Legend, LegendItem
from bokeh.plotting import figure, show, output_notebook
output_notebook()

In [None]:
from bokeh.io import output_notebook, show
output_notebook()

from random import randint
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn, HTMLTemplateFormatter
from bokeh.palettes import Category10

# Demo columns for the table (ten entries each):
#   cola - random integers from 0..100
#   colb - the 10-entry Category10 palette
#   colc - the HTML entity &#9608; repeated
data = {
    "cola": [randint(0, 100) for _ in range(10)],
    "colb": Category10[10],
    "colc": ['&#9608;'] * 10,
}

source = ColumnDataSource(data)

# HTML template applied to each formatted cell: the paragraph's text colour is
# taken from the row's `colb` field, and the content is the &#9608; entity
# followed by `value` (the cell's own value, per the Bokeh formatter docs).
template="""                
            <p style="color:<%= colb %>;"> 
                <%= "&#9608;" + value %>
            </p>
            """
formatter =  HTMLTemplateFormatter(template=template)

# CL1 shows the plain integers; CL2 and CL3 are rendered through `formatter`.
columns = [
    TableColumn(field="cola", title="CL1", width=100),
    TableColumn(field="colb", title="CL2", formatter=formatter, width=100),
    TableColumn(field="colc", title="CL3", formatter=formatter, width=5),
]

data_table = DataTable(
    source=source,
    columns=columns,
    fit_columns=True,
    selectable=True,
    sortable=True,
    width=400,
    height=400,
)

show(data_table)

### Experimental Recipe

`EXPORT_PATH` is the file used for saving/loading a pre-processed dataset.

This helps skip potentially time-consuming operations like vectorization/UMAP.

In [None]:
import pandas as pd
from hover.core.dataset import SupervisableTextDataset

# CSV file that the pre-processed dataset is loaded from / saved to
EXPORT_PATH = "example-dataset.csv"

In [None]:
# Load the previously exported dataset when EXPORT_PATH points at a real file.
# When it does not, the load is skipped with a message instead of raising
# FileNotFoundError, so the notebook still runs top to bottom on a fresh checkout.
from pathlib import Path

if Path(EXPORT_PATH).is_file():
    dataset = SupervisableTextDataset.from_pandas(pd.read_csv(EXPORT_PATH))
else:
    print(f"{EXPORT_PATH} not found; skipping the pre-processed dataset.")

In [None]:
from faker import Faker
import random

# ---- fake data for illustration ----
# Faker instance created with the "en" locale; the generators below draw from it
fake_en = Faker("en")

def random_text():
    """Return one fake paragraph, as produced by `fake_en.paragraph(3)`."""
    paragraph = fake_en.paragraph(3)
    return paragraph

def random_raw_data():
    """Build one unlabeled record: a dict with random text under "content"."""
    record = {"content": random_text()}
    return record

def random_labeled_data():
    """Build one labeled record: random text under "content" and a "mark" of "A" or "B"."""
    content = random_text()
    mark = random.choice(["A", "B"])
    return {"content": content, "mark": mark}

# -----------------------------------

# Assemble the dataset from fake records: an unlabeled raw pool plus the
# optional labeled train / dev / test subsets.
dataset = SupervisableTextDataset(
    raw_dictl=[random_raw_data() for _ in range(3000)],
    train_dictl=[random_labeled_data() for _ in range(500)],
    dev_dictl=[random_labeled_data() for _ in range(50)],
    test_dictl=[random_labeled_data() for _ in range(50)],
    # point these at the text field and the label field of your own records
    feature_key="content",
    label_key="mark",
)

# every subset is kept in its own DataFrame under dataset.dfs; peek at the raw one
dataset.dfs["raw"].head(5)

In [None]:
import spacy
import re

# Load the spaCy pipeline once at cell level; `vectorizer` below calls it for every text.
# NOTE: the "en_core_web_md" model package must already be installed for this to succeed.
nlp = spacy.load("en_core_web_md")

def vectorizer(text):
    """
    Turn a piece of text into the `.vector` of its spaCy document.

    Runs of whitespace are collapsed into single spaces first, and all
    components listed in `nlp.pipe_names` are disabled for the call.
    """
    normalized = re.sub(r"[\s]+", r" ", text)
    doc = nlp(normalized, disable=nlp.pipe_names)
    return doc.vector

# try the vectorizer on the first raw example and report what came out
text = dataset.dfs["raw"].loc[0, "text"]
vec = vectorizer(text)
print(f"Text: {text}", f"Vector shape: {vec.shape}", sep="\n")

In [None]:
# Compute a 2D embedding of the dataset using `vectorizer` and the "umap" reduction.
# Any extra kwargs will be passed on to the corresponding reduction:
# for umap: https://umap-learn.readthedocs.io/en/latest/parameters.html
# for ivis: https://bering-ivis.readthedocs.io/en/latest/api.html
dataset.compute_2d_embedding(vectorizer, "umap")

# The call above adds 'x' and 'y' columns to the DataFrames in dataset.dfs.
# Alternatively, these columns could be pre-computed using any other approach.
dataset.dfs["raw"].head(5)

In [None]:
# Uncomment to save the processed dataset to EXPORT_PATH so that later sessions can reload it:
#dataset.to_pandas().to_csv(EXPORT_PATH, index=False)

In [None]:
from hover.core.neural import VectorNet, MultiVectorNet
from hover.utils.common_nn import MLP, LogisticRegression

def vecnet_callback(dataset):
    """
    Build a vectorizer + neural network model for `dataset`.

    Two LogisticRegression-based VectorNets are constructed (with "LR1.pt" and
    "LR2.pt"), but only the first one is returned; wrapping both in a
    MultiVectorNet is the commented-out alternative.
    """
    candidates = [
        VectorNet(vectorizer, LogisticRegression, pt_file, dataset.classes)
        for pt_file in ("LR1.pt", "LR2.pt")
    ]
    #return MultiVectorNet(candidates)
    first_net, _ = candidates
    return first_net

vecnet = vecnet_callback(dataset)

# predict_proba takes either a single string or a list of strings:
# text -> vector -> class probabilities
for query in (text, [text]):
    print(vecnet.predict_proba(query))

In [None]:
from hover.recipes.experimental import active_learning
from bokeh.io import show, output_notebook

# Build the active-learning app; it receives a copy of the dataset (not `dataset`
# itself) together with the model factory `vecnet_callback`.
handle = active_learning(dataset.copy(), vecnet_callback)

# Switch Bokeh to notebook output, then render the app in this cell.
output_notebook()
show(handle)

In [None]:
import numpy as np

# small 1-D integer array; no other visible cell reads `arr`
arr = np.array((1, 2))

In [None]:
from bokeh.models import Slider
from hover.utils.bokeh_helper import servable

@servable()
def burner():
    """
    Minimal slider app used as a smoke test.

    Creates a Slider running from 0 to 10 and registers a handler that moves
    the slider's `end` to `value + 1` every time the value changes.
    """
    widget = Slider(start=0, end=10, value=0, step=1)

    def grow_range(attr, old, new):
        # keep the upper bound one step above the value just selected
        widget.end = new + 1

    widget.on_change('value', grow_range)
    return widget

burner_handle = burner()

In [None]:
# Render the slider app built in the previous cell; `show` is the bokeh.io
# function imported in an earlier cell.
show(burner_handle)