In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [None]:
import sys
# Append the parent directory to the module search path.
# NOTE(review): presumably so a local checkout of the package is importable
# from this notebook's folder — confirm against the repository layout.
sys.path.append('../')

### DataTable

In [None]:
import pandas as pd
import random

In [None]:
# Toy frame: a single "value" column holding 100 random integers,
# each drawn from 0..100 (both ends inclusive, per random.randint).
sample_size = 100
random_values = [random.randint(0, 100) for _ in range(sample_size)]
df = pd.DataFrame({"value": random_values})

In [None]:
num_rows = df.shape[0]

# Rank the values once and reuse the result for both derived columns
# (the default ranking gives tied values their average rank).
df['rank'] = df['value'].rank()
# Map rank/num_rows linearly to an opacity: 0.1 + 0.8 * fraction, so the
# top-ranked row ends up at 0.9. Vectorized arithmetic replaces the
# per-element .apply and yields the same numbers.
df['alpha'] = 0.1 + 0.8 * (df['rank'] / num_rows)

In [None]:
# Preview the first rows of the frame (rich display of the last expression).
df.head()

In [None]:
from bokeh.models import Legend, LegendItem
from bokeh.plotting import figure, show, output_notebook
# Configure Bokeh to render its output inline in this notebook.
output_notebook()

In [None]:
from bokeh.io import output_notebook, show
output_notebook()

from random import randint
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn, HTMLTemplateFormatter
from bokeh.palettes import Category10

# Ten demo rows, one list per column:
#   cola - random integers in 0..100
#   colb - the ten colours of the Category10 palette
#   colc - an image URL repeated for every row
#   cold - an audio URL repeated for every row
data = {
    "cola": [randint(0, 100) for _ in range(10)],
    "colb": Category10[10],
    "colc": ["https://docs.chainer.org/en/stable/_images/5.png"] * 10,
    "cold": ["https://www.soundjay.com/buttons/beep-02.mp3"] * 10,
}

# Wrap the columns in a ColumnDataSource so the DataTable can consume them.
source = ColumnDataSource(data)

# Cell formatters built from HTML templates. Inside a template, `value` is the
# cell's own value and other column names (e.g. `colb`) refer to the same row.

# Colours the cell text with the row's `colb` value and shows three block
# characters, the value itself and an emoji.
color_formatter = HTMLTemplateFormatter(template="""                
            <p style="color:<%= colb %>;"> 
                <%= "&#9608;&#9608;&#9608;" + value %> &#128512;
                
            </p> 
            """)
# Renders the cell value as an image. The attribute is quoted so that URLs
# containing spaces or '>' cannot break the markup.
image_formatter = HTMLTemplateFormatter(template="""
<img src="<%= value %>">
""")
# Renders the cell value as an audio player; the attribute is quoted for the
# same reason as above.
audio_formatter = HTMLTemplateFormatter(template="""
<audio controls preload="auto" src="<%= value %>"></audio>
""")

# The former `formatter1` / `formatter2` assignments were removed: they used
# the undefined names `template1` / `template2` (NameError on a fresh kernel)
# and nothing in the notebook used the results.
# One TableColumn per data field; the last three use the HTML formatters
# defined earlier in this cell.
columns = [
    TableColumn(field="cola", title="CL1", width=50),
    TableColumn(field="colb", title="CL2", width=100, formatter=color_formatter),
    TableColumn(field="colc", title="CL3", width=200, formatter=image_formatter),
    TableColumn(field="cold", title="CL4", width=50, formatter=audio_formatter),
]

# row_height is set to 200 (presumably so the embedded image fits in a row).
data_table = DataTable(source=source, columns=columns, width=500, row_height=200)

show(data_table)

### Experimental Recipe

`EXPORT_PATH` is the file used for saving/loading a pre-processed dataset.

This helps skip potentially time-consuming operations like vectorization/UMAP.

In [None]:
import pandas as pd
from hover.core.dataset import SupervisableTextDataset

# CSV file (relative to the notebook's working directory) that the dataset is
# read from, and optionally written to, in the cells below.
EXPORT_PATH = 'example-dataset.csv'

In [None]:
# Load the pre-processed dataset only when EXPORT_PATH has a corresponding
# file, so that "Restart & Run All" does not fail on a fresh checkout.
from pathlib import Path

if Path(EXPORT_PATH).exists():
    dataset = SupervisableTextDataset.from_pandas(pd.read_csv(EXPORT_PATH))
else:
    print(f"{EXPORT_PATH} not found; skipping the load step.")

In [None]:
from faker import Faker
import random

# ---- fake data for illustration ----
# Faker instance created with the "en" locale; used for all generated text.
fake_en = Faker("en")

def random_text():
    """Return one fake paragraph generated by ``fake_en.paragraph(3)``."""
    paragraph = fake_en.paragraph(3)
    return paragraph

def random_raw_data():
    """Return an unlabeled record: a dict whose only key, "content", holds random text."""
    return dict(content=random_text())

def random_labeled_data():
    """Return a labeled record: random text under "content" and a label ("A" or "B") under "mark"."""
    # Generate the text before the label, matching the original evaluation order.
    content = random_text()
    mark = random.choice(["A", "B"])
    return dict(content=content, mark=mark)

# -----------------------------------

# Generate the records for each subset up front, in the same order as before:
# raw data carries no labels; train / dev / test records are labeled.
raw_records = [random_raw_data() for _ in range(2000)]
train_records = [random_labeled_data() for _ in range(500)]
dev_records = [random_labeled_data() for _ in range(50)]
test_records = [random_labeled_data() for _ in range(50)]

dataset = SupervisableTextDataset(
    raw_dictl=raw_records,
    # train / dev / test sets are optional
    train_dictl=train_records,
    dev_dictl=dev_records,
    test_dictl=test_records,
    # adjust feature_key and label_key to your data
    feature_key="content",
    label_key="mark",
)

# Each subset lives in its own DataFrame under dataset.dfs; preview the raw one.
dataset.dfs["raw"].head(5)

In [None]:
import spacy
import re

# spaCy pipeline loaded from the "en_core_web_md" package.
nlp = spacy.load("en_core_web_md")

def vectorizer(text):
    """Turn a string into a vector.

    Runs of whitespace are collapsed into single spaces, the result is passed
    through ``nlp`` with every pipeline component disabled, and the resulting
    document's ``.vector`` attribute is returned.
    """
    collapsed = re.sub(r"[\s]+", r" ", text)
    doc = nlp(collapsed, disable=nlp.pipe_names)
    return doc.vector

# Sanity check: vectorize the "text" field of row 0 of the raw subset and
# report the shape of the result. `text` is reused by later cells.
text = dataset.dfs["raw"].loc[0, "text"]
vec = vectorizer(text)
print(f"Text: {text}")
print(f"Vector shape: {vec.shape}")

In [None]:
# Compute reduced-dimension embeddings of the vectorized texts with UMAP,
# once with dimension=4 and once with dimension=3.
# any kwargs will be passed onto the corresponding reduction
# for umap: https://umap-learn.readthedocs.io/en/latest/parameters.html
# for ivis: https://bering-ivis.readthedocs.io/en/latest/api.html
dataset.compute_nd_embedding(vectorizer, "umap", dimension=4)
dataset.compute_nd_embedding(vectorizer, "umap", dimension=3)

# What we did adds 'x' and 'y' columns to the DataFrames in dataset.dfs
# NOTE(review): the calls above request 4-D and 3-D embeddings, so the
# 'x' / 'y' column names may be outdated — confirm against the preview below.
# One could alternatively pre-compute these columns using any approach
dataset.dfs["raw"].head(5)

In [None]:
# Uncomment to write the processed dataset to EXPORT_PATH as CSV (without the
# index) so that it can be loaded with pd.read_csv in a later session.
#dataset.to_pandas().to_csv(EXPORT_PATH, index=False)

In [None]:
from hover.core.neural import VectorNet
from hover.utils.common_nn import MLP, LogisticRegression

# Build a VectorNet from the vectorizer, the MLP architecture class, a file
# name and the dataset's classes.
# NOTE(review): presumably "LR1.pt" is where the model state is stored —
# confirm against the VectorNet documentation.
vecnet = VectorNet(vectorizer, MLP, "LR1.pt", dataset.classes)

# predict_proba accepts individual strings or list
# text -> vector -> class probabilities
print(vecnet.predict_proba(text))
print(vecnet.predict_proba([text]))

In [None]:
from hover.recipes.experimental import active_learning
from bokeh.io import show, output_notebook
from hover.utils.bokeh_helper import bokeh_hover_tooltip

# Hand the recipe a copy of the dataset rather than the dataset object itself.
dataset_copy = dataset.copy()
handle = active_learning(dataset_copy, vecnet)

# Render the recipe's handle inline in the notebook.
output_notebook()
show(handle)