In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
# Add the directory three levels up to the import search path.
# NOTE(review): presumably this is the repository root, so that the local
# `hover` package is importable without installation — confirm.
sys.path.append('../../../')

In [None]:
%%bash
cat ../../snippets/py/t0-0-dataset-text.txt

In [None]:
from hover.core.dataset import SupervisableTextDataset
from faker import Faker
import random

# ---- fake data for illustration ----
# Seed both random sources so that "Restart & Run All" regenerates the same
# fake dataset: Faker produces the text, the stdlib `random` picks the labels.
RANDOM_SEED = 42
fake_en = Faker("en")
Faker.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

def random_text():
    """Generate one paragraph of fake English text via `fake_en.paragraph(3)`."""
    paragraph = fake_en.paragraph(3)
    return paragraph

def random_raw_data():
    """Build one unlabeled record: a dict with random text under "content"."""
    record = {"content": random_text()}
    return record

def random_labeled_data():
    """Build one labeled record: random text under "content" and a "mark" drawn from "A"/"B"."""
    # the text is generated first and the label second, as in a single dict literal
    record = {"content": random_text()}
    record["mark"] = random.choice(["A", "B"])
    return record

# -----------------------------------

# Assemble the dataset from lists of dicts, one list per subset.
dataset = SupervisableTextDataset(
    # unlabeled pool: records without a label field
    raw_dictl=[random_raw_data() for _ in range(500)],
    # the labeled subsets are optional; train is left empty here
    train_dictl=[],
    dev_dictl=[random_labeled_data() for _ in range(50)],
    test_dictl=[random_labeled_data() for _ in range(50)],
    # names of the feature / label fields in the input dicts; adapt to your data
    feature_key="content",
    label_key="mark",
)

# every subset lives in its own DataFrame; preview the first rows of "raw"
dataset.dfs["raw"].head(n=5)

In [None]:
%%bash
cat ../../snippets/py/t0-1-vectorizer.txt

In [None]:
import spacy
import re

# "en_core_web_md" spaCy pipeline, loaded once and shared by every vectorizer call
nlp = spacy.load("en_core_web_md")

def vectorizer(text):
    """
    Map a string to the document vector produced by `nlp`.

    Runs of whitespace are first collapsed into a single space. The
    pipeline is then called with all of its named components disabled,
    and the `.vector` of the resulting doc is returned.
    """
    normalized = re.sub(r"[\s]+", r" ", text)
    doc = nlp(normalized, disable=nlp.pipe_names)
    return doc.vector

# Try the vectorizer on the first raw example (row label 0, column "text").
raw_df = dataset.dfs["raw"]
text = raw_df.loc[0, "text"]
vec = vectorizer(text)
print(f"Text: {text}")
print(f"Vector shape: {vec.shape}")

In [None]:
%%bash
cat ../../snippets/py/t0-2-reduction.txt

In [None]:
# Compute a 2D embedding of the data from the vectorizer's output, using UMAP.
# Any extra kwargs are passed on to the corresponding reduction method:
#   umap: https://umap-learn.readthedocs.io/en/latest/parameters.html
#   ivis: https://bering-ivis.readthedocs.io/en/latest/api.html
dataset.compute_2d_embedding(vectorizer, "umap")

# The call above adds 'x' and 'y' columns to the DataFrames in dataset.dfs.
# Alternatively, these columns can be pre-computed with any other approach.
dataset.dfs["raw"].head(5)

In [None]:
%%bash
cat ../../snippets/py/t0-3-simple-annotator.txt

In [None]:
from hover.recipes import simple_annotator
from bokeh.io import show, output_notebook

# 'handle' is a function that renders elements in bokeh documents
handle = simple_annotator(dataset)

# Direct bokeh output to the notebook, then display the annotator app.
# notebook_url is pinned to a Jupyter server at localhost:8888.
# NOTE(review): presumably this has to match the address the notebook is
# actually served from — confirm and adjust when running elsewhere.
output_notebook()
show(handle, notebook_url='http://localhost:8888')

In [None]:
%%bash
cat ../../snippets/py/t1-0-vecnet-callback.txt

In [None]:
from hover.core.neural import VectorNet
from hover.utils.common_nn import LogisticRegression

def vecnet_callback(dataset, vectorizer):
    """
    Build a VectorNet that pairs `vectorizer` with a LogisticRegression module.

    The dataset contributes its `classes`; the vectorizer is handed to
    VectorNet unchanged. The constructed VectorNet is returned.
    """
    return VectorNet(
        vectorizer,
        LogisticRegression,
        # path of a PyTorch state dict (to be created), which gets
        # cumulatively updated as the model is trained
        "model.pt",
        dataset.classes,
    )

# instantiate the model through the callback
vecnet = vecnet_callback(dataset, vectorizer)

# predict_proba takes either a single string or a list of strings:
# text -> vector -> class probabilities
for sample in (text, [text]):
    print(vecnet.predict_proba(sample))

In [None]:
%%bash
cat ../../snippets/py/t1-1-active-learning.txt

In [None]:
from hover.recipes.experimental import active_learning
from bokeh.io import show, output_notebook

# Build the active-learning recipe. It receives the callback itself (not a
# model instance), together with the dataset and the vectorizer.
handle = active_learning(dataset, vectorizer, vecnet_callback)

# Direct bokeh output to the notebook and display the app.
# NOTE(review): unlike the simple_annotator cell, no notebook_url is passed
# here — confirm that bokeh's default suits the address Jupyter is served from.
output_notebook()
show(handle)