In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the directory three levels above the current working directory to the
# module search path. The path is relative, so it depends on where the kernel
# was started.
# NOTE(review): presumably this exposes a local `hover` checkout; `append` puts
# it after existing entries, so an installed copy would win — confirm intent.
sys.path.append('../../../')

In [None]:
from hover.core.dataset import SupervisableTextDataset
import pandas as pd

# Publicly hosted copy of the raw file; kept for reference, not read below.
example_csv_path = "https://raw.githubusercontent.com/phurwicz/hover-gallery/main/0.5.0/20_newsgroups_raw.csv"
# NOTE(review): the three paths below are absolute paths on the author's
# machine -- point them at your own copy of the CSVs before running.
raw_csv_path = "/Users/harry/modules/hover-gallery/0.5.0/20_newsgroups_raw.csv"
train_csv_path = "/Users/harry/modules/hover-gallery/0.5.0/20_newsgroups_train.csv"
dev_csv_path = "/Users/harry/modules/hover-gallery/0.5.0/20_newsgroups_test.csv"

# For a fast demonstration, only a random sample of each file is used.
SAMPLE_SIZE = 2000
# Fixed seed so that "Restart Kernel -> Run All" selects the same rows each time.
SAMPLE_SEED = 42


def _load_sample(csv_path, n=SAMPLE_SIZE, seed=SAMPLE_SEED):
    """Read a CSV and return at most `n` randomly selected rows.

    Args:
        csv_path: location of the CSV file, passed to `pd.read_csv`.
        n: maximum number of rows to keep.
        seed: random state forwarded to `DataFrame.sample`.

    Returns:
        A DataFrame holding `min(n, number of rows in the file)` rows.
    """
    full_df = pd.read_csv(csv_path)
    # Sampling without replacement raises if n exceeds the row count, so cap it.
    return full_df.sample(min(n, len(full_df)), random_state=seed)


# Data is divided into 4 subsets: "raw" / "train" / "dev" / "test".
# This example fills "raw", "train" and "dev"; no "test" rows are provided.
df_raw = _load_sample(raw_csv_path).assign(SUBSET="raw")
df_train = _load_sample(train_csv_path).assign(SUBSET="train")
df_dev = _load_sample(dev_csv_path).assign(SUBSET="dev")
df = pd.concat([df_raw, df_train, df_dev], axis=0)
# Coerce every value of the feature column to str.
df["text"] = df["text"].apply(str)

# This class stores the dataset throughout the labeling process.
dataset = SupervisableTextDataset.from_pandas(df, feature_key="text", label_key="label")

In [None]:
import spacy
import re

# Load the spaCy pipeline whose vectors are used as the embedding below;
# swap in whichever model or embedding you prefer for your task.
nlp = spacy.load("en_core_web_md")

# raw data (str in this case) -> np.array
def vectorizer(text):
    """Turn one piece of raw text into its spaCy document vector.

    The input is cast to `str`, every run of whitespace is collapsed to a
    single space, and the result is run through `nlp` with all components
    listed in `nlp.pipe_names` disabled.
    """
    normalized = re.sub(r"[\s]+", r" ", str(text))
    doc = nlp(normalized, disable=nlp.pipe_names)
    return doc.vector

# Sanity check: vectorize the "raw" row labelled 0 and report the result.
text = dataset.dfs["raw"].loc[0, "text"]
vec = vectorizer(text)
print(f"Text: {text}", f"Vector shape: {vec.shape}", sep="\n")

In [None]:
# Reduce the vectorized text to 2D with UMAP. Extra keyword arguments are
# forwarded to the chosen reduction method:
#   umap: https://umap-learn.readthedocs.io/en/latest/parameters.html
#   ivis: https://bering-ivis.readthedocs.io/en/latest/api.html
dataset.compute_2d_embedding(vectorizer, "umap")

# The call above adds 'x' and 'y' columns to the DataFrames in dataset.dfs.
# Alternatively, these columns can be pre-computed with any approach.
# Preview the first five rows (the default for `head`).
dataset.dfs["raw"].head()

In [None]:
from hover.utils.snorkel_helper import labeling_function
from hover import module_config
import re


@labeling_function(targets=["rec.autos"])
def auto_keywords(row):
    """Vote "rec.autos" if `row.text` contains a car-related keyword, else abstain."""
    pattern = r"(?i)(diesel|gasoline|automobile|vehicle|drive|driving)"
    if re.search(pattern, row.text):
        return "rec.autos"
    return module_config.ABSTAIN_DECODED


@labeling_function(targets=["rec.sport.baseball"])
def baseball_keywords(row):
    """Vote "rec.sport.baseball" if `row.text` contains a baseball keyword, else abstain.

    "baseball" and "stadium" match as substrings (case-insensitive).
    "bat" and "base" must appear as whole words; `\\b` word boundaries are
    used instead of literal surrounding spaces so that occurrences next to
    punctuation, newlines, tabs, or the start/end of the text also count.
    """
    pattern = r"(?i)(baseball|stadium|\bbat\b|\bbase\b)"
    if re.search(pattern, row.text):
        return "rec.sport.baseball"
    return module_config.ABSTAIN_DECODED


@labeling_function(targets=["sci.crypt"])
def crypt_keywords(row):
    """Vote "sci.crypt" if `row.text` contains a cryptography-related keyword, else abstain."""
    pattern = r"(?i)(crypt|math|encode|decode|key)"
    if re.search(pattern, row.text):
        return "sci.crypt"
    return module_config.ABSTAIN_DECODED


@labeling_function(targets=["talk.politics.guns"])
def guns_keywords(row):
    """Vote "talk.politics.guns" if `row.text` contains a firearm-related keyword, else abstain."""
    pattern = r"(?i)(gun|rifle|ammunition|violence|shoot)"
    if re.search(pattern, row.text):
        return "talk.politics.guns"
    return module_config.ABSTAIN_DECODED


@labeling_function(targets=["misc.forsale"])
def forsale_keywords(row):
    """Vote "misc.forsale" if `row.text` contains a sales-related keyword, else abstain."""
    pattern = r"(?i)(sale|deal|price|discount)"
    if re.search(pattern, row.text):
        return "misc.forsale"
    return module_config.ABSTAIN_DECODED


# Collection of the keyword-based labeling functions defined in this cell.
# It is extended in a later cell and then passed to the snorkel_crosscheck recipe.
LABELING_FUNCTIONS = [
    auto_keywords,
    baseball_keywords,
    crypt_keywords,
    guns_keywords,
    forsale_keywords,
]

In [None]:
@labeling_function(targets=["misc.forsale"])
def forsale_keywords_alt(row):
    """Vote "misc.forsale" if `row.text` contains a sales-related keyword (including "sell"), else abstain."""
    pattern = r"(?i)(sale|deal|price|discount|sell)"
    if re.search(pattern, row.text):
        return "misc.forsale"
    return module_config.ABSTAIN_DECODED

# Register the alternative function. Re-running this cell re-creates
# `forsale_keywords_alt`, so drop any earlier registration first; otherwise the
# list would accumulate duplicates on every re-run.
# NOTE(review): entries are matched on a `.name` attribute (as snorkel
# LabelingFunction objects expose); if it is absent the comparison falls back
# to the object itself — confirm against hover's decorator.
_alt_key = getattr(forsale_keywords_alt, "name", forsale_keywords_alt)
# Slice assignment keeps the same list object for anything already holding it.
LABELING_FUNCTIONS[:] = [
    _lf for _lf in LABELING_FUNCTIONS if getattr(_lf, "name", _lf) != _alt_key
]
LABELING_FUNCTIONS.append(forsale_keywords_alt)

In [None]:
from hover.recipes.experimental import snorkel_crosscheck
from bokeh.io import show, output_notebook

# Build the crosscheck recipe from a copy of the dataset and the labeling
# functions, stacking the layout vertically.
# NOTE(review): the copy presumably keeps interactions inside the recipe from
# modifying `dataset` itself — confirm.
handle = snorkel_crosscheck(
    dataset.copy(),
    LABELING_FUNCTIONS,
    layout_style="vertical",
)

# Direct Bokeh output to the notebook, then display the recipe.
output_notebook()
show(handle)