In [None]:
# Dataset names noted down in this cell, stored as strings so the cell executes
# cleanly (as bare text the names raise NameError and the URL is a SyntaxError).
# Most entries look like Hugging Face Hub dataset ids; the last one is a GitHub
# repository URL — TODO confirm how each is meant to be loaded.
candidate_datasets = [
    "tweet_eval",
    "amazon_us_reviews",
    "amazon_polarity",
    "yelp_review_full",
    "dbpedia_14",
    "quora",
    "jigsaw_toxicity_pred",
    "carblacac/twitter-sentiment-analysis",
    "OxAISH-AL-LLM/wiki_toxic",
    "tals/vitaminc",
    "https://github.com/Franck-Dernoncourt/pubmed-rct",
]

In [None]:
# Maps a coarse category name to a list of subset identifiers.
# The values look like `amazon_us_reviews` configuration names — TODO confirm.
# NOTE(review): "Video DVD" contains a space, unlike the other entries — verify the spelling.
categories = dict(
    [
        ("books", ["Books_v1_02", "Digital_Ebook_Purchase_v1_01"]),
        ("movie/tv", ["Video", "Video DVD", "Digital_Video_Download"]),
    ]
)


In [None]:
# Imported locally because this cell sits above the notebook's import cell; without
# it a top-to-bottom run fails with NameError on `load_dataset`.
from datasets import load_dataset

# Load the `amazon_polarity` dataset and bind the result to `a`.
a = load_dataset("amazon_polarity")

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset, load_from_disk
from energizer.datastores.pandas import PandasDataStoreForSequenceClassification

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
import srsly
from pathlib import Path

from energizer.datastores.pandas import sample
from sklearn.utils import check_random_state
from sklearn.manifold import TSNE, MDS
import numpy as np
import seaborn as sns

In [None]:
# Load the `civil_comments` dataset; later cells read its "train" and "test" splits.
ds = load_dataset(path="civil_comments")

In [None]:
# Materialise the training split as a pandas DataFrame.
train_split = ds["train"]
df = train_split.to_pandas()

In [None]:
# Character length of every training text (the name `l` is kept because the
# following cells index with it).
l = df.loc[:, "text"].str.len()

In [None]:
# Show the rows whose text is shorter than 10 characters.
df[l.lt(10)]

In [None]:
# Binary label column: 1 when the `toxicity` score is strictly above 0.5, else 0.
df["labels"] = df["toxicity"].gt(0.5).astype("int")

In [None]:
# Relative label frequencies among texts longer than 10 characters.
# NOTE(review): the short-text cell filters with `< 10` and this one with `> 10`, so
# texts of exactly 10 characters appear in neither view — confirm this is intended.
df["labels"][l.gt(10)].value_counts(normalize=True)

In [None]:
# Test split as a DataFrame, with the same strict `toxicity > 0.5` binarisation
# that is applied to the training frame.
test_df = (
    ds["test"]
    .to_pandas()
    .assign(labels=lambda frame: frame["toxicity"].gt(0.5).astype("int"))
)

In [None]:
# Absolute label counts in the test split.
test_df.loc[:, "labels"].value_counts()

In [None]:
# Histogram of test-set text lengths, measured in characters.
test_text_lengths = test_df["text"].str.len()
test_text_lengths.hist()

In [None]:
# Reload the saved AG News dataset from disk and take its training split as a
# DataFrame. Note that this rebinds `df`.
# NOTE(review): this directory is written by a cell further down the notebook, so it
# must already exist when this cell runs — confirm the intended execution order.
dataset_dict = load_from_disk(dataset_path="../data/processed/agnews/")
df = dataset_dict["train"].to_pandas()

In [None]:
# Seeded random state; it is also handed to the UMAP cell.
rng = check_random_state(42)

# Request a stratified subset of 10,000 row ids, using the label column as strata.
ids = sample(
    indices=df.index.tolist(),
    size=10_000,
    random_state=rng,
    labels=df["labels"].tolist(),
    sampling="stratified",
)

# Keep only the sampled rows and the two columns the projection needs.
in_sample = df.index.isin(ids)
samples = df.loc[in_sample, ["labels", "embedding"]]

In [None]:
# Import the class directly instead of `import umap` followed by `umap = umap.UMAP(...)`:
# that pattern rebinds the module name to an estimator instance, so re-running the
# cell fails with AttributeError on `umap.UMAP`.
from umap import UMAP

# Fit a UMAP projection on the stacked embeddings of the stratified sample.
mapper = UMAP(
    n_neighbors=30,
    min_dist=0.3,
    metric="euclidean",
    random_state=rng,
).fit(np.stack(samples.embedding.values))

# The next cell calls `umap.transform(...)`, so keep that name bound to the fitted
# reducer (`fit` returns the estimator itself).
umap = mapper

In [None]:
# Project every embedding in `df` with the reducer fitted on the sample.
# `mapper` is the object returned by the `fit` call in the previous cell.
all_embeddings = np.stack(df["embedding"].to_numpy())
umap_proj = mapper.transform(all_embeddings)

In [None]:
# Scatter plot of the first two projected coordinates, coloured by label.
sns.scatterplot(
    x=umap_proj[:, 0],
    y=umap_proj[:, 1],
    hue=df["labels"],
)

In [None]:
# Root folder for everything this notebook writes, relative to the notebook.
data_dir = Path("..") / "data"

# Sentence-embedding model used to build the `embedding` column.
model_name = "all-mpnet-base-v2"
embedder = SentenceTransformer(model_name_or_path=model_name)

# Metadata written next to the saved dataset as `index_metadata.json`.
meta = dict(
    embedding_model=model_name,
    embedding_dimension=embedder.get_sentence_embedding_dimension(),
)

---
## AG News

In [None]:
def embed_batch(batch):
    """Encode one batch of texts into sentence embeddings.

    Args:
        batch: Mapping with a "text" entry holding the strings of the batch, as
            passed by `Dataset.map(..., batched=True)`.

    Returns:
        A dict with a single "embedding" entry, the output of `embedder.encode`.
    """
    # No explicit `device`: `encode` then uses the device the model already lives on,
    # so the cell also runs on machines without CUDA instead of raising.
    return {"embedding": embedder.encode(batch["text"], batch_size=512)}


# Download AG News, rename the target column to `labels`, and attach an embedding
# to every example (1024 examples per `map` batch, 512 per encoder batch).
dataset_dict = (
    load_dataset("ag_news")
    .rename_columns({"label": "labels"})
    .map(embed_batch, batched=True, batch_size=1024)
)

# Save the embedded dataset together with the embedding-model metadata.
data_path = data_dir / "processed" / "agnews"
dataset_dict.save_to_disk(data_path)
srsly.write_json(data_path / "index_metadata.json", meta)

In [None]:
# Tokenizer checkpoint. This rebinds `model_name`, which the embedding cell set to
# the sentence-embedding model.
model_name = "google/bert_uncased_L-2_H-128_A-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Read from the fixed "processed" location instead of `data_path`: this cell rebinds
# `data_path` below, so reading through it made a re-run reload the cell's own
# output and made a fresh kernel (embedding cell not run) fail with NameError.
processed_path = data_dir / "processed" / "agnews"
dataset_dict = load_from_disk(processed_path).map(
    lambda ex: tokenizer(ex["text"], return_token_type_ids=False),
    batched=True,
)

# Save the tokenized dataset and record which checkpoint produced the tokens.
data_path = data_dir / "prepared" / "agnews_bert_tiny"
dataset_dict.save_to_disk(data_path)
srsly.write_json(data_path / "metadata.json", {"name_or_path": model_name})

In [None]:
# Bare name: evaluates to the function object, which the notebook displays as the
# cell output. Nothing is loaded here.
# NOTE(review): looks like a leftover from interactive exploration — confirm whether
# this cell is still needed.
load_from_disk

In [None]:
# Build a datastore from the tokenized dataset, add an index over the `embedding`
# column using the "l2" metric, and save it to disk.
datastore = PandasDataStoreForSequenceClassification()
datastore.from_dataset_dict(
    dataset_dict,
    input_names=["input_ids", "attention_mask"],
    target_name="labels",
    tokenizer=tokenizer,
)
datastore.add_index("embedding", metric="l2")
# NOTE(review): this folder uses a hyphen ("agnews_bert-tiny") while the tokenized
# dataset is saved under "agnews_bert_tiny" — confirm the two are meant to differ.
datastore.save(data_dir.joinpath("prepared", "agnews_bert-tiny"))

In [None]:
# Load the saved datastore back from disk. This rebinds `ds`, which an earlier cell
# uses for the `civil_comments` dataset.
ds = PandasDataStoreForSequenceClassification.load(
    data_dir.joinpath("prepared", "agnews_bert-tiny")
)

In [None]:
# Use the row at position 200 as the query and search the index with its embedding,
# passing 100 as the second argument (presumably the number of neighbours — TODO confirm).
# Only the first element of the returned pair is kept.
query = ds.data.iloc[200]
ids, _ = ds.search(query["embedding"], 100)

In [None]:
# Print the query, then one "[label] text" line per retrieved row.
print(f"query:\n   [{ds.id2label[query.labels]}] {query.text}\nresults:")
hits = ds.get_by_ids(ids[0])[["labels", "text"]]
hit_lines = hits.apply(lambda ex: f"[{ds.id2label[ex['labels']]}] {ex['text']}", axis=1)
print("   " + "\n   ".join(hit_lines))

In [None]:
# Load a datastore from "./agnews_datastore" and rebind `ds` to it.
# NOTE(review): this path differs from the location the datastore is saved to earlier
# in the notebook (data_dir / "prepared" / "agnews_bert-tiny") — confirm which copy
# the comparison cells below are meant to use.
ds = PandasDataStoreForSequenceClassification.load("./agnews_datastore")

In [None]:
# Display selected attributes of the loaded datastore.
(
    ds.input_names,
    ds.target_name,
    ds.on_cpu,
)

In [None]:
# Display the same attributes, plus `_features`, for the in-memory datastore.
(
    datastore.input_names,
    datastore.target_name,
    datastore.on_cpu,
    datastore._features,
)

In [None]:
# Display the attributes, including `_features`, for the loaded datastore.
(
    ds.input_names,
    ds.target_name,
    ds.on_cpu,
    ds._features,
)

In [None]:
# Label distribution of the in-memory datastore next to the loaded one.
(
    datastore.label_distribution(),
    ds.label_distribution(),
)

In [None]:
# Search the in-memory datastore with the embedding of its first row
# (`query_in_set=True` is passed through to `search`).
query = datastore.data.iloc[0]
ids, dists = datastore.search(query["embedding"], 10, query_in_set=True)

# Print the query text followed by the text of each retrieved row.
print(f"query: {query.text}\nresults:")
retrieved_texts = datastore.get_by_ids(ids[0]).text
print("  - " + "\n  - ".join(retrieved_texts))

In [None]:
# Same search as the previous cell, run against the loaded datastore.
query = ds.data.iloc[0]
ids, dists = ds.search(query["embedding"], 10, query_in_set=True)

# Print the query text followed by the text of each retrieved row.
print(f"query: {query.text}\nresults:")
retrieved_texts = ds.get_by_ids(ids[0]).text
print("  - " + "\n  - ".join(retrieved_texts))

In [None]:
# `labels` of the in-memory datastore next to the loaded one.
(
    datastore.labels,
    ds.labels,
)

In [None]:
# `label2id` of the in-memory datastore next to the loaded one.
(
    datastore.label2id,
    ds.label2id,
)

In [None]:
# Training dataset of both datastores, requested before the `label` calls below.
(
    datastore.train_dataset(),
    ds.train_dataset(),
)

In [None]:
# Call `label` with identical arguments on the in-memory and the loaded datastore.
# NOTE(review): presumably marks rows 0 and 1 as labelled in round 1 and routes a
# fraction to validation — the semantics of `label` are not visible here; confirm
# against the energizer API, and whether re-running this cell is safe.
datastore.label(indices=[0, 1], round=1, validation_perc=0.5)
ds.label(indices=[0, 1], round=1, validation_perc=0.5)

In [None]:
# Training dataset of both datastores, with no argument and with argument 0.
(
    datastore.train_dataset(),
    datastore.train_dataset(0),
    ds.train_dataset(),
    ds.train_dataset(0),
)

In [None]:
# Pool dataset of both datastores, with no argument and with argument 0.
(
    datastore.pool_dataset(),
    datastore.pool_dataset(0),
    ds.pool_dataset(),
    ds.pool_dataset(0),
)

In [None]:
# Validation dataset of both datastores, with no argument and with argument 0.
(
    datastore.validation_dataset(),
    datastore.validation_dataset(0),
    ds.validation_dataset(),
    ds.validation_dataset(0),
)

In [None]:
# Test dataset of the in-memory datastore next to the loaded one.
(
    datastore.test_dataset(),
    ds.test_dataset(),
)

In [None]:
# Call `prepare_for_loading` on both datastores (in-memory first) and show the results.
(
    datastore.prepare_for_loading(),
    ds.prepare_for_loading(),
)

In [None]:
# Call `show_batch` on both datastores (in-memory first) and show the results.
(
    datastore.show_batch(),
    ds.show_batch(),
)