In [1]:
import json
from pathlib import Path

import hnswlib as hb
import numpy as np
import pandas as pd
import requests
import srsly
from datasets import (
    ClassLabel,
    Dataset,
    DatasetDict,
    Features,
    Sequence,
    Value,
    load_dataset,
    load_from_disk,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

In [None]:
# ds_dict = load_dataset("pietrolesci/eurlex_indexed", cache_dir="ds_cache")
# ds_dict.save_to_disk("../data/processed/eurlex")

In [2]:
# Load the pre-processed dataset (all splits) from a local on-disk copy.
# NOTE(review): absolute, user-specific path — it will not resolve on another machine.
ds_dict = load_from_disk("/home/pl487/allset/data/processed/pubmed-200k-rct")

In [4]:
# Display the DatasetDict summary: splits, column names and row counts.
ds_dict

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'uid', 'embedding_all-mpnet-base-v2', 'embedding_multi-qa-mpnet-base-dot-v1', 'embedding_all-MiniLM-L12-v2'],
        num_rows: 2211861
    })
    validation: Dataset({
        features: ['labels', 'text', 'uid', 'embedding_all-mpnet-base-v2', 'embedding_multi-qa-mpnet-base-dot-v1', 'embedding_all-MiniLM-L12-v2'],
        num_rows: 28932
    })
    test: Dataset({
        features: ['labels', 'text', 'uid', 'embedding_all-mpnet-base-v2', 'embedding_multi-qa-mpnet-base-dot-v1', 'embedding_all-MiniLM-L12-v2'],
        num_rows: 29493
    })
})

In [None]:
# make `health control` the target label
# Binary target per example: 1 when its EuroVoc concepts contain the id "192", else 0.
# The map is batched, so `batch["eurovoc_concepts"]` holds one entry per example.
# NOTE(review): assumes each entry is a list of id strings (exact membership) — confirm.
ds_dict = ds_dict.map(
    lambda batch: {
        "labels": [int("192" in concepts) for concepts in batch["eurovoc_concepts"]]
    },
    batched=True,
)

In [None]:
# tokenize
# NOTE(review): `AutoTokenizer` is not imported in this notebook's import cell; presumably
# `transformers.AutoTokenizer` — confirm and import it before running on a fresh kernel.
MODELS = {"bert-tiny": "google/bert_uncased_L-2_H-128_A-2"}
tokenizer = AutoTokenizer.from_pretrained(MODELS["bert-tiny"])
# Batched map: the tokenizer is called on a list of texts; token type ids are not requested.
ds_dict = ds_dict.map(
    lambda batch: tokenizer(batch["text"], return_token_type_ids=False),
    batched=True,
)

In [None]:
# Display the DatasetDict again to check the columns added by the previous cells.
ds_dict

In [None]:
# create index
# Builds an in-memory HNSW index over the train-split embeddings of one model; items are
# labelled with the dataset `uid` so that neighbours can be mapped back to rows.
embedding = "all-mpnet-base-v2"

embedding = f"embedding_{embedding}"
# Keep only the two needed columns *before* converting: calling `to_pandas()` on the full
# split would also materialise the text and every other embedding column.
index_df = ds_dict["train"].select_columns(["uid", embedding]).to_pandas()

metric: str = "cosine"
ef_construction: int = 200
ef: int = 200
M: int = 64
num_threads: int = 5

# Stack the per-row vectors into a single (n_rows, dim) array.
emb = np.stack(index_df[embedding].values)
uid = index_df["uid"].tolist()

index = hb.Index(space=metric, dim=emb.shape[1])
index.init_index(
    max_elements=emb.shape[0], M=M, ef_construction=ef_construction, random_seed=42
)
# `ef` is the query-time setting; set it once the index has been initialised, following
# the call order shown in the hnswlib README.
index.set_ef(ef)
index.add_items(emb, uid, num_threads=num_threads)

In [None]:
# select columns
# Keep only the id, target, raw text and tokenizer outputs; every other column
# (including the embeddings and `eurovoc_concepts`) is dropped from `ds_dict` here.
ds_dict = ds_dict.select_columns(
    ["uid", "labels", "text", "input_ids", "attention_mask"]
)

In [None]:
# Persist the processed dataset, the HNSW index built above and the index hyper-parameters.
#
# `save_to_disk` needs an explicit target directory, and at this point in the notebook
# neither `path` nor `idx` exists yet (and `MODELS` is the tokenizer dict), so the
# destination and the model name are derived here.
# NOTE(review): the directories follow the layout used elsewhere in this notebook
# ("../data/processed/eurlex" and its "hnswlib_indices" sub-folder) — TODO confirm that
# this is the intended destination for this dataset.
dataset_dir = Path("../data/processed/eurlex")
path = dataset_dir / "hnswlib_indices"
path.mkdir(exist_ok=True, parents=True)

# `embedding` holds the column name ("embedding_<model>"); file names use the bare model name.
model_name = embedding.replace("embedding_", "", 1)

ds_dict.save_to_disk(str(dataset_dir))
index.save_index(str(path / f"index_{metric}_{model_name}.bin"))

# The parameters are written alongside the index so it can be re-created or queried consistently.
srsly.write_yaml(
    path / f"metadata_{metric}_{model_name}.yaml",
    {
        "metric": metric,
        "ef_construction": ef_construction,
        "ef": ef,
        "M": M,
        "num_threads": num_threads,
    },
)

In [None]:
# Read the file line by line (line endings are kept) into the list `cats`.
# NOTE(review): absolute, user-specific path — it will not resolve on another machine.
with open(
    "/home/pl487/allset/amazon/AmazonCat-13K.raw/Yf.txt", encoding="latin-1"
) as label_file:
    cats = list(label_file)

In [None]:
# Wrap the raw lines in a single-column DataFrame (row i = line i of the file).
cats = pd.DataFrame(cats)

In [None]:
# Rows of `a` with 0.01 < f < 0.03, sorted by `f` (descending) and joined on index with `cats`.
# NOTE(review): `a` is not defined in any cell shown — this relies on earlier kernel state.
a.loc[(a["f"] > 0.01) & (a["f"] < 0.03)].sort_values("f", ascending=False).join(cats)

In [None]:
# Show, as a dict, the third row whose `target_ind` contains 12583.
# NOTE(review): a `df` with a `target_ind` column is not created by any cell above —
# this relies on earlier kernel state.
# df.loc[df["target_ind"].map(lambda ex: 13199 in ex)]
df.loc[df["target_ind"].map(lambda ex: 12583 in ex)].iloc[2].to_dict()

In [None]:
# Share of `df` rows represented by the hard-coded count 30068.
# NOTE(review): the origin of 30068 is not shown in this notebook — TODO replace with the computed value.
30068 / len(df)

In [None]:
# Names of the embedding models; each has a matching `embedding_<name>` dataset column.
# This rebinds `MODELS`, which an earlier cell defines as a dict of tokenizer checkpoints.
MODELS = [
    "all-mpnet-base-v2",
    "multi-qa-mpnet-base-dot-v1",
    "all-MiniLM-L12-v2",
]

In [None]:
# Column names holding the embeddings of each model in `MODELS`.
models = [f"embedding_{name}" for name in MODELS]
# Select `uid` plus the embedding columns before converting to pandas, so the remaining
# columns of the train split are never materialised as a DataFrame.
df = (
    load_dataset("pietrolesci/eurlex_indexed")["train"]
    .select_columns(["uid"] + models)
    .to_pandas()
)

In [None]:
# Build and save one HNSW index per embedding model, plus a YAML file with its parameters.
metric: str = "cosine"
ef_construction: int = 200
ef: int = 200
M: int = 64
num_threads: int = 5

# Output directory, created once up front.
path = Path("../data/processed/eurlex/hnswlib_indices")
path.mkdir(exist_ok=True, parents=True)

# The item labels are the same for every model, so build the list once.
uids = df["uid"].tolist()

for idx, model in enumerate(tqdm(models)):
    # `models` holds column names ("embedding_<name>"); files are named after the bare name.
    model_name = MODELS[idx]

    # Stack the per-row vectors into a single (n_rows, dim) array.
    embeddings = np.stack(df[model].values)

    index = hb.Index(space=metric, dim=embeddings.shape[1])
    index.init_index(
        max_elements=embeddings.shape[0],
        M=M,
        ef_construction=ef_construction,
        random_seed=42,
    )
    # `ef` is the query-time setting; set it once the index has been initialised,
    # following the call order shown in the hnswlib README.
    index.set_ef(ef)

    index.add_items(embeddings, uids, num_threads=num_threads)
    index.save_index(str(path / f"index_{metric}_{model_name}.bin"))

    # Parameters are stored next to the index so it can be queried consistently later.
    srsly.write_yaml(
        path / f"metadata_{metric}_{model_name}.yaml",
        {
            "metric": metric,
            "ef_construction": ef_construction,
            "ef": ef,
            "M": M,
            "num_threads": num_threads,
        },
    )

In [None]:
# Binary target per example: 1 when its EuroVoc concepts contain the id "3191", else 0.
# NOTE(review): an earlier cell reduces `ds_dict` to columns that exclude
# `eurovoc_concepts`; this cell needs a `ds_dict` that still has that column.
data = ds_dict.map(
    lambda batch: {
        "labels": [int("3191" in concepts) for concepts in batch["eurovoc_concepts"]]
    },
    batched=True,
)

In [None]:
# Materialise the train and test splits as pandas DataFrames.
train_df, test_df = (data[split].to_pandas() for split in ("train", "test"))

In [None]:
def _concept_doc_counts(frame: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Count, for every EuroVoc concept, the distinct documents tagged with it.

    Args:
        frame: DataFrame with a `celex_id` column and a list-like `eurovoc_concepts` column.
        column_name: Name given to the single count column of the result.

    Returns:
        A one-column DataFrame indexed by concept id.
    """
    return (
        frame[["celex_id", "eurovoc_concepts"]]
        .explode("eurovoc_concepts")  # one row per (document, concept) pair
        .groupby("eurovoc_concepts")["celex_id"]
        .nunique()
        .to_frame(column_name)
    )


train_freq = _concept_doc_counts(train_df, "count_train")
test_freq = _concept_doc_counts(test_df, "count_test")

In [None]:
# Load the EuroVoc concept records from the raw JSONL file into a DataFrame indexed by concept `id`.
desc = pd.DataFrame(
    srsly.read_jsonl("../data/raw/eurlex-57k/eurovoc_concepts.jsonl")
).set_index("id")

In [None]:
# Align train and test counts on the concept id; `dropna` removes concepts without a test count.
f = train_freq.join(test_freq).dropna()

In [None]:
# Total number of documents per concept across train and test.
# Only the two count columns are summed, so the cell gives the same result when re-run
# (a plain row-wise sum would fold an already computed "sum" column back in).
f["sum"] = f[["count_train", "count_test"]].sum(axis=1)

In [None]:
# Most frequent concepts first.
f = f.sort_values("sum", ascending=False)

In [None]:
# Attach the concept descriptions (matched on the index: concept id).
# NOTE(review): not re-runnable as written — a second run joins `desc` onto a frame that
# already has its columns.
f = f.join(desc)

In [None]:
# Concepts with a combined count strictly between 1000 and 3500, ordered by test count
# (descending) and then train count (ascending).
f.loc[(f["sum"] < 3500) & (f["sum"] > 1000)].sort_values(
    ["count_test", "count_train"], ascending=[False, True]
)

In [None]:
# make `health control` the target label
# Binary target per example: 1 when its EuroVoc concepts contain the id "192", else 0;
# then materialise the train and test splits as pandas DataFrames.
data = ds_dict.map(
    lambda batch: {
        "labels": [int("192" in concepts) for concepts in batch["eurovoc_concepts"]]
    },
    batched=True,
)
train_df, test_df = (data[split].to_pandas() for split in ("train", "test"))

In [None]:
# Relative class frequencies of the target in the train and test splits.
(
    train_df["labels"].value_counts(normalize=True),
    test_df["labels"].value_counts(normalize=True),
)

In [None]:
# TF-IDF featuriser over 1- to 3-grams and a logistic-regression classifier with default settings.
# In the cells below `tfidf` is only referenced from commented-out code.
tfidf = TfidfVectorizer(ngram_range=(1, 3))
log_reg = LogisticRegression()

In [None]:
# Number of training documents per `document_type`.
train_df["document_type"].value_counts()

In [None]:
# Features: the pre-computed `all-mpnet-base-v2` embeddings stacked into one array per split.
# Targets: the binary `labels` column.
# The commented-out lines are alternative setups (TF-IDF features, or "Directive" document
# type as the target).
# X = tfidf.fit_transform(train_df["text"].values)
X = np.stack(train_df["embedding_all-mpnet-base-v2"].values)
y = train_df["labels"].values
# y = train_df["document_type"] == "Directive"

# X_test = tfidf.transform(test_df["text"].values)
X_test = np.stack(test_df["embedding_all-mpnet-base-v2"].values)
y_test = test_df["labels"].values
# y_test = test_df["document_type"] == "Directive"

In [None]:
# Train the classifier on the training features and targets.
log_reg.fit(X, y)

In [None]:
# Per-class metrics on the test split.
print(classification_report(y_test, log_reg.predict(X_test)))

In [None]:
# Same report on the training data, for comparison with the test-split report.
print(classification_report(y, log_reg.predict(X)))

In [None]:
# Display the DatasetDict summary (splits and columns).
ds_dict

In [None]:
# Share of training rows represented by the hard-coded count 1609.
# NOTE(review): the origin of 1609 is not shown in this notebook — TODO replace with the computed value.
1609 / len(train_df)

In [None]:
# Relative frequency of each `document_type` in the training split.
train_df["document_type"].value_counts(normalize=True)

In [None]:
# Training rows whose `eurovoc_concepts` contain the id "2173".
train_df.loc[train_df["eurovoc_concepts"].map(lambda ex: "2173" in ex)]

In [None]:
# Display the concept description table.
desc

In [None]:
# get eurlex from http://nlp.cs.aueb.gr/software_and_datasets/EURLEX57K/ and unzip the folder into eurlex
# Expected layout: one sub-directory per split, each holding one JSON file per document.
# The result maps split name -> list of parsed JSON documents.
path = Path("../eurlex/")
data = {}
# `iterdir` yields entries in arbitrary order; sorting keeps the split and record order
# reproducible across runs and machines.
for split in sorted(path.iterdir()):
    if not split.is_dir():
        # Stray files next to the split folders (e.g. the downloaded archive) are not splits.
        continue
    data[split.name] = [
        srsly.read_json(file)
        for file in tqdm(sorted(split.iterdir()))
        if file.is_file()
    ]

In [None]:
# List of category identifiers with a version suffix (`_v1_00`, `_v1_01`, `_v1_02`).
# NOTE(review): these look like per-category subset names of an Amazon reviews corpus — TODO confirm.
# This rebinds `cats` (used earlier for the contents of Yf.txt) and is not referenced by
# any of the cells shown after it.
cats = [
    "Wireless_v1_00",
    "Watches_v1_00",
    "Video_Games_v1_00",
    "Video_DVD_v1_00",
    "Video_v1_00",
    "Toys_v1_00",
    "Tools_v1_00",
    "Sports_v1_00",
    "Software_v1_00",
    "Shoes_v1_00",
    "Pet_Products_v1_00",
    "Personal_Care_Appliances_v1_00",
    "PC_v1_00",
    "Outdoors_v1_00",
    "Office_Products_v1_00",
    "Musical_Instruments_v1_00",
    "Music_v1_00",
    "Mobile_Electronics_v1_00",
    "Mobile_Apps_v1_00",
    "Major_Appliances_v1_00",
    "Luggage_v1_00",
    "Lawn_and_Garden_v1_00",
    "Kitchen_v1_00",
    "Jewelry_v1_00",
    "Home_Improvement_v1_00",
    "Home_Entertainment_v1_00",
    "Home_v1_00",
    "Health_Personal_Care_v1_00",
    "Grocery_v1_00",
    "Gift_Card_v1_00",
    "Furniture_v1_00",
    "Electronics_v1_00",
    "Digital_Video_Games_v1_00",
    "Digital_Video_Download_v1_00",
    "Digital_Software_v1_00",
    "Digital_Music_Purchase_v1_00",
    "Digital_Ebook_Purchase_v1_00",
    "Camera_v1_00",
    "Beauty_v1_00",
    "Baby_v1_00",
    "Automotive_v1_00",
    "Apparel_v1_00",
    "Digital_Ebook_Purchase_v1_01",
    "Books_v1_00",
    "Books_v1_01",
    "Books_v1_02",
]

In [None]:
# Load the indexed PubMed RCT dataset, keeping only the id, target, text and one embedding
# column, then materialise the train and test splits as pandas DataFrames.
# This rebinds `ds_dict`, `train_df` and `test_df` from the earlier cells.
ds_dict = load_dataset("pietrolesci/pubmed-rct-200k_indexed").select_columns(
    ["uid", "labels", "text", "embedding_all-mpnet-base-v2"]
)

train_df, test_df = (ds_dict[split].to_pandas() for split in ("train", "test"))

In [None]:
# Features: stacked `all-mpnet-base-v2` embeddings per split.
X, X_test = (
    np.stack(frame["embedding_all-mpnet-base-v2"].values)
    for frame in (train_df, test_df)
)
# Targets: one-vs-rest boolean, True where the label equals 0.
y, y_test = (frame["labels"].values == 0 for frame in (train_df, test_df))

In [None]:
# Relative frequency of each label in the training split.
train_df["labels"].value_counts(normalize=True)

In [None]:
# Fresh, unfitted classifier with default settings (rebinds `log_reg` from the earlier experiment).
log_reg = LogisticRegression()

In [None]:
# Train the classifier on the training features and targets.
log_reg.fit(X, y)

In [None]:
# Per-class metrics on the test split.
print(classification_report(y_test, log_reg.predict(X_test)))

In [None]:
# Same report on the training data, for comparison with the test-split report.
print(classification_report(y, log_reg.predict(X)))

In [None]:
# Number of training rows.
len(train_df)

In [None]:
# Load the full indexed PubMed RCT dataset (all columns) under the name `d`.
# NOTE(review): `d` is not used by any cell shown after this one.
d = load_dataset("pietrolesci/pubmed-rct-200k_indexed")

In [None]:
# NOTE(review): `a` is not defined in any cell shown — this display relies on earlier kernel state.
a