In [1]:
import json
from pathlib import Path

import hnswlib as hb
import numpy as np
import pandas as pd
import requests
import srsly
from datasets import (
    ClassLabel,
    Dataset,
    DatasetDict,
    Features,
    Sequence,
    Value,
    load_dataset,
    load_from_disk,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

In [None]:
# The index file and its metadata share one stem; only the extension differs.
_index_stem = "/home/pl487/allset/data/processed/eurlex-57k/all-MiniLM-L12-v2_cosine"
index_path = _index_stem + ".bin"
meta_path = _index_stem + ".json"

In [None]:
# The metadata is read first because the Index constructor takes the metric and the
# dimensionality stored in it; the saved graph is loaded afterwards.
meta = srsly.read_json(meta_path)
index = hb.Index(dim=meta["dim"], space=meta["metric"])
index.load_index(str(index_path))

In [None]:
# Mark the element stored under label 0 as deleted in the loaded index.
index.mark_deleted(0)

In [None]:
# Stack the vector(s) fetched for label 0 and show the resulting array shape.
# NOTE(review): label 0 was marked deleted in the cell above — confirm get_items still serves it.
np.stack(index.get_items([0])).shape

In [None]:
# Every label id returned by the index, sorted ascending.
sorted(index.get_ids_list())

In [None]:
# Load the processed wiki_toxic dataset that was saved to disk.
ds_dict = load_from_disk("/home/pl487/allset/data/processed/wiki_toxic")

In [None]:
# Summary statistics of the per-example length of `input_ids` in the train split.
ds_dict["train"].to_pandas()["input_ids"].map(len).describe()

In [None]:
# Materialise the train split as a pandas DataFrame.
df = ds_dict["train"].to_pandas()

In [None]:
# Number of rows per distinct value of `labels`.
df["labels"].value_counts()

---
Pubmed

In [2]:
# Load the processed pubmed-200k-rct dataset from disk.
d = load_from_disk("/home/pl487/allset/data/processed/pubmed-200k-rct")

In [3]:
# Display the splits with their features and row counts.
d

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'uid', 'embedding_all-mpnet-base-v2', 'embedding_multi-qa-mpnet-base-dot-v1', 'embedding_all-MiniLM-L12-v2'],
        num_rows: 2211861
    })
    validation: Dataset({
        features: ['labels', 'text', 'uid', 'embedding_all-mpnet-base-v2', 'embedding_multi-qa-mpnet-base-dot-v1', 'embedding_all-MiniLM-L12-v2'],
        num_rows: 28932
    })
    test: Dataset({
        features: ['labels', 'text', 'uid', 'embedding_all-mpnet-base-v2', 'embedding_multi-qa-mpnet-base-dot-v1', 'embedding_all-MiniLM-L12-v2'],
        num_rows: 29493
    })
})

In [None]:
# Rebind `df` to the train split of `d`, converted to a pandas DataFrame.
df = d["train"].to_pandas()

In [None]:
# Row(s) whose `uid` equals 40384.
df.loc[df["uid"] == 40384]

In [None]:
# Summary statistics of the per-row length of the `recitals` column.
# NOTE(review): the features listed for `d` above contain no `recitals` column, so this
# presumably targets a different dataset than the one `df` was just built from — confirm.
df["recitals"].str.len().describe()

In [None]:
# Summary statistics of the per-row length of the `text` column.
df["text"].str.len().describe()

In [None]:
# ds_dict = load_dataset("pietrolesci/eurlex_indexed", cache_dir="ds_cache")
# ds_dict.save_to_disk("../data/processed/eurlex")

---
Amazon

In [2]:
# Load the prepared pubmed-200k-rct_bert-tiny dataset from disk, rebinding `d`.
d = load_from_disk("/home/pl487/allset/data/prepared/pubmed-200k-rct_bert-tiny")

In [3]:
# Display the splits with their features and row counts.
d

DatasetDict({
    train: Dataset({
        features: ['uid', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2211861
    })
    test: Dataset({
        features: ['uid', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 29493
    })
})

In [3]:
# Load the processed amazoncat-13k dataset, then keep only the uid, target_ind and text columns.
ds_dict = load_from_disk("/home/pl487/allset/data/processed/amazoncat-13k/")
ds_dict = ds_dict.select_columns(["uid", "target_ind", "text"])

In [4]:
# One pandas frame per split.
train_df, test_df = (ds_dict[split].to_pandas() for split in ("train", "test"))

In [5]:
# One row per "\n"-separated line of the label file; the row position is the frame's index.
label_file = Path("/home/pl487/allset/AmazonCat-13K.raw/Yf.txt")
cats = pd.DataFrame(label_file.read_text(encoding="latin-1").split("\n"))

In [6]:
# Relative frequency of every label id across the exploded training labels, in column "f".
label_ids = train_df["target_ind"].explode()
a = label_ids.value_counts(normalize=True).to_frame("f")

In [38]:
# Labels whose relative frequency lies strictly between 1% and 2%, most frequent first,
# with the matching row of `cats` joined on the index.
in_band = a["f"].gt(0.01) & a["f"].lt(0.02)
a.loc[in_band].sort_values("f", ascending=False).join(cats)

Unnamed: 0_level_0,f,0
target_ind,Unnamed: 1_level_1,Unnamed: 2_level_1
7083,0.016357,literature & fiction
7891,0.014879,movies
4038,0.012757,education & reference
10063,0.012549,rock
8108,0.011986,new
12630,0.011986,used & rental textbooks


In [6]:
# Category rows whose name contains "religion"; their index labels become `ids`.
religion_rows = cats.loc[cats[0].str.contains("religion")]
ids = religion_rows.index.tolist()
religion_rows

Unnamed: 0,0
2790,comparative religion
3241,cultures & religions
3988,earth-based religions
5886,history of religion
8445,other eastern religions
8446,other eastern religions & sacred texts
8453,other religions
9904,religion
9905,religion & spirituality
9906,religions


In [7]:
# Index labels of the category rows that matched "religion".
ids

[2790, 3241, 3988, 5886, 8445, 8446, 8453, 9904, 9905, 9906, 10371]

In [71]:
def _has_any_target(labels) -> bool:
    """Return True when at least one id from `ids` occurs in `labels`."""
    for wanted in ids:
        if wanted in labels:
            return True
    return False


# Boolean Series: True for training rows tagged with any of the selected label ids.
mask = train_df["target_ind"].map(_has_any_target)

In [72]:
# Training rows selected by `mask`.
train_df.loc[mask]

Unnamed: 0,uid,target_ind,text
13,13,"[7891, 7892]",Shrek - The Story So Far (Shrek 1 & 2 Full Scr...
26,26,"[7891, 7892]","Mission Impossible [VHS] (1996)\n\nA flashy, s..."
76,76,"[7891, 7892]",Student of Prague Collection (1913 & 1926 Vers...
95,95,"[7891, 7892]",Patch Adams - Collector's Edition (1998)\n\nPa...
105,105,"[7891, 7892]",Stonewall Jackson\n\nTells the remarkable life...
...,...,...,...
1186208,1186208,"[7891, 7892]",Jerry Springer Too Hot for Tv 2 Uncensored\n\n...
1186210,1186210,"[7891, 7892]",Riding High (1950)\n\nRIDING HIGH tells the st...
1186219,1186219,"[7891, 7892]",Roy Rogers - Grand Canyon Trail\n\nNo Descript...
1186225,1186225,"[7891, 7892]",Bardot - Et Dieu... créa la femme/Documentaire...


In [73]:
# Fraction of training rows selected by `mask`.
mask.sum() / len(train_df)

0.07499922022459218

In [None]:
# Hand-recorded approximate shares per label group. The original cell wrote these as
# `name = N%`, which is not valid Python, so they are kept as data instead.
# NOTE(review): presumably the fraction of training rows per group, as computed with
# `mask.sum() / len(train_df)` — confirm.
approx_label_share = {
    "fiction": 0.08,
    "movies": 0.07,
    "social science": 0.04,
    "religion": 0.02,
}

In [None]:
# Mapping of row index -> text for every training row whose `target_ind` contains 7893.
train_df.loc[train_df["target_ind"].map(lambda ex: 7893 in ex), "text"].to_dict()

In [None]:
# train_df.loc[train_df["target_ind"].map(lambda ex: 13199 in ex)]
# Third training row tagged with label 12583, shown as a dict. `train_df` is used because it
# is the frame that carries `target_ind`; `df` was last bound to the PubMed train frame,
# whose listed features contain no such column.
train_df.loc[train_df["target_ind"].map(lambda ex: 12583 in ex)].iloc[2].to_dict()

In [None]:
# Share of the rows of `df` that a hand-entered count of 30068 represents.
# NOTE(review): the origin of 30068 and which frame `df` should be here are not visible — confirm.
30068 / len(df)

In [None]:
# Model names; each one maps to an `embedding_<name>` column.
MODELS = [
    "all-mpnet-base-v2",
    "multi-qa-mpnet-base-dot-v1",
    "all-MiniLM-L12-v2",
]

In [None]:
# Embedding column names, in the same order as MODELS.
models = ["embedding_" + name for name in MODELS]

# Train split of the indexed EURLEX dataset, reduced to the uid plus the embedding columns.
eurlex_train = load_dataset("pietrolesci/eurlex_indexed")["train"]
df = eurlex_train.to_pandas()[["uid", *models]]

In [None]:
# Build one HNSW index per embedding model and save it next to a YAML file
# describing how it was built.
metric: str = "cosine"
ef_construction: int = 200
ef: int = 200
M: int = 64
num_threads: int = 5

path = Path("../data/processed/eurlex/hnswlib_indices")
path.mkdir(exist_ok=True, parents=True)

# The same uids label the vectors of every model, so the list is built once.
uids = df["uid"].tolist()

for model_name, model in tqdm(list(zip(MODELS, models))):

    embeddings = np.stack(df[model].values)
    dim = int(embeddings.shape[1])

    index = hb.Index(space=metric, dim=dim)
    index.init_index(
        max_elements=embeddings.shape[0],
        M=M,
        ef_construction=ef_construction,
        random_seed=42,
    )
    # Set the query-time ef only once the index has been initialised, following the
    # order used in hnswlib's documented examples.
    index.set_ef(ef)

    index.add_items(embeddings, uids, num_threads=num_threads)
    index.save_index(str(path / f"index_{metric}_{model_name}.bin"))

    # `dim` is stored too: hb.Index takes it together with the metric, so both are
    # needed to re-create the object before the saved index can be loaded.
    srsly.write_yaml(
        path / f"metadata_{metric}_{model_name}.yaml",
        {
            "metric": metric,
            "dim": dim,
            "ef_construction": ef_construction,
            "ef": ef,
            "M": M,
            "num_threads": num_threads,
        },
    )

In [None]:
def _label_3191(batch):
    """Per example in the batch: 1 when "3191" is in its `eurovoc_concepts`, else 0."""
    return {"labels": [int("3191" in concepts) for concepts in batch["eurovoc_concepts"]]}


data = ds_dict.map(_label_3191, batched=True)

In [None]:
# Labelled splits as pandas frames.
train_df, test_df = data["train"].to_pandas(), data["test"].to_pandas()

In [None]:
def _docs_per_concept(frame, column_name):
    """Count distinct `celex_id` values per eurovoc concept, as a one-column frame."""
    exploded = frame[["celex_id", "eurovoc_concepts"]].explode("eurovoc_concepts")
    per_concept = exploded.groupby("eurovoc_concepts")["celex_id"].nunique()
    return per_concept.to_frame(column_name)


train_freq = _docs_per_concept(train_df, "count_train")
test_freq = _docs_per_concept(test_df, "count_test")

In [None]:
# Records from eurovoc_concepts.jsonl, indexed by their `id` field.
concept_records = srsly.read_jsonl("../data/raw/eurlex-57k/eurovoc_concepts.jsonl")
desc = pd.DataFrame(concept_records).set_index("id")

In [None]:
# Join the test counts onto the train counts by index, then drop rows with a missing value.
f = train_freq.join(test_freq).dropna()

In [None]:
# Combined count per concept across both splits. The two count columns are named
# explicitly so that re-running this cell after `sum` (or any other column) has been
# added to `f` does not fold those columns into the total.
f["sum"] = f[["count_train", "count_test"]].sum(axis=1)

In [None]:
# Order the rows by `sum`, largest first.
f = f.sort_values("sum", ascending=False)

In [None]:
# Attach the columns of `desc`, matched on the index.
f = f.join(desc)

In [None]:
# Concepts with a combined count strictly between 1000 and 3500,
# ordered by test count (descending), then train count (ascending).
total = f["sum"]
f.loc[(total > 1000) & (total < 3500)].sort_values(
    by=["count_test", "count_train"], ascending=[False, True]
)

In [None]:
# make `health control` the target label
def _label_192(batch):
    """Per example in the batch: 1 when "192" is in its `eurovoc_concepts`, else 0."""
    return {"labels": [int("192" in concepts) for concepts in batch["eurovoc_concepts"]]}


data = ds_dict.map(_label_192, batched=True)
train_df, test_df = data["train"].to_pandas(), data["test"].to_pandas()

In [None]:
# Relative frequency of each `labels` value in the train and test frames, shown as a pair.
train_df["labels"].value_counts(True), test_df["labels"].value_counts(True)

In [None]:
# TF-IDF vectoriser over 1- to 3-grams and a logistic-regression classifier with default settings.
tfidf = TfidfVectorizer(ngram_range=(1, 3))
log_reg = LogisticRegression()

In [None]:
# Number of training rows per `document_type`.
train_df["document_type"].value_counts()

In [None]:
embedding_col = "embedding_all-mpnet-base-v2"

# Features: the per-row embeddings stacked into one array per split.
# (alternative kept for reference:
#  X = tfidf.fit_transform(train_df["text"].values)
#  X_test = tfidf.transform(test_df["text"].values))
X, X_test = (np.stack(frame[embedding_col].values) for frame in (train_df, test_df))

# Targets: the `labels` column of each split.
# (alternative kept for reference:
#  y = train_df["document_type"] == "Directive"
#  y_test = test_df["document_type"] == "Directive")
y, y_test = train_df["labels"].values, test_df["labels"].values

In [None]:
# Fit the classifier on the training features and targets.
log_reg.fit(X, y)

In [None]:
# Classification report for predictions on the test split.
print(classification_report(y_test, log_reg.predict(X_test)))

In [None]:
# Classification report for predictions on the training data itself.
print(classification_report(y, log_reg.predict(X)))

In [None]:
# Display the dataset currently bound to `ds_dict`.
ds_dict

In [None]:
# Share of training rows that a hand-entered count of 1609 represents.
# NOTE(review): the origin of 1609 is not visible here — confirm.
1609 / len(train_df)

In [None]:
# Relative frequency of each `document_type` in the training frame.
train_df["document_type"].value_counts(True)

In [None]:
# Training rows whose `eurovoc_concepts` contain "2173".
train_df.loc[train_df["eurovoc_concepts"].map(lambda ex: "2173" in ex)]

In [None]:
# Display `desc`, the frame read from eurovoc_concepts.jsonl.
desc

In [None]:
# get eurlex from http://nlp.cs.aueb.gr/software_and_datasets/EURLEX57K/ and unzip the folder into eurlex
path = Path("../eurlex/")
data = {}
# Only directories are treated as splits: a stray file next to them would make
# `split.iterdir()` raise NotADirectoryError. Entries are sorted because `iterdir()`
# yields them in arbitrary order, which would make the record order vary between runs.
for split in sorted(entry for entry in path.iterdir() if entry.is_dir()):
    # One parsed JSON document per file in the split directory.
    data[split.name] = [
        srsly.read_json(file) for file in tqdm(sorted(split.iterdir()), desc=split.name)
    ]

In [None]:
# One name per line, split on whitespace into a list of strings.
# Note: this rebinds `cats`, which an earlier cell bound to a DataFrame.
cats = """
Wireless_v1_00
Watches_v1_00
Video_Games_v1_00
Video_DVD_v1_00
Video_v1_00
Toys_v1_00
Tools_v1_00
Sports_v1_00
Software_v1_00
Shoes_v1_00
Pet_Products_v1_00
Personal_Care_Appliances_v1_00
PC_v1_00
Outdoors_v1_00
Office_Products_v1_00
Musical_Instruments_v1_00
Music_v1_00
Mobile_Electronics_v1_00
Mobile_Apps_v1_00
Major_Appliances_v1_00
Luggage_v1_00
Lawn_and_Garden_v1_00
Kitchen_v1_00
Jewelry_v1_00
Home_Improvement_v1_00
Home_Entertainment_v1_00
Home_v1_00
Health_Personal_Care_v1_00
Grocery_v1_00
Gift_Card_v1_00
Furniture_v1_00
Electronics_v1_00
Digital_Video_Games_v1_00
Digital_Video_Download_v1_00
Digital_Software_v1_00
Digital_Music_Purchase_v1_00
Digital_Ebook_Purchase_v1_00
Camera_v1_00
Beauty_v1_00
Baby_v1_00
Automotive_v1_00
Apparel_v1_00
Digital_Ebook_Purchase_v1_01
Books_v1_00
Books_v1_01
Books_v1_02
""".split()

In [None]:
# Indexed PubMed RCT dataset, restricted to the id, label, text and one embedding column.
pubmed_columns = ["uid", "labels", "text", "embedding_all-mpnet-base-v2"]
ds_dict = load_dataset("pietrolesci/pubmed-rct-200k_indexed").select_columns(pubmed_columns)

# One pandas frame per split.
train_df, test_df = (ds_dict[split].to_pandas() for split in ("train", "test"))

In [None]:
# Features: the per-row embeddings stacked into one array per split.
X, X_test = (
    np.stack(frame["embedding_all-mpnet-base-v2"].values) for frame in (train_df, test_df)
)

# Targets: True where the label equals 0, False otherwise.
y, y_test = (frame["labels"].values == 0 for frame in (train_df, test_df))

In [None]:
# Relative frequency of each `labels` value in the training frame.
train_df["labels"].value_counts(True)

In [None]:
# New classifier with default settings; rebinds `log_reg`.
log_reg = LogisticRegression()

In [None]:
# Fit the classifier on the training features and targets.
log_reg.fit(X, y)

In [None]:
# Classification report for predictions on the test split.
print(classification_report(y_test, log_reg.predict(X_test)))

In [None]:
# Classification report for predictions on the training data itself.
print(classification_report(y, log_reg.predict(X)))

In [None]:
# Number of rows in the training frame.
len(train_df)

In [None]:
# Load the pubmed-rct-200k_indexed dataset, rebinding `d`.
d = load_dataset("pietrolesci/pubmed-rct-200k_indexed")

In [None]:
# Display `a` (last assigned in this notebook as a relative label-frequency frame).
a