In [1]:
import os
import pandas as pd
# torch
import torch
# local
from backend import AutoLabeler, get_device

In [2]:
# --- notebook-wide constants -------------------------------------------------
# Seed value for reproducibility.
# NOTE(review): not consumed by any cell visible here -- confirm it is passed on
# to (or applied before) the stochastic steps.
RANDOM_SEED = 42
# Compute device as reported by the backend helper.
# NOTE(review): the workflow config below hard-codes its own "device" entry and
# does not read this value -- confirm which one is meant to win.
DEVICE = get_device()
# The benchmark CSV is read from os.path.join(PATH, FILE_DATA).
FILE_DATA = 'sample_data.csv'
PATH = 'data'

## Data Loading

In [3]:
### load data
# Number of leading rows kept for this demo run.
N_SAMPLES = 100
# label_text | label_id won't show up in the real data (this is just our benchmark).
# The raw frame keeps its own name so the benchmark columns stay inspectable.
df_raw = pd.read_csv(os.path.join(PATH, FILE_DATA))
# Collect the pre-defined labels from the full file, before any truncation.
unique_labels = df_raw.loc[df_raw.pre_defined, 'label_text'].unique().tolist()
print(f'There are {len(unique_labels)} pre-defined labels.')
# Mimic the real world: keep a small sample and only the text column.
# head() is positional, so at most N_SAMPLES rows are kept; a label slice such
# as .loc[:100] is end-inclusive and would keep up to 101 rows.
df_data = df_raw.head(N_SAMPLES)[['text']].reset_index()
# reset_index() moves the row index into the first column, which becomes the id.
df_data.columns = ['id', 'text']

There are 30 pre-defined labels.


## Workflow

In [4]:
# Settings handed to AutoLabeler; key order is kept as originally written.
config = dict(
    # --- runtime ---
    # NOTE(review): hard-coded here rather than taken from the DEVICE constant
    # defined earlier in the notebook -- confirm this is intentional.
    device="cpu",  # or cuda
    llm="sentence_transformer",
    # --- document embeddings ---
    load_emb_if_exists=True,
    # --- pre-defined labels ---
    pred_sim_measure="cosine",  # similarity measure for the initial assignment
    pred_sim_threshold=0.65,  # a sample is assigned when its similarity is above this
    # 0: eliminate topics that have nothing assigned;
    # > 0: this number of samples is enforced to be assigned
    pred_min_sample=5,
    # --- genai... ---
    # --- clustering ---
    n_cluster=3,
    precomputed=False,
    clustering_method="lda",
    dim_reduction="",
    red_dim=5,
)

In [5]:
# Build the labeler from the (id, text) frame, the list of pre-defined labels and
# the workflow config. Every later step is a method call on this object.
labeler = AutoLabeler(df_data, unique_labels, config=config)

Auto-Labeler Workspace is created /'AutoLabler/'.


In [6]:
### step 1: encode all documents, with or without an AI summary (TODO: needs llama)
# NOTE(review): presumably honours config["load_emb_if_exists"] to reuse stored
# embeddings -- confirm in backend.
labeler.encode_all_doc_with_ai()

In [7]:
### step 2: split
# NOTE(review): presumably separates documents that can be assigned to a
# pre-defined label (pred_* config keys) from the rest -- confirm in backend.
# NOTE(review): the warning recorded for this cell shows the backend building
# df_assigned with DataFrame.append, which no longer exists in pandas >= 2.0;
# the backend should move to pd.concat.
labeler.preprocess_predefined_label()

  self.df_assigned = df_assigned.append(pd.DataFrame(container, columns=df_assigned.columns))


In [8]:
### step 3: group the documents
# NOTE(review): the recorded output shows the NLTK stopwords download being
# triggered many times during this one call; presumably the backend requests it
# inside a loop -- consider doing it once up front.
labeler.group_docs()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_d

In [None]:
### step 4: keyword extraction
# NOTE(review): this cell has no execution count in the saved notebook, so it
# has not been run after step 3 here.
labeler.kw_extraction()