In [1]:
import os
import pandas as pd
# torch
import torch
# local
from utilities import get_device
from backend import SentenceTransformer, SplitEngine, Groupper, KeywordExtractor

In [2]:
### configuration
PATH = 'data'                  # directory joined with FILE_DATA / FILE_EMB when reading inputs
FILE_DATA = 'sample_data.csv'  # CSV read into df_data by the data-loading cell
FILE_EMB = 'all_doc_emb.pt'    # cached document embeddings, read with torch.load
LOAD_EMB_FROM_FILES = True     # True: load doc embeddings from FILE_EMB; False: re-encode all documents
RANDOM_SEED = 42
# Actually apply the seed: it was previously defined but never used.
# This seeds torch's global RNG only.
# NOTE(review): backend components that draw from other RNGs (e.g. numpy / sklearn)
# would need the seed passed to them explicitly - confirm in backend.
torch.manual_seed(RANDOM_SEED)
DEVICE = get_device()          # passed to SentenceTransformer(device=...) in the embedding cell

## Data Loading

In [3]:
### load data
# NOTE: label_text | label_id won't show up in the real data (this is just our benchmark)
df_data = pd.read_csv(os.path.join(PATH, FILE_DATA))

# one row per distinct label text among the rows flagged `pre_defined`,
# kept in order of first appearance
df_labels = pd.DataFrame(
    df_data[df_data.pre_defined]['label_text'].unique().tolist(),
    columns=['label_text'],
)
# put the id column first; ids are the 0-based row positions shifted so labeling starts from 1
df_labels.insert(0, 'label_id', df_labels.index + 1)

print(f'There are {len(df_labels)} pre-defined labels.')

There are 30 pre-defined labels.


## Workflow

<p align="left">
  <img src="images/workflow.png" alt="Workflow diagram"/>
</p>

### 1. Embedding

In [4]:
### initialize sentence transformer model
model_st = SentenceTransformer(device=DEVICE)

# encode the pre-defined labels
label_embs = model_st.encode(df_labels['label_text'].tolist())

# document embeddings: read the cached file when LOAD_EMB_FROM_FILES is set,
# otherwise encode every document text with the model
if LOAD_EMB_FROM_FILES:
    # NOTE(review): no map_location is given, so torch.load restores tensors to the
    # device they were saved from - confirm the cache file is loadable on this machine.
    doc_embs = torch.load(os.path.join(PATH, FILE_EMB))
else:
    doc_embs = model_st.encode(df_data['text'].tolist())
# to refresh the cache after re-encoding, run:
# torch.save(doc_embs, os.path.join(PATH, FILE_EMB))

### 2. Splitting

In [5]:
### splitting
# Keyword settings forwarded to SplitEngine unchanged.
# NOTE(review): their exact meaning is defined in backend.SplitEngine - confirm there before tuning.
split_kwargs = dict(threshold=0.65, label_red=False, min_samples=5)
split_agent = SplitEngine(df_data, df_labels, doc_embs, label_embs, **split_kwargs)

# run() returns two results, unpacked in this order
split_result = split_agent.run()
df_assigned, df_not_assigned_pre = split_result

  self.df_assigned = df_assigned.append(pd.DataFrame(container, columns=df_assigned.columns))


### 3. Grouping

In [6]:
### grouping
# parameters handed to Groupper as its third positional argument
cluster_params = {'n_clusters': 19}
groupper = Groupper(df_not_assigned_pre, doc_embs, cluster_params)

# run() produces the frame consumed by the keyword-extraction step
df_not_assigned = groupper.run()

In [None]:
### plotting
# Calls plot() on the Groupper instance built in the grouping cell, so that cell
# (including groupper.run()) must have been executed first.
# NOTE(review): what is drawn is defined in backend.Groupper.plot - not visible here.
groupper.plot()

### 4. Merge & Keyword Extraction

In [7]:
# Build the extractor from both upstream results: df_assigned (from the splitting
# step) and df_not_assigned (from the grouping step).
# NOTE(review): the positional 10 is undocumented here - presumably the number of
# keywords to extract per topic; confirm against backend.KeywordExtractor.
kwExtractor = KeywordExtractor(df_assigned, df_not_assigned, 10)
# run() is called for its effect on kwExtractor; any return value is discarded.
kwExtractor.run()
# NOTE(review): the positional 5 is presumably the number of documents listed per
# topic - confirm against get_topics_and_docs.
df_kw_summary = kwExtractor.get_topics_and_docs(df_data, 5)

### 5. Llama Representation Learning

In [65]:
### TODO

Unnamed: 0,label_id,index,text,count
0,0,"[34, 51, 101, 119, 120, 121, 142, 143, 195, 19...","[please list active alarms, set an alarm for t...",174.0
1,1,"[11366, 2082, 300, 10790, 1564]","[change the lights to a different hue, change ...",5.0
2,2,"[2823, 155, 9578, 1689, 1684]",[olly it's too bright in here can you turn a l...,5.0
