In [1]:
!pip install pandas tqdm
!pip install haystack-ai sentence-transformers
!pip install chroma-haystack

Collecting chroma-haystack
  Downloading chroma_haystack-0.21.1-py3-none-any.whl.metadata (2.2 kB)
Collecting chromadb>=0.5.0 (from chroma-haystack)
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb>=0.5.0->chroma-haystack)
  Downloading build-1.2.1-py3-none-any.whl.metadata (4.3 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb>=0.5.0->chroma-haystack)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb>=0.5.0->chroma-haystack)
  Downloading fastapi-0.112.2-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb>=0.5.0->chroma-haystack)
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting onnxruntime>=1.14.1 (from chromadb>=0.5.0->chroma-haystack)
  Downloading onnxruntime-1.19.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Col

In [2]:
import pandas as pd
from tqdm import tqdm
import string
# Read the raw job advertisements.
df = pd.read_csv("public_data/wi_dataset.csv")

# Columns used by the rest of the notebook.
columns = ["id", "title", "description"]

# Translation table that deletes every character in string.punctuation.
# A plain f'[{string.punctuation}]' regex is NOT equivalent: inside the character
# class the sequence `\]` is parsed as an escaped `]`, so literal backslashes
# would be left in the text.
punctuation_table = str.maketrans("", "", string.punctuation)

# Build the cleaned frame as a single non-mutating chain. The original used
# inplace operations on a slice of `df`, which raises SettingWithCopyWarning
# and makes the cell depend on how often it has been run.
df_subset = (
    df[columns]
    # Keep the first advertisement for each distinct description.
    .drop_duplicates(subset="description")
    # Drop rows with a missing id, title or description.
    .dropna()
    # Normalise the text: lowercase, then strip punctuation.
    .assign(
        description=lambda frame: frame["description"]
        .str.lower()
        .str.translate(punctuation_table)
    )
)

In [17]:
# Display the cleaned job-ad frame (id, title, normalised description).
df_subset

Unnamed: 0,id,title,description
0,872828466,Panel & Paint Technician,panel paint technician required in colchester...
1,839465958,"Lärare i slöjd och teknik för årkurs 7-9, Ljun...",sista ansökningsdatum 1 juni 2021 referensnumm...
2,857077872,Consultants in Emergency Medicine - Doughiska,the galway clinic is a leading 146 bed state o...
3,801801567,Senior IT Support Engineers,my client who has been continually growing thr...
4,855162927,Commercial Sales Representatives,jobbtitel commercial sales representatives abo...
...,...,...,...
25660,862998979,Продавач-консултант в шоурум Gallerato,описание и изисквания gallerato е фирма с дълг...
25661,793143661,Pedestrian Marshall,pedestrial marshall yardman do you have a dbs...
25662,725881734,Unity Gameplay Developer (f/m/d),unity gameplay developer fmd job bei sunday g...
25663,880528881,Carpenter 4505717 | careers4a.com,what job title keywords or skills where co...


In [3]:
import pandas as pd
from tqdm import tqdm

# Load the label table that the job ads are matched against.
# NOTE: this rebinds `df` and `columns`, which earlier referred to the job-ad data.
df = pd.read_csv("public_data/wi_labels.csv")

# Keep only the code, its label and its textual description.
columns = ["code", "label", "description"]
df_labels = df.loc[:, columns]


In [4]:
from haystack import Document
from haystack import Pipeline
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder

# Embedding model shared by the document embedder and the query embedder.
# sentence-transformers/all-mpnet-base-v2 for default
model = "dunzhang/stella_en_1.5B_v5"
# model = "BAAI/bge-multilingual-gemma2"

# Chroma-backed store configured to compare embeddings by cosine distance.
document_store = ChromaDocumentStore(distance_function="cosine")

# Prepare the embedder before embedding the label documents.
document_embedder = SentenceTransformersDocumentEmbedder(model=model)
document_embedder.warm_up()


def _label_row_to_document(row):
    """Wrap one label row as a Document: description as content, code and label as metadata."""
    return Document(
        content=row["description"],
        meta={"code": row["code"], "label": row["label"]},
    )


# One document per label row; embed them all and index them in the store.
documents = [_label_row_to_document(row) for _, row in df_labels.iterrows()]
embedded = document_embedder.run(documents)
documents_with_embeddings = embedded['documents']
document_store.write_documents(documents_with_embeddings)

# Query side: embed the query text, then retrieve the nearest label documents.
query_pipeline = Pipeline()
# Alternative kept for reference: embed queries with an instruction prefix.
# instruction = "Given a job advertisement, retrieve relevant job descriptions that matches the query."
# query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(model=model, prefix=instruction))
query_embedder = SentenceTransformersTextEmbedder(model=model)
label_retriever = ChromaEmbeddingRetriever(document_store=document_store)
query_pipeline.add_component("text_embedder", query_embedder)
query_pipeline.add_component("retriever", label_retriever)
# Feed the query embedding into the retriever (last expression: shows the pipeline summary).
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

  from .autonotebook import tqdm as notebook_tqdm
Batches:   0%|          | 0/14 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Batches: 100%|██████████| 14/14 [00:10<00:00,  1.29it/s]


<haystack.core.pipeline.pipeline.Pipeline object at 0x7fe0f0a941f0>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: ChromaEmbeddingRetriever
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])

In [5]:
from IPython.utils import io

def get_query_results(query: str, top_k: int = 5):
    """Retrieve the labels closest to ``query`` and return them as joined strings.

    Args:
        query: Text passed to the pipeline's ``text_embedder`` component.
        top_k: Number of documents requested from the ``retriever`` component.

    Returns:
        A ``(codes, labels)`` tuple of two strings. Each string holds the ``code``
        (respectively ``label``) metadata of the retrieved documents, converted
        with ``str`` and joined by ``', '`` in the order the retriever returned them.
    """
    pipeline_inputs = {"text_embedder": {"text": query}, "retriever": {"top_k": top_k}}
    # Capture whatever the pipeline writes while running so that repeated calls do
    # not flood the cell output (presumably the embedder's "Batches" progress bar).
    with io.capture_output():
        prediction = query_pipeline.run(pipeline_inputs)
    retrieved = prediction['retriever']['documents']
    codes = ', '.join(str(doc.meta['code']) for doc in retrieved)
    labels = ', '.join(str(doc.meta['label']) for doc in retrieved)
    return codes, labels

In [6]:
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Run one retrieval per description. Applying over the column (instead of row-wise
# over the frame with a pd.Series built per row) avoids allocating a Series for
# every advertisement; each call returns a (codes, labels) tuple.
predictions = df_subset["description"].progress_apply(get_query_results)

# Expand the tuples into two named columns, keeping df_subset's index so the
# predictions can be aligned with the advertisements later. Naming the columns
# here (rather than renaming positionally afterwards) also works for an empty frame.
df_result = pd.DataFrame(
    predictions.tolist(),
    index=df_subset.index,
    columns=["pred_code", "pred_label"],
)

100%|██████████| 25080/25080 [16:19<00:00, 25.61it/s]


In [7]:
# Attach the predictions to the advertisements, aligned on the shared index.
# Only the two prediction columns are taken from `df_result`, so re-running this
# cell yields the same frame instead of duplicating id/title/description
# (the name `df_result` is rebound to the combined frame here).
df_result = pd.concat([df_subset, df_result[["pred_code", "pred_label"]]], axis=1)

In [8]:
# Display the advertisements together with their predicted codes and labels.
df_result

Unnamed: 0,id,title,description,pred_code,pred_label
0,872828466,Panel & Paint Technician,panel paint technician required in colchester...,"7231, 7132, 5245, 9122, 9622","Motor vehicle mechanics and repairers, Spray p..."
1,839465958,"Lärare i slöjd och teknik för årkurs 7-9, Ljun...",sista ansökningsdatum 1 juni 2021 referensnumm...,"1345, 2359, 2330, 3333, 2353","Education managers, Teaching professionals not..."
2,857077872,Consultants in Emergency Medicine - Doughiska,the galway clinic is a leading 146 bed state o...,"1342, 2221, 3258, 1343, 2212","Health services managers, Nursing professional..."
3,801801567,Senior IT Support Engineers,my client who has been continually growing thr...,"3512, 2523, 3522, 3513, 4222",Information and communications technology user...
4,855162927,Commercial Sales Representatives,jobbtitel commercial sales representatives abo...,"2433, 3322, 5244, 2434, 2511",Technical and medical sales professionals (exc...
...,...,...,...,...,...
25660,862998979,Продавач-консултант в шоурум Gallerato,описание и изисквания gallerato е фирма с дълг...,"3322, 5242, 2433, 5243, 5223","Commercial sales representatives, Sales demons..."
25661,793143661,Pedestrian Marshall,pedestrial marshall yardman do you have a dbs...,"9613, 9329, 9510, 9111, 9211","Sweepers and related labourers, Manufacturing ..."
25662,725881734,Unity Gameplay Developer (f/m/d),unity gameplay developer fmd job bei sunday g...,"3333, 2423, 2511, 2166, 2433","Employment agents and contractors, Personnel a..."
25663,880528881,Carpenter 4505717 | careers4a.com,what job title keywords or skills where co...,"7121, 7111, 3333, 7119, 7115","Roofers, House builders, Employment agents and..."


In [8]:
# Write the combined frame to CSV; index=False leaves the DataFrame index out of the file.
df_result.to_csv("classification_top_5_.csv", index=False)

In [24]:
# Smoke test on an unrelated sentence: top_k=1 returns a single (code, label) pair.
get_query_results("The quick brown fox jumps over the lazy dog", top_k=1)

('5164', 'Pet groomers and animal care workers')