In [1]:
!pip install pandas tqdm
!pip install haystack-ai sentence-transformers

Collecting tqdm
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.5
Collecting haystack-ai
  Downloading haystack_ai-2.4.0-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting haystack-experimental (from haystack-ai)
  Downloading haystack_experimental-0.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting lazy-imports (from haystack-ai)
  Downloading lazy_imports-0.3.1-py3-none-any.whl.metadata (10 kB)
Collecting more-itertools (from haystack-ai)
  Downloading more_itertools-10.4.0-py3-none-any.whl.metadata (36 kB)
Collecting networkx (from haystack-ai)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting openai>=1.1.0 (from haystack-ai)
  Downloading openai-1.42.0-py3-none-any.whl.metadata (22 kB)
Collecting posthog (from haystack-ai)
  Downl

In [1]:
import pandas as pd
from tqdm import tqdm
import string
# Read the CSV file
df = pd.read_csv("public_data/wi_dataset.csv")

# Extract the desired columns
columns = ["id", "title", "description"]
df_subset = df[columns]
# Drop duplicates
df_subset.drop_duplicates(subset="description", inplace=True)
# Drop rows with missing values
df_subset.dropna(inplace=True)
# Lowercase
df_subset["description"] = df_subset["description"].str.lower()
# Remove punctuation
df_subset["description"] = df_subset["description"].str.replace(f'[{string.punctuation}]', '', regex=True)

In [17]:
df_subset

Unnamed: 0,id,title,description
0,872828466,Panel & Paint Technician,panel paint technician required in colchester...
1,839465958,"Lärare i slöjd och teknik för årkurs 7-9, Ljun...",sista ansökningsdatum 1 juni 2021 referensnumm...
2,857077872,Consultants in Emergency Medicine - Doughiska,the galway clinic is a leading 146 bed state o...
3,801801567,Senior IT Support Engineers,my client who has been continually growing thr...
4,855162927,Commercial Sales Representatives,jobbtitel commercial sales representatives abo...
...,...,...,...
25660,862998979,Продавач-консултант в шоурум Gallerato,описание и изисквания gallerato е фирма с дълг...
25661,793143661,Pedestrian Marshall,pedestrial marshall yardman do you have a dbs...
25662,725881734,Unity Gameplay Developer (f/m/d),unity gameplay developer fmd job bei sunday g...
25663,880528881,Carpenter 4505717 | careers4a.com,what job title keywords or skills where co...


In [2]:
import pandas as pd
from tqdm import tqdm

# Read the CSV file
df = pd.read_csv("public_data/wi_labels.csv")

# Extract the desired columns
columns = ["code", "label", "description"]
df_labels = df[columns]


In [3]:
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
# sentence-transformers/all-mpnet-base-v2 for default
# model="dunzhang/stella_en_1.5B_v5"
model = "BAAI/bge-multilingual-gemma2"
document_embedder = SentenceTransformersDocumentEmbedder(model=model)
document_embedder.warm_up()

documents = [Document(content=row["description"], meta={"code": row["code"], "label": row["label"]}) for _, row in df_labels.iterrows()]
documents_with_embeddings = document_embedder.run(documents)['documents']
document_store.write_documents(documents_with_embeddings)

query_pipeline = Pipeline()
instruction = "Given a job advertisement, retrieve relevant job descriptions that matches the query."
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder(model=model, prefix=instruction))
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

  from .autonotebook import tqdm as notebook_tqdm
Batches:   0%|          | 0/14 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Batches: 100%|██████████| 14/14 [00:10<00:00,  1.30it/s]


<haystack.core.pipeline.pipeline.Pipeline object at 0x7efbe1a8bfd0>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])

In [4]:
from IPython.utils import io

def get_query_results(query: str, top_k: int = 5):
    with io.capture_output() as captured:
        prediction = query_pipeline.run({"text_embedder":{"text": query}, "retriever": {"top_k": top_k}})
    codes = [str(doc.meta['code']) for doc in prediction['retriever']['documents']]
    labels = [str(doc.meta['label']) for doc in prediction['retriever']['documents']]
    return ', '.join(codes), ', '.join(labels)

In [5]:
from tqdm import tqdm
tqdm.pandas()
df_result = df_subset.progress_apply(lambda row: pd.Series(get_query_results(row["title"]+"\n"+row["description"])), axis=1)
df_result.columns = ["pred_code", "pred_label"]

100%|██████████| 25080/25080 [36:09<00:00, 11.56it/s]


In [6]:
df_result = pd.concat([df_subset, df_result], axis=1)

In [7]:
df_result

Unnamed: 0,id,title,description,pred_code,pred_label
0,872828466,Panel & Paint Technician,panel paint technician required in colchester...,"7231, 7132, 5245, 9122, 9622","Motor vehicle mechanics and repairers, Spray p..."
1,839465958,"Lärare i slöjd och teknik för årkurs 7-9, Ljun...",sista ansökningsdatum 1 juni 2021 referensnumm...,"1345, 2359, 4416, 2330, 3333","Education managers, Teaching professionals not..."
2,857077872,Consultants in Emergency Medicine - Doughiska,the galway clinic is a leading 146 bed state o...,"1342, 2221, 3258, 1343, 2212","Health services managers, Nursing professional..."
3,801801567,Senior IT Support Engineers,my client who has been continually growing thr...,"3512, 2523, 3522, 3513, 4222",Information and communications technology user...
4,855162927,Commercial Sales Representatives,jobbtitel commercial sales representatives abo...,"2433, 3322, 5244, 2434, 2511",Technical and medical sales professionals (exc...
...,...,...,...,...,...
25660,862998979,Продавач-консултант в шоурум Gallerato,описание и изисквания gallerato е фирма с дълг...,"3322, 5242, 2433, 5243, 5223","Commercial sales representatives, Sales demons..."
25661,793143661,Pedestrian Marshall,pedestrial marshall yardman do you have a dbs...,"9613, 9329, 9510, 9123, 9111","Sweepers and related labourers, Manufacturing ..."
25662,725881734,Unity Gameplay Developer (f/m/d),unity gameplay developer fmd job bei sunday g...,"3333, 2423, 2511, 2166, 2433","Employment agents and contractors, Personnel a..."
25663,880528881,Carpenter 4505717 | careers4a.com,what job title keywords or skills where co...,"7121, 7111, 3333, 7119, 7115","Roofers, House builders, Employment agents and..."


In [8]:
df_result.to_csv("classification_top_5_.csv", index=False)

In [24]:
get_query_results("The quick brown fox jumps over the lazy dog", top_k=1)

('5164', 'Pet groomers and animal care workers')