In [None]:
!pip install pandas tqdm
!pip install haystack-ai sentence-transformers

In [6]:
import re
import string

import pandas as pd
from tqdm import tqdm

# Read the CSV file
df = pd.read_csv("public_data/wi_dataset.csv")

# Keep only the columns used downstream, then drop rows whose description is a
# duplicate and rows with any missing value (same order as before).
# The non-inplace chain yields a fresh frame, so the column assignment below does
# not write into a slice of `df`. The original index is kept on purpose: a later
# cell joins the predictions back onto `df_subset` by index.
columns = ["id", "title", "description"]
df_subset = (
    df[columns]
    .drop_duplicates(subset="description")
    .dropna()
)

# Lowercase the text and strip ASCII punctuation.
# `string.punctuation` contains `\` followed by `]`; placed unescaped inside a
# character class that pair is read as an escaped `]`, so the backslash itself
# was never removed. `re.escape` makes every punctuation character a literal.
punctuation_pattern = f"[{re.escape(string.punctuation)}]"
df_subset["description"] = (
    df_subset["description"]
    .str.lower()
    .str.replace(punctuation_pattern, "", regex=True)
)

In [17]:
# Show the cleaned job-ad frame (id, title, normalised description) as the cell output.
df_subset

Unnamed: 0,id,title,description
0,872828466,Panel & Paint Technician,panel paint technician required in colchester...
1,839465958,"Lärare i slöjd och teknik för årkurs 7-9, Ljun...",sista ansökningsdatum 1 juni 2021 referensnumm...
2,857077872,Consultants in Emergency Medicine - Doughiska,the galway clinic is a leading 146 bed state o...
3,801801567,Senior IT Support Engineers,my client who has been continually growing thr...
4,855162927,Commercial Sales Representatives,jobbtitel commercial sales representatives abo...
...,...,...,...
25660,862998979,Продавач-консултант в шоурум Gallerato,описание и изисквания gallerato е фирма с дълг...
25661,793143661,Pedestrian Marshall,pedestrial marshall yardman do you have a dbs...
25662,725881734,Unity Gameplay Developer (f/m/d),unity gameplay developer fmd job bei sunday g...
25663,880528881,Carpenter 4505717 | careers4a.com,what job title keywords or skills where co...


In [7]:
import pandas as pd
from tqdm import tqdm

# Load the label table.
# Note: this rebinds `df`, the same name the job-ad dataset was read into earlier.
# NOTE(review): `code` is parsed with pandas' default dtype inference; if any code
# can start with a zero it would lose it here -- confirm against the CSV.
df = pd.read_csv("public_data/wi_labels.csv")

# Narrow the table to the three fields that feed the document store.
columns = ["code", "label", "description"]
df_labels = df.loc[:, columns]


In [8]:
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

# In-memory store configured to compare embeddings with cosine similarity.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

# No model name is passed, so the embedder's default is used
# (sentence-transformers/all-mpnet-base-v2, per the original note).
document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()


def _label_row_to_document(row):
    """Wrap one label row: the description is the content, code and label go in meta."""
    return Document(
        content=row["description"],
        meta={"code": row["code"], "label": row["label"]},
    )


# Embed every label description once and index the embedded documents.
documents = [_label_row_to_document(row) for _, row in df_labels.iterrows()]
embedding_output = document_embedder.run(documents)
documents_with_embeddings = embedding_output['documents']
document_store.write_documents(documents_with_embeddings)

# Query side: embed the query text, then retrieve documents from the store by embedding.
query_pipeline = Pipeline()
query_pipeline.add_component(
    "text_embedder",
    SentenceTransformersTextEmbedder(),
)
query_pipeline.add_component(
    "retriever",
    InMemoryEmbeddingRetriever(document_store=document_store),
)
# Kept as the last expression so the cell still displays the wired pipeline.
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

Batches: 100%|██████████| 14/14 [00:01<00:00, 13.12it/s]


<haystack.core.pipeline.pipeline.Pipeline object at 0x7fdf91cf6610>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])

In [23]:
from IPython.utils import io

def get_query_results(query: str, top_k: int = 5):
    """Run `query` through `query_pipeline` and summarise the retrieved documents.

    Args:
        query: Text passed to the pipeline's ``text_embedder`` component.
        top_k: Number of documents requested from the ``retriever`` component.

    Returns:
        A ``(codes, labels)`` tuple of two strings: the ``code`` and ``label``
        meta values of the retrieved documents, each joined with ``", "`` in
        the order the retriever returned them.
    """
    pipeline_inputs = {
        "text_embedder": {"text": query},
        "retriever": {"top_k": top_k},
    }
    # Output emitted while the pipeline runs is captured and discarded
    # (presumably the embedder's per-call progress bar).
    with io.capture_output():
        prediction = query_pipeline.run(pipeline_inputs)

    codes, labels = [], []
    for doc in prediction['retriever']['documents']:
        codes.append(str(doc.meta['code']))
        labels.append(str(doc.meta['label']))
    return ', '.join(codes), ', '.join(labels)

In [16]:
from tqdm import tqdm
# Register the `progress_apply` variants on pandas objects.
tqdm.pandas()


def _predict_for_row(row):
    """Return the (codes, labels) pair for one job ad as a two-element Series."""
    return pd.Series(get_query_results(row["description"]))


# One retrieval query per job ad; the result keeps `df_subset`'s index and has
# one column per element of the returned pair.
df_result = df_subset.progress_apply(_predict_for_row, axis=1)
df_result.columns = ["pred_code", "pred_label"]

100%|██████████| 25080/25080 [10:43<00:00, 38.95it/s]


In [17]:
# Put the predictions next to the job ads they belong to (aligned on the shared index).
# Only the two prediction columns are taken from `df_result`, so re-running this
# cell does not stack a second copy of id/title/description beside the first.
df_result = pd.concat([df_subset, df_result[["pred_code", "pred_label"]]], axis=1)

In [19]:
# Show the job ads together with their predicted codes and labels as the cell output.
df_result

Unnamed: 0,id,title,description,pred_code,pred_label
0,872828466,Panel & Paint Technician,panel paint technician required in colchester...,"7132, 7231, 8122, 9122, 9329","Spray painters and varnishers, Motor vehicle m..."
1,839465958,"Lärare i slöjd och teknik för årkurs 7-9, Ljun...",sista ansökningsdatum 1 juni 2021 referensnumm...,"2529, 1330, 4222, 2513, 3314",Database and network professionals not elsewhe...
2,857077872,Consultants in Emergency Medicine - Doughiska,the galway clinic is a leading 146 bed state o...,"2240, 2212, 3259, 2221, 2211","Paramedical practitioners, Specialist medical ..."
3,801801567,Senior IT Support Engineers,my client who has been continually growing thr...,"1330, 3512, 3333, 2523, 2433",Information and communications technology serv...
4,855162927,Commercial Sales Representatives,jobbtitel commercial sales representatives abo...,"5244, 4221, 2433, 3333, 3322","Contact centre salespersons, Travel consultant..."
...,...,...,...,...,...
25660,862998979,Продавач-консултант в шоурум Gallerato,описание и изисквания gallerato е фирма с дълг...,"7535, 8155, 6129, 2632, 6330","Pelt dressers, tanners and fellmongers, Fur an..."
25661,793143661,Pedestrian Marshall,pedestrial marshall yardman do you have a dbs...,"3333, 4222, 9313, 1439, 4419","Employment agents and contractors, Contact cen..."
25662,725881734,Unity Gameplay Developer (f/m/d),unity gameplay developer fmd job bei sunday g...,"1439, 3333, 2423, 1219, 8114","Services managers not elsewhere classified, Em..."
25663,880528881,Carpenter 4505717 | careers4a.com,what job title keywords or skills where co...,"9313, 1323, 7111, 7115, 9312","Building construction labourers, Construction ..."


In [20]:
# Write the predictions to a CSV in the working directory; index=False leaves the
# DataFrame index out of the file.
df_result.to_csv("classification_top_5.csv", index=False)

In [24]:
# Ad-hoc check: request only the single best match (top_k=1) for an arbitrary sentence.
get_query_results("The quick brown fox jumps over the lazy dog", top_k=1)

('5164', 'Pet groomers and animal care workers')