In [None]:
!pip install pandas tqdm
!pip install haystack-ai sentence-transformers

In [16]:
import pandas as pd
from tqdm import tqdm
import re
import string

# Read the CSV file
df = pd.read_csv("public_data/wi_dataset.csv")

# Columns kept for the rest of the notebook
columns = ["id", "title", "description"]

# Character class matching every ASCII punctuation mark.
# string.punctuation contains regex metacharacters (\ ] ^ -), so it must be
# escaped before being placed inside [...]: unescaped, the sequence "\]" is read
# as an escaped bracket and backslashes are never stripped.
punctuation_pattern = f"[{re.escape(string.punctuation)}]"

# Build the cleaned frame as a fresh object (no inplace mutation of a slice of
# df), so the cell is idempotent and does not raise SettingWithCopyWarning.
# Order of steps: drop duplicate descriptions -> drop rows with any missing
# value -> lowercase the description -> remove punctuation.
df_subset = (
    df[columns]
    .drop_duplicates(subset="description")
    .dropna()
    .assign(
        description=lambda frame: frame["description"]
        .str.lower()
        .str.replace(punctuation_pattern, "", regex=True)
    )
)

In [17]:
# Show the cleaned frame as this cell's output
df_subset

Unnamed: 0,id,title,description
0,872828466,Panel & Paint Technician,panel paint technician required in colchester...
1,839465958,"Lärare i slöjd och teknik för årkurs 7-9, Ljun...",sista ansökningsdatum 1 juni 2021 referensnumm...
2,857077872,Consultants in Emergency Medicine - Doughiska,the galway clinic is a leading 146 bed state o...
3,801801567,Senior IT Support Engineers,my client who has been continually growing thr...
4,855162927,Commercial Sales Representatives,jobbtitel commercial sales representatives abo...
...,...,...,...
25660,862998979,Продавач-консултант в шоурум Gallerato,описание и изисквания gallerato е фирма с дълг...
25661,793143661,Pedestrian Marshall,pedestrial marshall yardman do you have a dbs...
25662,725881734,Unity Gameplay Developer (f/m/d),unity gameplay developer fmd job bei sunday g...
25663,880528881,Carpenter 4505717 | careers4a.com,what job title keywords or skills where co...


In [2]:
import pandas as pd
from tqdm import tqdm

# Columns of the labels file that are used downstream
columns = ["code", "label", "description"]

# Load the labels CSV, then keep just those columns
df = pd.read_csv("public_data/wi_labels.csv")
df_labels = df.loc[:, columns]


In [3]:
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

def _label_row_to_document(row):
    """Turn one row of df_labels into a Document: description as content, code/label as meta."""
    label_meta = {"code": row["code"], "label": row["label"]}
    return Document(content=row["description"], meta=label_meta)


# In-memory store configured to compare embeddings with the "cosine" function
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

# No model name is passed, so the embedder falls back to its default
# (noted by the original author as sentence-transformers/all-mpnet-base-v2)
document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()

# Embed every label description and index the resulting documents
documents = [_label_row_to_document(row) for _, row in df_labels.iterrows()]
embedding_output = document_embedder.run(documents)
documents_with_embeddings = embedding_output['documents']
document_store.write_documents(documents_with_embeddings)

# Query side: embed the query text, then feed that embedding to the retriever
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 14/14 [00:02<00:00,  5.88it/s]


<haystack.core.pipeline.pipeline.Pipeline object at 0x7f4970407190>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])

In [4]:
from IPython.utils import io

def get_query_results(query: str):
    """Return the ``(code, label)`` metadata of the first document retrieved for ``query``.

    The text is sent to the ``text_embedder`` input of ``query_pipeline`` and the
    first entry of the retriever's ``documents`` list is used (presumably the
    best match; confirm against the retriever's ordering). Anything the pipeline
    prints while running is captured and discarded.

    Raises:
        IndexError: if the retriever returns no documents.
    """
    # Silence the pipeline's own output; the captured text is not used
    with io.capture_output():
        result = query_pipeline.run({"text_embedder": {"text": query}})
    first_document = result['retriever']['documents'][0]
    return first_document.meta['code'], first_document.meta['label']

In [7]:
from tqdm import tqdm
# Register the progress_apply variants on pandas objects
tqdm.pandas()


def _predict_for_row(row):
    """Wrap the (code, label) pair predicted from one row's description in a Series."""
    return pd.Series(get_query_results(row["description"]))


# Classify every posting row by row; the two resulting columns are then named
df_result = df_subset.progress_apply(_predict_for_row, axis=1)
df_result.columns = ["pred_code", "pred_label"]

  0%|          | 0/25080 [00:00<?, ?it/s]

100%|██████████| 25080/25080 [14:42<00:00, 28.41it/s]


In [8]:
# Attach the predictions to the cleaned postings; concat with axis=1 aligns the
# two frames on their shared row index.
# Only the prediction columns are taken from df_result so that this cell is
# idempotent: re-running it after df_result already holds the combined frame
# no longer appends a second copy of id/title/description.
df_result = pd.concat([df_subset, df_result[["pred_code", "pred_label"]]], axis=1)

In [10]:
# Write the combined frame to disk, leaving the row index out of the file
df_result.to_csv(path_or_buf="classification.csv", index=False)