In [1]:
import pandas as pd
import numpy as np
import json
import os
import uuid

from sentence_transformers import SentenceTransformer, util
# NOTE(review): this module-level `model` is not referenced by any other cell
# shown in this notebook; Tokenizer.__init__ constructs its own
# SentenceTransformer, so the model is instantiated twice. Confirm nothing
# outside this view uses `model` before removing it.
model = SentenceTransformer('all-MiniLM-L6-v2')

from tqdm.auto import tqdm
# Called so that `progress_apply` is available on pandas objects; the cell
# that builds the 'vector' column uses it.
tqdm.pandas()

In [2]:
# Read the raw job-postings CSV (path is relative to the notebook's working
# directory) and display its (rows, columns) shape.
csv_path = "data job posts.csv"
df = pd.read_csv(csv_path)
df.shape

(19001, 24)

In [3]:
# Show only the first posting, to see which columns are available.
df.iloc[:1]

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False


In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

class Tokenizer(object):
    """Sentence-embedding helper with a brute-force cosine-similarity search.

    Despite its name, this class does not tokenize text: it wraps a
    ``SentenceTransformer`` model, converts text into embedding vectors, and
    ranks stored vectors against a query vector.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the model used for every embedding.

        Args:
            model_name: Value passed to ``SentenceTransformer``. Defaults to
                'all-MiniLM-L6-v2', the model this class always loaded before
                the parameter existed.
        """
        self.model = SentenceTransformer(model_name)

    def get_token(self, documents):
        """Embed one piece of text and return the vector as a plain list.

        Args:
            documents: The text to embed. It is wrapped in a one-element list
                before encoding, so it is always treated as a single input.

        Returns:
            list: The first (and only) row of the encoder output, converted to
            a Python list.
        """
        embeddings = self.model.encode([documents])
        return np.asarray(embeddings).tolist()[0]

    def semantic_search(self, query_vector, document_vectors, documents, top_k=10):
        """Return the ``top_k`` documents most similar to ``query_vector``.

        Args:
            query_vector: Embedding of the query (one vector).
            document_vectors: Sequence of embeddings, one per document.
            documents: Sequence indexed in parallel with ``document_vectors``;
                its items are what gets returned.
            top_k: Maximum number of results. ``None`` returns every document,
                ranked. Zero or a negative value returns an empty list.

        Returns:
            list: Items of ``documents`` ordered from most to least similar.
        """
        # A negative top_k would otherwise slice as "all but the last k", and
        # an empty corpus would make cosine_similarity raise; both cases are
        # simply "no results".
        if top_k is not None and top_k <= 0:
            return []
        if len(document_vectors) == 0:
            return []

        # Compute cosine similarity between the query vector and document vectors
        similarities = cosine_similarity(np.array([query_vector]), document_vectors).flatten()
        # Ascending argsort reversed gives descending similarity; keep the first top_k
        top_indices = np.argsort(similarities)[::-1][:top_k]
        # Map the ranked positions back to the caller's documents
        return [documents[idx] for idx in top_indices]


In [5]:
# Build the single embedding helper reused by the following cells; constructing
# it instantiates a SentenceTransformer (see Tokenizer.__init__).
token_instance = Tokenizer()

In [6]:
# Work on the first 5,000 postings only.
# dropna(how='all') removes only rows where *every* column is missing, so a
# row with no `jobpost` text would survive it and then be handed to the
# encoder in the embedding cell. Drop those rows explicitly as well.
df = (
    df.head(5000)
    .dropna(how='all')
    .dropna(subset=['jobpost'])
)

In [12]:
# Embed every posting's full text; get_token is applied row by row, and
# progress_apply shows a progress bar while it runs.
job_vectors = df['jobpost'].progress_apply(token_instance.get_token)
df['vector'] = job_vectors

Pandas Apply:   0%|          | 0/5000 [00:00<?, ?it/s]

In [13]:
# One dict per row, keyed by column name.
elk_data = df.to_dict(orient="records")

In [14]:
# Indexing data into a list of dictionaries
def _blank_if_missing(value):
    """Return "" for a missing scalar (None or float NaN); otherwise return value unchanged."""
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return value


# df.to_dict("records") gives every record a key for every column, so the
# .get() defaults below only matter if a column is absent altogether; missing
# cells arrive as NaN and are normalized to "" so they do not print as "nan".
indexed_data = [
    {
        "title": _blank_if_missing(record.get("Title", "")),
        "company": _blank_if_missing(record.get("Company", "")),
        "location": _blank_if_missing(record.get("Location", "")),
        "salary": _blank_if_missing(record.get("Salary", "")),
        # The embedding is a list, not a scalar, so it is passed through as-is.
        "vector": record.get("vector", ""),
        "job_description": _blank_if_missing(record.get("JobDescription", "")),
    }
    for record in elk_data
]

In [15]:
# Ask for a free-text query and embed it with the same helper used for the postings.
input_query = input("Enter the Input Query ")
token_vector = token_instance.get_token(input_query)

# Collect the stored embeddings in the same order as indexed_data, so that a
# position in this list identifies the matching entry.
document_vectors = []
for entry in indexed_data:
    document_vectors.append(entry["vector"])

Enter the Input Query I am looking for jobs on python Developer with aws and elasticsearch skills


In [21]:
# The query was already embedded in the previous cell; reuse that vector.
query_vector = token_vector

# Perform semantic search (semantic_search returns the 10 best entries by default)
search_results = token_instance.semantic_search(query_vector, document_vectors, indexed_data)

# Extract titles from search results
titles = [entry["title"] for entry in search_results]

output = {
    "Query": input_query,
    "Results": titles
}

# Print the formatted output, numbering results from 1
print("Search Query: ", output["Query"])
print("Search Results:")
for rank, title in enumerate(output["Results"], start=1):
    print(f"{rank}. {title}")
    print("------------------------")


print("------------------------")
# Prompt for the index selection (1-based on screen, 0-based internally).
# Blank or non-numeric input makes int() raise ValueError; treat it like any
# other out-of-range choice instead of aborting the cell with a traceback.
try:
    index_selection = int(input("Enter the index number to view details: ")) - 1
except ValueError:
    index_selection = -1
print("------------------------")
# Check if the index selection is valid
if index_selection < 0 or index_selection >= len(search_results):
    print("Invalid index selection.")
else:
    # Get the selected search result
    selected_result = search_results[index_selection]

    # Position of the selected result in indexed_data. The search results are
    # items of indexed_data, so this lookup yields an entry equal to
    # selected_result.
    selected_index = indexed_data.index(selected_result)

    # Get the selected entry from indexed_data
    selected_entry = indexed_data[selected_index]

    # Print the details of the selected entry
    print("Selected Entry Details:")
    print("------------------------")
    print("Title:", selected_entry["title"])
    print("Company:", selected_entry["company"])
    print("Location:", selected_entry["location"])
    print("Salary:", selected_entry["salary"])
    print("Job Description:", selected_entry["job_description"])

Search Query:  I am looking for jobs on python Developer with aws and elasticsearch skills
Search Results:
1. Software Engineer
------------------------
2. PHP/MySQL Software Engineer
------------------------
3. Python Developers
------------------------
4. Search Engine Optimization Specialists
------------------------
5. Search Engine Optimization Specialist
------------------------
6. Search Engine Optimization Specialists
------------------------
7. Software Engineer / Senior Software Engineer
------------------------
8. Head of Sales Department
------------------------
9. Web Developer
------------------------
10. Field Application Engineer
------------------------
------------------------
Enter the index number to view details: 4
------------------------
Selected Entry Details:
------------------------
Title: Search Engine Optimization Specialists
Company: LinkGard Systems, LLC
Location: Yerevan, Armenia
Salary: Competitive. Based on experience.
Job Description: LinkGard Systems 