In [3]:
import numpy as np
import pandas as pd

# NLP and embedding tools
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#Document processing
import PyPDF2
from pdfminer.high_level import extract_text
from docx import document

#visulization
import matplotlib.pyplot as plt
import seaborn as sns

#utilities
import os
import  re 
import json
from tqdm import tqdm
import faiss

#machine learning tools
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfTransformer



In [21]:
from dotenv import load_dotenv
from huggingface_hub import login
load_dotenv()
hf_token = os.getenv("HUGGINGFACE_API_TOKEN")

if hf_token:
    login(token=hf_token)
    print("✅ Successfully logged into Hugging Face!")
else:
    print("❌ Hugging Face API token not found. Check your .env file.")

✅ Successfully logged into Hugging Face!


In [23]:
#embedding model
model = SentenceTransformer("all-MiniLM-L6-v2", token=hf_token)

#nltk resources
nltk.download("punkt")
nltk.download("stopwords")

#spacy model
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shind\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shind\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#loading the Job Description dataset
from datasets import load_dataset # type: ignore
dataset = load_dataset("jacob-hugging-face/job-descriptions")


Generating train split: 100%|██████████| 853/853 [00:00<00:00, 3257.42 examples/s]


In [57]:
dataset

DatasetDict({
    train: Dataset({
        features: ['company_name', 'job_description', 'position_title', 'description_length', 'model_response'],
        num_rows: 853
    })
})

In [30]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", token = hf_token)

def tokenize_func(examples):
    return tokenizer(examples["job_description"], padding= "max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(
    tokenize_func,
    batched = True,
    desc = "Tokenizing dataset"
)


Tokenizing dataset: 100%|██████████| 853/853 [00:01<00:00, 635.47 examples/s]


In [33]:
#create embeddings
def create_embeddings(examples):
    embeddings = model.encode(examples["job_description"], show_progress_bar=True)
    return {"embeddings": embeddings.tolist()}

embedded_dataset = dataset.map(
    create_embeddings,
    batched = True,
    batch_size = 32,
    desc = "Creating Embeddings"
)

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.67s/it]? examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.50s/it]:13, 11.18 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.51s/it]:05, 12.00 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.52s/it]:01, 12.28 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.45s/it]0:58, 12.38 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.59s/it]0:55, 12.56 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]0:53, 12.44 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]0:51, 12.11 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.38s/it]0:50, 11.85 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.62s/it]0:46, 12.26 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it]0:43, 12.21 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.40s/it]0:39, 12.56 examples/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]0:36, 12.73 examples/s]

In [37]:
print(embedded_dataset.column_names)

{'train': ['company_name', 'job_description', 'position_title', 'description_length', 'model_response', 'embeddings']}


In [38]:
#storing the embeddings in a faiss vector database
embeddings_array = np.array(embedded_dataset["train"]["embeddings"])

#storing in faiss
index = faiss.IndexFlatL2(384)
index.add(embeddings_array)

faiss.write_index(index, "faiss_index.bin")

In [40]:
#Encoding the query
query_text = "Looking for a machine learning role"
query_embedding = model.encode(query_text).reshape(1, -1)

In [41]:
index = faiss.read_index("faiss_index.bin")

D,I = index.search(query_embedding, k =5)

print(f"Top matching job indices: {I}")

Top matching job indices: [[665 179 430 828  74]]


In [48]:
def search_faiss(query_text, model, index, top_k=5):
    query_embedding = model.encode([query_text])  # Get embedding for query
    distances, indices = index.search(query_embedding, top_k)  # Search top-k results
    return distances, indices

query = "Software Engineer with experience in Python and AI"
distances, indices = search_faiss(query, model, index)

print("Nearest Neighbors:", indices)
print("Distances:", distances)

Nearest Neighbors: [[179  13 676  62 672]]
Distances: [[1.0211227 1.0506432 1.0827878 1.1002285 1.1348178]]


In [None]:
for idx in indices[0]:  # indices is a 2D array
    print(dataset["train"]["job_description"][idx])

In [55]:
faiss.write_index(index, "faiss_index.bin")


In [56]:
index = faiss.read_index("faiss_index.bin")

In [92]:
#Function to extract text from resume PDF

def extract_text_from_pdf(file_path):
    text = ""
    file_type = file_path.split(".")[-1].lower()

    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            extracted_text = page.extract_text()
            if extracted_text:  # Ensure text is not None
                text += extracted_text + "\n"
    return text


#Function to find the matching jobs


def find_matching_jobs(resume_text, faiss_index, dataset, top_k=5):
    # Create embedding for the resume
    resume_embedding = model.encode([resume_text])
    
    # Search in FAISS index
    distances, indices = faiss_index.search(resume_embedding, top_k)
    
    # Get matching job details
    matches = []
    for i, idx in enumerate(indices[0]):
        # Convert numpy.int64 to standard Python int
        idx = int(idx)
        
        if idx < len(dataset):  # Safety check
            try:
                job_title = dataset[idx]['position_title']
                job_description = dataset[idx]['job_description']
                
                matches.append({
                    'position_title': job_title,
                    'job_description': job_description,
                    'match_score': (1 - distances[0][i]) * 100  # Convert distance to similarity percentage
                })
            except Exception as e:
                print(f"Error processing match {idx}: {e}")
    
    return matches


In [97]:
#Main function for gradio interface
import tempfile
def resume_process(file, dataset, faiss_index):
    try:
        # Directly use the file path provided by Gradio
        resume_text = extract_text_from_pdf(file.name)

        matches = find_matching_jobs(resume_text, faiss_index, dataset)

        results = ""
        for i, match in enumerate(matches):
            results += f"**{i+1}.{match['position_title']}**\n"
            results += f"Match score : {match['match_score']:.2f}%\n"
            results += f"Job_Description: {match['job_description'][:100000000000000000000]}...\n\n"
            results += "---\n\n"

        return resume_text, results
    
    except Exception as e:
        return f"Error processing file {str(e)}","No matches found due to error."



In [None]:
#main gradio function
import gradio as gr
def main():
    def process_resume_wrapper(file):
        return resume_process(file, dataset, index)
    with gr.Blocks(title = "Resume Job Matcher") as demo:
        gr.Markdown("#Resume Job Matcher")
        gr.Markdown("Upload your resume to find matching job description")
    
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload your pdf file")
                submit_btn = gr.Button("Find Matching Jobs")
            with gr.Column():
                resume_text = gr.Textbox(label="Extracted resume text", lines=20)
                results_output = gr.Markdown(label="Matching Jobs")
        submit_btn.click(
            fn= process_resume_wrapper,
            inputs=[file_input],
            outputs=[resume_text, results_output]
        )
    demo.launch()

if __name__ == "__main__":
    dataset_obj = load_dataset("jacob-hugging-face/job-descriptions")
    dataset = dataset_obj["train"]  
    
    # Assuming you have precomputed embeddings
    embeddings_array = np.array(embedded_dataset["train"]["embeddings"])
    faiss_index = faiss.IndexFlatL2(384)
    faiss_index.add(embeddings_array)

    main()



    

TypeError: BlockContext.__init__() got an unexpected keyword argument 'align'

In [None]:
#storing the embeddings in a faiss vector database
embeddings_array = np.array(embedded_dataset["train"]["embeddings"])

#storing in faiss
index = faiss.IndexFlatL2(384)
index.add(embeddings_array)

faiss.write_index(index, "faiss_index.bin")