# NLP Operations: Job Title Matching
This notebook demonstrates various NLP techniques to vectorize job titles and a search term, and then ranks candidates by similarity. Techniques covered:
- TF-IDF
- Word2Vec (Google)
- GloVe
- FastText

## 1. Setup & Imports
Install and import required libraries.

In [11]:
import pandas as pd
import numpy as np
import gensim.downloader as api
import nltk
import fasttext
import fasttext.util
import random
import requests
import os
import math


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from dotenv import load_dotenv
from utils import bleu_score
from collections import Counter
from sentence_transformers import (
    SentenceTransformer,
)  # Import METEOR function from utils
from utils import meteor


# Load environment variables from .env file
load_dotenv()

# Import custom utility functions

# For CIDEr, we'll implement a simplified version

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Osama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Osama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Osama\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 2. Load Data
Load job titles from the Excel file and define a search term.

In [3]:
df = pd.read_excel("potential-talents.xlsx")
possible_columns = [
    "job_title",
    "title",
    "position",
    "role",
    "job",
    "designation",
    "job title",
]
job_title_column = None
for col in df.columns:
    if any(keyword in col.lower() for keyword in possible_columns):
        job_title_column = col
        break
if not job_title_column:
    raise ValueError("Job title column not found. Please specify it manually.")
job_titles = df[job_title_column].dropna().astype(str).tolist()

# Filter job titles to only those with 1 or 2 words
filtered_job_titles = [title for title in job_titles if 1 <= len(title.split()) <= 2]

# Randomly select a search term from filtered job titles
if filtered_job_titles:
    search_term = random.choice(filtered_job_titles)
else:
    raise ValueError("No job titles with 1 or 2 words found.")

print(f"Randomly selected search term: {search_term}")

Randomly selected search term: Student


## 3. TF-IDF Vectorization & Cosine Similarity
Vectorize job titles and search term using TF-IDF, then rank candidates by similarity.

In [4]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
corpus = job_titles + [search_term]
X = vectorizer.fit_transform(corpus)
search_vec = X[-1]
job_vecs = X[:-1]
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by TF-IDF similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Top 10 job titles by TF-IDF similarity to search term:
Student (Score: 1.000)
Student at Chapman University (Score: 0.455)
Student at Chapman University (Score: 0.455)
Student at Chapman University (Score: 0.455)
Student at Chapman University (Score: 0.455)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)


In [5]:
# # --- LLM-based Candidate Ranking using Groq API (Llama 3 70B Versatile) ---
# import requests
# import os
# import json
# from dotenv import load_dotenv

# # Load environment variables from .env file
# load_dotenv()


# def rank_candidates_with_llm(
#     job_titles, search_term, model="llama3-70b-8192"
# ):  # Llama 3 70B Versatile
#     """
#     Use Groq LLM API to rank job titles by relevance to the search term.
#     Args:
#         job_titles (list): List of job title strings.
#         search_term (str): The search term/job title to match against.
#         model (str): Groq model name (default: llama3-70b-8192).
#     Returns:
#         list: Ranked job titles (most relevant first).
#     """
#     api_key = os.getenv("GROQ_API_KEY")
#     if not api_key:
#         raise ValueError(
#             "GROQ_API_KEY not found in environment variables. Please set it in your .env file."
#         )
#     prompt = f"""
# You are an expert recruiter. Given the following list of candidate job titles, rank them from most to least relevant for the search term: '{search_term}'.\n\nJob Titles:\n"""
#     for i, title in enumerate(job_titles, 1):
#         prompt += f"{i}. {title}\n"
#     prompt += "\nReturn the ranking as a numbered list, most relevant first. Only include the job titles."

#     headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
#     data = {
#         "model": model,
#         "messages": [{"role": "user", "content": prompt}],
#         "temperature": 0.2,
#     }
#     response = requests.post(
#         "https://api.groq.com/openai/v1/chat/completions",
#         headers=headers,
#         data=json.dumps(data),
#     )
#     response.raise_for_status()
#     result = response.json()
#     llm_output = result["choices"][0]["message"]["content"]
#     # Parse the LLM output into a ranked list
#     ranked = [
#         line.split(". ", 1)[-1].strip()
#         for line in llm_output.split("\n")
#         if line.strip() and line[0].isdigit()
#     ]
#     return ranked


# # Example usage:
# # ranked_list = rank_candidates_with_llm(job_titles, search_term)
# # print("LLM-ranked job titles:")
# # for title in ranked_list:
# #     print(title)


## 4. Word2Vec (Google News) Vectorization & Cosine Similarity
Vectorize using pre-trained Google News Word2Vec embeddings.

In [6]:
# Download Google News vectors (only needs to be done once)
# w2v = api.load('word2vec-google-news-300')
w2v = api.load("word2vec-google-news-300")


def get_w2v_vector(text, model):
    words = [w for w in nltk.word_tokenize(text.lower()) if w in model]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in words], axis=0)


# Load the Excel file containing potential talents data
# (Assumes the file is in the same directory as the notebook)
df = pd.read_excel("potential-talents.xlsx")

# List of possible column names that may contain job titles
possible_columns = [
    "job_title",
    "title",
    "position",
    "role",
    "job",
    "designation",
    "job title",
]

# Initialize variable to store the detected job title column name
job_title_column = None
# Loop through columns in the DataFrame to find a matching job title column
for col in df.columns:
    if any(keyword in col.lower() for keyword in possible_columns):
        job_title_column = col  # Set the column name if a match is found
        break
# Raise an error if no job title column is found
default_job_title_error = "Job title column not found. Please specify it manually."
if not job_title_column:
    raise ValueError(default_job_title_error)

# Extract job titles as a list of strings, dropping missing values
job_titles = df[job_title_column].dropna().astype(str).tolist()

# Filter job titles to only those with 1 or 2 words
filtered_job_titles = [title for title in job_titles if 1 <= len(title.split()) <= 2]

# Randomly select a search term from filtered job titles
if filtered_job_titles:
    search_term = random.choice(filtered_job_titles)
else:
    raise ValueError("No job titles with 1 or 2 words found.")

# Print the randomly selected search term
print(f"Randomly selected search term: {search_term}")

job_vecs = np.array([get_w2v_vector(title, w2v) for title in job_titles])
search_vec = get_w2v_vector(search_term, w2v).reshape(1, -1)
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by Word2Vec similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Randomly selected search term: Student
Top 10 job titles by Word2Vec similarity to search term:
Student (Score: 1.000)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Westfield State University (Score: 0.793)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)


## 5. GloVe Vectorization & Cosine Similarity
Vectorize using pre-trained GloVe embeddings.

In [4]:
# Download GloVe vectors (only needs to be done once)
# glove = api.load('glove-wiki-gigaword-300')
glove = api.load("glove-wiki-gigaword-300")


def get_glove_vector(text, model):
    words = [w for w in nltk.word_tokenize(text.lower()) if w in model]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in words], axis=0)


job_vecs = np.array([get_glove_vector(title, glove) for title in job_titles])
search_vec = get_glove_vector(search_term, glove).reshape(1, -1)
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by GloVe similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Top 10 job titles by GloVe similarity to search term:
Student (Score: 1.000)
Student at Chapman University (Score: 0.766)
Student at Chapman University (Score: 0.766)
Student at Chapman University (Score: 0.766)
Student at Chapman University (Score: 0.766)
Student at Westfield State University (Score: 0.699)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)


## 6. FastText Vectorization & Cosine Similarity
Vectorize using pre-trained FastText embeddings.

In [5]:
# Download FastText vectors (only needs to be done once)
# fasttext_model = api.load('fasttext-wiki-news-subwords-300')
fasttext_model = api.load("fasttext-wiki-news-subwords-300")


def get_fasttext_vector(text, model):
    words = [w for w in nltk.word_tokenize(text.lower()) if w in model]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in words], axis=0)


job_vecs = np.array(
    [get_fasttext_vector(title, fasttext_model) for title in job_titles]
)
search_vec = get_fasttext_vector(search_term, fasttext_model).reshape(1, -1)
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by FastText similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Top 10 job titles by FastText similarity to search term:
Student (Score: 1.000)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.724)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.724)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.724)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.724)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.724)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.724)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.724)
Student at Chapman University (Score: 0.709)
Student at Chapman University (Score: 0.709)


## 11. Transformer-based Contextual Embeddings (BERT/Sentence-BERT)
Use Sentence-BERT to generate contextual embeddings for job titles and the search term, then rank by cosine similarity.

In [None]:
# Load a pre-trained Sentence-BERT model
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embeddings
job_embeddings = sbert_model.encode(job_titles)
search_embedding = sbert_model.encode([search_term])

# Compute cosine similarities
similarities = cosine_similarity(search_embedding, job_embeddings).flatten()
ranked_indices = np.argsort(similarities)[::-1]

print("Top 10 job titles by SBERT similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Top 10 job titles by SBERT similarity to search term:
Student (Score: 1.000)
Student at Westfield State University (Score: 0.616)
Student at Chapman University (Score: 0.602)
Student at Chapman University (Score: 0.602)
Student at Chapman University (Score: 0.602)
Student at Chapman University (Score: 0.602)
Student at Indiana University Kokomo - Business Management - 
Retail Manager at Delphi Hardware and Paint (Score: 0.409)
Advisory Board Member at Celal Bayar University (Score: 0.398)
Advisory Board Member at Celal Bayar University (Score: 0.398)
Advisory Board Member at Celal Bayar University (Score: 0.398)


## 7. BLEU Score Calculation
Calculate BLEU score for semantic similarity between search term and job titles.

In [None]:
# Calculate BLEU score for each job title against the search term
smoothie = SmoothingFunction().method4
search_tokens = nltk.word_tokenize(search_term.lower())
bleu_scores = [
    sentence_bleu(
        [search_tokens], nltk.word_tokenize(title.lower()), smoothing_function=smoothie
    )
    for title in job_titles
]
ranked_indices = np.argsort(bleu_scores)[::-1]
print("Top 10 job titles by BLEU semantic similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (BLEU Score: {bleu_scores[idx]:.3f})")

Top 10 job titles by BLEU semantic similarity to search term:
Student (BLEU Score: 1.000)
Student at Chapman University (BLEU Score: 0.061)
Student at Chapman University (BLEU Score: 0.061)
Student at Chapman University (BLEU Score: 0.061)
Student at Chapman University (BLEU Score: 0.061)
Student at Westfield State University (BLEU Score: 0.046)
Aspiring Human Resources Management student seeking an internship (BLEU Score: 0.029)
Aspiring Human Resources Management student seeking an internship (BLEU Score: 0.029)
Student at Humber College and Aspiring Human Resources Generalist (BLEU Score: 0.026)
Student at Humber College and Aspiring Human Resources Generalist (BLEU Score: 0.026)


## 8. METEOR Score Calculation
Calculate METEOR score for semantic similarity. METEOR considers synonyms and stemming, making it more suitable for semantic similarity than BLEU.

In [None]:
# Calculate METEOR score for each job title against the search term
search_tokens = nltk.word_tokenize(search_term.lower())
meteor_scores = [
    meteor_score([search_tokens], nltk.word_tokenize(title.lower()))
    for title in job_titles
]
meteor_rank = np.argsort(meteor_scores)[::-1]

print("Top 10 job titles by METEOR semantic similarity to search term:")
for idx in meteor_rank[:10]:
    print(f"{job_titles[idx]} (METEOR Score: {meteor_scores[idx]:.3f})")

Top 10 job titles by METEOR semantic similarity to search term:
Student (METEOR Score: 0.500)
Student at Chapman University (METEOR Score: 0.385)
Student at Chapman University (METEOR Score: 0.385)
Student at Chapman University (METEOR Score: 0.385)
Student at Chapman University (METEOR Score: 0.385)
Student at Westfield State University (METEOR Score: 0.357)
Aspiring Human Resources Management student seeking an internship (METEOR Score: 0.294)
Aspiring Human Resources Management student seeking an internship (METEOR Score: 0.294)
Student at Humber College and Aspiring Human Resources Generalist (METEOR Score: 0.278)
Student at Humber College and Aspiring Human Resources Generalist (METEOR Score: 0.278)


## 9. CIDEr Score Calculation
Calculate CIDEr (Consensus-based Image Description Evaluation) score. Originally for image captioning, but useful for semantic similarity.

In [None]:
# Import optimized CiderScorer from utils
from utils import CiderScorer

# Instantiate one CiderScorer with job_titles once to avoid O(N²) IDF recomputation
cider = CiderScorer(job_titles)
cider_scores = [cider.score(search_term, t) for t in job_titles]
cider_rank = np.argsort(cider_scores)[::-1]

print("Top 10 job titles by CIDEr semantic similarity to search term:")
for idx in cider_rank[:10]:
    print(f"{job_titles[idx]} (CIDEr Score: {cider_scores[idx]:.3f})")

Top 10 job titles by CIDEr semantic similarity to search term:
Student (CIDEr Score: 1.000)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Chapman University (CIDEr Score: 0.105)
Student at Chapman University (CIDEr Score: 0.105)


## 10. Comprehensive Metric Comparison
Compare all methods and recommend the best approach for job title semantic similarity.

In [None]:
# Create a comprehensive comparison
print("=== COMPREHENSIVE COMPARISON OF SEMANTIC SIMILARITY METRICS ===\n")

# Get top result from each method
methods = {
    "TF-IDF + Cosine": (np.argsort(similarities)[::-1], similarities),
    "Word2Vec + Cosine": (np.argsort(similarities)[::-1], similarities),
    "GloVe + Cosine": (np.argsort(similarities)[::-1], similarities),
    "FastText + Cosine": (np.argsort(similarities)[::-1], similarities),
    "BLEU Score": (np.argsort(bleu_scores)[::-1], bleu_scores),
    "METEOR Score": (np.argsort(meteor_scores)[::-1], meteor_scores),
    "CIDEr Score": (np.argsort(cider_scores)[::-1], cider_scores),
}

print("Top match from each method:")
for method_name, (ranked_idx, scores) in methods.items():
    top_idx = ranked_idx[0]
    print(f"\n{method_name}:")
    print(f"  Job Title: {job_titles[top_idx]}")
    print(f"  Score: {scores[top_idx]:.3f}")

print("\n=== RECOMMENDATION ===")
print("\nFor job title semantic similarity, here's the ranking of methods:")
print("\n1. **GloVe + Cosine Similarity** (BEST CHOICE)")
print("   - Excellent semantic understanding")
print("   - Good balance of performance and accuracy")
print("   - Handles out-of-vocabulary words reasonably")

print("2. **Word2Vec + Cosine Similarity** (Second Choice)")
print("   - Strong semantic relationships")
print("   - Trained on Google News, good for professional terms")

print("3. **FastText + Cosine Similarity** (Third Choice)")
print("   - Handles subword information well")
print("   - Good for rare or misspelled words")

print("4. **METEOR Score** (Best for text generation evaluation)")
print("   - Considers synonyms and stemming")
print("   - Better than BLEU for semantic similarity")

print("5. **CIDEr Score** (Good for consensus-based evaluation)")
print("   - Uses TF-IDF weighting")
print("   - Good when you have multiple reference texts")

print("6. **TF-IDF + Cosine Similarity** (Baseline)")
print("   - Simple and fast")
print("   - Limited semantic understanding")

print("7. **BLEU Score** (Not recommended for this task)")
print("   - Designed for machine translation")
print("   - Poor for semantic similarity of short texts")

print("\n**FINAL RECOMMENDATION: Use GloVe + Cosine Similarity**")
print("This method provides the best balance of semantic understanding,")
print("computational efficiency, and practical performance for job title matching.")

=== COMPREHENSIVE COMPARISON OF SEMANTIC SIMILARITY METRICS ===

Top match from each method:

TF-IDF + Cosine:
  Job Title: Student
  Score: 1.000

Word2Vec + Cosine:
  Job Title: Student
  Score: 1.000

GloVe + Cosine:
  Job Title: Student
  Score: 1.000

FastText + Cosine:
  Job Title: Student
  Score: 1.000

BLEU Score:
  Job Title: Student
  Score: 1.000

METEOR Score:
  Job Title: Student
  Score: 0.500

CIDEr Score:
  Job Title: Student
  Score: 1.000

=== RECOMMENDATION ===

For job title semantic similarity, here's the ranking of methods:

1. **GloVe + Cosine Similarity** (BEST CHOICE)
   - Excellent semantic understanding
   - Good balance of performance and accuracy
   - Handles out-of-vocabulary words reasonably
2. **Word2Vec + Cosine Similarity** (Second Choice)
   - Strong semantic relationships
   - Trained on Google News, good for professional terms
3. **FastText + Cosine Similarity** (Third Choice)
   - Handles subword information well
   - Good for rare or misspelled w

# Simple LLM-based Candidate Ranking using Groq API (Llama 3 70B Versatile)


In [None]:
# --- Simple LLM-based Candidate Ranking using Groq API (Llama 3 70B Versatile) ---


def simple_llm_rank(job_titles, search_term):
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise ValueError(
            "GROQ_API_KEY not found in environment variables. Please set it in your .env file."
        )
    prompt = (
        f"Rank these job titles by how well they match the search term '{search_term}'. Return a numbered list, most relevant first.\n"
        + "\n".join(job_titles)
    )
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    data = {
        "model": "llama3-70b-8192",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
    }
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions", headers=headers, json=data
    )
    response.raise_for_status()
    result = response.json()
    llm_output = result["choices"][0]["message"]["content"]
    print("LLM-ranked job titles:\n", llm_output)


# Example usage:
simple_llm_rank(job_titles, search_term)

LLM-ranked job titles:
 Here is the list of job titles ranked by how well they match the search term 'Student', with the most relevant first:

1. Student
2. Student at Westfield State University
3. Student at Indiana University Kokomo - Business Management - 
4. Student at Humber College and Aspiring Human Resources Generalist
5. Student at Humber College and Aspiring Human Resources Generalist
6. Student at Chapman University
7. Aspiring Human Resources Management student seeking an internship
8. Aspiring Human Resources Management student seeking an internship
9. Liberal Arts Major. Aspiring Human Resources Analyst.
10. Business Management Major and Aspiring Human Resources Manager.

The remaining job titles do not contain the word "Student" and are therefore less relevant to the search term.


## 12. Compare Multiple Transformer Models (Gemma, Qwen, etc.)
Experiment with different Hugging Face transformer models for ranking.

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch


def get_transformer_embeddings(model_name, texts):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    with torch.no_grad():
        encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        output = model(**encoded)
        embeddings = output.last_hidden_state.mean(dim=1).numpy()
    return embeddings


model_names = [
    "sentence-transformers/all-MiniLM-L6-v2",  # SBERT baseline
    "Qwen/Qwen1.5-0.5B-Chat",  # Qwen (smaller, efficient)
    "bert-base-uncased",  # Classic BERT model
]

for model_name in model_names:
    print(f"\nRanking with model: {model_name}")
    job_embs = get_transformer_embeddings(model_name, job_titles)
    search_emb = get_transformer_embeddings(model_name, [search_term])
    sims = cosine_similarity(search_emb, job_embs).flatten()
    top_idx = np.argsort(sims)[::-1]
    for idx in top_idx[:10]:
        print(f"{job_titles[idx]} (Score: {sims[idx]:.3f})")

NameError: name 'meteor_scores' is not defined

In [13]:
# Assess and select the best performing transformer model
import pandas as pd

results = []
for model_name in model_names:
    job_embs = get_transformer_embeddings(model_name, job_titles)
    search_emb = get_transformer_embeddings(model_name, [search_term])
    sims = cosine_similarity(search_emb, job_embs).flatten()
    top_idx = np.argsort(sims)[::-1][:10]
    avg_top_score = sims[top_idx].mean()
    results.append(
        {
            "model": model_name,
            "avg_top10_similarity": avg_top_score,
            "top_job_titles": [job_titles[i] for i in top_idx],
            "top_scores": [sims[i] for i in top_idx],
        }
    )

# Create a DataFrame for easy comparison
results_df = pd.DataFrame(results)
print("\n=== Transformer Model Comparison ===")
print(results_df[["model", "avg_top10_similarity"]])

best_model = results_df.loc[results_df["avg_top10_similarity"].idxmax()]
print(f"\nBest performing model: {best_model['model']}")
print("Top 10 job titles:")
for title, score in zip(best_model["top_job_titles"], best_model["top_scores"]):
    print(f"{title} (Score: {score:.3f})")



=== Transformer Model Comparison ===
                                    model  avg_top10_similarity
0  sentence-transformers/all-MiniLM-L6-v2              0.433492
1                  Qwen/Qwen1.5-0.5B-Chat              0.605420
2                       bert-base-uncased              0.495998

Best performing model: Qwen/Qwen1.5-0.5B-Chat
Top 10 job titles:
Human Resources Professional (Score: 0.954)
Human Resources Management Major (Score: 0.920)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Seeking Human Resources Opportunities (Score: 0.510)
Seeking Human Resources Opportunities (Score: 0.510)
Seeking Human Resources Position (Score: 0.457)
Human Resources Specialist at Luxottica (Score: 0.228)


## 13. Fine-tune Best Transformer Model with LoRA (Parameter-Efficient Fine-Tuning)
Now we will fine-tune the best performing transformer model using the LoRA (Low-Rank Adaptation) technique for parameter-efficient fine-tuning, leveraging the extended Potential Talents dataset. This approach allows us to adapt large models with minimal additional parameters and compute.

In [None]:
# Fine-tune the best transformer model using LoRA (Parameter-Efficient Fine-Tuning)
# This example uses the PEFT and transformers libraries. Make sure to install: pip install peft transformers datasets

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset, train_test_split
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Use the best model from previous cell
# target_model_name = best_model['model']
target_model_name = (
    best_model["model"] if isinstance(best_model, dict) else best_model.model
)

# Load and prepare the extended Potential Talents dataset
# Assume the dataset has columns: 'job_title' and 'label' (label: 1=match, 0=not match)
df = pd.read_excel("potential-talents.xlsx")
if "label" not in df.columns:
    raise ValueError(
        "The extended dataset must have a 'label' column for supervised fine-tuning."
    )

# Prepare the dataset for transformers
hf_dataset = Dataset.from_pandas(df[["job_title", "label"]])
train_data, test_data = train_test_split(hf_dataset, test_size=0.2, seed=42)


def preprocess(example):
    return tokenizer(
        example["job_title"], truncation=True, padding="max_length", max_length=32
    )


tokenizer = AutoTokenizer.from_pretrained(target_model_name)
train_data = train_data.map(preprocess)
test_data = test_data.map(preprocess)

# Load model for sequence classification
base_model = AutoModelForSequenceClassification.from_pretrained(
    target_model_name, num_labels=2
)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1
)
peft_model = get_peft_model(base_model, lora_config)

# Training arguments
training_args = TrainingArguments(
    output_dir="./lora_finetuned",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    evaluation_strategy="epoch",
    report_to=[],
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}


trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

# Fine-tune with LoRA
trainer.train()

# Evaluate on test set before saving
outputs = trainer.predict(test_data)
labels = outputs.label_ids
preds = outputs.predictions.argmax(axis=-1)
print("\nClassification Report:\n", classification_report(labels, preds))
print("\nTest Accuracy:", accuracy_score(labels, preds))

# Save the LoRA-adapted model
peft_model.save_pretrained("./lora_finetuned")
print("LoRA fine-tuning complete. Model saved to ./lora_finetuned")
