# NLP Operations: Job Title Matching
This notebook demonstrates various NLP techniques to vectorize job titles and a search term, and then ranks candidates by similarity. Techniques covered:
- TF-IDF
- Word2Vec (Google)
- GloVe
- FastText

## 1. Setup & Imports
Install and import required libraries.

In [3]:
# Install required packages (uncomment if running for the first time)
# !pip install pandas scikit-learn gensim nltk
# !pip install glove-python-binary
# !pip install fasttext
# !pip install nltk pycocoevalcap

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from gensim.models import KeyedVectors
import fasttext
import fasttext.util
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import random
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Import custom utility functions
from utils import bleu_score

# For CIDEr, we'll implement a simplified version
from collections import Counter
import math

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Osama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Osama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Osama\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 2. Load Data
Load job titles from the Excel file and define a search term.

In [4]:
df = pd.read_excel("potential-talents.xlsx")
possible_columns = [
    "job_title",
    "title",
    "position",
    "role",
    "job",
    "designation",
    "job title",
]
job_title_column = None
for col in df.columns:
    if any(keyword in col.lower() for keyword in possible_columns):
        job_title_column = col
        break
if not job_title_column:
    raise ValueError("Job title column not found. Please specify it manually.")
job_titles = df[job_title_column].dropna().astype(str).tolist()

# Filter job titles to only those with 1 or 2 words
filtered_job_titles = [title for title in job_titles if 1 <= len(title.split()) <= 2]

# Randomly select a search term from filtered job titles
if filtered_job_titles:
    search_term = random.choice(filtered_job_titles)
else:
    raise ValueError("No job titles with 1 or 2 words found.")

print(f"Randomly selected search term: {search_term}")

Randomly selected search term: Student


## 3. TF-IDF Vectorization & Cosine Similarity
Vectorize job titles and search term using TF-IDF, then rank candidates by similarity.

In [5]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
corpus = job_titles + [search_term]
X = vectorizer.fit_transform(corpus)
search_vec = X[-1]
job_vecs = X[:-1]
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by TF-IDF similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Top 10 job titles by TF-IDF similarity to search term:
Student (Score: 1.000)
Student at Chapman University (Score: 0.455)
Student at Chapman University (Score: 0.455)
Student at Chapman University (Score: 0.455)
Student at Chapman University (Score: 0.455)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.371)


In [6]:
# # --- LLM-based Candidate Ranking using Groq API (Llama 3 70B Versatile) ---
# import requests
# import os
# import json
# from dotenv import load_dotenv

# # Load environment variables from .env file
# load_dotenv()


# def rank_candidates_with_llm(
#     job_titles, search_term, model="llama3-70b-8192"
# ):  # Llama 3 70B Versatile
#     """
#     Use Groq LLM API to rank job titles by relevance to the search term.
#     Args:
#         job_titles (list): List of job title strings.
#         search_term (str): The search term/job title to match against.
#         model (str): Groq model name (default: llama3-70b-8192).
#     Returns:
#         list: Ranked job titles (most relevant first).
#     """
#     api_key = os.getenv("GROQ_API_KEY")
#     if not api_key:
#         raise ValueError(
#             "GROQ_API_KEY not found in environment variables. Please set it in your .env file."
#         )
#     prompt = f"""
# You are an expert recruiter. Given the following list of candidate job titles, rank them from most to least relevant for the search term: '{search_term}'.\n\nJob Titles:\n"""
#     for i, title in enumerate(job_titles, 1):
#         prompt += f"{i}. {title}\n"
#     prompt += "\nReturn the ranking as a numbered list, most relevant first. Only include the job titles."

#     headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
#     data = {
#         "model": model,
#         "messages": [{"role": "user", "content": prompt}],
#         "temperature": 0.2,
#     }
#     response = requests.post(
#         "https://api.groq.com/openai/v1/chat/completions",
#         headers=headers,
#         data=json.dumps(data),
#     )
#     response.raise_for_status()
#     result = response.json()
#     llm_output = result["choices"][0]["message"]["content"]
#     # Parse the LLM output into a ranked list
#     ranked = [
#         line.split(". ", 1)[-1].strip()
#         for line in llm_output.split("\n")
#         if line.strip() and line[0].isdigit()
#     ]
#     return ranked


# # Example usage:
# # ranked_list = rank_candidates_with_llm(job_titles, search_term)
# # print("LLM-ranked job titles:")
# # for title in ranked_list:
# #     print(title)


## 4. Word2Vec (Google News) Vectorization & Cosine Similarity
Vectorize using pre-trained Google News Word2Vec embeddings.

In [7]:
# Download Google News vectors (only needs to be done once)
# w2v = api.load('word2vec-google-news-300')
w2v = api.load("word2vec-google-news-300")


def get_w2v_vector(text, model):
    words = [w for w in nltk.word_tokenize(text.lower()) if w in model]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in words], axis=0)


# Load the Excel file containing potential talents data
# (Assumes the file is in the same directory as the notebook)
df = pd.read_excel("potential-talents.xlsx")

# List of possible column names that may contain job titles
possible_columns = [
    "job_title",
    "title",
    "position",
    "role",
    "job",
    "designation",
    "job title",
]

# Initialize variable to store the detected job title column name
job_title_column = None
# Loop through columns in the DataFrame to find a matching job title column
for col in df.columns:
    if any(keyword in col.lower() for keyword in possible_columns):
        job_title_column = col  # Set the column name if a match is found
        break
# Raise an error if no job title column is found
default_job_title_error = "Job title column not found. Please specify it manually."
if not job_title_column:
    raise ValueError(default_job_title_error)

# Extract job titles as a list of strings, dropping missing values
job_titles = df[job_title_column].dropna().astype(str).tolist()

# Filter job titles to only those with 1 or 2 words
filtered_job_titles = [title for title in job_titles if 1 <= len(title.split()) <= 2]

# Randomly select a search term from filtered job titles
if filtered_job_titles:
    search_term = random.choice(filtered_job_titles)
else:
    raise ValueError("No job titles with 1 or 2 words found.")

# Print the randomly selected search term
print(f"Randomly selected search term: {search_term}")

job_vecs = np.array([get_w2v_vector(title, w2v) for title in job_titles])
search_vec = get_w2v_vector(search_term, w2v).reshape(1, -1)
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by Word2Vec similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Randomly selected search term: Student
Top 10 job titles by Word2Vec similarity to search term:
Student (Score: 1.000)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Westfield State University (Score: 0.793)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.575)
Top 10 job titles by Word2Vec similarity to search term:
Student (Score: 1.000)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Chapman University (Score: 0.807)
Student at Westfield State University (Score: 0.793)
Student at 

## 5. GloVe Vectorization & Cosine Similarity
Vectorize using pre-trained GloVe embeddings.

In [8]:
# Download GloVe vectors (only needs to be done once)
# glove = api.load('glove-wiki-gigaword-300')
glove = api.load("glove-wiki-gigaword-300")


def get_glove_vector(text, model):
    words = [w for w in nltk.word_tokenize(text.lower()) if w in model]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in words], axis=0)


job_vecs = np.array([get_glove_vector(title, glove) for title in job_titles])
search_vec = get_glove_vector(search_term, glove).reshape(1, -1)
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by GloVe similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Top 10 job titles by GloVe similarity to search term:
Student (Score: 1.000)
Student at Chapman University (Score: 0.766)
Student at Chapman University (Score: 0.766)
Student at Chapman University (Score: 0.766)
Student at Chapman University (Score: 0.766)
Student at Westfield State University (Score: 0.699)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)
Student at Humber College and Aspiring Human Resources Generalist (Score: 0.669)


## 6. FastText Vectorization & Cosine Similarity
Vectorize using pre-trained FastText embeddings.

In [9]:
# Download FastText vectors (only needs to be done once)
# fasttext_model = api.load('fasttext-wiki-news-subwords-300')
fasttext_model = api.load("fasttext-wiki-news-subwords-300")


def get_fasttext_vector(text, model):
    words = [w for w in nltk.word_tokenize(text.lower()) if w in model]
    if not words:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in words], axis=0)


job_vecs = np.array(
    [get_fasttext_vector(title, fasttext_model) for title in job_titles]
)
search_vec = get_fasttext_vector(search_term, fasttext_model).reshape(1, -1)
similarities = cosine_similarity(search_vec, job_vecs).flatten()
ranked_indices = np.argsort(similarities)[::-1]
print("Top 10 job titles by FastText similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

KeyboardInterrupt: 

## 7. BLEU Score Calculation
Calculate BLEU score for semantic similarity between search term and job titles.

In [None]:
# Calculate BLEU score for each job title against the search term
smoothie = SmoothingFunction().method4
search_tokens = nltk.word_tokenize(search_term.lower())
bleu_scores = [
    sentence_bleu(
        [search_tokens], nltk.word_tokenize(title.lower()), smoothing_function=smoothie
    )
    for title in job_titles
]
ranked_indices = np.argsort(bleu_scores)[::-1]
print("Top 10 job titles by BLEU semantic similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (BLEU Score: {bleu_scores[idx]:.3f})")

Top 10 job titles by BLEU semantic similarity to search term:
Student (BLEU Score: 1.000)
Student at Chapman University (BLEU Score: 0.061)
Student at Chapman University (BLEU Score: 0.061)
Student at Chapman University (BLEU Score: 0.061)
Student at Chapman University (BLEU Score: 0.061)
Student at Westfield State University (BLEU Score: 0.046)
Aspiring Human Resources Management student seeking an internship (BLEU Score: 0.029)
Aspiring Human Resources Management student seeking an internship (BLEU Score: 0.029)
Student at Humber College and Aspiring Human Resources Generalist (BLEU Score: 0.026)
Student at Humber College and Aspiring Human Resources Generalist (BLEU Score: 0.026)


## 8. METEOR Score Calculation
Calculate METEOR score for semantic similarity. METEOR considers synonyms and stemming, making it more suitable for semantic similarity than BLEU.

In [None]:
# Import METEOR function from utils
from utils import meteor

# Calculate METEOR score for each job title against the search term
search_tokens = nltk.word_tokenize(search_term.lower())
meteor_scores = [
    meteor_score([search_tokens], nltk.word_tokenize(title.lower()))
    for title in job_titles
]
meteor_rank = np.argsort(meteor_scores)[::-1]

print("Top 10 job titles by METEOR semantic similarity to search term:")
for idx in meteor_rank[:10]:
    print(f"{job_titles[idx]} (METEOR Score: {meteor_scores[idx]:.3f})")

Top 10 job titles by METEOR semantic similarity to search term:
Student (METEOR Score: 0.500)
Student at Chapman University (METEOR Score: 0.385)
Student at Chapman University (METEOR Score: 0.385)
Student at Chapman University (METEOR Score: 0.385)
Student at Chapman University (METEOR Score: 0.385)
Student at Westfield State University (METEOR Score: 0.357)
Aspiring Human Resources Management student seeking an internship (METEOR Score: 0.294)
Aspiring Human Resources Management student seeking an internship (METEOR Score: 0.294)
Student at Humber College and Aspiring Human Resources Generalist (METEOR Score: 0.278)
Student at Humber College and Aspiring Human Resources Generalist (METEOR Score: 0.278)


## 9. CIDEr Score Calculation
Calculate CIDEr (Consensus-based Image Description Evaluation) score. Originally for image captioning, but useful for semantic similarity.

In [None]:
# Import optimized CiderScorer from utils
from utils import CiderScorer

# Instantiate one CiderScorer with job_titles once to avoid O(N²) IDF recomputation
cider = CiderScorer(job_titles)
cider_scores = [cider.score(search_term, t) for t in job_titles]
cider_rank = np.argsort(cider_scores)[::-1]

print("Top 10 job titles by CIDEr semantic similarity to search term:")
for idx in cider_rank[:10]:
    print(f"{job_titles[idx]} (CIDEr Score: {cider_scores[idx]:.3f})")

Top 10 job titles by CIDEr semantic similarity to search term:
Student (CIDEr Score: 1.000)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Humber College and Aspiring Human Resources Generalist (CIDEr Score: 0.125)
Student at Chapman University (CIDEr Score: 0.105)
Student at Chapman University (CIDEr Score: 0.105)


## 10. Comprehensive Metric Comparison
Compare all methods and recommend the best approach for job title semantic similarity.

In [None]:
# Create a comprehensive comparison
print("=== COMPREHENSIVE COMPARISON OF SEMANTIC SIMILARITY METRICS ===\n")

# Get top result from each method
methods = {
    "TF-IDF + Cosine": (np.argsort(similarities)[::-1], similarities),
    "Word2Vec + Cosine": (np.argsort(similarities)[::-1], similarities),
    "GloVe + Cosine": (np.argsort(similarities)[::-1], similarities),
    "FastText + Cosine": (np.argsort(similarities)[::-1], similarities),
    "BLEU Score": (np.argsort(bleu_scores)[::-1], bleu_scores),
    "METEOR Score": (np.argsort(meteor_scores)[::-1], meteor_scores),
    "CIDEr Score": (np.argsort(cider_scores)[::-1], cider_scores),
}

print("Top match from each method:")
for method_name, (ranked_idx, scores) in methods.items():
    top_idx = ranked_idx[0]
    print(f"\n{method_name}:")
    print(f"  Job Title: {job_titles[top_idx]}")
    print(f"  Score: {scores[top_idx]:.3f}")

print("\n=== RECOMMENDATION ===")
print("\nFor job title semantic similarity, here's the ranking of methods:")
print("\n1. **GloVe + Cosine Similarity** (BEST CHOICE)")
print("   - Excellent semantic understanding")
print("   - Good balance of performance and accuracy")
print("   - Handles out-of-vocabulary words reasonably")

print("2. **Word2Vec + Cosine Similarity** (Second Choice)")
print("   - Strong semantic relationships")
print("   - Trained on Google News, good for professional terms")

print("3. **FastText + Cosine Similarity** (Third Choice)")
print("   - Handles subword information well")
print("   - Good for rare or misspelled words")

print("4. **METEOR Score** (Best for text generation evaluation)")
print("   - Considers synonyms and stemming")
print("   - Better than BLEU for semantic similarity")

print("5. **CIDEr Score** (Good for consensus-based evaluation)")
print("   - Uses TF-IDF weighting")
print("   - Good when you have multiple reference texts")

print("6. **TF-IDF + Cosine Similarity** (Baseline)")
print("   - Simple and fast")
print("   - Limited semantic understanding")

print("7. **BLEU Score** (Not recommended for this task)")
print("   - Designed for machine translation")
print("   - Poor for semantic similarity of short texts")

print("\n**FINAL RECOMMENDATION: Use GloVe + Cosine Similarity**")
print("This method provides the best balance of semantic understanding,")
print("computational efficiency, and practical performance for job title matching.")

=== COMPREHENSIVE COMPARISON OF SEMANTIC SIMILARITY METRICS ===

Top match from each method:

TF-IDF + Cosine:
  Job Title: Student
  Score: 1.000

Word2Vec + Cosine:
  Job Title: Student
  Score: 1.000

GloVe + Cosine:
  Job Title: Student
  Score: 1.000

FastText + Cosine:
  Job Title: Student
  Score: 1.000

BLEU Score:
  Job Title: Student
  Score: 1.000

METEOR Score:
  Job Title: Student
  Score: 0.500

CIDEr Score:
  Job Title: Student
  Score: 1.000

=== RECOMMENDATION ===

For job title semantic similarity, here's the ranking of methods:

1. **GloVe + Cosine Similarity** (BEST CHOICE)
   - Excellent semantic understanding
   - Good balance of performance and accuracy
   - Handles out-of-vocabulary words reasonably
2. **Word2Vec + Cosine Similarity** (Second Choice)
   - Strong semantic relationships
   - Trained on Google News, good for professional terms
3. **FastText + Cosine Similarity** (Third Choice)
   - Handles subword information well
   - Good for rare or misspelled w

In [None]:
# --- Simple LLM-based Candidate Ranking using Groq API (Llama 3 70B Versatile) ---
import requests
import os
from dotenv import load_dotenv

load_dotenv()


def simple_llm_rank(job_titles, search_term):
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise ValueError(
            "GROQ_API_KEY not found in environment variables. Please set it in your .env file."
        )
    prompt = (
        f"Rank these job titles by how well they match the search term '{search_term}'. Return a numbered list, most relevant first.\n"
        + "\n".join(job_titles)
    )
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    data = {
        "model": "llama3-70b-8192",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
    }
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions", headers=headers, json=data
    )
    response.raise_for_status()
    result = response.json()
    llm_output = result["choices"][0]["message"]["content"]
    print("LLM-ranked job titles:\n", llm_output)


# Example usage:
simple_llm_rank(job_titles, search_term)

LLM-ranked job titles:
 Here is the list of job titles ranked by how well they match the search term 'Student', with the most relevant first:

1. Student
2. Student at Westfield State University
3. Student at Indiana University Kokomo - Business Management - 
4. Student at Humber College and Aspiring Human Resources Generalist
5. Student at Humber College and Aspiring Human Resources Generalist
6. Student at Chapman University
7. Aspiring Human Resources Management student seeking an internship
8. Aspiring Human Resources Management student seeking an internship
9. Liberal Arts Major. Aspiring Human Resources Analyst.
10. Business Management Major and Aspiring Human Resources Manager.

The remaining job titles do not contain the word "Student" and are therefore less relevant to the search term.


## 11. Transformer-based Contextual Embeddings (BERT/Sentence-BERT)
Use Sentence-BERT to generate contextual embeddings for job titles and the search term, then rank by cosine similarity.

In [10]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load a pre-trained Sentence-BERT model
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute embeddings
job_embeddings = sbert_model.encode(job_titles)
search_embedding = sbert_model.encode([search_term])

# Compute cosine similarities
similarities = cosine_similarity(search_embedding, job_embeddings).flatten()
ranked_indices = np.argsort(similarities)[::-1]

print("Top 10 job titles by SBERT similarity to search term:")
for idx in ranked_indices[:10]:
    print(f"{job_titles[idx]} (Score: {similarities[idx]:.3f})")

Top 10 job titles by SBERT similarity to search term:
Student (Score: 1.000)
Student at Westfield State University (Score: 0.616)
Student at Chapman University (Score: 0.602)
Student at Chapman University (Score: 0.602)
Student at Chapman University (Score: 0.602)
Student at Chapman University (Score: 0.602)
Student at Indiana University Kokomo - Business Management - 
Retail Manager at Delphi Hardware and Paint (Score: 0.409)
Advisory Board Member at Celal Bayar University (Score: 0.398)
Advisory Board Member at Celal Bayar University (Score: 0.398)
Advisory Board Member at Celal Bayar University (Score: 0.398)


## 12. Compare Multiple Transformer Models (Gemma, Qwen, etc.)
Experiment with different Hugging Face transformer models for ranking.

In [11]:
from transformers import AutoTokenizer, AutoModel
import torch


def get_transformer_embeddings(model_name, texts):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    with torch.no_grad():
        encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        output = model(**encoded)
        embeddings = output.last_hidden_state.mean(dim=1).numpy()
    return embeddings


model_names = [
    "sentence-transformers/all-MiniLM-L6-v2",  # SBERT baseline
    "Qwen/Qwen1.5-0.5B-Chat",  # Qwen (if available)
    "google/gemma-2b-it",  # Gemma 2B (if available)
]

for model_name in model_names:
    print(f"\nRanking with model: {model_name}")
    job_embs = get_transformer_embeddings(model_name, job_titles)
    search_emb = get_transformer_embeddings(model_name, [search_term])
    sims = cosine_similarity(search_emb, job_embs).flatten()
    top_idx = np.argsort(sims)[::-1]
    for idx in top_idx[:10]:
        print(f"{job_titles[idx]} (Score: {sims[idx]:.3f})")


Ranking with model: sentence-transformers/all-MiniLM-L6-v2
Student (Score: 0.609)
Student at Westfield State University (Score: 0.472)
Student at Indiana University Kokomo - Business Management - 
Retail Manager at Delphi Hardware and Paint (Score: 0.447)
Student at Chapman University (Score: 0.441)
Student at Chapman University (Score: 0.441)
Student at Chapman University (Score: 0.441)
Student at Chapman University (Score: 0.441)
Advisory Board Member at Celal Bayar University (Score: 0.348)
Advisory Board Member at Celal Bayar University (Score: 0.348)
Advisory Board Member at Celal Bayar University (Score: 0.348)

Ranking with model: Qwen/Qwen1.5-0.5B-Chat


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Human Resources Professional (Score: 0.954)
Human Resources Management Major (Score: 0.920)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Human Resources Coordinator at InterContinental Buckhead Atlanta (Score: 0.619)
Seeking Human Resources Opportunities (Score: 0.510)
Seeking Human Resources Opportunities (Score: 0.510)
Seeking Human Resources Position (Score: 0.457)
Human Resources Specialist at Luxottica (Score: 0.228)

Ranking with model: google/gemma-2b-it


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2b-it.
401 Client Error. (Request ID: Root=1-68532383-109c14b203ce714f4793ca33;75adf809-962c-458e-b688-bde86812f18e)

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted. You must have access to it and be authenticated to access it. Please log in.

## 13. LLM-based Ranking with Multiple Models (Llama 3, Gemma, Qwen)
Use Groq API to rank job titles with different LLMs.

In [None]:
import os
import requests
import json
from dotenv import load_dotenv

load_dotenv()


def llm_rank(job_titles, search_term, model_name):
    api_key = os.getenv("GROQ_API_KEY")
    prompt = (
        f"Rank these job titles by how well they match the search term '{search_term}'. "
        "Return a numbered list, most relevant first.\n\n" + "\n".join(job_titles)
    )
    headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    data = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.2,
    }
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers=headers,
        data=json.dumps(data),
    )
    response.raise_for_status()
    result = response.json()
    llm_output = result["choices"][0]["message"]["content"]
    ranked = [
        line.split(". ", 1)[-1].strip()
        for line in llm_output.split("\n")
        if line.strip() and line[0].isdigit()
    ]
    return ranked


for model in ["llama3-70b-8192", "google/gemma-2b-it", "Qwen/Qwen1.5-0.5B-Chat"]:
    print(f"\nLLM Ranking with {model}:")
    ranked = llm_rank(job_titles, search_term, model)
    for i, title in enumerate(ranked[:10], 1):
        print(f"{i}. {title}")