In [104]:
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
import torch
from bs4 import BeautifulSoup # strips any leftover HTML safely
from cleantext import clean #gives you ready-made toggles: remove any URLS, emojis, phone
from ftfy import fix_text #removes â€¦ â€™ â€“
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer

DATA CLEANING

In [42]:
def strip_html(s):
    """Return the text content of ``s`` with HTML markup removed.

    Falsy input (``None`` or ``""``) is parsed as an empty document.
    Text fragments are joined with a single space.
    """
    markup = s if s else ""
    soup = BeautifulSoup(markup, "html.parser")
    return soup.get_text(" ")

In [43]:
def ez_clean(s: str) -> str:
    """Normalize one raw text cell for embedding.

    Steps, in order: repair mis-encoded characters, strip HTML tags, then apply
    the ``clean`` switches (lower-casing; removal of URLs, e-mails, phone
    numbers, currency symbols, emoji and line breaks).

    Parameters
    ----------
    s : str
        Raw cell value. ``None`` and float NaN (what pandas yields for a
        missing CSV cell) are treated as empty text; any other non-string
        value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text with surrounding whitespace removed.
    """
    # A float NaN is truthy, so the usual `s or ""` guard lets it through to
    # the string-only cleaners. Detect it explicitly (NaN is the only float
    # that is not equal to itself).
    if s is None or (isinstance(s, float) and s != s):
        return ""
    if not isinstance(s, str):
        s = str(s)

    s = fix_text(s)                           # repairs mojibake
    s = strip_html(s)                         # removes HTML tags
    s = clean(                                # turnkey switches
        s,
        lower=True,
        no_urls=True, no_emails=True, no_phone_numbers=True,
        no_currency_symbols=True, no_emoji=True, no_line_breaks=True,
        fix_unicode=True
    )
    return s.strip()

In [44]:
# Read the raw resume and job-description tables (edit the paths if needed).
# The generator is consumed left to right, so resumes are read first.
res, jobs = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"D:\HR Agent\data\resume.csv",
        r"D:\HR Agent\data\job_des.csv",
    )
)

In [45]:
# Run every free-text column through ez_clean; other columns are left as they are.
res = res.assign(
    Category=res["Category"].map(ez_clean),
    Resume=res["Resume"].map(ez_clean),
)
jobs = jobs.assign(
    **{
        "Job Title": jobs["Job Title"].map(ez_clean),
        "Description": jobs["Description"].map(ez_clean),
    }
)

In [46]:
# Optional tidy-up: drop exact duplicates first, then discard rows whose main
# text is shorter than 80 characters.
res = (
    res.drop_duplicates(subset=["Resume"])
    .loc[lambda frame: frame["Resume"].str.len() >= 80]
)
jobs = (
    jobs.drop_duplicates(subset=["Job Title", "Description"])
    .loc[lambda frame: frame["Description"].str.len() >= 80]
)



In [47]:
# Write the cleaned tables next to the raw inputs; index=False keeps the
# DataFrame index out of the CSV files.
res.to_csv(r"D:\HR Agent\data\resume_clean_auto.csv", index=False)
jobs.to_csv(r"D:\HR Agent\data\job_des_clean_auto.csv", index=False)

print("Saved: resume_clean_auto.csv, job_des_clean_auto.csv")

Saved: resume_clean_auto.csv, job_des_clean_auto.csv


EMBEDDING

In [87]:
# --- paths
# Root folders for the input data and for generated artifacts. Both are
# pathlib.Path objects, so later cells join file names onto them with `/`.
DATA_DIR = Path(r"D:\HR Agent\data")
ART_DIR  = Path(r"D:\HR Agent\artifacts")
# Create the artifacts folder (and any missing parents); no error if it already exists.
ART_DIR.mkdir(parents=True, exist_ok=True)

In [88]:
# Use your cleaned resumes
# NOTE(review): the cleaning cells in this notebook write resume_clean_auto.csv,
# while this cell reads resume_clean_v2.csv, which no visible cell produces.
# Confirm the file exists, or switch to one of the alternatives listed below.
res_path = DATA_DIR / "resume_clean_v2.csv"   # or resume_clean_auto.csv / resume_clean.csv
res = pd.read_csv(res_path)                   # expects columns: Category, Resume

In [89]:
# The text to embed is the resume body: missing cells become "", and every
# value is forced to str so the encoder receives a plain list of strings.
texts = [str(resume_text) for resume_text in res["Resume"].fillna("")]

# Same sentence-embedding model as the one used for the job descriptions
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [90]:
# --- encode all resume texts in batches of 64 ---
# Normalized vectors make the inner product equal to cosine similarity,
# which is what the FAISS inner-product index built afterwards relies on.
emb = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

Batches: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]


BUILD FAISS INDEX

In [91]:
# FAISS index (cosine with normalized vectors)
# IndexFlatIP ranks by inner product. `emb` was encoded with
# normalize_embeddings=True, so the inner product equals cosine similarity.
# emb.shape[1] is the embedding dimension the index is created with.
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

In [92]:
# Persist index + metadata
# Three artifacts are written so later cells can reload them without re-encoding:
#   resume_index.faiss    - the FAISS index (write_index needs a str path)
#   resume_embeddings.npy - the raw embedding matrix
#   resume_meta.csv       - the resume table, row-aligned with the index
# `np` comes from the notebook's import cell (`import numpy as np`); without
# that import this cell raises NameError on a fresh kernel.
index_path = ART_DIR / "resume_index.faiss"
emb_path = ART_DIR / "resume_embeddings.npy"
meta_path = ART_DIR / "resume_meta.csv"

# Row i of the metadata must describe vector i of the index.
assert index.ntotal == len(res), "Index size must equal the number of resume rows!"

faiss.write_index(index, str(index_path))
np.save(emb_path, emb)
res.to_csv(meta_path, index=False, encoding="utf-8")

print("✅ Saved:")
print(index_path)
print(emb_path)
print(meta_path)
print("Total resumes indexed:", len(res))


✅ Saved:
D:\HR Agent\artifacts\resume_index.faiss
D:\HR Agent\artifacts\resume_embeddings.npy
D:\HR Agent\artifacts\resume_meta.csv
Total resumes indexed: 166


QUICK SANITY CHECK

In [98]:
# Sanity check for the job-description (JD) index: reload it from disk, pick a
# seed JD by keyword, and confirm that its nearest neighbours look sensible.
ART = r"D:\HR Agent\artifacts"

# Load index + metadata
index = faiss.read_index(fr"{ART}\jd_index.faiss")
meta  = pd.read_csv(fr"{ART}\jd_meta.csv")  # columns: Job Title, Description, text

print("Index vectors:", index.ntotal)
print("Embedding dim:", index.d)
print("Meta rows:", len(meta))
assert len(meta) == index.ntotal, "Meta rows must equal index size!"

# ----- sanity query by KEYWORDS -----
KEYWORDS = "data analyst"  # <--- change this

# Prefer title match; fallback to description; final fallback: use keywords as-is
title_mask = meta["Job Title"].fillna("").str.contains(KEYWORDS, case=False, regex=False)
desc_mask  = meta["Description"].fillna("").str.contains(KEYWORDS, case=False, regex=False)

if title_mask.any():
    seed = meta[title_mask].iloc[0]
elif desc_mask.any():
    seed = meta[desc_mask].iloc[0]
else:
    seed = {"Job Title": f"(no exact JD match) :: {KEYWORDS}", "text": KEYWORDS}

q_text = seed.get("text", KEYWORDS)
# A matched row can still carry a missing or blank "text" cell (NaN after
# read_csv); encode the keywords instead of handing a non-string to the model.
if not isinstance(q_text, str) or not q_text.strip():
    q_text = KEYWORDS

# Encode & search
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
q_emb  = model.encode([q_text], convert_to_numpy=True, normalize_embeddings=True)
# Never request more neighbours than the index holds: FAISS pads the missing
# slots with id -1, and iloc[-1] would silently display the last metadata row.
top_k = min(5, index.ntotal)
D, I   = index.search(q_emb, top_k)

# Show top matches
out = meta.iloc[I[0]][["Job Title"]].copy()
out["score"] = D[0]
print("Query JD/keywords:", seed["Job Title"])
print(out.reset_index(drop=True))



Index vectors: 519
Embedding dim: 384
Meta rows: 519
Query JD/keywords: data analyst
                                  Job Title     score
0                              data analyst  1.000000
1                              data analyst  0.755119
2  data reporting analyst- (***york, pa***)  0.747314
3                       junior data analyst  0.742687
4                              data analyst  0.741027


QUERY INDEX WITH REAL RESUME AND LIST THE TOP-K MATCHING JDS

In [None]:
# --- paths
# Kept as pathlib.Path objects (not plain strings) so the earlier cells that
# join with `ART_DIR / "..."` still work when re-run after this cell. A Path
# formats to its string form inside the f-strings used by the cells below.
ART_DIR = Path(r"D:\HR Agent\artifacts")
DATA_DIR = Path(r"D:\HR Agent\data")

In [93]:
# Load resume index + meta
# These are the two files written by the persist cell: the FAISS index and the
# resume table saved alongside it (search results are looked up in r_meta by row position).
r_index = faiss.read_index(fr"{ART_DIR}\resume_index.faiss")
r_meta  = pd.read_csv(fr"{ART_DIR}\resume_meta.csv")   # has Category, Resume

In [95]:
# Load the cleaned job descriptions (columns: Job Title, Description) and build
# one query string per row: "<title> - <description>", missing parts as "".
jobs = pd.read_csv(fr"{DATA_DIR}\job_des_clean_auto.csv")
jobs["text"] = (
    jobs["Job Title"].fillna("")
    .str.cat(jobs["Description"].fillna(""), sep=" - ")
    .str.strip()
)

In [96]:
# Same embedder as used to build the index
# (the resume embeddings were produced with this same model name, so query
# vectors and indexed vectors come from the same model).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def search_candidates_for_jd(jd_text: str, k: int = 10) -> pd.DataFrame:
    """Return the resumes most similar to a job description.

    Parameters
    ----------
    jd_text : str
        Job-description text to encode as the query.
    k : int
        Maximum number of resumes to return. It is clamped to the number of
        vectors in ``r_index``, with a minimum of 1.

    Returns
    -------
    pd.DataFrame
        Columns ``Category``, ``resume_preview`` (first 220 characters of the
        resume followed by "...") and ``score`` (inner product of normalized
        vectors), ordered as returned by FAISS, with a fresh 0..n-1 index.
    """
    # Asking for more neighbours than the index holds makes FAISS pad the
    # result with id -1; iloc[-1] would then silently return the LAST resume.
    k = max(1, min(int(k), r_index.ntotal))
    q = model.encode([jd_text], convert_to_numpy=True, normalize_embeddings=True)
    D, I = r_index.search(q, k)

    # Drop any padded slots defensively (e.g. when the index is empty).
    found = I[0] >= 0
    out = r_meta.iloc[I[0][found]].copy()
    out["score"] = D[0][found]
    # show a short preview of the resume text
    out["resume_preview"] = out["Resume"].str.slice(0, 220) + "..."
    return out[["Category", "resume_preview", "score"]].reset_index(drop=True)


In [97]:
# Choose the job description to query with: first look for the keywords in the
# title and, only if nothing matches there, in the description.
KEYWORDS = "data analyst"  # <-- change this

hits = jobs.loc[jobs["Job Title"].str.contains(KEYWORDS, case=False, na=False, regex=False)]
if len(hits) == 0:
    hits = jobs.loc[jobs["Description"].str.contains(KEYWORDS, case=False, na=False, regex=False)]
    assert len(hits) > 0, f"No JD title/description contains: {KEYWORDS}"

# Use the first matching row (hits.sample(1).iloc[0] would pick a random one).
hit = hits.iloc[0]
jd_text = hit["text"]

print("JD:", hit["Job Title"])
results = search_candidates_for_jd(jd_text, k=10)
print(results)


JD: data analyst
           Category                                     resume_preview  \
0      Data Science  expertise - data and quantitative analysis - d...   
1      Data Science  skills * python * tableau * data visualization...   
2     Etl Developer  computer skills: - yes. sql knowledge-yes unix...   
3     Etl Developer  skill set talend big data informatica power ce...   
4  Python Developer  technical skills / responsibilities: * hands o...   
5  Business Analyst  key skills - requirement gathering - requireme...   
6  Python Developer  * operating systems: windows * others: ms exce...   
7             Sales  skills 1. ms-office 2. good communication skil...   
8               Pmo  area of expertise (profile) around 10 plus yea...   
9     Sap Developer  education details january 2016 bachelor of eng...   

      score  
0  0.596651  
1  0.577712  
2  0.556203  
3  0.537110  
4  0.536817  
5  0.527484  
6  0.523135  
7  0.521917  
8  0.519749  
9  0.514907  


RE-RANK FAISS RESULTS WITH A CROSSENCODER

In [101]:
# --- ensure your search returns full Resume text for re-ranking ---
def search_candidates_for_jd(jd_text: str, k: int = 20) -> pd.DataFrame:
    """Return FAISS candidates for a job description, keeping the full resume text.

    NOTE: this redefinition replaces the earlier ``search_candidates_for_jd``
    in this notebook. Unlike that version it keeps the ``Resume`` column, names
    the similarity ``bi_score`` and does not reset the index.

    Parameters
    ----------
    jd_text : str
        Job-description text to encode as the query.
    k : int
        Maximum number of candidates. It is clamped to the number of vectors
        in ``r_index``, with a minimum of 1.

    Returns
    -------
    pd.DataFrame
        Columns ``Category``, ``Resume``, ``bi_score`` and ``resume_preview``,
        indexed by the row positions of ``r_meta`` and ordered as returned by
        FAISS.
    """
    # Asking for more neighbours than the index holds makes FAISS pad the
    # result with id -1; iloc[-1] would then silently return the LAST resume.
    k = max(1, min(int(k), r_index.ntotal))
    q = model.encode([jd_text], convert_to_numpy=True, normalize_embeddings=True)
    D, I = r_index.search(q, k)

    # Drop any padded slots defensively (e.g. when the index is empty).
    found = I[0] >= 0
    out = r_meta.iloc[I[0][found]][["Category", "Resume"]].copy()
    out["bi_score"] = D[0][found]  # bi-encoder score (from FAISS)
    out["resume_preview"] = out["Resume"].str.slice(0, 220) + "..."
    return out

# --- cross-encoder (reads JD + Resume together) ---
# Used by the re-ranking step, which passes it (jd_text, resume_text) pairs
# and reads back one score per pair.
ce = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # downloads on first use


In [106]:
def rerank_with_crossencoder(jd_text: str, cand_df, top_m: int = 10):
    """Re-score candidate resumes with the cross-encoder and keep the best ones.

    Parameters
    ----------
    jd_text : str
        Job-description text; it is the first element of every scored pair.
    cand_df : pd.DataFrame
        Candidates with columns ``Category``, ``Resume``, ``resume_preview``
        and ``bi_score``.
    top_m : int
        Maximum number of rows to return.

    Returns
    -------
    pd.DataFrame
        Columns ``Category``, ``resume_preview``, ``bi_score`` and ``ce_prob``,
        sorted by ``ce_prob`` descending, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``cand_df`` lacks a required column. Checked before the model runs.
    """
    out_cols = ["Category", "resume_preview", "bi_score", "ce_prob"]
    missing = [c for c in ("Category", "Resume", "resume_preview", "bi_score")
               if c not in cand_df.columns]
    if missing:
        raise KeyError(f"cand_df is missing required column(s): {missing}")

    # Nothing to score: return an empty frame of the expected shape instead of
    # calling the model with no pairs.
    if cand_df.empty:
        empty = cand_df[["Category", "resume_preview", "bi_score"]].copy()
        empty["ce_prob"] = pd.Series(dtype="float32")
        return empty[out_cols].reset_index(drop=True)

    pairs = [(jd_text, t) for t in cand_df["Resume"].tolist()]
    logits = ce.predict(pairs)  # raw scores (can be negative)
    probs = torch.sigmoid(torch.tensor(logits)).numpy()  # 0..1

    df = cand_df.copy()
    df["ce_logit"] = logits
    df["ce_prob"]  = probs
    df = df.sort_values("ce_prob", ascending=False).head(top_m)
    return df[out_cols].reset_index(drop=True)


In [107]:
# End-to-end run: pick a JD by keyword, recall candidates with FAISS, then
# re-rank them with the cross-encoder.
KEYWORDS = "data analyst"   # <-- your JD keywords
hits = jobs[jobs["Job Title"].str.contains(KEYWORDS, case=False, na=False, regex=False)]
if hits.empty:
    hits = jobs[jobs["Description"].str.contains(KEYWORDS, case=False, na=False, regex=False)]
    # Same guard as the earlier JD-picking cell: fail with a readable message
    # instead of an IndexError from hits.iloc[0] when nothing matches.
    assert not hits.empty, f"No JD title/description contains: {KEYWORDS}"
hit = hits.iloc[0]
jd_text = hit["text"]
print("JD:", hit["Job Title"])

cands = search_candidates_for_jd(jd_text, k=25)   # recall
final = rerank_with_crossencoder(jd_text, cands, top_m=10)  # precision
print(final)


JD: data analyst
           Category                                     resume_preview  \
0               Pmo  core competencies * maintain processes to ensu...   
1      Data Science  expertise - data and quantitative analysis - d...   
2      Data Science  skills * python * tableau * data visualization...   
3      Data Science  skills * r * python * sap hana * tableau * sap...   
4      Data Science  skills * programming languages: python (pandas...   
5               Pmo  area of expertise (profile) around 10 plus yea...   
6  Business Analyst  key skills - requirement gathering - requireme...   
7     Sap Developer  skills: * etl * data warehousing * sql/pl sql ...   
8     Etl Developer  technical summary * knowledge of informatica p...   
9  Business Analyst  education details february 2006 to february 20...   

   bi_score   ce_prob  
0  0.508532  0.003384  
1  0.596651  0.002780  
2  0.577712  0.002076  
3  0.494516  0.001896  
4  0.509494  0.001056  
5  0.519749  0.000671  
