In [None]:
import pandas as pd 
import numpy as np
import re
from pydantic import BaseModel, field_validator
import optional
from typing import Optional
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os
import re, unicodedata

In [None]:

# ---------- paths ----------
# Directory holding the input CSVs. Set the HR_AGENT_DATA_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location.
DATA_DIR = os.environ.get("HR_AGENT_DATA_DIR", r"D:\HR Agent\data")
JOB = os.path.join(DATA_DIR, "job.csv")      # job postings table
RES = os.path.join(DATA_DIR, "resume.csv")   # resumes table


In [None]:
# ---------- helpers ----------
# Mojibake sequences (UTF-8 text that was mis-decoded as cp1252) and their
# plain replacements, applied in this order.
_MOJIBAKE_FIXES = (
    ("Â", " "),
    ("â€™", "'"),
    ("\u00e2\u20ac\u02dc", "'"),    # mis-decoded left single quote (U+2018)
    ("â€œ", '"'),
    ("â€\x9d", '"'),
    ("â€“", "-"),
    ("â€”", "-"),
    ("\u00e2\u20ac\u00a6", "..."),  # mis-decoded ellipsis (U+2026)
    ("â€¢", " "),
    ("\xa0", " "),
)

# Phone-like run: optional "+", a digit, 7+ digits/separators, a final digit.
_PHONE_RE = re.compile(r"\+?\d[\d\-\s\(\)]{7,}\d")
# Two four-digit years (19xx/20xx) joined by whitespace and/or hyphens.
_YEAR_RANGE_RE = re.compile(r"(?:19|20)\d{2}[\s\-]+(?:19|20)\d{2}")


def _blank_unless_year_range(match):
    """Keep spans such as "2013-2017"; blank every other phone-like match."""
    span = match.group(0)
    return span if _YEAR_RANGE_RE.fullmatch(span) else " "


def clean_text(s: object) -> str:
    """Normalise one raw text cell into lowercase, single-spaced plain text.

    Steps, in order: repair known mojibake sequences, apply Unicode NFC
    normalisation, blank out HTML tags, URLs (including ``#URL_...``
    placeholders), e-mail addresses and phone-like digit runs, then collapse
    whitespace and lowercase.

    Args:
        s: Any cell value. ``None`` and scalar missing values (NaN, NaT,
            ``pd.NA``) are treated as empty; everything else is converted
            with ``str()``.

    Returns:
        The cleaned text, or ``""`` for missing input.
    """
    # pd.isna() returns an array for list-like values, whose truth value is
    # ambiguous, so only ask it about scalars.
    if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
        return ""
    s = str(s)

    # fix common mojibake quickly (works even without extra libs)
    for broken, fixed in _MOJIBAKE_FIXES:
        s = s.replace(broken, fixed)

    s = unicodedata.normalize("NFC", s)

    # strip HTML + URLs + emails + phones
    s = re.sub(r"<[^>]+>", " ", s)
    s = re.sub(r"(https?://\S+|www\.\S+|#URL_\w+)", " ", s)
    s = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", " ", s)
    # The phone pattern also matches year ranges like "2013-2017"; those are
    # kept because they carry real information (e.g. employment dates).
    s = _PHONE_RE.sub(_blank_unless_year_range, s)

    # collapse whitespace + lowercase
    return re.sub(r"\s+", " ", s).strip().lower()

In [None]:

# ---------- load (handle encodings) ----------
def safe_read(path):
    """Read a CSV into a DataFrame, trying progressively looser encodings.

    Strict decodings are attempted first: UTF-8, UTF-8 with BOM, then
    Windows-1252. cp1252 is tried before latin1 because latin1 maps bytes
    0x80-0x9F to invisible control characters, whereas cp1252 maps them to
    the smart quotes and dashes that Windows-authored files usually contain.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Whatever ``pd.read_csv`` raises for non-encoding problems (missing
        file, parser errors); only ``UnicodeDecodeError`` triggers a retry.
    """
    for enc in ("utf-8", "utf-8-sig", "cp1252"):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
    # last resort: latin1 assigns a character to every byte, so decoding
    # itself cannot fail here
    return pd.read_csv(path, encoding="latin1")

# Load the job postings and the resumes, in that order, with the
# encoding-tolerant reader.
job_df, resume_df = map(safe_read, (JOB, RES))


In [None]:
# ---------- JOB CLEAN ----------
# Clean every free-text field; a field absent from the CSV becomes empty text.
for col in ("title", "description", "requirements"):
    if col in job_df.columns:
        job_df[col] = job_df[col].apply(clean_text)
    else:
        job_df[col] = ""

# One searchable string per posting: title, description and requirements
# joined by single spaces, with the padding left by empty fields removed.
joined = job_df["title"] + " " + job_df["description"] + " " + job_df["requirements"]
job_df["job_text"] = joined.str.replace(r"\s+", " ", regex=True).str.strip()

# Keep postings with more than 20 characters of text, then drop exact repeats.
long_enough = job_df["job_text"].str.len() > 20
job_df = job_df.loc[long_enough].drop_duplicates(subset=["job_text"])

# ---------- RESUME CLEAN ----------
# ensure expected names
resume_df = resume_df.rename(columns={"Category": "category", "Resume": "resume"})
if "resume" not in resume_df.columns:
    raise KeyError(f"Could not find resume text column. Columns: {list(resume_df.columns)}")

# DataFrame.get() hands back its default unchanged when the column is absent,
# and a bare string has no .apply(); create the column explicitly instead.
if "category" not in resume_df.columns:
    resume_df["category"] = "unknown"
resume_df["category"] = resume_df["category"].apply(clean_text)
resume_df["resume_text"] = resume_df["resume"].apply(clean_text)

# Keep resumes with more than 20 characters of cleaned text, then drop
# exact repeats.
resume_df = resume_df[resume_df["resume_text"].str.len() > 20]
resume_df = resume_df.drop_duplicates(subset=["resume_text"])

In [None]:
# ---------- SAVE ----------
# Write the cleaned tables next to the job input file, so that pointing JOB
# at another directory relocates the outputs as well instead of leaving a
# second hard-coded path to update.
out_dir = os.path.dirname(JOB)
job_out = os.path.join(out_dir, "job_cleaned.csv")
res_out = os.path.join(out_dir, "resume_cleaned.csv")
job_df.to_csv(job_out, index=False)
resume_df.to_csv(res_out, index=False)

In [None]:
# Summarise the run: row counts, output locations, and a two-row preview of
# each cleaned text column.
job_preview = job_df["job_text"].head(2).to_string(index=False)
resume_preview = resume_df["resume_text"].head(2).to_string(index=False)

print(f"jobs cleaned: {len(job_df)} -> {job_out}")
print(f"resumes cleaned: {len(resume_df)} -> {res_out}")
print("\nSample job_text:\n", job_preview)
print("\nSample resume_text:\n", resume_preview)

jobs cleaned: 15736 -> D:\HR Agent\data\job_cleaned.csv
resumes cleaned: 166 -> D:\HR Agent\data\resume_cleaned.csv

Sample job_text:
 marketing intern food52, a fast-growing, james ...
customer service - cloud video production organ...

Sample resume_text:
 skills * programming languages: python (pandas,...
education details may 2013 to may 2017 b.e uit-...
