##Extract Keywords from resume

##Cell 1 – Install libraries & restart kernel

In [0]:
# ===== 1. Install & Restart =====
%pip install spacy sentence-transformers PyPDF2 requests
!python -m spacy download en_core_web_sm
dbutils.library.restartPython()

##Cell 2 – Imports & global configs

In [0]:
import io, re, requests
from PyPDF2 import PdfReader
import spacy
from sentence_transformers import SentenceTransformer, util

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Curated technical keywords
TECH_TERMS = {
    "python","pyspark","sql","aws","azure","databricks","spark","hive",
    "redshift","airflow","docker","jenkins","kafka","eventhub","kinesis",
    "machine learning","data engineering","etl","pipeline","s3","emr",
    "mwaa","adf","data lake","synapse","hdinsight","cloud","governance"
}


##Cell 3 – Utility functions

In [0]:
def extract_technical_terms(text):
    if not text:
        return ""
    doc = nlp(text.lower())
    terms = set()
    for chunk in doc.noun_chunks:
        if chunk.text.strip() in TECH_TERMS:
            terms.add(chunk.text.strip())
    for token in doc:
        if token.is_alpha and not token.is_stop and token.lemma_ in TECH_TERMS:
            terms.add(token.lemma_)
    return " ".join(sorted(terms))


##Cell 4 – Load resume text & preprocess

In [0]:
resume_url = "https://raw.githubusercontent.com/parth30034/Docs/main/Resume_Parth_Shrivastava.pdf"
pdf_bytes = requests.get(resume_url).content
reader = PdfReader(io.BytesIO(pdf_bytes))
resume_text = " ".join([page.extract_text() for page in reader.pages])
resume_filtered = extract_technical_terms(resume_text)


##Cell 5 – Read all job descriptions from Delta table

In [0]:
jobs_df = spark.read.format("delta").table("main.gold.jobs_final")
job_rows = jobs_df.select("title", "company_name", "description").collect()
job_filtered_texts = [extract_technical_terms(r.description) if r.description else "" for r in job_rows]


##Cell 6 – Semantic matching in bulk

In [0]:
model = SentenceTransformer("all-MiniLM-L6-v2")
resume_emb = model.encode(resume_filtered, convert_to_tensor=True)
job_embs = model.encode(job_filtered_texts, convert_to_tensor=True)

scores = util.cos_sim(resume_emb, job_embs)[0].cpu().numpy() * 100

results = []
for row, score in zip(job_rows, scores):
    results.append({
        "title": row.title,
        "company_name": row.company_name,
        "match_percent": round(float(score), 2),
        "description": row.description
    })

results_df = spark.createDataFrame(results)
display(results_df.orderBy(results_df.match_percent.desc()))


## Cell 7 – Save results to Delta Gold table

In [0]:

results_df \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("main.gold.jobs_with_match")

print("✅ Results saved to main.gold.jobs_with_match")


✅ Results saved to main.gold.jobs_with_match


In [0]:
company_stats = (
    results_df.groupBy("company_name")
              .avg("match_percent")
              .withColumnRenamed("avg(match_percent)", "avg_match_percent")
              .orderBy("avg_match_percent", ascending=False)
)

display(company_stats)

company_name,avg_match_percent
MSCI,83.36
ASTELLAS PHARMA,82.27
ASSA ABLOY,82.05000000000001
AT&T,81.07
ZENSAR TECHNOLOGIES,78.93
TIETOEVRY,78.09000000000002
ALL EUROPEAN CAREERS,76.95
TANISHA SYSTEMS INC,75.1
EXXONMOBIL,74.44999999999999
BARCLAYS,74.19
