# Libraries

In [1]:
!pip install python-docx PyPDF2 pandas rapidfuzz

# Optional LLMs:
!pip install openai
# !pip install transformers accelerate sentencepiece
# !pip install huggingface_hub




In [12]:
import os, re, json, textwrap
from collections import defaultdict
import PyPDF2, docx
import pandas as pd

# Providers
# Every availability flag starts as False so later cells can always read it,
# even when the corresponding optional package is not installed.
OPENAI_AVAILABLE = False
HF_TRANSFORMERS_AVAILABLE = False
HF_HUB_AVAILABLE = False

# Each probe is best-effort: any failure while importing an optional package
# simply leaves its flag at False. `except Exception` (rather than a bare
# `except:`) keeps KeyboardInterrupt / SystemExit propagating.
try:
    import openai
    OPENAI_AVAILABLE = True
except Exception:
    pass

try:
    import transformers
    HF_TRANSFORMERS_AVAILABLE = True
except Exception:
    pass

try:
    import huggingface_hub
    HF_HUB_AVAILABLE = True
except Exception:
    pass

def wrap(s, width=100):
    """Return ``str(s)`` re-flowed into newline-separated lines of at most ``width`` columns."""
    return textwrap.fill(str(s), width=width)

# CV Input

In [None]:
# Option A: path to the CV file (.pdf, .docx or .txt).
# Set the CV_PATH environment variable to point at your own file; the literal
# below is only the fallback used when CV_PATH is not set.
cv_path = os.getenv(
    "CV_PATH",
    "C:/Users/Benyamin/OneDrive - University of Portsmouth/Documents/Apply/Job Hunting/2- Full time/3-AI-ML-Engineer/BenyaminEbrahimpour CV_AI-ML_v4_07-13-25.pdf",
)

# Option B: CV text pasted inline, used when no supported file is found at cv_path.
cv_text_manual = """
PASTE YOUR CV TEXT HERE IF NEEDED
"""
# === Paste your target Job Description (JD) text here ===

jd_text = """
YOUR JOB DESCRIPTION
"""

# Choose LLM provider: 'openai', 'hf_local', 'hf_inference', '' (auto)
# NOTE(review): no code visible in this notebook reads LLM_PROVIDER; only the
# OpenAI settings below are used. Confirm before relying on the other values.
LLM_PROVIDER = ''
OPENAI_MODEL = 'gpt-4o-mini'
# HF_LOCAL_MODEL = 'google/flan-t5-small'
# HF_INFERENCE_MODEL = 'mistralai/Mistral-7B-Instruct-v0.3'

# The API key is read from the environment so it is never stored in the notebook.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
# HF_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN', '')


# Skills Taxonomy & Free Resources

In [4]:
# Canonical skill name -> its alternative spellings ("aliases") and a coarse
# category. Each row below is (skill, aliases, category); row order is the
# iteration order of the resulting dict.
SKILL_TAXONOMY = {
    skill: {"aliases": aliases, "category": category}
    for skill, aliases, category in (
        ("python", ["python3"], "Programming"),
        ("git", ["github", "version control"], "MLOps"),
        ("sql", ["mysql", "postgres"], "Data"),
        ("pandas", ["dataframe"], "Data"),
        ("numpy", ["np"], "Data"),
        ("scikit-learn", ["sklearn"], "ML"),
        ("tensorflow", ["keras"], "DL"),
        ("pytorch", ["torch"], "DL"),
        ("nlp", ["spacy", "nltk"], "ML"),
        ("time series", ["forecasting"], "ML"),
        ("statistics", [], "Math"),
        ("docker", [], "MLOps"),
        ("apis", ["fastapi", "flask"], "Backend"),
        ("excel", [], "Data"),
        ("tableau", [], "Viz"),
    )
}

# Skill name -> list of (title, url) pairs pointing at free learning material.
# Skills without an entry here simply have no suggested resource.
FREE_RESOURCES = {
    "python": [
        ("CS50 Python", "https://cs50.harvard.edu/python/"),
    ],
    "sql": [
        ("Mode SQL Tutorial", "https://mode.com/sql-tutorial/"),
    ],
    "pandas": [
        ("Kaggle Pandas", "https://www.kaggle.com/learn/pandas"),
    ],
    "numpy": [
        ("NumPy Guide", "https://numpy.org/doc/stable/user/index.html"),
    ],
    "scikit-learn": [
        ("Sklearn Tutorials", "https://scikit-learn.org/stable/tutorial/"),
    ],
    "tensorflow": [
        ("Keras Docs", "https://keras.io/"),
    ],
    "pytorch": [
        ("PyTorch Tutorials", "https://pytorch.org/tutorials/"),
    ],
    "nlp": [
        ("spaCy Course", "https://course.spacy.io/"),
    ],
    "statistics": [
        ("Khan Academy", "https://www.khanacademy.org/math/statistics-probability"),
    ],
}


# CV & JD Parsing

In [5]:
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    A page whose ``extract_text()`` result is falsy contributes an empty
    string. Returns "" when the ``PyPDF2`` name is bound to ``None``.
    """
    if PyPDF2 is None:
        return ""
    with open(path, 'rb') as handle:
        # Pages are read while the file is still open.
        page_texts = []
        for page in PyPDF2.PdfReader(handle).pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)

def extract_text_from_docx(path):
    """Return the text of all paragraphs of the .docx file at ``path``, newline-joined.

    Returns "" when the ``docx`` name is bound to ``None``.
    """
    if docx is None:
        return ""
    document = docx.Document(path)
    paragraph_texts = (para.text for para in document.paragraphs)
    return "\n".join(paragraph_texts)

def load_cv_text(path, manual):
    """Load CV text from ``path``, falling back to ``manual``.

    Args:
        path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file. The extension is
            matched case-insensitively. May be empty or ``None``.
        manual: Text returned when ``path`` is empty, does not exist, or has
            an unsupported extension.

    Returns:
        The extracted text, or ``manual`` when no supported file is found.
    """
    if not path or not os.path.exists(path):
        return manual

    lowered = path.lower()  # so "CV.PDF" is handled like "cv.pdf"
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(path)
    if lowered.endswith(".docx"):
        return extract_text_from_docx(path)
    if lowered.endswith(".txt"):
        # Close the handle deterministically and decode as UTF-8 instead of
        # the platform default; undecodable bytes are replaced, not fatal.
        with open(path, encoding="utf-8", errors="replace") as f:
            return f.read()
    return manual

# Load the CV (file first, pasted text as fallback), then build lower-cased
# copies of the JD and CV in which every run of whitespace is one space.
cv_text = load_cv_text(cv_path, cv_text_manual)
jd_text_norm = re.sub(r"\s+", " ", jd_text.lower())
cv_text_norm = re.sub(r"\s+", " ", cv_text.lower())

# Skill Gap Analysis - Regex Baseline Detector

In [6]:
def skill_patterns(skill):
    """Compile one case-insensitive, word-bounded regex per spelling of ``skill``.

    The spellings are the lower-cased skill name followed by its aliases from
    ``SKILL_TAXONOMY``; each spelling is escaped so it is matched literally.
    """
    spellings = [skill.lower()] + SKILL_TAXONOMY[skill]["aliases"]
    compiled = []
    for spelling in spellings:
        compiled.append(re.compile(rf"\b{re.escape(spelling)}\b", re.I))
    return compiled

def detect_skills_regex(text):
    """Count taxonomy-skill mentions in ``text``.

    Returns a dict mapping each skill with at least one match (of its name or
    any alias pattern) to the total number of matches, in taxonomy order.
    """
    hits = {}
    for name in SKILL_TAXONOMY:
        total = 0
        for pattern in skill_patterns(name):
            total += len(pattern.findall(text))
        if total:
            hits[name] = total
    return hits


# LLM Skill Extraction

In [7]:
def prompt_for_skills(cv, jd, max_chars=3000):
    """Build the skill-extraction prompt sent to the LLM.

    Args:
        cv: CV text.
        jd: Job-description text.
        max_chars: Maximum number of leading characters of ``cv`` and of
            ``jd`` embedded in the prompt. Longer texts are truncated, so
            skills mentioned past this point are not seen by the model.
            Pass ``None`` to embed the full texts.

    Returns:
        The prompt string: the instruction, the taxonomy skill names as a
        bullet list, then the (possibly truncated) CV and JD.
    """
    taxonomy_list = "\n".join(f"- {k}" for k in SKILL_TAXONOMY)
    cv_excerpt = cv[:max_chars]
    jd_excerpt = jd[:max_chars]
    return f"""
Extract which skills from this taxonomy appear in the CV and JD.
Return JSON with keys: "cv_skills", "jd_skills".

TAXONOMY:
{taxonomy_list}

CV:
{cv_excerpt}

JD:
{jd_excerpt}
"""

def _parse_llm_skills(raw):
    """Parse the model reply into ``{"cv_skills": [...], "jd_skills": [...]}``.

    Accepts replies wrapped in a Markdown code fence. Skill names are
    stripped, lower-cased, de-duplicated (first occurrence wins) and kept only
    if they are keys of ``SKILL_TAXONOMY``. Raises ``ValueError`` (or a JSON
    decode error) when the reply does not have the expected shape.
    """
    text = (raw or "").strip()
    # Models often wrap JSON in ```json ... ``` even when asked not to.
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.S | re.I)
    if fenced:
        text = fenced.group(1)

    data = json.loads(text)
    if not isinstance(data, dict):
        raise ValueError("LLM reply is not a JSON object")

    cleaned = {}
    for key in ("cv_skills", "jd_skills"):
        items = data.get(key)
        if not isinstance(items, list):
            raise ValueError(f"LLM reply has no list under {key!r}")
        kept = []
        for item in items:
            name = str(item).strip().lower()
            if name in SKILL_TAXONOMY and name not in kept:
                kept.append(name)
        cleaned[key] = kept
    return cleaned


def extract_skills_llm(cv, jd):
    """Ask the OpenAI chat model which taxonomy skills appear in the CV and JD.

    Args:
        cv: CV text.
        jd: Job-description text.

    Returns:
        A dict with the keys ``"cv_skills"`` and ``"jd_skills"``, each a list
        of taxonomy skill names, or ``None`` when OpenAI is unavailable, no
        API key is configured, or the request or parsing fails. Failures are
        reported with a printed "LLM failed:" message so callers can fall
        back to another detector.
    """
    try:
        prompt = prompt_for_skills(cv, jd)
        if OPENAI_AVAILABLE and OPENAI_API_KEY:
            from openai import OpenAI
            client = OpenAI(api_key=OPENAI_API_KEY)
            resp = client.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
            # Validate here so an unexpected reply triggers the fallback
            # instead of a KeyError in a later cell.
            return _parse_llm_skills(resp.choices[0].message.content)
    except Exception as e:
        print("LLM failed:", e)
    return None


In [8]:
# Run Detection
# Prefer the LLM result; when it is unavailable (falsy), use the regex baseline.
llm_result = extract_skills_llm(cv_text_norm, jd_text_norm)
if not llm_result:
    cv_skills = detect_skills_regex(cv_text_norm)
    jd_skills = detect_skills_regex(jd_text_norm)
else:
    # The LLM gives no counts, so every reported skill gets the value 1.
    cv_skills = dict.fromkeys(llm_result["cv_skills"], 1)
    jd_skills = dict.fromkeys(llm_result["jd_skills"], 1)

# Skills the JD mentions that were not found in the CV, in JD order.
missing = [wanted for wanted in jd_skills if wanted not in cv_skills]
print("CV skills:", list(cv_skills))
print("JD skills:", list(jd_skills))
print("Missing skills:", missing)


CV skills: ['python', 'git', 'sql', 'pandas', 'numpy', 'pytorch', 'statistics', 'excel', 'tableau']
JD skills: ['python', 'git']
Missing skills: []


# Personalized Learning Path

In [9]:
# Suggested learning order; skills not listed here are scheduled after these.
ORDER = ["python","git","sql","pandas","numpy","scikit-learn","nlp","tensorflow","pytorch"]

def build_path(missing):
    """Turn the missing skills into an ordered learning plan.

    Args:
        missing: Iterable of skill names to learn.

    Returns:
        A list of dicts with the keys ``"Skill"``, ``"Category"``,
        ``"Weeks"`` and ``"Resources"``. Skills listed in ``ORDER`` come
        first, in that order; all others follow in their input order. A skill
        that is not in ``SKILL_TAXONOMY`` gets the category ``"Other"``
        instead of raising ``KeyError``.
    """
    # Position lookup built once instead of calling ORDER.index per comparison.
    rank = {skill: position for position, skill in enumerate(ORDER)}
    unranked = len(ORDER)
    ordered = sorted(missing, key=lambda s: rank.get(s, unranked))

    plan = []
    for s in ordered:
        entry = SKILL_TAXONOMY.get(s, {})
        plan.append({
            "Skill": s,
            "Category": entry.get("category", "Other"),
            "Weeks": 1 if s in ("git", "excel") else 2,
            # Copy so callers cannot mutate the shared FREE_RESOURCES lists.
            "Resources": list(FREE_RESOURCES.get(s, [])),
        })
    return plan

# Build the plan for the skills found in the JD but not in the CV.
learning_plan = build_path(missing)
# Last expression of the cell, so the notebook displays the plan as a table.
pd.DataFrame(learning_plan)


# Project Ideas (Portfolio Builders)

In [10]:
# Portfolio project suggestions, keyed by target role.
ROLE_IDEAS = {
    "data scientist": [
        "End-to-end ML project",
        "NLP classifier",
        "Time series forecasting",
    ],
    "data analyst": [
        "KPI dashboard",
        "Cohort analysis",
        "Sales forecasting",
    ],
}

# Simple heuristic: a JD that mentions "scientist" maps to the data-scientist
# ideas; every other JD gets the data-analyst ideas.
if "scientist" in jd_text_norm:
    role = "data scientist"
else:
    role = "data analyst"
print("Suggested role:", role)
for idea in ROLE_IDEAS[role]:
    print(f"- {idea}")


Suggested role: data analyst
- KPI dashboard
- Cohort analysis
- Sales forecasting


# Export Report

In [11]:
# Assemble the Markdown report line by line: detected skills first, then one
# bullet per learning-plan entry.
report = []
report.append("# SkillBridge Plan\n")
report.append("## Skills\n")
report.append(f"CV skills: {list(cv_skills)}\n")
report.append(f"JD skills: {list(jd_skills)}\n")
report.append(f"Missing: {missing}\n")

report.append("\n## Learning Path\n")
for row in learning_plan:
    report.append(f"- {row['Skill']} ({row['Category']}), {row['Weeks']} weeks")

out_path = "skillbridge_report.md"
# Write UTF-8 explicitly so the file does not depend on the platform's
# default encoding (for example cp1252 on Windows).
with open(out_path, "w", encoding="utf-8") as f:
    f.write("\n".join(report))

print("Report saved to", out_path)


Report saved to skillbridge_report.md
