<a href="https://colab.research.google.com/github/malluandcompany/Projects/blob/main/jobbank_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 pandas




In [None]:
!pip install -q requests beautifulsoup4 pandas

import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

BASE = "https://www.jobbank.gc.ca"
SEARCH_URL = "https://www.jobbank.gc.ca/jobsearch/jobsearch?sort=M&searchstring=data+analyst"
PAGES = 2
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

EMPTY = ""

# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------

def unique(seq):
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out


def get_posting_links(search_html):
    soup = BeautifulSoup(search_html, "html.parser")
    anchors = soup.select('a[href*="/jobsearch/jobposting/"]')
    links = []
    for a in anchors:
        href = a.get("href", "")
        if "/jobsearch/jobposting/" in href:
            links.append(urljoin(BASE, href))
    return unique(links)


def clean_employer(raw):
    if not raw:
        return EMPTY
    txt = re.sub(r'\bEmployer\b[: ]*', '', raw, flags=re.I)
    txt = re.sub(r'\bEmployer Details\b', '', txt, flags=re.I)
    txt = re.sub(r'\bAuto List\b', '', txt, flags=re.I)
    txt = re.sub(r'\s{2,}', ' ', txt).strip()
    return txt


def split_location(raw):
    """Return City, Province, PostalCode; empty if not found."""
    if not raw or not isinstance(raw, str):
        return EMPTY, EMPTY, EMPTY

    text = raw.strip()
    text = re.sub(r'^(Location|Lieu de travail)\s*[:\-]?\s*', '', text, flags=re.I).strip()

    postal_match = re.search(r'\b([A-Z]\d[A-Z])\s?(\d[A-Z]\d)\b', text)
    postal = f"{postal_match.group(1)} {postal_match.group(2)}" if postal_match else EMPTY

    prov_match = re.search(r'\b(AB|BC|MB|NB|NL|NS|NT|NU|ON|PE|QC|SK|YT)\b', text)
    province = prov_match.group(1) if prov_match else EMPTY

    city = EMPTY
    if province:
        before_prov = text.split(province)[0]
        parts = [p.strip() for p in before_prov.split(',') if p.strip()]
        city = parts[-1] if parts else before_prov.strip()
    else:
        parts = [p.strip() for p in text.split(',') if p.strip()]
        city = parts[0] if parts else EMPTY

    return city, province, postal


DATE_RE = re.compile(
    r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\b'
)

def extract_date(soup):
    t = soup.select_one('time[datetime]')
    if t:
        txt = t.get_text(" ", strip=True)
        m = DATE_RE.search(txt)
        if m:
            return m.group(0)
        return t.get("datetime", EMPTY)
    txt = soup.get_text(" ", strip=True)
    m = DATE_RE.search(txt)
    return m.group(0) if m else EMPTY


def extract_noc(soup):
    a = soup.select_one('a[href*="noc.esdc.gc.ca"]')
    if a:
        m = re.search(r'\b(\d{4,5})\b', a.get_text(" ", strip=True))
        if m:
            return m.group(1)
    txt = soup.get_text(" ", strip=True)
    m = re.search(r'\b(\d{4,5})\b', txt)
    return m.group(1) if m else EMPTY


def extract_employment_type(soup):
    text = soup.get_text(" ", strip=True).lower()
    keywords = ["full time", "part time", "permanent", "temporary", "contract", "casual", "seasonal"]
    found = [k.title() for k in keywords if k in text]
    return ", ".join(found) if found else EMPTY


def extract_work_arrangement(soup):
    text = soup.get_text(" ", strip=True).lower()
    if any(k in text for k in ["remote", "work from home", "telework"]):
        return "Remote/Hybrid"
    if any(k in text for k in ["on site", "on-site"]):
        return "On-site"
    return EMPTY


def extract_source_site_and_url(soup):
    for a in soup.select('a[href]'):
        href = a["href"]
        if "jobbank.gc.ca" not in href:
            site = urlparse(href).netloc.replace("www.", "")
            return site, href
    return EMPTY, EMPTY


def parse_posting(url):
    r = requests.get(url, headers=headers, timeout=20)
    if r.status_code != 200:
        return {"URL": url, "Status": r.status_code}

    soup = BeautifulSoup(r.text, "html.parser")

    title = soup.select_one("h1").get_text(strip=True) if soup.select_one("h1") else EMPTY

    employer_el = soup.find(string=re.compile("Employer", re.I))
    employer = clean_employer(employer_el.strip()) if employer_el else EMPTY

    location_el = soup.find(string=re.compile("Location", re.I))
    location_raw = location_el.parent.get_text(strip=True) if location_el else EMPTY
    city, province, postal = split_location(location_raw)

    wage_el = soup.find(string=re.compile("Wage", re.I)) or soup.find(string=re.compile("Salary", re.I))
    wage = wage_el.parent.get_text(strip=True) if wage_el else EMPTY

    date_posted = extract_date(soup)
    noc = extract_noc(soup)
    emp_type = extract_employment_type(soup)
    work_arr = extract_work_arrangement(soup)
    site, link = extract_source_site_and_url(soup)

    return {
        "Title": title,
        "Employer": employer,
        "City": city,
        "Province": province,
        "PostalCode": postal,
        "Wage": wage,
        "DatePosted": date_posted,
        "NOC": noc,
        "EmploymentType": emp_type,
        "WorkArrangement": work_arr,
        "SourceSite": site,
        "SourceURL": link,
        "URL": url
    }

# ---------------------------------------------------------------------------
# Crawl JobBank
# ---------------------------------------------------------------------------

all_rows = []
for p in range(1, PAGES + 1):
    page_url = f"{SEARCH_URL}&page={p}"
    print("Fetching:", page_url)
    r = requests.get(page_url, headers=headers, timeout=20)
    if r.status_code != 200:
        print("Failed:", page_url)
        continue
    links = get_posting_links(r.text)
    print(f"Page {p} → {len(links)} job links")

    for u in links:
        all_rows.append(parse_posting(u))
        time.sleep(1.25)

df = pd.DataFrame(all_rows).drop_duplicates(subset=["URL"]).reset_index(drop=True)
df.to_csv("jobbank_jobs.csv", index=False)
print("Saved", len(df), "jobs → jobbank_jobs.csv")
df.head(10)


Fetching: https://www.jobbank.gc.ca/jobsearch/jobsearch?sort=M&searchstring=data+analyst&page=1
Page 1 → 25 job links
Fetching: https://www.jobbank.gc.ca/jobsearch/jobsearch?sort=M&searchstring=data+analyst&page=2
Page 2 → 25 job links
Saved 50 jobs → jobbank_jobs.csv


Unnamed: 0,Title,Employer,City,Province,PostalCode,Wage,DatePosted,NOC,EmploymentType,WorkArrangement,SourceSite,SourceURL,URL
0,big data analyst,Employers,,,,Median wage,"October 03, 2025",21211,Full Time,On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
1,big data analyst,Employers,,,,Median wage,"October 18, 2025",21211,"Full Time, Casual",On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
2,data quality analystLMIA requested,Employers,,,,Median wage,"September 04, 2025",21223,"Full Time, Permanent, Temporary",Remote/Hybrid,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
3,data mining analyst,Employers,,,,Median wage,"September 30, 2025",21211,,On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
4,data mining analystStudent job,Employers,,,,Median wage,"September 24, 2025",21211,Full Time,On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
5,"analyst, database",Employers,,,,Median wage,"October 16, 2025",21223,Full Time,On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
6,data mining analyst,Employers,,,,Median wage,"October 03, 2025",21211,,On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
7,data analyst - informatics and systems,Employers,,,,Median wage,"October 20, 2025",21223,"Full Time, Permanent, Temporary",Remote/Hybrid,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
8,data analyst - informatics and systems,Employers,,,,Median wage,"October 11, 2025",21223,Part Time,On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...
9,data analyst - informatics and systems,Employers,,,,Median wage,"October 10, 2025",21223,,On-site,,#searchString,https://www.jobbank.gc.ca/jobsearch/jobposting...


In [None]:
# Convert the CSV to Excel format
excel_filename = "jobbank_jobs.xlsx"
df.to_excel(excel_filename, index=False)
print(f"Excel file saved as {excel_filename}")


Excel file saved as jobbank_jobs.xlsx


In [None]:
from google.colab import files
files.download("jobbank_jobs.xlsx")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def parse_posting(url):
    print("Parsing:", url)
    r = requests.get(url, headers=headers, timeout=20)
    if r.status_code != 200:
        return {"URL": url, "Status": r.status_code}

    soup = BeautifulSoup(r.text, "html.parser")

    # Job title
    title_tag = soup.select_one("span.noc-title, h1")
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Employer + City fallback from <meta description>
    meta_desc = soup.find("meta", {"name": "description"})
    employer, city, province = "", "", ""
    if meta_desc and meta_desc.get("content"):
        desc = meta_desc["content"]
        # Try extracting Employer
        m_emp = re.search(r"for a .* at ([A-Za-z0-9&'’\-\s]+?) At", desc)
        if m_emp:
            employer = m_emp.group(1).strip()
        # Try extracting City/Province
        m_loc = re.search(r"At ([A-Za-z\s]+),?\s?([A-Z]{2})?", desc)
        if m_loc:
            city = m_loc.group(1).strip()
            province = m_loc.group(2) if m_loc.group(2) else ""

    # Date Posted
    date_tag = soup.select_one('span[property="datePosted"]')
    date_posted = date_tag.get_text(strip=True).replace("Posted on", "").strip() if date_tag else ""

    # NOC Code
    noc_tag = soup.select_one("span.noc-no")
    noc_match = re.search(r"\d{4,5}", noc_tag.get_text()) if noc_tag else None
    noc = noc_match.group(0) if noc_match else ""

    # Wage
    wage_el = soup.find(string=re.compile("Wage|Salary", re.I))
    wage = wage_el.parent.get_text(strip=True) if wage_el else ""

    # Employment Type
    emp_type = extract_employment_type(soup)

    # Work Arrangement
    work_arr = extract_work_arrangement(soup)

    # Source Site & URL
    site, link = extract_source_site_and_url(soup)

    return {
        "Title": title,
        "Employer": employer,
        "City": city,
        "Province": province,
        "Wage": wage,
        "DatePosted": date_posted,
        "NOC": noc,
        "EmploymentType": emp_type,
        "WorkArrangement": work_arr,
        "SourceSite": site,
        "SourceURL": link,
        "URL": url
    }


In [None]:
from google.colab import files
uploaded = files.upload()


Saving jobbank_jobs.xlsx to jobbank_jobs.xlsx


In [None]:
import pandas as pd
list(uploaded.keys())


['jobbank_jobs.xlsx']

In [None]:
# --- PHASE 2.1: DATA CLEANING & PREPROCESSING ---

import pandas as pd
import numpy as np
import re

# Load dataset (Excel version)
df = pd.read_excel("jobbank_jobs.xlsx", engine="openpyxl")

# --- General cleaning ---
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
df.replace(["", "None", None, np.nan], "N/A", inplace=True)
df.drop_duplicates(subset=["url"], inplace=True)

# --- Column-specific cleaning ---

# Title
if "title" in df.columns:
    df["title"] = df["title"].astype(str).str.title().str.replace(r"\(.*?\)", "", regex=True).str.strip()

# Employer
if "employer" in df.columns:
    df["employer"] = df["employer"].astype(str).str.replace(r"Employer\s*Details.*", "", regex=True)
    df["employer"] = df["employer"].str.replace(r"Employer\s*", "", regex=True).str.strip()

# City / Province / Postal code
def normalize_province(p):
    prov_map = {
        "Alberta": "AB", "British Columbia": "BC", "Manitoba": "MB", "New Brunswick": "NB",
        "Newfoundland and Labrador": "NL", "Nova Scotia": "NS", "Northwest Territories": "NT",
        "Nunavut": "NU", "Ontario": "ON", "Prince Edward Island": "PE", "Quebec": "QC",
        "Saskatchewan": "SK", "Yukon": "YT"
    }
    if isinstance(p, str):
        p = p.strip().title()
        return prov_map.get(p, p if p in prov_map.values() else "N/A")
    return "N/A"

if "province" in df.columns:
    df["province"] = df["province"].apply(normalize_province)

postal_pattern = re.compile(r"[A-Z]\d[A-Z]\s?\d[A-Z]\d")
if "postal_code" in df.columns:
    df["postal_code"] = df["postal_code"].apply(
        lambda x: x if isinstance(x, str) and postal_pattern.match(x) else "N/A"
    )

# Wage
if "wage" in df.columns:
    df["wage_amount"] = df["wage"].astype(str).str.extract(r"(\$?\d+(?:\.\d{1,2})?)")[0]
    df["wage_unit"] = df["wage"].astype(str).str.extract(r"(hour|week|month|year)", flags=re.I)[0]
    df["wage_amount"] = (
        df["wage_amount"].replace(r"[^0-9.]", "", regex=True).replace("", "N/A")
    )

# Date
if "date_posted" in df.columns:
    df["date_posted"] = pd.to_datetime(df["date_posted"], errors="coerce")
    df["date_posted"] = df["date_posted"].dt.strftime("%Y-%m-%d").fillna("N/A")

# NOC
if "noc" in df.columns:
    df["noc"] = df["noc"].apply(
        lambda x: x if re.match(r"^\d{4,5}$", str(x)) else "N/A"
    )

# Employment type
valid_types = ["Full-Time", "Part-Time", "Contract", "Temporary", "Casual", "Seasonal"]
if "employment_type" not in df.columns:
    df["employment_type"] = "N/A"
else:
    df["employment_type"] = df["employment_type"].astype(str).str.title()
df.loc[~df["employment_type"].isin(valid_types), "employment_type"] = "N/A"

# Work arrangement
valid_work = ["On-Site", "Remote/Hybrid"]
if "work_arrangement" not in df.columns:
    df["work_arrangement"] = "N/A"
else:
    df["work_arrangement"] = df["work_arrangement"].astype(str).str.title()
df.loc[~df["work_arrangement"].isin(valid_work), "work_arrangement"] = "N/A"

# Source site & URL
if "source_site" not in df.columns:
    df["source_site"] = "N/A"
else:
    df["source_site"] = df["source_site"].astype(str).str.extract(r"([A-Za-z0-9.-]+)")

if "source_url" not in df.columns:
    df["source_url"] = "N/A"
else:
    df["source_url"] = df["source_url"].apply(
        lambda x: x if isinstance(x, str) and x.startswith("http") else "N/A"
    )

# --- Validation filters ---
df = df[
    (df["title"] != "N/A")
    & (df["employer"] != "N/A")
    & (df["url"] != "N/A")
]

# --- Export cleaned dataset ---
df.to_csv("jobbank_cleaned.csv", index=False, encoding="utf-8")

# Log rows containing N/A values
cleaning_log = df[df.isin(["N/A"]).any(axis=1)]
cleaning_log.to_csv("jobbank_cleaning_log.csv", index=False, encoding="utf-8")

print("✅ Cleaned dataset saved as 'jobbank_cleaned.csv'")
print("🧾 Cleaning log saved as 'jobbank_cleaning_log.csv'")
print(f"📊 Final dataset shape: {df.shape}")


✅ Cleaned dataset saved as 'jobbank_cleaned.csv'
🧾 Cleaning log saved as 'jobbank_cleaning_log.csv'
📊 Final dataset shape: (16, 19)


  df["wage_amount"].replace(r"[^0-9.]", "", regex=True).replace("", "N/A")


In [None]:
# --- PHASE 2.2: FEATURE ENGINEERING & LABEL ENCODING (Updated Safe Version) ---
import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load cleaned dataset
df = pd.read_csv("jobbank_cleaned.csv")

# ==================== 1️⃣  DERIVED FEATURES ====================

# --- Job Seniority ---
def get_seniority(title):
    title = str(title).lower()
    if any(k in title for k in ["senior", "lead", "principal", "manager", "head"]):
        return "Senior"
    elif any(k in title for k in ["junior", "entry", "assistant", "associate"]):
        return "Junior"
    else:
        return "Mid"

df["job_seniority"] = df["title"].apply(get_seniority)

# --- Wage Normalization (to hourly) ---
def wage_to_hourly(amount, unit):
    try:
        amount = float(amount)
    except:
        return np.nan
    if isinstance(unit, str):
        unit = unit.lower()
        if "hour" in unit:
            return amount
        elif "week" in unit:
            return amount / 40          # assume 40 hours per week
        elif "month" in unit:
            return amount / (4 * 40)    # 4 weeks per month
        elif "year" in unit:
            return amount / (52 * 40)   # 52 weeks per year
    return np.nan

df["wage_hourly"] = df.apply(lambda x: wage_to_hourly(x.get("wage_amount"), x.get("wage_unit")), axis=1)

# --- Days Since Posted (auto-detect column) ---
date_col = None
for c in df.columns:
    if "date" in c.lower() and "post" in c.lower():
        date_col = c
        break

if date_col:
    df["days_since_posted"] = (
        datetime.now() - pd.to_datetime(df[date_col], errors="coerce")
    ).dt.days
    df["days_since_posted"] = df["days_since_posted"].fillna(df["days_since_posted"].median())

else:
    df["days_since_posted"] = np.nan

# --- Province Code ---
province_map = {
    "AB": 1, "BC": 2, "MB": 3, "NB": 4, "NL": 5, "NS": 6,
    "NT": 7, "NU": 8, "ON": 9, "PE": 10, "QC": 11, "SK": 12, "YT": 13
}
df["province_code"] = df["province"].map(province_map).fillna(0).astype(int)

# ==================== 2️⃣  LABEL ENCODING ====================
enc_cols = ["employment_type", "work_arrangement", "province", "source_site"]
label_maps = {}

for col in enc_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col + "_encoded"] = le.fit_transform(df[col].astype(str))
        label_maps[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# ==================== 3️⃣  NORMALIZATION ====================
scaler = MinMaxScaler()
num_cols = ["wage_hourly", "days_since_posted"]
df[num_cols] = scaler.fit_transform(df[num_cols])

# ==================== 4️⃣  OUTPUT FILES ====================
df.to_csv("jobbank_features.csv", index=False)

with open("feature_summary.txt", "w", encoding="utf-8") as f:
    f.write("=== Feature Engineering Summary ===\n\n")
    f.write(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}\n\n")
    f.write("Encoded Label Maps:\n")
    for col, mapping in label_maps.items():
        f.write(f"\n{col}:\n{mapping}\n")

print("✅ Feature-engineered dataset saved as 'jobbank_features.csv'")
print("🧾 Summary file saved as 'feature_summary.txt'")
print(f"📊 Final dataset shape: {df.shape}")


✅ Feature-engineered dataset saved as 'jobbank_features.csv'
🧾 Summary file saved as 'feature_summary.txt'
📊 Final dataset shape: (16, 27)


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


In [2]:
# =======================================
# 🔧 Setup for OCR (ONE-TIME Cell)
# =======================================

!apt-get update
!apt-get install -y tesseract-ocr
!pip install pytesseract pdf2image pdfminer.six
!apt-get install -y poppler-utils


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://cli.github.com/packages stable InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,086 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,

In [3]:
# ========================================
# ✅ Phase 3.1 - Unified Resume Parser (Text + OCR Fallback)
# Notes: Automatically detects extractable text; falls back to OCR only if needed.
# Outputs: parsed_resume.csv
# ========================================

import io
import re
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
from google.colab import files

uploaded = files.upload()
resume_name = list(uploaded.keys())[0]

# --------------------------
# Step A: Try text extraction
# --------------------------
raw_text = extract_text(resume_name).strip()

def text_quality(text):
    words = text.split()
    return len(words)

if text_quality(raw_text) < 40:  # Threshold for weak digital text
    print("⚠ Low text quality → Running OCR fallback...")

    images = convert_from_path(resume_name)
    ocr_text = ""
    for img in images:
        ocr_text += pytesseract.image_to_string(img)
    final_text = ocr_text
else:
    print("✅ Text extracted successfully, skipping OCR")
    final_text = raw_text

# --------------------------
# Step B: Field extraction
# --------------------------
def extract_name(text):
    match = re.search(r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b", text)
    return match.group(0) if match else "Unknown"

def extract_email(text):
    match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    return match.group(0) if match else "N/A"

def extract_phone(text):
    match = re.search(r"\+?\d[\d\- \(\)]{7,}\d", text)
    return match.group(0) if match else "N/A"

def extract_skills(text):
    keywords = ["Python", "SQL", "Tableau", "Power BI", "Excel", "Machine Learning", "Analytics"]
    found = [k for k in keywords if k.lower() in text.lower()]
    return ", ".join(found) if found else "N/A"

data = {
    "Name": extract_name(final_text),
    "Email": extract_email(final_text),
    "Phone": extract_phone(final_text),
    "Skills": extract_skills(final_text),
}

df = pd.DataFrame([data])
df.to_csv("parsed_resume.csv", index=False)

print("\n✅ Unified Resume parsing complete → 'parsed_resume.csv'")
df.head()


Saving Data Analyst - ats compatible-1.pdf to Data Analyst - ats compatible-1.pdf
✅ Text extracted successfully, skipping OCR

✅ Unified Resume parsing complete → 'parsed_resume.csv'


Unnamed: 0,Name,Email,Phone,Skills
0,Bay Area,professionalemail@resumeworded.com,+1-234-456-789,"Python, Tableau, Analytics"


In [6]:
# [2.3] Enhanced Resume Parsing (Experience, Education, URLs)
import re

def extract_experience(txt):
    pattern = r"(?:^|\n)(WORK EXPERIENCE|PROFESSIONAL EXPERIENCE|EXPERIENCE)(.*?)(\n[A-Z ]{6,}|$)"
    m = re.search(pattern, txt, flags=re.I | re.S)
    if m:
        content = m.group(2).strip()
        if len(content) > 10:
            return content
    return "N/A"


def extract_education(text):
    pattern = r"(?:Education)(.*?)(?:Skills|Projects|Experience|$)"
    match = re.search(pattern, text, re.I | re.S)
    return match.group(1).strip() if match else "N/A"

def extract_linkedin(txt):
    m = re.search(r"(https?://[^\s]*linkedin\.com[^\s]*)", txt, flags=re.I)
    return m.group(1).strip() if m else "N/A"


def extract_name(text):
    first_line = text.split("\n")[0]
    if re.match(r"[A-Za-z]+ [A-Za-z]+", first_line):
        return first_line.strip()
    return "N/A"

# Apply functions
df["Name"] = extract_name(final_text)
df["Experience"] = extract_experience(final_text)
df["Education"] = extract_education(final_text)
df["LinkedIn"] = extract_linkedin(final_text)

df.to_csv("parsed_resume.csv", index=False)
print("✅ Enriched resume fields extracted!")
df.head(1)

✅ Enriched resume fields extracted!


Unnamed: 0,Name,Email,Phone,Skills,Experience,Education,LinkedIn
0,JOHN DOE,professionalemail@resumeworded.com,+1-234-456-789,"Python, Tableau, Analytics",,"Resume Worded University, San Francisco, CA 20...",


In [7]:
df["Experience"] = extract_experience(text)
df["LinkedIn"] = extract_linkedin(text)
df.to_csv("parsed_resume.csv", index=False)
df.head()


NameError: name 'text' is not defined

In [2]:
# ===============================================================
# Code [3.2A] Unified Resume Parser v3 (Text-first, OCR fallback)
# Purpose: Extract structured resume fields from hybrid layouts
# Outputs:
#   - parsed_resume_structured.json (nested, for audit)
#   - parsed_resume_flat.csv        (flat, for ML/matching)
# Notes: Layout-agnostic (sidebar or full-width), robust sectioning
# ===============================================================

# --- Dependencies (install once per runtime) ---
# If already installed earlier in your notebook, you can comment these out.
!apt-get -qq update
!apt-get -qq install -y tesseract-ocr poppler-utils
!pip -q install pdfminer.six pdf2image pytesseract pandas

import re
import json
import io
import math
import pytesseract
import pandas as pd
from datetime import datetime
from pdfminer.high_level import extract_text
from pdf2image import convert_from_path
from google.colab import files

# ---------------------------
# [3.2A.1] Upload PDF
# ---------------------------
print("[3.2A] Upload a resume PDF…")
uploaded = files.upload()
resume_path = list(uploaded.keys())[0]

# ---------------------------
# [3.2A.2] Text extraction (text-first, OCR fallback)
# ---------------------------
def extract_pdf_text(path, ocr_word_threshold=40):
    raw = extract_text(path) or ""
    words = len(raw.split())
    if words >= ocr_word_threshold:
        print("[3.2A] Digital text OK (pdfminer).")
        return raw
    print("[3.2A] Low text quality -> using OCR fallback (pdf2image + Tesseract).")
    images = convert_from_path(path)  # requires poppler-utils
    ocr_text = []
    for img in images:
        ocr_text.append(pytesseract.image_to_string(img))
    return "\n".join(ocr_text)

raw_text = extract_pdf_text(resume_path)

# ---------------------------
# [3.2A.3] Normalization helpers
# ---------------------------
def normalize_text(txt: str) -> str:
    t = txt.replace("\r", "\n")
    t = re.sub(r"[ \t]+", " ", t)         # collapse spaces
    t = re.sub(r"\n{2,}", "\n\n", t)      # collapse blank lines
    t = re.sub(r"•|\u2022|\u2023|\u25E6|\u2043|\u2219", "-", t)  # bullets -> hyphen
    t = re.sub(r"\xa0", " ", t)           # non-breaking spaces
    return t.strip()

text = normalize_text(raw_text)

# ---------------------------
# [3.2A.4] Section detection (layout-agnostic)
# Works for both sidebar (skills/contact left) and full-width bodies.
# ---------------------------
SECTION_HEADERS = {
    "EXPERIENCE": [r"work experience", r"professional experience", r"experience", r"employment history"],
    "EDUCATION":  [r"education", r"academic background", r"qualifications"],
    "SKILLS":     [r"skills", r"technical skills", r"core competencies", r"toolbox"],
}

def find_section(txt: str, target_headers, stop_headers):
    """
    Locate a section by target header keywords and stop when a new ALL-CAPS header begins
    or another known section header appears. Case-insensitive across hybrid layouts.
    """
    # Build a combined regex to find target header
    target = r"|".join([f"(?:^|\\n)\\s*{h}\\s*$" for h in target_headers])
    # Build stop pattern: either next ALL CAPS line or another known header
    all_headers = sum(SECTION_HEADERS.values(), [])
    stops = r"|".join([f"(?:^|\\n)\\s*{h}\\s*$" for h in all_headers if h not in target_headers])
    # Also stop at generic ALL CAPS header lines (>=6 chars)
    stop_caps = r"(?:^|\n)\s*[A-Z][A-Z \-]{5,}\s*$"

    m = re.search(target, txt, flags=re.I|re.M)
    if not m:
        return None

    start = m.end()
    # Look for next stop: either known header or ALL CAPS header
    stop_m = re.search(f"(?:{stops})|{stop_caps}", txt[start:], flags=re.I|re.M)
    end = start + stop_m.start() if stop_m else len(txt)
    return txt[start:end].strip()

def extract_section(txt, key):
    targets = SECTION_HEADERS[key]
    # Stop when encountering any other section header
    stop_keys = [h for k,v in SECTION_HEADERS.items() if k != key for h in v]
    return find_section(txt, targets, stop_keys)

experience_block = extract_section(text, "EXPERIENCE")
education_block  = extract_section(text, "EDUCATION")
skills_block     = extract_section(text, "SKILLS")

# ---------------------------
# [3.2A.5] Scalar field extraction
# ---------------------------
def extract_name(txt):
    # Try first non-empty line with two capitalized tokens
    for line in txt.splitlines():
        line = line.strip()
        if not line:
            continue
        if re.match(r"^[A-Z][a-zA-Z'\-]+ [A-Z][a-zA-Z'\-]+$", line):
            return line
        # Uppercase header style names
        if re.match(r"^[A-Z][A-Z '\-]+ [A-Z][A-Z '\-]+$", line) and len(line.split()) <= 4:
            return line.title()
        break
    return "N/A"

def extract_email(txt):
    m = re.search(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}", txt)
    return m.group(0) if m else "N/A"

def extract_phone(txt):
    m = re.search(r"\+?\d[\d \-\(\)]{7,}\d", txt)
    return m.group(0) if m else "N/A"

def extract_linkedin(txt):
    m = re.search(r"(https?://[^\s]*linkedin\.com[^\s]*)", txt, flags=re.I)
    return m.group(1).strip() if m else "N/A"

name     = extract_name(text)
email    = extract_email(text)
phone    = extract_phone(text)
linkedin = extract_linkedin(text)

# ---------------------------
# [3.2A.6] Experience parsing
# Handles patterns like:
#   Company – Role (Jan 2021 – Jun 2023)  City, ST
#   Role, Company | 2019–2022
# And bullet lines following each entry.
# ---------------------------
MONTHS = r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[a-z]*"
DATE   = rf"(?:{MONTHS}\s+\d{{4}}|\d{{4}})"
RANGE  = rf"({DATE})\s*(?:–|-|to)\s*(Present|{DATE})"

exp_entry_re = re.compile(
    rf"""
    ^\s*
    (?P<title>[^,\-|\n]{{3,}}?)
    (?:\s*[,|\-]\s*|\s+at\s+)
    (?P<company>[^|\-\n]{{2,}}?)
    (?:\s*\|\s*|\s*[-–]\s*|,\s*)
    (?P<dates>{RANGE})
    """,
    re.IGNORECASE|re.MULTILINE|re.VERBOSE
)

def parse_dates(datestr):
    # Return (start_dt, end_dt) and years float
    # Accept Present as end
    now = datetime.now()
    # Extract all year/month tokens
    toks = re.findall(rf"{MONTHS}|\d{{4}}|Present", datestr, flags=re.I)
    # Heuristics: try to build start/end strings
    # Normalize to first and last found year/month combos
    # Fallback: just years like 2019 - 2022
    years = re.findall(r"\b(19|20)\d{2}\b", datestr)
    if "Present" in datestr:
        end = now
    else:
        # last year in string, else None
        m_end = re.search(rf"(?:{MONTHS}\s+)?((?:19|20)\d{{2}})\b(?!.+((?:19|20)\d{{2}}))", datestr, flags=re.I)
        if m_end:
            y = int(m_end.group(1))
            end = datetime(y, 12, 1)
        else:
            end = None

    m_start = re.search(rf"^({MONTHS}\s+)?((?:19|20)\d{{2}})", datestr.strip(), flags=re.I)
    if m_start:
        y = int(m_start.group(2))
        start = datetime(y, 1, 1)
    else:
        start = None

    years_float = None
    if start and end:
        years_float = max(0.0, (end - start).days / 365.25)
    return start, end, years_float

def parse_experience_block(block):
    if not block or not block.strip():
        return [], 0.0
    entries = []
    total_years = 0.0

    # Split into logical chunks by blank lines
    chunks = [c.strip() for c in re.split(r"\n{2,}", block) if c.strip()]
    for chunk in chunks:
        # The header line often contains role/company/dates; bullets follow.
        header_line = chunk.split("\n")[0]
        m = exp_entry_re.search(header_line)
        if not m:
            # Try alternate "Company - Role (dates)"
            alt = re.search(rf"^(?P<company>[^,\n]{{2,}})\s*[-–]\s*(?P<title>[^(\n]{{2,}})\s*\((?P<dates>{RANGE})\)", header_line, flags=re.I)
            if not alt:
                continue
            gd = alt.groupdict()
        else:
            gd = m.groupdict()

        company = re.sub(r"\s+", " ", gd.get("company","")).strip(" -|")
        title   = re.sub(r"\s+", " ", gd.get("title","")).strip(" -|")
        datestr = gd.get("dates","")
        start, end, years = parse_dates(datestr)
        # Bullets (subsequent lines starting with "-" or "*")
        bullets = []
        for line in chunk.split("\n")[1:]:
            if re.match(r"^\s*[-*•]\s+", line) or len(line.strip())>0 and line.strip()[0] in {"-", "•", "*"}:
                bullets.append(re.sub(r"^\s*[-*•]\s*", "", line).strip())
        entries.append({
            "company": company or "N/A",
            "title": title or "N/A",
            "dates": datestr,
            "start": start.isoformat() if start else "N/A",
            "end": end.isoformat() if end else "N/A",
            "years": round(years, 2) if years is not None else None,
            "bullets": bullets[:8]  # cap for cleanliness
        })
        if years:
            total_years += years

    return entries, round(total_years, 2)

exp_entries, total_years_exp = parse_experience_block(experience_block)

# ---------------------------
# [3.2A.7] Education parsing
# ---------------------------
DEGREE_RX = r"(Bachelor|Master|B\.?Sc|M\.?Sc|B\.?Eng|M\.?Eng|MBA|Ph\.?D|Diploma|Certificate)"
def parse_education_block(block):
    if not block or not block.strip():
        return []
    edu = []
    # Split lines and group into small chunks around degree keywords
    lines = [l.strip() for l in block.splitlines() if l.strip()]
    buf = []
    for ln in lines:
        buf.append(ln)
        if re.search(DEGREE_RX, ln, flags=re.I) or len(buf) >= 3:
            chunk = " | ".join(buf)
            degree = re.search(DEGREE_RX, chunk, flags=re.I)
            year = re.search(r"\b(19|20)\d{2}\b", chunk)
            edu.append({
                "institution": ln if not degree else chunk.split("|")[0],
                "degree": degree.group(0) if degree else "N/A",
                "field": "N/A",  # can be improved with NLP later
                "year": int(year.group(0)) if year else "N/A",
                "raw": chunk
            })
            buf = []
    if buf:
        chunk = " | ".join(buf)
        degree = re.search(DEGREE_RX, chunk, flags=re.I)
        year = re.search(r"\b(19|20)\d{2}\b", chunk)
        edu.append({
            "institution": chunk.split("|")[0],
            "degree": degree.group(0) if degree else "N/A",
            "field": "N/A",
            "year": int(year.group(0)) if year else "N/A",
            "raw": chunk
        })
    return edu

edu_entries = parse_education_block(education_block)

# ---------------------------
# [3.2A.8] Skills extraction (union of section + global scan)
# ---------------------------
SKILL_VOCAB = [
    "python", "sql", "excel", "tableau", "power bi", "pandas", "numpy",
    "scikit-learn", "tensorflow", "pytorch", "spark", "aws", "gcp", "azure",
    "airflow", "dbt", "matplotlib", "seaborn", "nlp", "git", "jira", "mlops"
]

def extract_skills(text_all, skills_section):
    found = set()
    hay = (skills_section or "") + "\n" + text_all
    for sk in SKILL_VOCAB:
        if re.search(rf"\b{re.escape(sk)}\b", hay, flags=re.I):
            found.add(sk)
    return sorted(found)

skills_list = extract_skills(text, skills_block)
skills_score = len(skills_list)

# ---------------------------
# [3.2A.9] Seniority inference
# ---------------------------
def infer_seniority(exp_entries):
    # Simple heuristic: look across titles
    senior_kw = ["senior", "lead", "principal", "staff", "manager", "head"]
    junior_kw = ["junior", "intern", "assistant", "associate"]
    titles = " ".join([e["title"].lower() for e in exp_entries if e.get("title")])
    if any(k in titles for k in senior_kw): return "Senior"
    if any(k in titles for k in junior_kw): return "Junior"
    return "Mid"

seniority = infer_seniority(exp_entries)

# ---------------------------
# [3.2A.10] Assemble outputs
# ---------------------------
structured = {
    "name": name,
    "email": email,
    "phone": phone,
    "linkedin": linkedin,
    "skills": skills_list,
    "skills_score": skills_score,
    "experience_total_years": total_years_exp,
    "seniority": seniority,
    "experience": exp_entries,
    "education": edu_entries,
    "raw_sections": {
        "experience_block": (experience_block or "")[:2000],
        "education_block":  (education_block or "")[:2000],
        "skills_block":     (skills_block or "")[:2000],
    }
}

# Save JSON (nested)
with open("parsed_resume_structured.json", "w", encoding="utf-8") as f:
    json.dump(structured, f, ensure_ascii=False, indent=2, default=str)

# Save CSV (flat)
flat = {
    "Name": name,
    "Email": email,
    "Phone": phone,
    "LinkedIn": linkedin,
    "Skills": ", ".join(skills_list),
    "SkillsScore": skills_score,
    "ExperienceYears": total_years_exp,
    "Seniority": seniority,
    "ExperienceEntries": len(exp_entries),
    "EducationEntries": len(edu_entries),
}
pd.DataFrame([flat]).to_csv("parsed_resume_flat.csv", index=False)

print("[3.2A] Parsing complete.")
print("- JSON: parsed_resume_structured.json")
print("- CSV : parsed_resume_flat.csv")
print("\nPreview flat summary:")
pd.DataFrame([flat])


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[3.2A] Upload a resume PDF…


Saving Accounting Resume Sample 1.pdf to Accounting Resume Sample 1.pdf
[3.2A] Digital text OK (pdfminer).
[3.2A] Parsing complete.
- JSON: parsed_resume_structured.json
- CSV : parsed_resume_flat.csv

Preview flat summary:


Unnamed: 0,Name,Email,Phone,LinkedIn,Skills,SkillsScore,ExperienceYears,Seniority,ExperienceEntries,EducationEntries
0,,johndoe@abc.com,555-555-5555,,excel,1,0.0,Mid,0,0
