# M2 Resume Extractor Training (Colab + T4 GPU)

Fine-tunes **yashpwr/resume-ner-bert-v2** for BIO NER on resumes with 14 entity types (29 labels).
Training runs in two phases: layers 0-8 are frozen for 2 epochs, then all layers are unfrozen for 6 epochs.

## Setup
1. Upload `m2_training_data.zip` (~276MB) to your Google Drive
2. Run all cells
3. Download the trained model from the output

In [None]:
# Cell 1: Install dependencies
# `-q` keeps pip's output quiet. transformers, datasets, seqeval and pandas are
# imported by later cells; the remaining packages are presumably needed
# indirectly (e.g. pyarrow for parquet I/O) -- TODO confirm.
!pip install -q transformers datasets seqeval accelerate pandas pyarrow pyyaml

In [None]:
# Cell 2: Check GPU
import torch

# Report whether CUDA is usable and, if it is, the name and total memory of device 0.
has_gpu = torch.cuda.is_available()
print(f"GPU available: {has_gpu}")
if has_gpu:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {device_props.total_memory / 1e9:.1f} GB")

In [None]:
# Cell 3: Mount Google Drive and extract training data
from google.colab import drive
drive.mount('/content/drive')

import os

# Look for the zip in common Drive locations
zip_candidates = [
    '/content/drive/MyDrive/m2_training_data.zip',
    '/content/drive/MyDrive/Colab/m2_training_data.zip',
    '/content/drive/MyDrive/training/m2_training_data.zip',
]
zip_path = None
for p in zip_candidates:
    if os.path.exists(p):
        zip_path = p
        break

if zip_path is None:
    # Fallback: search Drive root for the file
    for root, dirs, files in os.walk('/content/drive/MyDrive/'):
        if 'm2_training_data.zip' in files:
            zip_path = os.path.join(root, 'm2_training_data.zip')
            break
        # Don't recurse too deep
        if root.count(os.sep) > 5:
            break

if zip_path:
    print(f"Found zip at: {zip_path}")
    !unzip -o "{zip_path}" -d /content/m2_data
else:
    print("ERROR: m2_training_data.zip not found in Google Drive!")
    print("Please upload m2_training_data.zip to your Google Drive root folder.")
    print("Then re-run this cell.")

In [None]:
# Cell 4: Data preparation - load and unify all 5 resume NER datasets
import json
import logging
import random
import re
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, Features, Sequence, Value

logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory the dataset loaders read from.
DATA_DIR = Path("/content") / "m2_data"

# --- Entity types and BIO labels ---
ENTITY_TYPES = [
    "NAME",
    "EMAIL",
    "PHONE",
    "LOCATION",
    "DESIGNATION",
    "COMPANY",
    "DEGREE",
    "GRADUATION_YEAR",
    "COLLEGE_NAME",
    "YEARS_OF_EXPERIENCE",
    "SKILLS",
    "CERTIFICATION",
    "PROJECT_NAME",
    "PROJECT_TECHNOLOGY",
]

# "O" first, then a B-/I- pair per entity type, in ENTITY_TYPES order.
LABELS = ["O"] + [
    f"{prefix}-{entity}" for entity in ENTITY_TYPES for prefix in ("B", "I")
]

ID2LABEL = dict(enumerate(LABELS))
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}

print(f"Entity types: {len(ENTITY_TYPES)}")
print(f"Total labels (BIO): {len(LABELS)}")

# --- Label normalization map ---
# Raw label spellings seen in the source datasets, grouped by the canonical
# entity type they map to.
_ALIASES_BY_TYPE = {
    "NAME": ("Name", "name"),
    "EMAIL": ("EMAIL", "Email Address", "email"),
    "PHONE": ("Phone", "phone", "PHONE"),
    "LOCATION": ("Location", "location", "LOCATION"),
    "DESIGNATION": ("Designation", "designation", "DESIGNATION"),
    "COMPANY": ("Companies worked at", "Company", "company", "COMPANY"),
    "DEGREE": ("Degree", "degree", "DEGREE"),
    "GRADUATION_YEAR": ("Graduation Year", "graduation_year"),
    "COLLEGE_NAME": ("College Name", "college_name", "COLLEGE"),
    "YEARS_OF_EXPERIENCE": ("Years of Experience", "years_of_experience", "Experience"),
    "SKILLS": ("Skills", "skills", "SKILLS"),
    "CERTIFICATION": ("Certification", "certification", "CERTIFICATION"),
    "PROJECT_NAME": ("Project", "project", "PROJECT"),
    "PROJECT_TECHNOLOGY": ("Technology", "technology"),
}

# Flattened alias -> canonical lookup used by _normalize_label.
_LABEL_NORMALIZE = {
    alias: canonical
    for canonical, aliases in _ALIASES_BY_TYPE.items()
    for alias in aliases
}


def _normalize_label(raw_label):
    """Map a dataset-specific label to a canonical entity type.

    The label is first looked up verbatim in ``_LABEL_NORMALIZE``. If that
    misses, it is still accepted when its stripped, upper-cased form (spaces
    turned into underscores) is itself one of ``ENTITY_TYPES`` -- the alias
    table does not list every canonical name (e.g. "NAME", "GRADUATION_YEAR").

    Args:
        raw_label: Label as found in a source dataset.

    Returns:
        The canonical entity type, or ``None`` when the label is unknown or is
        not a string.
    """
    if not isinstance(raw_label, str):
        # Non-string labels (None, lists, ...) can never name an entity type.
        return None
    mapped = _LABEL_NORMALIZE.get(raw_label)
    if mapped is not None:
        return mapped
    canonical = raw_label.strip().upper().replace(" ", "_")
    return canonical if canonical in ENTITY_TYPES else None


def _tokenize_with_offsets(text):
    """Split ``text`` on whitespace and report where each token sits.

    Returns:
        A ``(tokens, offsets)`` pair of equal-length lists, where
        ``offsets[i]`` is the ``(start, end)`` character range (end exclusive)
        of ``tokens[i]`` inside ``text``.
    """
    matches = list(re.finditer(r"\S+", text))
    tokens = [match.group() for match in matches]
    offsets = [match.span() for match in matches]
    return tokens, offsets


def _bio_tags_from_char_spans(tokens, char_offsets, spans):
    """Project character-level entity spans onto tokens as BIO tags.

    Args:
        tokens: Token strings (only their count is used).
        char_offsets: ``(start, end)`` character range of each token.
        spans: Dicts with ``start``, ``end`` (exclusive) and ``label`` keys.

    Returns:
        One tag string per token. Every token that overlaps a span is tagged;
        the first tagged token of a span gets ``B-<label>``, the rest
        ``I-<label>``. Tags absent from ``LABEL2ID`` are not written, and
        later spans overwrite earlier ones on shared tokens.
    """
    tags = ["O" for _ in tokens]
    for span in spans:
        span_lo = span["start"]
        span_hi = span["end"]
        label = span["label"]
        prefix = "B"
        for position, (tok_lo, tok_hi) in enumerate(char_offsets):
            overlaps = tok_hi > span_lo and tok_lo < span_hi
            if not overlaps:
                continue
            candidate = f"{prefix}-{label}"
            if candidate not in LABEL2ID:
                continue
            tags[position] = candidate
            # Only switch to I- once a B- tag has actually been written.
            prefix = "I"
    return tags


def _label_to_id(tags):
    """Convert BIO tag strings to integer ids via ``LABEL2ID``.

    Tags missing from ``LABEL2ID`` fall back to id 0.
    """
    fallback_id = 0
    lookup = LABEL2ID.get
    return [lookup(tag, fallback_id) for tag in tags]


# --- Dataset Loaders ---

def load_yashpwr():
    """Load pre-tokenized, pre-tagged sequences from the yashpwr parquet file.

    Reads ``DATA_DIR/yashpwr_resume_ner/train.parquet`` and expects ``tokens``
    and ``ner_tags`` columns holding equal-length sequences per row.

    Returns:
        A list of ``{"tokens", "ner_tags", "source"}`` dicts with
        ``source == "yashpwr"``. Empty when the file or either column is
        missing. Rows whose cells are not sequences, or whose lengths differ,
        are dropped.
    """
    parquet_path = DATA_DIR / "yashpwr_resume_ner" / "train.parquet"
    if not parquet_path.exists():
        logger.warning("yashpwr not found -- skipping.")
        return []
    df = pd.read_parquet(parquet_path)
    if "tokens" not in df.columns or "ner_tags" not in df.columns:
        logger.warning(
            "yashpwr: no tokens/ner_tags columns (columns: %s) -- skipping.",
            df.columns.tolist(),
        )
        return []

    # pandas returns list-typed parquet columns as numpy arrays, so checking
    # for `list` alone would reject every row.
    sequence_types = (list, tuple, np.ndarray)
    records = []
    for tokens, raw_tags in zip(df["tokens"], df["ner_tags"]):
        if not isinstance(tokens, sequence_types) or not isinstance(raw_tags, sequence_types):
            continue
        if len(tokens) != len(raw_tags):
            continue
        # NOTE(review): the integer tag ids are passed through unchanged, which
        # assumes they follow this notebook's LABEL2ID numbering -- confirm.
        records.append({
            "tokens": list(tokens),
            "ner_tags": [int(t) for t in raw_tags],
            "source": "yashpwr",
        })
    logger.info("yashpwr: %d sequences", len(records))
    return records


def load_dataturks():
    """Load the DataTurks resume annotations (one JSON object per line).

    Each line's ``content`` is whitespace-tokenized, its ``annotation`` points
    are converted to character spans (``end`` is bumped by one to make it
    exclusive), projected to BIO tags, and cut into chunks of at most 128
    tokens.

    Returns:
        A list of ``{"tokens", "ner_tags", "source"}`` dicts with
        ``source == "dataturks"``; empty when the file is missing.
    """
    source_file = DATA_DIR / "dataturks_resume_ner" / "Entity Recognition in Resumes.json"
    if not source_file.exists():
        logger.warning("DataTurks not found -- skipping.")
        return []

    chunk_size = 128
    records = []
    with open(source_file, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            try:
                entry = json.loads(payload)
            except json.JSONDecodeError:
                # Unparseable lines are skipped rather than aborting the load.
                continue

            content = entry.get("content", "")
            annotations = entry.get("annotation", [])
            if not (content and annotations):
                continue

            tokens, offsets = _tokenize_with_offsets(content)
            if len(tokens) < 3:
                continue

            spans = []
            for ann in annotations:
                raw_labels = ann.get("label", [])
                if isinstance(raw_labels, str):
                    raw_labels = [raw_labels]
                points = ann.get("points", [])
                if not points:
                    continue
                for raw_label in raw_labels:
                    norm = _normalize_label(raw_label)
                    if norm is None:
                        continue
                    spans.extend(
                        {"start": pt.get("start"), "end": pt.get("end") + 1, "label": norm}
                        for pt in points
                        if pt.get("start") is not None and pt.get("end") is not None
                    )

            tags = _bio_tags_from_char_spans(tokens, offsets, spans)
            for offset in range(0, len(tokens), chunk_size):
                window = slice(offset, offset + chunk_size)
                chunk_tokens = tokens[window]
                if len(chunk_tokens) < 3:
                    continue
                records.append({
                    "tokens": chunk_tokens,
                    "ner_tags": _label_to_id(tags[window]),
                    "source": "dataturks",
                })
    logger.info("DataTurks: %d sequences", len(records))
    return records


def load_mehyaar():
    """Load the Mehyaar annotated-CV JSON files.

    Every ``*.json`` file under the dataset directory is read; its text is
    whitespace-tokenized and its annotations (dicts with label/start/end style
    keys, or ``[start, end, label]`` sequences) are projected to BIO tags and
    cut into chunks of at most 128 tokens.

    Malformed files or annotations (non-dict top level, non-string text or
    label, non-numeric offsets) are skipped so that one bad entry does not
    abort the whole dataset.

    Returns:
        A list of ``{"tokens", "ner_tags", "source"}`` dicts with
        ``source == "mehyaar"``; empty when the directory is missing.
    """
    base_dir = DATA_DIR / "mehyaar_ner_cvs" / "ResumesJsonAnnotated" / "ResumesJsonAnnotated"
    if not base_dir.exists():
        logger.warning("Mehyaar not found -- skipping.")
        return []

    # Built once: labels that already are (case-insensitively) an entity type.
    known_types = {e.upper() for e in ENTITY_TYPES}
    records = []
    for jf in sorted(base_dir.glob("*.json")):
        try:
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            continue
        if not isinstance(data, dict):
            continue
        text = data.get("text", data.get("content", ""))
        annotations = data.get("annotations", data.get("annotation", []))
        if not isinstance(text, str) or not text or not annotations:
            continue
        tokens, offsets = _tokenize_with_offsets(text)
        if len(tokens) < 3:
            continue

        spans = []
        for ann in annotations:
            if isinstance(ann, dict):
                label = ann.get("label", ann.get("type", ""))
                start = ann.get("start", ann.get("startOffset"))
                end = ann.get("end", ann.get("endOffset"))
            elif isinstance(ann, (list, tuple)) and len(ann) >= 3:
                start, end, label = ann[0], ann[1], ann[2]
            else:
                continue
            if start is None or end is None or not label:
                continue
            if not isinstance(label, str):
                continue
            norm = _normalize_label(label)
            if norm is None:
                upper = label.upper()
                if upper not in known_types:
                    continue
                norm = upper
            try:
                span = {"start": int(start), "end": int(end), "label": norm}
            except (TypeError, ValueError):
                # Offsets that are not numeric cannot be placed in the text.
                continue
            spans.append(span)

        tags = _bio_tags_from_char_spans(tokens, offsets, spans)
        for i in range(0, len(tokens), 128):
            ct, ctags = tokens[i:i + 128], tags[i:i + 128]
            if len(ct) >= 3:
                records.append({"tokens": ct, "ner_tags": _label_to_id(ctags), "source": "mehyaar"})
    logger.info("Mehyaar: %d sequences", len(records))
    return records


def _datasetmaster_string_items(val):
    """Return the non-blank strings in a cell holding a string or a list-like of strings."""
    if isinstance(val, str):
        return [val] if val.strip() else []
    # pandas returns list-typed parquet columns as numpy arrays, not Python
    # lists, so accept any of the common sequence containers.
    if isinstance(val, (list, tuple, np.ndarray)):
        return [item for item in val if isinstance(item, str) and item.strip()]
    return []


def load_datasetmaster():
    """Load the DatasetMaster resumes parquet file as weakly labeled NER data.

    Two modes, depending on the columns present:

    * No free-text column: sequences are synthesized by concatenating the
      ``skills`` / ``education`` / ``projects`` values, each value tagged
      ``B-``/``I-`` with its field's label.
    * A free-text column exists: the first 2000 characters are tokenized and
      the structured column values (up to 10 per column, 100 characters each)
      are searched for case-insensitively; the first hit of each becomes a
      span. The result is cut into chunks of at most 128 tokens.

    Returns:
        A list of ``{"tokens", "ner_tags", "source"}`` dicts with
        ``source == "datasetmaster"``; empty when the file is missing.
    """
    parquet_path = DATA_DIR / "datasetmaster_resumes" / "train.parquet"
    if not parquet_path.exists():
        logger.warning("DatasetMaster not found -- skipping.")
        return []
    df = pd.read_parquet(parquet_path)
    records = []
    text_col = next(
        (c for c in ["resume_text", "text", "content", "resume", "Resume"] if c in df.columns),
        None,
    )

    if text_col is None:
        # Synthesize from structured fields
        structured_fields = [("skills", "SKILLS"), ("education", "DEGREE"), ("projects", "PROJECT_NAME")]
        for _, row in df.iterrows():
            tokens_all, tags_all = [], []
            for col, label in structured_fields:
                for item in _datasetmaster_string_items(row.get(col)):
                    toks, _offsets = _tokenize_with_offsets(item)
                    tokens_all.extend(toks)
                    tags_all.extend([f"B-{label}"] + [f"I-{label}"] * (len(toks) - 1))
            if len(tokens_all) >= 5:
                records.append({"tokens": tokens_all, "ner_tags": _label_to_id(tags_all), "source": "datasetmaster"})
        logger.info("DatasetMaster (structured): %d sequences", len(records))
        return records

    # Weak supervision from structured columns onto text
    field_label_map = {
        "skills": "SKILLS", "education": "DEGREE", "company": "COMPANY",
        "designation": "DESIGNATION", "college": "COLLEGE_NAME", "degree": "DEGREE",
        "projects": "PROJECT_NAME", "certification": "CERTIFICATION", "certifications": "CERTIFICATION",
    }
    for _, row in df.iterrows():
        text = row[text_col]
        if not isinstance(text, str) or len(text) < 30:
            continue
        text = text[:2000]
        tokens, offsets = _tokenize_with_offsets(text)
        if len(tokens) < 5:
            continue
        spans = []
        for col, label in field_label_map.items():
            search_terms = [item.strip() for item in _datasetmaster_string_items(row.get(col))]
            for term in search_terms[:10]:
                # NOTE(review): this is a plain substring match, so a short
                # term can hit inside a longer word and tag that whole token.
                m = re.search(re.escape(term[:100]), text, re.IGNORECASE)
                if m is not None:
                    spans.append({"start": m.start(), "end": m.end(), "label": label})
        tags = _bio_tags_from_char_spans(tokens, offsets, spans)
        for i in range(0, len(tokens), 128):
            ct, ctags = tokens[i:i + 128], tags[i:i + 128]
            if len(ct) >= 3:
                records.append({"tokens": ct, "ner_tags": _label_to_id(ctags), "source": "datasetmaster"})
    logger.info("DatasetMaster: %d sequences", len(records))
    return records


def load_djinni():
    """Load the Djinni candidates parquet file as weakly labeled NER data.

    At most 15000 rows are used (sampled with a fixed seed). For each row the
    first 1500 characters of the text column are tokenized, then tagged with:

    * ``YEARS_OF_EXPERIENCE`` for every "<n> year(s)" match,
    * ``SKILLS`` for the first case-insensitive hit of each of up to 15 skills,
    * ``DESIGNATION`` for the first case-insensitive hit of the position value.

    Rows with no tagged token are dropped; the rest are cut into chunks of at
    most 128 tokens.

    Returns:
        A list of ``{"tokens", "ner_tags", "source"}`` dicts with
        ``source == "djinni"``; empty when the file or a text column is missing.
    """
    parquet_path = DATA_DIR / "djinni_candidates" / "train.parquet"
    if not parquet_path.exists():
        logger.warning("Djinni not found -- skipping.")
        return []
    df = pd.read_parquet(parquet_path)
    if len(df) > 15000:
        df = df.sample(n=15000, random_state=42)

    def first_present(candidates):
        """Return the first candidate that is a column of ``df``, else None."""
        return next((c for c in candidates if c in df.columns), None)

    text_col = first_present(["description", "text", "bio", "summary", "content", "experience"])
    skill_col = first_present(["skills", "keywords", "technologies"])
    position_col = first_present(["position", "title", "designation", "job_title"])
    if text_col is None:
        logger.warning("Djinni: no text column found. Columns: %s", df.columns.tolist())
        return []

    records = []
    exp_pattern = re.compile(r"\b(\d+)\+?\s*years?\b", re.IGNORECASE)
    for _, row in df.iterrows():
        text = row.get(text_col, "")
        if not isinstance(text, str) or len(text) < 20:
            continue
        text = text[:1500]
        tokens, offsets = _tokenize_with_offsets(text)
        if len(tokens) < 5:
            continue

        spans = [
            {"start": m.start(), "end": m.end(), "label": "YEARS_OF_EXPERIENCE"}
            for m in exp_pattern.finditer(text)
        ]

        if skill_col:
            skills_val = row.get(skill_col, "")
            if isinstance(skills_val, str):
                skill_list = [s.strip() for s in skills_val.split(",") if s.strip()]
            elif isinstance(skills_val, (list, tuple, np.ndarray)):
                # pandas returns list-typed parquet columns as numpy arrays.
                skill_list = [str(s).strip() for s in skills_val if s and str(s).strip()]
            else:
                skill_list = []
            for skill in skill_list[:15]:
                # NOTE(review): plain substring match -- a short skill name can
                # hit inside a longer word and tag that whole token.
                m = re.search(re.escape(skill), text, re.IGNORECASE)
                if m is not None:
                    spans.append({"start": m.start(), "end": m.end(), "label": "SKILLS"})

        if position_col:
            pos_val = row.get(position_col, "")
            if isinstance(pos_val, str) and pos_val.strip():
                m = re.search(re.escape(pos_val.strip()), text, re.IGNORECASE)
                if m is not None:
                    spans.append({"start": m.start(), "end": m.end(), "label": "DESIGNATION"})

        tags = _bio_tags_from_char_spans(tokens, offsets, spans)
        if any(t != "O" for t in tags):
            for i in range(0, len(tokens), 128):
                ct, ctags = tokens[i:i + 128], tags[i:i + 128]
                if len(ct) >= 3:
                    records.append({"tokens": ct, "ner_tags": _label_to_id(ctags), "source": "djinni"})
    logger.info("Djinni: %d sequences", len(records))
    return records


# --- Load all datasets ---
# Display name -> loader, in the order the datasets are loaded and reported.
_DATASET_LOADERS = {
    "yashpwr": load_yashpwr,
    "DataTurks": load_dataturks,
    "Mehyaar": load_mehyaar,
    "DatasetMaster": load_datasetmaster,
    "Djinni": load_djinni,
}

print("Loading all 5 datasets...")
all_records = []
for dataset_name, load_fn in _DATASET_LOADERS.items():
    # A failing loader is reported and skipped so the others still contribute.
    try:
        loaded = load_fn()
        all_records.extend(loaded)
        print(f"  {dataset_name}: {len(loaded)} sequences (total: {len(all_records)})")
    except Exception as err:
        print(f"  {dataset_name}: FAILED - {err}")

# Remove internal flags
for record in all_records:
    record.pop("_raw_tags", None)

print(f"\nTotal unified: {len(all_records)} sequences")

## Cell 4.5: yashpwr Weak Supervision NER

**Problem:** The `yashpwr/resume-ner-training-data` dataset was expected to have BIO-tagged NER
columns (`tokens`, `ner_tags`), but actually contains **chat/instruction data** with a `messages`
column in system/user/assistant format:
- **system**: "You are an expert resume assistant..."
- **user**: "Please summarize the following resume:\n\n`[FULL RESUME TEXT]`"
- **assistant**: "`[SUMMARY]`"

**Solution:** We extract the full resume text from the user message and apply **weak supervision**
(regex + section parsing) to automatically generate BIO tags. This is noisier than gold-standard
annotations but adds ~20K+ sequences covering entity types that other datasets may lack.

**How it works:**
1. Extract resume text from the `user` message (everything after the instruction prompt)
2. Detect section headers: `Skills`, `Education`, `Professional Experience`, etc.
3. Use regex patterns to tag entities within each section:
   - **SKILLS**: Comma/newline-separated items in Skills sections
   - **DESIGNATION**: Job titles at the start of experience blocks (e.g. "General Manager")
   - **COMPANY**: Text following "Company Name" patterns
   - **LOCATION**: "City , State" patterns
   - **DEGREE**: Education qualifiers like "BS", "MBA", "Master"
   - **COLLEGE_NAME**: Institution names in Education sections
   - **GRADUATION_YEAR**: 4-digit years in Education sections
   - **YEARS_OF_EXPERIENCE**: "X+ years" patterns anywhere
   - **EMAIL**: Standard email regex
   - **PHONE**: Phone number patterns

**Label quality:** Weakly supervised labels are noisy (roughly 70-80% accurate). Mixing them with
gold-standard data (Mehyaar, DataTurks) still improves model coverage of underrepresented entity types.

In [None]:
# Cell 4.5: yashpwr Weak Supervision - extract NER from chat/instruction format
#
# The yashpwr dataset has 22,855 resumes in chat format (messages column).
# We extract resume text from the "user" message and apply regex-based
# weak supervision to generate BIO tags automatically.

# --- Regex patterns for entity detection ---

# Section headers that help us know what context we're in
_WS_SECTION_PATTERNS = {
    "skills": re.compile(
        r"^(Skills|Technical Skills|Core Competencies|Areas of Expertise|"
        r"Skill Highlights|Additional Skills|Computer Skills|Software Skills)\b",
        re.IGNORECASE | re.MULTILINE
    ),
    "education": re.compile(
        r"^(Education|Academic Background|Educational Background|Qualifications)\b",
        re.IGNORECASE | re.MULTILINE
    ),
    "experience": re.compile(
        r"^(Professional Experience|Work Experience|Experience|Employment|"
        r"Employment History|Work History|Professional Background|Career History)\b",
        re.IGNORECASE | re.MULTILINE
    ),
    "certifications": re.compile(
        r"^(Certifications?|Licenses?|Professional Certifications?|"
        r"Licenses? and Certifications?)\b",
        re.IGNORECASE | re.MULTILINE
    ),
}

# Entity-level patterns.
# The TLD class is [A-Za-z]; a `|` inside a character class is a literal pipe,
# not an alternation, so `[A-Z|a-z]` would let "|" be part of an address.
_WS_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
_WS_PHONE_RE = re.compile(r"(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}")
_WS_YEARS_EXP_RE = re.compile(r"\b(\d{1,2})\+?\s*(?:years?|yrs?)\s*(?:of\s+)?(?:experience)?\b", re.IGNORECASE)
_WS_YEAR_RE = re.compile(r"\b(19[89]\d|20[0-2]\d)\b")
_WS_LOCATION_RE = re.compile(r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)\s*,\s*([A-Z]{2})\b")
_WS_COMPANY_RE = re.compile(r"Company\s+Name\s+(.+?)(?:\s+City\s*,|\s*$)", re.IGNORECASE | re.MULTILINE)

# Degree patterns
_WS_DEGREE_RE = re.compile(
    r"\b(Ph\.?D\.?|M\.?D\.?|J\.?D\.?|M\.?B\.?A\.?|M\.?S\.?|B\.?S\.?|B\.?A\.?|M\.?A\.?|"
    r"Bachelor(?:'?s)?(?:\s+of\s+\w+)?|Master(?:'?s)?(?:\s+of\s+\w+)?|"
    r"Associate(?:'?s)?(?:\s+of\s+\w+)?|Doctorate|Doctor of)\b",
    re.IGNORECASE
)

# Common job title patterns (for DESIGNATION)
_WS_TITLE_RE = re.compile(
    r"\b((?:Senior|Junior|Lead|Chief|Head|Principal|Staff|Associate|Assistant|Executive|"
    r"Vice President|VP|Director|Manager|Coordinator|Specialist|Analyst|Engineer|"
    r"Developer|Designer|Consultant|Administrator|Supervisor|Officer|Architect|"
    r"Technician|Representative|Advisor|Strategist|Planner)"
    r"(?:\s+(?:of|for))?"
    r"(?:\s+\w+){0,3})"
    r"(?=\s+(?:January|February|March|April|May|June|July|August|September|October|November|December|\d{4}|Company|$))",
    re.IGNORECASE | re.MULTILINE
)

# Institution names inside an Education section.
_WS_COLLEGE_RE = re.compile(
    r"((?:University|College|Institute|School|Academy|"
    r"Polytechnic|Conservatory)(?:\s+of)?\s+[\w\s]+?)(?:\s*[-,\n]|$)",
    re.IGNORECASE
)

# Separators between items of a Skills section.
_WS_SKILL_SPLIT_RE = re.compile(r"[,;|]|\band\b")

# Instruction-prompt endings, tried in order; the resume is whatever follows.
_WS_PROMPT_SEPARATORS = (
    "following resume:\n\n",
    "following resume:\n",
    "this resume:\n\n",
    "this resume:\n",
    "resume:\n\n",
    "resume:\n",
)


def _ws_extract_resume_text(messages):
    """Return the resume text carried by the first ``user`` message, or ``""``."""
    for msg in messages:
        if not (isinstance(msg, dict) and msg.get("role") == "user"):
            continue
        content = msg.get("content", "")
        if not isinstance(content, str):
            # A missing/non-text body has no resume to extract.
            return ""
        resume_text = ""
        for separator in _WS_PROMPT_SEPARATORS:
            if separator in content:
                # Remove the instruction prompt, keep just the resume
                resume_text = content.split(separator, 1)[1]
                break
        if not resume_text and len(content) > 200:
            # If no separator found, use the whole content if it's long enough
            resume_text = content
        # Only the first user message is considered.
        return resume_text
    return ""


def _ws_collect_spans(resume_text):
    """Return every weakly detected entity span in ``resume_text`` (may overlap)."""
    spans = []

    def add(start, end, label):
        spans.append({"start": start, "end": end, "label": label})

    # --- 1-4. Email, phone, years of experience, location (City, ST) ---
    whole_match_patterns = (
        (_WS_EMAIL_RE, "EMAIL"),
        (_WS_PHONE_RE, "PHONE"),
        (_WS_YEARS_EXP_RE, "YEARS_OF_EXPERIENCE"),
        (_WS_LOCATION_RE, "LOCATION"),
    )
    for pattern, label in whole_match_patterns:
        for m in pattern.finditer(resume_text):
            add(m.start(), m.end(), label)

    # --- 5. Company Name patterns ---
    for m in _WS_COMPANY_RE.finditer(resume_text):
        company = m.group(1).strip()
        if len(company) > 2:
            add(m.start(1), m.start(1) + len(company), "COMPANY")

    # --- 6. Degrees ---
    for m in _WS_DEGREE_RE.finditer(resume_text):
        add(m.start(), m.end(), "DEGREE")

    # --- 7. Section-aware tagging ---
    # Each header starts a section that runs until the next header (or the end).
    sections = sorted(
        (
            (m.start(), sec_name)
            for sec_name, sec_re in _WS_SECTION_PATTERNS.items()
            for m in sec_re.finditer(resume_text)
        ),
        key=lambda pair: pair[0],
    )
    for i, (sec_start, sec_name) in enumerate(sections):
        sec_end = sections[i + 1][0] if i + 1 < len(sections) else len(resume_text)
        sec_text = resume_text[sec_start:sec_end]

        if sec_name == "skills":
            # Skip the header line, then split the rest into individual items.
            body = " ".join(sec_text.split("\n")[1:]).strip()
            for item in _WS_SKILL_SPLIT_RE.split(body):
                item = item.strip().strip(".")
                if 2 < len(item) < 50 and not item[0].isdigit():
                    # Tag the first literal occurrence inside this section.
                    found = sec_text.find(item)
                    if found != -1:
                        add(sec_start + found, sec_start + found + len(item), "SKILLS")

        elif sec_name == "education":
            # Tag graduation years in education section
            for m in _WS_YEAR_RE.finditer(sec_text):
                add(sec_start + m.start(), sec_start + m.end(), "GRADUATION_YEAR")

            # Tag college names: text following a known institution keyword
            for m in _WS_COLLEGE_RE.finditer(sec_text):
                name = m.group(1).strip()
                if len(name) > 5:
                    begin = sec_start + m.start(1)
                    add(begin, begin + len(name), "COLLEGE_NAME")

        elif sec_name == "certifications":
            # Tag each non-empty line (up to 80 characters) as CERTIFICATION.
            # Offsets are tracked line by line so the span always lands on the
            # line inside this section, even when the same text also appears
            # earlier in the resume.
            line_start = sec_start
            for line_no, raw_line in enumerate(sec_text.split("\n")):
                this_start = line_start
                line_start += len(raw_line) + 1  # +1 for the "\n" that split removed
                if line_no == 0:
                    continue  # header line
                line = raw_line.strip()
                if len(line) > 5 and not re.match(r"^\d{4}", line):
                    begin = this_start + (len(raw_line) - len(raw_line.lstrip()))
                    add(begin, begin + len(line[:80]), "CERTIFICATION")

    # --- 8. Job titles (DESIGNATION) near date patterns ---
    for m in _WS_TITLE_RE.finditer(resume_text):
        title = m.group(1).strip()
        if len(title) > 3:
            add(m.start(1), m.start(1) + len(title), "DESIGNATION")

    return spans


def _ws_drop_overlapping_spans(spans):
    """Keep a non-overlapping subset of ``spans``: earliest start first, longest on ties."""
    ordered = sorted(spans, key=lambda s: (s["start"], -(s["end"] - s["start"])))
    kept = []
    last_end = -1
    for span in ordered:
        if span["start"] >= last_end:
            kept.append(span)
            last_end = span["end"]
    return kept


def load_yashpwr_weak_supervision():
    """Parse yashpwr chat messages into weakly-labeled NER sequences.

    Reads ``DATA_DIR/yashpwr_resume_ner/train.parquet`` and, for each row's
    ``messages``, extracts the resume text from the first user message,
    truncates it to 3000 characters, detects entity spans with regexes and
    section heuristics, drops overlapping spans, and converts the result to
    BIO tags in chunks of at most 128 tokens. Chunks without any entity tag
    are discarded. A summary is printed.

    Returns:
        A list of ``{"tokens", "ner_tags", "source"}`` dicts with
        ``source == "yashpwr_weak"``; empty when the file or the ``messages``
        column is missing.
    """
    parquet_path = DATA_DIR / "yashpwr_resume_ner" / "train.parquet"
    if not parquet_path.exists():
        print("yashpwr parquet not found -- skipping weak supervision.")
        return []

    df = pd.read_parquet(parquet_path)
    if "messages" not in df.columns:
        print(f"yashpwr has no 'messages' column (cols: {df.columns.tolist()}) -- skipping.")
        return []

    print(f"yashpwr: {len(df)} rows with chat messages. Applying weak supervision...")

    records = []
    skipped = 0

    for messages in df["messages"]:
        if not isinstance(messages, (list, np.ndarray)):
            skipped += 1
            continue

        resume_text = _ws_extract_resume_text(messages)
        if len(resume_text) < 50:
            skipped += 1
            continue

        # Truncate very long resumes
        resume_text = resume_text[:3000]
        tokens, offsets = _tokenize_with_offsets(resume_text)
        if len(tokens) < 10:
            skipped += 1
            continue

        # Only keep records that have at least some entity tags
        spans = _ws_collect_spans(resume_text)
        if not spans:
            skipped += 1
            continue

        tags = _bio_tags_from_char_spans(tokens, offsets, _ws_drop_overlapping_spans(spans))

        # Chunk into 128-token sequences
        for i in range(0, len(tokens), 128):
            ct = tokens[i:i + 128]
            ctags = tags[i:i + 128]
            if len(ct) >= 5 and any(t != "O" for t in ctags):
                records.append({
                    "tokens": ct,
                    "ner_tags": _label_to_id(ctags),
                    "source": "yashpwr_weak",
                })

    # Summary stats: one count per B- tag, i.e. per entity mention.
    entity_counts = {}
    for rec in records:
        for tag_id in rec["ner_tags"]:
            label = ID2LABEL.get(tag_id, "O")
            if label.startswith("B-"):
                etype = label[2:]
                entity_counts[etype] = entity_counts.get(etype, 0) + 1

    print("\nyashpwr weak supervision results:")
    print(f"  Processed: {len(df) - skipped} resumes (skipped {skipped})")
    print(f"  Generated: {len(records)} sequences")
    print("  Entity counts (B- tags):")
    for etype, count in sorted(entity_counts.items(), key=lambda x: -x[1]):
        print(f"    {etype}: {count}")

    return records


# --- Run it and add to all_records ---
yashpwr_weak = load_yashpwr_weak_supervision()
# Drop weak records left over from an earlier run of this cell first, so that
# re-running it does not append the same sequences a second time.
all_records[:] = [rec for rec in all_records if rec.get("source") != "yashpwr_weak"]
all_records.extend(yashpwr_weak)
print(f"\nTotal after yashpwr weak supervision: {len(all_records)} sequences")

In [None]:
# Cell 5: Clean and split data

# Clean records
def _utf8_safe(text):
    """Round-trip ``text`` through UTF-8, replacing anything that cannot be encoded.

    Strips surrogate characters that break Arrow/UTF-8.
    """
    return text.encode("utf-8", errors="replace").decode("utf-8")


def _clean_record(rec):
    """Return a normalized copy of ``rec``, or None when it must be dropped."""
    tokens = [t if isinstance(t, str) else str(t) for t in rec.get("tokens", [])]
    tags = [t if isinstance(t, int) else int(t) for t in rec.get("ner_tags", [])]
    if not tokens or len(tokens) != len(tags):
        return None
    # Stringified missing values ("nan", "None") or empty tokens invalidate the record.
    if any(t in ("nan", "None", "") for t in tokens):
        return None
    return {
        "tokens": [_utf8_safe(t) for t in tokens],
        "ner_tags": tags,
        "source": _utf8_safe(str(rec.get("source", "unknown"))),
    }


cleaned = [rec for rec in map(_clean_record, all_records) if rec is not None]

print(f"Cleaned: {len(all_records)} -> {len(cleaned)} records (dropped {len(all_records) - len(cleaned)})")

# Split into train/val/test
TRAIN_RATIO = 0.8
VAL_RATIO = 0.1

# Fixed seed so the shuffle (and therefore the split) is reproducible.
rng = random.Random(42)
rng.shuffle(cleaned)

n = len(cleaned)
n_train = int(n * TRAIN_RATIO)
n_val = int(n * VAL_RATIO)
val_end = n_train + n_val

# Whatever remains after train and validation becomes the test split.
splits = {
    "train": cleaned[:n_train],
    "validation": cleaned[n_train:val_end],
    "test": cleaned[val_end:],
}

features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(Value("int32")),
    "source": Value("string"),
})


def _records_to_dataset(rows):
    """Build a column-oriented ``Dataset`` from a list of record dicts."""
    columns = {
        column: [row[column] for row in rows]
        for column in ("tokens", "ner_tags", "source")
    }
    return Dataset.from_dict(columns, features=features)


dd = DatasetDict()
for split_name, split_records in splits.items():
    dd[split_name] = _records_to_dataset(split_records)
    print(f"  {split_name}: {len(split_records)} examples")

train_ds = dd["train"]
val_ds = dd["validation"]
test_ds = dd["test"]
print(f"\nTrain: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")

In [None]:
# Cell 6: Tokenize and align labels
from transformers import AutoTokenizer

# Checkpoint identifier; the tokenizer below is loaded from it.
BASE_MODEL = "yashpwr/resume-ner-bert-v2"
# Sub-word length passed to the tokenizer as `max_length` in tokenize_and_align.
MAX_LENGTH = 512

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def tokenize_and_align(examples):
    """Tokenize a batch of pre-split words and project word-level tags onto sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per example)
            and "ner_tags" (one list of integer tag ids per example, indexed
            by word position).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per example, aligned with the sub-token positions. Only the
        first sub-token of each word carries that word's tag; positions with
        no word id (special / padding tokens, per the tokenizer's `word_ids`
        contract) and continuation sub-tokens get -100, the value that
        compute_metrics skips.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )

    batch_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_seen = None
        for wid in encoded.word_ids(batch_index=row):
            # A position is labelled only when it opens a new word.
            starts_new_word = wid is not None and wid != last_seen
            row_labels.append(word_tags[wid] if starts_new_word else -100)
            last_seen = wid
        batch_labels.append(row_labels)

    encoded["labels"] = batch_labels
    return encoded

def _tokenize_split(split_label, dataset):
    """Announce and tokenize one split, dropping the split's original columns."""
    print(f"Tokenizing {split_label}...")
    return dataset.map(
        tokenize_and_align,
        batched=True,
        remove_columns=dataset.column_names,
    )


train_tok = _tokenize_split("train", train_ds)
val_tok = _tokenize_split("val", val_ds)
test_tok = _tokenize_split("test", test_ds)
print(f"Tokenized: train={len(train_tok)}, val={len(val_tok)}, test={len(test_tok)}")

In [None]:
# Cell 7: Model setup + two-phase training
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import f1_score, precision_score, recall_score

# Label vocabulary from cell 4, re-exposed under the names the metrics code
# and the model config use (must stay identical to LABELS / ID2LABEL / LABEL2ID).
label_list, id2label, label2id = LABELS, ID2LABEL, LABEL2ID

# Load the checkpoint with a classifier head sized for our label set.
# ignore_mismatched_sizes lets loading proceed although the checkpoint's head
# has a different size (original author's note: 11 -> 14 entity types).
model_kwargs = dict(
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
model = AutoModelForTokenClassification.from_pretrained(BASE_MODEL, **model_kwargs)
print(f"Model loaded: {BASE_MODEL} with {len(label_list)} labels")


def compute_metrics(eval_pred):
    """Compute seqeval precision / recall / F1 for one evaluation pass.

    Args:
        eval_pred: a (predictions, labels) pair. Predictions are reduced with
            argmax over axis 2 (assumes a (batch, seq, num_labels) score
            array - TODO confirm); labels hold tag ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with "precision", "recall" and "f1", computed over the tag
        strings looked up in `label_list`.
    """
    scores, gold_ids = eval_pred
    pred_ids = np.argmax(scores, axis=2)

    gold_tags, pred_tags = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(g, p) for p, g in zip(pred_row, gold_row) if g != -100]
        gold_tags.append([label_list[g] for g, _ in kept])
        pred_tags.append([label_list[p] for _, p in kept])

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(gold_tags, pred_tags) for name, scorer in scorers}


def freeze_bert_layers(model, layer_indices):
    """Turn off gradient tracking for the selected encoder layers.

    Args:
        model: object exposing `bert.encoder.layer`, an indexable collection
            whose items provide `parameters()`.
        layer_indices: iterable of positions into `bert.encoder.layer`.

    Only the listed layers are touched; every other parameter keeps its
    current `requires_grad` value.
    """
    for layer_no in layer_indices:
        encoder_block = model.bert.encoder.layer[layer_no]
        for weight in encoder_block.parameters():
            weight.requires_grad = False
    print(f"Froze BERT layers: {layer_indices}")


def unfreeze_all(model):
    """Re-enable gradient tracking on every parameter yielded by `model.parameters()`."""
    for weight in model.parameters():
        weight.requires_grad = True
    print("Unfroze all layers")


# --- Config ---
import torch  # also imported in Cell 2; repeated because USE_FP16 below depends on it

OUTPUT_DIR = "/content/m2_resume_extractor"
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
FREEZE_LAYERS = [0, 1, 2, 3, 4, 5, 6, 7, 8]
FREEZE_EPOCHS = 2
TOTAL_EPOCHS = 8
# Mixed precision is only requested when a CUDA device is present. With a
# hard-coded fp16=True, TrainingArguments fails on a CPU-only runtime (Cell 2
# merely prints GPU availability and does not stop the run). On a GPU runtime
# this evaluates to True, i.e. the same setting as before.
USE_FP16 = torch.cuda.is_available()

# Settings shared by both phases; each phase adds only what differs
# (output dir, learning rate, epoch count, checkpoint retention).
common_args = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    fp16=USE_FP16,
    logging_steps=50,
    report_to="none",
)

# ============================
# Phase 1: Frozen layers 0-8
# ============================
print(f"\n{'='*60}")
print(f"Phase 1: Frozen layers {FREEZE_LAYERS} for {FREEZE_EPOCHS} epochs")
print(f"{'='*60}")

freeze_bert_layers(model, FREEZE_LAYERS)

phase1_args = TrainingArguments(
    output_dir=OUTPUT_DIR + "/phase1",
    learning_rate=LEARNING_RATE,
    num_train_epochs=FREEZE_EPOCHS,
    save_total_limit=1,
    **common_args,
)

trainer = Trainer(
    model=model,
    args=phase1_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
)

trainer.train()
phase1_results = trainer.evaluate()
print(f"Phase 1 results: {phase1_results}")

# ============================
# Phase 2: All layers unfrozen
# ============================
remaining_epochs = TOTAL_EPOCHS - FREEZE_EPOCHS

print(f"\n{'='*60}")
print(f"Phase 2: All layers unfrozen for {remaining_epochs} epochs")
print(f"{'='*60}")

unfreeze_all(model)

phase2_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE * 0.5,  # lower LR for full fine-tuning
    num_train_epochs=remaining_epochs,
    # Keep the checkpoint with the best validation F1 and restore it when
    # training ends, so later cells evaluate and save the best model.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    **common_args,
)

# A new Trainer is built for phase 2; it continues from the same `model`
# object that phase 1 trained, now with all parameters trainable.
trainer = Trainer(
    model=model,
    args=phase2_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
)

trainer.train()
print("Training complete!")

In [None]:
# Cell 8: Evaluate on test set
print("Evaluating on test set...")
results = trainer.evaluate(test_tok)

# Metrics are read under their "eval_"-prefixed keys; a missing key prints as 0.
print("\nTest Results:")
report_rows = (
    ("  Precision: ", "eval_precision"),
    ("  Recall:    ", "eval_recall"),
    ("  F1:        ", "eval_f1"),
)
for prefix, metric_key in report_rows:
    print(f"{prefix}{results.get(metric_key, 0):.4f}")

# Compare the test F1 against the target and report the outcome.
TARGET_F1 = 0.90
test_f1 = results.get('eval_f1', 0)
verdict = "ACHIEVED" if test_f1 >= TARGET_F1 else "NOT MET"
print(f"\nTarget F1 {TARGET_F1:.2f} {verdict} (got {test_f1:.4f})")

In [None]:
# Cell 9: Save model and download
import shutil  # no longer used by this cell; kept so the notebook namespace is unchanged
import zipfile
from pathlib import Path

# Save final model + tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

# List saved files (top-level files only; sub-directories are skipped)
model_files = sorted(f for f in Path(OUTPUT_DIR).iterdir() if f.is_file())
for f in model_files:
    size_mb = f.stat().st_size / 1e6
    print(f"  {f.name}: {size_mb:.1f} MB")

# Zip for download.
# Only the top-level files listed above are archived. OUTPUT_DIR is also the
# phase-2 Trainer output directory and the parent of the phase-1 one
# (OUTPUT_DIR + "/phase1"), so archiving the whole tree would bundle the
# intermediate training checkpoints into the download.
ZIP_PATH = "/content/m2_resume_extractor_trained.zip"
with zipfile.ZipFile(ZIP_PATH, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    for f in model_files:
        # arcname keeps the files at the archive root, as before.
        archive.write(f, arcname=f.name)
print(f"\nZipped to {ZIP_PATH}")

# Download
from google.colab import files
files.download(ZIP_PATH)

---

## Appendix: Using yashpwr Data for Other Models

The yashpwr dataset (22,855 resumes in chat format) contains rich structured resume text that
can be reformatted for other models in the pipeline beyond M2. Below are concrete strategies
for each model that could benefit.

---

### M3 (Skills Comparator) - Skill Co-occurrence Pairs

**What M3 needs:** Triplets of (anchor_skill, positive_skill, negative_skill) for contrastive learning.

**How yashpwr helps:** Each resume's Skills section lists skills that co-occur in real professionals.
Skills listed together on the same resume are implicitly related (positive pairs), while skills
from very different resumes/domains are negatives.

**Data format to generate:**
```python
# Illustrative sketch, not runnable as-is: `yashpwr_resumes`,
# `extract_skills_section` and `random_skill_from_different_domain` are
# placeholders that are not defined in this notebook - TODO implement.
# Extract skills sections from yashpwr resumes
# For each resume, parse comma-separated skills
# Build co-occurrence: skills on same resume = positive pair
# Skills from different industry resumes = negative
triplets = []
for resume in yashpwr_resumes:
    skills = extract_skills_section(resume)  # ["Python", "SQL", "Machine Learning"]
    # Every ordered pair (i != j) is emitted, so a resume with n skills
    # contributes n * (n - 1) triplets, each with a newly drawn negative.
    for i, anchor in enumerate(skills):
        for j, positive in enumerate(skills):
            if i != j:
                negative = random_skill_from_different_domain()
                triplets.append({"anchor": anchor, "positive": positive, "negative": negative})
```

**Expected yield:** ~50K-100K skill triplets from real resume co-occurrence patterns.
This supplements ESCO/Tabiya synonym data with real-world usage patterns.

---

### M4 (Exp/Edu Comparator) - Resume Feature Extraction

**What M4 needs:** (resume_features, jd_features) pairs with experience/education scores.

**How yashpwr helps:** The assistant message contains a **professional summary** that describes
the candidate's experience level, education, and domain. Combined with the structured resume text,
we can extract:
- Years of experience (from the resume text)
- Education level and field (from Education section)
- Job titles and seniority progression (from Experience section)
- Domain/industry (from the summary + job titles)

**Data format to generate:**
```python
# Illustrative sketch, not runnable as-is: `yashpwr_resumes`, `noise`,
# `extract_structured_features`, `synthesize_matching_jd` and
# `synthesize_mismatching_jd` are placeholders that are not defined in this
# notebook - TODO implement (`noise` is presumably a small random jitter).
# For each resume:
# 1. Extract: years_exp, edu_level, job_titles, skills, domain
# 2. Generate synthetic JD requirements based on the resume's domain
# 3. Create pairs with varying match quality (exact match = 1.0, partial = 0.5, mismatch = 0.1)
# NOTE(review): the loop below only emits two pairs per resume, labelled
# about 0.85 (match) and about 0.2 (mismatch); no partial-match pair is
# generated and the values differ from the scale in step 3 - reconcile.
pairs = []
for resume in yashpwr_resumes:
    features = extract_structured_features(resume)
    # Positive pair: JD that matches this resume well
    jd_match = synthesize_matching_jd(features)
    pairs.append({"resume": features, "jd": jd_match, "label": 0.85 + noise})
    # Negative pair: JD from different domain
    jd_mismatch = synthesize_mismatching_jd(features)
    pairs.append({"resume": features, "jd": jd_mismatch, "label": 0.2 + noise})
```

**Expected yield:** ~45.7K pairs (2 per resume × 22,855 resumes) with realistic feature distributions.

---

### M5 (Judge) - Score Calibration Data

**What M5 needs:** Combined scores from M3+M4 mapped to overall match quality.

**How yashpwr helps:** The assistant summaries provide implicit quality signals.
Resumes with clear skill-job alignment in the summary suggest high match scores.
This could supplement M5's calibration data after M3/M4 are trained.

**Note:** M5 benefits most AFTER M3 and M4 are trained, as it needs their output scores
as input features. Save this for a second training pass.

---

### Implementation Priority

| Model | Effort | Impact | Priority |
|-------|--------|--------|----------|
| **M2** (done above) | Low | High - adds ~20K sequences | Already implemented |
| **M3** | Medium | Medium - adds skill co-occurrence | Second priority |
| **M4** | Medium | Low - already has good data | Nice to have |
| **M5** | High | Low - needs M3/M4 first | Future work |