# M3 Skills Comparator Training (Kaggle + P100 GPU)

Fine-tunes **TechWolf/JobBERT-v2** using contrastive learning
(MultipleNegativesRankingLoss) on skill triplets.

**Model:** Sentence-transformers with SentenceTransformerTrainer API
**Input:** (anchor_skill, positive_skill, negative_skill) triplets
**Output:** Skill embeddings where similar skills are close together

## Setup
1. Upload `m3_training_data.zip` as a Kaggle Dataset (Kaggle extracts the archive automatically)
2. Add the dataset to this notebook via the sidebar **Add Data** button
3. Select **GPU P100** in notebook settings (Settings > Accelerator)
4. Enable **Internet** (Settings > Internet > On)
5. Run all cells
6. Download the trained model from the output or via Kaggle Dataset API

## Data Sources (7 datasets)
| Dataset | Type | Expected Data |
|---------|------|---------------|
| TechWolf ESCO Sentences | Skill-sentence contrastive pairs | ~138K pairs |
| Tabiya ESCO | Skill synonyms / alt labels | ~14K skills |
| Nesta Skills Taxonomy | Skill clusters (143 clusters) | ~10K skills |
| StackLite | Stack Overflow tag co-occurrence (file may be a Git LFS placeholder) | optional |
| Related Job Skills | Skill relationship mapping | ~4.7M |
| Job Skill Set | Job posts with extracted skills | ~4.7M |
| Mind Tech Ontology | Tech skill ontology with synonyms | ~5.4M |

In [None]:
# Cell 1: Suppress TF/CUDA warnings + install dependencies
#
# Kaggle pre-loads both TensorFlow and PyTorch, causing duplicate CUDA factory
# registration messages (cuFFT, cuDNN, cuBLAS). These are cosmetic only.
# Setting TF_CPP_MIN_LOG_LEVEL=3 before any imports suppresses them.

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['GRPC_VERBOSITY'] = 'ERROR'
os.environ['ABSL_MIN_LOG_LEVEL'] = '2'

import warnings
warnings.filterwarnings('ignore', message='.*computation placer already registered.*')
warnings.filterwarnings('ignore', message='.*Unable to register.*factory.*')

!pip install -q sentence-transformers datasets pandas pyarrow

In [None]:
# Cell 2: Check GPU
import torch

# Report whether a CUDA device is visible and, if so, which one and how much
# memory it has. This cell only prints; it does not stop the notebook.
print(f"GPU available: {torch.cuda.is_available()}")
if not torch.cuda.is_available():
    print("WARNING: No GPU detected! Enable GPU in Settings > Accelerator.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Cell 3: Locate training data
#
# Kaggle auto-extracts uploaded zip files. When you upload m3_training_data.zip
# as a dataset, contents are extracted into /kaggle/input/<dataset-name>/.
# Kaggle converts underscores to hyphens, so "m3_training_data" becomes
# "/kaggle/input/m3-training-data/".

import os
from pathlib import Path

# Expected data subdirectories (one per data source)
TARGET_FOLDERS = {
    "techwolf_esco_sentences", "tabiya_esco", "nesta_skills_taxonomy",
    "stacklite", "related_job_skills", "job_skill_set", "mind_tech_ontology"
}

KAGGLE_INPUT = "/kaggle/input/"


def find_data_root(base, target_folders, max_depth=4):
    """Return the first directory under *base* that contains a target folder.

    Args:
        base: Directory to search (str or Path).
        target_folders: Collection of sub-directory names to look for.
        max_depth: Directories whose path has more than this many extra
            separators compared with *base* are not descended into.

    Returns:
        A ``Path`` to the directory holding at least one target folder, or
        ``None`` when nothing matches.
    """
    base = str(base)
    base_seps = base.count(os.sep)
    for root, dirs, _files in os.walk(base):
        if any(name in target_folders for name in dirs):
            return Path(root)
        if root.count(os.sep) - base_seps > max_depth:
            # Too deep: prune only this branch. Breaking out of the walk here
            # would also abandon sibling branches that have not been visited.
            dirs[:] = []
    return None


print("Available datasets in /kaggle/input/:")
if os.path.isdir(KAGGLE_INPUT):
    for d in os.listdir(KAGGLE_INPUT):
        print(f"  /kaggle/input/{d}/")

# Search for the directory containing our data folders
DATA_DIR = find_data_root(KAGGLE_INPUT, TARGET_FOLDERS)

if DATA_DIR:
    print(f"\nData root found: {DATA_DIR}")
    print("Contents:")
    for item in sorted(os.listdir(DATA_DIR)):
        full = DATA_DIR / item
        if full.is_dir():
            count = sum(1 for path in full.rglob("*") if path.is_file())
            print(f"  {item}/ ({count} files)")
        else:
            print(f"  {item} ({full.stat().st_size / 1e6:.1f} MB)")
else:
    print("\nERROR: Could not find expected data folders!")
    print("Full /kaggle/input/ tree:")
    for root, dirs, files in os.walk(KAGGLE_INPUT):
        depth = root.replace(KAGGLE_INPUT, "").count(os.sep)
        indent = "  " * depth
        print(f"{indent}{os.path.basename(root)}/")
        # Only list file names for the shallow levels to keep the dump short.
        if depth < 4:
            for f in files[:10]:
                print(f"{indent}  {f}")
            if len(files) > 10:
                print(f"{indent}  ... and {len(files) - 10} more files")
    print("\nPlease add m3_training_data.zip as a Dataset in the notebook sidebar.")

In [None]:
# Cell 4: Load all skill data sources
#
# Primary sources (from original training script):
#   1. TechWolf ESCO Sentences - skill-sentence contrastive pairs
#   2. Tabiya ESCO - skill synonyms / alt labels
#   3. Nesta Skills Taxonomy - skill cluster assignments
#   4. StackLite - SO tag co-occurrence (may be Git LFS placeholder)
#
# Bonus sources (additional data not in original script):
#   5. Related Job Skills - skill relationship mapping
#   6. Job Skill Set - job posts with extracted skill lists
#   7. Mind Tech Ontology - tech skill ontology with synonyms

import csv
import gzip
import json
import logging
import random
import ast
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# Every loader below builds its paths from DATA_DIR, so stop here with an
# explicit error (not an assert, which is skipped under `python -O`) when
# Cell 3 did not find the data root.
if DATA_DIR is None:
    raise RuntimeError("DATA_DIR not set! Re-run Cell 3.")
print(f"Using DATA_DIR: {DATA_DIR}")


# ============================================================
# Source 1: TechWolf ESCO Skill Sentences
# ============================================================

def load_esco_sentences():
    """Load TechWolf Synthetic-ESCO-Skill-Sentences.

    Reads ``techwolf_esco_sentences/train.parquet`` under ``DATA_DIR``. The
    skill and sentence columns are detected by name; when detection fails the
    first two columns are used.

    Returns:
        List of dicts ``{"skill": str, "sentence": str}``. Rows with a null
        cell, an empty value, or a skill of one character are left out. An
        empty list is returned when the file is missing or has fewer than
        two columns.
    """
    parquet_path = DATA_DIR / "techwolf_esco_sentences" / "train.parquet"
    if not parquet_path.exists():
        logger.warning("TechWolf ESCO sentences not found -- skipping.")
        return []

    df = pd.read_parquet(parquet_path)

    # Identify columns by name (the last matching column wins).
    skill_col, sentence_col = None, None
    for col in df.columns:
        cl = col.lower()
        if "skill" in cl and "sent" not in cl:
            skill_col = col
        elif "sent" in cl or "text" in cl or "description" in cl:
            sentence_col = col

    if skill_col is None or sentence_col is None:
        cols = df.columns.tolist()
        if len(cols) < 2:
            logger.warning("TechWolf: cannot identify columns. Got: %s", cols)
            return []
        skill_col, sentence_col = cols[0], cols[1]

    # Drop null cells before stringifying: str(NaN) is "nan", which would
    # otherwise pass the emptiness checks and become a bogus skill/sentence.
    pairs = df[[skill_col, sentence_col]].dropna()

    records = []
    for raw_skill, raw_sentence in zip(pairs[skill_col], pairs[sentence_col]):
        skill = str(raw_skill).strip()
        sentence = str(raw_sentence).strip()
        if skill and sentence and len(skill) > 1:
            records.append({"skill": skill, "sentence": sentence})

    logger.info("TechWolf ESCO: loaded %d skill-sentence pairs.", len(records))
    return records


# ============================================================
# Source 2: Tabiya ESCO Synonyms
# ============================================================

def load_tabiya_synonyms():
    """Read preferred and alternative skill labels from the Tabiya ESCO CSV.

    Looks for ``skills.csv`` at the versioned default location first, then at
    the first match anywhere below ``tabiya_esco``.

    Returns:
        Mapping ``preferred_label -> [alternative labels]`` (the list may be
        empty). An empty dict is returned when the file or the preferred
        label column cannot be found.
    """
    skills_csv = DATA_DIR / "tabiya_esco" / "tabiya-esco-v1.1.1" / "csv" / "skills.csv"
    if not skills_csv.exists():
        # Fall back to the first skills.csv found under tabiya_esco, if any.
        skills_csv = next(DATA_DIR.glob("tabiya_esco/**/skills.csv"), skills_csv)
        if not skills_csv.exists():
            logger.warning("Tabiya skills.csv not found -- skipping.")
            return {}

    frame = pd.read_csv(skills_csv)

    # Both columns of interest carry "label" in their name; the last matching
    # column of each kind is the one used.
    preferred_col = alt_col = None
    for column in frame.columns:
        lowered = column.lower()
        if "label" not in lowered:
            continue
        if "preferred" in lowered:
            preferred_col = column
        elif "alt" in lowered:
            alt_col = column

    if preferred_col is None:
        # Looser match: the first column that looks like a label or a name.
        preferred_col = next(
            (c for c in frame.columns if "label" in c.lower() or "name" in c.lower()),
            None,
        )

    if preferred_col is None:
        logger.warning("Tabiya: cannot identify preferred label column. Got: %s", frame.columns.tolist())
        return {}

    synonyms = defaultdict(list)
    for _, row in frame.iterrows():
        pref = str(row[preferred_col]).strip()
        if pref in ("", "nan"):
            continue

        alts = []
        if alt_col and pd.notna(row.get(alt_col)):
            alt_str = str(row[alt_col])
            # Split on the first separator (in priority order) present.
            sep = next((s for s in ("|", "\n", ";") if s in alt_str), None)
            if sep is not None:
                alts = [part.strip() for part in alt_str.split(sep) if part.strip()]
            # No usable split result: treat the whole cell as one label.
            if not alts and alt_str.strip():
                alts = [alt_str.strip()]
        synonyms[pref] = alts

    logger.info("Tabiya ESCO: loaded %d skills with synonyms.", len(synonyms))
    return synonyms


# ============================================================
# Source 3: Nesta Skills Taxonomy Clusters
# ============================================================

def _nesta_clusters_from_json(data):
    """Extract ``{cluster_key: [skills]}`` from one parsed Nesta JSON document."""
    found = {}
    if not isinstance(data, dict):
        return found
    for key, val in data.items():
        if isinstance(val, list):
            found[str(key)] = [str(v) for v in val if v]
        elif isinstance(val, dict) and "skills" in val:
            found[str(key)] = [str(s) for s in val["skills"] if s]
        elif isinstance(val, dict):
            # Nested structure: one cluster per list-valued sub-key.
            for sub_key, sub_val in val.items():
                if isinstance(sub_val, list):
                    found[f"{key}_{sub_key}"] = [str(v) for v in sub_val if v]
    return found


def _nesta_clusters_from_csv(df):
    """Group skills by cluster from a CSV frame; None if columns are missing."""
    skill_col, cluster_col = None, None
    for col in df.columns:
        cl = col.lower()
        if "skill" in cl or "name" in cl or "label" in cl:
            skill_col = col
        elif "cluster" in cl or "group" in cl or "category" in cl:
            cluster_col = col
    if not (skill_col and cluster_col):
        return None

    found = defaultdict(list)
    for raw_skill, cid in zip(df[skill_col], df[cluster_col]):
        # A skill without a cluster id belongs to no cluster; grouping all of
        # them under one key would fabricate a cluster of unrelated skills.
        if pd.isna(cid):
            continue
        skill = str(raw_skill).strip()
        if skill and skill != "nan":
            found[str(cid)].append(skill)
    return found


def load_nesta_clusters():
    """Load Nesta UK Skills Taxonomy cluster assignments.

    JSON files whose name mentions cluster/taxonomy/hierarchy are tried first
    (in sorted order); the first one that yields clusters is used. Otherwise
    CSV files whose name mentions skill/cluster are tried.

    Cluster ids are the source keys themselves (as strings). They are not
    reduced with ``hash() % N``: that collides, which silently overwrites or
    merges clusters, and string hashes differ between interpreter runs.

    Returns:
        Mapping ``cluster_id -> [skill_names]``; empty when no data is found.
    """
    nesta_base = DATA_DIR / "nesta_skills_taxonomy"
    clusters = defaultdict(list)

    # Sorted so the file that gets picked does not depend on directory order.
    json_files = sorted(nesta_base.rglob("*.json"))
    csv_files = sorted(nesta_base.rglob("*.csv"))

    if not json_files and not csv_files:
        logger.warning("Nesta: no data files found under %s -- skipping.", nesta_base)
        return {}

    logger.info("Loading Nesta clusters from %s", nesta_base)

    # Try JSON files first (taxonomy/cluster data)
    for jf in json_files:
        file_name = jf.name.lower()
        if not any(tag in file_name for tag in ("cluster", "taxonomy", "hierarchy")):
            continue
        try:
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            logger.warning("Nesta: could not parse %s (%s) -- trying next file.", jf.name, exc)
            continue
        clusters.update(_nesta_clusters_from_json(data))
        if clusters:
            logger.info("Nesta: loaded clusters from %s", jf.name)
            break

    # Fallback: CSV files
    if not clusters:
        for cf in csv_files:
            file_name = cf.name.lower()
            if "skill" not in file_name and "cluster" not in file_name:
                continue
            try:
                found = _nesta_clusters_from_csv(pd.read_csv(cf))
            except Exception as exc:  # best-effort source: report and move on
                logger.warning("Nesta: could not read %s (%s) -- trying next file.", cf.name, exc)
                continue
            if found is None:
                continue
            clusters.update(found)
            logger.info("Nesta: loaded clusters from %s", cf.name)
            break

    logger.info("Nesta: %d clusters, %d total skills.", len(clusters), sum(len(v) for v in clusters.values()))
    return clusters


# ============================================================
# Source 4: StackLite Tag Co-occurrence
# ============================================================

def load_stacklite_cooccurrence(max_rows=2_000_000):
    """Load StackLite tag co-occurrence.

    Reads ``stacklite/question_tags.csv.gz`` under ``DATA_DIR`` and groups
    tags by question id. Rows with a missing or non-numeric id, or without a
    tag, are skipped individually instead of aborting the whole load.

    Args:
        max_rows: Maximum number of CSV data rows to read.

    Returns:
        Mapping ``tag -> set(co-occurring tags)``. Empty when the file is
        missing, is smaller than 200 bytes (treated as a Git LFS pointer),
        or cannot be read.
    """
    tags_path = DATA_DIR / "stacklite" / "question_tags.csv.gz"
    if not tags_path.exists():
        logger.warning("StackLite tags not found -- skipping.")
        return {}

    # Check if file is a Git LFS pointer (< 200 bytes)
    size = tags_path.stat().st_size
    if size < 200:
        logger.warning("StackLite tags file is too small (%d bytes) -- likely a Git LFS pointer. Skipping.",
                       size)
        return {}

    logger.info("Loading StackLite co-occurrence from %s", tags_path)
    question_tags = defaultdict(list)
    skipped = 0
    try:
        with gzip.open(str(tags_path), "rt", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for i, row in enumerate(reader):
                if i >= max_rows:
                    break
                # `or` also covers a None value, which DictReader produces
                # for rows that are shorter than the header.
                raw_id = row.get("Id") or row.get("id")
                tag = (row.get("Tag") or row.get("tag") or "").strip()
                try:
                    qid = int(raw_id)
                except (TypeError, ValueError):
                    skipped += 1
                    continue
                if qid and tag:
                    question_tags[qid].append(tag)
    except Exception as e:  # optional source: unreadable archive -> skip it
        logger.warning("Failed to read StackLite tags: %s", e)
        return {}

    if skipped:
        logger.warning("StackLite: skipped %d rows with a missing or non-numeric id.", skipped)

    cooccur = defaultdict(set)
    for tags in question_tags.values():
        unique_tags = set(tags)
        if len(unique_tags) < 2:
            continue
        for tag in unique_tags:
            cooccur[tag].update(unique_tags - {tag})

    logger.info("StackLite: built co-occurrence for %d tags.", len(cooccur))
    return cooccur


# ============================================================
# Source 5 (Bonus): Related Job Skills
# ============================================================

def load_related_skills():
    """Read the skill -> related skills table from ``related_skills.csv``.

    The first CSV column is taken as the skill name; every column whose name
    starts with ``related_`` contributes one related skill.

    Returns:
        Mapping ``skill_name -> [related_skill_names]`` holding only skills
        with at least one non-empty related entry. Empty when the CSV is
        missing.
    """
    csv_path = DATA_DIR / "related_job_skills" / "related_skills.csv"
    if not csv_path.exists():
        logger.warning("Related skills CSV not found -- skipping.")
        return {}

    frame = pd.read_csv(csv_path)
    name_col = frame.columns[0]  # 'name'
    rel_cols = [column for column in frame.columns if column.startswith("related_")]

    related = {}
    for _, row in frame.iterrows():
        name = str(row[name_col]).strip()
        if name in ("", "nan"):
            continue
        # Stringify the non-null related cells, then drop blank / "nan" text.
        cleaned = (str(row.get(rc)).strip() for rc in rel_cols if pd.notna(row.get(rc)))
        rels = [text for text in cleaned if text and text != "nan"]
        if rels:
            related[name] = rels

    total_relations = sum(map(len, related.values()))
    logger.info("Related Skills: loaded %d skills with %d total relations.",
                len(related), total_relations)
    return related


# ============================================================
# Source 6 (Bonus): Job Skill Set (skills from job posts)
# ============================================================

def _parse_skill_cell(raw):
    """Turn one raw CSV cell into a list of stripped, non-empty skill strings."""
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal (the documented literal_eval failure modes):
        # treat the cell as plain comma-separated text.
        return [s.strip() for s in raw.split(",") if s.strip()]
    if isinstance(parsed, (list, tuple)):
        return [s.strip() for s in parsed if isinstance(s, str) and s.strip()]
    return []


def load_job_skill_sets():
    """Load all_job_post.csv: extract skill sets from job postings.

    The first column whose name contains "skill" is used. Each cell is parsed
    either as a Python list/tuple literal (``"['a', 'b']"``) or, failing
    that, as comma-separated text.

    Returns:
        List of skill lists (each list = skills from one job); only postings
        with at least two skills are kept. Empty when the CSV or the skill
        column is missing.
    """
    csv_path = DATA_DIR / "job_skill_set" / "all_job_post.csv"
    if not csv_path.exists():
        logger.warning("Job skill set CSV not found -- skipping.")
        return []

    df = pd.read_csv(csv_path)
    skill_col = next((col for col in df.columns if "skill" in col.lower()), None)

    if skill_col is None:
        logger.warning("Job skill set: no skill column found. Cols: %s", df.columns.tolist())
        return []

    skill_lists = []
    for raw in df[skill_col].dropna():
        skills = _parse_skill_cell(str(raw).strip())
        if len(skills) >= 2:
            skill_lists.append(skills)

    logger.info("Job Skill Sets: loaded %d job postings with skill lists.", len(skill_lists))
    return skill_lists


# ============================================================
# Source 7 (Bonus): Mind Tech Ontology
# ============================================================

def load_tech_ontology():
    """Load mind_tech_ontology aggregated skills.

    Reads ``mind_tech_ontology/__aggregated_skills.json`` under ``DATA_DIR``,
    which is expected to hold a JSON list of skill objects. Entries that are
    not objects or have no usable string name are skipped, and JSON ``null``
    fields are treated as empty lists.

    Returns:
        Mapping ``skill_name -> {"synonyms", "type", "domains", "frameworks",
        "langs"}``. ``synonyms`` keeps only non-empty strings that differ
        from the skill name. Empty when the file is missing or its top-level
        value is not a list.
    """
    json_path = DATA_DIR / "mind_tech_ontology" / "__aggregated_skills.json"
    if not json_path.exists():
        logger.warning("Tech ontology not found -- skipping.")
        return {}

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    ontology = {}
    if isinstance(data, list):
        for entry in data:
            if not isinstance(entry, dict):
                continue
            raw_name = entry.get("name")
            name = raw_name.strip() if isinstance(raw_name, str) else ""
            if not name:
                continue
            # `or []` covers keys that are present but null in the JSON.
            raw_synonyms = entry.get("synonyms") or []
            ontology[name] = {
                "synonyms": [s for s in raw_synonyms
                             if isinstance(s, str) and s and s != name],
                "type": entry.get("type") or [],
                "domains": entry.get("associatedToApplicationDomains") or [],
                "frameworks": entry.get("specificToFrameworks") or [],
                "langs": entry.get("supportedProgrammingLanguages") or [],
            }
    else:
        logger.warning("Tech ontology: expected a JSON list, got %s.", type(data).__name__)

    logger.info("Tech Ontology: loaded %d skills.", len(ontology))
    return ontology


# --- Load all sources ---
_banner = "=" * 60
print("\n" + _banner)
print("Loading all skill data sources...")
print(_banner + "\n")

# Each loader returns an empty container when its source is unavailable.
esco_pairs = load_esco_sentences()
synonyms = load_tabiya_synonyms()
clusters = load_nesta_clusters()
cooccur = load_stacklite_cooccurrence()
related_skills = load_related_skills()
job_skill_sets = load_job_skill_sets()
tech_ontology = load_tech_ontology()

# One summary line per source: label followed by the number of entries.
print(f"\n{_banner}")
print("Data loading summary:")
for _label, _loaded in (
    ("  ESCO sentence pairs:  ", esco_pairs),
    ("  Tabiya synonyms:      ", synonyms),
    ("  Nesta clusters:       ", clusters),
    ("  StackLite co-occur:   ", cooccur),
    ("  Related skills:       ", related_skills),
    ("  Job skill sets:       ", job_skill_sets),
    ("  Tech ontology:        ", tech_ontology),
):
    print(f"{_label}{len(_loaded):>8,}")
print(_banner)

In [None]:
# Cell 5: Build triplets + hard negatives
#
# Strategy:
#   1. Synonym-based: anchor = preferred label, positive = alt label
#   2. Cluster-based: skills in same Nesta cluster are positives
#   3. ESCO sentence-based: different sentences about same skill
#   4. Related skills: skill -> related skill from related_skills.csv
#   5. Job co-occurrence: skills from same job posting are positives
#   6. Tech ontology: skill -> synonym from ontology
#   7. Hard negatives: for up to 30% of the triplets, replace the random negative with a co-occurring but different skill

MAX_TRIPLETS = 500_000
SEED = 42
rng = random.Random(SEED)

# Build global skill vocabulary for negative sampling.
# The vocabulary is sorted because the iteration order of a set of strings
# changes between interpreter runs (string hashes are salted), which would
# make every seeded rng.choice(all_skills) draw non-reproducible.
# key=str keeps the sort well-defined should a source yield a non-string label.
all_skills = sorted(
    set().union(
        synonyms,
        (alt for alts in synonyms.values() for alt in alts),
        (skill for skills in clusters.values() for skill in skills),
        cooccur,
        related_skills,
        (s for rels in related_skills.values() for s in rels),
        (s for slist in job_skill_sets for s in slist),
        tech_ontology,
        (syn for info in tech_ontology.values() for syn in info["synonyms"]),
    ),
    key=str,
)

print(f"Global skill vocabulary: {len(all_skills):,} unique skills")

if len(all_skills) < 10:
    raise ValueError(f"Insufficient skill vocabulary ({len(all_skills)}) for triplet construction.")

# Accumulates {"anchor", "positive", "negative"} dicts from every strategy.
triplets = []

def _random_neg(exclude, max_tries=10):
    """Pick a random skill from ``all_skills`` that is not in *exclude*.

    Args:
        exclude: Container of skills that must not be returned (typically the
            anchor and the positive).
        max_tries: Number of cheap random draws before falling back to an
            explicit scan of the vocabulary.

    Returns:
        A skill outside *exclude* whenever one exists. Only if every skill in
        the vocabulary is excluded is an arbitrary skill returned.
    """
    for _ in range(max_tries):
        neg = rng.choice(all_skills)
        if neg not in exclude:
            return neg
    # Repeated draws kept hitting excluded skills; filter explicitly so an
    # excluded skill is not handed back while valid candidates remain.
    candidates = [skill for skill in all_skills if skill not in exclude]
    return rng.choice(candidates) if candidates else rng.choice(all_skills)


# --- 1. Synonym-based triplets ---
# anchor = preferred label, positive = one of its alternative labels,
# negative = random skill other than those two.
count_before = len(triplets)
synonym_pairs = (
    (pref, alt)
    for pref, alts in synonyms.items()
    for alt in alts
    if alt != pref
)
for pref, alt in synonym_pairs:
    triplets.append({
        "anchor": pref,
        "positive": alt,
        "negative": _random_neg({pref, alt}),
    })
    if len(triplets) >= MAX_TRIPLETS:
        break
print(f"  Synonym-based: +{len(triplets) - count_before:,} triplets")


# --- 2. Cluster-based triplets ---
count_before = len(triplets)
cluster_list = list(clusters.items())
all_cluster_skills = [s for skills in clusters.values() for s in skills]


def _iter_cluster_pairs():
    """Yield (anchor, positive, negative_pool) for in-cluster skill pairs.

    Every skill is paired with at most the next two skills of its cluster.
    The pool contains the skills of all other clusters, or the whole
    vocabulary when the other clusters hold no skills.
    """
    for cid, members in cluster_list:
        if len(members) < 2:
            continue
        pool = [s for oid, oss in cluster_list if oid != cid for s in oss] or all_skills
        for i, anchor_skill in enumerate(members):
            for positive_skill in members[i + 1:i + 3]:
                yield anchor_skill, positive_skill, pool


if cluster_list and all_cluster_skills and len(triplets) < MAX_TRIPLETS:
    for anchor_skill, positive_skill, pool in _iter_cluster_pairs():
        neg = rng.choice(pool)
        triplets.append({"anchor": anchor_skill, "positive": positive_skill, "negative": neg})
        if len(triplets) >= MAX_TRIPLETS:
            break
print(f"  Cluster-based: +{len(triplets) - count_before:,} triplets")


# --- 3. ESCO sentence-based triplets ---
count_before = len(triplets)

# Group the sentences by the skill they describe.
skill_to_sentences = defaultdict(list)
for pair in esco_pairs:
    skill_to_sentences[pair["skill"]].append(pair["sentence"])

skill_keys = list(skill_to_sentences.keys())


def _iter_sentence_pairs():
    """Yield (skill, anchor_sentence, positive_sentence) for each skill.

    Pairs are taken from the first four sentences of a skill only: anchors
    come from the first three, positives from the sentences after the anchor.
    """
    for skill in skill_keys:
        sentences = skill_to_sentences[skill]
        if len(sentences) < 2:
            continue
        for i, anchor_sentence in enumerate(sentences[:3]):
            for positive_sentence in sentences[i + 1:4]:
                yield skill, anchor_sentence, positive_sentence


if len(skill_keys) >= 2 and len(triplets) < MAX_TRIPLETS:
    for skill, anchor_sentence, positive_sentence in _iter_sentence_pairs():
        # Negative: a sentence about a different, randomly chosen skill.
        neg_skill = rng.choice(skill_keys)
        while neg_skill == skill:
            neg_skill = rng.choice(skill_keys)
        neg_sentences = skill_to_sentences[neg_skill]
        neg_sentence = rng.choice(neg_sentences) if neg_sentences else neg_skill
        triplets.append({
            "anchor": anchor_sentence,
            "positive": positive_sentence,
            "negative": neg_sentence,
        })
        if len(triplets) >= MAX_TRIPLETS:
            break
print(f"  ESCO sentence-based: +{len(triplets) - count_before:,} triplets")


# --- 4. Related skills triplets (bonus) ---
# anchor = skill, positive = one of its first five related skills.
count_before = len(triplets)
if related_skills and len(triplets) < MAX_TRIPLETS:
    rel_keys = list(related_skills)
    related_pairs = (
        (skill_name, rel)
        for skill_name, rels in related_skills.items()
        for rel in rels[:5]  # Top 5 related skills
    )
    for skill_name, rel in related_pairs:
        triplets.append({
            "anchor": skill_name,
            "positive": rel,
            "negative": _random_neg({skill_name, rel}),
        })
        if len(triplets) >= MAX_TRIPLETS:
            break
print(f"  Related skills: +{len(triplets) - count_before:,} triplets")


# --- 5. Job co-occurrence triplets (bonus) ---
count_before = len(triplets)
if job_skill_sets and len(triplets) < MAX_TRIPLETS:
    # Repeat each posting once per pair to draw from it: at most 3 draws,
    # and never more than the number of distinct position pairs it has.
    job_draws = (
        skill_list
        for skill_list in job_skill_sets
        if len(skill_list) >= 2
        for _ in range(min(3, len(skill_list) * (len(skill_list) - 1) // 2))
    )
    for skill_list in job_draws:
        # Two skills from the same posting form the (anchor, positive) pair;
        # the negative is sampled from outside the posting.
        a, p = rng.sample(skill_list, 2)
        neg = _random_neg(set(skill_list))
        triplets.append({"anchor": a, "positive": p, "negative": neg})
        if len(triplets) >= MAX_TRIPLETS:
            break
print(f"  Job co-occurrence: +{len(triplets) - count_before:,} triplets")


# --- 6. Tech ontology synonym triplets (bonus) ---
# anchor = ontology skill name, positive = one of its synonyms.
count_before = len(triplets)
if tech_ontology and len(triplets) < MAX_TRIPLETS:
    ontology_pairs = (
        (skill_name, syn)
        for skill_name, info in tech_ontology.items()
        for syn in info["synonyms"]
    )
    for skill_name, syn in ontology_pairs:
        triplets.append({
            "anchor": skill_name,
            "positive": syn,
            "negative": _random_neg({skill_name, syn}),
        })
        if len(triplets) >= MAX_TRIPLETS:
            break
print(f"  Tech ontology: +{len(triplets) - count_before:,} triplets")


# --- Shuffle and cap ---
# Mix the strategies together, then enforce the overall size limit.
rng.shuffle(triplets)
triplets = triplets[:MAX_TRIPLETS]
print(f"\nTotal triplets (before hard negatives): {len(triplets):,}")


# --- 7. Hard negatives: replace up to 30% with co-occurring but different skills ---
# Use both StackLite co-occurrence AND related_skills for hard negatives.
# NOTE(review): related_skills also supplies the positives in step 4, so a
# skill related to the anchor can end up as that anchor's hard negative here
# -- confirm this is intended.
#
# Each set is copied: dict(cooccur) would share the set objects, and the
# update() below would then silently modify `cooccur` itself.
hard_neg_sources = {tag: set(others) for tag, others in cooccur.items()}
for skill_name, rels in related_skills.items():
    hard_neg_sources.setdefault(skill_name, set()).update(rels)

if hard_neg_sources:
    n_replace = int(len(triplets) * 0.3)
    indices = rng.sample(range(len(triplets)), min(n_replace, len(triplets)))
    replaced = 0
    for idx in indices:
        anchor = triplets[idx]["anchor"]
        positive = triplets[idx]["positive"]
        cooccurring = hard_neg_sources.get(anchor, set()) - {positive, anchor}
        if not cooccurring:
            # Retry with the lower-cased anchor as the lookup key.
            cooccurring = hard_neg_sources.get(anchor.lower(), set()) - {positive.lower(), anchor.lower()}
        if cooccurring:
            # Sort before choosing: set order varies between runs, so
            # choosing from list(set) would defeat the fixed seed.
            triplets[idx]["negative"] = rng.choice(sorted(cooccurring))
            replaced += 1
    print(f"Hard negatives: replaced {replaced:,} / {len(triplets):,} negatives")
else:
    print("No hard negative sources available (StackLite + Related Skills empty).")

print(f"\nFinal triplet count: {len(triplets):,}")

# Show sample triplets (each field truncated to 60 characters)
print("\nSample triplets:")
for t in triplets[:5]:
    for field in ("anchor", "positive", "negative"):
        print(f"  {field}: {t[field][:60]}")
    print()

In [None]:
# Cell 6: Model setup + contrastive training
#
# Uses SentenceTransformerTrainer API (newer, more stable than model.fit())
# Architecture: JobBERT-v2 base (no projection head - let the model learn directly)
# Loss: MultipleNegativesRankingLoss (in-batch negatives)
# Training: 6 epochs, batch_size=64, fp16

import transformers
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from datasets import Dataset

import torch

# --- Config ---
BASE_MODEL = "TechWolf/JobBERT-v2"
EPOCHS = 6
BATCH_SIZE = 64
WARMUP_RATIO = 0.1
OUTPUT_DIR = "/kaggle/working/m3_skills_comparator"
# Mixed precision is only requested when a CUDA device is present, so the
# notebook can still run (slowly) when Cell 2 reported that no GPU is
# available instead of failing on a hard-coded fp16=True.
USE_FP16 = torch.cuda.is_available()

# --- Load base model ---
print(f"Loading base model: {BASE_MODEL}")
transformers.logging.set_verbosity_error()  # silence load-time messages
model = SentenceTransformer(BASE_MODEL)
transformers.logging.set_verbosity_warning()
print(f"Model loaded. Embedding dim: {model.get_sentence_embedding_dimension()}")

# --- Split triplets: 90% train, 10% eval ---
np.random.seed(SEED)
np.random.shuffle(triplets)
split = int(len(triplets) * 0.9)
train_triplets = triplets[:split]
eval_triplets = triplets[split:]
print(f"Train: {len(train_triplets):,} triplets, Eval: {len(eval_triplets):,} triplets")


def _triplets_to_dataset(rows):
    """Convert a list of triplet dicts into a column-oriented HF Dataset."""
    return Dataset.from_dict({
        key: [row[key] for row in rows]
        for key in ("anchor", "positive", "negative")
    })


# --- Convert to HuggingFace Datasets (Trainer API requires this) ---
train_dataset = _triplets_to_dataset(train_triplets)
eval_dataset = _triplets_to_dataset(eval_triplets)
print(f"Train dataset: {len(train_dataset):,}, Eval dataset: {len(eval_dataset):,}")

# --- Loss ---
train_loss = losses.MultipleNegativesRankingLoss(model)

# --- Evaluator ---
evaluator = TripletEvaluator(
    anchors=[t["anchor"] for t in eval_triplets],
    positives=[t["positive"] for t in eval_triplets],
    negatives=[t["negative"] for t in eval_triplets],
    name="skill_triplet_eval",
)

# --- Training args ---
# Evaluate (and checkpoint) every eval_steps optimizer steps; computed from
# the per-device batch size as half the number of batches in the train set.
eval_steps = max(1, len(train_triplets) // BATCH_SIZE // 2)

print(f"\n{'='*60}")
print("Starting contrastive training")
print(f"  Epochs: {EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Warmup ratio: {WARMUP_RATIO}")
print(f"  Eval every: {eval_steps} steps")
print(f"  FP16: {USE_FP16}")
print(f"{'='*60}\n")

args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    warmup_ratio=WARMUP_RATIO,
    fp16=USE_FP16,
    eval_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=eval_steps,  # kept equal to eval_steps for load_best_model_at_end
    load_best_model_at_end=True,
    save_total_limit=2,
    logging_steps=50,
    report_to="none",
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=train_loss,
    evaluator=evaluator,
)

trainer.train()
model.save(OUTPUT_DIR)
print("\nTraining complete!")

In [None]:
# Cell 7: Evaluate + export projection weights

# Final evaluation
print("Evaluating on held-out triplets...")
eval_result = evaluator(model, output_path=OUTPUT_DIR)

# The evaluator returns a dict in newer sentence-transformers versions and a
# bare score otherwise; for a dict, take the first known accuracy key.
if not isinstance(eval_result, dict):
    eval_score = eval_result
else:
    eval_score = 0.0
    for metric_key in ('skill_triplet_eval_cosine_accuracy', 'cosine_accuracy'):
        if metric_key in eval_result:
            eval_score = eval_result[metric_key]
            break
    print(f"Full eval results: {eval_result}")

TARGET = 0.80
print(f"\nTriplet accuracy: {eval_score:.4f}")
if eval_score < TARGET:
    print(f"Target {TARGET:.2f} NOT MET (got {eval_score:.4f})")
else:
    print(f"Target {TARGET:.2f} ACHIEVED!")

# Export projection weights for lightweight inference (if projection head exists).
# Parameters are keyed by the last component of their dotted name.
print("\nExporting projection head weights (if any)...")
projection_weights = {
    name.split(".")[-1]: param.detach().cpu().numpy()
    for name, param in model.named_parameters()
    if "projection" in name
}

if not projection_weights:
    print("No projection head found (using base model embeddings directly).")
else:
    projection_path = os.path.join(OUTPUT_DIR, "projection.npy")
    np.save(projection_path, projection_weights)
    print(f"Projection weights saved to {projection_path}")

# Quick embedding sanity check: cosine similarity for a few hand-picked pairs
print("\nSanity check - similar vs dissimilar skill embeddings:")
test_pairs = [
    ("Python programming", "Python development", "should be HIGH"),
    ("machine learning", "deep learning", "should be HIGH"),
    ("Python programming", "accounting", "should be LOW"),
    ("data science", "plumbing", "should be LOW"),
]
from sentence_transformers.util import cos_sim
for s1, s2, expected in test_pairs:
    sim = cos_sim(model.encode(s1), model.encode(s2)).item()
    print(f"  '{s1}' vs '{s2}': {sim:.4f} ({expected})")

In [None]:
# Cell 8: Save model + upload as Kaggle Dataset for download
import shutil
import json

# Save full model
model.save(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

# OUTPUT_DIR is also the trainer's output_dir, so it contains the trainer's
# checkpoint-* folders next to the final model. Stage a copy without them so
# the archive holds only the final model files.
EXPORT_DIR = "/kaggle/working/m3_skills_comparator_export"
shutil.rmtree(EXPORT_DIR, ignore_errors=True)  # clear leftovers from a re-run
shutil.copytree(OUTPUT_DIR, EXPORT_DIR, ignore=shutil.ignore_patterns("checkpoint-*"))

# List exported files
total_size = 0
for root, dirs, files in os.walk(EXPORT_DIR):
    for f in sorted(files):
        fp = os.path.join(root, f)
        size_mb = os.path.getsize(fp) / 1e6
        total_size += size_mb
        rel = os.path.relpath(fp, EXPORT_DIR)
        print(f"  {rel}: {size_mb:.1f} MB")
print(f"  Total: {total_size:.1f} MB")

# Zip for download
zip_path = "/kaggle/working/m3_skills_comparator_trained"
shutil.make_archive(zip_path, "zip", EXPORT_DIR)
zip_size = os.path.getsize(f"{zip_path}.zip") / 1e6
print(f"\nZipped to {zip_path}.zip ({zip_size:.1f} MB)")

# The staging copy is no longer needed once the archive exists.
shutil.rmtree(EXPORT_DIR, ignore_errors=True)

# Upload as Kaggle Dataset for easy download
dataset_dir = "/kaggle/working/m3_dataset_upload"
os.makedirs(dataset_dir, exist_ok=True)
shutil.copy(f"{zip_path}.zip", dataset_dir)

metadata = {
    "title": "m3-skills-comparator-trained",
    "id": "thinkkun/m3-skills-comparator-trained",
    "licenses": [{"name": "CC0-1.0"}]
}
with open(os.path.join(dataset_dir, "dataset-metadata.json"), "w") as f:
    json.dump(metadata, f)

# The upload is best-effort: on any failure, print manual download options.
try:
    from kaggle.api.kaggle_api_extended import KaggleApi
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=dataset_dir, dir_mode="zip", quiet=False)
    print("\nDataset uploaded successfully!")
    print("Download locally with:")
    print("  kaggle datasets download thinkkun/m3-skills-comparator-trained -p ./training/models/")
except Exception as e:
    print(f"\nDataset upload failed: {e}")
    print("\nAlternative methods:")
    print("  1. Output tab (right sidebar) -> click download on the zip")
    print("  2. 'Save Version' -> after commit: kaggle kernels output thinkkun/<notebook-slug> -p ./training/models/")