<a href="https://colab.research.google.com/github/mahb97/Wake2vec/blob/main/letsbuildthisthing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Did some digging and guess what I found: absolute gold. So here comes everybody.

**MORPHEME-AWARE WAKE2VEC**

**Teaching TinyLlama Joyce's Generative Grammar**

Based on hand-compiled morphological analysis of Finnegans Wake

This notebook teaches compositional word formation via embedding arithmetic

In [1]:
# Reproducibility setup: environment variables are written before torch is imported,
# then every RNG used in the notebook is seeded with the same value.
import os
# NOTE(review): presumably required by torch.use_deterministic_algorithms(True) for
# cuBLAS on CUDA — confirm against the PyTorch reproducibility notes.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably only affects child processes — confirm whether that is the intent.
os.environ["PYTHONHASHSEED"] = "1337"

import torch
# Turn off cuDNN autotuning and TF32 kernels, and request deterministic algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
torch.use_deterministic_algorithms(True)

import random, json, math, re
import numpy as np
from datetime import datetime
from pathlib import Path
from collections import defaultdict, Counter

# One seed shared by Python's `random`, NumPy and PyTorch (CPU and, if present, CUDA).
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("Morpheme chaos mode activated")

Morpheme chaos mode activated


config

In [None]:
# Base checkpoint and input files. The paths are absolute Google Drive locations,
# so Drive must be mounted at /content/drive before the loading cells run.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
CORPUS_PATH = "/content/drive/MyDrive/Wake2vec_runs/fw.txt"
MORPHEME_DATA_PATH = "/content/drive/MyDrive/Wake2vec_runs/morpheme_data.txt"

# Training params (consumed by the TrainingArguments cell and by create_blocks)
BATCH_SIZE = 2
BLOCK_SIZE = 256
EPOCHS = 2
LR = 2e-5
WARMUP_RATIO = 0.05
WEIGHT_DECAY = 0.01
GRAD_ACCUM = 4
SAVE_STEPS = 200

# Morpheme chaos params
SYNTHETIC_PER_MORPHEME = 10  # Passed as per_word: training sentences generated per synthetic word
COMPOSITION_ALPHA = 0.33     # Weight of prefix and of suffix; the root gets 1 - 2*alpha (0.33:0.34:0.33)
MORPHEME_NOISE = 0.05        # Noise scale, relative to the composed embedding's std

# Output: each run writes into its own timestamped directory under ./runs
RUN_ID = datetime.now().strftime("morpheme_wake_%Y%m%d_%H%M")
OUTDIR = Path(f"./runs/{RUN_ID}")
(OUTDIR / "results").mkdir(parents=True, exist_ok=True)
(OUTDIR / "checkpoints").mkdir(parents=True, exist_ok=True)

print(f"Run ID: {RUN_ID}")
print(f"Teaching TinyLlama Joyce's morphological grammar...")

In [None]:
def load_corpus(path):
    """Read the training corpus from ``path`` and return it as a single string.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (``errors="ignore"``). A one-line summary with the character count is printed.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    corpus_file = Path(path)
    if corpus_file.exists():
        contents = corpus_file.read_text(encoding="utf-8", errors="ignore")
        print(f"✓ Loaded corpus: {len(contents)} chars")
        return contents
    raise FileNotFoundError(f"Corpus not found: {corpus_file}")

FW_TEXT = load_corpus(CORPUS_PATH)

parse morpheme data

In [None]:
def _parse_morpheme_header(line):
    """Split a header such as "Prefix ab- 13" into ``(morpheme, count)``.

    Returns ``(None, None)`` when the header carries no morpheme token
    (e.g. a bare "Suffixes" section title). The count defaults to 1 when the
    third token is missing or not a plain integer.
    """
    parts = line.split()
    if len(parts) < 2:
        return None, None
    morph = parts[1].lower()
    count = int(parts[2]) if len(parts) > 2 and parts[2].isdigit() else 1
    return morph, count


def parse_morpheme_document(text):
    """
    Parse the hand-compiled morpheme analysis into structured data.

    The document is read line by line. A line starting with "Prefix"/"prefix",
    "Suffix"/"suffix" or "Infix"/"infix" opens a new section; its second token is
    the morpheme and (for prefixes and suffixes) an optional third integer token
    is its frequency. Every following non-header line contributes its first
    whitespace-separated token, stripped of surrounding punctuation, as an
    example word for the current morpheme.

    Returns:
        dict with keys
        - 'prefixes', 'suffixes', 'infixes': {morpheme: [example words]}
        - 'prefix_counts', 'suffix_counts': Counter {morpheme: frequency}
    """
    data = {
        'prefixes': defaultdict(list),
        'suffixes': defaultdict(list),
        'infixes': defaultdict(list),
        'prefix_counts': Counter(),
        'suffix_counts': Counter(),
    }

    header_kinds = ('prefix', 'suffix', 'infix')
    current_type = None
    current_morph = None

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Detect section headers (lower-case or capitalised keyword at line start)
        header_kind = next(
            (k for k in header_kinds
             if line.startswith(k.capitalize()) or line.startswith(k)),
            None,
        )
        if header_kind is not None:
            current_type = header_kind
            # Always replace the current morpheme: a header without a morpheme
            # token must not leave the previous section's morpheme active,
            # otherwise its examples get filed under the wrong affix.
            current_morph, count = _parse_morpheme_header(line)
            if current_morph is not None and header_kind != 'infix':
                data[f'{header_kind}_counts'][current_morph] = count
            continue

        # Collect examples (lines that aren't headers)
        if not (current_type and current_morph):
            continue

        # Lines starting with an upper-case letter are skipped inside
        # prefix/suffix sections (alphabetical dividers and sub-headings).
        if line[0].isupper() and current_type in ('prefix', 'suffix'):
            continue

        # Clean example words: first token only, outer punctuation removed
        word = line.split()[0].strip('.,;:()[]{}\"\'')
        if word and len(word) > 1:
            data[f'{current_type}es'][current_morph].append(word)

    # Convert defaultdicts to regular dicts
    data['prefixes'] = dict(data['prefixes'])
    data['suffixes'] = dict(data['suffixes'])
    data['infixes'] = dict(data['infixes'])

    return data

# Load and parse the hand-compiled morpheme data.
# If the file at MORPHEME_DATA_PATH is missing, a small hard-coded sample is used
# instead so the rest of the notebook can still run.
print("\n" + "="*60)
print("PARSING HAND-COMPILED MORPHEME DATA")
print("="*60)

morpheme_doc = Path(MORPHEME_DATA_PATH)
if morpheme_doc.exists():
    morpheme_text = morpheme_doc.read_text(encoding="utf-8", errors="ignore")
    MORPHEME_DATA = parse_morpheme_document(morpheme_text)
else:
    # Fallback: a minimal inline dataset with the same keys the generator reads.
    print("⚠ Morpheme data file not found, using inline extraction...")
    # Note: unlike parse_morpheme_document's result, this dict has no 'infixes' key.
    MORPHEME_DATA = {
        'prefixes': {
            'ab-': ['above', 'abaft', 'abject', 'abler'],
            'anti-': ['anticipation', 'antipathies'],
            'circum-': ['circumvallator'],
            'hyper-': ['hyperchemical'],
            'sub-': ['subject', 'substrate', 'subordinating'],
        },
        'suffixes': {
            '-ation': ['acclammitation', 'anticipation', 'paupulation'],
            '-ous': ['delicious', 'precious', 'gracious'],
            '-ness': ['darkness', 'sweetness', 'softness'],
            '-ing': ['going', 'coming', 'being'],
        },
        'prefix_counts': Counter({'ab-': 13, 'anti-': 2, 'circum-': 1, 'hyper-': 1, 'sub-': 7}),
        'suffix_counts': Counter({'-ation': 38, '-ous': 49, '-ness': 28, '-ing': 257}),
    }

# Summary: number of distinct morphemes and the ten most frequent of each kind.
print(f"✓ Parsed {len(MORPHEME_DATA['prefixes'])} prefixes")
print(f"✓ Parsed {len(MORPHEME_DATA['suffixes'])} suffixes")
print(f"\nTop prefixes by frequency:")
for morph, count in MORPHEME_DATA['prefix_counts'].most_common(10):
    print(f"  {morph}: {count}")
print(f"\nTop suffixes by frequency:")
for morph, count in MORPHEME_DATA['suffix_counts'].most_common(10):
    print(f"  {morph}: {count}")

 LOAD MODEL & TOKENIZER

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed

# Seed transformers' own RNG handling with the notebook-wide seed.
set_seed(SEED)

# Load the tokenizer and model named in the config cell.
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# If the tokenizer defines no padding token, reuse the end-of-sequence token.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Run on GPU when one is available, otherwise on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)

print(f"\nModel: {MODEL_NAME}")
print(f"Device: {DEVICE}")
print(f"Initial vocab: {len(tok)}")

In [None]:
import re, csv
from pathlib import Path

AFFIX_TXT = Path("/content/affixes_terms.txt")

EXAMPLES_PER_MORPHEME = 120

def _clean_line(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"\b\d+\b$", "", s).strip()                 # drop trailing counts
    s = re.sub(r"\((sic|decap)\)", "", s, flags=re.I).strip()
    return s

def _normalise_affix(kind: str, aff: str) -> str:
    aff = (aff or "").replace("–","-").strip()
    if not aff: return ""
    if kind == "prefix":
        return aff if aff.endswith("-") else (aff + "-")
    if kind == "suffix":
        return aff if aff.startswith("-") else ("-" + aff)
    return aff

def _parse_docx(path: Path):
    """Extract annotated terms from a DOCX file of affix sections.

    Paragraphs are scanned in order. A paragraph beginning with "prefix" or
    "suffix" (any case) followed by an affix opens a section; alphabetical
    dividers (one or two letters) are ignored; every other paragraph is split
    on commas, tabs or runs of two or more whitespace characters, and each
    fragment containing a letter becomes a term labelled with the current
    section's affix.

    Returns:
        list of ``{"term", "prefix", "suffix"}`` dicts, one per distinct term.
        When a term occurs several times, the first record is kept and any
        missing prefix/suffix label is filled in from later occurrences.

    Requires the third-party ``python-docx`` package (imported lazily).
    """
    from docx import Document

    header_re = re.compile(r"^(prefix|suffix)\s*[-–]?\s*([A-Za-z()'\.]+)", re.I)
    active = None          # (kind, affix) of the section currently being read
    records = []

    for para in Document(str(path)).paragraphs:
        text = (para.text or "").strip()
        if not text:
            continue

        header = header_re.match(text)
        if header:
            section_kind = header.group(1).lower()
            active = (section_kind, _normalise_affix(section_kind, header.group(2)))
            continue

        # skip alpha dividers like "A", "B"
        if text.isalpha() and len(text) <= 2:
            continue

        text = _clean_line(text)
        if not text:
            continue

        # split very conservatively on commas / tabs / big whitespace
        for fragment in re.split(r"[,\t]+|\s{2,}", text):
            term = _clean_line(fragment)
            if not (term and re.search(r"[A-Za-z]", term)):
                continue
            record = {"term": term, "prefix": "", "suffix": ""}
            if active:
                record[active[0]] = active[1]
            records.append(record)

    # merge duplicates, prefer rows that carry a prefix/suffix label
    by_term = {}
    for record in records:
        kept = by_term.setdefault(record["term"], record)
        if kept is record:
            continue
        for label in ("prefix", "suffix"):
            if record.get(label) and not kept.get(label):
                kept[label] = record[label]
    return list(by_term.values())

def _parse_csv(path: Path):
    """Read affix annotations from a CSV with ``term``, ``prefix``, ``suffix`` columns.

    Each row's term is cleaned with ``_clean_line`` (rows with an empty term are
    skipped) and its affixes are normalised with ``_normalise_affix``.

    Returns:
        list of ``{"term", "prefix", "suffix"}`` dicts, one per distinct term.
        When a term occurs several times, the first record is kept and any
        missing prefix/suffix label is filled in from later occurrences.
    """
    rows = []
    # "utf-8-sig" transparently drops a leading byte-order mark; without it a
    # BOM-prefixed file exposes its first column as "\ufeffterm" and every row
    # would be skipped as having no term. newline="" lets the csv module do its
    # own newline handling, as its documentation recommends.
    with path.open("r", encoding="utf-8-sig", newline="") as f:
        for r in csv.DictReader(f):
            t = _clean_line(r.get("term", ""))
            if not t:
                continue
            rows.append({
                "term": t,
                "prefix": _normalise_affix("prefix", r.get("prefix", "")),
                "suffix": _normalise_affix("suffix", r.get("suffix", "")),
            })

    # de-dupe: keep the first record per term, back-filling missing labels
    merged = {}
    for r in rows:
        kept = merged.setdefault(r["term"], r)
        if kept is r:
            continue
        for label in ("prefix", "suffix"):
            if r.get(label) and not kept.get(label):
                kept[label] = r[label]
    return list(merged.values())

# 1) Load annotations
# AFFIX_CSV / AFFIX_DOCX are not defined anywhere above (only AFFIX_TXT is), so
# this cell used to fail with NameError. Keep any value the user has already
# set; otherwise look for a .csv / .docx file next to AFFIX_TXT.
# NOTE(review): the sibling-file locations are an assumption — confirm the real paths.
AFFIX_CSV = globals().get("AFFIX_CSV", AFFIX_TXT.with_suffix(".csv"))
AFFIX_DOCX = globals().get("AFFIX_DOCX", AFFIX_TXT.with_suffix(".docx"))

if AFFIX_CSV.exists():
    ann = _parse_csv(AFFIX_CSV)
elif AFFIX_DOCX.exists():
    try:
        ann = _parse_docx(AFFIX_DOCX)
    except ImportError as e:
        # Only a missing python-docx is translated; genuine parse errors
        # propagate unchanged instead of being reported as an install problem.
        raise RuntimeError("python-docx is required to read the DOCX. Install it or provide CSV.") from e
else:
    raise FileNotFoundError("Provide either AFFIX_CSV or AFFIX_DOCX")

# 2) Build maps: morpheme -> list[word]
# Each annotation record may carry a prefix label, a suffix label, both or neither;
# the term is appended to the list of every affix it is labelled with.
from collections import defaultdict

prefix_map = defaultdict(list)
suffix_map = defaultdict(list)

for r in ann:
    w = r["term"].strip()
    p = r.get("prefix","").strip()
    s = r.get("suffix","").strip()
    if p:
        prefix_map[p].append(w)
    if s:
        suffix_map[s].append(w)

# 3) Deduplicate & cap examples
def _dedupe_cap(d):
    out = {}
    for k, lst in d.items():
        seen = set()
        uniq = []
        for w in lst:
            wl = w.lower()
            if wl not in seen:
                seen.add(wl)
                uniq.append(w)
            if len(uniq) >= EXAMPLES_PER_MORPHEME:
                break
        out[k] = uniq
    return out

prefix_map = _dedupe_cap(prefix_map)
suffix_map = _dedupe_cap(suffix_map)

# 4) Final structure for your functions
# generate_morpheme_words reads MORPHEME_DATA['prefix_counts'] / ['suffix_counts'],
# so they must be present here too or that cell fails with KeyError.
# The annotation parsers strip the per-affix counts from the text, so the size of
# each (deduplicated, capped) example list is used as the frequency weight.
MORPHEME_DATA = {
    "prefixes": dict(prefix_map),
    "suffixes": dict(suffix_map),
    "prefix_counts": Counter({morph: len(words) for morph, words in prefix_map.items()}),
    "suffix_counts": Counter({morph: len(words) for morph, words in suffix_map.items()}),
}

# 5) Nice preview
def _peek(d, n=5):
    return {k: d[k][:min(len(d[k]),3)] for k in list(d.keys())[:n]}

print(f"[MORPHEME_DATA] prefixes={len(MORPHEME_DATA['prefixes'])} | suffixes={len(MORPHEME_DATA['suffixes'])}")
print("  sample prefixes:", _peek(MORPHEME_DATA["prefixes"]))
print("  sample suffixes:", _peek(MORPHEME_DATA["suffixes"]))

In [None]:
def find_morpheme_embedding(morpheme, model, tokenizer):
    """
    Find or approximate an input-embedding vector for a morpheme.

    Strategies, tried in order:
    1. The morpheme (hyphens stripped) is itself a vocabulary token:
       return a copy of its embedding row.
    2. Otherwise take up to five example words listed for the morpheme in
       MORPHEME_DATA (prefixes first, then suffixes) and average the
       embeddings of those that are single vocabulary tokens.
    3. Otherwise return a random vector scaled by the standard deviation of
       the whole embedding matrix.
    """
    weights = model.get_input_embeddings().weight.data
    unk_id = tokenizer.unk_token_id

    # Strategy 1: direct lookup
    direct_id = tokenizer.convert_tokens_to_ids(morpheme.strip('-'))
    if direct_id != unk_id:
        return weights[direct_id].clone()

    # Strategy 2: average over known example words
    for table in ('prefixes', 'suffixes'):
        if morpheme in MORPHEME_DATA[table]:
            examples = MORPHEME_DATA[table][morpheme][:5]
            break
    else:
        examples = []

    example_ids = (tokenizer.convert_tokens_to_ids(w.lower()) for w in examples)
    known_rows = [weights[i] for i in example_ids if i != unk_id]
    if known_rows:
        return torch.stack(known_rows).mean(dim=0)

    # Strategy 3: random vector at the scale of the existing embeddings
    return torch.randn(weights.shape[1], device=weights.device) * weights.std()

def compose_morpheme_embedding(prefix, root, suffix, model, tokenizer, alpha=None, noise_scale=None):
    """
    Create an embedding via composition:
        E(word) = α*E(prefix) + β*E(root) + γ*E(suffix) + noise
    with α = γ = ``alpha`` and β = 1 - 2*``alpha``.

    Args:
        prefix, suffix: morpheme strings, or a falsy value when absent (the
            corresponding term then contributes 0).
        root: root word; looked up in lower case as a single vocabulary token.
        model, tokenizer: the model whose input embeddings are read and its tokenizer.
        alpha: prefix/suffix weight. Defaults to the global COMPOSITION_ALPHA.
        noise_scale: noise magnitude relative to the composed vector's standard
            deviation. Defaults to the global MORPHEME_NOISE.

    Returns:
        1-D tensor with the embedding dimension of ``model``.

    NOTE(review): when prefix or suffix is absent the remaining weights no
    longer sum to 1, so the result is proportionally shorter — confirm this is
    intended.
    """
    if alpha is None:
        alpha = COMPOSITION_ALPHA
    if noise_scale is None:
        noise_scale = MORPHEME_NOISE
    beta, gamma = 1 - 2 * alpha, alpha

    emb_matrix = model.get_input_embeddings().weight.data

    # Get component embeddings
    prefix_emb = find_morpheme_embedding(prefix, model, tokenizer) if prefix else 0
    suffix_emb = find_morpheme_embedding(suffix, model, tokenizer) if suffix else 0

    # Root embedding
    root_id = tokenizer.convert_tokens_to_ids(root.lower())
    if root_id != tokenizer.unk_token_id:
        root_emb = emb_matrix[root_id]
    else:
        # Unknown root: small random vector created on the embedding matrix's own
        # device and dtype, so it always combines with the other components
        # (previously it depended on the global DEVICE and the default dtype).
        root_emb = torch.randn(
            emb_matrix.shape[1], device=emb_matrix.device, dtype=emb_matrix.dtype
        ) * 0.02

    # Compose
    composed = alpha * prefix_emb + beta * root_emb + gamma * suffix_emb

    # Add morpheme noise for diversity
    noise = torch.randn_like(composed) * noise_scale * composed.std()
    return composed + noise

Generate synthetic Wake words via morpheme combination

In [None]:
def generate_morpheme_words(n_samples=1000, morpheme_data=None):
    """
    Generate synthetic Wake words by combining morphemes.

    Each sample draws a root uniformly from a fixed list, attaches a prefix with
    probability 0.7 and a suffix with probability 0.8; affixes are drawn with
    frequency weighting.

    Args:
        n_samples: number of words to generate.
        morpheme_data: morpheme dictionary to draw from. Defaults to the global
            MORPHEME_DATA. Frequencies come from its 'prefix_counts' /
            'suffix_counts'; when those are missing or empty, the number of
            example words listed under 'prefixes' / 'suffixes' is used instead.

    Returns:
        list of dicts with keys 'word', 'prefix', 'root', 'suffix'
        ('prefix' / 'suffix' are None when not attached).
    """
    if morpheme_data is None:
        morpheme_data = MORPHEME_DATA

    def _weighted_morphemes(kind):
        """Return (morphemes, weights) for kind 'prefix' or 'suffix'."""
        counts = morpheme_data.get(f'{kind}_counts')
        if not counts:
            # Fall back to example-list sizes; at least 1 so weights stay positive.
            examples = morpheme_data.get(f'{kind}es', {})
            counts = {m: max(len(words), 1) for m, words in examples.items()}
        morphs = list(counts.keys())
        return morphs, [counts[m] for m in morphs]

    generated_words = []

    prefixes, prefix_weights = _weighted_morphemes('prefix')
    suffixes, suffix_weights = _weighted_morphemes('suffix')

    # Common roots from Wake
    roots = [
        'dream', 'river', 'thunder', 'word', 'night', 'day', 'wake', 'sleep',
        'fire', 'water', 'time', 'man', 'woman', 'king', 'queen', 'stone',
        'tree', 'moon', 'sun', 'star', 'wind', 'rain', 'storm', 'cloud',
        'book', 'letter', 'voice', 'sound', 'song', 'dance', 'walk', 'run'
    ]

    for _ in range(n_samples):
        # Sample with frequency weighting. An empty affix list simply yields no
        # affix instead of crashing inside random.choices.
        prefix = (random.choices(prefixes, weights=prefix_weights)[0]
                  if prefixes and random.random() < 0.7 else None)
        root = random.choice(roots)
        suffix = (random.choices(suffixes, weights=suffix_weights)[0]
                  if suffixes and random.random() < 0.8 else None)

        # Construct word: affix hyphens are notation only and are dropped
        word = f"{(prefix or '').strip('-')}{root}{(suffix or '').strip('-')}"

        generated_words.append({
            'word': word,
            'prefix': prefix,
            'root': root,
            'suffix': suffix
        })

    return generated_words

# Generate 500 synthetic words and print the first 20 with their decomposition
# (Ø marks an absent prefix or suffix).
print("\n" + "="*60)
print("GENERATING SYNTHETIC WAKE WORDS")
print("="*60)

synthetic_words = generate_morpheme_words(n_samples=500)
print(f"✓ Generated {len(synthetic_words)} morphological neologisms")
print("\nExamples:")
for w in synthetic_words[:20]:
    print(f"  {w['word']:20s} ({w['prefix'] or 'Ø'} + {w['root']} + {w['suffix'] or 'Ø'})")

token injection

In [None]:
def inject_morpheme_tokens(word_data, model, tokenizer):
    """
    Inject tokens with morpheme-aware compositional initialization.

    For every generated word, the bare form and its "▁"-prefixed form are added
    to the tokenizer if they are not already vocabulary tokens. The model's
    embedding matrix is resized and each new token's input embedding is set with
    ``compose_morpheme_embedding`` from the word's prefix/root/suffix.

    Args:
        word_data: list of dicts with keys 'word', 'prefix', 'root', 'suffix'.
        model: causal LM whose embeddings are resized and written in place.
        tokenizer: tokenizer that receives the new tokens (mutated in place).

    Returns:
        list of the distinct token strings submitted for addition (empty when
        nothing was new).
    """
    to_add = []
    compositions = []  # compositions[i] is the word record behind to_add[i]
    queued = set()

    for item in word_data:
        word = item['word']
        # Add both regular and ▁ prefixed versions
        for form in (word, f"▁{word}"):
            # word_data routinely repeats words; queue every form only once so
            # the two lists stay aligned with what the tokenizer really adds.
            if form in queued:
                continue
            if tokenizer.convert_tokens_to_ids(form) == tokenizer.unk_token_id:
                queued.add(form)
                to_add.append(form)
                compositions.append(item)

    if not to_add:
        print("⚠ No new tokens to add")
        return []

    n_added = tokenizer.add_tokens(to_add, special_tokens=False)
    model.resize_token_embeddings(len(tokenizer))

    print(f"✓ Injected {n_added} morpheme-composed tokens")

    # Initialize with compositional embeddings
    emb = model.get_input_embeddings().weight.data

    for form, item in zip(to_add, compositions):
        # Ask the tokenizer for the id it actually assigned. Assuming
        # "old embedding size + position" breaks as soon as a token is skipped
        # or the embedding matrix was larger than the tokenizer's vocabulary.
        token_idx = tokenizer.convert_tokens_to_ids(form)
        if token_idx is None or token_idx == tokenizer.unk_token_id:
            continue
        if token_idx >= emb.shape[0]:
            continue
        emb[token_idx] = compose_morpheme_embedding(
            item['prefix'],
            item['root'],
            item['suffix'],
            model,
            tokenizer
        )

    # Tie weights
    if hasattr(model, "tie_weights"):
        model.tie_weights()

    print(f"✓ Initialized embeddings via morpheme composition")
    print(f"  Formula: E(word) = {COMPOSITION_ALPHA}*E(prefix) + {1-2*COMPOSITION_ALPHA}*E(root) + {COMPOSITION_ALPHA}*E(suffix)")

    return to_add

# Only the first 200 of the generated words are injected as new vocabulary tokens.
added_tokens = inject_morpheme_tokens(synthetic_words[:200], model, tok)
print(f"✓ Vocabulary expanded: {len(tok)}")

## SYNTHETIC TRAINING DATA GENERATION

In [None]:
def generate_morpheme_sentences(word_data, per_word=5):
    """
    Build template sentences that place each synthetic word next to its root.

    For every record in ``word_data`` (dicts with 'word' and 'root'),
    ``per_word`` sentences are produced, each from a template picked at random
    with the global ``random`` module. Sentences are returned in record order.
    """
    templates = (
        "The {word} of {root} echoes through the Wake.",
        "By {word} and by {root}, the river flows.",
        "In the {word} of night, {root} speaks.",
        "From {root} to {word}, the tale unwinds.",
        "He spoke of {word} as if {root} remembered.",
        "{word} upon {word}, the {root} multiplies.",
        "Through {word} and beyond {root}, voices drift.",
        "The {word} contains the {root} contains the word.",
        "Call it {word}, call it {root}-become-language.",
        "Riverrun past {word} and {root} from swerve of shore.",
    )

    return [
        random.choice(templates).format(word=entry['word'], root=entry['root'])
        for entry in word_data
        for _ in range(per_word)
    ]

# Sentences are generated for the same first 200 words that were injected as tokens,
# then shuffled so sentences for one word are not adjacent.
synthetic_sentences = generate_morpheme_sentences(synthetic_words[:200], per_word=SYNTHETIC_PER_MORPHEME)
random.shuffle(synthetic_sentences)

print(f"\n✓ Generated {len(synthetic_sentences)} training sentences")
print("\nSample sentences:")
for s in synthetic_sentences[:5]:
    print(f"  {s}")

# Combine with original Wake text: the synthetic sentences are appended after the
# full corpus, one per line.
COMBINED_TEXT = FW_TEXT + "\n" + "\n".join(synthetic_sentences)
print(f"\n✓ Combined corpus: {len(COMBINED_TEXT)} chars")

# Save generated words for analysis (all generated words, not just the injected 200)
with open(OUTDIR / "results" / "generated_morpheme_words.json", "w") as f:
    json.dump(synthetic_words, f, indent=2)

Dataset preparation

In [None]:
from datasets import Dataset

def create_blocks(text, tokenizer, block_size):
    """Tokenize ``text`` and cut it into fixed-length training blocks.

    The text is tokenized in one call without special tokens, trailing tokens
    that do not fill a whole block are discarded, and the remainder is returned
    as a ``Dataset`` with one ``input_ids`` row of ``block_size`` ids per block.

    Raises:
        ValueError: if the text yields fewer than ``block_size`` tokens.
    """
    encoded = tokenizer(text, add_special_tokens=False, return_attention_mask=False)
    token_ids = encoded["input_ids"]

    block_count = len(token_ids) // block_size
    if not block_count:
        raise ValueError(f"Text too short for block_size={block_size}")

    usable = block_count * block_size
    grid = np.array(token_ids[:usable], dtype=np.int32).reshape(block_count, block_size)
    return Dataset.from_dict({"input_ids": grid.tolist()})

ds = create_blocks(COMBINED_TEXT, tok, BLOCK_SIZE)
print(f"✓ Created {len(ds)} blocks")

# Causal LM: labels are a copy of the inputs.
ds = ds.map(lambda x: {"labels": x["input_ids"]}, batched=True)

n = len(ds)
if n > 20:
    # COMBINED_TEXT is the Wake corpus followed by all synthetic sentences, so
    # a sequential split would send the synthetic sentences (and with them the
    # newly injected tokens) almost entirely to the validation tail. Shuffle
    # the blocks with the notebook seed before taking the 90/10 split.
    shuffled_ds = ds.shuffle(seed=SEED)
    split_idx = int(n * 0.9)
    train_ds = shuffled_ds.select(range(split_idx))
    valid = shuffled_ds.select(range(split_idx, n))
else:
    # Too few blocks for a meaningful hold-out set: train on everything.
    train_ds = ds
    valid = None

print(f"Train: {len(train_ds)}, Val: {len(valid) if valid else 0}")

In [None]:
training args

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import inspect

# Causal-LM collator (mlm=False).
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
has_eval = valid is not None and len(valid) > 0

# Check supported args for version compatibility: only keyword arguments that
# the installed TrainingArguments accepts are passed.
sig = inspect.signature(TrainingArguments.__init__)
supported = sig.parameters.keys()

args_dict = {
    "output_dir": str(OUTDIR / "checkpoints"),
    "num_train_epochs": EPOCHS,
    "per_device_train_batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRAD_ACCUM,
    "learning_rate": LR,
    "weight_decay": WEIGHT_DECAY,
    "warmup_ratio": WARMUP_RATIO,
    "logging_steps": 20,
    "fp16": False,
    "seed": SEED,
}

if "save_strategy" in supported:
    args_dict.update({"save_strategy": "steps", "save_steps": SAVE_STEPS, "save_total_limit": 2})
else:
    args_dict["save_steps"] = SAVE_STEPS

# The evaluation switch has had three names across transformers releases.
# Newest first: `eval_strategy`, then `evaluation_strategy`, then the legacy
# boolean `evaluate_during_training`. Without the first branch, recent releases
# would silently train with no evaluation even though a validation set exists.
if "eval_strategy" in supported:
    args_dict["eval_strategy"] = "steps" if has_eval else "no"
    if has_eval: args_dict["eval_steps"] = SAVE_STEPS
elif "evaluation_strategy" in supported:
    args_dict["evaluation_strategy"] = "steps" if has_eval else "no"
    if has_eval: args_dict["eval_steps"] = SAVE_STEPS
elif "evaluate_during_training" in supported:
    args_dict["evaluate_during_training"] = has_eval
    if has_eval and "eval_steps" in supported: args_dict["eval_steps"] = SAVE_STEPS

if "report_to" in supported: args_dict["report_to"] = ["none"]
if "bf16" in supported: args_dict["bf16"] = False
if "remove_unused_columns" in supported: args_dict["remove_unused_columns"] = False
if "lr_scheduler_type" in supported: args_dict["lr_scheduler_type"] = "cosine"
if "per_device_eval_batch_size" in supported: args_dict["per_device_eval_batch_size"] = BATCH_SIZE

training_args = TrainingArguments(**args_dict)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=train_ds,
    eval_dataset=valid if has_eval else None,
)

print(f"Trainer ready")

Pre-training snapshot

In [None]:
def get_embedding_snapshot(words, model, tokenizer, name="snapshot"):
    """Record, for up to the first 50 words, each word's nearest embedding neighbours.

    Args:
        words: list of dicts with keys 'word', 'prefix', 'root', 'suffix'.
        model: model whose input embedding matrix is inspected (put in eval mode).
        tokenizer: tokenizer used to map words to ids and ids back to tokens.
        name: label stored in the snapshot.

    Returns:
        dict ``{"name", "vocab_size", "words"}`` where ``words`` maps each word
        that is a single vocabulary token to its token id, its composition
        string, its embedding norm and up to 10 nearest neighbours by cosine
        similarity (the word itself excluded). Words unknown to the tokenizer
        are skipped.
    """
    model.eval()
    emb_matrix = model.get_input_embeddings().weight.data
    # Clamp the norms so an all-zero row gives similarity 0 instead of NaN;
    # NaN would otherwise be ranked as the top neighbour of every word.
    row_norms = emb_matrix.norm(dim=1, keepdim=True)
    emb_norm = emb_matrix / row_norms.clamp_min(torch.finfo(emb_matrix.dtype).eps)

    n_neighbors = 10
    # One extra slot for the word itself; never request more than the vocabulary holds.
    k = min(n_neighbors + 1, emb_norm.shape[0])

    snapshot = {"name": name, "vocab_size": len(tokenizer), "words": {}}

    for word_item in words[:50]:
        word = word_item['word']
        tid = tokenizer.convert_tokens_to_ids(word)
        if tid == tokenizer.unk_token_id:
            continue

        sims = torch.matmul(emb_norm[tid].unsqueeze(0), emb_norm.T)[0]
        top_k = torch.topk(sims, k)

        neighbors = []
        for idx, sim in zip(top_k.indices.tolist(), top_k.values.tolist()):
            # Exclude the word by id rather than assuming it is ranked first.
            if idx == tid:
                continue
            neighbors.append({
                "token": tokenizer.convert_ids_to_tokens(idx),
                "sim": round(sim, 4)
            })

        snapshot["words"][word] = {
            "token_id": tid,
            "composition": f"{word_item['prefix'] or 'Ø'}+{word_item['root']}+{word_item['suffix'] or 'Ø'}",
            "embedding_norm": round(emb_matrix[tid].norm().item(), 4),
            "top_neighbors": neighbors[:n_neighbors]
        }

    return snapshot

# Snapshot the embedding neighbourhoods before training and persist them as JSON
# so they can be compared with the post-training snapshot.
print("\n" + "="*60)
print("PRE-TRAINING SNAPSHOT")
print("="*60)

pre_snapshot = get_embedding_snapshot(synthetic_words, model, tok, "pre_morpheme")
with open(OUTDIR / "results" / "pre_morpheme_snapshot.json", "w") as f:
    json.dump(pre_snapshot, f, indent=2)

print(f"✓ Pre-training snapshot: {len(pre_snapshot['words'])} words")

train

In [None]:
print("\n" + "="*60)
print("TRAINING: TEACHING JOYCE'S MORPHOLOGICAL GRAMMAR")
print("="*60 + "\n")

result = trainer.train()

print("\n" + "="*60)
print("TRAINING COMPLETE")
print("="*60)
# Apply the numeric format only to a real number: formatting the 'N/A'
# placeholder with ':.4f' raises ValueError when 'train_loss' is absent.
train_loss = result.metrics.get('train_loss')
if isinstance(train_loss, (int, float)):
    print(f"Final loss: {train_loss:.4f}")
else:
    print("Final loss: N/A")

post training analysis

In [None]:
# Post-training analysis: take a second snapshot with the same word list, compare
# it with the pre-training one, then save the comparison and the final model.
print("\n" + "="*60)
print("POST-TRAINING ANALYSIS")
print("="*60)

post_snapshot = get_embedding_snapshot(synthetic_words, model, tok, "post_morpheme")
with open(OUTDIR / "results" / "post_morpheme_snapshot.json", "w") as f:
    json.dump(post_snapshot, f, indent=2)

# Compare the first 10 words of the pre-training snapshot that also appear in the
# post-training snapshot.
comparison = {}
for word in list(pre_snapshot["words"].keys())[:10]:
    if word not in post_snapshot["words"]:
        continue

    pre = pre_snapshot["words"][word]
    post = post_snapshot["words"][word]

    # Overlap = how many of the 5 nearest neighbours are shared before and after
    # training (compared as sets, so order is ignored).
    pre_neighbors = {n["token"] for n in pre["top_neighbors"][:5]}
    post_neighbors = {n["token"] for n in post["top_neighbors"][:5]}
    overlap = len(pre_neighbors & post_neighbors)

    comparison[word] = {
        "composition": pre["composition"],
        "norm_change": post["embedding_norm"] - pre["embedding_norm"],
        "neighbor_overlap": overlap,
        "pre_top5": [n["token"] for n in pre["top_neighbors"][:5]],
        "post_top5": [n["token"] for n in post["top_neighbors"][:5]]
    }

    print(f"\n{word} ({pre['composition']}):")
    print(f"  Norm: {pre['embedding_norm']:.4f} → {post['embedding_norm']:.4f}")
    print(f"  Overlap: {overlap}/5")
    print(f"  Before: {', '.join(comparison[word]['pre_top5'])}")
    print(f"  After:  {', '.join(comparison[word]['post_top5'])}")

with open(OUTDIR / "results" / "morpheme_comparison.json", "w") as f:
    json.dump(comparison, f, indent=2)

# Save final model together with the extended tokenizer, so the added tokens and
# the resized embedding matrix stay in sync when reloaded.
final_dir = OUTDIR / "final_morpheme_model"
model.save_pretrained(final_dir)
tok.save_pretrained(final_dir)

print(f"\n Model saved: {final_dir}")
print(f" Results: {OUTDIR / 'results'}")
print("\n maybe it learned a thing or two")