In [None]:
# (minimal role + lookup tables)

from pathlib import Path
import json, hashlib, numpy as np, spacy, torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import srl  # from liaad wrapper

# ─── deterministic seed everywhere ────────────────────────────────────────────
SEED = 42
# Seed NumPy's global RNG and torch's default generator. The draws below consume
# the NumPy stream in a fixed order (ROLE_VECTORS first, then JL_MATRIX), so the
# statement order in this cell is part of the reproducibility contract.
np.random.seed(SEED)
torch.manual_seed(SEED)

# ─── constants & helper artefacts (tiny demo versions) ───────────────────────
HD_DIM        = 4096   # length of every role / filler hypervector
EMBED_DIM     = 768    # length of the dense vectors fed to the projection

ROLES         = ["Predicate", "Subject", "Object", "IndirectObject", "Attr", "Tense",
                 "VerbClass", "Type", "Source", "Date", "Venue", "CHV"]
# One random ±1 int8 vector per role, drawn in ROLES order.
ROLE_VECTORS  = {r: np.random.choice([-1, 1], HD_DIM).astype(np.int8) for r in ROLES}

# Random Gaussian projection matrix (throw-away demo), scaled by 1/sqrt(EMBED_DIM).
# The divisor is wrapped in np.float32 on purpose: np.sqrt returns an np.float64
# scalar, and dividing a float32 array by it promotes the whole matrix to float64
# under NumPy >= 2 promotion rules, undoing the astype(np.float32) above it.
JL_MATRIX     = (np.random.randn(HD_DIM, EMBED_DIM).astype(np.float32)
                 / np.float32(np.sqrt(EMBED_DIM)))

# Alias table consulted by resolve_alias(); the demo entries map each key to itself.
RESOLVE_DICT  = {"i": "i", "white": "white", "dog": "dog", "chase": "chase",
                 "cat": "cat", "quickly": "quickly", "backyard": "backyard"}

def jl_project(vec_768: np.ndarray) -> np.ndarray:
    """Project a dense vector through JL_MATRIX and binarise it to ±1.

    Parameters
    ----------
    vec_768 : np.ndarray
        Dense vector whose length matches the column count of JL_MATRIX.

    Returns
    -------
    np.ndarray
        int8 vector with one entry per row of JL_MATRIX, every entry +1 or -1.

    Notes
    -----
    A projection that is exactly zero (e.g. an all-zero input) maps to +1.
    np.sign would return 0 there, which is not a valid ±1 component.
    """
    projected = JL_MATRIX @ vec_768
    return np.where(projected >= 0, 1, -1).astype(np.int8)


In [None]:
# (load spaCy + SRL once)

# spaCy transformer pipeline, loaded with the "ner" component excluded
# (only tokenizer / POS / DEP are wanted from it).
nlp = spacy.load("en_core_web_trf", exclude=["ner"])
# NOTE(review): spaCy documents max_length as a limit in characters, not tokens —
# confirm that 512 characters is the intended cap.
nlp.max_length = 512

# SRL pipeline (english): tokenizer and token-classification model are both
# loaded from the same "liaad/srl-en_xlmr-large" checkpoint name.
srl_tokenizer = AutoTokenizer.from_pretrained("liaad/srl-en_xlmr-large")
srl_model     = AutoModelForTokenClassification.from_pretrained("liaad/srl-en_xlmr-large")
# `srl` is the wrapper module imported in the first cell; presumably batch=1
# means one sentence per forward pass — TODO confirm against the wrapper.
srl_pipe      = srl.SRL_Predictor(model=srl_model, tokenizer=srl_tokenizer, batch=1)


In [None]:
# (ultra-thin deterministic extractor)

def resolve_alias(txt: str) -> str:
    """Return the canonical alias for `txt`.

    The text is lower-cased and stripped of surrounding whitespace, then looked
    up in RESOLVE_DICT; when no entry exists the normalised text itself is
    returned.
    """
    normalised = txt.lower().strip()
    if normalised in RESOLVE_DICT:
        return RESOLVE_DICT[normalised]
    return normalised

def simple_rule_map(frame):
    """
    Translate one SRL tag into a graph role name.

    `frame` is a mapping with a "tag" entry. Only a handful of begin-of-span
    ("B-") labels are recognised in this demo:

        B-ARG0            -> "Subject"
        B-ARG1 / B-ARG2   -> "Object"
        B-ARGM-LOC*       -> "IndirectObject"
        B-ARGM-MNR*       -> "Attr"

    Every other tag yields None, which callers treat as "skip this token".
    """
    tag = frame["tag"]

    # Labels that must match exactly.
    exact_rules = (
        ("B-ARG0", "Subject"),
        ("B-ARG1", "Object"),
        ("B-ARG2", "Object"),
    )
    for wanted, role_name in exact_rules:
        if tag == wanted:
            return role_name

    # Labels matched by prefix.
    prefix_rules = (
        ("B-ARGM-LOC", "IndirectObject"),
        ("B-ARGM-MNR", "Attr"),
    )
    for prefix, role_name in prefix_rules:
        if tag.startswith(prefix):
            return role_name

    return None  # everything else is ignored in the demo

def encode_chv(tuples, gamma=0.15):
    """Bundle role-bound filler vectors into one composite hypervector.

    Implements Eq. (1) in the checklist using majority-vote bundling: every
    (role, f_vec) pair contributes

        (1 - gamma) * (ROLE_VECTORS[role] * f_vec) + gamma * f_vec

    and the summed contributions are reduced to their sign.

    Parameters
    ----------
    tuples : iterable of (role, f_vec)
        `role` must be a key of ROLE_VECTORS; `f_vec` is expected to be a ±1
        vector of length 4096.
    gamma : float
        Weight of the unbound filler relative to the role-bound term.

    Returns
    -------
    np.ndarray
        int8 vector of length 4096. Entries are +1 or -1; an exact tie resolves
        to +1. With no input tuples the all-zero vector is returned.

    Raises
    ------
    KeyError
        If a role is not present in ROLE_VECTORS.
    """
    # The two sums are kept separate and weighted once at the end. Casting each
    # weighted term to an integer would truncate the ±(1 - 2*gamma) values to 0,
    # silently dropping those votes and making gamma irrelevant.
    bound_sum = np.zeros(4096, dtype=np.float64)
    filler_sum = np.zeros(4096, dtype=np.float64)
    saw_any = False
    for role, f_vec in tuples:                     # f_vec already ±1
        bound_sum += ROLE_VECTORS[role] * f_vec
        filler_sum += f_vec
        saw_any = True

    if not saw_any:
        # Nothing to bundle: keep the neutral all-zero vector.
        return np.zeros(4096, dtype=np.int8)

    total = (1 - gamma) * bound_sum + gamma * filler_sum
    return np.where(total >= 0, 1, -1).astype(np.int8)

def _token_spans(sentence, words):
    """Return one (char_start, char_end) pair per word, scanning left to right."""
    spans, cursor = [], 0
    for word in words:
        start = sentence.find(word, cursor)
        if start < 0:
            # Not found after the cursor: fall back to a search from the start.
            start = sentence.find(word)
        if start < 0:
            spans.append((-1, -1))        # token text does not occur verbatim
            continue
        end = start + len(word)
        spans.append((start, end))
        cursor = max(cursor, end)
    return spans


def _predicate_span(sentence, predicate, words, tags, spans):
    """Pick the character span of a frame's predicate from its token spans."""
    located = [(tag, span) for word, tag, span in zip(words, tags, spans)
               if word == predicate and span[0] >= 0]
    # Assumes the tagger marks the verb token with a "...-V" label, in line with
    # the "B-ARG0"-style tags handled elsewhere — TODO confirm against the wrapper.
    for tag, span in located:
        if tag.endswith("-V"):
            return span
    if located:
        return located[0][1]
    start = sentence.find(predicate)
    return (start, start + len(predicate)) if start >= 0 else (-1, -1)


def extract(sentence: str, meta: dict, doc_id="DOC1", sent_id="0001"):
    """Turn one sentence into a node/edge payload plus its composite hypervector.

    Parameters
    ----------
    sentence : str
        Raw sentence text, passed to both `nlp` and `srl_pipe.predict`.
    meta, doc_id, sent_id
        Accepted for interface compatibility; not read by this function.

    Returns
    -------
    (payload, chv)
        `payload` is a dict with "version", "sentence", "nodes", "edges" and
        "layouts". `chv` is the result of encode_chv() over every
        (role, filler_vec) pair collected from the SRL frames.

    Notes
    -----
    * Character offsets come from a left-to-right scan over each frame's words,
      so a short token is no longer matched inside an earlier word and repeated
      words get distinct offsets. A token that does not occur verbatim in
      `sentence` gets char_start = char_end = -1 instead of raising ValueError.
    * Only tokens for which simple_rule_map() returns a role become nodes.
    * When the same node id is hit again within a frame with a different role,
      the role is appended to that node's "roles" list.
    """
    # NOTE(review): the spaCy parse is not consumed yet. The call is kept so the
    # function still runs the sentence through `nlp` exactly as before — confirm
    # whether it can be dropped.
    nlp(sentence)
    frames = srl_pipe.predict(sentence)["verbs"]

    nodes, edges, tuples_for_chv = [], [], []
    nodes_by_id = {}                      # id -> first node carrying that id

    for eid_counter, verb_frame in enumerate(frames, start=1):
        predicate = verb_frame["verb"]
        args      = verb_frame["tags"]
        words     = verb_frame["words"]
        eid       = f"e{eid_counter}"
        spans     = _token_spans(sentence, words)

        # create predicate node
        pred_id = f"spo:{predicate}@{eid}"
        pred_start, pred_end = _predicate_span(sentence, predicate, words, args, spans)
        pred_node = {
            "id": pred_id, "filler": predicate, "alias_key": resolve_alias(predicate),
            "roles": ["Predicate"], "eid_set": [eid], "ntype": "spo",
            "char_start": pred_start, "char_end": pred_end
        }
        nodes.append(pred_node)
        nodes_by_id.setdefault(pred_id, pred_node)

        # walk over labelled tokens
        for w, tag, (w_start, w_end) in zip(words, args, spans):
            role = simple_rule_map({"tag": tag})
            if role is None:
                continue
            node_id = f"spo:{w}@{eid}"
            node = nodes_by_id.get(node_id)
            if node is None:
                node = {
                    "id": node_id, "filler": w, "alias_key": resolve_alias(w),
                    "roles": [role], "eid_set": [eid], "ntype": "spo",
                    "char_start": w_start, "char_end": w_end
                }
                nodes.append(node)
                nodes_by_id[node_id] = node
            elif role not in node["roles"]:
                node["roles"].append(role)

            # edges
            if role == "Subject":
                edges.append({"source": node_id, "target": pred_id, "kind": "S-P"})
            elif role == "Object":
                edges.append({"source": pred_id, "target": node_id, "kind": "P-O"})
            else:
                edges.append({"source": node_id, "target": pred_id, "kind": "attr"})

            # collect CHV pieces
            filler_vec = jl_project(np.random.randn(768))   # placeholder! use real embed
            tuples_for_chv.append((role, filler_vec))

    # CHV anchor node
    nodes.append({"id":"chv:main","filler":"CHV","roles":["CHV"],"eid_set":[],"ntype":"chv"})
    payload = {
        "version":"2.4-demo",
        "sentence": sentence,
        "nodes": nodes,
        "edges": edges,
        "layouts": {"hulls":[]}
    }
    chv = encode_chv(tuples_for_chv)
    return payload, chv


In [None]:
# (run the example sentence)

# Demo input pushed through the full extract() pipeline defined above.
test_sent = "I saw a white dog chase the brown cat quickly in the backyard."
# NOTE(review): extract() accepts `meta` but its body never reads it, so these
# values do not reach the payload — confirm whether that is intended.
meta      = {"date":"2025-07-05","source":"Book_X"}
payload, chv = extract(test_sent, meta)

# The slice keeps the first 1000 characters of the serialised payload.
print(json.dumps(payload, indent=2, ensure_ascii=False)[:1000])  # first 1000 chars
# The "∈ {-1,+1}" suffix is printed text only; nothing here checks chv's values.
print("CHV shape:", chv.shape, "∈ {-1,+1}")