In [1]:
# %% [markdown]
"""
# SRL → Tuple Extraction Pipeline (Transformer‑SRL + spaCy)

Multi‑cell notebook to generate subject–predicate–object tuples, event IDs, and
VSA‑friendly nodes/edges from a sentence. Uses `transformer-srl` in place of
AllenNLP. Each cell prints intermediate structures for rapid diagnosis.
"""

# %%
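# Install once per environment (GPU optional).
# Note: uncomment the next two lines when running in a fresh kernel; without
# `transformer-srl` installed, the import cell below fails with ModuleNotFoundError.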
# !pip install -q transformer-srl spacy==3.7.2 torch jsonschema
# !python -m spacy download en_core_web_sm

# %%
import json, re, uuid, hashlib
from pathlib import Path
from collections import defaultdict

import random, numpy as np, torch
import spacy
from transformer_srl import predictors

# ── deterministic seeds ─────────────────────────────────────────
# A single seed is pushed into Python's `random`, NumPy's global RNG and PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Seeds set → {SEED}")

# %%
# ── load NLP models once ───────────────────────────────────────
# The lemmatizer is deliberately left enabled: the node and edge cells build
# predicate IDs from `token.lemma_`, which is an empty string when that
# component is disabled (every predicate would then collapse to the "unk" slug).
NLP = spacy.load("en_core_web_sm", disable=["ner", "textcat"])
print("spaCy model loaded → en_core_web_sm")

MODEL_PATH = "srl_bert_base_conll2012.tar.gz"  # download once and cache
if not Path(MODEL_PATH).exists():
    import urllib.request
    print("Downloading SRL model (≈420 MB)…")
    # NOTE(review): this URL is AllenNLP's public BERT SRL archive, while the
    # cached file name refers to a transformer-srl model — confirm that
    # SrlTransformersPredictor can actually load this archive.
    url = (
        "https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.12.15.tar.gz"
    )
    # Download to a sibling ".part" file and rename only on success, so an
    # interrupted transfer never leaves a truncated archive that would pass
    # the exists() check above on the next run.
    partial_path = Path(MODEL_PATH + ".part")
    try:
        urllib.request.urlretrieve(url, str(partial_path))
        partial_path.replace(MODEL_PATH)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up after a failed or interrupted download.
        if partial_path.exists():
            partial_path.unlink()
    print("✓ Downloaded →", MODEL_PATH)

SRL = predictors.SrlTransformersPredictor.from_path(MODEL_PATH)
print("Transformer‑SRL predictor ready →", SRL)

# %% [markdown]
"""
## 1. Input sentence
Modify `SENTENCE` below and re‑run from here to inspect pipeline outputs.
"""

# %%
# The single input sentence that every downstream cell analyses.
SENTENCE = "I saw a white dog chase the brown cat quickly in the backyard."
print(f"Sentence → {SENTENCE}")

# %%
# ── spaCy tokenisation + dependencies ──────────────────────────
doc = NLP(SENTENCE)

# TOKENS holds the surface forms; DEPS holds one
# (text, dependency label, head text, token index, head index) row per token.
TOKENS = []
DEPS = []
for token in doc:
    TOKENS.append(token.text)
    DEPS.append((token.text, token.dep_, token.head.text, token.i, token.head.i))

print("Tokens (spaCy):", TOKENS)
print("Dependency triples (token, dep, head):")
for word, label, governor, _tok_i, _head_i in DEPS:
    print(f"  {word:>10} ─{label:<8}→ {governor}")

# %%
# ── Transformer‑SRL frames ─────────────────────────────────────
# FRAMES is the predictor's "verbs" list; this notebook reads the "verb",
# "description" and "tags" keys of each entry.
srl_out = SRL.predict(sentence=SENTENCE)
FRAMES = srl_out["verbs"]

print("SRL Frames (predicate → role tags):")
for frame in FRAMES:
    predicate, summary = frame["verb"], frame["description"]
    print(f"{predicate:<8} → {summary}")

# %%
# ── Role dictionaries ──────────────────────────────────────────
# Dependency label → coarse tuple role.
DEP2ROLE = {
    "nsubj": "Subject", "csubj": "Subject",
    "dobj": "Object", "obj": "Object",
    "iobj": "IndirectObject",
    "advmod": "Attr", "amod": "Attr",
}
# SRL label → coarse tuple role. Modifier labels are listed under both
# spellings: the CoNLL-2005 form ("AM-TMP") and the CoNLL-2012 / PropBank form
# ("ARGM-TMP"). Without the ARGM-* keys, modifier spans from a CoNLL-2012 model
# are silently dropped.
SRL2ROLE = {
    "ARG0": "Subject", "ARG1": "Object", "ARG2": "IndirectObject",
    "AM-TMP": "Time", "AM-LOC": "IndirectObject", "AM-MNR": "Attr",
    "ARGM-TMP": "Time", "ARGM-LOC": "IndirectObject", "ARGM-MNR": "Attr",
}

# Build proto‑tuples from dependencies + SRL
proto_tuples = []

# Pass 1: single-token tuples straight from the dependency parse.
for tok, dep, head, ti, hi in DEPS:
    if dep in DEP2ROLE:
        proto_tuples.append({"role": DEP2ROLE[dep], "span": (ti, ti+1), "text": tok})

# Pass 2: multi-token tuples from BIO-tagged SRL spans.
# The tag positions index the predictor's own token list, so span text is taken
# from there when available.
# NOTE(review): assumes the predictor output has a "words" list aligned with
# each frame's "tags"; falls back to the spaCy tokens otherwise — confirm.
srl_words = srl_out.get("words", TOKENS)
for vf in FRAMES:
    tags = vf["tags"]
    for idx, tag in enumerate(tags):
        if not tag.startswith("B-"):
            continue
        role_tag = tag[2:]
        if role_tag not in SRL2ROLE:
            continue
        # Extend the span only over continuation tags of the SAME role, so a
        # stray "I-" tag of another role cannot be absorbed into this span.
        end = idx
        while end+1 < len(tags) and tags[end+1] == f"I-{role_tag}":
            end += 1
        span_text = " ".join(srl_words[idx:end+1])
        proto_tuples.append({"role": SRL2ROLE[role_tag],
                             "span": (idx, end+1),
                             "text": span_text})

print("Proto‑tuples →", len(proto_tuples))
for t in proto_tuples:
    print(t)

# %%
# ── Event detection: finite verbs as event anchors ─────────────
# Every VERB token whose VerbForm is not exactly ["Inf"] gets an event id
# "e1", "e2", … in token order; `eids` maps token index → event id.
eids, eid_counter = {}, 0
for tok in doc:
    if tok.pos_ == "VERB" and tok.morph.get("VerbForm") != ["Inf"]:
        eid_counter += 1
        eids[tok.i] = f"e{eid_counter}"
print("EIDs detected →", eids)


def _governing_eid(start, end, default="e1"):
    """Return the event id of the nearest event verb governing ``doc[start:end]``.

    The search starts at the syntactic root of the span and climbs the
    dependency heads until a token registered in ``eids`` is found. Checking
    only the immediate head of the first span token would miss the verb for
    multi-token spans (whose first token is often a determiner) and for
    modifiers attached to a noun.

    Args:
        start: Index of the first token of the span.
        end: Index one past the last token of the span.
        default: Event id returned when no governing event verb exists.

    Returns:
        The event id string of the nearest governing event verb, or ``default``.
    """
    tok = doc[start:end].root
    # spaCy marks the sentence root as its own head, which terminates the climb.
    while tok.head.i != tok.i:
        tok = tok.head
        if tok.i in eids:
            return eids[tok.i]
    # Reached the sentence root without a hit; accept the root itself if it is
    # an event anchor (covers a span whose own root is the sentence root).
    return eids.get(tok.i, default)


# Attach eid to tuples (nearest governing verb)
for t in proto_tuples:
    t["eid"] = _governing_eid(*t["span"])

# %%
# ── Node construction (CHV schema) ─────────────────────────────

def slug(text):
    """Reduce ``text`` to an ID-safe slug.

    The text is lower-cased, every character outside ``a-z0-9`` is removed and
    the result is cut to 32 characters. If nothing is left, "unk" is returned.
    """
    kept = re.sub(r"[^a-z0-9]+", "", text.lower())
    truncated = kept[:32]
    return truncated if truncated else "unk"

# `nodes` is the ordered node list of the payload; `node_index` keeps its
# original id → id shape, while `node_by_id` gives direct access to the node
# dict so merging roles does not need a linear scan over `nodes`.
nodes, node_index = [], {}
node_by_id = {}
for tup in proto_tuples:
    base = f"spo:{slug(tup['text'])}@{tup['eid']}"
    existing = node_by_id.get(base)
    if existing is None:
        node = {
            "id": base,
            "filler": tup["text"],
            "roles": [tup["role"]],
            "eid_set": [tup["eid"]],
            "ntype": "spo",
        }
        node_index[base] = base
        node_by_id[base] = node
        nodes.append(node)
    elif tup["role"] not in existing["roles"]:
        # Same filler within the same event: accumulate the extra role.
        existing["roles"].append(tup["role"])

# explicit predicate nodes
action_nodes = []
for tok_i, eid in eids.items():
    pred = doc[tok_i].lemma_
    n_id = f"spo:{slug(pred)}@{eid}"
    existing = node_by_id.get(n_id)
    if existing is None:
        node = {
            "id": n_id,
            "filler": pred,
            "roles": ["Predicate"],
            "eid_set": [eid],
            "ntype": "spo",
        }
        node_index[n_id] = n_id
        node_by_id[n_id] = node
        action_nodes.append(node)
    elif "Predicate" not in existing["roles"]:
        # An argument node already owns this id; record the predicate role on
        # it instead of silently dropping the predicate.
        existing["roles"].append("Predicate")

nodes.extend(action_nodes)

# event stubs
# dict.fromkeys de-duplicates while keeping insertion order; iterating a set
# of strings would make the node order vary between kernel restarts.
for eid in dict.fromkeys(eids.values()):
    nodes.append({"id": f"evt:{eid}", "filler": eid,
                  "roles": ["Event"], "eid_set": [eid], "ntype": "event"})

# single CHV hub
nodes.append({"id": "chv:main", "filler": "CHV",
              "roles": ["CHV"], "eid_set": [], "ntype": "chv"})

print("Total nodes →", len(nodes))

# %%
# ── Edge construction ─────────────────────────────────────────
edges = []

def add_edge(src, tgt, kind):
    """Append a directed edge to the module-level ``edges`` list.

    Exact duplicates (same source, target and kind) are skipped; they arise
    when the dependency pass and the SRL pass yield the same tuple.

    Args:
        src: Id of the source node.
        tgt: Id of the target node.
        kind: Edge label, e.g. "S-P", "P-O", "event-pred" or "binder".
    """
    edge = {"source": src, "target": tgt, "kind": kind}
    if edge not in edges:
        edges.append(edge)

# Subject/Object edges are anchored on the predicate node that actually exists
# for the tuple's event, looked up from `nodes`. Rebuilding the id from the
# head of the first span token can name a non-predicate token (e.g. a noun
# heading a determiner) and leave the edge pointing at a missing node.
pred_by_eid = {}
for n in nodes:
    if "Predicate" in n["roles"]:
        for eid in n["eid_set"]:
            pred_by_eid.setdefault(eid, n["id"])

for tup in proto_tuples:
    pred_id = pred_by_eid.get(tup["eid"])
    if pred_id is None:
        # No predicate node for this event, so there is nothing to attach to.
        continue
    arg_id = f"spo:{slug(tup['text'])}@{tup['eid']}"
    if tup["role"] == "Subject":
        add_edge(arg_id, pred_id, "S-P")
    elif tup["role"] == "Object":
        add_edge(pred_id, arg_id, "P-O")

# One edge per event, in event-creation order (a set of strings has no stable
# iteration order across kernel restarts).
# NOTE(review): the edge kind is "event-pred" but the target is the first
# Subject node of the event — confirm this is the intended target.
for eid in dict.fromkeys(eids.values()):
    subj_nodes = [n for n in nodes if "Subject" in n["roles"] and eid in n["eid_set"]]
    if subj_nodes:
        add_edge(f"evt:{eid}", subj_nodes[0]["id"], "event-pred")

# Bind the last Object node of the smallest event id to the CHV hub.
# `default=None` keeps the cell from raising when no event was detected.
outer_eid = min(eids.values(), default=None)
last_obj_nodes = [n["id"] for n in nodes if "Object" in n["roles"] and outer_eid in n["eid_set"]]
if last_obj_nodes:
    add_edge(last_obj_nodes[-1], "chv:main", "binder")

print("Total edges →", len(edges))

# %%
# ── Hull construction (visual grouping) ───────────────────────
# One hull per event, listing the ids of its "spo" nodes. Events are visited
# in creation order: iterating a set of strings would make the hull order
# depend on string hash randomisation and differ between kernel restarts.
hulls = []
for eid in dict.fromkeys(eids.values()):
    members = [n["id"] for n in nodes if eid in n["eid_set"] and n["ntype"] == "spo"]
    if members:
        hulls.append({"eid": eid, "members": members})
print("Hulls →", hulls)

# %%
# ── Payload assembly & JSON Schema check ─────────────────────

# Final export structure: the sentence plus the graph built in the cells above.
payload = {
    "version": "2.1",
    "sentence": SENTENCE,
    "nodes": nodes,
    "edges": edges,
    "layouts": {"hulls": hulls},
}

print(json.dumps(payload, indent=2, ensure_ascii=False))

# Validation is optional: it only runs when a schema file sits next to the
# notebook, so jsonschema is imported lazily inside the branch.
SCHEMA_PATH = "schema.json"
if Path(SCHEMA_PATH).exists():
    from jsonschema import Draft202012Validator
    # Context manager closes the handle deterministically; explicit UTF-8
    # avoids depending on the platform default encoding.
    with open(SCHEMA_PATH, encoding="utf-8") as schema_file:
        SCHEMA = json.load(schema_file)
    Draft202012Validator.check_schema(SCHEMA)
    Draft202012Validator(SCHEMA).validate(payload)
    print("✅  Payload is schema‑valid")
else:
    print("⚠️  schema.json not found ‑‑ skipping validation")


ModuleNotFoundError: No module named 'transformer_srl'