In [1]:
import os
cwd =  os.getcwd().replace("notebooks/research","")
os.chdir(cwd)

In [2]:
# --- Robust notebook shim for legacy joblib artifacts expecting `encoders.*` ---
import sys, types, numpy as np

# Create/replace a lightweight 'encoders' module in sys.modules
enc_mod = types.ModuleType("encoders")

try:
    from sentence_transformers import SentenceTransformer
except Exception as e:
    SentenceTransformer = None
    print("NOTE: sentence-transformers not available:", e)

class _SBERTBase:
    """
    Compat shim implementing the sklearn Transformer API expected by saved Pipelines.
    Handles pickles that don't call __init__ and are missing attributes.
    Provides both class names: SBERTEncoder and SBERTFeaturizer.
    """
    # NOTE: __init__ might not be called during unpickle; use _ensure_attrs() everywhere.
    def __init__(self, model="sentence-transformers/all-MiniLM-L6-v2", **kwargs):
        self.model_name = model
        self._enc = None
        self._kwargs = kwargs

    def _ensure_attrs(self):
        # Add any attributes that might be missing from legacy pickles
        if not hasattr(self, "model_name") or self.model_name is None:
            self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
        if not hasattr(self, "_enc"):
            self._enc = None
        if not hasattr(self, "_kwargs"):
            self._kwargs = {}

    def _ensure_encoder(self):
        self._ensure_attrs()
        if self._enc is None:
            if SentenceTransformer is None:
                raise RuntimeError(
                    "sentence-transformers not installed in this kernel; "
                    "pip install sentence-transformers && restart kernel"
                )
            self._enc = SentenceTransformer(self.model_name)

    # sklearn API
    def fit(self, X, y=None):
        self._ensure_attrs()
        return self

    def transform(self, X):
        self._ensure_encoder()
        return np.asarray(self._enc.encode(list(X), show_progress_bar=False))

    # some older code may call .encode directly; alias it
    def encode(self, X):
        return self.transform(X)

# Expose both legacy names on the encoders module
class SBERTEncoder(_SBERTBase): ...
class SBERTFeaturizer(_SBERTBase): ...

enc_mod.SBERTEncoder = SBERTEncoder
enc_mod.SBERTFeaturizer = SBERTFeaturizer
sys.modules["encoders"] = enc_mod

# Make sure your package code is importable too (if needed)
import pathlib
if str(pathlib.Path("src").resolve()) not in sys.path:
    sys.path.append(str(pathlib.Path("src").resolve()))
print("encoders shim ready (SBERTEncoder + SBERTFeaturizer) and sys.path configured")

encoders shim ready (SBERTEncoder + SBERTFeaturizer) and sys.path configured


In [3]:
import joblib
from pathlib import Path

def load_mapper():
    for name in [".artifacts/defi_mapper.joblib", ".artifacts/defi_mapper_embed.joblib"]:
        p = Path(name).resolve()
        if p.exists():
            print("Loading:", p.as_posix())
            return joblib.load(p.as_posix())
    raise FileNotFoundError("No mapper artifact found in .artifacts/")

pipe = load_mapper()
print(pipe)


Loading: /Users/ian_moore/repos/micro-lm/.artifacts/defi_mapper.joblib
Pipeline(steps=[('sbertencoder', <__main__.SBERTEncoder object at 0x318dfb9d0>),
                ('calibratedclassifiercv',
                 CalibratedClassifierCV(cv=3,
                                        estimator=LogisticRegression(C=8.0,
                                                                     class_weight='balanced',
                                                                     max_iter=2000,
                                                                     random_state=0),
                                        method='isotonic'))])


In [4]:
prompt = "supply 7.0245 SOL to maker"
pred  = pipe.predict([prompt])[0]
probs = pipe.predict_proba([prompt])[0]
print("Predicted:", pred)
print("Top-3:", sorted(zip(pipe.classes_, probs), key=lambda t: t[1], reverse=True)[:3])

  return forward_call(*args, **kwargs)


Predicted: deposit_asset
Top-3: [('deposit_asset', 1.0), ('borrow_asset', 0.0), ('claim_rewards', 0.0)]


  return forward_call(*args, **kwargs)


### Get token-time traces from the base model

In [5]:
import torch
from transformers import AutoTokenizer, AutoModel

BASE = "sentence-transformers/all-MiniLM-L6-v2"
tok = AutoTokenizer.from_pretrained(BASE)
mdl = AutoModel.from_pretrained(BASE, output_hidden_states=True)
mdl.eval();

def get_hidden_states(text: str, layer_offset: int = -9):
    """Returns [T, H] float32 tensor for a single prompt from a chosen layer."""
    with torch.no_grad():
        batch = tok(text, return_tensors="pt")
        out = mdl(**batch)
        # hidden_states: tuple(len=L+1) of [1, T, H]; choose a stable mid/earlier layer (e.g., -9)
        hs = out.hidden_states[layer_offset].squeeze(0).float()  # [T, H]
        # strip CLS if you prefer token-only; optional:
        return hs  # [T, H]


### Build PCA channels (3D) as the WDD signal space

In [6]:
import torch
from transformers import AutoTokenizer, AutoModel

BASE = "sentence-transformers/all-MiniLM-L6-v2"
tok = AutoTokenizer.from_pretrained(BASE)
mdl = AutoModel.from_pretrained(BASE, output_hidden_states=True).eval()

def get_hidden_states(text: str, layer_offset: int = -4):
    """
    Returns a [T, H] tensor from a valid layer.
    For MiniLM-L6, valid offsets are: -1..-7 (where -7 is embeddings).
    We avoid embeddings and clamp to the range automatically.
    """
    with torch.no_grad():
        batch = tok(text, return_tensors="pt")
        out = mdl(**batch)
        hs = out.hidden_states
        n = len(hs)                 # e.g., 7 for L6 (0..6)
        # clamp offset to [- (n-1), -1] so we never pick embeddings or go OOR
        lo = -(n-1)                 # e.g., -6
        hi = -1
        k = max(lo, min(layer_offset, hi))
        return hs[k].squeeze(0).float()   # [T, H]

# quick probe
h = get_hidden_states("supply 7.0245 SOL to maker", layer_offset=-4)
h.shape


  return forward_call(*args, **kwargs)


torch.Size([10, 384])

In [7]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def build_pca_channels(texts, layer_offset=-4, d=3, whiten=True):
    mats = [get_hidden_states(t, layer_offset).numpy() for t in texts]  # each [T,H]
    all_tokens = np.vstack(mats)  # [sum_T, H]

    scaler = StandardScaler(with_mean=True, with_std=True)
    Xn = scaler.fit_transform(all_tokens)

    pca = PCA(n_components=d, whiten=whiten, random_state=42)
    Z = pca.fit_transform(Xn)

    seqs, i = [], 0
    for m in mats:
        T = m.shape[0]
        seqs.append(Z[i:i+T])
        i += T
    return seqs, pca, scaler

# with torch.no_grad():
#     out = mdl(**tok("test", return_tensors="pt"))
#     print(len(out.hidden_states))  # embeddings + num_layers

In [8]:
# Example batch (you can feed your benchmark prompts)
texts = [
    "supply 7.0245 SOL to maker",
    "swap 10 ETH to USDC on uniswap",
    "attempt a borrow with low health factor"
]
pca_seqs, pca_model, pca_scaler = build_pca_channels(texts, d=3, whiten=True)
len(pca_seqs), pca_seqs[0].shape  # → (3, [T,3])

(3, (10, 3))

### 4) Compute WDD scores (margin) + NGF matched-filter (if available)

In [9]:
import numpy as np

# Try to import ngeodesic’s matched filtering / denoising
try:
    from ngeodesic.dsp.filters import gaussian_matched_filter_1d
    from ngeodesic.dsp.denoise import hybrid_smoother
    HAVE_NG = True
except Exception:
    HAVE_NG = False
    print("ngeodesic not available; falling back to simple np.convolve matcher")

def matched_filter_channel(x: np.ndarray, sigma: float = 5.0) -> np.ndarray:
    """Apply 1D matched filter per channel."""
    if HAVE_NG:
        return gaussian_matched_filter_1d(x, sigma=sigma)
    # Fallback: crude gaussian kernel
    L = int(6*sigma)+1
    t = np.arange(L) - L//2
    k = np.exp(-(t**2)/(2*sigma*sigma)); k /= k.sum()
    return np.convolve(x, k, mode="same")

def wdd_score_sequence(seq_3d: np.ndarray, sigma: float = 5.0) -> float:
    """
    Simple WDD score for one prompt:
    - take 3 PCA channels over tokens [T,3],
    - matched filter each channel,
    - compute max energy and a margin-like separation across channels,
    - return a scalar score (higher = stronger detection).
    """
    # matched filter each channel independently
    mf = np.stack([matched_filter_channel(seq_3d[:,i], sigma) for i in range(seq_3d.shape[1])], axis=1)  # [T,3]
    # channel energy
    ch_energy = mf.max(axis=0)  # [3]
    # margin: best - second_best
    srt = np.sort(ch_energy)
    margin = float(srt[-1] - srt[-2]) if len(srt) >= 2 else float(srt[-1])
    return margin

scores = [wdd_score_sequence(z, sigma=5.0) for z in pca_seqs]
scores


ngeodesic not available; falling back to simple np.convolve matcher


[0.5843617692371803, 0.5322664036930501, 0.5372362585588677]

### 5) Null calibration (circular shifts) → threshold → abstain/fail

In [10]:
rng = np.random.default_rng(42)

def circular_shift(x, k):  # shift tokens by k
    k = k % len(x); 
    return np.concatenate([x[-k:], x[:-k]], axis=0)

def wdd_null_scores(seq_3d, n_null=128, sigma=5.0):
    # Build null by circularly shifting channels independently
    T = seq_3d.shape[0]
    nulls = []
    for _ in range(n_null):
        z = np.stack([circular_shift(seq_3d[:,i], int(rng.integers(1, max(2, T//2)))) 
                      for i in range(seq_3d.shape[1])], axis=1)
        nulls.append(wdd_score_sequence(z, sigma=sigma))
    return np.array(nulls)

# Calibrate threshold on each example’s null (you can also pool nulls)
nulls = [wdd_null_scores(z, n_null=256, sigma=5.0) for z in pca_seqs]
mu = np.array([n.mean() for n in nulls]); sd = np.array([n.std() + 1e-9 for n in nulls])

# z-scores for observed sequences
z_scores = (np.array(scores) - mu) / sd

# Decision: pass if z >= z_thr (e.g., 1.5), else abstain
z_thr = 1.5
decisions = (z_scores >= z_thr)
list(zip(texts, np.round(scores,3), np.round(z_scores,2), decisions))


[('supply 7.0245 SOL to maker', 0.584, 1.73, True),
 ('swap 10 ETH to USDC on uniswap', 0.532, -0.47, False),
 ('attempt a borrow with low health factor', 0.537, -4.0, False)]

### 6) (Optional) Dual-gate + denoise (closer to NGF Stage-11)

In [11]:
# Build per-token “energy” streams (post-filter) for visualization & denoising
def channel_streams(seq_3d, sigma=5.0):
    return np.stack([matched_filter_channel(seq_3d[:,i], sigma) for i in range(3)], axis=1)  # [T,3]

streams = [channel_streams(z, sigma=5.0) for z in pca_seqs]

if HAVE_NG:
    smooth = [hybrid_smoother(s, ema_alpha=0.15, med_k=3) for s in streams]  # each [T,3]
else:
    smooth = streams

# Dual-gate decision (toy example): 
# - relative: margin between best and second channel ≥ τ_rel
# - absolute: best channel z-score ≥ τ_abs (using null on that best channel)
def dual_gate_decision(seq_3d, null_ns=256, sigma=5.0, tau_rel=0.05, tau_abs=1.5):
    s = channel_streams(seq_3d, sigma=sigma)
    ch_max = s.max(axis=0)  # [3]
    rel = np.sort(ch_max)[-1] - np.sort(ch_max)[-2]
    # absolute gate via per-example pooled null
    n = wdd_null_scores(seq_3d, n_null=null_ns, sigma=sigma)
    z = (wdd_score_sequence(seq_3d, sigma=sigma) - n.mean()) / (n.std() + 1e-9)
    return (rel >= tau_rel) and (z >= tau_abs), {"rel": float(rel), "z": float(z)}

for t, z in zip(texts, pca_seqs):
    ok, stats = dual_gate_decision(z, sigma=5.0)
    print(ok, stats, "←", t)


True {'rel': 0.5843617692371803, 'z': 1.5777207699924611} ← supply 7.0245 SOL to maker
False {'rel': 0.5322664036930501, 'z': -0.6078239725797517} ← swap 10 ETH to USDC on uniswap
False {'rel': 0.5372362585588677, 'z': -3.9480696151284724} ← attempt a borrow with low health factor
