In [23]:
#!/usr/bin/env python3
"""
End-to-end script for training and evaluating a DeFi mapper.
Combines simplified training from train_mapper_embed.py and evaluation/prediction from defi_milestone8.py,
in a style similar to mapper_simple.py for easy Jupyter adaptation.

Usage example:
python3 end_to_end_mapper.py \
  --train_labels_csv tests/fixtures/defi_mapper_labeled_mini.csv \
  --sbert sentence-transformers/all-mpnet-base-v2 \
  --C 8 --max_iter 2000 --calibrate \
  --prompts_jsonl tests/fixtures/defi_mapper_5k_prompts.jsonl \
  --eval_labels_csv tests/fixtures/defi_mapper_labeled_5k.csv \
  --thresholds "0.5,0.6,0.7,0.8,0.9" \
  --out_dir .artifacts \
  --out_model defi_mapper_embed.joblib \
  --out_summary defi_milestone8_summary.json \
  --out_csv defi_milestone8_metrics.csv \
  --out_rows_csv defi_milestone8_rows.csv

In Jupyter, you can copy-paste sections or override args programmatically.
"""

import argparse, csv, json, os, sys, time
from collections import Counter
from typing import Any, Dict, List
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import os
cwd =  os.getcwd().replace("/notebooks","")
os.chdir(cwd)

In [75]:
# SBERTEncoder class (from defi_milestone8.py shim)
class SBERTEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2",
                 batch_size=64, normalize=True):
        self.model_name = model_name
        self.batch_size = batch_size
        self.normalize = normalize
        self._model = None

    def fit(self, X, y=None):
        from sentence_transformers import SentenceTransformer
        self._model = SentenceTransformer(self.model_name)
        return self

    def transform(self, X):
        if self._model is None:
            from sentence_transformers import SentenceTransformer
            self._model = SentenceTransformer(self.model_name)
        embs = self._model.encode(
            list(X),
            batch_size=self.batch_size,
            show_progress_bar=False,
            normalize_embeddings=self.normalize,
        )
        return np.asarray(embs)

DEFAULT_CLASSES = ["deposit_asset", "withdraw_asset", "swap_asset", "check_balance"]

# -------------------------- Training (from train_mapper_embed.py) --------------------------

def train_mapper(args):
    df = pd.read_csv(args.train_labels_csv)
    need = {"prompt", "label"}
    if not need.issubset(df.columns):
        sys.exit(f"[train] labels_csv must have columns {need}, got {df.columns.tolist()}")
    df = df.copy()
    df["prompt"] = df["prompt"].astype(str).str.strip()
    df["label"] = df["label"].astype(str).str.strip()
    df = df.dropna(subset=["prompt", "label"])
    df = df[df["prompt"].str.len() > 0]
    if df.empty:
        sys.exit("[train] No non-empty prompts after cleaning.")
    
    X = df["prompt"].tolist()
    y = df["label"].tolist()
    
    # Base classifier
    base = LogisticRegression(max_iter=int(args.max_iter), C=float(args.C), class_weight="balanced", random_state=0)
    
    # Calibration if requested
    model = base
    if args.calibrate:
        cnt = Counter(y)
        m = min(cnt.values())
    
        method = args.calibration_method
        cv = args.calibration_cv
        if method == "auto":
            if m >= max(3, cv):
                method, cv = "isotonic", max(3, cv)
            elif m >= 2:
                method, cv = "sigmoid", max(2, min(m, cv))
            else:
                print("[train] Not enough samples per class for calibration; skipping.", file=sys.stderr)
        if method in ("isotonic", "sigmoid"):
            try:
                model = CalibratedClassifierCV(estimator=base, method=method, cv=cv)
            except TypeError:
                model = CalibratedClassifierCV(base_estimator=base, method=method, cv=cv)
    
        # Pipeline
        pipe = make_pipeline(SBERTEncoder(args.sbert), model)
        pipe.fit(X, y)
    
        out_path = os.path.join(args.out_dir, args.out_model)
        joblib.dump(pipe, out_path)
        print(f"[train] Wrote mapper to {out_path} (n={len(X)})")
    return pipe, out_path

# -------------------------- I/O Helpers (from defi_milestone8.py) --------------------------

def read_prompts_jsonl(path: str) -> List[str]:
    prompts: List[str] = []
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    with open(path, "r") as f:
        for line in f:
            try:
                rec = json.loads(line)
                p = rec.get("prompt", "").strip()
                if p:
                    prompts.append(p)
            except Exception:
                continue
    return prompts

def read_labels_csv(path: str) -> Dict[str, str]:
    gold: Dict[str, str] = {}
    if not path or not os.path.exists(path):
        return gold
    with open(path, newline="") as f:
        reader = csv.DictReader(f)
        for rec in reader:
            p = str(rec.get("prompt", "")).strip()
            y = str(rec.get("label", "")).strip()
            if p:
                gold[p] = y
    return gold

# -------------------------- Scoring & Metrics (simplified from defi_milestone8.py) --------------------------

def mapper_predict_scores(mapper, prompts: List[str], class_names: List[str]) -> List[Dict[str, float]]:
    import numpy as np
    scores: List[Dict[str, float]] = []
    if hasattr(mapper, "predict_proba"):
        probs = mapper.predict_proba(prompts)
        classes = list(getattr(mapper, "classes_", class_names))
        for row in probs:
            row_map = {str(c): float(p) for c, p in zip(classes, row)}
            for cname in class_names:
                row_map.setdefault(cname, 0.0)
            scores.append(row_map)
        return scores
    # Fallback to predict (simplified, no decision_function for brevity)
    preds = mapper.predict(prompts)
    for y in preds:
        row_map = {c: 0.0 for c in class_names}
        row_map[str(y)] = 1.0
        scores.append(row_map)
    return scores

def metrics_for_threshold(
    prompts: List[str],
    scores: List[Dict[str, float]],
    class_names: List[str],
    thr: float,
    gold: Dict[str, str],
) -> Dict[str, Any]:
    total = len(prompts)
    abstain = 0
    correct = 0
    fired = 0
    for p, smap in zip(prompts, scores):
        top = max(class_names, key=lambda c: smap.get(c, 0.0))
        score = smap.get(top, 0.0)
        if score < thr:
            abstain += 1
        else:
            fired += 1
            if p in gold and gold[p] == top:
                correct += 1
    abstain_rate = abstain / max(1, total)
    coverage = fired / max(1, total)
    acc_on_fired = (correct / max(1, fired)) if fired else None
    overall_acc = correct / max(1, total)
    return {
        "threshold": thr,
        "total": total,
        "abstain": abstain,
        "abstain_rate": abstain_rate,
        "coverage": coverage,
        "fired": fired,
        "correct_on_fired": correct,
        "accuracy_on_fired": acc_on_fired,
        "overall_correct": correct,
        "overall_accuracy": overall_acc,
    }

def write_rows_csv(path: str, prompts, scores, class_names, thr, gold):
    with open(path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["prompt", "gold_label", "predicted", "confidence", "abstain", "threshold"])
        for p, smap in zip(prompts, scores):
            top = max(class_names, key=lambda c: smap.get(c, 0.0))
            conf = float(smap.get(top, 0.0))
            fire = conf >= thr
            pred = top if fire else ""
            w.writerow([p, gold.get(p, ""), pred, f"{conf:.4f}", (not fire), thr])

In [5]:

# # -------------------------- Main --------------------------

# def main():
#     ap = argparse.ArgumentParser(description="End-to-end DeFi Mapper Training & Evaluation")
#     # Training args
#     ap.add_argument("--train_labels_csv", required=True, help="CSV with prompt,label for training")
#     ap.add_argument("--sbert", default="sentence-transformers/all-MiniLM-L6-v2", help="SentenceTransformer model")
#     ap.add_argument("--C", type=float, default=8.0, help="LogisticRegression C")
#     ap.add_argument("--max_iter", type=int, default=2000, help="LogisticRegression max_iter")
#     ap.add_argument("--calibrate", action="store_true", help="Calibrate probabilities")
#     ap.add_argument("--calibration_method", choices=["auto", "isotonic", "sigmoid"], default="auto")
#     ap.add_argument("--calibration_cv", type=int, default=3)
#     # Evaluation args
#     ap.add_argument("--prompts_jsonl", required=True, help="JSONL with prompts for evaluation")
#     ap.add_argument("--eval_labels_csv", default="", help="Optional CSV with labels for evaluation")
#     ap.add_argument("--class_names", type=str, default=",".join(DEFAULT_CLASSES), help="Comma-separated class names")
#     ap.add_argument("--thresholds", type=str, default="0.5,0.6,0.7,0.8,0.9", help="Comma-separated thresholds")
#     ap.add_argument("--max_abstain_rate", type=float, default=0.10)
#     ap.add_argument("--min_overall_acc", type=float, default=0.85)
#     ap.add_argument("--choose_by", type=str, default="abstain_then_acc", choices=["abstain_then_acc", "utility"])
#     # Output args
#     ap.add_argument("--out_dir", default=".artifacts", help="Output directory")
#     ap.add_argument("--out_model", default="defi_mapper_embed.joblib", help="Trained model filename")
#     ap.add_argument("--out_summary", default="defi_milestone8_summary.json", help="Summary JSON")
#     ap.add_argument("--out_csv", default="defi_milestone8_metrics.csv", help="Metrics CSV")
#     ap.add_argument("--out_rows_csv", default="defi_milestone8_rows.csv", help="Per-prompt rows CSV")
#     args = ap.parse_args()

#     # Ensure output dir
#     os.makedirs(args.out_dir, exist_ok=True)

In [130]:
def get_args():
    ap = argparse.ArgumentParser()
    ap.add_argument("--backend",       default="wordmap", help="wordmap|sbert")
    ap.add_argument("--model_path",    default=".artifacts/defi_mapper.joblib")
    ap.add_argument("--prompts_jsonl", default="tests/fixtures/defi/defi_mapper_5k_prompts.json")
    ap.add_argument("--test_labels_csv", default="tests/fixtures/defi/defi_mapper_labeled_5k.csv")
    ap.add_argument("--thresholds",    default="0.5,0.55,0.6,0.65,0.7")
    ap.add_argument("--max_iter",    default="2000")
    ap.add_argument("--C",    default="8")
    ap.add_argument("--calibrate",    default="True")
    ap.add_argument("--calibration_method", choices=["auto","isotonic","sigmoid"], default="auto")
    ap.add_argument("--calibration_cv", type=int, default=3)
    ap.add_argument("--train_labels_csv",    default="tests/fixtures/defi/defi_mapper_labeled_large.csv")
    ap.add_argument("--sbert", default="sentence-transformers/all-MiniLM-L6-v2")
    ap.add_argument("--out_model", default="defi_mapper_embed.joblib")
    ap.add_argument("--out_dir",       default="")
    ap.add_argument("--min_overall_acc", default=None)
    
    notebook_args = [
        "--backend", "sbert",
        "--model_path", ".artifacts/defi_mapper.joblib",
        "--prompts_jsonl", "tests/fixtures/defi/defi_mapper_5k_prompts.jsonl",
        "--test_labels_csv",    "tests/fixtures/defi/defi_mapper_labeled_5k.csv",
        "--train_labels_csv", "tests/fixtures/defi/defi_mapper_labeled_large.csv",
        "--thresholds", "0.5,0.55,0.6,0.65,0.7",
        "--max_iter", "2000",
        "--C", "8",
        "--calibrate", "True",
        "--calibration_method", "auto",
        "--calibration_cv", "3",
        "--min_overall_acc", "0.75",
        "--min_overall_acc", "0.75",
        "--sbert", "sentence-transformers/all-MiniLM-L6-v2",
        "--out_model", "defi_mapper_embed.joblib",
        "--out_dir", ".artifacts/defi/mapper_bench",
    ]
    
    return ap.parse_args(notebook_args)

In [120]:
CLASS_NAMES = ['claim_rewards','withdraw_asset','stake_asset','deposit_asset','swap_asset','repay_asset','unstake_asset','repay_asset'] 

In [121]:
args = get_args()
mapper, model_path = train_mapper(args)

# Load eval data
class_names = CLASS_NAMES
prompts = read_prompts_jsonl(args.prompts_jsonl)
if not prompts:
    sys.exit("No prompts loaded.")
gold = read_labels_csv(args.labels_csv)
scores = mapper_predict_scores(mapper, prompts, class_names)
thr_list = [float(x.strip()) for x in args.thresholds.split(",") if x.strip()]
metrics = [metrics_for_threshold(prompts, scores, class_names, thr, gold) for thr in thr_list]

  return forward_call(*args, **kwargs)


[train] Wrote mapper to .artifacts/defi/mapper_bench/defi_mapper_embed.joblib (n=1000)


  return forward_call(*args, **kwargs)


In [123]:
#Wmetrics

In [131]:
args = get_args()

df = pd.read_csv(args.train_labels_csv)
need = {"prompt", "label"}
if not need.issubset(df.columns):
    sys.exit(f"[train] labels_csv must have columns {need}, got {df.columns.tolist()}")
df = df.copy()
df["prompt"] = df["prompt"].astype(str).str.strip()
df["label"] = df["label"].astype(str).str.strip()
df = df.dropna(subset=["prompt", "label"])
df = df[df["prompt"].str.len() > 0]
if df.empty:
    sys.exit("[train] No non-empty prompts after cleaning.")
    
X = df["prompt"].tolist()
y = df["label"].tolist()

# Base classifier
base = LogisticRegression(max_iter=int(args.max_iter), C=float(args.C), class_weight="balanced", random_state=0)

# Tiny-data friendly calibration
model = base
if args.calibrate:
    cnt = Counter(y); m = min(cnt.values())
    method = args.calibration_method; cv = args.calibration_cv
    if method == "auto":
        if m >= max(3, cv):
            method, cv = "isotonic", max(3, cv)
        elif m >= 2:
            method, cv = "sigmoid", max(2, min(m, cv))
        else:
            print("[train_mapper_embed] Not enough samples per class for calibration; skipping.", file=sys.stderr)
    if method in ("isotonic","sigmoid"):
        try:
            # sklearn >= 1.3 uses 'estimator'
            model = CalibratedClassifierCV(estimator=base, method=method, cv=cv)
        except TypeError:
            # older sklearn used 'base_estimator'
            model = CalibratedClassifierCV(base_estimator=base, method=method, cv=cv)
    

pipe = make_pipeline(SBERTEncoder(args.sbert), model)
pipe.fit(X, y)

out_path = os.path.join(args.out_dir, args.out_model)
joblib.dump(pipe, out_path)
print(f"[train] Wrote mapper to {out_path} (n={len(X)})")

mapper, model_path = pipe, out_path

  return forward_call(*args, **kwargs)


[train] Wrote mapper to .artifacts/defi/mapper_bench/defi_mapper_embed.joblib (n=1000)


In [133]:
# Load eval data
class_names = CLASS_NAMES
prompts = read_prompts_jsonl(args.prompts_jsonl)
if not prompts:
    sys.exit("No prompts loaded.")
gold = read_labels_csv(args.test_labels_csv)
scores = mapper_predict_scores(mapper, prompts, class_names)
thr_list = [float(x.strip()) for x in args.thresholds.split(",") if x.strip()]
metrics = [metrics_for_threshold(prompts, scores, class_names, thr, gold) for thr in thr_list]
#metrics

  return forward_call(*args, **kwargs)


In [134]:
prompts[0:5]

['claim staking rewards on aave base',
 'pull out 2752.8264 ARB from lido',
 'restake 33.8529 MATIC with balancer on solana — this minute, use normal gas',
 'deposit 4697 USDC into uniswap on base',
 'supply 7.0245 SOL to maker']

In [None]:
# Score
    scores = mapper_predict_scores(mapper, prompts, class_names)

    # Sweep thresholds
    thr_list = [float(x.strip()) for x in args.thresholds.split(",") if x.strip()]
    metrics = [metrics_for_threshold(prompts, scores, class_names, thr, gold) for thr in thr_list]

    # Choose operating point (simplified: highest thr with abstain_rate <= max_abstain_rate)
    admissible = [m for m in metrics if m["abstain_rate"] <= args.max_abstain_rate]
    if admissible:
        admissible.sort(key=lambda m: m["threshold"], reverse=True)
        chosen = admissible[0]
    else:
        metrics.sort(key=lambda m: m["abstain_rate"])
        chosen = metrics[0]

In [7]:


# Load eval data
class_names = [c.strip() for c in args.class_names.split(",") if c.strip()]
prompts = read_prompts_jsonl(args.prompts_jsonl)
if not prompts:
    sys.exit("No prompts loaded.")
gold = read_labels_csv(args.eval_labels_csv)

# Score
scores = mapper_predict_scores(mapper, prompts, class_names)

AttributeError: 'Namespace' object has no attribute 'train_labels_csv'

In [None]:

    # Score
    scores = mapper_predict_scores(mapper, prompts, class_names)

    # Sweep thresholds
    thr_list = [float(x.strip()) for x in args.thresholds.split(",") if x.strip()]
    metrics = [metrics_for_threshold(prompts, scores, class_names, thr, gold) for thr in thr_list]

    # Choose operating point (simplified: highest thr with abstain_rate <= max_abstain_rate)
    admissible = [m for m in metrics if m["abstain_rate"] <= args.max_abstain_rate]
    if admissible:
        admissible.sort(key=lambda m: m["threshold"], reverse=True)
        chosen = admissible[0]
    else:
        metrics.sort(key=lambda m: m["abstain_rate"])
        chosen = metrics[0]

    # Status
    has_labels = bool(gold)
    pass_abstain = (chosen["abstain_rate"] <= args.max_abstain_rate)
    pass_accuracy = True if not has_labels else (chosen["overall_accuracy"] >= args.min_overall_acc)
    status = "pass" if (pass_abstain and pass_accuracy) else "fail"

    # Write summary
    summary = {
        "ok": True,
        "timestamp": int(time.time()),
        "prompts": len(prompts),
        "has_labels": has_labels,
        "thresholds": thr_list,
        "metrics": metrics,
        "chosen": {
            "threshold": chosen["threshold"],
            "abstain_rate": chosen["abstain_rate"],
            "coverage": chosen["coverage"],
            "accuracy_on_fired": chosen.get("accuracy_on_fired"),
            "overall_accuracy": chosen.get("overall_accuracy"),
        },
        "status": status,
    }
    out_summary = os.path.join(args.out_dir, args.out_summary)
    with open(out_summary, "w") as f:
        json.dump(summary, f, indent=2)

    # Write metrics CSV
    out_csv = os.path.join(args.out_dir, args.out_csv)
    with open(out_csv, "w", newline="") as f:
        writer = csv.writer(f)
        header = ["threshold", "total", "abstain", "abstain_rate", "coverage", "fired", "correct_on_fired", "accuracy_on_fired", "overall_correct", "overall_accuracy"]
        writer.writerow(header)
        for m in metrics:
            writer.writerow([m.get(h) for h in header])

    # Write rows CSV at chosen threshold
    out_rows_csv = os.path.join(args.out_dir, args.out_rows_csv)
    write_rows_csv(out_rows_csv, prompts, scores, class_names, chosen["threshold"], gold)

    # Print summary
    print(json.dumps({"ok": True, "model": model_path, "summary": out_summary, "csv": out_csv, "rows_csv": out_rows_csv, "status": status}, indent=2))

    # Train
    mapper, model_path = train_mapper(args)

    # Load eval data
    class_names = [c.strip() for c in args.class_names.split(",") if c.strip()]
    prompts = read_prompts_jsonl(args.prompts_jsonl)
    if not prompts:
        sys.exit("No prompts loaded.")
    gold = read_labels_csv(args.eval_labels_csv)


if __name__ == "__main__":
    main()