In [8]:
import pandas as pd
from pathlib import Path

# Locate the results CSV: prefer ./outputs under the current working directory,
# otherwise fall back to ../outputs. No existence check is made on the fallback,
# so a missing file surfaces as the error raised by pd.read_csv.
base = Path.cwd()
candidate = base.joinpath("outputs", "trustreg_results.csv")
candidate = (
    candidate
    if candidate.exists()
    else base.parent.joinpath("outputs", "trustreg_results.csv")
)

df = pd.read_csv(candidate)


In [9]:
def decision_harm(predicted, true, false_positive_cost=5, false_negative_cost=1):
    """Return the asymmetric cost of one binary decision.

    Parameters
    ----------
    predicted : int
        The decision that was taken; compared against 0 and 1.
    true : int
        The reference label; compared against 0 and 1.
    false_positive_cost : int or float, default 5
        Cost returned when ``predicted == 1`` and ``true == 0``.
    false_negative_cost : int or float, default 1
        Cost returned when ``predicted == 0`` and ``true == 1``.

    Returns
    -------
    int or float
        The matching cost, or 0 for every other combination (agreeing pairs,
        and any value that is neither 0 nor 1, such as NaN).
    """
    if predicted == 1 and true == 0:
        return false_positive_cost
    if predicted == 0 and true == 1:
        return false_negative_cost
    return 0

# Baseline: the raw LLM approves every case, so its prediction is the constant 1.
df["LLM_pred"] = 1

# Harm of the always-approve baseline, scored against binary_violation.
# Only one column is read, so a column-wise map replaces the row-wise
# DataFrame.apply; the per-row values are the same.
# NOTE(review): decision_harm charges its larger cost when predicted == 1 and
# true == 0, so the baseline is penalised on rows where binary_violation == 0.
# If binary_violation == 1 means "a violation occurred", approving those rows
# is scored as harmless, which looks inverted -- confirm the column's encoding.
df["LLM_harm"] = df["binary_violation"].map(lambda v: decision_harm(1, v))

# HarmValue is the baseline harm per row; HarmLabel flags rows with any harm (> 0)
# as 1 and all others as 0.
df["HarmValue"] = df["LLM_harm"]
df["HarmLabel"] = (df["HarmValue"] > 0).astype(int)

In [10]:
from sklearn.linear_model import LogisticRegression

# Feature matrix: the four risk/mismatch columns; target: the binary harm flag.
X = df.loc[:, ["FactRisk", "InterpretationRisk", "RetrievalMismatch", "ConfidenceGap"]]
y = df.loc[:, "HarmLabel"]

# Class 1 is weighted five times as heavily as class 0 during fitting.
clf = LogisticRegression(max_iter=1000, class_weight={0: 1, 1: 5})
clf.fit(X, y)

# Column 1 of predict_proba is kept as GovProb.
# NOTE(review): these probabilities are computed on the same rows the model was
# fitted on, so GovProb is an in-sample score -- confirm that is intended.
harm_probabilities = clf.predict_proba(X)
df["GovProb"] = harm_probabilities[:, 1]


In [11]:
# Gate decision: BLOCK when GovProb is strictly above 0.5, APPROVE otherwise
# (a NaN probability compares False and therefore yields APPROVE).
df["LearnedTrustRegDecision"] = (
    df["GovProb"].gt(0.5).map({True: "BLOCK", False: "APPROVE"})
)

# Numeric form of the decision: APPROVE -> 1, anything else -> 0.
df["LearnedTrustReg_pred"] = (
    df["LearnedTrustRegDecision"].eq("APPROVE").astype("int64")
)

# Per-row harm of the gated decision, scored against binary_violation.
# NOTE(review): predicted == 1 means APPROVE here while `true` is
# binary_violation; verify the two encodings are meant to be compared directly.
df["LearnedTrustReg_harm"] = [
    decision_harm(gate_pred, violation)
    for gate_pred, violation in zip(df["LearnedTrustReg_pred"], df["binary_violation"])
]


In [12]:
# The threshold-based TrustReg column is optional: report its total only when the
# column exists in df, otherwise print a placeholder.
threshold_harm_total = (
    df["TrustReg_harm"].sum() if "TrustReg_harm" in df.columns else "not computed"
)

# NOTE(review): the recorded run printed the same total for the raw LLM and the
# learned gate; worth checking whether the gate blocked any rows at all, e.g. via
# df["LearnedTrustRegDecision"].value_counts().
print("Raw LLM harm:", df["LLM_harm"].sum())
print("Threshold TrustReg harm:", threshold_harm_total)
print("Learned TrustReg harm:", df["LearnedTrustReg_harm"].sum())


Raw LLM harm: 1860
Threshold TrustReg harm: not computed
Learned TrustReg harm: 1860
