In [2]:
import re

import numpy as np
import pandas as pd
import spacy
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Inputs for the scoring pipeline. Later cells read the `llm_answer`,
# `source_text` and `confidence` columns of the loaded frame.
ANSWERS_CSV = "../outputs/llm_answers.csv"
SPACY_MODEL = "en_core_web_sm"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

df = pd.read_csv(ANSWERS_CSV)              # one row per LLM answer
nlp = spacy.load(SPACY_MODEL)              # used for entity extraction in fact_risk
embedder = SentenceTransformer(EMBEDDING_MODEL)  # used for answer/source embeddings



In [3]:
def fact_risk(answer, source):
    """Return the share of entities in ``answer`` that are absent from ``source``.

    Entities are taken from the module-level ``nlp`` pipeline (``doc.ents``).
    An entity counts as supported when its lower-cased text occurs anywhere
    in the lower-cased source as a plain substring, so short entities can
    match inside longer words.

    Parameters
    ----------
    answer, source : str
        Answer text and the source text it should be grounded in.

    Returns
    -------
    float
        ``unsupported / total`` in ``[0, 1]``. ``0.0`` when either argument
        is not a string (e.g. NaN) or when the answer contains no entities.
    """
    if not isinstance(answer, str) or not isinstance(source, str):
        return 0.0

    ents = [e.text for e in nlp(answer).ents]
    if not ents:
        return 0.0

    # Lower-case the source once instead of once per entity.
    source_lower = source.lower()
    unsupported = sum(1 for e in ents if e.lower() not in source_lower)
    return unsupported / len(ents)


In [4]:
# Missing cells become "" so the encoder receives one entry per row.
answers = df["llm_answer"].fillna("").tolist()
sources = df["source_text"].fillna("").tolist()

# Embed answers first, then sources, with identical encoder settings.
answer_embs, source_embs = (
    embedder.encode(texts, batch_size=32, show_progress_bar=True)
    for texts in (answers, sources)
)

from numpy.linalg import norm

def cosine_sim(a,b):
    """Cosine similarity between two 1-D vectors.

    Parameters
    ----------
    a, b : array-like supporting ``@`` and ``numpy.linalg.norm``.

    Returns
    -------
    float
        ``(a @ b) / (||a|| * ||b||)``. When either vector has zero norm the
        ratio is undefined, so ``0.0`` is returned instead of NaN; this keeps
        NaN out of every score derived from the similarity.
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        return 0.0
    return (a @ b) / denom

# Row-wise similarity between each answer embedding and its own source embedding.
pair_similarities = [
    cosine_sim(ans_vec, src_vec)
    for ans_vec, src_vec in zip(answer_embs, source_embs)
]

# Mismatch is the complement of similarity: higher means further from the source.
df["RetrievalMismatch"] = [1 - sim for sim in pair_similarities]


Batches:   0%|          | 0/113 [00:00<?, ?it/s]

Batches:   0%|          | 0/113 [00:00<?, ?it/s]

In [6]:
def interpretation_risk(answer, source):
    """Score how heavily ``answer`` relies on hedging language.

    Counts how many distinct hedge terms occur in the lower-cased answer and
    scales the count so that five or more distinct hedges give ``1.0``.
    Terms are matched on word boundaries, so "may" does not fire on "mayor"
    or "dismay" and "likely" is not counted inside "unlikely". A phrase and a
    word it contains (e.g. "not possible" / "possible") are separate list
    entries and both count when the phrase is present.

    Parameters
    ----------
    answer : str
        Text to scan for hedges.
    source : str
        Not inspected beyond the type check; kept so the signature parallels
        ``fact_risk``.

    Returns
    -------
    float
        ``min(1.0, hits / 5.0)``; ``0.0`` when either argument is not a string.
    """
    if not isinstance(answer, str) or not isinstance(source, str):
        return 0.0

    hedges = [
        "possible", "possibly", "probable", "probably", "likely", "unlikely",
        "suggests", "may", "might", "could", "appears", "seems", "unclear",
        "not possible", "cannot determine", "insufficient", "not enough information"
    ]
    text = answer.lower()
    # \b anchors restrict each hedge to whole words / whole phrases.
    hits = sum(
        1 for h in hedges
        if re.search(r"\b" + re.escape(h) + r"\b", text)
    )
    return min(1.0, hits / 5.0)

# Pair each raw answer with its raw source once; both scorers return 0.0 for
# non-string cells, so missing values need no special handling here.
answer_source_pairs = list(zip(df["llm_answer"], df["source_text"]))

df["FactRisk"] = [fact_risk(ans, src) for ans, src in answer_source_pairs]
df["InterpretationRisk"] = [interpretation_risk(ans, src) for ans, src in answer_source_pairs]

# Stated confidence weighted by how far the answer sits from its source.
df["ConfidenceGap"] = df["confidence"] * df["RetrievalMismatch"]


In [7]:
# Weighted governance score built from four terms (weights 0.4 / 0.3 / 0.2 / 0.1).
# The two risk terms are each multiplied by ConfidenceGap before weighting.
interp_term = 0.4*df["InterpretationRisk"]*df["ConfidenceGap"]
fact_term = 0.3*df["FactRisk"]*df["ConfidenceGap"]
mismatch_term = 0.2*df["RetrievalMismatch"]
gap_term = 0.1*df["ConfidenceGap"]

df["GovScore"] = interp_term + fact_term + mismatch_term + gap_term


In [8]:
def governance_decision(score):
    """Map a governance score to a decision label.

    Parameters
    ----------
    score : float
        Governance score for one row.

    Returns
    -------
    str
        ``"BLOCK"`` when ``score > 0.55``, ``"REVIEW"`` when ``score > 0.30``,
        otherwise ``"APPROVE"``. A missing score (NaN / None) returns
        ``"REVIEW"``: NaN fails every ``>`` comparison and would otherwise
        fall through to ``"APPROVE"`` without ever having been scored.
    """
    if pd.isna(score):
        return "REVIEW"
    if score > 0.55:
        return "BLOCK"
    if score > 0.30:
        return "REVIEW"
    return "APPROVE"


# Label every row by passing its GovScore through governance_decision.
decisions = df["GovScore"].apply(governance_decision)
df["TrustRegDecision"] = decisions


In [9]:
# Persist the scored frame (all columns, no index column).
RESULTS_CSV = "../outputs/trustreg_results.csv"
df.to_csv(RESULTS_CSV, index=False)


In [10]:
# Only a cell's last expression is rendered automatically, so the preview
# needs an explicit display() call (provided by the Jupyter/IPython kernel)
# or it is computed and discarded.
display(df[["GovScore","TrustRegDecision"]].head())

# Distribution of decisions across all rows.
df["TrustRegDecision"].value_counts()


TrustRegDecision
APPROVE    3522
REVIEW       78
Name: count, dtype: int64