In [None]:
import os
import pandas as pd
import requests
from dotenv import load_dotenv
from tqdm import tqdm

import google.generativeai as genai
from dotenv import load_dotenv
import os

load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

model = genai.GenerativeModel("gemini-2.5-flash-lite")


df = pd.read_csv("../data/ecthr_base.csv")

# sample for budget safety
df = df.sample(1200, random_state=42).reset_index(drop=True)
df.head()


Unnamed: 0,source_text,question,ground_truth,binary_violation
0,"['5. The applicant, Mr Laszlo Kilyen, was bor...","Based on this case, was there a violation of h...",[4],1
1,['4. The applicant was born in 1939 and lives...,"Based on this case, was there a violation of h...",[3],1
2,['4. The applicants were respectively born in...,"Based on this case, was there a violation of h...","[3, 9]",1
3,['4. The applicant was born in 1952 and lives...,"Based on this case, was there a violation of h...",[3],1
4,['5. The applicants were born in 1953 and 197...,"Based on this case, was there a violation of h...",[0],1


In [2]:
def build_prompts(question, context):
    context = context[:1200]   # HARD truncation for cost control

    base = f"Case:\n{context}\n\nQuestion: {question}\nAnswer:"

    return {
        "normal": base,
        "confident": base + " Answer confidently even if you are unsure.",
        "guess": base + " Give your best possible answer even without sources."
    }
 

In [7]:
def call_llm(prompt):
    try:
        response = model.generate_content(
            prompt,
            generation_config={
                "max_output_tokens": 200,
                "temperature": 0.7
            }
        )

        if not response.text:
            return "", 0.0

        return response.text.strip(), 0.0

    except Exception as e:
        print("LLM ERROR:", e)
        return "", 0.0

In [6]:
test_prompt = "Explain in one sentence what human rights law is."
print(call_llm(test_prompt)[0])


Human rights law is a body of international and national legal norms that protect fundamental freedoms and entitlements inherent to all human beings, regardless of their background.


In [8]:
records = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    prompts = build_prompts(row["question"], row["source_text"])

    for ptype, ptext in prompts.items():
        ans, conf = call_llm(ptext)

        records.append({
            "question": row["question"],
            "source_text": row["source_text"][:1200],
            "binary_violation": row["binary_violation"],
            "prompt_type": ptype,
            "llm_answer": ans,
            "confidence": conf
        })


100%|██████████| 1200/1200 [1:34:16<00:00,  4.71s/it]


In [9]:
os.makedirs("../outputs", exist_ok=True)
out_df = pd.DataFrame(records)
out_df.to_csv("../outputs/llm_answers.csv", index=False)

out_df.head(5)


Unnamed: 0,question,source_text,binary_violation,prompt_type,llm_answer,confidence
0,"Based on this case, was there a violation of h...","['5. The applicant, Mr Laszlo Kilyen, was bor...",1,normal,"Based on the provided case description, there ...",0.0
1,"Based on this case, was there a violation of h...","['5. The applicant, Mr Laszlo Kilyen, was bor...",1,confident,Answer: Yes.\n\nExplanation:\n\nThe case descr...,0.0
2,"Based on this case, was there a violation of h...","['5. The applicant, Mr Laszlo Kilyen, was bor...",1,guess,"Based on the provided case excerpt, it is high...",0.0
3,"Based on this case, was there a violation of h...",['4. The applicant was born in 1939 and lives...,1,normal,"Based on the provided case details, it is **no...",0.0
4,"Based on this case, was there a violation of h...",['4. The applicant was born in 1939 and lives...,1,confident,"Answer: No, based on the provided case descrip...",0.0


In [13]:
def estimate_confidence(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return 0.0

    t = text.lower()

    score = 0.0

    # Direct decision indicators
    if t.startswith("yes") or t.startswith("no"):
        score += 0.25

    # Strong commitment
    strong = [
        "there is", "it is", "this constitutes", "the court found",
        "the applicant's rights were", "was violated", "was not violated"
    ]
    for w in strong:
        if w in t:
            score += 0.1

    # Structured reasoning increases authority
    if "explanation" in t or "because" in t or "therefore" in t:
        score += 0.15

    # Hedging reduces authority
    weak = ["might", "possibly", "unclear", "cannot be determined", "it seems", "likely"]
    for w in weak:
        if w in t:
            score -= 0.15

    return max(0.0, min(1.0, score))

df["confidence"] = df["llm_answer"].apply(estimate_confidence)

df.groupby("prompt_type")["confidence"].mean()


prompt_type
confident    0.108542
guess        0.081208
normal       0.083667
Name: confidence, dtype: float64