In [None]:
# ======================================================================
# WBL 2026 – CANADA LEGAL ANSWER PIPELINE WITH EVALUATOR (GPT-5-MINI)
# ======================================================================

import os
import ssl
import certifi
import httpx
import re
import pandas as pd
import datetime
import time
import uuid
import json
from openai import OpenAI

# ----------------------------------------------------------------------
# CONFIGURATION
# ----------------------------------------------------------------------

# Model used by the primary answer stage; override with the OPENAI_MODEL env var.
MODEL = os.getenv("OPENAI_MODEL", "gpt-5-mini")
# Model used by the evaluator stage; override with the OPENAI_EVAL_MODEL env var.
EVAL_MODEL = os.getenv("OPENAI_EVAL_MODEL", "gpt-5-mini")
# The secret is read from the environment so it never has to be pasted into
# (and saved with) the notebook. The startup check below rejects a missing key.
API_KEY = os.getenv("OPENAI_API_KEY", "")

MAX_OUTPUT_TOKENS = 4096   # NOTE(review): not referenced by the code in this cell
HTTP_TIMEOUT_SECS = 600    # timeout handed to the shared httpx client
MAX_RETRIES = 3            # default attempt count for exponential_backoff_retry
BACKOFF_FACTOR = 2         # wait before retry k is BACKOFF_FACTOR ** k seconds
CACHE_ENABLED = True       # toggles the in-memory answer cache

# JSON-lines logs: one record is appended per generated answer / evaluation.
LOG_FILE = "run_log_WBL_CANADA.jsonl"
EVAL_LOG_FILE = "eval_log_WBL_CANADA.jsonl"

# ----------------------------------------------------------------------
# INITIALIZE CLIENT
# ----------------------------------------------------------------------

# Fail fast when the key is absent or does not carry the expected "sk-" prefix.
if not (API_KEY and API_KEY.startswith("sk-")):
    raise RuntimeError("Missing or invalid OPENAI_API_KEY. Please provide a valid key.")

# TLS context backed by certifi's CA bundle; the httpx client built on it is
# handed to the OpenAI SDK so every request shares the same transport settings.
_ctx = ssl.create_default_context(cafile=certifi.where())
_http = httpx.Client(
    verify=_ctx,
    timeout=HTTP_TIMEOUT_SECS,
    follow_redirects=True,
)
client = OpenAI(http_client=_http, api_key=API_KEY)

print("OpenAI client initialized (GPT-5-mini).")

# ----------------------------------------------------------------------
# PROMPT DEFINITIONS
# ----------------------------------------------------------------------

# System prompt for legal-framework questions. get_prompt_for_section returns
# it for every section whose name does not contain "Policy Instrument".
# The "Output Format" labels (ANSWER / LAW/Website / LINK / SUMMARY) are what
# get_legal_answer_canada searches for when parsing the reply, so the label
# spelling here and in that parser must stay in sync.
PROMPT_LEGAL = """
You are a senior legal expert specializing in Canada's labor, equality, and family law.
Your task is to answer questions for the World Bank’s Women, Business and the Law (WBL) 2026 project.

Pillar I: LEGAL FRAMEWORK
Interpret questions strictly in the context of binding national laws and regulations.
Base answers on codified legislation and subordinate regulations in force as of 1 September 2024.
Exclude customary or religious law unless it is codified in an act or regulation.
When applicable, prioritize federal or Ontario laws that apply in the main business city (Toronto).

Relevant sources include:
- Justice Laws Website (https://laws-lois.justice.gc.ca)
- Canada Labour Code
- Employment Equity Act
- Canadian Human Rights Act
- Pay Equity Act
- Employment Insurance Act
- Government of Canada and Ontario official portals

If an exact statute cannot be identified, infer the most probable answer using the Canadian legal system and mention this in the summary.

### Output Format (MUST FOLLOW EXACTLY)
ANSWER: <Yes/No/N/A | number | short phrase>
LAW/Website: <primary law or regulation>
LINK: <official URL(s)>
SUMMARY: <≤200 words summarizing reasoning and citing authoritative sources>
"""

# System prompt for policy-instrument questions. get_prompt_for_section returns
# it when the section name contains "Policy Instrument".
# It uses the same four output labels as PROMPT_LEGAL so a single parser can
# handle replies produced under either prompt.
PROMPT_POLICY = """
You are a senior policy and governance expert on gender equality and labor markets in Canada.
Your task is to answer questions for the World Bank’s Women, Business and the Law (WBL) 2026 project.

Pillar II: POLICY INSTRUMENTS
Interpret questions in the context of instruments that support the implementation of laws.
Include national and provincial policies, action plans, monitoring institutions, enforcement agencies,
access-to-justice measures, and government programs or data collection systems in place as of 1 September 2024.

Focus on:
- Federal and provincial ministries responsible for gender equality, labor, and employment.
- Operational government initiatives and datasets (e.g., Employment and Social Development Canada, Statistics Canada, Labour Program, Department for Women and Gender Equality).
- Public programs or registries, including enforcement or access mechanisms.

If no operational instrument is identified, state so clearly.

### Output Format (MUST FOLLOW EXACTLY)
ANSWER: <Yes/No/N/A | short phrase>
LAW/Website: <main institution, plan, or official instrument>
LINK: <official URL(s)>
SUMMARY: <≤200 words explaining the reasoning and describing relevant institutional instruments>
"""

def get_prompt_for_section(section):
    """Select the system prompt that matches a questionnaire section.

    Args:
        section: Section label; it is converted with ``str()`` before testing.

    Returns:
        ``PROMPT_POLICY`` when the label contains "Policy Instrument",
        otherwise ``PROMPT_LEGAL``.
    """
    is_policy_section = "Policy Instrument" in str(section)
    return PROMPT_POLICY if is_policy_section else PROMPT_LEGAL

# ----------------------------------------------------------------------
# SUPPORT FUNCTIONS
# ----------------------------------------------------------------------

cache = {}

def cached_call(key, fn):
    """Return ``fn()`` memoised under ``key`` in the module-level ``cache``.

    Args:
        key: Hashable cache key.
        fn: Zero-argument callable producing the value on a cache miss.

    Returns:
        The cached value when caching is enabled and ``key`` is present,
        otherwise the fresh result of ``fn()`` (stored only when
        ``CACHE_ENABLED`` is true).
    """
    hit = CACHE_ENABLED and key in cache
    if hit:
        return cache[key]

    produced = fn()
    if not CACHE_ENABLED:
        return produced
    cache[key] = produced
    return produced

def exponential_backoff_retry(fn, max_retries=MAX_RETRIES, factor=BACKOFF_FACTOR):
    """Call ``fn`` until it succeeds or ``max_retries`` attempts have failed.

    Between failed attempts the function sleeps ``factor ** attempt`` seconds
    (attempt is zero-based, so the first wait is always 1 second). No sleep
    happens after the final attempt because nothing is left to wait for.

    Args:
        fn: Zero-argument callable to execute.
        max_retries: Total number of attempts to make.
        factor: Base of the exponential wait.

    Returns:
        Whatever ``fn()`` returns on its first successful call.

    Raises:
        RuntimeError: When every attempt raised. The last underlying exception
            is attached as ``__cause__`` so the real failure stays visible.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as e:
            last_error = e
            if attempt + 1 >= max_retries:
                # Out of attempts: report and fall through without sleeping.
                print(f"Attempt {attempt + 1}/{max_retries} failed: {e}. No retries left.")
                break
            wait = factor ** attempt
            print(f"Retry {attempt + 1}/{max_retries} after error: {e}. Waiting {wait}s.")
            time.sleep(wait)
    raise RuntimeError("Max retries exceeded.") from last_error

# Cue words per confidence tier, checked in order; the first tier that matches wins.
# Whole-word matching keeps "yes" from firing on "eyes" and "may" on "mayor".
_CONFIDENCE_TIERS = (
    (0.9, re.compile(r"\b(?:clearly|definitely|explicitly|yes)\b")),
    (0.7, re.compile(r"\b(?:likely|generally|appears|probably)\b")),
    (0.4, re.compile(r"\b(?:unclear|uncertain|may|not\s+specified|no\s+information)\b")),
)


def calibrate_confidence(text):
    """Map hedging / certainty cue words in ``text`` to a heuristic score.

    Args:
        text: Model output to inspect; may be empty or ``None``.

    Returns:
        ``0.0`` for empty input, ``0.9`` / ``0.7`` / ``0.4`` for the first
        matching cue tier (strong, moderate, weak), and ``0.5`` when no cue
        word is present. Matching is case-insensitive and on whole words.
    """
    if not text:
        return 0.0
    text_lower = text.lower()
    for score, cue_pattern in _CONFIDENCE_TIERS:
        if cue_pattern.search(text_lower):
            return score
    return 0.5

# ----------------------------------------------------------------------
# STAGE 1: PRIMARY ANSWER GENERATOR (A0)
# ----------------------------------------------------------------------

# Lookahead that ends a header field at the next known label (or end of text),
# so values that wrap onto following lines (e.g. several URLs) are kept whole.
_A0_FIELD_END = r"(?=\n[ \t]*(?:ANSWER|LAW(?:/Website)?|LINK|SUMMARY):|\Z)"


def _blank_a0_record(economy, section, reference, question, summary, latency):
    """Build the zero-confidence record returned when no usable text exists."""
    return {
        "run_id": str(uuid.uuid4()),
        "economy": economy,
        "section": section,
        "reference": reference,
        "question": question,
        "answer": "",
        "law_website": "",
        "link": "",
        "summary": summary,
        "confidence": 0.0,
        "latency_seconds": round(latency, 2),
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat()
    }


def _extract_a0_text(response):
    """Return the response text, or None when the response carries none.

    ``output_text`` (the attribute evaluate_answer also reads) is preferred;
    otherwise the first non-empty text part found in ``response.output`` wins.
    """
    aggregated = getattr(response, "output_text", None)
    if isinstance(aggregated, str) and aggregated.strip():
        return aggregated.strip()
    for item in getattr(response, "output", None) or []:
        for part in getattr(item, "content", None) or []:
            candidate = getattr(part, "text", None) or part
            if isinstance(candidate, str) and candidate.strip():
                # Returning here stops both loops at the first usable part.
                return candidate.strip()
    return None


def _parse_a0_fields(text_output):
    """Split the structured reply into answer / law_website / link / summary."""
    def grab(label_pattern):
        found = re.search(
            rf"{label_pattern}:[ \t]*(.*?){_A0_FIELD_END}", text_output, re.DOTALL
        )
        return found.group(1).strip() if found else ""

    # SUMMARY is the last field, so it simply runs to the end of the text.
    summary = re.search(r"SUMMARY:\s*(.*)", text_output, re.DOTALL)
    return {
        "answer": grab("ANSWER"),
        # The user prompt abbreviates the label to "LAW", so accept both spellings.
        "law_website": grab(r"LAW(?:/Website)?"),
        "link": grab("LINK"),
        "summary": summary.group(1).strip() if summary else text_output,
    }


def get_legal_answer_canada(section, reference, question, economy="Canada"):
    """Generate the primary (A0) structured answer for one questionnaire row.

    The section decides which system prompt is used. The API call is retried
    through ``exponential_backoff_retry``, the reply is parsed into its four
    labelled fields, and a successful record is cached and appended to
    ``LOG_FILE`` as one JSON line.

    Args:
        section: Questionnaire section label.
        reference: Question identifier; part of the cache key and log output.
        question: Question text sent to the model.
        economy: Economy name interpolated into the prompt.

    Returns:
        dict with keys ``run_id``, ``economy``, ``section``, ``reference``,
        ``question``, ``answer``, ``law_website``, ``link``, ``summary``,
        ``confidence``, ``latency_seconds`` and ``timestamp``. When the API
        fails or returns no text, the answer fields are empty, ``confidence``
        is 0.0 and ``summary`` holds a bracketed diagnostic; such records are
        neither cached nor logged.
    """
    cache_key = f"{economy}|{section}|{reference}"
    if CACHE_ENABLED and cache_key in cache:
        return cache[cache_key]

    system_prompt = get_prompt_for_section(section)
    user_prompt = f"""
Economy: {economy}
Section: {section}
Reference: {reference}

Question:
{question}

Task:
Provide the structured WBL-style response (ANSWER / LAW / LINK / SUMMARY)
based on applicable {economy} legal or policy frameworks.
"""

    def run_query():
        # Exceptions must propagate: the retry helper only retries when fn raises.
        start_time = time.time()
        response = client.responses.create(
            model=MODEL,
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            store=True
        )
        return response, time.time() - start_time

    try:
        response, latency = exponential_backoff_retry(run_query)
    except RuntimeError as err:
        # Raised by the retry helper once every attempt has failed.
        print(f"API call failed for {reference}: {err.__cause__ or err}")
        response, latency = None, 0

    if not response or not getattr(response, "output", None):
        print(f"Empty response object for {reference}.")
        return _blank_a0_record(
            economy, section, reference, question,
            "[No output received from model]", latency,
        )

    text_output = _extract_a0_text(response)
    if not text_output:
        print(f"No text content returned for {reference}.")
        return _blank_a0_record(
            economy, section, reference, question,
            "[Model returned no text]", latency,
        )

    print(f"\n--- RAW MODEL OUTPUT ({reference}) ---\n{text_output[:400]}\n-------------------------\n")

    parsed = _parse_a0_fields(text_output)
    result = {
        "run_id": str(uuid.uuid4()),
        "economy": economy,
        "section": section,
        "reference": reference,
        "question": question,
        "answer": parsed["answer"],
        "law_website": parsed["law_website"],
        "link": parsed["link"],
        "summary": parsed["summary"],
        "confidence": calibrate_confidence(text_output),
        "latency_seconds": round(latency, 2),
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat()
    }

    if CACHE_ENABLED:
        cache[cache_key] = result
    with open(LOG_FILE, "a", encoding="utf-8") as logf:
        json.dump(result, logf)
        logf.write("\n")
    return result

# ----------------------------------------------------------------------
# STAGE 2: EVALUATOR (B5)
# ----------------------------------------------------------------------

# Labels the evaluator is told to emit, in the order given in the prompt.
_EVAL_FIELD_LABELS = (
    "VERDICT",
    "JUSTIFICATION",
    "CORRECTED_ANSWER",
    "CORRECTED_LAW/Website",
    "CORRECTED_LINK",
    "CORRECTED_SUMMARY",
    "EVAL_CONFIDENCE",
)


def _extract_eval_text(response):
    """Return the evaluator's text, or None when the response carries none."""
    aggregated = getattr(response, "output_text", None)
    if aggregated:
        return aggregated.strip()
    for item in getattr(response, "output", None) or []:
        # Output items may be plain dicts or SDK objects; handle both shapes.
        if isinstance(item, dict):
            parts = item.get("content") or []
        else:
            parts = getattr(item, "content", None) or []
        for part in parts:
            if isinstance(part, dict):
                text_val = part.get("text")
            else:
                text_val = getattr(part, "text", None)
            if text_val:
                return text_val.strip()
    return None


def _split_eval_fields(text):
    """Split evaluator text into a ``{label: value}`` dict.

    Only the known labels act as delimiters, so uppercase tokens or URLs
    inside a value (``ESA:``, ``https:``) no longer truncate it, and the
    ``CORRECTED_LAW/Website`` label is recognised as a boundary like the rest.
    Labels are matched case-insensitively and may appear mid-line.
    """
    canonical = {label.upper(): label for label in _EVAL_FIELD_LABELS}
    alternation = "|".join(re.escape(label) for label in _EVAL_FIELD_LABELS)
    marker = re.compile(rf"(?<![A-Za-z_])({alternation})[ \t]*:", re.IGNORECASE)

    hits = list(marker.finditer(text))
    fields = {}
    for pos, hit in enumerate(hits):
        label = canonical[hit.group(1).upper()]
        stop = hits[pos + 1].start() if pos + 1 < len(hits) else len(text)
        # Keep the first occurrence if the model repeats a label.
        fields.setdefault(label, text[hit.end():stop].strip())
    return fields


def _parse_eval_confidence(raw, default=0.5):
    """Convert the EVAL_CONFIDENCE field to a float clamped to [0, 1]."""
    try:
        value = float(raw)
    except (TypeError, ValueError):
        # Tolerate decorated values such as "0.9 (high)" or "85%".
        found = re.search(r"\d*\.?\d+", raw or "")
        if not found:
            return default
        value = float(found.group())
        if "%" in raw and value > 1:
            value /= 100.0
    if value != value:  # NaN never equals itself
        return default
    return max(0.0, min(1.0, value))


def evaluate_answer(a0_result):
    """Ask the evaluator model to grade one primary (A0) answer.

    Args:
        a0_result: Record produced by ``get_legal_answer_canada``; the keys
            ``economy``, ``reference``, ``question``, ``answer``,
            ``law_website``, ``link`` and ``summary`` are read.

    Returns:
        dict with ``eval_run_id``, ``reference``, ``a0_run_id``, ``verdict``,
        ``justification``, ``corrected_answer``, ``corrected_law_website``,
        ``corrected_link``, ``corrected_summary``, ``eval_confidence``
        (float in [0, 1], 0.5 when unparsable), ``eval_latency_seconds`` and
        ``eval_timestamp``. Returns ``None`` when the API call fails after
        retries or yields no text. Successful records are appended to
        ``EVAL_LOG_FILE`` as JSON lines.
    """
    economy = a0_result["economy"]
    reference = a0_result["reference"]

    EVAL_PROMPT = f"""
You are an independent senior legal evaluator for the World Bank’s Women, Business and the Law project.

Evaluate the correctness, legal validity, and completeness of the AI-generated answer (A0) below for {economy}.

Question: {a0_result["question"]}
A0 Answer: {a0_result["answer"]}
Law/Website: {a0_result["law_website"]}
Link: {a0_result["link"]}
Summary: {a0_result["summary"]}

Assess the legal reasoning and citations against authoritative Canadian sources.
If the cited laws are correct and current as of 1 September 2024, label as “Correct”.
Otherwise, classify the issue precisely.

### Output Format (STRICT)
VERDICT: Correct / Incorrect / Insufficient Evidence / Outdated Law
JUSTIFICATION: <≤100 words referencing specific sources or reasoning>
CORRECTED_ANSWER: <if applicable, otherwise blank>
CORRECTED_LAW/Website: <if applicable, otherwise blank>
CORRECTED_LINK: <if applicable, otherwise blank>
CORRECTED_SUMMARY: <if applicable, otherwise blank>
EVAL_CONFIDENCE: <float between 0 and 1>
"""

    def run_eval():
        # Let exceptions propagate so the retry helper can actually retry.
        started = time.time()
        resp = client.responses.create(
            model=EVAL_MODEL,
            input=[
                {
                    "role": "system",
                    "content": "You are a legal evaluator who must return structured text following the output format exactly."
                },
                {"role": "user", "content": EVAL_PROMPT},
            ],
            store=True
        )
        return resp, time.time() - started

    try:
        # Same retry policy as the Stage 1 generator.
        response, latency = exponential_backoff_retry(run_eval)
    except RuntimeError as err:
        print(f"Evaluator API call failed for {reference}: {err.__cause__ or err}")
        return None

    text_output = _extract_eval_text(response)
    if not text_output:
        print(f"No evaluator text output for {reference}.")
        print(f"Raw evaluator response: {response}")
        return None

    print(f"\n--- RAW EVALUATOR OUTPUT ({reference}) ---\n{text_output[:400]}\n-------------------------\n")

    fields = _split_eval_fields(text_output.replace("\r", ""))

    result = {
        "eval_run_id": str(uuid.uuid4()),
        # Identifiers that let an eval log line be joined back to its A0 record.
        "reference": reference,
        "a0_run_id": a0_result.get("run_id", ""),
        "verdict": fields.get("VERDICT", ""),
        "justification": fields.get("JUSTIFICATION", ""),
        "corrected_answer": fields.get("CORRECTED_ANSWER", ""),
        "corrected_law_website": fields.get("CORRECTED_LAW/Website", ""),
        "corrected_link": fields.get("CORRECTED_LINK", ""),
        "corrected_summary": fields.get("CORRECTED_SUMMARY", ""),
        "eval_confidence": _parse_eval_confidence(fields.get("EVAL_CONFIDENCE", "")),
        "eval_latency_seconds": round(latency, 2),
        "eval_timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat()
    }

    with open(EVAL_LOG_FILE, "a", encoding="utf-8") as f:
        json.dump(result, f)
        f.write("\n")
    return result

# ----------------------------------------------------------------------
# EXECUTION PIPELINE
# ----------------------------------------------------------------------

# Input workbook; the code below reads its "Section", "Reference" and
# "Question" columns.
excel_path = r"C:\Users\wb611279\OneDrive - WBG\Desktop\AI Test\Clean_Fin_Extracted_WBL_Questions_Labor_FinalCleaned.xlsx"
df = pd.read_excel(excel_path)

# Result columns hold a mix of strings and floats, so each one is forced to
# object dtype. A pre-existing blank column loads as float64 and a freshly
# created "" column may get a string dtype; either can warn or fail when a
# value of the other type is assigned row by row.
for col in [
    "Economy", "ANSWER", "LAW/Website", "LINK", "SUMMARY", "CONFIDENCE",
    "LATENCY_SECONDS", "RUN_ID",
    "EVAL_VERDICT", "EVAL_JUSTIFICATION", "EVAL_CORRECTED_ANSWER",
    "EVAL_CORRECTED_LAW/Website", "EVAL_CORRECTED_LINK",
    "EVAL_CORRECTED_SUMMARY", "EVAL_CONFIDENCE", "EVAL_LATENCY_SECONDS"
]:
    if col not in df.columns:
        df[col] = pd.Series("", index=df.index, dtype=object)
    else:
        df[col] = df[col].astype(object)

for idx, row in df.iterrows():
    raw_question = row["Question"]
    # A blank Excel cell arrives as NaN and str(NaN) is "nan", so missing
    # values must be detected before the string conversion.
    if pd.isna(raw_question) or not str(raw_question).strip():
        continue

    section = str(row["Section"])
    reference = str(row["Reference"])
    question = str(raw_question)

    print(f"\nProcessing {reference} ({section[:40]})")

    # Stage 1: primary answer.
    a0 = get_legal_answer_canada(section, reference, question)
    if not a0:
        continue

    df.loc[idx, "Economy"] = a0["economy"]
    df.loc[idx, "ANSWER"] = a0["answer"]
    df.loc[idx, "LAW/Website"] = a0["law_website"]
    df.loc[idx, "LINK"] = a0["link"]
    df.loc[idx, "SUMMARY"] = a0["summary"]
    df.loc[idx, "CONFIDENCE"] = a0["confidence"]
    df.loc[idx, "LATENCY_SECONDS"] = a0["latency_seconds"]
    df.loc[idx, "RUN_ID"] = a0["run_id"]

    # Stage 2: evaluation; the EVAL_* cells stay blank when it returns None.
    eval_result = evaluate_answer(a0)
    if eval_result:
        df.loc[idx, "EVAL_VERDICT"] = eval_result["verdict"]
        df.loc[idx, "EVAL_JUSTIFICATION"] = eval_result["justification"]
        df.loc[idx, "EVAL_CORRECTED_ANSWER"] = eval_result["corrected_answer"]
        df.loc[idx, "EVAL_CORRECTED_LAW/Website"] = eval_result["corrected_law_website"]
        df.loc[idx, "EVAL_CORRECTED_LINK"] = eval_result["corrected_link"]
        df.loc[idx, "EVAL_CORRECTED_SUMMARY"] = eval_result["corrected_summary"]
        df.loc[idx, "EVAL_CONFIDENCE"] = eval_result["eval_confidence"]
        df.loc[idx, "EVAL_LATENCY_SECONDS"] = eval_result["eval_latency_seconds"]

# Timestamped output names keep earlier runs from being overwritten.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
excel_out = excel_path.replace(".xlsx", f"_CANADA_with_Evals_{timestamp}.xlsx")
csv_out = excel_out.replace(".xlsx", ".csv")

df.to_excel(excel_out, index=False)
df.to_csv(csv_out, index=False, encoding="utf-8-sig")

print(f"\nCompleted. Results saved to:\n{excel_out}\n{csv_out}")


OpenAI client initialized (GPT-5-mini).

Processing WBL_1_1 (Section 1. Work - Legal Framework)

--- RAW MODEL OUTPUT (WBL_1_1) ---
ANSWER: Yes
LAW/Website: Ontario Human Rights Code; Canada Human Rights Act (federal)
LINK: https://www.ontario.ca/laws/statute/90h19
https://laws-lois.justice.gc.ca/eng/acts/h-6/
SUMMARY: Both provincial and federal human‑rights statutes prohibit discrimination in recruitment/employment on the bases asked. Ontario’s Human Rights Code (R.S.O. 1990, c. H.19) bars discrimination in employment becaus
-------------------------


--- RAW EVALUATOR OUTPUT (WBL_1_1) ---
VERDICT: Correct
JUSTIFICATION: Ontario Human Rights Code (R.S.O. 1990, c. H.19) expressly prohibits employment discrimination on the grounds of age, marital status and family status (commonly used to cover parental responsibilities). The Canada Human Rights Act likewise prohibits discrimination in federally regulated employment on grounds including age and marital status, and federal tribunals ha