In [None]:
import csv, json, re, pathlib, math
import pandas as pd
import llamabot as lmb

# Based on our discussions with Dr. Mendible, we decided it would be best to use smaller models to filter down the information,
# with the output of each smaller model acting as a filter for the next step in the pipeline.
# Dr. Mendible suggested generating the output in JSON format to make it easier to parse.


# --- Config ---
# Model identifiers passed as `model_name` when the bots are built further down.
# All three currently name the same model; INCIDENT_MODEL is only referenced by
# the commented-out incident bot.
CLASS_MODEL   = "ollama_chat/llama3.1:8b"
INCIDENT_MODEL = "ollama_chat/llama3.1:8b"
EXTRACT_MODEL  = "ollama_chat/llama3.1:8b"

# --- Input CSV ---
# Must have a 'text' column with the complaint text.
# The pipeline also reads optional 'name', 'department' and 'url' columns if present.
# The path is relative, so it is resolved against the current working directory.
CSV_PATH = pathlib.Path("manually_cleaned_police_reports_small.csv")

# --- Prompts ---
# Prompt given to the location-extraction bot (extract_bot).
# It requests a JSON object with the keys: location, confidence, rationale.
SYSTEM_PROMPT = """You extract the most specific geolocation where a police complaint incident occurred.
Return only JSON with keys: location, confidence, rationale.
- location MUST be formatted as: "Street(s), City, State". If no street is given, return "City, State" only.
- Choose the incident location (where it happened), not office or mailing addresses.
- If multiple places appear, pick the most specific place where the incident occurred.
- Be conservative: if streets are unclear or missing, use just "City, State".
- If you know the state, use the USPS two-letter code (e.g., MN)."""

# Prompt given to the document-type classifier bot (type_bot).
# It requests a JSON object with boolean is_article / is_court_case flags plus a
# rationale, and embeds two worked examples (one article, one court case).
TYPE_PROMPT = """Classify the text. Return JSON: {"is_article": true/false, "is_court_case": true/false, "rationale": "..."}.
Rules:
- "article" = news/blog/reporting prose describing a real-world event (time/place/people involved).
- "court_case" = opinions, dockets, case numbers, motions, legal filings.
Return ONLY JSON.
EXAMPLE TEXT:
"According to witnesses, the stop happened at 1515 Nicollet Ave, Minneapolis, MN at 8:30 p.m."
EXPECTED JSON:
{"is_article": true, "is_court_case": false, "rationale": "Narrative report of an event with a street address."}

EXAMPLE TEXT:
"STATE v. DOE, No. A23-0123, 2024 WL 123456 (Minn. Ct. App.). We reverse and remand."
EXPECTED JSON:
{"is_article": false, "is_court_case": true, "rationale": "Legal citation and opinion language."}
"""

# Incident detection prompt
# David brought up a good point that some articles may be present in more than one complaint.
# INCIDENT_PROMPT = """Does this text describe a police-complaint incident (an event that happened)?
# Return ONLY JSON: {"has_incident": true/false, "rationale": "..."}.
# EXAMPLE TEXT:
# "Witnesses say the stop happened outside 1515 Nicollet Ave, Minneapolis, MN."
# EXPECTED JSON:
# {"has_incident": true, "rationale": "Specific stop at a street address."}

# EXAMPLE TEXT:
# "This opinion discusses qualified immunity in general terms."
# EXPECTED JSON:
# {"has_incident": false, "rationale": "No concrete event; discussion only."}
# """

# Worked input/output pairs shown to the extraction model as few-shot examples.
# Each entry pairs a complaint snippet ("text") with the answer we expect ("json").
INPUT_OUTPUT_EXAMPLE = [
    {
        "text": "Officer stopped me by 5th Ave & Hennepin in Minneapolis. We later went to the precinct.",
        "json": {
            "location": "5th Ave & Hennepin, Minneapolis, MN",
            "confidence": 0.86,
            "rationale": "Stop occurred at the intersection in Minneapolis.",
        },
    },
    {
        "text": "Complaint about rough handling at a downtown bar in St. Paul; no street mentioned nor state.",
        "json": {
            "location": "St. Paul, MN",
            "confidence": 0.72,
            "rationale": "No street given; city/state only.",
        },
    },
]


# --- Helper functions ---
def build_prompt(complaint_text: str) -> str:
    """Assemble the user prompt for the location-extraction bot.

    The prompt is a fixed intro line, every pair from INPUT_OUTPUT_EXAMPLE
    rendered as an "EXAMPLE TEXT / EXPECTED JSON" section, the text to
    analyse, and a closing reminder of the required JSON shape.

    Args:
        complaint_text: The text the model should extract a location from.

    Returns:
        The complete prompt string.
    """
    shots = []
    for example in INPUT_OUTPUT_EXAMPLE:
        expected = json.dumps(example["json"])
        shots.append(f"EXAMPLE TEXT:\n{example['text']}\nEXPECTED JSON:\n{expected}")
    demo_block = "\n".join(shots)

    return (
        "You will be given an article about a police incident.\n"
        f"{demo_block}\n"
        "\n"
        "NOW EXTRACT FOR THIS TEXT:\n"
        f"{complaint_text}\n"
        "\n"
        'Return ONLY JSON: {"location": "...", "confidence": <0..1>, "rationale": "..."}.\n'
    )

# Due to the variability in how locations are reported, we should normalize them a bit.
# This is a simple normalization; more could be added as needed.
# Takes a location string and returns a cleaned-up version.
def normalize(loc: str) -> str:
    """Return *loc* tidied up for output.

    Runs of whitespace collapse to a single space, every comma ends up with no
    space before it and exactly one after it, and leading/trailing spaces and
    ``,.;:`` punctuation are removed.

    Args:
        loc: Raw location string; ``None`` or an empty string yields "".

    Returns:
        The cleaned-up location string.
    """
    # Collapse all whitespace (tabs, newlines, repeated spaces) to single spaces.
    cleaned = re.sub(r"\s+", " ", loc or "")
    # Exactly one space after each comma, none before it.
    cleaned = re.sub(r"\s*,\s*", ", ", cleaned)
    # Trim last: trimming before collapsing would let a tab or newline that sat
    # next to stripped punctuation come back as a leading or trailing space.
    return cleaned.strip(",.;: ")


# --- Robust JSON parsing ---
def strip_fences(s: str) -> str:
    """Remove a surrounding Markdown code fence from a model reply.

    The fence is removed only when the stripped text both starts and ends with
    three backticks; an optional ``json`` tag after the opening fence is
    dropped too. Anything else is returned stripped but otherwise unchanged.

    Args:
        s: Model reply; non-strings are converted with ``str()``.

    Returns:
        The reply without the enclosing fence.
    """
    s = str(s).strip()
    if s.startswith("```") and s.endswith("```"):
        s = re.sub(r"^```(?:json)?\s*", "", s, flags=re.DOTALL)
        s = re.sub(r"\s*```$", "", s, flags=re.DOTALL)
    return s


# Regex for the first {...} span. Kept for backward compatibility only:
# parse_json no longer uses it, because the non-greedy match stops at the first
# "}" and therefore truncates nested objects and strings that contain "}".
JSON_PAT = re.compile(r"\{.*?\}", re.DOTALL)

# Shared decoder; raw_decode() parses one JSON value and ignores trailing text.
_JSON_DECODER = json.JSONDecoder()


def _loads_dict(text: str):
    """Parse *text* as JSON and return it only if it is an object, else None."""
    try:
        value = json.loads(text)
    except (ValueError, RecursionError):
        return None
    return value if isinstance(value, dict) else None


def _first_json_object(text: str):
    """Return the first decodable JSON object embedded in *text*, or None."""
    start = text.find("{")
    while start != -1:
        try:
            value, _ = _JSON_DECODER.raw_decode(text, start)
        except (ValueError, RecursionError):
            value = None
        if isinstance(value, dict):
            return value
        # This "{" did not open a valid object; try the next one.
        start = text.find("{", start + 1)
    return None


def parse_json(s: str) -> dict:
    """Best-effort conversion of a model reply into a dict.

    Strategies, in order:
      1. Parse the fence-stripped reply as JSON.
      2. Coerce Python-style literals (True/False/None, and single quotes when
         the reply contains no double quotes) to JSON and parse again.
      3. Decode the first JSON object embedded in surrounding prose, trying
         the untouched text before the coerced text so string values are not
         altered needlessly.

    Args:
        s: Raw model reply (any object; it is converted with ``str()``).

    Returns:
        The parsed object, or ``{}`` when nothing usable is found. The result
        is always a dict, so callers can safely use ``.get``; non-object JSON
        such as a bare list or number is never returned.
    """
    txt = strip_fences(s).strip()

    # 1) Try raw parse
    parsed = _loads_dict(txt)
    if parsed is not None:
        return parsed

    # 2) Coerce Python-style to JSON-style (True/False/None, single quotes)
    coerced = re.sub(r"\bTrue\b", "true", txt)
    coerced = re.sub(r"\bFalse\b", "false", coerced)
    coerced = re.sub(r"\bNone\b", "null", coerced)
    if coerced.startswith("{") and '"' not in coerced:
        coerced = coerced.replace("'", '"')
    parsed = _loads_dict(coerced)
    if parsed is not None:
        return parsed

    # 3) Fallback: first complete {...} object embedded in the reply
    for candidate in (txt, coerced):
        parsed = _first_json_object(candidate)
        if parsed is not None:
            return parsed

    return {}

# Convert various inputs to boolean
def to_bool(x):
    """Interpret a loosely-typed value as a boolean.

    Args:
        x: A bool, number, string, or anything else.

    Returns:
        - bools unchanged;
        - numbers: True when non-zero, except NaN, which is False because it
          means "no value" rather than an affirmative answer;
        - strings: True for "true", "yes", "y" or "1" (case-insensitive,
          surrounding whitespace ignored);
        - everything else (None, lists, ...): False.
    """
    if isinstance(x, bool):
        return x
    if isinstance(x, float) and math.isnan(x):
        # `nan != 0` is True, so NaN needs its own guard before the numeric check.
        return False
    if isinstance(x, (int, float)):
        return x != 0
    if isinstance(x, str):
        return x.strip().lower() in {"true", "yes", "y", "1"}
    return False

# Extract message content from various possible return types
def msg_content(x) -> str:
    """Return the payload of a bot reply.

    If *x* has a ``content`` attribute (per the original note, lmb.SimpleBot
    replies are Message-like and carry the JSON there), that attribute is
    returned as-is. Otherwise the string form of *x* is returned.
    """
    as_text = str(x)
    return getattr(x, "content", as_text)

# --- CSV loader robust to encodings ---
# CSV files can be in various encodings; try several common ones.
# Loads a CSV file into a pandas DataFrame.
def load_csv(path: pathlib.Path) -> pd.DataFrame:
    """Load a CSV into a DataFrame, trying several common encodings in turn.

    Each encoding is tried with strict decoding, so a file that is not valid
    in one encoding moves on to the next instead of being read with U+FFFD
    replacement characters. Malformed rows are skipped
    (``on_bad_lines="skip"``).

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame from the first encoding that decodes cleanly.

    Raises:
        UnicodeError / pandas.errors.ParserError: the error from the last
            attempt when every encoding fails.
        OSError: immediately, if the file cannot be opened. Retrying with
            another encoding cannot help in that case.
    """
    encodings_to_try = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
    last_error = None
    for enc in encodings_to_try:
        try:
            return pd.read_csv(
                path,
                encoding=enc,
                # Strict on purpose: with "replace" the first encoding can never
                # fail, which would make the fallback list dead code.
                encoding_errors="strict",
                engine="python",
                on_bad_lines="skip",
            )
        except (UnicodeError, pd.errors.ParserError) as err:
            last_error = err
    raise last_error or ValueError("Failed to read CSV file.")

# --- Build the bots ---
# Each lmb.SimpleBot is given one of the prompt constants and a model name.
# Only two bots are active; the incident-detection bot is commented out because
# that step is currently skipped.
# type_bot classifies a text as article / court case.
type_bot     = lmb.SimpleBot(TYPE_PROMPT,     model_name=CLASS_MODEL)
# incident_bot = lmb.SimpleBot(INCIDENT_PROMPT, model_name=INCIDENT_MODEL)
# extract_bot pulls the incident location out of an article.
extract_bot  = lmb.SimpleBot(SYSTEM_PROMPT,   model_name=EXTRACT_MODEL)

# --- Run pipeline ---
def _clamp_confidence(value) -> float:
    """Coerce a model-reported confidence to a float in [0, 1]; 0.0 if unusable."""
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return 0.0
    # min(1.0, nan) returns 1.0, so an unchecked NaN would be recorded as full
    # confidence. Treat it as "no confidence" instead.
    if math.isnan(number):
        return 0.0
    return max(0.0, min(1.0, number))


df = load_csv(CSV_PATH)
if "text" not in df.columns:
    raise ValueError("CSV file does not contain a 'text' column. Check the input file.")

# Collect results here, then write to CSV at the end.
rows = []
# For progress tracking.
# NOTE(review): the notebook output shows model replies echoed to stdout by
# something other than this script, which interleaves with the progress lines.
total = len(df)

# Loop over each row in the DataFrame.
# Using iterrows() for simplicity; for large datasets, consider more efficient methods.
# enumerate() gives the row position, so the progress counter does not depend
# on the DataFrame index being 0..n-1.
for position, (idx, row) in enumerate(df.iterrows()):

    # Simple progress indicator: print every 10 rows
    if position % 10 == 0:
        print(f"\nProcessing {position}/{total}...")

    # Extract row data from DataFrame (optional columns fall back to defaults)
    name = row.get("name", idx)
    department = row.get("department", "")
    url = row.get("url", "")

    # Missing text (NaN) is treated as empty and skips both model calls.
    raw_text = row["text"]
    text = "" if pd.isna(raw_text) else str(raw_text).strip()

    # Defaults recorded when a step is skipped or fails.
    is_article = False
    is_court_case = False
    type_rationale = "empty_or_error"

    # has_incident = False
    # incident_rationale = "skipped_not_article"

    location = ""
    confidence = 0.0
    rationale = "skipped"

    if text:
        # 1) Classify article type
        try:
            tjson = parse_json(msg_content(type_bot(text))) or {}
            is_article    = to_bool(tjson.get("is_article", False))
            is_court_case = to_bool(tjson.get("is_court_case", False))
            type_rationale = str(tjson.get("rationale", ""))
        except Exception as e:
            # Best effort: one failing row must not abort the whole run.
            type_rationale = f"error: {e}"

        # 2) extract location if article
        # We decided to skip the incident detection step for now.
        if is_article:
            try:
                raw_ej = extract_bot(build_prompt(text))
                ej = parse_json(msg_content(raw_ej)) or {}
                location = normalize(ej.get("location", "") or "")
                confidence = _clamp_confidence(ej.get("confidence", 0.0))
                rationale = str(ej.get("rationale", "")) or "no_rationale"

                # DEBUG: make it impossible to miss
                print(f"EXTRACT[{idx}] location={repr(location)} conf={confidence} rationale={rationale[:80]}")
                # If you want to see exactly what the model gave you, uncomment:
                # print("RAW_EXTRACT:", str(raw_ej)[:300])
                # print("PARSED_EXTRACT:", ej)
            except Exception as e:
                # Best effort: record the failure in the output row and move on.
                rationale = f"error: {e}"
        else:
            rationale = "skipped_not_article"

    print('\n' + location)

    # Save results to rows
    rows.append({
        "name": name,
        "department": department,
        "url": url,
        "is_article": is_article,
        "is_court_case": is_court_case,
        "type_rationale": type_rationale,
        # "has_incident": has_incident,
        # "incident_rationale": incident_rationale,
        "location": location,
        "confidence": confidence,
        "rationale": rationale,
    })

# --- Save ---
out_csv = "police_report_locations.csv"
with open(out_csv, "w", newline="", encoding="utf-8") as f:
    # Column order of the output file; must match the keys appended to `rows`.
    cols = ["name","department","url",
            "is_article","is_court_case","type_rationale",
            # removed "has_incident","incident_rationale" since you aren't writing them
            "location","confidence","rationale"]
    w = csv.DictWriter(f, fieldnames=cols)
    w.writeheader()
    w.writerows(rows)

print("\nDone. Wrote", len(rows), "rows to", pathlib.Path(out_csv).resolve())



Processing 0/13...
{"is_article": true, "is_court_case": false, "rationale": "Narrative report of an event with a street address."}{
    "location": "Nicollet Ave, Minneapolis, MN",
    "confidence": 0.95,
    "rationale": "Stop occurred at a specific address in Minneapolis."
}EXTRACT[0] location='Nicollet Ave, Minneapolis, MN' conf=0.95 rationale=Stop occurred at a specific address in Minneapolis.

Nicollet Ave, Minneapolis, MN
{"is_article": false, "is_court_case": false, "rationale": "Government document with legislative file number and fiscal note."}

{"is_article": false, "is_court_case": false, "rationale": "Government document with formal language and specific details about a settlement."}

{"is_article": false, "is_court_case": false, "rationale": "Government document with administrative language and no legal citation or opinion language."}

{"is_article": false, "is_court_case": false, "rationale": "Government document with administrative language and no legal citation or opi