### Minutes of meetings metadata enrichment

In [11]:
import pandas as pd
# Remove the column-width limit so long cell values are displayed in full.
pd.set_option('display.max_colwidth', None)

In [37]:
import json
import re
from pathlib import Path
from datetime import datetime

# === CONFIG ===
# Manifest read by this cell (one JSON document per line). NOTE: Steps 3 and 4
# further down in this cell write their results back to this same path.
input_path = Path("../data/processed_register/document_manifest.jsonl")
# Intermediate snapshot written after Steps 1+2.
output_path = Path("../data/processed_register/document_manifest_with_meeting_names.jsonl")

# === HELPERS ===

def format_minutes_display_name(doc):
    """Build a display name for "previous meeting" minutes.

    Documents whose filename does not contain "previous" (case-insensitive)
    keep whatever ``display_name`` they already have.

    Args:
        doc: Manifest record (dict). Reads ``filename``, ``display_name``,
            ``meeting_date`` (expected as YYYY-MM-DD) and ``committee``.

    Returns:
        The existing ``display_name`` (``""`` if absent) for non-"previous"
        files; otherwise "Minutes of the meeting on <D Month YYYY>", followed
        by " – by <Committee>" when a committee is present.
    """
    # `or ""` also covers keys that are present but null in the JSON.
    filename = (doc.get("filename") or "").lower()
    if "previous" not in filename:
        return doc.get("display_name", "")

    raw_date = doc.get("meeting_date") or ""
    committee = (doc.get("committee") or "").replace("_", " ").title()

    try:
        parsed = datetime.strptime(raw_date, "%Y-%m-%d")
    except (TypeError, ValueError):
        # Unparseable or non-string date: show it verbatim, or a placeholder.
        date_str = raw_date or "Unknown date"
    else:
        # Day without zero padding, e.g. "5 February 2024".
        date_str = f"{parsed.day} {parsed:%B %Y}"

    parts = [f"Minutes of the meeting on {date_str}"]
    if committee:
        parts.append(f"by {committee}")
    return " – ".join(parts)

def extract_meeting_date_from_filename(filename):
    """Extract a meeting date from a filename and return it as YYYY-MM-DD.

    Supported shapes, tried in order:
      1. "30th-Nov-2023" / "4-Jan-2024" (optional ordinal suffix)
      2. "5 October 2023"
      3. "051023"   (ddmmyy, exactly six digits)
      4. "05.10.23" (dd.mm.yy)

    Args:
        filename: The document filename (may be empty or None).

    Returns:
        The ISO date string, or None when no pattern yields a valid date.
    """
    # The captured groups are joined with single spaces before parsing, so
    # every strptime format below must be space-separated as well.
    patterns = [
        (r"(\d{1,2})(?:st|nd|rd|th)?-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*-(\d{4})", "%d %b %Y"),
        (r"(\d{1,2})\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{4})", "%d %B %Y"),
        # Digit guards stop these from matching inside longer digit runs.
        (r"(?<!\d)(\d{2})(\d{2})(\d{2})(?!\d)", "%d %m %y"),
        (r"(?<!\d)(\d{2})\.(\d{2})\.(\d{2})(?!\d)", "%d %m %y"),
    ]
    text = filename or ""
    for pattern, fmt in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            continue
        try:
            parsed = datetime.strptime(" ".join(match.groups()), fmt)
        except ValueError:
            # Looked like a date but is not a valid one (e.g. 31 Feb); try the next shape.
            continue
        return parsed.strftime("%Y-%m-%d")
    return None

# === STEP 1 + STEP 2: Apply display name logic + update meeting_date from filename ===
# NOTE: for each document Step 1 runs before Step 2, so a display name built in
# Step 1 uses the manifest's meeting_date, not a filename-derived one.
processed_docs = []
with open(input_path, encoding="utf-8") as fin:
    for line in fin:
        if not line.strip():
            continue  # tolerate blank lines in the JSONL file
        doc = json.loads(line)
        filename = doc.get("filename", "")
        category = doc.get("document_category", "").lower()

        # Step 1: Apply custom display_name logic (only for 'minutes' with 'previous')
        if category == "minutes":
            doc["display_name"] = format_minutes_display_name(doc)

        # Step 2: Update meeting_date from filename if extractable
        extracted_date = extract_meeting_date_from_filename(filename)
        if extracted_date:
            doc["meeting_date"] = extracted_date

        processed_docs.append(doc)

# === WRITE intermediate file ===
with open(output_path, "w", encoding="utf-8") as fout:
    for doc in processed_docs:
        fout.write(json.dumps(doc) + "\n")

print(f"✅ Step 1+2 complete. Saved to: {output_path}")

# === STEP 3: Fill missing display_names with filename ===
final_docs = []
for doc in processed_docs:
    if not doc.get("display_name"):
        doc["display_name"] = doc.get("filename", "").replace(".pdf", "")
    final_docs.append(doc)

# === OVERWRITE FINAL VERSION TO INPUT_PATH ===
with open(input_path, "w", encoding="utf-8") as fout:
    for doc in final_docs:
        fout.write(json.dumps(doc) + "\n")

print("✅ Step 3 complete. Fallback display names set using filename where missing.")

# === STEP 4: Tag version and update display_name for minutes ===
version_suffixes = {
    "printed": " – Draft version",
    "approved": " – Approved version",
}
versioned_docs = []
for doc in final_docs:
    if doc.get("document_category", "").lower() == "minutes":
        filename = doc.get("filename", "").lower()

        # Determine version
        if "printed" in filename:
            doc["version"] = "printed"
        elif "previous" in filename or "minutes of the meeting" in filename:
            doc["version"] = "approved"
        else:
            doc["version"] = "unspecified"

        # Append the suffix only once. The result is written back to
        # input_path, so on a re-run the stored display_name may already end
        # with it; appending blindly would stack "– Draft version" repeatedly.
        suffix = version_suffixes.get(doc["version"])
        if suffix and "display_name" in doc and not doc["display_name"].endswith(suffix):
            doc["display_name"] += suffix

    versioned_docs.append(doc)

# === FINAL SAVE ===
with open(input_path, "w", encoding="utf-8") as fout:
    for doc in versioned_docs:
        fout.write(json.dumps(doc) + "\n")

print("✅ Step 4 complete. Versions tagged and display names updated for minutes.")



✅ Step 1+2 complete. Saved to: ../data/processed_register/document_manifest_with_meeting_names.jsonl
✅ Step 3 complete. Fallback display names set using filename where missing.
✅ Step 4 complete. Versions tagged and display names updated for minutes.


In [38]:
import json
import pandas as pd
from pathlib import Path

# === CONFIG ===
output_path = Path("../data/processed_register/document_manifest_with_meeting_names.jsonl")

# === LOAD DATA TO VIEW MEETING DATES FOUND ===
# Keep only the columns needed to eyeball the minutes records.
view_fields = ("filename", "committee", "meeting_date", "display_name")

with open(output_path) as f:
    minutes_data = [
        {field: doc.get(field, "") for field in view_fields}
        for doc in (json.loads(line) for line in f)
        if doc.get("document_category", "").lower() == "minutes"
    ]

# Convert to DataFrame
df_meeting_dates = pd.DataFrame(minutes_data)

df_meeting_dates.sort_values(by=["meeting_date"])

Unnamed: 0,filename,committee,meeting_date,display_name
5,Minutes of the meeting held on 5 October 2023.pdf,cabinet,2023-10-05,Minutes of the meeting held on 5 October 2023 – Approved version
3,Minutes of the meeting held on 30 November 2023.pdf,cabinet,2023-11-30,Minutes of the meeting held on 30 November 2023 – Approved version
4,Printed minutes 30th-Nov-2023 10.00 Cabinet.pdf,cabinet,2023-11-30,Printed minutes 30th-Nov-2023 10.00 Cabinet – Draft version
2,Printed minutes 04th-Jan-2024 10.00 Cabinet.pdf,cabinet,2024-01-04,Printed minutes 04th-Jan-2024 10.00 Cabinet – Draft version
9,Minutes of the meeting held on 4 January 2024.pdf,cabinet,2024-01-04,Minutes of the meeting held on 4 January 2024 – Approved version
8,Minutes of the meeting held on 25 January 2024.pdf,cabinet,2024-01-25,Minutes of the meeting held on 25 January 2024 – Approved version
40,Minutes of Previous Meeting.pdf,full_council,2024-02-19,Minutes of the meeting on 19 February 2024 – by Full Council
39,Printed minutes 19th-Feb-2024 09.30 County Council.pdf,full_council,2024-02-19,Printed minutes 19th-Feb-2024 09.30 County Council – Draft version
18,Minutes of the meeting held on 21 March 2024.pdf,cabinet,2024-03-21,Minutes of the meeting held on 21 March 2024 – Approved version
16,Printed minutes 21st-Mar-2024 10.00 Cabinet.pdf,cabinet,2024-03-21,Printed minutes 21st-Mar-2024 10.00 Cabinet – Draft version


### Enrich EQIAs in metadata

In [None]:
import pdfplumber
import re
import json
from pathlib import Path

# === CONFIG ===
pdf_root = Path("../data/council_documents")  # base directory with PDFs
# Manifest to enrich (read line by line as JSONL).
manifest_path = Path("../data/processed_register/document_manifest.jsonl")
# Destination for the manifest with EQIA fields merged in.
output_path = Path("../data/processed_register/document_manifest_with_EQIA.jsonl")

# === EQIA METADATA EXTRACTION ===
def extract_eqia_metadata(pdf_path):
    """Scrape EQIA (Equality Impact Assessment) fields from a PDF.

    Only the text of the first two pages is inspected.

    Args:
        pdf_path: Path to the PDF; ``pdf_path.name`` is used in the error message.

    Returns:
        A dict with ``document_category`` ("eqia"), ``eqia_title``,
        ``responsible_officer``, ``service_area``, ``decision_context`` and
        ``display_name`` (fields that were not found are empty strings), or
        None when the first 20 lines mention neither "equality impact" nor
        "eqia submission", or when reading the PDF raises (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` guards against pages for which no text is returned.
            first_text = "\n".join(p.extract_text() or "" for p in pdf.pages[:2])
            lines = first_text.splitlines()

            if not any("equality impact" in l.lower() or "eqia submission" in l.lower() for l in lines[:20]):
                return None  # Not an EQIA

            # === Extract EQIA Title ===
            # Prefer an explicit "EQIA Title: ..." label; otherwise fall back to the
            # first of the top 10 lines that mentions "eqia" and is longer than 20 chars.
            title = ""
            match = re.search(r"(EQIA Title|Equality Impact Assessment Title)\s*[:\-]?\s*(.+)", first_text, re.IGNORECASE)
            if match:
                title = match.group(2).strip()
            else:
                for line in lines[:10]:
                    if "eqia" in line.lower() and len(line.strip()) > 20:
                        title = line.strip()
                        break

            # === Extract Responsible Officer ===
            officer = ""
            match = re.search(r"Responsible Officer\s*[:\-]?\s*(.+)", first_text, re.IGNORECASE)
            if match:
                officer = match.group(1).strip()

            # === Extract Service or Directorate ===
            service = ""
            match = re.search(r"Directorate\s*[:\-]?\s*(.+)", first_text, re.IGNORECASE)
            if match:
                service = match.group(1).strip()

            # === Extract Purpose or Context ===
            # The first heading (in list order) that yields a match wins. The capture
            # runs until a blank line or one of the following section markers.
            context = ""
            for key in ["Aims and Objectives", "Background", "Objective"]:
                if key.lower() in first_text.lower():
                    m = re.search(fr"{key}\s*[:\-]?\s*(.+?)(?:\n\s*\n|Section B|Accountability|Directorate)", first_text, re.IGNORECASE | re.DOTALL)
                    if m:
                        context = re.sub(r"\s+", " ", m.group(1).strip())
                        break

            # Display name uses only the part of the title before the first colon.
            short_title = title.split(":")[0] if ":" in title else title
            display_name = f"Equality Impact Assessment – {short_title}" if short_title else "Equality Impact Assessment"

            return {
                "document_category": "eqia",
                "eqia_title": title,
                "responsible_officer": officer,
                "service_area": service,
                "decision_context": context,
                "display_name": display_name
            }

    except Exception as e:
        # Best-effort: report the failure and treat the file as "not an EQIA".
        print(f"[Error reading {pdf_path.name}]: {e}")
        return None

# === ENRICH MANIFEST ===
with open(manifest_path) as src, open(output_path, "w") as dst:
    for raw_line in src:
        doc = json.loads(raw_line)
        pdf_path = pdf_root / doc.get("path", "")

        # Only files that exist on disk and mention "eqia" in their filename
        # are opened; every record is written out either way.
        if pdf_path.exists() and "eqia" in doc.get("filename", "").lower():
            enriched = extract_eqia_metadata(pdf_path)
            if enriched:
                doc.update(enriched)

        dst.write(json.dumps(doc) + "\n")

print(f"✅ EQIA enrichment complete. Updated manifest saved to:\n{output_path}")


### Look up metadata from PRODs

In [None]:
import json
import re
from pathlib import Path
import pdfplumber

# === CONFIG ===
# Input is the manifest produced by the EQIA enrichment step.
manifest_path = Path("../data/processed_register/document_manifest_with_EQIA.jsonl")
# Destination for the manifest with decision metadata merged in.
output_path = Path("../data/processed_register/document_manifest_enriched.jsonl")
# Base directory that each document's relative "path" is resolved against.
pdf_root = Path("../data/council_documents")

def clean(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()

def extract_decision_metadata(pdf_path):
    """Scrape decision metadata from a "Proposed Record of Decision" (PROD) PDF.

    Only the first page is read, and most fields are searched for in its
    first 40 lines.

    Args:
        pdf_path: Path to the PDF; ``pdf_path.name`` is used in the error message.

    Returns:
        A dict with ``document_category`` ("prod"), ``decision_code`` (e.g.
        "24/00012", or None), ``title_of_decision``, ``decision_maker``,
        ``is_key_decision`` ("Yes"/"No"/"") and ``display_name``; or None when
        none of the first three lines contains "PROPOSED RECORD OF DECISION",
        or when reading the PDF raises (the error is printed).
    """
    # Kept local so the function does not depend on whichever module-level
    # `clean` happens to be defined in the notebook namespace.
    def clean(text):
        return re.sub(r"\s+", " ", text).strip()

    def is_junk_title(title):
        junk_titles = {
            "decision", "subject", "title of decision", "decision:", "2024-2025",
            "september 2024-2025", "october 2024-2025", "november 2024-2025"
        }
        return not title or title.strip().lower() in junk_titles

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # `or ""` mirrors extract_eqia_metadata: a first page without
            # extractable text is "not a PROD", not an error.
            first_page = pdf.pages[0].extract_text() or ""
            lines = first_page.splitlines()
            if not any("PROPOSED RECORD OF DECISION" in line.upper() for line in lines[:3]):
                return None  # Not a PROD

            text = "\n".join(lines[:40])  # limit to top portion

            # === Extract decision code (normalised to "YY/NNNNN")
            code_match = re.search(r"\b(2[0-5])[-/](\d{5})\b", text)
            decision_code = f"{code_match.group(1)}/{code_match.group(2)}" if code_match else None

            # === Extract title of decision
            title_of_decision = ""
            subject_match = re.search(r"(Subject Matter|Title of Decision):?\s*(.+)", text, re.IGNORECASE)
            if subject_match:
                line = subject_match.group(2).strip()
                # The label line carried no usable title: use the line that
                # follows the "title of decision" label instead.
                if not line or line.lower().startswith("/ title") or line.endswith(":"):
                    idx = next((i for i, l in enumerate(lines) if "title of decision" in l.lower()), None)
                    if idx is not None and idx + 1 < len(lines):
                        line = lines[idx + 1].strip()
                title_of_decision = re.sub(r"[:\-\s]+$", "", line)

            # === Extract decision maker
            decision_maker = ""
            role_match = re.search(r"(Cabinet Member for[^\n]*)", text)
            if role_match:
                decision_maker = clean(role_match.group(1))
            else:
                idx = next((i for i, l in enumerate(lines) if "DECISION TO BE TAKEN BY" in l.upper()), None)
                if idx is not None and idx + 1 < len(lines):
                    decision_maker = lines[idx + 1].strip()

            # === Key decision
            key_line = re.search(r"Key decision:\s*(Yes|No)", text, re.IGNORECASE)
            is_key_decision = key_line.group(1).capitalize() if key_line else ""

            # === Clean display name (placeholder titles are left out of it)
            title_for_display = "" if is_junk_title(title_of_decision) else title_of_decision
            if decision_code and title_for_display:
                display_name = f"Record of Decision ({decision_code}) {title_for_display}"
            elif decision_code:
                display_name = f"Record of Decision ({decision_code})"
            else:
                display_name = "Record of Decision"

            return {
                "document_category": "prod",
                "decision_code": decision_code,
                "title_of_decision": title_of_decision,
                "decision_maker": decision_maker,
                "is_key_decision": is_key_decision,
                "display_name": display_name
            }

    except Exception as e:
        # Best-effort: report the failure and leave the record unenriched.
        print(f"[Error] {pdf_path.name}: {e}")
        return None

# === MAIN PROCESS ===
with open(manifest_path) as fin, open(output_path, "w") as fout:
    for line in fin:
        doc = json.loads(line)
        lowered_name = doc.get("filename", "").lower()

        # Only PDFs whose filename mentions "prod" are candidates for scraping;
        # every record is written out exactly once regardless.
        if lowered_name.endswith(".pdf") and "prod" in lowered_name:
            pdf_path = pdf_root / doc["path"]
            if pdf_path.exists():
                scraped = extract_decision_metadata(pdf_path)
                if scraped:
                    doc.update(scraped)
            else:
                print(f"[Missing] {pdf_path}")

        fout.write(json.dumps(doc) + "\n")

print(f"\n✅ Enrichment complete. Output saved to: {output_path}")


### Meetings display names change

In [None]:
import json
import re
from pathlib import Path
from datetime import datetime

# === CONFIG ===
# Manifest to read (JSONL, one document per line).
input_path = Path("../data/processed_register/document_manifest.jsonl")
# Destination for the manifest with rewritten minutes display names.
output_path = Path("../data/processed_register/document_manifest_with_meetings.jsonl")

# === MINUTES NAME FORMATTER ===
def format_minutes_display_name(doc):
    """Build a display name for a minutes document.

    Args:
        doc: Manifest record (dict). Reads ``meeting_date`` (expected as
            YYYY-MM-DD), ``committee`` and ``num_pages``.

    Returns:
        "Minutes of the meeting on <D Month YYYY>", followed by
        " – by <Committee>" and " – <N> pages" when those values are present.
        An unparseable date is shown verbatim; a missing one as "Unknown date".
    """
    # `or ""` also covers keys that are present but null in the JSON.
    date = doc.get("meeting_date") or ""
    committee = (doc.get("committee") or "").replace("_", " ").title()
    pages = doc.get("num_pages", None)

    try:
        date_obj = datetime.strptime(date, "%Y-%m-%d")
    except (TypeError, ValueError):
        date_str = date or "Unknown date"
    else:
        # Built from .day rather than strftime("%-d"): the "-" flag is a
        # platform extension and is not available everywhere (e.g. Windows).
        date_str = f"{date_obj.day} {date_obj:%B %Y}"

    parts = [f"Minutes of the meeting on {date_str}"]
    if committee:
        parts.append(f"by {committee}")
    if pages:
        parts.append(f"{pages} pages")

    return " – ".join(parts)

# === PROCESS ===
with open(input_path) as src, open(output_path, "w") as dst:
    for raw_line in src:
        doc = json.loads(raw_line)
        is_minutes_doc = doc.get("document_category", "").lower() == "minutes"

        # Only minutes get a regenerated display name; other records pass through.
        if is_minutes_doc:
            doc["display_name"] = format_minutes_display_name(doc)

        dst.write(json.dumps(doc) + "\n")

print(f"✅ Updated minutes display names saved to: {output_path}")


### Changing display names for files to be more meaningful

In [17]:
import json
import re
from pathlib import Path
from datetime import datetime

# === CONFIG ===
# Manifest to read (JSONL, one document per line).
input_path = Path("../data/processed_register/document_manifest_with_display_names.jsonl")
# NOTE(review): this is the same output file as the "Meetings display names
# change" cell writes to, so whichever cell runs last determines its content.
output_path = Path("../data/processed_register/document_manifest_with_meetings.jsonl")

# === RULES & HELPERS ===
# Embedded PDF titles (lower-cased) that carry no information about the document.
GENERIC_TITLES = {
    "agenda template",
    "minutes",
    "the report",
    "executive decision",
    "powerpoint presentation",
    "kmccg powerpoint template title = arial, font 30, pantone nhs blue  author and date = arial, font 20, black",
}

def is_generic(title):
    """Return True when ``title``, trimmed and lower-cased, is a known generic title."""
    normalized = title.strip().lower()
    return normalized in GENERIC_TITLES

def clean(text):
    """Collapse whitespace runs to one space, trim, and remove a space before a full stop."""
    collapsed = re.sub(r"\s+", " ", text)
    trimmed = collapsed.strip()
    return trimmed.replace(" .", ".")

def extract_meeting_date_from_filename(filename):
    """Find a "day month year" date in a filename and format it for display.

    Matches shapes such as "28th-Nov-2024", "4-Jan-2024" and "5 October 2023"
    (case-insensitive; the ordinal suffix is optional).

    Args:
        filename: The document filename.

    Returns:
        The date as "D Month YYYY" (e.g. "4 January 2024"), or None when no
        date is found or the matched text is not a valid calendar date.
    """
    match = re.search(r"(\d{1,2})(st|nd|rd|th)?[-\s]?(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[-\s](\d{4})", filename, re.IGNORECASE)
    if not match:
        return None

    day, _, month, year = match.groups()
    try:
        dt = datetime.strptime(f"{day} {month} {year}", "%d %b %Y")
    except ValueError:
        # Matched the shape but not a real date (e.g. 31 Feb).
        return None

    # Built from .day rather than strftime("%-d"): the "-" flag is a platform
    # extension and is not available everywhere (e.g. Windows).
    return f"{dt.day} {dt:%B %Y}"  # e.g. 4 January 2024

def generate_display_name(doc):
    """Choose a human-readable display name for a manifest record.

    Rules are applied in order and the first one that produces a name wins:
      0. an existing non-blank ``display_name``;
      minutes: "Minutes of the meeting on <date> – by <Committee> – <N> pages";
      1. a non-generic embedded ``title``;
      2. "Minutes – <date>" when the filename mentions minutes and has a date;
      3. "Appendix <id> – <topic>" from the filename;
      4. the topic part of a "<23|24>-NNNNN - <topic>.pdf" filename;
      5. a fixed label inferred from filename keywords;
      6. the cleaned filename without ".pdf".

    Args:
        doc: Manifest record (dict).

    Returns:
        The display name as a string.
    """
    # Rule 0: Use enriched field if present (a null/non-string value counts as absent)
    existing = doc.get("display_name")
    if isinstance(existing, str) and existing.strip():
        return existing.strip()

    category = (doc.get("document_category") or "").lower()

    if category == "minutes":
        date = doc.get("meeting_date") or ""  # already in YYYY-MM-DD
        committee = (doc.get("committee") or "").replace("_", " ").title()
        pages = doc.get("num_pages", None)

        try:
            # Format date from YYYY-MM-DD to D MMMM YYYY (e.g., 28 November 2024).
            # Built from .day rather than strftime("%-d"): the "-" flag is a
            # platform extension and is not available everywhere (e.g. Windows).
            date_obj = datetime.strptime(date, "%Y-%m-%d")
            date_str = f"{date_obj.day} {date_obj:%B %Y}"
        except (TypeError, ValueError):
            date_str = date

        parts = [f"Minutes of the meeting on {date_str}"]
        if committee:
            parts.append(f"by {committee}")
        if pages:
            parts.append(f"{pages} pages")

        return " – ".join(parts)

    filename = doc.get("filename") or ""
    title = (doc.get("title") or "").strip()
    fname_lower = filename.lower()

    # Rule 1: Use title if it's meaningful
    if title and not is_generic(title):
        return clean(title)

    # Rule 2: Minutes with formatted date (records not categorised as minutes)
    if "minutes" in fname_lower:
        date_str = extract_meeting_date_from_filename(filename)
        if date_str:
            return f"Minutes – {date_str}"

    # Rule 3: Appendix number and topic
    appendix_match = re.search(r"Appendix ([A-Z0-9]+)[\s\-.]*([^\.]*)", filename, re.IGNORECASE)
    if appendix_match:
        appendix_id = appendix_match.group(1)
        appendix_title = appendix_match.group(2)
        return f"Appendix {appendix_id} – {clean(appendix_title)}" if appendix_title else f"Appendix {appendix_id}"

    # Rule 4: Decision code + topic
    match = re.match(r"(23|24)-\d{5} - (.+)\.pdf", filename)
    if match:
        return clean(match.group(2))

    # Rule 5: Keyword-based topic inference
    if "budget" in fname_lower or "finance monitoring" in fname_lower:
        return "Budget Monitoring Report"
    if "performance report" in fname_lower:
        return "Quarterly Performance Report"
    if "risk register" in fname_lower:
        return "Corporate Risk Register"
    if "eqia" in fname_lower:
        return "Equality Impact Assessment"

    # Rule 6: Fallback to cleaned filename
    return clean(filename.replace(".pdf", ""))

# === MAIN PROCESS ===
with open(input_path) as src, open(output_path, "w") as dst:
    for raw_line in src:
        doc = json.loads(raw_line)
        # Every record gets a display name (an existing one is kept by Rule 0).
        doc["display_name"] = generate_display_name(doc)
        dst.write(json.dumps(doc) + "\n")

print(f"✅ Done: Assigned display names saved to:\n{output_path}")


✅ Done: Assigned display names saved to:
../data/processed_register/document_manifest_with_meetings.jsonl


In [None]:
import json
import re
from pathlib import Path

# === CONFIG ===
# Manifest to scan (JSONL, one document per line).
register_path = Path("../data/processed_register/document_manifest.jsonl")

# === MATCH LOGIC ===
def looks_like_eqia(doc):
    """Return True when a record's text metadata mentions an EQIA.

    Checks ``filename``, ``title``, ``subject`` and ``keywords`` for "eqia" or
    "equality impact" (case-insensitive). The previous pattern searched for
    "meeting"/"minutes", so this function was in fact counting minutes.

    Args:
        doc: Manifest record (dict); missing or null fields are treated as empty.
    """
    text_fields = [
        doc.get("filename") or "",
        doc.get("title") or "",
        doc.get("subject") or "",
        doc.get("keywords") or ""
    ]
    # str() keeps the search safe if a field holds a non-string value.
    return any(re.search(r"eqia|equality\s+impact", str(field), re.IGNORECASE) for field in text_fields)

# === LOAD AND COUNT ===
with open(register_path) as f:
    all_docs = [json.loads(line) for line in f]

matches = [doc for doc in all_docs if looks_like_eqia(doc)]
total = len(all_docs)

# === OUTPUT ===
print(f"✅ Found {len(matches)} EQIA-type documents out of {total} total.")

# Optional: list filenames
for m in matches[:20]:  # show first 20
    print(f" - {m['filename']}")


In [18]:
import json
import re
import pandas as pd
from pathlib import Path

# CONFIG
# Manifest to scan (JSONL, one document per line).
register_path = Path("../data/processed_register/document_manifest.jsonl")

# MATCH LOGIC for minutes
def is_minutes(doc):
    """Return True when filename, title, subject or keywords contains the
    whole word "meeting" or "minutes" (case-insensitive)."""
    text_fields = [doc.get(key, "") for key in ("filename", "title", "subject", "keywords")]
    for field in text_fields:
        if re.search(r"\bmeeting\b|\bminutes\b", field, re.IGNORECASE):
            return True
    return False

# LOAD and filter
# Keep three columns for every record that looks like minutes.
with open(register_path) as f:
    matches = [
        {
            "filename": doc.get("filename", ""),
            "document_category": doc.get("document_category", ""),
            "display_name": doc.get("display_name", ""),
        }
        for doc in map(json.loads, f)
        if is_minutes(doc)
    ]

# Convert to DataFrame
df_minutes = pd.DataFrame(matches)

df_minutes

Unnamed: 0,filename,document_category,display_name
0,Printed minutes 28th-Nov-2024 10.00 Cabinet.pdf,minutes,
1,Minutes of the meeting held on 26 September 2024.pdf,minutes,
2,Printed minutes 04th-Jan-2024 10.00 Cabinet.pdf,minutes,
3,Minutes of the meeting held on 30 November 2023.pdf,minutes,
4,Printed minutes 30th-Nov-2023 10.00 Cabinet.pdf,minutes,
5,Minutes of the meeting held on 5 October 2023.pdf,minutes,
6,Printed minutes 26th-Sep-2024 10.00 Cabinet.pdf,minutes,
7,Minutes of the Meeting held on 11 July 2024.pdf,minutes,
8,Minutes of the meeting held on 25 January 2024.pdf,minutes,
9,Minutes of the meeting held on 4 January 2024.pdf,minutes,


In [19]:
import json
from pathlib import Path

# Adjust this to point to the actual metadata file (JSONL, one document per line)
manifest_path = Path("../data/processed_register/document_manifest.jsonl")

# Load the first 4 matching files that look like minutes
def is_minutes(doc):
    """Tell whether a record looks like meeting minutes.

    True when the whole word "meeting" or "minutes" (case-insensitive) occurs
    in the record's filename, title, subject or keywords.
    """
    candidates = [
        doc.get("filename", ""),
        doc.get("title", ""),
        doc.get("subject", ""),
        doc.get("keywords", ""),
    ]
    hits = (re.search(r"\bmeeting\b|\bminutes\b", text, re.IGNORECASE) for text in candidates)
    return any(hits)

# Read and collect metadata for first 4 minutes
metadata_samples = []
with open(manifest_path) as f:
    for line in f:
        doc = json.loads(line)
        if not is_minutes(doc):
            continue
        metadata_samples.append(doc)
        # Stop reading as soon as four samples have been collected.
        if len(metadata_samples) == 4:
            break

metadata_samples  # Return raw metadata for inspection


[{'filename': 'Printed minutes 28th-Nov-2024 10.00 Cabinet.pdf',
  'path': 'cabinet/2024-11-28/originals/Printed minutes 28th-Nov-2024 10.00 Cabinet.pdf',
  'type': 'pdf',
  'committee': 'cabinet',
  'meeting_date': '2024-11-28',
  'document_category': 'minutes',
  'url': 'https%3A//democracy.kent.gov.uk/documents/g9473/Printed%20minutes%2028th-Nov-2024%2010.00%20Cabinet.pdf?T=1',
  'created': '2025-05-07T23:47:44.344189',
  'source_metadata_file': 'cabinet/2024-11-28/metadata.json',
  'scraped': True,
  'doc_id': 'doc_ee829e41',
  'title': 'Minutes',
  'author': 'Fiona Treveil',
  'subject': '',
  'keywords': '',
  'producer': 'Aspose.Words for .NET 22.5.0',
  'creator': 'Microsoft Office Word',
  'creation_date': 'D:20040805123000Z',
  'mod_date': 'D:20241230102200Z',
  'num_pages': 16,
  'hash': '8d11f8314e5369edf271c3ffb39d2e09675fe1ee97ade64ed25886c3555f51d9',
  'status': 'ready_for_embedding',
  'chunk_path': 'data/council_documents/cabinet/2024-11-28/chunks/Printed minutes 28th-

In [None]:
import json
from pathlib import Path
import re

# Root folder that is walked as <committee>/<meeting folder>/...
BASE_DIR = Path("../data/council_documents/")
# JSONL file holding one record per meeting.
OUTPUT_FILE = Path("../data/events/meetings_metadata.jsonl")
# Make sure the output directory exists before anything is written.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

def slugify(text):
    """Lower-case ``text``, turn each run of characters other than a-z/0-9 into
    a single underscore, and drop leading/trailing underscores."""
    lowered = text.lower()
    underscored = re.sub(r"[^a-z0-9]+", "_", lowered)
    return underscored.strip("_")

# meeting_id -> record; insertion order is the order written to OUTPUT_FILE.
meetings_metadata = {}

# Load existing metadata if file exists
if OUTPUT_FILE.exists():
    with open(OUTPUT_FILE, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            record = json.loads(line)
            meetings_metadata[record["meeting_id"]] = record

# Walk the committee/date folders.
# sorted(): iterdir() yields entries in arbitrary order, which would make the
# order of newly added records differ from run to run.
for committee_folder in sorted(BASE_DIR.iterdir()):
    if not committee_folder.is_dir():
        continue

    committee_name = committee_folder.name
    committee_id = f"kent_cc__{slugify(committee_name)}"

    for meeting_folder in sorted(committee_folder.iterdir()):
        if not meeting_folder.is_dir():
            continue

        # The folder name is used as the meeting date as-is (not validated).
        date_str = meeting_folder.name
        summary_file = meeting_folder / "summary.txt"
        meeting_id = f"{date_str}_{committee_id}"
        folder_path = str(meeting_folder)

        # Load or create metadata (an existing record keeps its stored fields)
        record = meetings_metadata.get(meeting_id, {
            "meeting_id": meeting_id,
            "committee_id": committee_id,
            "meeting_date": date_str,
            "folder_path": folder_path
        })

        # Inject summary if available (explicit encoding, independent of locale)
        if summary_file.exists():
            record["summary"] = summary_file.read_text(encoding="utf-8").strip()

        meetings_metadata[meeting_id] = record

# Save to JSONL
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for rec in meetings_metadata.values():
        f.write(json.dumps(rec) + "\n")

print(f"✅ Extracted and saved {len(meetings_metadata)} meeting records to {OUTPUT_FILE}")

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import time
from tqdm import tqdm

# Profile pages are addressed as BASE_URL + <numeric speaker id>.
BASE_URL = "https://kent.public-i.tv/core/portal/speaker_profile/"
# Results are appended to this JSONL file.
# NOTE(review): the file name suggests ids 32000-55000, but the loop below
# only covers range(34500, 34600) — confirm which is intended.
OUTPUT_JSONL = "../data/jsons/new_speaker_profiles_32000_55000.jsonl"

def scrape_speaker_profile(url):
    """Fetch one speaker profile page and extract the speaker's name.

    Returns:
        ``{"profile_url": url, "name": <heading text>}``, or None when the
        response status is not 200, the name heading is missing, or the
        heading contains "Error finding profile".
    """
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        name_tag = soup.select_one("h2.cs_heading_font_family")
        # Error pages and placeholders are skipped.
        if name_tag and "Error finding profile" not in name_tag.text:
            return {
                "profile_url": url,
                "name": name_tag.text.strip(),
            }
    return None

with open(OUTPUT_JSONL, "a", encoding="utf-8") as f:
    for speaker_id in tqdm(range(34500, 34600)):
        url = f"{BASE_URL}{speaker_id}"
        try:
            profile = scrape_speaker_profile(url)
            if profile:
                f.write(json.dumps(profile, ensure_ascii=False) + "\n")
        except Exception as exc:
            # Best-effort scrape: report the failure and move on. Catching
            # Exception (not a bare except) lets a kernel interrupt stop the loop.
            print(f"[Error] {url}: {exc}")
        # Rate-limit every request, including failed ones, so that errors do
        # not turn into rapid-fire retries against the server.
        time.sleep(3)

## Minutes of Meeting (MoM) parsing – Full Council (FC)

In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader

# Path to the minutes PDF to parse (hard-coded; edit it to parse another meeting)
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")

# Extract raw text from the PDF
def extract_text(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    Pages for which no text is returned contribute an empty string.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if path.exists():
        reader = PdfReader(str(path))
        page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts)
    raise FileNotFoundError(f"PDF file not found: {path}\nPlease confirm the filename and ensure it's uploaded.")

try:
    full_text = extract_text(PDF_PATH)
except FileNotFoundError as e:
    # Keep the cell runnable: report the problem and continue with no text.
    print(e)
    full_text = ""

# Define regex for splitting into chunks by agenda items.
# Zero-width lookahead anchored at a line start, so the text is cut exactly
# once, right before each heading. (An optional leading "\n" inside the
# lookahead would match both before and after the newline and leave a stray
# "\n" chunk between items.)
agenda_item_pattern = r"(?m)(?=^\d{3}\..+?\n\(Item \d+\))"  # matches e.g. '295. Chairman's Announcements\n(Item 5)'

# Preprocess to remove excessive newlines
cleaned_text = re.sub(r"\n{2,}", "\n", full_text)

# Split the text
chunks = re.split(agenda_item_pattern, cleaned_text) if cleaned_text else []

# First chunk is the preamble (attendance, apologies, declarations)
preamble = chunks[0] if chunks else ""

# Remaining chunks are the agenda items
agenda_chunks = chunks[1:] if len(chunks) > 1 else []

# Pair each chunk with its item number and title (e.g. 'Item 6 – Questions')
item_pattern = r"(\d{3}\..+?)\n\(Item (\d+)\)"

def label_chunks(chunks):
    """Return ``(label, stripped_chunk)`` for every chunk.

    The label is "Item <n> – <heading>" when ``item_pattern`` is found in the
    chunk, otherwise "Unlabeled".
    """
    def _label_for(chunk):
        found = re.search(item_pattern, chunk)
        if found is None:
            return "Unlabeled"
        return f"Item {found.group(2)} – {found.group(1).strip()}"

    return [(_label_for(chunk), chunk.strip()) for chunk in chunks]

labeled_chunks = [("Preamble", preamble.strip())] + label_chunks(agenda_chunks)

# Display the labels for review
for i, (label, _) in enumerate(labeled_chunks):
    print(f"{i}. {label}")

# labeled_chunks always contains the "Preamble" entry, so it can never be
# empty; whether anything was detected has to be judged from agenda_chunks.
if not agenda_chunks:
    print("No agenda items were detected. Please verify that the correct PDF was provided.")


In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader

# Path to the minutes PDF to parse (hard-coded; edit it to parse another meeting)
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")

# Extract raw text from the PDF
def extract_text(path):
    """Read the PDF at ``path`` and return all page texts joined by newlines.

    A page for which no text is returned is treated as an empty string.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if not path.exists():
        message = f"PDF file not found: {path}\nPlease confirm the filename and ensure it's uploaded."
        raise FileNotFoundError(message)
    pages = PdfReader(str(path)).pages
    return "\n".join([page.extract_text() or "" for page in pages])

try:
    full_text = extract_text(PDF_PATH)
except FileNotFoundError as e:
    # Keep the cell runnable: report the problem and continue with no text.
    print(e)
    full_text = ""

# Define regex for splitting into chunks by agenda items.
# Zero-width lookahead anchored at a line start, so the text is cut exactly
# once, right before each heading. (An optional leading "\n" inside the
# lookahead would match both before and after the newline and leave a stray
# "\n" chunk between items, which then shows up as an empty "Unlabeled" entry.)
agenda_item_pattern = r"(?m)(?=^\d{3}\..+?\n\(Item \d+\))"  # matches e.g. '295. Chairman's Announcements\n(Item 5)'

# Preprocess to remove excessive newlines
cleaned_text = re.sub(r"\n{2,}", "\n", full_text)

# Split the text
chunks = re.split(agenda_item_pattern, cleaned_text) if cleaned_text else []

# First chunk is the preamble (attendance, apologies, declarations)
preamble = chunks[0] if chunks else ""

# Remaining chunks are the agenda items
agenda_chunks = chunks[1:] if len(chunks) > 1 else []

# Improved pattern to catch flexible item formatting
item_pattern = r"(\d{3}\.[^\n]+)\s*\(Item (\d+)\)"

def label_chunks(chunks):
    """Return ``(label, stripped_chunk)`` for every chunk.

    The label is "Item <n> – <heading>" when ``item_pattern`` is found in the
    chunk. Otherwise it is "Unlabeled – Preview: ..." followed by the first 80
    characters of the chunk's first line, or "[Empty]" for a blank chunk.
    """
    labelled = []
    for chunk in chunks:
        body = chunk.strip()
        found = re.search(item_pattern, chunk)
        if found:
            label = f"Item {found.group(2)} – {found.group(1).strip()}"
        elif body:
            label = f"Unlabeled – Preview: {body.splitlines()[0][:80]}"
        else:
            label = "Unlabeled – Preview: [Empty]"
        labelled.append((label, body))
    return labelled

labeled_chunks = [("Preamble", preamble.strip())] + label_chunks(agenda_chunks)

# Display the labels for review
for i, (label, _) in enumerate(labeled_chunks):
    print(f"{i}. {label}")

# labeled_chunks always contains the "Preamble" entry, so it can never be
# empty; whether anything was detected has to be judged from agenda_chunks.
if not agenda_chunks:
    print("No agenda items were detected. Please verify that the correct PDF was provided.")


In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader

# Path to the minutes PDF to parse (hard-coded; edit it to parse another meeting)
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")

# Extract raw text from the PDF
def extract_text(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    Pages for which the reader returns no text contribute an empty string.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if path.exists():
        pdf = PdfReader(str(path))
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(page_texts)
    raise FileNotFoundError(f"PDF file not found: {path}\nPlease confirm the filename and ensure it's uploaded.")

# Load the PDF text; a missing file is reported and treated as an empty document.
try:
    full_text = extract_text(PDF_PATH)
except FileNotFoundError as e:
    print(e)
    full_text = ""

# Keep only the content from "UNRESTRICTED ITEMS" onwards (whole text if absent)
start_marker = "UNRESTRICTED ITEMS"
start_index = full_text.find(start_marker)
if start_index == -1:
    body_text = full_text
else:
    body_text = full_text[start_index:]

# Collapse runs of blank lines into single newlines
body_text = re.sub(r"\n{2,}", "\n", body_text)

# Candidate section headers: a 1-3 digit number followed by '.' at line start
section_pattern = re.compile(r"^ *(\d{1,3})\.\s", re.MULTILINE)
header_matches = list(section_pattern.finditer(body_text))

# Keep a header only if its number exceeds every number kept so far
split_points = []
last_number = -1
for match in header_matches:
    number = int(match.group(1))
    if number <= last_number:
        continue
    split_points.append((match.start(), number))
    last_number = number
split_points.append((len(body_text), None))  # End boundary

# Cut the body between consecutive split points; blank slices are skipped
chunks = []
for (start, current_number), (end, _) in zip(split_points, split_points[1:]):
    chunk = body_text[start:end].strip()
    if not chunk:
        continue
    header_line = chunk.splitlines()[0].strip()
    chunks.append((f"Section – {header_line}", chunk))

# Display the labels for review
if not chunks:
    print("No valid sequential agenda sections detected.")
else:
    for i, (label, _) in enumerate(chunks):
        print(f"{i}. {label}")

# Preview lines 2-5 of the first six sections
for i, (label, content) in enumerate(chunks[:6]):
    print(f"{i}. {label}")
    lines = [line.strip() for line in content.splitlines() if line.strip()]
    if len(lines) > 1:
        preview = "\n".join(lines[1:5])
    else:
        preview = "[No additional content]"
    print(preview)
    print("\n---\n")


### Chunking and sub-chunking

In [None]:
import re
import json
from pathlib import Path
from PyPDF2 import PdfReader

# CONFIG
# Exactly one PDF_PATH / OUTPUT_DIR pair is active; the commented lines are
# alternative documents that were processed with the same cell.
#PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Minutes of Previous Meeting.pdf")
#PDF_PATH = Path("../data/council_documents/full_council/2024-12-19/originals/Printed minutes 19th-Dec-2024 10.00 County Council.pdf")
#OUTPUT_DIR = Path("../data/council_documents/full_council/2024-12-19/chunks/")
OUTPUT_DIR = Path("../data/council_documents/full_council/2025-03-13/chunks/")
# Subchunk JSON files are written here; the directory is created on cell run.
# NOTE(review): existing files in this directory are not cleared, so output from
# a previously processed PDF can remain alongside the new files.
SUBCHUNK_DIR = OUTPUT_DIR / "subchunks"
SUBCHUNK_DIR.mkdir(parents=True, exist_ok=True)

# Extract raw text from the PDF
def extract_text(path):
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    Pages for which the reader returns no text contribute an empty string.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if path.exists():
        pdf = PdfReader(str(path))
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(page_texts)
    raise FileNotFoundError(f"PDF file not found: {path}")

# Clean line breaks and honorifics (e.g. "CBE", "MBE", "OBE")
def clean_honorifics(text):
    """Flatten ``text`` to one line and strip comma-introduced honours.

    Newlines become spaces, every ", CBE" / ", MBE" / ", OBE" (any case, any
    spacing after the comma) is removed, and runs of two or more whitespace
    characters collapse to a single space. The result is stripped.
    """
    flattened = text.replace("\n", " ")
    without_honours = re.sub(r",\s*(CBE|MBE|OBE)\b", "", flattened, flags=re.IGNORECASE)
    return re.sub(r"\s{2,}", " ", without_honours).strip()

full_text = extract_text(PDF_PATH)

# Keep only the content from "UNRESTRICTED ITEMS" onwards (whole text if the
# marker is absent: find() returns -1, which is clamped to 0)
start_marker = "UNRESTRICTED ITEMS"
start_index = full_text.find(start_marker)
body_text = re.sub(r"\n{2,}", "\n", full_text[max(start_index, 0):])

# Candidate section headers: a 1-3 digit number followed by '.' at line start
section_pattern = re.compile(r"^ *(\d{1,3})\.\s", re.MULTILINE)
header_matches = list(section_pattern.finditer(body_text))

# Keep a header only if its number exceeds every number kept so far
split_points = []
last_number = -1
for match in header_matches:
    number = int(match.group(1))
    if number <= last_number:
        continue
    split_points.append((match.start(), number))
    last_number = number
split_points.append((len(body_text), None))

# Split into agenda item chunks and save meaningful subchunks

# Start of a numbered sub-item written as "1)" or "(1)" at the beginning of a
# line. Compiled once here because it does not depend on the section.
numbered_pattern = re.compile(r"(?=^\s*(?:\d{1,2}\)|\(\d{1,2}\))\s+)", re.MULTILINE)


def _write_subchunk(section_number, subchunk_index, title, text):
    """Write one subchunk record to SUBCHUNK_DIR as section_NNN_part_MM.json."""
    record = {
        "section_number": section_number,
        "subchunk_index": subchunk_index,
        "title": title,
        "text": text,
    }
    filename = f"section_{section_number:03d}_part_{subchunk_index:02d}.json"
    with open(SUBCHUNK_DIR / filename, "w", encoding="utf-8") as f:
        json.dump(record, f, indent=2, ensure_ascii=False)


for i in range(len(split_points) - 1):
    start, section_number = split_points[i]
    end, _ = split_points[i + 1]
    chunk_text = body_text[start:end].strip()
    lines = [line.strip() for line in chunk_text.splitlines() if line.strip()]
    title = lines[0] if lines else "Untitled"

    # Remove '(Item N)' if present
    chunk_text = re.sub(r"\(Item \d+\)", "", chunk_text)

    parts = numbered_pattern.split(chunk_text)

    # Avoid over-splitting: if only one part, keep as is
    if len(parts) <= 1:
        subchunks = [chunk_text.strip()]
    else:
        # parts[0] is the heading block (the chunk starts at the "N." header,
        # which can never match the sub-item pattern); drop it by position.
        # Testing startswith(str(section_number)) instead would also discard
        # genuine sub-items such as "2) ..." inside section 2.
        subchunks = [p.strip() for p in parts[1:] if p.strip()]

    for idx, sub in enumerate(subchunks):
        sub = clean_honorifics(sub)

        # Check for 'RESOLVED that' preceded by number
        match = re.search(r"(\d{1,3}\. RESOLVED that)", sub)
        if match:
            split_point = match.start()
            first_part = sub[:split_point].strip()
            second_part = sub[split_point:].strip()

            # Text before the resolution keeps the normal index
            _write_subchunk(section_number, idx, title, first_part)
            # The RESOLVED text is stored under idx + 100 to avoid a filename collision
            _write_subchunk(section_number, idx + 100, title + " [RESOLVED SPLIT]", second_part)
        else:
            # Save chunk as-is
            _write_subchunk(section_number, idx, title, sub)

print(f"Saved subchunks to {SUBCHUNK_DIR}")

### Classification

In [None]:

import json
import re
import pandas as pd
from pathlib import Path

# Load subchunks from disk.
# Sorted so rows come out in section/part order on every run; glob yields the
# files in whatever order the filesystem happens to return them.
subchunk_files = sorted(SUBCHUNK_DIR.glob("section_*_part_*.json"))

parsed_data = []

# Helper: identify ceremonial-style chunks
def classify_ceremonial(text):
    """Return True if ``text`` contains any ceremonial phrase.

    Matching is a case-insensitive substring test, so e.g. "award" also
    matches "awarded".
    """
    ceremonial_phrases = (
        "with great sadness",
        "death of", "sad passing", "tributes were made",
        "sense of loss", "heartfelt sympathy", "one-minute silence", "one minute silence",
        "warmest congratulations", "congratulated", "award", "winners of",
        "remembrance festival", "christmas campaign", "thanked all",
    )
    lowered = text.lower()
    for phrase in ceremonial_phrases:
        if phrase in lowered:
            return True
    return False
# Helper: identify apologies
def classify_apologies(text):
    """Return True if ``text`` mentions "apologies for absence" (any case)."""
    needle = "apologies for absence"
    return text.lower().find(needle) != -1

# Helper: identify declarations of interest
def classify_interests(text):
    """Return True if ``text`` says someone "declared a/an/any [pecuniary] interest"."""
    declaration = re.compile(r"declared (a|an|any) (pecuniary )?interest", re.IGNORECASE)
    return declaration.search(text) is not None

# Helper: approval of previous meeting minutes
def classify_mom_approvals(text):
    """Return True if ``text`` records approval or noting of minutes.

    True when the lower-cased text contains "resolved that the minutes", or
    when "minutes" is followed later on the same line by "approved" or "noted".
    """
    lowered = text.lower()
    if "resolved that the minutes" in lowered:
        return True
    return re.search(r"minutes.*(approved|noted)", lowered) is not None

# Honorific prefix used by the name patterns below. The group is non-capturing
# (so re.findall returns the whole "Mr Smith" match rather than just "Mr") and
# the word boundary covers every alternative, not only "Mr".
title_re = r"\b(?:Mr|Mrs|Ms)"
# Proposer and seconder are captured from one and the same sentence so they
# always belong to the same motion. Groups: 1-2 proposer, 3-4 seconder.
motion_re = re.compile(
    rf"({title_re})\s+(\w+)\s+proposed,?\s+and\s+({title_re})\s+(\w+)\s+seconded"
)
# A name list ends where the next "For (n)" / "Against (n)" / "Abstain (n)"
# header starts, or at the end of the text. Newlines cannot be used as the
# delimiter: the chunking step replaces them with spaces before saving.
vote_list_end = r"(?=\b(?:For|Against|Abstain) \(\d+\)|$)"

for path in subchunk_files:
    with open(path, "r", encoding="utf-8") as f:
        record = json.load(f)

    text = record["text"]
    lower_text = text.lower()
    content_type = []
    motion_text = None
    proposer = None
    seconder = None
    voting_result = None
    summary = None

    # RESOLVED clause
    if re.search(r"RESOLVED that", text, re.IGNORECASE):
        content_type.append("final_resolution")
        match = re.search(r'RESOLVED that(?: the Council)?(.*?)(\.|;|$)', text, re.IGNORECASE | re.DOTALL)
        if match:
            motion_text = match.group(1).strip()
            summary = f"Council resolved to {motion_text.lower()}."

    # Motion proposal pattern
    match = motion_re.search(text)
    if match:
        content_type.append("motion_proposal")
        proposer = f"{match.group(1)} {match.group(2)}"
        seconder = f"{match.group(3)} {match.group(4)}"
        quote_match = re.search(r'“(.*?)”', text, re.DOTALL)
        if quote_match:
            motion_text = quote_match.group(1).strip()
            summary = f"A motion was proposed and seconded: {motion_text[:100]}..."

    # Voting result (standard phrases)
    if "agreed unanimously" in lower_text or "motion carried" in lower_text:
        content_type.append("vote_record")
        unanimous = "unanimously" in lower_text
        voting_result = {"result": "passed", "method": "unanimous" if unanimous else "carried"}
        # The summary mirrors the detected method instead of always claiming unanimity
        summary = "The motion was passed unanimously." if unanimous else "The motion was carried."

    # Voting result (explicit breakdown)
    if re.search(r"voting\s+was\s+as\s+follows", text, re.IGNORECASE):
        content_type.append("vote_record")
        voting_result = {"result": "recorded", "method": "explicit"}
        for group in ["for", "against", "abstain"]:
            pattern = rf"\b{group.capitalize()} \((\d+)\):(.*?){vote_list_end}"
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                voting_result[group] = int(match.group(1))
                names = match.group(2).replace("\n", " ").strip()
                # Title plus one following word, e.g. "Mr Smith"
                voting_result[f"names_{group}"] = re.findall(rf"{title_re} \w+", names)
        summary = "Detailed vote breakdown recorded."

    # Extra classifications
    if classify_ceremonial(text):
        content_type.append("ceremonials")
    if classify_apologies(text):
        content_type.append("apologies")
    if classify_interests(text):
        content_type.append("interests")
    if classify_mom_approvals(text):
        content_type.append("mom_approvals")

    parsed_data.append({
        "filename": path.name,
        # sorted() de-duplicates via the set and gives a stable label order
        "content_type": sorted(set(content_type)) or ["unclassified"],
        "summary": summary,
        "text": text,
        "voting_result": voting_result,
        "proposer": proposer,
        "seconder": seconder,
        "motion_text": motion_text
    })

# Create DataFrame with a fixed column order. Passing `columns` to the
# constructor also yields an empty frame with these columns when no subchunks
# were parsed, where selecting them afterwards would raise KeyError.
df = pd.DataFrame(
    parsed_data,
    columns=["filename", "content_type", "summary", "voting_result", "proposer", "seconder", "motion_text", "text"],
)
df.head(5)


In [None]:
# Remove the column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [None]:
# Spot-check: show one randomly sampled row with all parsed columns.
df[["filename", "content_type", "summary", "voting_result", "proposer", "seconder", "motion_text", "text"]].sample(1)

### Testing

In [None]:
# Rows whose content_type list includes "motion_proposal"
is_motion = df["content_type"].apply(lambda x: "motion_proposal" in x)

# Columns relevant when reviewing motions
motion_columns = ["filename", "summary", "proposer", "seconder", "motion_text", "text"]
motion_df = df[is_motion][motion_columns]

# Show the first matching row
motion_df.head(1)

In [None]:
# Filter and sort the subchunks whose filename starts with 'section_291'.
# NOTE(review): the variable is still named section_307_df although the filter
# targets section 291 — presumably left over from inspecting another section.
section_307_df = df[df["filename"].str.startswith("section_291")].sort_values(by="filename")

# Display relevant columns
section_307_df[["filename", "content_type", "summary", "proposer", "seconder", "motion_text", "text"]]

In [None]:
import re

def extract_voting_record(text):
    """Parse a recorded vote ("voting was as follows ...") out of ``text``.

    Args:
        text: minutes text that may contain "For (n): ...", "Against (n): ..."
            and "Abstain (n): ..." lists with comma-separated names.

    Returns:
        ``None`` when the text does not contain "voting was as follows".
        Otherwise a dict with:
          - "result": "passed" if more votes for than against, "lost" if
            fewer, "tied" if equal, "unknown" if no group header was found;
          - "method": always "recorded";
          - "for" / "against" / "abstain": the declared counts (0 if absent);
          - "names_for" / "names_against" / "names_abstain": extracted names;
          - "confidence_warning" (only when a declared count differs from the
            number of names extracted).
    """
    if not re.search(r"voting\s+was\s+as\s+follows", text, re.IGNORECASE):
        return None

    # "result" is decided from the counts below; it is created first so the
    # key order of the returned dict stays result, method, then the groups.
    result = {"result": "unknown", "method": "recorded"}
    confidence_issues = []
    found_any_group = False

    # Normalize line breaks
    clean_text = re.sub(r'\s*\n\s*', ' ', text)

    for group in ("for", "against", "abstain"):
        # The names run up to the next group header or the end of the text
        pattern = rf"{group.capitalize()} \((\d+)\):?\s*(.*?)(?=(For \(|Against \(|Abstain \(|$))"
        match = re.search(pattern, clean_text, re.IGNORECASE)
        if not match:
            result[group] = 0
            result[f"names_{group}"] = []
            continue

        found_any_group = True
        declared_count = int(match.group(1))
        names_block = match.group(2).strip()
        names = [n.strip() for n in names_block.split(",") if n.strip()]
        result[group] = declared_count
        result[f"names_{group}"] = names

        if len(names) != declared_count:
            confidence_issues.append(
                f"{group}: declared {declared_count}, extracted {len(names)}"
            )

    # Derive the outcome from the declared counts rather than assuming the
    # motion passed; abstentions do not count either way.
    if found_any_group:
        if result["for"] > result["against"]:
            result["result"] = "passed"
        elif result["for"] < result["against"]:
            result["result"] = "lost"
        else:
            result["result"] = "tied"

    if confidence_issues:
        result["confidence_warning"] = "; ".join(confidence_issues)

    return result

In [None]:
# 1. Apply updated function: one parsed vote dict (or None) per row, stored in
#    a new "voting_result_dict" column next to the earlier "voting_result".
df["voting_result_dict"] = df["text"].apply(extract_voting_record)

# 2. Helper to extract safely
def safe_get(d, key, default=None):
    """Look up ``key`` in ``d``, returning ``default`` if ``d`` is not a dict or lacks the key."""
    if not isinstance(d, dict):
        return default
    return d.get(key, default)

# 3. Expand into individual columns: a count and a name list per vote group.
#    Rows without a parsed vote get 0 and a fresh empty list.
for group in ("for", "against", "abstain"):
    df[f"votes_{group}"] = df["voting_result_dict"].apply(lambda x: safe_get(x, group, 0))
    df[f"names_{group}"] = df["voting_result_dict"].apply(lambda x: safe_get(x, f"names_{group}", []))

df["votes_total"] = df["votes_for"] + df["votes_against"] + df["votes_abstain"]

# 4. Optional: flag discrepancies between declared counts and extracted names
df["confidence_warning"] = df["voting_result_dict"].apply(
    lambda x: safe_get(x, "confidence_warning", None)
)

In [None]:
# Filter rows where "vote_record" is one of the content types
vote_rows = df[df["content_type"].apply(lambda x: "vote_record" in x)]

#vote_rows.sample(1)

In [None]:
# Spot-check: show one randomly sampled row, including the expanded vote columns.
df.sample(1)

In [None]:
# Sort by filename
df_sorted = df.sort_values(by="filename")

# Save to CSV in the current user's Downloads folder. The location is resolved
# from the home directory instead of a machine-specific absolute path, so the
# cell also runs on other machines.
inspection_csv = Path.home() / "Downloads" / "subchunks_inspection.csv"
inspection_csv.parent.mkdir(parents=True, exist_ok=True)
# utf-8-sig writes a BOM at the start of the file
df_sorted.to_csv(inspection_csv, index=False, encoding="utf-8-sig")

In [None]:
# Keep rows labelled as a motion proposal, a final resolution, or unclassified;
# rows carrying none of these three labels are left out.
summary_chunks = df[df["content_type"].apply(lambda x: any(ct in x for ct in ["motion_proposal", "final_resolution", "unclassified"]))]

In [None]:
# Order the selected rows by filename, then join their text into one string
# with a blank line between subchunks.
summary_chunks = summary_chunks.sort_values(by="filename")
meeting_text = "\n\n".join(summary_chunks["text"])

In [None]:
# Print the assembled text for manual review.
print(meeting_text)

In [None]:
import re
import json
from pathlib import Path
from PyPDF2 import PdfReader

# CONFIG
# Exactly one PDF_PATH / OUTPUT_DIR pair is active; the commented lines are
# alternative documents that were processed with the same cell.
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")
#PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Minutes of Previous Meeting.pdf")
#PDF_PATH = Path("../data/council_documents/full_council/2024-12-19/originals/Printed minutes 19th-Dec-2024 10.00 County Council.pdf")
#OUTPUT_DIR = Path("../data/council_documents/full_council/2024-12-19/chunks/")
OUTPUT_DIR = Path("../data/council_documents/full_council/2025-03-13/chunks/")
# Subchunk JSON files are written here; the directory is created on cell run.
# NOTE(review): existing files in this directory are not cleared, so output from
# a previously processed PDF can remain alongside the new files.
SUBCHUNK_DIR = OUTPUT_DIR / "subchunks"
SUBCHUNK_DIR.mkdir(parents=True, exist_ok=True)

# Extract raw text from the PDF
def extract_text(path):
    """Read the PDF at ``path`` and return its page texts joined by newlines.

    A page with no extractable text is represented by an empty string.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    if path.exists():
        pdf = PdfReader(str(path))
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(page_texts)
    raise FileNotFoundError(f"PDF file not found: {path}")

# Clean line breaks and honorifics (e.g. "CBE", "MBE", "OBE")
def clean_honorifics(text):
    """Flatten ``text`` to one line and strip comma-introduced honours.

    Newlines become spaces, every ", CBE" / ", MBE" / ", OBE" (any case, any
    spacing after the comma) is removed, and runs of two or more whitespace
    characters collapse to a single space. The result is stripped.
    """
    flattened = text.replace("\n", " ")
    without_honours = re.sub(r",\s*(CBE|MBE|OBE)\b", "", flattened, flags=re.IGNORECASE)
    return re.sub(r"\s{2,}", " ", without_honours).strip()

full_text = extract_text(PDF_PATH)

# Keep only the content from "UNRESTRICTED ITEMS" onwards (whole text if the
# marker is absent: find() returns -1, which is clamped to 0)
start_marker = "UNRESTRICTED ITEMS"
start_index = full_text.find(start_marker)
body_text = re.sub(r"\n{2,}", "\n", full_text[max(start_index, 0):])

# Candidate section headers: a 1-3 digit number followed by '.' at line start
section_pattern = re.compile(r"^ *(\d{1,3})\.\s", re.MULTILINE)
header_matches = list(section_pattern.finditer(body_text))

# Keep a header only if its number exceeds every number kept so far
split_points = []
last_number = -1
for match in header_matches:
    number = int(match.group(1))
    if number <= last_number:
        continue
    split_points.append((match.start(), number))
    last_number = number
split_points.append((len(body_text), None))

# Split into agenda item chunks and save meaningful subchunks

# Start of a numbered sub-item written as "1)" or "(1)" at the beginning of a
# line. Compiled once here because it does not depend on the section.
numbered_pattern = re.compile(r"(?=^\s*(?:\d{1,2}\)|\(\d{1,2}\))\s+)", re.MULTILINE)


def _write_subchunk(section_number, subchunk_index, title, text):
    """Write one subchunk record to SUBCHUNK_DIR as section_NNN_part_MM.json."""
    record = {
        "section_number": section_number,
        "subchunk_index": subchunk_index,
        "title": title,
        "text": text,
    }
    filename = f"section_{section_number:03d}_part_{subchunk_index:02d}.json"
    with open(SUBCHUNK_DIR / filename, "w", encoding="utf-8") as f:
        json.dump(record, f, indent=2, ensure_ascii=False)


for i in range(len(split_points) - 1):
    start, section_number = split_points[i]
    end, _ = split_points[i + 1]
    chunk_text = body_text[start:end].strip()
    lines = [line.strip() for line in chunk_text.splitlines() if line.strip()]
    title = lines[0] if lines else "Untitled"

    # Remove '(Item N)' if present
    chunk_text = re.sub(r"\(Item \d+\)", "", chunk_text)

    parts = numbered_pattern.split(chunk_text)

    # Avoid over-splitting: if only one part, keep as is
    if len(parts) <= 1:
        subchunks = [chunk_text.strip()]
    else:
        # parts[0] is the heading block (the chunk starts at the "N." header,
        # which can never match the sub-item pattern); drop it by position.
        # Testing startswith(str(section_number)) instead would also discard
        # genuine sub-items such as "2) ..." inside section 2.
        subchunks = [p.strip() for p in parts[1:] if p.strip()]

    for idx, sub in enumerate(subchunks):
        sub = clean_honorifics(sub)

        # Check for 'RESOLVED that' preceded by number
        match = re.search(r"(\d{1,3}\. RESOLVED that)", sub)
        if match:
            split_point = match.start()
            first_part = sub[:split_point].strip()
            second_part = sub[split_point:].strip()

            # Text before the resolution keeps the normal index
            _write_subchunk(section_number, idx, title, first_part)
            # The RESOLVED text is stored under idx + 100 to avoid a filename collision
            _write_subchunk(section_number, idx + 100, title + " [RESOLVED SPLIT]", second_part)
        else:
            # Save chunk as-is
            _write_subchunk(section_number, idx, title, sub)

print(f"Saved subchunks to {SUBCHUNK_DIR}")


import json
import re
import pandas as pd
from pathlib import Path

# Load subchunks from disk.
# Sorted so rows come out in section/part order on every run; glob yields the
# files in whatever order the filesystem happens to return them.
subchunk_files = sorted(SUBCHUNK_DIR.glob("section_*_part_*.json"))

parsed_data = []

# Helper: identify ceremonial-style chunks
def classify_ceremonial(text):
    """Return True if ``text`` contains any ceremonial phrase.

    Matching is a case-insensitive substring test, so e.g. "award" also
    matches "awarded".
    """
    ceremonial_phrases = (
        "with great sadness",
        "death of", "sad passing", "tributes were made",
        "sense of loss", "heartfelt sympathy", "one-minute silence", "one minute silence",
        "warmest congratulations", "congratulated", "award", "winners of",
        "remembrance festival", "christmas campaign", "thanked all",
    )
    lowered = text.lower()
    for phrase in ceremonial_phrases:
        if phrase in lowered:
            return True
    return False
# Helper: identify apologies
def classify_apologies(text):
    """Return True if ``text`` mentions "apologies for absence" (any case)."""
    needle = "apologies for absence"
    return text.lower().find(needle) != -1

# Helper: identify declarations of interest
def classify_interests(text):
    """Return True if ``text`` says someone "declared a/an/any [pecuniary] interest"."""
    declaration = re.compile(r"declared (a|an|any) (pecuniary )?interest", re.IGNORECASE)
    return declaration.search(text) is not None

# Helper: approval of previous meeting minutes
def classify_mom_approvals(text):
    """Return True if ``text`` records approval or noting of minutes.

    True when the lower-cased text contains "resolved that the minutes", or
    when "minutes" is followed later on the same line by "approved" or "noted".
    """
    lowered = text.lower()
    if "resolved that the minutes" in lowered:
        return True
    return re.search(r"minutes.*(approved|noted)", lowered) is not None

# Honorific prefix used by the name patterns below. The group is non-capturing
# (so re.findall returns the whole "Mr Smith" match rather than just "Mr") and
# the word boundary covers every alternative, not only "Mr".
title_re = r"\b(?:Mr|Mrs|Ms)"
# Proposer and seconder are captured from one and the same sentence so they
# always belong to the same motion. Groups: 1-2 proposer, 3-4 seconder.
motion_re = re.compile(
    rf"({title_re})\s+(\w+)\s+proposed,?\s+and\s+({title_re})\s+(\w+)\s+seconded"
)
# A name list ends where the next "For (n)" / "Against (n)" / "Abstain (n)"
# header starts, or at the end of the text. Newlines cannot be used as the
# delimiter: the chunking step replaces them with spaces before saving.
vote_list_end = r"(?=\b(?:For|Against|Abstain) \(\d+\)|$)"

for path in subchunk_files:
    with open(path, "r", encoding="utf-8") as f:
        record = json.load(f)

    text = record["text"]
    lower_text = text.lower()
    content_type = []
    motion_text = None
    proposer = None
    seconder = None
    voting_result = None
    summary = None

    # RESOLVED clause
    if re.search(r"RESOLVED that", text, re.IGNORECASE):
        content_type.append("final_resolution")
        match = re.search(r'RESOLVED that(?: the Council)?(.*?)(\.|;|$)', text, re.IGNORECASE | re.DOTALL)
        if match:
            motion_text = match.group(1).strip()
            summary = f"Council resolved to {motion_text.lower()}."

    # Motion proposal pattern
    match = motion_re.search(text)
    if match:
        content_type.append("motion_proposal")
        proposer = f"{match.group(1)} {match.group(2)}"
        seconder = f"{match.group(3)} {match.group(4)}"
        quote_match = re.search(r'“(.*?)”', text, re.DOTALL)
        if quote_match:
            motion_text = quote_match.group(1).strip()
            summary = f"A motion was proposed and seconded: {motion_text[:100]}..."

    # Voting result (standard phrases)
    if "agreed unanimously" in lower_text or "motion carried" in lower_text:
        content_type.append("vote_record")
        unanimous = "unanimously" in lower_text
        voting_result = {"result": "passed", "method": "unanimous" if unanimous else "carried"}
        # The summary mirrors the detected method instead of always claiming unanimity
        summary = "The motion was passed unanimously." if unanimous else "The motion was carried."

    # Voting result (explicit breakdown)
    if re.search(r"voting\s+was\s+as\s+follows", text, re.IGNORECASE):
        content_type.append("vote_record")
        voting_result = {"result": "recorded", "method": "explicit"}
        for group in ["for", "against", "abstain"]:
            pattern = rf"\b{group.capitalize()} \((\d+)\):(.*?){vote_list_end}"
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                voting_result[group] = int(match.group(1))
                names = match.group(2).replace("\n", " ").strip()
                # Title plus one following word, e.g. "Mr Smith"
                voting_result[f"names_{group}"] = re.findall(rf"{title_re} \w+", names)
        summary = "Detailed vote breakdown recorded."

    # Extra classifications
    if classify_ceremonial(text):
        content_type.append("ceremonials")
    if classify_apologies(text):
        content_type.append("apologies")
    if classify_interests(text):
        content_type.append("interests")
    if classify_mom_approvals(text):
        content_type.append("mom_approvals")

    parsed_data.append({
        "filename": path.name,
        # sorted() de-duplicates via the set and gives a stable label order
        "content_type": sorted(set(content_type)) or ["unclassified"],
        "summary": summary,
        "text": text,
        "voting_result": voting_result,
        "proposer": proposer,
        "seconder": seconder,
        "motion_text": motion_text
    })

# Create DataFrame with a fixed column order. Passing `columns` to the
# constructor also yields an empty frame with these columns when no subchunks
# were parsed, where selecting them afterwards would raise KeyError.
df = pd.DataFrame(
    parsed_data,
    columns=["filename", "content_type", "summary", "voting_result", "proposer", "seconder", "motion_text", "text"],
)
# Preview only: df itself keeps every row (assigning .head(5) back to df would
# silently cut the data down to five rows for every later cell).
df.head(5)
