## MoM Full Council parsing

### Experimental code - initial stages

In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader

# Prompt user to confirm or provide correct file path
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")

# Extract raw text from the PDF
def extract_text(path):
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {path}\nPlease confirm the filename and ensure it's uploaded.")
    reader = PdfReader(str(path))
    return "\n".join(page.extract_text() or "" for page in reader.pages)

try:
    full_text = extract_text(PDF_PATH)
except FileNotFoundError as e:
    print(e)
    full_text = ""

# Define regex for splitting into chunks by agenda items
agenda_item_pattern = r"(?=\n?\d{3}\..+?\n\(Item \d+\))"  # matches e.g. '295. Chairman's Announcements\n(Item 5)'

# Preprocess to remove excessive newlines
cleaned_text = re.sub(r"\n{2,}", "\n", full_text)

# Split the text
chunks = re.split(agenda_item_pattern, cleaned_text) if cleaned_text else []

# First chunk is the preamble (attendance, apologies, declarations)
preamble = chunks[0] if chunks else ""

# Remaining chunks are the agenda items
agenda_chunks = chunks[1:] if len(chunks) > 1 else []

# Pair each chunk with its item number and title (e.g. 'Item 6 – Questions')
item_pattern = r"(\d{3}\..+?)\n\(Item (\d+)\)"

def label_chunks(chunks):
    labelled = []
    for chunk in chunks:
        match = re.search(item_pattern, chunk)
        if match:
            label = f"Item {match.group(2)} – {match.group(1).strip()}"
        else:
            label = "Unlabeled"
        labelled.append((label, chunk.strip()))
    return labelled

labeled_chunks = [("Preamble", preamble.strip())] + label_chunks(agenda_chunks)

# Display the labels for review
if labeled_chunks:
    for i, (label, _) in enumerate(labeled_chunks):
        print(f"{i}. {label}")
else:
    print("No agenda items were detected. Please verify that the correct PDF was provided.")


In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader

# Prompt user to confirm or provide correct file path
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")

# Extract raw text from the PDF
def extract_text(path):
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {path}\nPlease confirm the filename and ensure it's uploaded.")
    reader = PdfReader(str(path))
    return "\n".join(page.extract_text() or "" for page in reader.pages)

try:
    full_text = extract_text(PDF_PATH)
except FileNotFoundError as e:
    print(e)
    full_text = ""

# Define regex for splitting into chunks by agenda items
agenda_item_pattern = r"(?=\n?\d{3}\..+?\n\(Item \d+\))"  # matches e.g. '295. Chairman's Announcements\n(Item 5)'

# Preprocess to remove excessive newlines
cleaned_text = re.sub(r"\n{2,}", "\n", full_text)

# Split the text
chunks = re.split(agenda_item_pattern, cleaned_text) if cleaned_text else []

# First chunk is the preamble (attendance, apologies, declarations)
preamble = chunks[0] if chunks else ""

# Remaining chunks are the agenda items
agenda_chunks = chunks[1:] if len(chunks) > 1 else []

# Improved pattern to catch flexible item formatting
item_pattern = r"(\d{3}\.[^\n]+)\s*\(Item (\d+)\)"

def label_chunks(chunks):
    labelled = []
    for chunk in chunks:
        match = re.search(item_pattern, chunk)
        if match:
            label = f"Item {match.group(2)} – {match.group(1).strip()}"
        else:
            preview = chunk.strip().splitlines()[0][:80] if chunk.strip() else "[Empty]"
            label = f"Unlabeled – Preview: {preview}"
        labelled.append((label, chunk.strip()))
    return labelled

labeled_chunks = [("Preamble", preamble.strip())] + label_chunks(agenda_chunks)

# Display the labels for review
if labeled_chunks:
    for i, (label, _) in enumerate(labeled_chunks):
        print(f"{i}. {label}")
else:
    print("No agenda items were detected. Please verify that the correct PDF was provided.")


In [None]:
import re
from pathlib import Path
from PyPDF2 import PdfReader

# Prompt user to confirm or provide correct file path
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")

# Extract raw text from the PDF
def extract_text(path):
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {path}\nPlease confirm the filename and ensure it's uploaded.")
    reader = PdfReader(str(path))
    return "\n".join(page.extract_text() or "" for page in reader.pages)

try:
    full_text = extract_text(PDF_PATH)
except FileNotFoundError as e:
    print(e)
    full_text = ""

# Focus only on content after "UNRESTRICTED ITEMS"
start_marker = "UNRESTRICTED ITEMS"
start_index = full_text.find(start_marker)
body_text = full_text[start_index:] if start_index != -1 else full_text

# Normalize excessive newlines
body_text = re.sub(r"\n{2,}", "\n", body_text)

# Find all section headers with integer + '.' pattern
section_pattern = re.compile(r"^ *(\d{1,3})\.\s", re.MULTILINE)
header_matches = list(section_pattern.finditer(body_text))

# Dynamically identify only monotonic increasing section headers
split_points = []
last_number = -1
for match in header_matches:
    number = int(match.group(1))
    if number > last_number:
        split_points.append((match.start(), number))
        last_number = number
split_points.append((len(body_text), None))  # End boundary

# Slice the text into valid sequentially increasing chunks
chunks = []
for i in range(len(split_points) - 1):
    start, current_number = split_points[i]
    end, _ = split_points[i + 1]
    chunk = body_text[start:end].strip()
    if chunk:
        header_line = chunk.splitlines()[0].strip()
        label = f"Section – {header_line}"
        chunks.append((label, chunk))

# Display the labels for review
if chunks:
    for i, (label, _) in enumerate(chunks):
        print(f"{i}. {label}")
else:
    print("No valid sequential agenda sections detected.")

# Display first few lines from each section
for i, (label, content) in enumerate(chunks):
    print(f"{i}. {label}")
    lines = [line.strip() for line in content.splitlines() if line.strip()]
    preview = "\n".join(lines[1:5]) if len(lines) > 1 else "[No additional content]"
    print(preview)
    print("\n---\n")
    if i >= 5:
        break


### Chunking and sub-chunking

In [277]:
import re
import json
from pathlib import Path
from PyPDF2 import PdfReader

# CONFIG
#PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Minutes of Previous Meeting.pdf")
#PDF_PATH = Path("../data/council_documents/full_council/2024-12-19/originals/Printed minutes 19th-Dec-2024 10.00 County Council.pdf")
#OUTPUT_DIR = Path("../data/council_documents/full_council/2024-12-19/chunks/")
OUTPUT_DIR = Path("../data/council_documents/full_council/2025-03-13/chunks/")
SUBCHUNK_DIR = OUTPUT_DIR / "subchunks"
SUBCHUNK_DIR.mkdir(parents=True, exist_ok=True)

# Extract raw text from the PDF
def extract_text(path):
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {path}")
    reader = PdfReader(str(path))
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Clean line breaks and honorifics (e.g. "CBE", "MBE", "OBE")
def clean_honorifics(text):
    # 1. Remove line breaks
    text = text.replace("\n", " ")

    # 2. Remove ", CBE", ", MBE", ", OBE" (and variations with extra spaces)
    text = re.sub(r",\s*(CBE|MBE|OBE)\b", "", text, flags=re.IGNORECASE)

    # 3. Clean extra spaces
    text = re.sub(r"\s{2,}", " ", text).strip()

    return text

full_text = extract_text(PDF_PATH)

# Focus only on content after "UNRESTRICTED ITEMS"
start_marker = "UNRESTRICTED ITEMS"
start_index = full_text.find(start_marker)
body_text = full_text[start_index:] if start_index != -1 else full_text
body_text = re.sub(r"\n{2,}", "\n", body_text)

# Identify section start points
section_pattern = re.compile(r"^ *(\d{1,3})\.\s", re.MULTILINE)
header_matches = list(section_pattern.finditer(body_text))

split_points = []
last_number = -1
for match in header_matches:
    number = int(match.group(1))
    if number > last_number:
        split_points.append((match.start(), number))
        last_number = number
split_points.append((len(body_text), None))

# Split into agenda item chunks and save meaningful subchunks
for i in range(len(split_points) - 1):
    start, section_number = split_points[i]
    end, _ = split_points[i + 1]
    chunk_text = body_text[start:end].strip()
    lines = [line.strip() for line in chunk_text.splitlines() if line.strip()]
    title = lines[0] if lines else "Untitled"

    # Remove '(Item N)' if present
    chunk_text = re.sub(r"\(Item \d+\)", "", chunk_text)

    # Match both "1)" and "(1)"
    numbered_pattern = re.compile(r"(?=^\s*(?:\d{1,2}\)|\(\d{1,2}\))\s+)", re.MULTILINE)
    parts = numbered_pattern.split(chunk_text)

    # Avoid over-splitting: if only one part, keep as is
    if len(parts) <= 1:
        subchunks = [chunk_text.strip()]
    else:
        subchunks = [p.strip() for p in parts if p.strip() and not p.strip().startswith(str(section_number))]

    for idx, sub in enumerate(subchunks):
        sub = clean_honorifics(sub)

        # Check for 'RESOLVED that' preceded by number
        match = re.search(r"(\d{1,3}\. RESOLVED that)", sub)
        if match:
            split_point = match.start()
            first_part = sub[:split_point].strip()
            second_part = sub[split_point:].strip()

            # Save original (pre-RESOLVED) chunk
            data1 = {
                "section_number": section_number,
                "subchunk_index": idx,
                "title": title,
                "text": first_part
            }
            filename1 = f"section_{section_number:03d}_part_{idx:02d}.json"
            with open(SUBCHUNK_DIR / filename1, "w", encoding="utf-8") as f:
                json.dump(data1, f, indent=2, ensure_ascii=False)

            # Save RESOLVED chunk separately
            data2 = {
                "section_number": section_number,
                "subchunk_index": idx + 100,  # avoid collision
                "title": title + " [RESOLVED SPLIT]",
                "text": second_part
            }
            filename2 = f"section_{section_number:03d}_part_{idx+100:02d}.json"
            with open(SUBCHUNK_DIR / filename2, "w", encoding="utf-8") as f:
                json.dump(data2, f, indent=2, ensure_ascii=False)

        else:
            # Save chunk as-is
            data = {
                "section_number": section_number,
                "subchunk_index": idx,
                "title": title,
                "text": sub
            }
            filename = f"section_{section_number:03d}_part_{idx:02d}.json"
            with open(SUBCHUNK_DIR / filename, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

print(f"Saved subchunks to {SUBCHUNK_DIR}")

Saved subchunks to ../data/council_documents/full_council/2025-03-13/chunks/subchunks


### Classification

In [278]:

import json
import re
import pandas as pd
from pathlib import Path

# Load subchunks from disk
subchunk_files = list(SUBCHUNK_DIR.glob("section_*_part_*.json"))

parsed_data = []

# Helper: identify ceremonial-style chunks
def classify_ceremonial(text):
    lower_text = text.lower()
    return any(phrase in lower_text for phrase in [
        "with great sadness",
        "death of", "sad passing", "tributes were made",
        "sense of loss", "heartfelt sympathy", "one-minute silence", "one minute silence",
        "warmest congratulations", "congratulated", "award", "winners of",
        "remembrance festival", "christmas campaign", "thanked all"
    ])
# Helper: identify apologies
def classify_apologies(text):
    return "apologies for absence" in text.lower()

# Helper: identify declarations of interest
def classify_interests(text):
    return bool(re.search(r"declared (a|an|any) (pecuniary )?interest", text, re.IGNORECASE))

# Helper: approval of previous meeting minutes
def classify_mom_approvals(text):
    return (
        "resolved that the minutes" in text.lower()
        or bool(re.search(r"minutes.*(approved|noted)", text.lower()))
    )

for path in subchunk_files:
    with open(path, "r", encoding="utf-8") as f:
        record = json.load(f)

    text = record["text"]
    content_type = []
    motion_text = None
    proposer = None
    seconder = None
    voting_result = None
    summary = None

    # RESOLVED clause
    if re.search(r"RESOLVED that", text, re.IGNORECASE):
        content_type.append("final_resolution")
        match = re.search(r'RESOLVED that(?: the Council)?(.*?)(\.|;|$)', text, re.IGNORECASE | re.DOTALL)
        if match:
            motion_text = match.group(1).strip()
            summary = f"Council resolved to {motion_text.lower()}."

    # Motion proposal pattern
    match = re.search(r'(\bMr|Mrs|Ms)\s+\w+\s+proposed,?\s+and\s+(\bMr|Mrs|Ms)\s+\w+\s+seconded', text)
    if match:
        content_type.append("motion_proposal")
        proposer_match = re.search(r'(\bMr|Mrs|Ms)\s+\w+\s+proposed', text)
        seconder_match = re.search(r'and\s+(\bMr|Mrs|Ms)\s+(\w+)\s+seconded', text)
        if proposer_match:
            proposer = proposer_match.group(0).replace("proposed", "").strip()
        if seconder_match:
            seconder = f"{seconder_match.group(1)} {seconder_match.group(2)}"
        quote_match = re.search(r'“(.*?)”', text, re.DOTALL)
        if quote_match:
            motion_text = quote_match.group(1).strip()
            summary = f"A motion was proposed and seconded: {motion_text[:100]}..."

    # Voting result (standard phrases)
    if "agreed unanimously" in text.lower() or "motion carried" in text.lower():
        content_type.append("vote_record")
        voting_result = {"result": "passed", "method": "unanimous" if "unanimously" in text.lower() else "carried"}
        summary = "The motion was passed unanimously."

    # Voting result (explicit breakdown)
    if re.search(r"voting\s+was\s+as\s+follows", text, re.IGNORECASE):
        content_type.append("vote_record")
        voting_result = {"result": "recorded", "method": "explicit"}
        for group in ["for", "against", "abstain"]:
            pattern = rf"{group.capitalize()} \((\d+)\):(.*?)(\n|$)"
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                voting_result[group] = int(match.group(1))
                names = match.group(2).replace("\n", " ").strip()
                voting_result[f"names_{group}"] = re.findall(r"(Mr|Mrs|Ms) \w+", names)
        summary = "Detailed vote breakdown recorded."

    # Extra classifications
    if classify_ceremonial(text):
        content_type.append("ceremonials")
    if classify_apologies(text):
        content_type.append("apologies")
    if classify_interests(text):
        content_type.append("interests")
    if classify_mom_approvals(text):
        content_type.append("mom_approvals")

    parsed_data.append({
        "filename": path.name,
        "content_type": list(set(content_type)) or ["unclassified"],
        "summary": summary,
        "text": text,
        "voting_result": voting_result,
        "proposer": proposer,
        "seconder": seconder,
        "motion_text": motion_text
    })

# Create DataFrame
df = pd.DataFrame(parsed_data)
df = df[["filename", "content_type", "summary", "voting_result", "proposer", "seconder", "motion_text", "text"]]
df.head(5)


Unnamed: 0,filename,content_type,summary,voting_result,proposer,seconder,motion_text,text
0,section_291_part_03.json,[unclassified],,,,,,"(4) Mr Oakford proposed and Mr Gough Seconded the following motion: “County Council, having given due regard to the s25 Report (published for consideration as agenda item 5 of this meeting), is asked to agree the following: 2025-35 Capital Programme (a) The 10-year Capital programme and investment proposals of £1,419m over the years from 2025-26 to 2034-35 together with the necessary funding and subject to approval to spend arrangements. (b) The directorate capital programmes as set out in appendices A & B of the final draft budget report published on 5th February 2025. 2025-26 Revenue Budget and Medium Term Financial Plan (c) The net revenue budget requirement of £1,530.9m for 2025-26. (d) The directorate revenue budget proposals for 2025-26 and the medium term financial plan as set out in appendices D (high level county 3 year plan) E (high level 2025-26 plan by directorate), appendix F (individual spending, savings, income and reserves variations for 2025-26) and G (2025-26 key services) of the final draft budget report published on 5th February 2025. 2025-26 Council Tax (e) To increase Council Tax band rates up to the maximum permitted without a referendum as set out in section 6.4 (table 6.1) of the final draft report published on 5th February 2025. (f) The total Council Tax requirement of £994,287,655 to be raised through precepts on districts as set out in section 6.6 (table 6.2) in the final draft report published on 5th February 2025. Kent Pay Scheme 2025-26 (g) Note the transition of all Kent Scheme staff to the new pay structure agreed by County Council in May 2024 (h) Agree to the recommendations on the uplift to Kent Pay Scheme as set out in section 7.9 of the final draft budget published on 5th February 2025 Key Policies and Strategies (i) Fees and charges to continue to be reviewed in line with the policy agreed in the 2023-24 budget approval (j) The Capital Strategy as set out in appendix O of the final draft report published on 5th February 2025 including the Prudential Indicators. (k) The Treasury Management Strategy as set out in appendix M of the final draft report published on 5th February 2025 (l) The Minimum Revenue Provision (MRP) Statement as set out in appendix P of the final draft report published on 5th February 2025 (m) The Flexible Use of Capital Receipts Strategy as set out in appendix Q of the final draft report published on 5th February 2025. (n) The Reserves Policy as set out in appendix H of the final draft budget report published on 5th February 2025. In addition: (o) To delegate authority to the Chief Executive, in consultation with the Leader of the Council and the Cabinet Member for Finance, Corporate and Traded Services, to agree any other non-pay related changes to the Kent Scheme through the conclusion of pay bargaining. (p) To delegate authority to the Corporate Director of Finance (after consultation with the Leader, the Deputy Leader and Cabinet Member for Finance, Corporate & Traded Services and the political Group Leaders) to resolve any minor technical issues and structural changes for the final budget publication which do not materially alter the approved budget or change the net budget requirement and for any changes made to be reflected in the final version of the Budget Book (blue combed) due to be published in March 2025. (q) To note the information on the impact of the County Council’s share of retained business rates, business rate pool and business rate collection fund balances on the revenue budget will be reported to Cabinet once it has all been received. (r) To note the ongoing and escalating cost pressures on the Council’s budget alongside insufficient funding in the local government finance settlement and knock on requirement for savings and income in the final draft 2025-26 budget and medium term financial plan. (s) To note potential changes to local authority funding system from 2026- 27 onwards and consequential uncertain financial outlook for later years until a multi-year settlement from government is reintroduced. (t) To note that fundamental changes to social care are unlikely until after Baroness Casey enquiry concludes. (u) To note that the planned use of reserves still ensures sufficient reserves are available in the short term with no immediate concerns triggering a S114 notice provided the use of these reserves is replaced with sustainable savings over the medium term. (v) To note the rate of recent drawdown from reserves and potential drawdown to balance 2024-25 outturn is still cause for serious concern and reserves will still need to be maintained ahead of changes under Devolution White Paper. Further unplanned drawdowns would weaken resilience and should only be considered as a last resort with an agreed strategy to replenish reserves at earliest opportunity.”"
1,section_291_part_15.json,[vote_record],Detailed vote breakdown recorded.,"{'result': 'recorded', 'method': 'explicit'}",,,,"(16) Following the debate, the Chairman put to the vote the amendment set out in paragraph 15 above and the voting was as follows: For (9) Mr Baldock, Mr Campkin, Mr Chittenden, Mr Hood, Mr Hook, Mr Lehmann, Mr Lewis, Mr Passmore, Mr Stepto Against (41) Mr Baker, Mr Bartlett, Mr Beaney, Mrs Bell, Mrs Binks, Mr Bond, Mr Booth, Mr Brazier, Miss Carey, Mrs Chandler, Mr Chard, Mr Cole, Mr Cooke, Mr Dance, Mr Dendor, Mrs Game, Mr Gough, Ms Hamilton, Mr Hill, Mr Hills, Mrs Hohler, Mr Holden, Mr Jeffrey, Mr Kennedy, Mr Love, Mr Marsh, Mr Meade, Mr Murphy, Mr Oakford, Mr Ozog, Mrs Prendergast, Mr Rayner, Mr Richardson, Mr Ridgers, Mr Robey, Mr Ross, Mr Sandhu, Mr Thomas, Mr Watkins, Mr Wright, Ms Wright Abstain (8) Mr Brady, Sir Paul Carter, Ms Constantine, Mr Cooper, Ms Dawkins, Ms Meade, Mr Shonk, Mr Whiting Amendment lost."
2,section_287_part_01.json,[interests],,,,,,(2) Ms Meade declared an interest that she was a carer.
3,section_290_part_02.json,[vote_record],Detailed vote breakdown recorded.,"{'result': 'recorded', 'method': 'explicit'}",,,,"(3) The Chairman put the motion set out in paragraph 2 to the vote and the voting was as follows: For (57) Mr Baker, Mr Baldock, Mr Bartlett, Mr Beaney, Mrs Bell, Mrs Binks, Mr Bond, Mr Booth, Mr Brady, Mr Brazier, Mr Campkin, Miss Carey, Sir Paul Carter, Mrs Chandler, Mr Chard, Mr Chittenden, Mr Cole, Mr Cooper, Ms Constantine, Mr Cooke, Mr Dance, Ms Dawkins, Mr Dendor, Mrs Game, Mr Gough, Ms Grehan, Ms Hamilton, Mr Hill, Mrs Hohler, Mr Holden, Mr Hood, Mr Hook, Mr Jeffrey, Mr Kennedy, Mr Lehmann, Mr Love, Mr Marsh, Mr Meade, Ms Meade, Mr Murphy, Mr Oakford, Mr Ozog, Mr Passmore, Mrs Prendergast, Mr Rayner, Mr Ridgers, Mr Robey, Mr Sandhu, Mr Shonk, Mr Stepto, Mr Streatfeild, Mr Thomas, Mr Watkins, Mr Webb, Mr Whiting, Mr Wright, Ms Wright Against (0) Abstain (2) Mr B Lewis, Mr D Ross Motion carried."
4,section_291_part_19.json,[vote_record],Detailed vote breakdown recorded.,"{'result': 'recorded', 'method': 'explicit'}",,,,"(20) Following the debate, the Chairman put to the vote the amendment set out in paragraph 19 above and the voting was as follows: For (13) Mr Brady, Mr Campkin, Mr Chittenden, Ms Constantine, Ms Dawkins, Mr Hood, Mr Hook, Mr Lehmann, Mr Lewis, Ms Meade, Mr Passmore, Mr Stepto, Mr Streatfeild Against (39) Mr Baker, Mr Bartlett, Mr Beaney, Mrs Bell, Mrs Binks, Mr Bond, Mr Booth, Mr Brazier, Miss Carey, Mrs Chandler, Mr Chard, Mr Cole, Mr Cooke, Mr Dance, Mr Dendor, Mrs Game, Mr Gough, Ms Hamilton, Mr Hill, Mr Hills, Mrs Hohler, Mr Holden, Mr Jeffrey, Mr Kennedy, Mr Love, Mr Marsh, Mr Meade, Mr Murphy, Mr Oakford, Mr Ozog, Mr Rayner, Mr Richardson, Mr Ridgers, Mr Robey, Mr Sandhu, Mr Thomas, Mr Watkins, Mr Wright, Ms Wright Abstain (3) Mr Ross, Mr Shonk, Mr Whiting Amendment lost. Adult Social Care and Health Directorate"


In [279]:
pd.set_option('display.max_colwidth', None)

In [280]:
df[["filename", "content_type", "summary", "voting_result", "proposer", "seconder", "motion_text", "text"]].sample(1)

Unnamed: 0,filename,content_type,summary,voting_result,proposer,seconder,motion_text,text
16,section_291_part_13.json,[unclassified],,,,,,"(14) Following the general debate, the Chairman called for directorate specific amendments."


### Testing

In [281]:
# Filter to only motion_proposals
motion_df = df[df["content_type"].apply(lambda x: "motion_proposal" in x)]

# Show relevant columns
motion_df = motion_df[["filename", "summary", "proposer", "seconder", "motion_text", "text"]]

# Display the first few rows
motion_df.head(1)

Unnamed: 0,filename,summary,proposer,seconder,motion_text,text
6,section_291_part_23.json,A motion was proposed and seconded: Reverse the proposed £1.75m cut to the grant paid to districts for Council Tax Reduction Scheme (CTR...,Mr Lehmann,Mr Hood,Reverse the proposed £1.75m cut to the grant paid to districts for Council Tax Reduction Scheme (CTRS) support for the 2025-26 financial year (using money earmarked for general reserves) to allow for a thorough investigation of the underlying causes of the estimated £5million gap in council tax base between the KCC estimate and the estimates reported by districts for 2025-26.,"(24) Mr Lehmann proposed, and Mr Hood seconded the following amendment: Proposed Purpose: “Reverse the proposed £1.75m cut to the grant paid to districts for Council Tax Reduction Scheme (CTRS) support for the 2025-26 financial year (using money earmarked for general reserves) to allow for a thorough investigation of the underlying causes of the estimated £5million gap in council tax base between the KCC estimate and the estimates reported by districts for 2025-26.” Proposed Amount: “£1,750k” Proposed Funding Source: “Reduce Contribution to General Reserves”"


In [282]:
# Filter and sort subchunks that start with 'section_307'
section_307_df = df[df["filename"].str.startswith("section_291")].sort_values(by="filename")

# Display relevant columns
section_307_df[["filename", "content_type", "summary", "proposer", "seconder", "motion_text", "text"]]

Unnamed: 0,filename,content_type,summary,proposer,seconder,motion_text,text
34,section_291_part_00.json,[unclassified],,,,,"(1) The Chairman reminded Members that any Member of a Local Authority who was liable to pay Council Tax, and who had any unpaid Council Tax amount overdue for at least two months, even if there was an arrangement to pay off the arrears, must declare the fact that they are in arrears and must not cast their vote on anything related to KCC’s Budget or Council Tax."
29,section_291_part_01.json,[unclassified],,,,,"(2) The Chairman drew Members’ attention to the Section 25 Assurance Statement, as considered under the previous item, reminding them of the agreement by Council to give it due regard while considering the Budget."
12,section_291_part_02.json,[unclassified],,,,,(3) The Chairman explained that a document setting out the procedure for the meeting and the proposed amendments to the draft budget were circulated to Members prior to the meeting.
0,section_291_part_03.json,[unclassified],,,,,"(4) Mr Oakford proposed and Mr Gough Seconded the following motion: “County Council, having given due regard to the s25 Report (published for consideration as agenda item 5 of this meeting), is asked to agree the following: 2025-35 Capital Programme (a) The 10-year Capital programme and investment proposals of £1,419m over the years from 2025-26 to 2034-35 together with the necessary funding and subject to approval to spend arrangements. (b) The directorate capital programmes as set out in appendices A & B of the final draft budget report published on 5th February 2025. 2025-26 Revenue Budget and Medium Term Financial Plan (c) The net revenue budget requirement of £1,530.9m for 2025-26. (d) The directorate revenue budget proposals for 2025-26 and the medium term financial plan as set out in appendices D (high level county 3 year plan) E (high level 2025-26 plan by directorate), appendix F (individual spending, savings, income and reserves variations for 2025-26) and G (2025-26 key services) of the final draft budget report published on 5th February 2025. 2025-26 Council Tax (e) To increase Council Tax band rates up to the maximum permitted without a referendum as set out in section 6.4 (table 6.1) of the final draft report published on 5th February 2025. (f) The total Council Tax requirement of £994,287,655 to be raised through precepts on districts as set out in section 6.6 (table 6.2) in the final draft report published on 5th February 2025. Kent Pay Scheme 2025-26 (g) Note the transition of all Kent Scheme staff to the new pay structure agreed by County Council in May 2024 (h) Agree to the recommendations on the uplift to Kent Pay Scheme as set out in section 7.9 of the final draft budget published on 5th February 2025 Key Policies and Strategies (i) Fees and charges to continue to be reviewed in line with the policy agreed in the 2023-24 budget approval (j) The Capital Strategy as set out in appendix O of the final draft report published on 5th February 2025 including the Prudential Indicators. (k) The Treasury Management Strategy as set out in appendix M of the final draft report published on 5th February 2025 (l) The Minimum Revenue Provision (MRP) Statement as set out in appendix P of the final draft report published on 5th February 2025 (m) The Flexible Use of Capital Receipts Strategy as set out in appendix Q of the final draft report published on 5th February 2025. (n) The Reserves Policy as set out in appendix H of the final draft budget report published on 5th February 2025. In addition: (o) To delegate authority to the Chief Executive, in consultation with the Leader of the Council and the Cabinet Member for Finance, Corporate and Traded Services, to agree any other non-pay related changes to the Kent Scheme through the conclusion of pay bargaining. (p) To delegate authority to the Corporate Director of Finance (after consultation with the Leader, the Deputy Leader and Cabinet Member for Finance, Corporate & Traded Services and the political Group Leaders) to resolve any minor technical issues and structural changes for the final budget publication which do not materially alter the approved budget or change the net budget requirement and for any changes made to be reflected in the final version of the Budget Book (blue combed) due to be published in March 2025. (q) To note the information on the impact of the County Council’s share of retained business rates, business rate pool and business rate collection fund balances on the revenue budget will be reported to Cabinet once it has all been received. (r) To note the ongoing and escalating cost pressures on the Council’s budget alongside insufficient funding in the local government finance settlement and knock on requirement for savings and income in the final draft 2025-26 budget and medium term financial plan. (s) To note potential changes to local authority funding system from 2026- 27 onwards and consequential uncertain financial outlook for later years until a multi-year settlement from government is reintroduced. (t) To note that fundamental changes to social care are unlikely until after Baroness Casey enquiry concludes. (u) To note that the planned use of reserves still ensures sufficient reserves are available in the short term with no immediate concerns triggering a S114 notice provided the use of these reserves is replaced with sustainable savings over the medium term. (v) To note the rate of recent drawdown from reserves and potential drawdown to balance 2024-25 outturn is still cause for serious concern and reserves will still need to be maintained ahead of changes under Devolution White Paper. Further unplanned drawdowns would weaken resilience and should only be considered as a last resort with an agreed strategy to replenish reserves at earliest opportunity.”"
18,section_291_part_04.json,[unclassified],,,,,"(5) Mr Brady (Leader of the Opposition), Mr Hook (Leader of the Liberal Democrat Group) and Mr Lehmann (Leader of the Green & Independent Group) gave their responses to the recommendations."
15,section_291_part_05.json,[unclassified],,,,,"(6) Following a general debate, the Chairman called for cross-directorate amendments."
24,section_291_part_06.json,[motion_proposal],A motion was proposed and seconded: To amend the budget proposed by the Administration in line with the Labour Group’s Alternative Budge...,Mr Brady,Ms Meade,To amend the budget proposed by the Administration in line with the Labour Group’s Alternative Budget in accordance with sections 8.10 – 14 of the Constitution.,"(7) Mr Brady proposed, and Ms Meade seconded the following amendment: Proposed Purpose: “To amend the budget proposed by the Administration in line with the Labour Group’s Alternative Budget in accordance with sections 8.10 – 14 of the Constitution.” Proposed Amount: “See revised budget appendices D and F. Please also refer to the Labour Group’s alternative budget covering report, which includes the revised recommendation as proposed by the Labour Group.” Proposed Funding Source: “See revised budget appendices D and F. Please also refer to the Labour Group’s alternative budget covering report, which includes the revised recommendation as proposed by the Labour Group.” Post meeting note – A covering report along with the revised budget appendices D and F can be accessed here."
23,section_291_part_07.json,[vote_record],Detailed vote breakdown recorded.,,,,"(8) Following the debate, the Chairman put to the vote the amendment set out in paragraph 7 above and the voting was as follows: For (12) Mr Baldock, Mr Brady, Mr Campkin, Ms Constantine, Ms Dawkins, Ms Grehan, Mr Hood, Mr Lehmann, Mr Lewis, Ms Meade, Mr Stepto, Dr Sullivan Against (42) Mr Baker, Mr Bartlett, Mr Beaney, Mrs Bell, Mrs Binks, Mr Bond, Mr Booth, Mr Brazier, Miss Carey, Sir Paul Carter, Mrs Chandler, Mr Chard, Mr Cole, Mr Cooper, Mr Cooke, Mr Dance, Mr Dendor, Mrs Game, Mr Gough, Ms Hamilton, Mr Hill, Mr Hills, Mrs Hohler, Mr Holden, Mr Jeffrey, Mr Kennedy, Mr Love, Mr Marsh, Mr Meade, Mr Murphy, Mr Oakford, Mr Ozog, Mr Rayner, Mr Richardson, Mr Ridgers, Mr Robey, Mr Sandhu, Mr Thomas, Mr Watkins, Mr Webb, Mr Wright, Ms Wright Abstain (7) Mr Chittenden, Mr Hook, Mr Passmore, Mr Ross, Mr Shonk, Mr Streatfeild, Mr Whiting Amendment lost. Children, Young People and Education Directorate"
20,section_291_part_08.json,[unclassified],,,,,(9) The Cabinet Member for Integrated Children’s Services and the Cabinet Member for Education and Skills introduced the budget for this directorate prior to general debate and the taking of directorate specific amendments.
13,section_291_part_09.json,[unclassified],,,,,"(10) Following the general debate, the Chairman called for directorate specific amendments."


In [283]:
import re

def extract_voting_record(text):
    result = {}
    confidence_issues = []

    if re.search(r"voting\s+was\s+as\s+follows", text, re.IGNORECASE):
        result["result"] = "passed"
        result["method"] = "recorded"

        # Normalize line breaks
        clean_text = re.sub(r'\s*\n\s*', ' ', text)

        for group in ["for", "against", "abstain"]:
            pattern = rf"{group.capitalize()} \((\d+)\):?\s*(.*?)(?=(For \(|Against \(|Abstain \(|$))"
            match = re.search(pattern, clean_text, re.IGNORECASE)
            if match:
                declared_count = int(match.group(1))
                names_block = match.group(2).strip()
                names = [n.strip() for n in names_block.split(",") if n.strip()]
                result[group] = declared_count
                result[f"names_{group}"] = names

                if len(names) != declared_count:
                    confidence_issues.append(
                        f"{group}: declared {declared_count}, extracted {len(names)}"
                    )
            else:
                result[group] = 0
                result[f"names_{group}"] = []

        if confidence_issues:
            result["confidence_warning"] = "; ".join(confidence_issues)

    return result if result else None

In [284]:
# 1. Apply updated function
df["voting_result_dict"] = df["text"].apply(extract_voting_record)

# 2. Helper to extract safely
def safe_get(d, key, default=None):
    return d.get(key, default) if isinstance(d, dict) else default

# 3. Expand into individual columns
df["votes_for"] = df["voting_result_dict"].apply(lambda x: safe_get(x, "for", 0))
df["names_for"] = df["voting_result_dict"].apply(lambda x: safe_get(x, "names_for", []))

df["votes_against"] = df["voting_result_dict"].apply(lambda x: safe_get(x, "against", 0))
df["names_against"] = df["voting_result_dict"].apply(lambda x: safe_get(x, "names_against", []))

df["votes_abstain"] = df["voting_result_dict"].apply(lambda x: safe_get(x, "abstain", 0))
df["names_abstain"] = df["voting_result_dict"].apply(lambda x: safe_get(x, "names_abstain", []))

df["votes_total"] = df["votes_for"] + df["votes_against"] + df["votes_abstain"]

# 4. Optional: flag discrepancies
df["confidence_warning"] = df["voting_result_dict"].apply(lambda x: safe_get(x, "confidence_warning", None))

In [285]:
# Filter rows where "vote_record" is one of the content types
vote_rows = df[df["content_type"].apply(lambda x: "vote_record" in x)]

#vote_rows.sample(1)

In [286]:
df.sample(1)

Unnamed: 0,filename,content_type,summary,voting_result,proposer,seconder,motion_text,text,voting_result_dict,votes_for,names_for,votes_against,names_against,votes_abstain,names_abstain,votes_total,confidence_warning
8,section_291_part_18.json,[motion_proposal],"A motion was proposed and seconded: Fly tipping in Kent was reported to have increased in Kent and Medway by 10% with 27,000 incidents r...",,Mr Passmore,Mr Chittenden,"Fly tipping in Kent was reported to have increased in Kent and Medway by 10% with 27,000 incidents reported in 2022- 23 there is no reason to believe that it is not continuing to rise due to the increasing costs of commercial waste disposal. For Districts clearing up the mess is difficult, time consuming and expensive stretching our budgets further and with very low levels of prosecution there is no effective deterrent. It is proposed provide additional funds to Districts including but not limited to strategically placed movable ANPR Cameras located in the vicinity of fly tipping hot spots which will aid the work of District staff who are working to reduce this disgusting treatment of our environment.","(19) Mr Passmore proposed, and Mr Chittenden seconded the following amendment: Proposed Purpose: “Fly tipping in Kent was reported to have increased in Kent and Medway by 10% with 27,000 incidents reported in 2022- 23 there is no reason to believe that it is not continuing to rise due to the increasing costs of commercial waste disposal. For Districts clearing up the mess is difficult, time consuming and expensive stretching our budgets further and with very low levels of prosecution there is no effective deterrent. It is proposed provide additional funds to Districts including but not limited to strategically placed movable ANPR Cameras located in the vicinity of fly tipping hot spots which will aid the work of District staff who are working to reduce this disgusting treatment of our environment.” Proposed Amount: “£1m” Proposed Funding Source: “Reduce the £12m waste reserve to £11m and use this un[1]ringfenced EPR income to fund these ANPR Cameras.”",,0,[],0,[],0,[],0,


In [287]:
# Sort by filename
df_sorted = df.sort_values(by="filename")

# Save to CSV
df_sorted.to_csv("/Users/lgfolder/Downloads/subchunks_inspection.csv", index=False, encoding="utf-8-sig")

In [288]:
summary_chunks = df[df["content_type"].apply(lambda x: any(ct in x for ct in ["motion_proposal", "final_resolution", "unclassified"]))]

In [289]:
summary_chunks = summary_chunks.sort_values(by="filename")
meeting_text = "\n\n".join(summary_chunks["text"])

In [290]:
print(meeting_text)

(1) The General Counsel advised that Members’ Register of Interests detailed their main declarations and it was not necessary to declare any of those at the meeting. However, Members may wish to declare any specific interest in relation to the agenda.

288. Minutes of the meetings held on 19 December 2024 and 9 January 2025 and, if in order, to be approved as a correct record RESOLVED that the minutes of the meetings held on 19 December 2024 and 9 January 2025 be approved as a correct record.

(1) Mr John Betts, Interim Corporate Director Finance, as the Section 151 Officer, provided an overview of the Section 25 Assurance Statement. He said there were considerable risks facing the authority in delivering a balanced budget due to a number of factors including the current economic environment, the single year finance settlement for local government, the scale of savings required, and the growing demands on core statutory services. He said all the budget resolutions had undergone extensi

In [None]:
import re
import json
from pathlib import Path
from PyPDF2 import PdfReader

# CONFIG
PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf")
#PDF_PATH = Path("../data/council_documents/full_council/2025-03-13/originals/Minutes of Previous Meeting.pdf")
#PDF_PATH = Path("../data/council_documents/full_council/2024-12-19/originals/Printed minutes 19th-Dec-2024 10.00 County Council.pdf")
#OUTPUT_DIR = Path("../data/council_documents/full_council/2024-12-19/chunks/")
OUTPUT_DIR = Path("../data/council_documents/full_council/2025-03-13/chunks/")
SUBCHUNK_DIR = OUTPUT_DIR / "subchunks"
SUBCHUNK_DIR.mkdir(parents=True, exist_ok=True)

# Extract raw text from the PDF
def extract_text(path):
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {path}")
    reader = PdfReader(str(path))
    return "\n".join(page.extract_text() or "" for page in reader.pages)

# Clean line breaks and honorifics (e.g. "CBE", "MBE", "OBE")
def clean_honorifics(text):
    # 1. Remove line breaks
    text = text.replace("\n", " ")

    # 2. Remove ", CBE", ", MBE", ", OBE" (and variations with extra spaces)
    text = re.sub(r",\s*(CBE|MBE|OBE)\b", "", text, flags=re.IGNORECASE)

    # 3. Clean extra spaces
    text = re.sub(r"\s{2,}", " ", text).strip()

    return text

full_text = extract_text(PDF_PATH)

# Focus only on content after "UNRESTRICTED ITEMS"
start_marker = "UNRESTRICTED ITEMS"
start_index = full_text.find(start_marker)
body_text = full_text[start_index:] if start_index != -1 else full_text
body_text = re.sub(r"\n{2,}", "\n", body_text)

# Identify section start points
section_pattern = re.compile(r"^ *(\d{1,3})\.\s", re.MULTILINE)
header_matches = list(section_pattern.finditer(body_text))

split_points = []
last_number = -1
for match in header_matches:
    number = int(match.group(1))
    if number > last_number:
        split_points.append((match.start(), number))
        last_number = number
split_points.append((len(body_text), None))

# Split into agenda item chunks and save meaningful subchunks
for i in range(len(split_points) - 1):
    start, section_number = split_points[i]
    end, _ = split_points[i + 1]
    chunk_text = body_text[start:end].strip()
    lines = [line.strip() for line in chunk_text.splitlines() if line.strip()]
    title = lines[0] if lines else "Untitled"

    # Remove '(Item N)' if present
    chunk_text = re.sub(r"\(Item \d+\)", "", chunk_text)

    # Match both "1)" and "(1)"
    numbered_pattern = re.compile(r"(?=^\s*(?:\d{1,2}\)|\(\d{1,2}\))\s+)", re.MULTILINE)
    parts = numbered_pattern.split(chunk_text)

    # Avoid over-splitting: if only one part, keep as is
    if len(parts) <= 1:
        subchunks = [chunk_text.strip()]
    else:
        subchunks = [p.strip() for p in parts if p.strip() and not p.strip().startswith(str(section_number))]

    for idx, sub in enumerate(subchunks):
        sub = clean_honorifics(sub)

        # Check for 'RESOLVED that' preceded by number
        match = re.search(r"(\d{1,3}\. RESOLVED that)", sub)
        if match:
            split_point = match.start()
            first_part = sub[:split_point].strip()
            second_part = sub[split_point:].strip()

            # Save original (pre-RESOLVED) chunk
            data1 = {
                "section_number": section_number,
                "subchunk_index": idx,
                "title": title,
                "text": first_part
            }
            filename1 = f"section_{section_number:03d}_part_{idx:02d}.json"
            with open(SUBCHUNK_DIR / filename1, "w", encoding="utf-8") as f:
                json.dump(data1, f, indent=2, ensure_ascii=False)

            # Save RESOLVED chunk separately
            data2 = {
                "section_number": section_number,
                "subchunk_index": idx + 100,  # avoid collision
                "title": title + " [RESOLVED SPLIT]",
                "text": second_part
            }
            filename2 = f"section_{section_number:03d}_part_{idx+100:02d}.json"
            with open(SUBCHUNK_DIR / filename2, "w", encoding="utf-8") as f:
                json.dump(data2, f, indent=2, ensure_ascii=False)

        else:
            # Save chunk as-is
            data = {
                "section_number": section_number,
                "subchunk_index": idx,
                "title": title,
                "text": sub
            }
            filename = f"section_{section_number:03d}_part_{idx:02d}.json"
            with open(SUBCHUNK_DIR / filename, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

print(f"Saved subchunks to {SUBCHUNK_DIR}")


import json
import re
import pandas as pd
from pathlib import Path

# Load subchunks from disk
subchunk_files = list(SUBCHUNK_DIR.glob("section_*_part_*.json"))

parsed_data = []

# Helper: identify ceremonial-style chunks
def classify_ceremonial(text):
    lower_text = text.lower()
    return any(phrase in lower_text for phrase in [
        "with great sadness",
        "death of", "sad passing", "tributes were made",
        "sense of loss", "heartfelt sympathy", "one-minute silence", "one minute silence",
        "warmest congratulations", "congratulated", "award", "winners of",
        "remembrance festival", "christmas campaign", "thanked all"
    ])
# Helper: identify apologies
def classify_apologies(text):
    return "apologies for absence" in text.lower()

# Helper: identify declarations of interest
def classify_interests(text):
    return bool(re.search(r"declared (a|an|any) (pecuniary )?interest", text, re.IGNORECASE))

# Helper: approval of previous meeting minutes
def classify_mom_approvals(text):
    return (
        "resolved that the minutes" in text.lower()
        or bool(re.search(r"minutes.*(approved|noted)", text.lower()))
    )

for path in subchunk_files:
    with open(path, "r", encoding="utf-8") as f:
        record = json.load(f)

    text = record["text"]
    content_type = []
    motion_text = None
    proposer = None
    seconder = None
    voting_result = None
    summary = None

    # RESOLVED clause
    if re.search(r"RESOLVED that", text, re.IGNORECASE):
        content_type.append("final_resolution")
        match = re.search(r'RESOLVED that(?: the Council)?(.*?)(\.|;|$)', text, re.IGNORECASE | re.DOTALL)
        if match:
            motion_text = match.group(1).strip()
            summary = f"Council resolved to {motion_text.lower()}."

    # Motion proposal pattern
    match = re.search(r'(\bMr|Mrs|Ms)\s+\w+\s+proposed,?\s+and\s+(\bMr|Mrs|Ms)\s+\w+\s+seconded', text)
    if match:
        content_type.append("motion_proposal")
        proposer_match = re.search(r'(\bMr|Mrs|Ms)\s+\w+\s+proposed', text)
        seconder_match = re.search(r'and\s+(\bMr|Mrs|Ms)\s+(\w+)\s+seconded', text)
        if proposer_match:
            proposer = proposer_match.group(0).replace("proposed", "").strip()
        if seconder_match:
            seconder = f"{seconder_match.group(1)} {seconder_match.group(2)}"
        quote_match = re.search(r'“(.*?)”', text, re.DOTALL)
        if quote_match:
            motion_text = quote_match.group(1).strip()
            summary = f"A motion was proposed and seconded: {motion_text[:100]}..."

    # Voting result (standard phrases)
    if "agreed unanimously" in text.lower() or "motion carried" in text.lower():
        content_type.append("vote_record")
        voting_result = {"result": "passed", "method": "unanimous" if "unanimously" in text.lower() else "carried"}
        summary = "The motion was passed unanimously."

    # Voting result (explicit breakdown)
    if re.search(r"voting\s+was\s+as\s+follows", text, re.IGNORECASE):
        content_type.append("vote_record")
        voting_result = {"result": "recorded", "method": "explicit"}
        for group in ["for", "against", "abstain"]:
            pattern = rf"{group.capitalize()} \((\d+)\):(.*?)(\n|$)"
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if match:
                voting_result[group] = int(match.group(1))
                names = match.group(2).replace("\n", " ").strip()
                voting_result[f"names_{group}"] = re.findall(r"(Mr|Mrs|Ms) \w+", names)
        summary = "Detailed vote breakdown recorded."

    # Extra classifications
    if classify_ceremonial(text):
        content_type.append("ceremonials")
    if classify_apologies(text):
        content_type.append("apologies")
    if classify_interests(text):
        content_type.append("interests")
    if classify_mom_approvals(text):
        content_type.append("mom_approvals")

    parsed_data.append({
        "filename": path.name,
        "content_type": list(set(content_type)) or ["unclassified"],
        "summary": summary,
        "text": text,
        "voting_result": voting_result,
        "proposer": proposer,
        "seconder": seconder,
        "motion_text": motion_text
    })

# Create DataFrame
df = pd.DataFrame(parsed_data)
df = df[["filename", "content_type", "summary", "voting_result", "proposer", "seconder", "motion_text", "text"]].head(5)
df
