In [28]:
import os
import re
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.corpus import stopwords
import PyPDF2
import nltk

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# PDF Extraction Function
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

# Enhanced Preprocessing Function
def preprocess_text(text):
    # Remove page markers
    text = re.sub(r'=+ Page \d+ =+\n', '', text)
    # Remove line numbers
    text = re.sub(r'^\d+\n', '', text, flags=re.MULTILINE)
    # Normalize bullet points
    text = re.sub(r'(\n\s*)[•\-](\s+)', r'\1• ', text)
    # Collapse whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Remove headers/footers
    text = re.sub(r'^.*borough council.*$', '', text, flags=re.IGNORECASE|re.MULTILINE)
    return text.strip()

# Improved Chunk Cleaning
def clean_chunks(chunks):
    cleaned = []
    for chunk in chunks:
        # Split bullet lists while preserving context
        if chunk.count('•') >= 2:
            header = re.search(r'^(.*?)(?=\n•)', chunk)
            header = header.group(1) if header else ""
            bullet_points = [f"{header}\n• {point.strip()}" 
                           for point in re.split(r'\n•', chunk) 
                           if point.strip()]
            cleaned.extend(bullet_points)
        else:
            cleaned.append(chunk)
    return cleaned

# Regourp chunks after fragmenting them, based on meaning
def regroup_bullet_chunks(chunks):
    regrouped = []
    current_topic = ""
    current_chunk = ""
    
    for chunk in chunks:
        # Detect topic headers (e.g., "WORK PROGRAMME")
        if len(chunk.split()) < 10 and chunk.isupper():
            if current_chunk:
                regrouped.append(current_chunk.strip())
            current_topic = chunk
            current_chunk = f"{chunk}\n\n"
        # Group related bullet points
        elif chunk.startswith("•"):
            if len(current_chunk + chunk) > 150:  # Word limit
                regrouped.append(current_chunk.strip())
                current_chunk = f"{current_topic}\n\n{chunk}"
            else:
                current_chunk += f"\n{chunk}"
        else:
            current_chunk += f"\n{chunk}"
    
    if current_chunk:
        regrouped.append(current_chunk.strip())
    return regrouped


# Add this new function above analyze_minutes()
def regroup_bullet_chunks(chunks):
    regrouped = []
    current_chunk = ""
    current_wordcount = 0
    
    for chunk in chunks:
        chunk_wordcount = len(chunk.split())
        
        # Case 1: Topic header (all caps, short)
        if chunk.isupper() and chunk_wordcount < 8:
            if current_chunk:  # Finalize previous chunk
                regrouped.append(current_chunk.strip())
            current_chunk = f"{chunk}\n\n"  # Start new section
            current_wordcount = 0
        
        # Case 2: Bullet point or continuation
        else:
            if current_wordcount + chunk_wordcount > 150:  # Hard limit
                regrouped.append(current_chunk.strip())
                current_chunk = f"{chunk}\n"
                current_wordcount = chunk_wordcount
            else:
                current_chunk += f"{chunk}\n"
                current_wordcount += chunk_wordcount
    
    if current_chunk:  # Add the last chunk
        regrouped.append(current_chunk.strip())
    
    return [c for c in regrouped if len(c.split()) >= 15]  # Filter tiny chunks


def postprocess_chunks(chunks):
    """Split resolved items and numbered agenda sections"""
    refined = []
    for chunk in chunks:
        # Split RESOLVED sections
        if "RESOLVED" in chunk:
            parts = re.split(r'(RESOLVED[^\n]+\n)', chunk)
            refined.extend([p.strip() for p in parts if p.strip()])
        # Split numbered agenda items
        elif re.search(r'\n\d+\.\s+[A-Z]', chunk):
            parts = re.split(r'(\n\d+\.\s+[A-Z].+?\n)', chunk)
            refined.extend([p.strip() for p in parts if p.strip()])
        else:
            refined.append(chunk)
    return refined


def analyze_minutes(pdf_path):
    # Extract and preprocess
    raw_text = extract_text_from_pdf(pdf_path)
    processed_text = preprocess_text(raw_text)
    
    # Initialize TextTiling
    tt = TextTilingTokenizer(
        w=7,
        k=5,
        stopwords=stopwords.words('english') + [
            'council', 'committee', 'meeting', 
            'members', 'member', 'officer',
            'present', 'apologies', 'resolution',
            'agenda', 'minute', 'chair'
        ]
    )
    
    print(f"\n{'='*40}\nAnalyzing: {os.path.basename(pdf_path)}\n{'='*40}")
    
    try:
        # 1. First-pass segmentation
        raw_segments = tt.tokenize(processed_text)
        
        # 2. Clean and split bullet points
        final_chunks = clean_chunks(raw_segments)
        
        # 3. NEW: Optimize chunk grouping
        optimized_chunks = regroup_bullet_chunks(final_chunks)
        
        # 4. Output results (skip tiny orphans)
        for i, chunk in enumerate(optimized_chunks, 1):
            word_count = len(chunk.split())
            if word_count >= 15:  # Only show meaningful chunks
                print(f"\n--- Segment {i} ({word_count} words) ---")
                print(chunk.strip())
                
        # Optional: Print stats
        print(f"\nOriginal segments: {len(final_chunks)}")
        print(f"Optimized chunks: {len([c for c in optimized_chunks if len(c.split()) >= 15])}")
        
    except Exception as e:
        print(f"Processing error: {str(e)}")

# Example Usage
if __name__ == "__main__":
    pdf_paths = [
        "/Users/lgfolder/Downloads/Printed minutes 28012025 1000 Health Overview and Scrutiny Committee.pdf",
#        "/Users/lgfolder/Downloads/Printed minutes 17062024 1830 Overview and Scrutiny Committee.pdf",
#        "/Users/lgfolder/Downloads/data scrape full 1 page only/cype/2024-11-21/originals/Minutes 24092024 Childrens Young People and Education Cabinet Committee.pdf"
    ]
    
    for path in pdf_paths:
        if os.path.exists(path):
            analyze_minutes(path)
        else:
            print(f"File not found: {path}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lgfolder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lgfolder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Analyzing: Printed minutes 28012025 1000 Health Overview and Scrutiny Committee.pdf

--- Segment 1 (123 words) ---
1 KENT COUNTY COUNCIL 
 
 
HEALTH OVERVIEW AND SCRUTINY COMMITTEE 
 
MINUTES of a meeting of the Health Overview and Scrutiny Committee held in the 
Council Chamber, Sessions House, County Hall, Maidstone on Tuesday, 28 January 
2025.
 
 
PRESENT: Mr P Bartlett (Chair), Mr P V Barrington-King, Sir Paul Carter, CBE, 
Ms S Hamilton (Vice-Chairman), Mr A Kennedy, Mr J Meade, Ms L Wright, 
Mr S R Campkin, Ms K Constantine, Cllr H Keen, Cllr S Jeffery, Cllr J Kite, MBE, 
Mr T Bond (Substitute for Ms L Parfitt) and Mrs P T Cole (Substitute for Mr P Cole)
 
 
ALSO PRESENT: Mr R Goatham (Healthwatch Kent) and Dr C Rickard (Local 
Medical Committee) 
 
PRESENT VIRTUALLY: Mr R Streatfield MBE, Mr N Chard and Cllr K Moses

--- Segment 2 (127 words) ---
IN ATTENDANCE: Mrs K Goldsmith (Research Officer - Overview and Scrutiny), 
Tracey Fletcher (Chief Executive, EKHUFT), Angela van de

## PDF Document Chunking Processor

This code processes PDF meeting minutes into structured chunks optimized for embedding:

### Key Features:
- **Hierarchical Section Detection**  
  - Identifies main sections (200+ numbering) and subsections
  - Always treats section "0" as a main section

- **Smart Chunking**  
  - Splits large sections (>100 words) into optimal chunks (~300 words)
  - Preserves semantic coherence using:
    - Numbering patterns (1., a), • etc.)
    - Paragraph boundaries
    - Sentence boundaries

- **Intelligent Numbering**  
  - Main sections: `205`, `206`  
  - Subsections: `1.1`, `1.2`  
  - Chunks: `2.1.1`, `2.1.2` (when nested)

- **Output Structure**  
  ```python
  {
    'display_number': '1.2',
    'section_title': 'Declarations',
    'text': '...', 
    'is_main_section': False,
    'word_count': 45,
    'parent_section': 'Declarations of Interest'
  }

In [99]:
import pdfplumber
import re
import pandas as pd
import warnings
from nltk.tokenize import sent_tokenize
import nltk

# Download required NLTK data
nltk.download('punkt', quiet=True)

# Suppress PDF parsing warnings
warnings.filterwarnings("ignore", message="CropBox missing from /Page")

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF while handling errors."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            return "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def detect_numbering_pattern(text, min_count=3):
    """Detect the most prevalent numbering pattern in text."""
    patterns = {
        'digit_dot': r'(?<=\n)\d+\.\s+',          # 1. 
        'digit_paren': r'(?<=\n)\d+\)\s+',        # 1)
        'letter_paren': r'(?<=\n)[a-z]\)\s+',     # a)
        'bullet': r'(?<=\n)[•▪♦]\s+',             # • 
        'hyphen': r'(?<=\n)-\s+',                 # -
    }
    
    pattern_counts = {}
    for name, pattern in patterns.items():
        try:
            count = len(re.findall(pattern, text))
            if count >= min_count:
                pattern_counts[name] = (pattern, count)
        except Exception:
            continue
    
    return max(pattern_counts.values(), key=lambda x: x[1])[0] if pattern_counts else None

def split_by_numbering(text, pattern):
    """Split text using detected numbering pattern."""
    if not pattern or not text.strip():
        return [text]
    
    try:
        # Split but keep the delimiters
        parts = re.split(f'({pattern})', text)
        if len(parts) == 1:
            return [text]
        
        # Recombine the pattern with its text
        chunks = []
        for i in range(1, len(parts), 2):
            chunk = parts[i] + parts[i+1]
            chunks.append(chunk.strip())
        
        return chunks
    except Exception:
        return [text]

def split_into_paragraphs(text):
    """Split text into paragraphs while preserving structure."""
    if not text:
        return []
    
    # Split by double newlines first
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    
    # If that doesn't work well, try single newlines
    if len(paragraphs) <= 1:
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    
    return paragraphs

def create_optimal_chunks(text, max_words=300):
    """Create optimally sized chunks while preserving structure."""
    if not text or len(text.split()) <= max_words:
        return [text]
    
    # First try splitting by numbering
    pattern = detect_numbering_pattern(text)
    if pattern:
        numbered_chunks = split_by_numbering(text, pattern)
        if len(numbered_chunks) > 1:
            return numbered_chunks
    
    # Then try splitting by paragraphs
    paragraphs = split_into_paragraphs(text)
    if len(paragraphs) > 1:
        chunks = []
        current_chunk = []
        current_word_count = 0
        
        for para in paragraphs:
            word_count = len(para.split())
            
            if current_word_count + word_count > max_words and current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_word_count = 0
                
            current_chunk.append(para)
            current_word_count += word_count
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        if len(chunks) > 1:
            return chunks
    
    # Finally fall back to sentence splitting
    try:
        sentences = sent_tokenize(text)
    except Exception:
        sentences = [text]
    
    chunks = []
    current_chunk = []
    current_word_count = 0
    
    for sentence in sentences:
        word_count = len(sentence.split())
        
        if current_word_count + word_count > max_words and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_word_count = 0
            
        current_chunk.append(sentence)
        current_word_count += word_count
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

def split_minutes_flexible(text):
    """Split document into main sections and sub-chunks."""
    if not text:
        return []
    
    # First pass: split by 1-3 digit numbers followed by dot and space
    pattern = r"(?<=\n)(\d{1,3})\.\s+(.+)"
    matches = list(re.finditer(pattern, text))
    
    sections = []
    
    if matches:
        sections.append({
            "agenda_number": "0",
            "title": "Preliminary Information",
            "text": text[:matches[0].start()].strip(),
            "is_main_section": False
        })
    
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_text = text[start:end].strip()
        sections.append({
            "agenda_number": match.group(1),
            "title": match.group(2).strip(),
            "text": section_text,
            "is_main_section": True  # Will be updated in second pass
        })
    
    # Second pass: identify main sections by sequential numbering
    if len(sections) > 1:
        try:
            # Find the first numeric agenda number that's >= 200
            first_main_idx = next(i for i, s in enumerate(sections) 
                               if s['agenda_number'].isdigit() and int(s['agenda_number']) >= 200)
            first_main_num = int(sections[first_main_idx]['agenda_number'])
            
            current_expected = first_main_num
            for section in sections[first_main_idx:]:
                if section['agenda_number'].isdigit():
                    num = int(section['agenda_number'])
                    if num == current_expected:
                        section['is_main_section'] = True
                        current_expected += 1
                    else:
                        section['is_main_section'] = False
                else:
                    section['is_main_section'] = False
        except StopIteration:
            # No main sections found, mark all as not main sections
            for section in sections:
                section['is_main_section'] = False
    
    return sections

def process_minutes(pdf_path):
    """Full processing pipeline with hierarchical numbering."""
    try:
        text = extract_text_from_pdf(pdf_path)
        if not text:
            return []
        
        sections = split_minutes_flexible(text)
        
        # Force section 0 to be main section
        if sections and sections[0]['agenda_number'] == '0':
            sections[0]['is_main_section'] = True
        
        for section in sections:
            # Apply hierarchical numbering to chunks
            if section['is_main_section'] or len(section['text'].split()) > 100:
                chunks = create_optimal_chunks(section['text'])
                # Number chunks as 1.1, 1.2, etc. for subsections
                if not section['is_main_section'] and '.' not in section['agenda_number']:
                    section['chunks'] = [
                        {
                            'chunk_number': f"{section['agenda_number']}.{i+1}",
                            'text': chunk,
                            'word_count': len(chunk.split())
                        }
                        for i, chunk in enumerate(chunks)
                    ]
                else:
                    # For main sections or already numbered subsections
                    section['chunks'] = [
                        {
                            'chunk_number': str(i+1),
                            'text': chunk,
                            'word_count': len(chunk.split())
                        }
                        for i, chunk in enumerate(chunks)
                    ]
            else:
                section['chunks'] = [{
                    'chunk_number': section['agenda_number'],
                    'text': section['text'],
                    'word_count': len(section['text'].split())
                }]
            
            section['chunk_count'] = len(section['chunks'])
            section['avg_chunk_words'] = (
                sum(c['word_count'] for c in section['chunks']) / section['chunk_count'] 
                if section['chunk_count'] > 0 else 0
            )
        
        return sections
    except Exception as e:
        print(f"Error processing minutes: {e}")
        return []

# Modified DataFrame creation for better display
def create_final_dataframe(processed_sections):
    """Create DataFrame with proper hierarchical numbering that reflects parent-child relationships"""
    if not processed_sections:
        return pd.DataFrame()
    
    rows = []
    current_main_section = None
    
    for section in processed_sections:
        # Track the current main section
        if section['is_main_section']:
            current_main_section = section['agenda_number']
            
            # Handle main section chunks
            if len(section['chunks']) == 1:
                # Single chunk gets [agenda_num].0
                display_num = f"{current_main_section}.0"
                rows.append(create_row(section, display_num, current_main_section, section['chunks'][0]))
            else:
                # Multiple chunks get [agenda_num].1, [agenda_num].2, etc.
                for i, chunk in enumerate(section['chunks'], 1):
                    display_num = f"{current_main_section}.{i}"
                    rows.append(create_row(section, display_num, current_main_section, chunk))
        else:
            # Handle subsections - they belong to the current main section
            if current_main_section is None:
                current_main_section = '0'  # Fallback to preliminary section
            
            for chunk in section['chunks']:
                if '.' in chunk['chunk_number']:
                    # Already has sub-numbering (like 1.1)
                    sub_num = chunk['chunk_number']
                    display_num = f"{current_main_section}.{sub_num}"
                else:
                    # Simple numbered subsection (like 1)
                    display_num = f"{current_main_section}.{chunk['chunk_number']}"
                
                rows.append(create_row(section, display_num, current_main_section, chunk))
    
    df = pd.DataFrame(rows)
    
    # Clean up numbering for single-chunk main sections
    df['display_number'] = df.apply(
        lambda x: x['display_number'].replace('.0', '') 
        if x['is_main_section'] and x['display_number'].endswith('.0') and 
           len(df[df['agenda_item'] == x['agenda_item']]) == 1
        else x['display_number'],
        axis=1
    )
    
    # Select and order columns for embeddings
    embedding_columns = [
        'chunk_id', 'display_number', 'agenda_item', 'section_title', 
        'text', 'is_main_section', 'word_count', 'parent_section',
        'source_document'
    ]
    
    # Only include columns that exist
    available_columns = [col for col in embedding_columns if col in df.columns]
    
    return df[available_columns]


def find_parent_agenda(section, all_sections):
    """Find the parent agenda number for a subsection"""
    # For sections after preliminary (0)
    if section['agenda_number'] == '0':
        return '0'
    
    # Find the last main section before this one
    for s in reversed(all_sections[:all_sections.index(section)]):
        if s['is_main_section']:
            return s['agenda_number']
    return section['agenda_number']  # fallback

def create_row(section, display_num, agenda_num, chunk):
    """Helper to create consistent row data"""
    return {
        'display_number': display_num,
        'agenda_item': agenda_num,
        'section_title': section['title'],
        'text': chunk['text'],
        'is_main_section': section['is_main_section'],
        'word_count': chunk['word_count'],
        'parent_section': section['title']
    }


# Example output would now look like:
# display_number  agenda_item  section_title  ...  
# 0           258.0          258  Main Section    ...
# 1         258.1.1          258  Subsection      ...  
# 2         258.1.2          258  Subsection      ...
# 3           258.2          258  Subsection      ...
# 4           258.3          258  Subsection      ...

# Example usage
#pdf_path = "/Users/lgfolder/Downloads/Public minutes 06032025 1000 Growth Economic Development and Communities Cabinet Committee.pdf"
#pdf_path = "/Users/lgfolder/Downloads/Printed minutes 25022025 1000 Environment Transport Cabinet Committee.pdf"
pdf_path = "/Users/lgfolder/Downloads/Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf"
#pdf_path = "/Users/lgfolder/Downloads/Printed minutes 28012025 1000 Health Overview and Scrutiny Committee.pdf"

processed_sections = process_minutes(pdf_path)
final_df = create_final_dataframe(processed_sections)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


In [100]:
final_df.head(30)

Unnamed: 0,display_number,agenda_item,section_title,text,is_main_section,word_count,parent_section
0,0.0,0,Preliminary Information,KENT COUNTY COUNCIL\n_________________________...,True,259,Preliminary Information
1,0.20,0,Apologies and Substitutes,20. Apologies and Substitutes\n(Item 2)\nApolo...,False,52,Apologies and Substitutes
2,0.21,0,Declarations of Interest,21. Declarations of Interest\n(Item 3),False,6,Declarations of Interest
3,0.1,0,Mr Passmore declared that his wife was a full ...,1. Mr Passmore declared that his wife was a fu...,False,30,Mr Passmore declared that his wife was a full ...
4,0.2,0,Sir Paul Carter declared that he was a Directo...,2. Sir Paul Carter declared that he was a Dire...,False,16,Sir Paul Carter declared that he was a Directo...
5,0.22,0,Minutes of the meeting held on 16 January 2025,22. Minutes of the meeting held on 16 January ...,False,28,Minutes of the meeting held on 16 January 2025
6,0.23,0,Verbal Update by Cabinet Members,23. Verbal Update by Cabinet Members\n(Item 5),False,8,Verbal Update by Cabinet Members
7,0.1.1,0,"Mrs Chandler, Cabinet Member for Integrated Ch...","1. Mrs Chandler, Cabinet Member for Integrated...",False,273,"Mrs Chandler, Cabinet Member for Integrated Ch..."
8,0.2.1,0,"Mr Love, Cabinet Member for Education and Skil...","2. Mr Love, Cabinet Member for Education and S...",False,112,"Mr Love, Cabinet Member for Education and Skil..."
9,0.24,0,Performance Monitoring,24. Performance Monitoring\n(Item 6),False,5,Performance Monitoring


In [101]:
def verify_and_fix_chunking(processed_sections, original_text):
    """Verify no text was lost during chunking and fix numbering"""
    # 1. First verify text integrity
    reconstructed_text = ""
    for section in processed_sections:
        for chunk in section['chunks']:
            reconstructed_text += chunk['text'] + "\n\n"
    
    # Normalize whitespace for comparison
    original_clean = re.sub(r'\s+', ' ', original_text).strip()
    reconstructed_clean = re.sub(r'\s+', ' ', reconstructed_text).strip()
    
    if original_clean != reconstructed_clean:
        print("Warning: Some text may have been lost during chunking")
        print(f"Original length: {len(original_clean)}, Reconstructed length: {len(reconstructed_clean)}")
    else:
        print("Text integrity verified - no content lost during chunking")
    
    # 2. Fix inconsistent numbering
    for section in processed_sections:
        if not section['is_main_section']:
            # Check for single-child chunks that got x.y.1 numbering
            if len(section['chunks']) == 1 and '.' in section['chunks'][0]['chunk_number']:
                # Simplify numbering (e.g., 256.2.1 → 256.2)
                chunk_num = section['chunks'][0]['chunk_number']
                if chunk_num.endswith('.1'):
                    section['chunks'][0]['chunk_number'] = chunk_num[:-2]
    
    # 3. Regenerate display numbers
    for section in processed_sections:
        agenda_num = section['agenda_number']
        for i, chunk in enumerate(section['chunks'], 1):
            if section['is_main_section']:
                if len(section['chunks']) == 1:
                    chunk['display_number'] = f"{agenda_num}.0"
                else:
                    chunk['display_number'] = f"{agenda_num}.{i}"
            else:
                parent_agenda = find_parent_agenda(section, processed_sections)
                if '.' in chunk['chunk_number']:
                    # Handle existing sub-numbering
                    parts = chunk['chunk_number'].split('.')
                    if len(parts) > 2 and parts[-1] == '1' and i == 1:
                        # Simplify x.y.1 to x.y when it's the only chunk
                        chunk['display_number'] = f"{parent_agenda}.{'.'.join(parts[:-1])}"
                    else:
                        chunk['display_number'] = f"{parent_agenda}.{chunk['chunk_number']}"
                else:
                    chunk['display_number'] = f"{parent_agenda}.{chunk['chunk_number']}"
    
    return processed_sections

# Usage:
verified_sections = verify_and_fix_chunking(processed_sections, text)
final_df = create_final_dataframe(verified_sections)

# Additional verification
def check_chunk_continuity(df):
    """Check for numbering gaps or inconsistencies"""
    print("\nChunk numbering analysis:")
    prev_agenda = None
    for agenda in df['agenda_item'].unique():
        agenda_df = df[df['agenda_item'] == agenda]
        print(f"\nAgenda {agenda}: {len(agenda_df)} chunks")
        
        # Check display number sequence
        display_nums = agenda_df['display_number'].str.extract(r'(\d+)$')[0].astype(float)
        expected_seq = range(1, len(display_nums)+1)
        
        if not all(a == b for a, b in zip(display_nums, expected_seq)):
            print(f"  Warning: Non-sequential numbering in agenda {agenda}")
            print(f"  Found: {', '.join(agenda_df['display_number'])}")
        
        # Check for consistent hierarchy
        dots = agenda_df['display_number'].str.count('\.')
        if len(dots.unique()) > 1:
            print(f"  Warning: Inconsistent hierarchy levels in agenda {agenda}")
    
    # Check for duplicate display numbers
    dupes = df[df.duplicated(['display_number'], keep=False)]
    if not dupes.empty:
        print("\nWarning: Duplicate display numbers found:")
        print(dupes[['display_number', 'section_title']])

check_chunk_continuity(final_df)

Original length: 128, Reconstructed length: 18599

Chunk numbering analysis:

Agenda 0: 48 chunks
  Found: 0.0, 0.20, 0.21, 0.1, 0.2, 0.22, 0.23, 0.1, 0.2, 0.24, 0.1, 0.2, 0.3.1, 0.3.2, 0.25, 0.1, 0.2, 0.26, 0.1, 0.2, 0.27, 0.1, 0.2, 0.28, 0.1, 0.2, 0.29, 0.1, 0.2, 0.30, 0.1, 0.31, 0.1, 0.2, 0.3, 0.32, 0.1, 0.2, 0.33, 0.1, 0.2, 0.34, 0.1, 0.35, 0.36, 0.37, 0.38, 0.39

   display_number                                      section_title
3             0.1  Mr Passmore declared that his wife was a full ...
4             0.2  Sir Paul Carter declared that he was a Directo...
7             0.1  Mrs Chandler, Cabinet Member for Integrated Ch...
8             0.2  Mr Love, Cabinet Member for Education and Skil...
10            0.1  Ms Atkinson, introduced the report and highlig...
11            0.2  Mr Love highlighted the KPI for the number of ...
15            0.1  Ms Holden introduced the report and gave an ov...
16            0.2  In response to comments and questions it was s...
18      

### Claude's version

In [136]:
import pdfplumber
import re
import pandas as pd
import warnings
from nltk.tokenize import sent_tokenize
import nltk
import hashlib

# Download required NLTK data
nltk.download('punkt', quiet=True)

# Suppress PDF parsing warnings
warnings.filterwarnings("ignore", message="CropBox missing from /Page")

def extract_text_from_pdf(pdf_path):
    """Extract text from PDF while handling errors."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            return "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

def detect_numbering_pattern(text, min_count=2):  # Reduced min_count from 3 to 2
    """Detect the most prevalent numbering pattern in text."""
    patterns = {
        'digit_dot': r'(?<=\n|\A)\d+\.\s+',      # Changed to also match at start of text (\A)
        'digit_paren': r'(?<=\n|\A)\d+\)\s+',    # Changed to also match at start of text (\A)
        'letter_paren': r'(?<=\n|\A)[a-z]\)\s+', # Changed to also match at start of text (\A)
        'bullet': r'(?<=\n|\A)[•▪♦]\s+',         # Changed to also match at start of text (\A)
        'hyphen': r'(?<=\n|\A)-\s+',             # Changed to also match at start of text (\A)
        'roman_numeral': r'(?<=\n|\A)[ivxIVX]+\.\s+', # Changed to also match at start of text (\A)
    }
    
    pattern_counts = {}
    for name, pattern in patterns.items():
        try:
            count = len(re.findall(pattern, text))
            if count >= min_count:
                pattern_counts[name] = (pattern, count)
        except Exception:
            continue
    
    return max(pattern_counts.values(), key=lambda x: x[1])[0] if pattern_counts else None

def split_by_numbering(text, pattern):
    """Split text using detected numbering pattern."""
    if not pattern or not text.strip():
        return [text]
    
    try:
        # Split but keep the delimiters
        parts = re.split(f'({pattern})', text)
        if len(parts) == 1:
            return [text]
        
        # Recombine the pattern with its text
        chunks = []
        for i in range(1, len(parts), 2):
            if i+1 < len(parts):  # Ensure we don't go out of bounds
                chunk = parts[i] + parts[i+1]
                chunks.append(chunk.strip())
        
        # If the first part had content, add it too
        if parts[0].strip():
            chunks.insert(0, parts[0].strip())
            
        return chunks
    except Exception as e:
        print(f"Error in split_by_numbering: {e}")
        return [text]

def split_into_paragraphs(text):
    """Split text into paragraphs while preserving structure."""
    if not text:
        return []
    
    # Split by double newlines first
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    
    # If that doesn't work well, try single newlines
    if len(paragraphs) <= 1:
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
    
    return paragraphs

def semantic_boundary_split(text, max_words=300):
    """Split text at semantic boundaries like paragraphs, sentences, but respect max_words."""
    if not text or len(text.split()) <= max_words:
        return [text]
        
    # First try splitting by paragraphs (most semantic)
    paragraphs = split_into_paragraphs(text)
    
    chunks = []
    current_chunk = []
    current_word_count = 0
    
    if len(paragraphs) > 1:
        for para in paragraphs:
            para_words = len(para.split())
            
            # If a single paragraph exceeds max_words, we'll need to split it by sentences
            if para_words > max_words:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                    current_chunk = []
                    current_word_count = 0
                
                # Split this paragraph by sentences
                try:
                    sentences = sent_tokenize(para)
                    sentence_chunks = []
                    sent_chunk = []
                    sent_word_count = 0
                    
                    for sentence in sentences:
                        sent_words = len(sentence.split())
                        
                        if sent_word_count + sent_words > max_words and sent_chunk:
                            sentence_chunks.append(" ".join(sent_chunk))
                            sent_chunk = []
                            sent_word_count = 0
                            
                        # If a single sentence is too long, split by word count
                        if sent_words > max_words:
                            if sent_chunk:
                                sentence_chunks.append(" ".join(sent_chunk))
                                sent_chunk = []
                                sent_word_count = 0
                            
                            # Split this long sentence
                            words = sentence.split()
                            for i in range(0, len(words), max_words):
                                chunk_words = words[i:i+max_words]
                                sentence_chunks.append(" ".join(chunk_words))
                        else:
                            sent_chunk.append(sentence)
                            sent_word_count += sent_words
                    
                    if sent_chunk:
                        sentence_chunks.append(" ".join(sent_chunk))
                    
                    chunks.extend(sentence_chunks)
                    
                except Exception as e:
                    print(f"Error splitting sentences: {e}")
                    # Fall back to word-level splitting
                    words = para.split()
                    for i in range(0, len(words), max_words):
                        chunks.append(" ".join(words[i:i+max_words]))
            
            # Normal case: paragraph fits or can be combined with others
            elif current_word_count + para_words > max_words and current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = [para]
                current_word_count = para_words
            else:
                current_chunk.append(para)
                current_word_count += para_words
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks
    
    # If paragraphs didn't work well, fall back to sentence splitting
    try:
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_word_count = 0
        
        for sentence in sentences:
            word_count = len(sentence.split())
            
            # If a single sentence is too long, split by word count
            if word_count > max_words:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                    current_chunk = []
                    current_word_count = 0
                
                # Split this long sentence
                words = sentence.split()
                for i in range(0, len(words), max_words):
                    chunk_words = words[i:i+max_words]
                    chunks.append(" ".join(chunk_words))
            elif current_word_count + word_count > max_words and current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentence]
                current_word_count = word_count
            else:
                current_chunk.append(sentence)
                current_word_count += word_count
        
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        
        return chunks
    
    except Exception as e:
        print(f"Error in semantic splitting: {e}")
        # Last resort: split by words
        words = text.split()
        chunks = []
        for i in range(0, len(words), max_words):
            chunks.append(" ".join(words[i:i+max_words]))
        return chunks

def create_optimal_chunks(text, max_words=300, min_words=100, overlap_words=50):
    """Create optimally sized chunks while preserving structure and adding overlap."""
    if not text:
        return []
        
    if len(text.split()) <= max_words:
        return [text]
    
    # First try splitting by numbering patterns
    pattern = detect_numbering_pattern(text)
    if pattern:
        numbered_chunks = split_by_numbering(text, pattern)
        if len(numbered_chunks) > 1:
            # Process each numbered chunk to ensure they meet size constraints
            processed_chunks = []
            for chunk in numbered_chunks:
                if len(chunk.split()) > max_words:
                    processed_chunks.extend(semantic_boundary_split(chunk, max_words))
                else:
                    processed_chunks.append(chunk)
            
            # Apply overlapping if requested
            if overlap_words > 0:
                return add_overlap_to_chunks(processed_chunks, overlap_words)
            return processed_chunks
    
    # If numbering didn't work, try semantic boundary splitting
    chunks = semantic_boundary_split(text, max_words)
    
    # Merge very small chunks if needed
    if min_words > 0:
        chunks = merge_small_chunks(chunks, min_words)
    
    # Add overlap if requested
    if overlap_words > 0:
        chunks = add_overlap_to_chunks(chunks, overlap_words)
    
    return chunks

def merge_small_chunks(chunks, min_words):
    """Merge chunks that are smaller than min_words."""
    if not chunks or len(chunks) <= 1:
        return chunks
        
    result = []
    current_chunk = chunks[0]
    current_words = len(current_chunk.split())
    
    for i in range(1, len(chunks)):
        next_chunk = chunks[i]
        next_words = len(next_chunk.split())
        
        # If current chunk is too small and combining won't exceed max_words
        if current_words < min_words and current_words + next_words <= 400:  # Using 400 as a buffer
            current_chunk = current_chunk + " " + next_chunk
            current_words += next_words
        else:
            result.append(current_chunk)
            current_chunk = next_chunk
            current_words = next_words
    
    # Don't forget the last chunk
    result.append(current_chunk)
    return result

def add_overlap_to_chunks(chunks, overlap_words):
    """Add overlapping words between chunks for better semantic continuity."""
    if not chunks or len(chunks) <= 1 or overlap_words <= 0:
        return chunks
        
    result = []
    
    for i in range(len(chunks)):
        if i == 0:
            # First chunk stays as is
            result.append(chunks[i])
        else:
            # Get overlap from previous chunk
            prev_chunk = chunks[i-1]
            prev_words = prev_chunk.split()
            
            # Get overlap words from end of previous chunk
            overlap = " ".join(prev_words[-overlap_words:]) if len(prev_words) >= overlap_words else prev_chunk
            
            # Add overlap to beginning of current chunk
            current_chunk = overlap + " " + chunks[i]
            result.append(current_chunk)
    
    return result

def split_minutes_flexible(text):
    """Split document into main sections and sub-chunks."""
    if not text:
        return []
    
    # First pass: split by 1-3 digit numbers followed by dot and space
    pattern = r"(?<=\n)(\d{1,3})\.\s+(.+)"
    matches = list(re.finditer(pattern, text))
    
    sections = []
    
    if matches:
        preliminary_text = text[:matches[0].start()].strip()
        if preliminary_text:  # Only add preliminary if not empty
            sections.append({
                "agenda_number": "0",
                "title": "Preliminary Information",
                "text": preliminary_text,
                "is_main_section": False
            })
    
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_text = text[start:end].strip()
        sections.append({
            "agenda_number": match.group(1),
            "title": match.group(2).strip(),
            "text": section_text,
            "is_main_section": True  # Will be updated in second pass
        })
    
    # Second pass: identify main sections by sequential numbering
    if len(sections) > 1:
        try:
            # First try to find sections numbered >= 200 (traditional meeting minutes)
            try:
                # Find the first numeric agenda number that's >= 200
                first_main_idx = next(i for i, s in enumerate(sections) 
                                if s['agenda_number'].isdigit() and int(s['agenda_number']) >= 200)
                first_main_num = int(sections[first_main_idx]['agenda_number'])
                
                current_expected = first_main_num
                for section in sections[first_main_idx:]:
                    if section['agenda_number'].isdigit():
                        num = int(section['agenda_number'])
                        if num == current_expected:
                            section['is_main_section'] = True
                            current_expected += 1
                        else:
                            section['is_main_section'] = False
                    else:
                        section['is_main_section'] = False
            except StopIteration:
                # No high-numbered sections found, check if we have sequential numbering from 1
                numerical_sections = [s for s in sections if s['agenda_number'].isdigit()]
                
                # If we have numerical sections and they start close to 1
                if numerical_sections and int(numerical_sections[0]['agenda_number']) <= 5:
                    # Check if we have a sequence (1,2,3...) or (1,2,3,4...)
                    is_sequential = True
                    expected_num = int(numerical_sections[0]['agenda_number'])
                    
                    for section in numerical_sections:
                        num = int(section['agenda_number'])
                        if num == expected_num:
                            section['is_main_section'] = True
                            expected_num += 1
                        else:
                            section['is_main_section'] = False
                            is_sequential = False
                    
                    # If not sequential, default to all being main sections
                    if not is_sequential:
                        for section in sections:
                            if section['agenda_number'] != '0':  # Skip preliminary
                                section['is_main_section'] = True
                else:
                    # Default all sections to main if nothing else fits
                    for section in sections:
                        if section['agenda_number'] != '0':  # Skip preliminary
                            section['is_main_section'] = True
        except Exception as e:
            print(f"Error identifying main sections: {e}")
            # Safety fallback - mark all non-preliminary as main sections
            for section in sections:
                if section['agenda_number'] != '0':
                    section['is_main_section'] = True
    
    return sections

def generate_chunk_id(text):
    """Generate a stable ID for a chunk based on its content."""
    return hashlib.md5(text.encode('utf-8')).hexdigest()[:12]

def process_minutes(pdf_path, chunk_size=300, chunk_overlap=50, min_chunk_size=100):
    """Full processing pipeline with hierarchical numbering and content overlap."""
    try:
        text = extract_text_from_pdf(pdf_path)
        if not text:
            return []
        
        sections = split_minutes_flexible(text)
        
        # Force section 0 to be main section
        if sections and sections[0]['agenda_number'] == '0':
            sections[0]['is_main_section'] = True
        
        # Extract filename without path for metadata
        file_name = pdf_path.split('/')[-1] if '/' in pdf_path else pdf_path
        
        for section in sections:
            # Apply hierarchical numbering to chunks with overlap
            if section['is_main_section'] or len(section['text'].split()) > min_chunk_size:
                chunks = create_optimal_chunks(
                    section['text'], 
                    max_words=chunk_size, 
                    min_words=min_chunk_size,
                    overlap_words=chunk_overlap
                )
                
                # Number chunks as 1.1, 1.2, etc. for subsections
                if not section['is_main_section'] and '.' not in section['agenda_number']:
                    section['chunks'] = [
                        {
                            'chunk_number': f"{section['agenda_number']}.{i+1}",
                            'text': chunk,
                            'word_count': len(chunk.split()),
                            'chunk_id': generate_chunk_id(chunk)
                        }
                        for i, chunk in enumerate(chunks)
                    ]
                else:
                    # For main sections or already numbered subsections
                    section['chunks'] = [
                        {
                            'chunk_number': str(i+1),
                            'text': chunk,
                            'word_count': len(chunk.split()),
                            'chunk_id': generate_chunk_id(chunk)
                        }
                        for i, chunk in enumerate(chunks)
                    ]
            else:
                section['chunks'] = [{
                    'chunk_number': section['agenda_number'],
                    'text': section['text'],
                    'word_count': len(section['text'].split()),
                    'chunk_id': generate_chunk_id(section['text'])
                }]
            
            section['chunk_count'] = len(section['chunks'])
            section['avg_chunk_words'] = (
                sum(c['word_count'] for c in section['chunks']) / section['chunk_count'] 
                if section['chunk_count'] > 0 else 0
            )
            
            # Add source document metadata
            section['source_document'] = file_name
            
        return sections
    except Exception as e:
        print(f"Error processing minutes: {e}")
        return []

def create_final_dataframe(processed_sections):
    """Assign display_number based on sequential agenda progression with sub-chunk tracking. Preserve agenda 0."""
    if not processed_sections:
        return pd.DataFrame()
    
    rows = []
    for section in processed_sections:
        for chunk in section['chunks']:
            row = create_row(section, None, section['agenda_number'], chunk)
            rows.append(row)
    
    df = pd.DataFrame(rows)
    
    # Separate preliminary (agenda_item == '0') from rest
    prelim_df = df[df['agenda_item'] == '0'].copy()
    main_df = df[df['agenda_item'] != '0'].copy()

    # Assign display_number = '0' to all prelim chunks
    prelim_df['display_number'] = '0'

    # Convert to int for sorting and logic
    main_df['agenda_item_int'] = main_df['agenda_item'].astype(int)
    
    display_numbers = []
    current_agenda = None
    current_index = -1
    sub_index = 0

    for i, row in main_df.iterrows():
        agenda = row['agenda_item_int']
        
        if current_agenda is None:
            # First real agenda item
            current_agenda = agenda
            current_index = agenda
            sub_index = 0
            display_numbers.append(f"{current_index}.0")
        elif agenda == current_index:
            # Same as last -> sub-item
            sub_index += 1
            display_numbers.append(f"{current_index}.{sub_index}")
        elif agenda == current_index + 1:
            # Expected new agenda item
            current_index = agenda
            current_agenda = agenda
            sub_index = 0
            display_numbers.append(f"{current_index}.0")
        else:
            # Gap in agenda numbering: treat as another sub-item
            sub_index += 1
            display_numbers.append(f"{current_index}.{sub_index}")

    main_df['display_number'] = display_numbers
    
    # Merge back
    final_df = pd.concat([prelim_df, main_df], ignore_index=True)
    
    # Cleanup
    final_df.drop(columns=['agenda_item_int'], errors='ignore', inplace=True)
    
    final_columns = [
        'chunk_id', 'display_number', 'agenda_item', 'section_title', 
        'text', 'is_main_section', 'word_count', 'parent_section',
        'source_document'
    ]
    
    return final_df[[col for col in final_columns if col in final_df.columns]]

def find_parent_agenda(section, all_sections):
    """Find the parent agenda number for a subsection"""
    # For sections after preliminary (0)
    if section['agenda_number'] == '0':
        return '0'
    
    # Find the last main section before this one
    for s in reversed(all_sections[:all_sections.index(section)]):
        if s['is_main_section']:
            return s['agenda_number']
    return section['agenda_number']  # fallback

def create_row(section, display_num, agenda_num, chunk):
    """Helper to create consistent row data"""
    row = {
        'chunk_id': chunk.get('chunk_id', ''),
        'display_number': display_num,
        'agenda_item': agenda_num,
        'section_title': section['title'],
        'text': chunk['text'],
        'is_main_section': section['is_main_section'],
        'word_count': chunk['word_count'],
        'parent_section': section['title']
    }
    
    # Add source document if available
    if 'source_document' in section:
        row['source_document'] = section['source_document']
        
    return row

def prepare_for_embeddings(df, text_column='text', metadata_columns=None):
    """Prepare dataframe for embeddings generation by separating text and metadata."""
    if metadata_columns is None:
        metadata_columns = [
            'chunk_id', 'display_number', 'agenda_item', 'section_title',
            'is_main_section', 'parent_section', 'source_document'
        ]
    
    # Ensure all requested columns exist
    available_metadata = [col for col in metadata_columns if col in df.columns]
    
    # Create text series for embeddings
    texts = df[text_column].tolist()
    
    # Create metadata list
    metadata = df[available_metadata].to_dict('records')
    
    return texts, metadata

# Example usage with adjustable parameters
def process_pdf_for_embeddings(pdf_path, chunk_size=300, chunk_overlap=50, min_chunk_size=200):
    """Process PDF and prepare for embeddings with customizable chunking parameters."""
    processed_sections = process_minutes(
        pdf_path, 
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        min_chunk_size=min_chunk_size
    )
    
    final_df = create_final_dataframe(processed_sections)
    
    # Get texts and metadata ready for embeddings
    texts, metadata = prepare_for_embeddings(final_df)
    
    # Return all necessary components
    return {
        'dataframe': final_df,
        'texts': texts,
        'metadata': metadata,
        'sections': processed_sections
    }

# Example pipeline
if __name__ == "__main__":
    pdf_path = pdf_path # replace with the pdf name here
    
    # Process with custom parameters
    result = process_pdf_for_embeddings(
        pdf_path,
        chunk_size=500,  # Max words per chunk
        chunk_overlap=50,  # Words overlap between chunks 
        min_chunk_size=200  # Minimum chunk size before merging
    )
    
    # Access results
    df = result['dataframe']
    texts_for_embeddings = result['texts']
    metadata_for_embeddings = result['metadata']
    
    # Print statistics
    print(f"Total chunks: {len(df)}")
    print(f"Average chunk size (words): {df['word_count'].mean():.1f}")
    print(f"Min chunk size: {df['word_count'].min()}")
    print(f"Max chunk size: {df['word_count'].max()}")


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Total chunks: 48
Average chunk size (words): 62.0
Min chunk size: 5
Max chunk size: 495


In [137]:
pd.set_option('display.max_colwidth', None)
df.head(20)

Unnamed: 0,chunk_id,display_number,agenda_item,section_title,text,is_main_section,word_count,parent_section,source_document
0,07f915be82f2,0.0,0,Preliminary Information,"KENT COUNTY COUNCIL\n_____________________________________________\nCHILDREN'S, YOUNG PEOPLE AND EDUCATION CABINET\nCOMMITTEE\nMINUTES of a meeting of the Children's, Young People and Education Cabinet\nCommittee held at Council Chamber, Sessions House, County Hall, Maidstone on\nThursday, 27th February, 2025.\nPRESENT: Mr M C Dance (Chairman), Mr M Dendor (Vice-Chairman), Mr P V Barrington-\nKing, Mr D Beaney, Mr P Bartlett (Substitute for Mr A Sandhu, MBE), Sir Paul Carter, CBE\n(Substitute for Mrs M McArthur), Ms M Dawkins, Ms S Hamilton, Ms J Hawkins,\nMs J Meade, Mr C Passmore (Substitute for Mrs T Dean, MBE), Mr P Stepto and\nMr Q Roper\nALSO PRESENT: Mr R Love, OBE (Cabinet Member for Education and Skills) and Mrs S\nChandler (Cabinet Member for Integrated Children’s Services)\nIN ATTENDANCE: Sarah Hammond (Corporate Director Children, Young People and\nEducation), Craig Chapman (Head of Fair Access), Christy Holden (Head of Children's\nCommissioning), James Clapson (Democratic Services Officer), Katherine Atkinson\n(Assistant Director, Management Information and Intelligence, Integrated Children's\nServices), Helen Cook (Senior Commissioner), Alice Gleave (Interim Assistant Director for\nSEND Operations), Emma Hanson (Senior Commissioning Manager), Georgia\nHumphreys (Democratic Services Officer), Kevin Kasaven (Director of Children’s\nCountywide Services), Christine McInnes (Director of Education), Mark Scrivener (Head of\nRisk and Delivery Assurance), Sarah Chapman (Children & Young People's Occupational\nTherapy service Manager), Robert Veale (Assistant Director Education (East Kent)), Jude\nFarrell (Head of Service), Michelle Stanley (Education Lead Adviser), Karen Stone\n(Revenue Finance Manager (0 - 25 services)), David Adams (Assistant Director Education\n(South Kent)) and Nick Abrahams (Area Education Officer – West Kent)\nUNRESTRICTED ITEMS",True,259,Preliminary Information,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
1,89c1624e5efe,20.0,20,Apologies and Substitutes,"20. Apologies and Substitutes\n(Item 2)\nApologies had been received from Mr Cooke, Mr Reidy, Mrs Game, Mr Manion,\nMrs McArthur for whom Sir Paul Carter was present as substitute, Mr Sandhu for\nwhom Mr Bartlett was present as substitute, and Mrs Dean for whom Mr Passmore\nwas present as a substitute.",True,52,Apologies and Substitutes,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
2,2b742a077858,21.0,21,Declarations of Interest,21. Declarations of Interest\n(Item 3),True,6,Declarations of Interest,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
3,19edd0deadf4,21.1,1,Mr Passmore declared that his wife was a full time private educational,1. Mr Passmore declared that his wife was a full time private educational\npsychologist working with Kent residents and the full range of private and\npublic educational establishments in Kent.,True,30,Mr Passmore declared that his wife was a full time private educational,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
4,dad351dac43c,21.2,2,Sir Paul Carter declared that he was a Director of the Lee Academy Trust.,2. Sir Paul Carter declared that he was a Director of the Lee Academy Trust.\n1,True,16,Sir Paul Carter declared that he was a Director of the Lee Academy Trust.,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
5,fb0f1d0ee7f8,22.0,22,Minutes of the meeting held on 16 January 2025,22. Minutes of the meeting held on 16 January 2025\n(Item 4)\nRESOVED that the minutes of the meeting held on 16 January 2025 were a correct\nrecord.,True,28,Minutes of the meeting held on 16 January 2025,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
6,8fa348d51852,23.0,23,Verbal Update by Cabinet Members,23. Verbal Update by Cabinet Members\n(Item 5),True,8,Verbal Update by Cabinet Members,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
7,8ec8305e5a32,23.1,1,"Mrs Chandler, Cabinet Member for Integrated Children’s Services, gave a","1. Mrs Chandler, Cabinet Member for Integrated Children’s Services, gave a\nverbal update on the following:\na. The Kent Practice Framework had been given Kent Safeguarding\nChildren Multi-Agency Partnership (KSCMP) executive approval. The\nmultiagency agreement was a key development for the design phase\nof reforming children’s services. The framework offered the\nopportunity for families to receive consistent practice from across the\nmulti-agency.\nb. The LADO and Education Safeguarding Advisory Service (LESAS)\nabsorbed the Sector Led Improvement Programme (SLIP). This\nprovided an opportunity for KCC to work closely alongside the\nDepartment of Education and other Local Authorities to help design\nthe future of children’s services.\nc. Kent Youth County Council took part in the launch of ‘Don’t\nDisrespect’, which was a partnership campaign between Kent County\nCouncil, Kent Police and the Kent and Medway Violence Reduction\nUnit to combat street harassment and violence towards women and\ngirls. Mrs Chandler shared that the ‘Don’t Disrespect’ film and\ncampaign including posters and social media campaigns were\navailable at: www.dontdisrespect.uk.\nd. Virtual School Kent had received further accreditation for the Nurture\nProgramme. Tony Doran, the Headteacher of Virtual School Kent had\nbeen asked to be part of an All Party Parliamentary Group on\nInclusion and Relational Approaches in Education.\ne. Mrs Chandler attended the CYPE Conference Day which focused on\nneurodiversity. The conference provided insight on the latest research\nfrom the Children’s Commissioner, the work of NELFT, the\nneuroscience of learning and the biology of stress, and the Dynamic\nSupport Service.\nf. Safer Internet Day 2025 took place on 11th February 2025, the theme\nwas “Too good to be true? Protecting yourself and others from scams\nonline”.",True,273,"Mrs Chandler, Cabinet Member for Integrated Children’s Services, gave a",Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
8,69e611905143,23.2,2,"Mr Love, Cabinet Member for Education and Skills, gave a verbal update on","2. Mr Love, Cabinet Member for Education and Skills, gave a verbal update on\nthe following:\na. This year's SEN Phase Transfer had completed 97.5% of decisions\nwithin the published timeline, 91% of families had received a named\npreference. Mr Love shared his pride in and expressed\ncongratulations for the teams achievement, whilst recognising that\nthere was more work to be done.\n2\nb. Mr Love had conducted four school visits in February, three of which\nbeing special schools. KCC were supporting two of the schools visited\nto expand, to ensure that sufficient special school places were\navailable for those children with the most complex needs.\nRESOLVED that the updates were noted.",True,112,"Mr Love, Cabinet Member for Education and Skills, gave a verbal update on",Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf
9,9868fbb8a8ec,24.0,24,Performance Monitoring,24. Performance Monitoring\n(Item 6),True,5,Performance Monitoring,Printed minutes 27022025 1000 Childrens Young People and Education Cabinet Committee.pdf


In [146]:

import hashlib
import pandas as pd

def generate_chunk_id(text):
    """Generate a stable ID for a chunk based on its content."""
    return hashlib.md5(text.encode('utf-8')).hexdigest()[:12]

def merge_small_agenda_chunks(df, min_words=100):
    """
    Merge small chunks across agenda items if they're below the minimum threshold.
    If a small chunk is surrounded by large ones, merge it with the next.
    Update text, display_number, word_count, chunk_id.
    """
    if df.empty:
        return df.copy()

    merged_rows = []
    buffer = None
    columns = df.columns.tolist()

    df = df.reset_index(drop=True)

    i = 0
    while i < len(df):
        current = df.iloc[i].copy()

        if current["word_count"] >= min_words:
            if buffer is not None:
                merged_rows.append(buffer)
                buffer = None
            merged_rows.append(current.to_dict())
            i += 1
        else:
            if buffer is None:
                buffer = current.copy()
                buffer["merged_from"] = [buffer["display_number"]]
            else:
                buffer["text"] += "\n\n" + current["text"]
                buffer["word_count"] += current["word_count"]
                buffer["display_number"] += "+" + current["display_number"]
                buffer["merged_from"].append(current["display_number"])

            i += 1

            # Look ahead: merge with next large chunk if we're done with small ones
            if i < len(df) and df.iloc[i]["word_count"] >= min_words:
                next_row = df.iloc[i].copy()
                buffer["text"] += "\n\n" + next_row["text"]
                buffer["word_count"] += next_row["word_count"]
                buffer["display_number"] += "+" + next_row["display_number"]
                buffer["merged_from"].append(next_row["display_number"])
                i += 1

                merged_rows.append(buffer)
                buffer = None

    if buffer is not None:
        merged_rows.append(buffer)

    result_df = pd.DataFrame(merged_rows)

    # Recalculate chunk_id for merged chunks
    result_df["chunk_id"] = result_df["text"].apply(generate_chunk_id)

    return result_df

# Step 1: Flatten the processed sections into a dataframe
df_flat = create_final_dataframe(processed_sections)

# Step 2: Apply the post-processing merge logic
df_merged = merge_small_agenda_chunks(df_flat, min_words=100)

# Step 3: View the results
df_merged[["display_number", "word_count", "merged_from"]]



Unnamed: 0,display_number,word_count,merged_from
0,0,259,
1,20.0+21.0+21.1+21.2+22.0+23.0+23.1,413,"[20.0, 21.0, 21.1, 21.2, 22.0, 23.0, 23.1]"
2,23.2,112,
3,24.0+24.1+24.2+24.3,382,"[24.0, 24.1, 24.2, 24.3]"
4,24.4,275,
5,25.0+25.1+25.2,208,"[25.0, 25.1, 25.2]"
6,26.0+26.1+26.2+27.0+27.1+27.2,280,"[26.0, 26.1, 26.2, 27.0, 27.1, 27.2]"
7,28.0+28.1+28.2,250,"[28.0, 28.1, 28.2]"
8,29.0+29.1+29.2+30.0+30.1+31.0+31.1+31.2+31.3,491,"[29.0, 29.1, 29.2, 30.0, 30.1, 31.0, 31.1, 31.2, 31.3]"
9,32.0+32.1+32.2+33.0+33.1+33.2+34.0+34.1+35.0+36.0+37.0+38.0+39.0,257,"[32.0, 32.1, 32.2, 33.0, 33.1, 33.2, 34.0, 34.1, 35.0, 36.0, 37.0, 38.0, 39.0]"


In [147]:
print("Before:", df_flat.shape[0], "chunks")
print("After: ", df_merged.shape[0], "chunks")


Before: 48 chunks
After:  10 chunks


In [144]:
df_merged

Unnamed: 0,chunk_id,display_number,agenda_item,section_title,text,is_main_section,word_count,parent_section,merged_from
0,07f915be82f2,0,0,Preliminary Information,"KENT COUNTY COUNCIL\n_____________________________________________\nCHILDREN'S, YOUNG PEOPLE AND EDUCATION CABINET\nCOMMITTEE\nMINUTES of a meeting of the Children's, Young People and Education Cabinet\nCommittee held at Council Chamber, Sessions House, County Hall, Maidstone on\nThursday, 27th February, 2025.\nPRESENT: Mr M C Dance (Chairman), Mr M Dendor (Vice-Chairman), Mr P V Barrington-\nKing, Mr D Beaney, Mr P Bartlett (Substitute for Mr A Sandhu, MBE), Sir Paul Carter, CBE\n(Substitute for Mrs M McArthur), Ms M Dawkins, Ms S Hamilton, Ms J Hawkins,\nMs J Meade, Mr C Passmore (Substitute for Mrs T Dean, MBE), Mr P Stepto and\nMr Q Roper\nALSO PRESENT: Mr R Love, OBE (Cabinet Member for Education and Skills) and Mrs S\nChandler (Cabinet Member for Integrated Children’s Services)\nIN ATTENDANCE: Sarah Hammond (Corporate Director Children, Young People and\nEducation), Craig Chapman (Head of Fair Access), Christy Holden (Head of Children's\nCommissioning), James Clapson (Democratic Services Officer), Katherine Atkinson\n(Assistant Director, Management Information and Intelligence, Integrated Children's\nServices), Helen Cook (Senior Commissioner), Alice Gleave (Interim Assistant Director for\nSEND Operations), Emma Hanson (Senior Commissioning Manager), Georgia\nHumphreys (Democratic Services Officer), Kevin Kasaven (Director of Children’s\nCountywide Services), Christine McInnes (Director of Education), Mark Scrivener (Head of\nRisk and Delivery Assurance), Sarah Chapman (Children & Young People's Occupational\nTherapy service Manager), Robert Veale (Assistant Director Education (East Kent)), Jude\nFarrell (Head of Service), Michelle Stanley (Education Lead Adviser), Karen Stone\n(Revenue Finance Manager (0 - 25 services)), David Adams (Assistant Director Education\n(South Kent)) and Nick Abrahams (Area Education Officer – West Kent)\nUNRESTRICTED ITEMS",True,259,Preliminary Information,
1,a38ee3ab922c,20.0+21.0+21.1+21.2+22.0+23.0+23.1,20,Apologies and Substitutes,"20. Apologies and Substitutes\n(Item 2)\nApologies had been received from Mr Cooke, Mr Reidy, Mrs Game, Mr Manion,\nMrs McArthur for whom Sir Paul Carter was present as substitute, Mr Sandhu for\nwhom Mr Bartlett was present as substitute, and Mrs Dean for whom Mr Passmore\nwas present as a substitute.\n\n21. Declarations of Interest\n(Item 3)\n\n1. Mr Passmore declared that his wife was a full time private educational\npsychologist working with Kent residents and the full range of private and\npublic educational establishments in Kent.\n\n2. Sir Paul Carter declared that he was a Director of the Lee Academy Trust.\n1\n\n22. Minutes of the meeting held on 16 January 2025\n(Item 4)\nRESOVED that the minutes of the meeting held on 16 January 2025 were a correct\nrecord.\n\n23. Verbal Update by Cabinet Members\n(Item 5)\n\n1. Mrs Chandler, Cabinet Member for Integrated Children’s Services, gave a\nverbal update on the following:\na. The Kent Practice Framework had been given Kent Safeguarding\nChildren Multi-Agency Partnership (KSCMP) executive approval. The\nmultiagency agreement was a key development for the design phase\nof reforming children’s services. The framework offered the\nopportunity for families to receive consistent practice from across the\nmulti-agency.\nb. The LADO and Education Safeguarding Advisory Service (LESAS)\nabsorbed the Sector Led Improvement Programme (SLIP). This\nprovided an opportunity for KCC to work closely alongside the\nDepartment of Education and other Local Authorities to help design\nthe future of children’s services.\nc. Kent Youth County Council took part in the launch of ‘Don’t\nDisrespect’, which was a partnership campaign between Kent County\nCouncil, Kent Police and the Kent and Medway Violence Reduction\nUnit to combat street harassment and violence towards women and\ngirls. Mrs Chandler shared that the ‘Don’t Disrespect’ film and\ncampaign including posters and social media campaigns were\navailable at: www.dontdisrespect.uk.\nd. Virtual School Kent had received further accreditation for the Nurture\nProgramme. Tony Doran, the Headteacher of Virtual School Kent had\nbeen asked to be part of an All Party Parliamentary Group on\nInclusion and Relational Approaches in Education.\ne. Mrs Chandler attended the CYPE Conference Day which focused on\nneurodiversity. The conference provided insight on the latest research\nfrom the Children’s Commissioner, the work of NELFT, the\nneuroscience of learning and the biology of stress, and the Dynamic\nSupport Service.\nf. Safer Internet Day 2025 took place on 11th February 2025, the theme\nwas “Too good to be true? Protecting yourself and others from scams\nonline”.",False,413,Apologies and Substitutes,"[20.0, 21.0, 21.1, 21.2, 22.0, 23.0, 23.1]"
2,6632c3d2124e,23.2+24.0+24.1+24.2+24.3,2,"Mr Love, Cabinet Member for Education and Skills, gave a verbal update on","2. Mr Love, Cabinet Member for Education and Skills, gave a verbal update on\nthe following:\na. This year's SEN Phase Transfer had completed 97.5% of decisions\nwithin the published timeline, 91% of families had received a named\npreference. Mr Love shared his pride in and expressed\ncongratulations for the teams achievement, whilst recognising that\nthere was more work to be done.\n2\nb. Mr Love had conducted four school visits in February, three of which\nbeing special schools. KCC were supporting two of the schools visited\nto expand, to ensure that sufficient special school places were\navailable for those children with the most complex needs.\nRESOLVED that the updates were noted.\n\n24. Performance Monitoring\n(Item 6)\n\n1. Ms Atkinson, introduced the report and highlighted key aspects throughout,\nexplaining that further information had been added to scorecard such as the\nrates of permanent exclusions and severe/persistent absences.\n\n2. Mr Love highlighted the KPI for the number of EHCP’s issued within 20\nweeks, emphasising the importance of ensuring improvements within this\nprocess were long term and sustainable. Additionally, noting that there has\nbeen significant improvements in progress, Mr Love was confident that for\nFebruary, Kent would be within the top quartile of performers in England.\n\n3. In response to comments and questions it was said: a. Ms Atkinson shared that the backlog of cases had decreased and the timelines of EHCP’s was under control. The idea of quarterly targets had been raised to have a continuous improvement approach. Regarding the take-up of early free education places, Ms Atkinson explained that it was seasonal, there were significant changes within the early years sector that was affecting take-up rates. Further indicators for early years were going to be explored to provide more information. Mr Love added in October 2023 the oldest EHCP case was 124 weeks old, compared to updated figures where the oldest case was around 20 weeks old. The service was significantly improving, there was an aim to chase targets but not compromise quality. b. Mr Chapman shared that the volume of available Educational Psychologists was limited, there was an intention to make better use of staff time rather than increasing the numbers, additionally Educational Psychologists were aiming to increase their availability. c. In order to tackle persistent absences in schools a multi-disciplinary approach was required, there had been more challenges in schools since the COVID-19 Pandemic. There was research going into emotional based school avoidance, how to support schools with this and ensure front line staff understanding. The was going to be a session with Assistant Directors about absence, and how to make sure all those involved had a full understanding of the causes, the way to address it and the support that could be offered. Last year training was developed for front line staff, there was a desire to refresh and expand this. Ms Chandler added that school avoidance was a key issue for early help teams and that there was work being",False,494,"Mr Love, Cabinet Member for Education and Skills, gave a verbal update on","[23.2, 24.0, 24.1, 24.2, 24.3]"
3,071c5b3c5b64,24.4,3,In response to comments and questions it was said:,"done to tackle this issue. Ms Hammond added that there was an 3 annual meeting with two South East local authority colleagues for peer review, the topic for the upcoming meeting was long term school absences. d. A Member noted the importance of clear thresholds and criteria for the balance of mainstream inclusion and special school places, explaining that national Government and the Tribunal System needed to define these. Mr Love agreed that there was a need for further collaboration with the Government to refine and strengthen guidance. Additionally, the Tribunal Service needed to consider the broader responsibilities of Local Authorities in resource allocation. The thresholds for EHCP’s used by KCC were those outlined in the SEND Code of Practice. Mr Love was to continue his dialog with Head Teacher’s and Principal’s around this topic. Ms McInnes added that guidance was needed, Officers had been supporting school leaders in Kent to develop that guidance and descriptors, the document was in its final stages. There was a working group of special school Head Teachers who were developing the criteria for the different types of special needs, to help identify which children were best placed in special schools, the document was to be published in May 2025. e. Ms McInnes recognised the issue of Educational Psychologists not receiving required information in a timely manner from schools. To combat this, termly Special Educational Needs Coordinator (SENCO) briefings had been established, there were separate briefings for early years, primary and secondary to address individual issues. These briefings had been running for 18 months, the benefits of the SENCO’s work was being seen. RESOLVED that the Committee noted the update.",False,275,In response to comments and questions it was said:,
4,d45c9db66183,25.0+25.1+25.2+26.0+26.1+26.2+27.0+27.1+27.2+28.0+28.1+28.2,25,CYPE Contract Register- Overview of Commissioned Contracts,"25. CYPE Contract Register- Overview of Commissioned Contracts\n(Item 7)\n\n1. Ms Holden introduced the report and gave an overview of the content.\n\n2. In response to comments and questions it was said:\na. Ms McInnes shared that there was work being done to reduce the\nfigures associated with the non-maintained and independent special\nschool placements. This was being done through strengthening the\ncommissioning system, there were visits to non-maintained and\nindependent special schools being undertaken in order to check that\nyoung people were attending school and receiving the correct\nservices. Additionally, issues were addressed through the special\nschool review. Mrs Chandler explained that the Policy and Resources\nCabinet Committee covered procurement which provided a level of\noversight. Mr Love added that the special school review was\nimportant to ensure that the right provision was in the right place.\nThere were expansions taking place on schools and new schools\nwere being opened with the aim of being able to have the option to\nsend children and young people to maintained special schools.\n4\nb. Ms Hammond explained that there would always be a need for\nindependent special schools, whilst they could be specialised and\ntargeted, they came at a significant cost.\nRESOLVED that the Committee noted the report.\n\n26. Risk Management: Children, Young People and Education\n(Item 8)\n\n1. Mr Scrivener, introduced the report and gave an overview of the risks\npresented within the report.\n\n2. In response to comments and questions it was said:\na. Mr Love explained that the safety valve was not the highest risk to\nKCC rather it was the historic overspending on the high needs\nspending block. There was a vast amount of work done to bring\nspending down in line with government allocation. It was believed that\nthe safety valve was a part of the solution not the risk.\nRESOLVED that the Committee considered the risks presented.\n\n27. 25/00022 Admission Arrangements and Scheme for 2026-27 Academic Year\n(Item 9)\n\n1. Mr Chapman introduced the annual report and gave an overview of its\ncontent.\n\n2. In response to comments and questions it was said:\na. Mr Chapman shared that proposed developments were considered in\nareas where reductions were agreed. An advantage of the\narrangement of the admissions process was that if there was an\nunexpected influx of children, schools were able to offer places above\ntheir PAN. Mr Love added that future developments were taken into\nconsideration in the Annual Kent Commissioning Plan, which was a\nfive year rolling programme that had a significant amount of\nbackground intelligence, with the opportunity to adjust if needed.\nb. When asked if schools would be provided with additional assistance\nas school boundaries changed. Mr Chapman explained that this issue\nwould not be directly addressed through the admission process,\nhowever it was confirmed that there would be adjustments to funding\nrelative to need.\nRESOLVED that the Committee considered and endorsed the decision of the\ndetermined admission arrangements.\n\n28. 25/00032 Therapies – Recommissioning of SEN Therapy provision to enable\naccess to the Education Curriculum\n(Item 10)\n5\n\n1. Ms Holden, introduced the report and gave an overview of its content.\n\n2. In response to comments and questions it was said:\na. Ms Holden explained that according to the SEND Code of Practice\nthe Local Authority had to cover the costs of any failures in NHS\ncommissioning. The contract represented 20% of the total therapy\nprovision across the county, which covered KCC's statutory delivery\nfor therapy for children who needed it in order to access the education\ncurriculum. Additionally, within the new contract, the service delivery\nmodel was to be changed to drive efficiency, the NHS invested in the\nBalance System model in 2020, the providers had been adapting their\nmodel which increased the efficiency of the system.\nb. The NHS invested £13 million into therapeutic services, whilst the\nLocal Authority invested £2.2 million, there was intention to bring a\nproportion of that funding in-house for physiotherapy and occupational\ntherapy delivery, £1.8 million would be reprocured for speech,\nlanguage and communication needs.\nc. Due to the contracts being legacy NHS provider contracts, the Local\nAuthority did not have oversight of the KPI’s on the provider contracts,\nthis was to change to ensure the Local Authority had further oversight\nand decision making powers in the KPI’s.\nd. Ms Holden was to come back to the Committee with specific funding\ndetails.\nRESOLVED that the Committee considered and endorsed the proposed decision.",False,738,CYPE Contract Register- Overview of Commissioned Contracts,"[25.0, 25.1, 25.2, 26.0, 26.1, 26.2, 27.0, 27.1, 27.2, 28.0, 28.1, 28.2]"
5,00e82fff961c,29.0+29.1+29.2+30.0+30.1+31.0+31.1+31.2+31.3,29,25/00020 Community Learning and Skills (CLS) - Sub-contracting Education,"29. 25/00020 Community Learning and Skills (CLS) - Sub-contracting Education\nand Skills Funding Agency provision for 16 to 19 year olds\n(Item 11)\n\n1. Ms Holden, introduced the report explaining that the team were waiting for\nconfirmation of funding.\n\n2. In response to comments and questions it was said:\na. Ms Farrell confirmed that the providers of delivery for services could\nbe managed as appropriate, there was work done with the Education\nDirectorate to track needs within districts in Kent. This sub-contracting\nprovision would allow the service to offer opportunities as they arose\nwithin Kent.\nb. Members noted the importance of young people’s awareness of the\nbursaries available. Ms Farrell was to ensure that offers of these\nbursaries were clear to young people.\nRESOLVED that the Committee considered and endorsed the proposed decision.\n\n30. 25/00021 Community Learning and Skills (CLS) - Sub-contracting Education\nand Skills Funding Agency provision for 19 to 25 year olds\n(Item 12)\n6\n\n1. Ms Holden, introduced the report and gave an overview of its content.\nRESOLVED that the Committee considered and endorsed the proposed decision.\n\n31. 25/00023 Raising Ambition. Enabling Curiosity. Building Resilience. A\nStrategy for the Future of Education in Kent, 2025-2030\n(Item 13)\n\n1. Mr Love, introduced the report and gave an overview of its content. Ms\nStanley shared that the documents shared with Members were to be revised\nbefore publication.\n\n2. Ms McInnes, added that around 50% of schools in Kent were academies,\nthere was a desire for schools to have a shared focus on the best outcomes\nfor children, young people and families within local areas. Additionally,\ngovernment funding for the Local Authorities duty of school improvement for\nmaintained schools had ceased and was to be funded from core funding.\n\n3. In response to comments and question it was said:\na. Mr Love explained that this strategy allowed the opportunity for Kent’s\nopinion on the role of education to be clear. Additionally, it provided\nthe opportunity to produce changes, the strategy was to continue to\nevolve and refine, it was to be used as a tool for future conversations\nand strategies.\nb. It was difficult to measure the resilience of the strategy in terms of\nlocal government reorganisation as the exact details had not yet been\nprovided by the Government. Mr Love emphasised the importance of\nhaving clarity on the correct strategy for Kent instead of waiting to find\nout what was going to happen next.\nc. Mr Love explained that the strategy would support Kent’s Grammar\nSchools.\nd. Mrs Chandler shared the importance of looking at the early years\nsection of this strategy, emphasising the importance to early\nintervention.\ne. Ms Stanley added that there was an action plan that would be\nevaluated against progress, feedback, qualitative feedback from the\nsector, children and young people and the established data sets.\nThere were sections on implementation, SEND and the environment\nwithin this.\nRESOLVED that the Committee considered and endorsed the proposed decision.",False,491,25/00020 Community Learning and Skills (CLS) - Sub-contracting Education,"[29.0, 29.1, 29.2, 30.0, 30.1, 31.0, 31.1, 31.2, 31.3]"
6,c442026192c0,32.0+32.1+32.2+33.0+33.1+33.2+34.0+34.1+35.0+36.0+37.0+38.0+39.0,32,25/00010 Proposed Revision of Rates Payable and Charges Levied by Kent,"32. 25/00010 Proposed Revision of Rates Payable and Charges Levied by Kent\nCounty Council for Children's Social Care Services in 2025-26\n(Item 14)\n\n1. Ms Stone, introduced the annual report and gave an overview of its content.\n\n2. Ms Hammond requested that the link to initial foster care enquiry website be\nincluded in the minutes. Which could be accessed here: Fostering Enquiry\n7\nRESOLVED that the Committee considered and endorsed the proposals.\n\n33. 25/00009 Proposal to change the age range of Blean Primary School,\nCanterbury, from 4-11 years to 3-11 years\n(Item 15)\n\n1. Mr Veale introduced the paper and gave an overview of its content.\n\n2. When asked how issues with parking would be dealt with, Mr Veale shared\nthat parking had been provided by the University of Kent. Additionally,\nexplaining that the school needed to develop its own travel plan. Mr Veale\nwas not able to comment on any proposed developments.\nRESOLVED that the Committee considered and endorsed the proposed decision.\n\n34. SACRE Annual Report 2023/24\n(Item 17)\n\n1. The Committee were advised that the Chairman of the SACRE Committee\nendorsed the report.\nRESOLVED that the Committee noted the report.\n\n35. Work Programme\n(Item 18)\nRESOLVED that the work programme was noted.\n\n36. 25/00030 Proposed Additional Pathway of The Oaks Specialist College\n(Item 19)\n\n37. 25/00029 Additional capital funding - Whitfield Aspen Primary School (Dover)\n(Item 20)\n\n38. 25/00006 Chilmington Green - 2025 allocations\n(Item 21)\n\n39. 25/00031 Proposal to establish a new 2FE Primary School in Thanington\nCanterbury for September 2026\n(Item 16)\n8",False,257,25/00010 Proposed Revision of Rates Payable and Charges Levied by Kent,"[32.0, 32.1, 32.2, 33.0, 33.1, 33.2, 34.0, 34.1, 35.0, 36.0, 37.0, 38.0, 39.0]"
