# Scraping KCC Meetings in bulk

### Scraping all meetings from the web

In [66]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import os
import re
import time
import random
from tqdm import tqdm

OUTPUT_FILE = "../data/meetings/meetings_metadata.jsonl"
COMMITTEE_ID = "144"
MIDS = range(8000, 9000)  # Change as needed
BASE_URL = "https://democracy.kent.gov.uk"

def clean_day_suffix(date_str):
    return re.sub(r'(\d{1,2})(st|nd|rd|th)', r'\1', date_str)

def load_seen_ids(output_file):
    seen = set()
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    obj = json.loads(line)
                    seen.add(obj["web_meeting_code"])
                except:
                    continue
    return seen

def scrape_meeting_metadata(mid, cid="144"):
    url = f"{BASE_URL}/ieListDocuments.aspx?CId={cid}&MId={mid}"

    try:
        res = requests.get(url, timeout=6)
        if res.status_code != 200:
            return None

        soup = BeautifulSoup(res.text, "html.parser")

        # --- Status detection ---
        page_title = soup.find("title").text.strip()
        status_match = re.search(r'\b(CANCELLED|WITHDRAWN|POSTPONED|MOVED|NEW)\b', page_title.upper())
        status = status_match.group(1).lower() if status_match else "scheduled"

        # --- Heading and metadata ---
        title_candidates = soup.find_all(["h1", "h2"])
        full_heading = ""
        for tag in title_candidates:
            if "Committee" in tag.get_text() or re.search(r"\d{4}", tag.get_text()):
                full_heading = tag.get_text(strip=True)
                break

        match = re.match(
            r"^(.*?)\s*-\s*(Monday|Tuesday|Wednesday|Thursday|Friday),\s*(.*?),\s*(\d{4})\s*(\d{1,2}\.\d{2})\s*(am|pm)",
            full_heading
        )

        if match:
            committee_name = match.group(1).strip()
            raw_day = clean_day_suffix(match.group(3))
            meeting_date = datetime.strptime(f"{raw_day}, {match.group(4)}", "%d %B, %Y").strftime("%Y-%m-%d")
            meeting_time = datetime.strptime(match.group(5) + match.group(6), "%I.%M%p").strftime("%H:%M")
        else:
            committee_name = None
            meeting_date = None
            meeting_time = None

        # --- Agenda item extraction with PDFs ---
        agenda_items = []
        for row in soup.find_all("tr"):
            number_cell = row.find("td", class_="mgItemNumberCell")
            content_cells = row.find_all("td")
            if number_cell and len(content_cells) > 1:
                item_number = number_cell.get_text(strip=True)
                content_td = content_cells[1]
                paragraphs = content_td.find_all("p")
                item_title = paragraphs[0].get_text(strip=True) if paragraphs else ""
                item_text = "\n".join(p.get_text(strip=True) for p in paragraphs[1:]) if len(paragraphs) > 1 else ""

                # Find any PDF links inside this agenda item
                item_pdfs = []
                for a in content_td.find_all("a", href=True):
                    href = a["href"]
                    if href.lower().endswith(".pdf"):
                        full_url = BASE_URL + "/" + href.lstrip("/")
                        item_pdfs.append(full_url)

                agenda_items.append({
                    "item_number": item_number,
                    "item_title": item_title,
                    "item_text": item_text,
                    "pdf_urls": item_pdfs
                })

        return {
            "web_meeting_code": str(mid),
            "meeting_title": full_heading,
            "meeting_status": status,
            "committee_name": committee_name,
            "meeting_date": meeting_date,
            "meeting_time": meeting_time,
            "agenda_items": agenda_items
        }

    except Exception as e:
        return {"web_meeting_code": str(mid), "error": str(e)}

def run_scrape_batch(mids, cid, output_path, delay=(1.5, 3.5)):
    seen_ids = load_seen_ids(output_path)
    with open(output_path, "a", encoding="utf-8") as f:
        for mid in tqdm(mids, desc="Scraping meetings"):
            if str(mid) in seen_ids:
                continue
            data = scrape_meeting_metadata(mid, cid)
            if data:
                f.write(json.dumps(data, ensure_ascii=False) + "\n")
            time.sleep(random.uniform(*delay))

if __name__ == "__main__":
    run_scrape_batch(MIDS, COMMITTEE_ID, OUTPUT_FILE)


Scraping meetings: 100%|██████████| 1000/1000 [1:00:58<00:00,  3.66s/it]


In [69]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

# Load the scraped JSONL file
jsonl_path = "../data/meetings/meetings_metadata.jsonl"
meetings = []

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        try:
            meetings.append(json.loads(line))
        except:
            continue

# Flatten agenda items
flattened = []
for meeting in meetings:
    meeting_code = meeting.get("web_meeting_code", "")
    agenda_items = meeting.get("agenda_items", [])
    for item in agenda_items:
        item_text = item.get("item_text", "").strip()
        flattened.append({
            "meeting_code": meeting_code,
            "item_number": item.get("item_number", ""),
            "item_title": item.get("item_title", ""),
            "text": item_text,
            "word_count": len(item_text.split())
        })

# Convert to DataFrame for review
df_agenda = pd.DataFrame(flattened)

# Display to user
df_agenda.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9873 entries, 0 to 9872
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   meeting_code  9873 non-null   object
 1   item_number   9873 non-null   object
 2   item_title    9873 non-null   object
 3   text          9873 non-null   object
 4   word_count    9873 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 385.8+ KB


### Saving agenda items in chunks warehouse

### 📦 Clean and Export All Agenda Chunks

This code block loads the full council meeting metadata from `meetings_metadata.jsonl`, and generates a list of agenda chunks.

Each chunk includes:
- `chunk_id` (safe and unique)
- `meeting_code`, `meeting_date`, `committee_name`
- `item_number`, `item_title`, and full `text`
- `word_count` for reference

The output is saved to: 
data/chunks/minutes_cleaned/chunks.jsonl


This cleaned file can be used as the input for embedding or RAG indexing.

In [72]:
# Import missing regex module and re-run the cleaned chunk generation
import re
import hashlib
from pathlib import Path
import json

input_path = Path("../data/meetings/meetings_metadata.jsonl")
with open(input_path, "r", encoding="utf-8") as f:
    meetings = [json.loads(line) for line in f if line.strip()]

# Define output path again
output_folder = Path("../data/chunks/minutes/")
output_folder.mkdir(parents=True, exist_ok=True)
output_path = output_folder / "chunks.jsonl"

# Keywords to filter out low-value items
low_signal_keywords = [
    "apologies", "substitutes", "panel business",
    "motion to exclude", "minutes of the meeting",
    "future work programme", "webcast", "any other business"
]

# Prepare cleaned chunk records
cleaned_chunks = []
for meeting in meetings:
    meeting_code = meeting.get("web_meeting_code", "")
    meeting_date = meeting.get("meeting_date")
    committee_name = meeting.get("committee_name")
    agenda_items = meeting.get("agenda_items", [])

    for idx, item in enumerate(agenda_items):
        item_number = item.get("item_number", "").strip()
        item_title = item.get("item_title", "").strip()
        item_text = item.get("item_text", "").strip()
        word_count = len(item_text.split())

        # Safe chunk_id
        base_id = item_number if item_number else f"item{idx}"
        clean_id = re.sub(r"[^\w]+", "", base_id.upper()) or f"ID{hashlib.md5(item_title.encode()).hexdigest()[:6]}"
        chunk_id = f"{meeting_code}_{clean_id}"

        cleaned_chunks.append({
            "chunk_id": chunk_id,
            "meeting_code": meeting_code,
            "meeting_date": meeting_date,
            "committee_name": committee_name,
            "item_number": item_number,
            "item_title": item_title,
            "text": item_text,
            "word_count": word_count
        })

# Save cleaned chunks to JSONL
with open(output_path, "w", encoding="utf-8") as f:
    for chunk in cleaned_chunks:
        f.write(json.dumps(chunk, ensure_ascii=False) + "\n")

output_path.as_posix()

'../data/chunks/minutes/chunks.jsonl'

### Filtering out meaningless chunks for embedding

this is done in a script 4a.

### Scraping 1 page

In [None]:
test_result = scrape_meeting_metadata(mid=9502, cid="144")
test_result