# Scraping KCC Cabinet Issues and Decisions 

### Scraping Level 1 links

In [19]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
import json
import os
import time
from tqdm import tqdm
import re

# Scheme, host and port of the site; hrefs found in scraped pages are resolved against it.
BASE_URL = "https://democracy.kent.gov.uk:9071"
# JSON Lines file to which main() appends one record per scraped plan page.
OUTPUT_JSONL = "../data/jsons/level1_plans.jsonl"
# Plan listing pages (Level 1) that main() visits, in this order.
INPUT_URLS = [
    "https://democracy.kent.gov.uk:9071/mgListPlanItems.aspx?PlanId=632&RP=115",
    "https://democracy.kent.gov.uk:9071/mgListPlanItems.aspx?PlanId=629&RP=115",
    "https://democracy.kent.gov.uk:9071/mgListPlanItems.aspx?PlanId=628&RP=115"
]

def extract_plan_id(url):
    """Return the first ``PlanId`` query-string value of *url*.

    Returns ``None`` when the URL carries no ``PlanId`` parameter.
    """
    query_params = parse_qs(urlparse(url).query)
    plan_ids = query_params.get("PlanId", [None])
    return plan_ids[0]

def extract_plan_metadata(soup):
    """Read the plan title and date from the page's ``<meta>`` tags.

    Args:
        soup: Parsed page offering ``find("meta", attrs=...)``.

    Returns:
        A ``(title, date)`` tuple. ``title`` is the stripped ``content`` of
        the ``DC.title`` meta tag, or ``"Unknown Title"`` when that tag is
        missing; ``date`` is the ``content`` of the ``DC.date`` meta tag, or
        ``None`` when that tag is missing.
    """
    title_tag = soup.find("meta", attrs={"name": "DC.title"})
    date_tag = soup.find("meta", attrs={"name": "DC.date"})

    if title_tag:
        title = title_tag["content"].strip()
    else:
        title = "Unknown Title"

    if date_tag:
        date = date_tag["content"]
    else:
        date = None

    return title, date

def parse_issue_row(row):
    """Turn one listing row into an issue dict.

    The title and URL come from the row's link to ``mgIssueHistoryHome.aspx``
    (Level 2). The remaining fields are read from ``label: value`` text lines
    found anywhere inside the row.

    Args:
        row: Parsed element offering ``select_one`` and ``get_text``.

    Returns:
        A dict with the keys ``decision_code``, ``title``, ``url``,
        ``decision_maker``, ``decision_due``, ``lead_officer``, ``status``,
        ``first_published`` and ``restriction``; fields that were not found
        stay ``None``.
    """
    issue = dict.fromkeys((
        "decision_code",
        "title",
        "url",
        "decision_maker",
        "decision_due",
        "lead_officer",
        "status",
        "first_published",
        "restriction",
    ))

    # Link to the issue history page (Level 2).
    link = row.select_one("a[href*='mgIssueHistoryHome.aspx']")
    if link:
        issue["title"] = link.text.strip()
        issue["url"] = urljoin(BASE_URL, link["href"])
        # The closest preceding <span> is accepted as the decision code only
        # if it contains a "/" or a "-".
        span = link.find_previous("span")
        if span:
            candidate = span.get_text(strip=True)
            if any(sep in candidate for sep in "/-"):
                issue["decision_code"] = candidate

    # Ordered rules: the first predicate that matches the label wins, so
    # "decision maker" is tested before the generic "decision..." prefix.
    rules = (
        (lambda label: "decision maker" in label, "decision_maker"),
        (lambda label: label.startswith("decision"), "decision_due"),
        (lambda label: "lead officer" in label, "lead_officer"),
        (lambda label: "status" in label, "status"),
        (lambda label: "first published" in label, "first_published"),
        (lambda label: "restriction" in label, "restriction"),
    )

    for raw in row.get_text(separator="\n", strip=True).splitlines():
        label, colon, value = raw.strip().partition(":")
        if not colon:
            # Blank line or a line without a "label: value" shape.
            continue
        label = label.lower().strip()
        value = value.strip()
        for matches, field in rules:
            if matches(label):
                issue[field] = value
                break

    return issue

def scrape_plan_page(url):
    """Fetch one plan listing page and collect the issues it links to.

    Every anchor whose href contains ``mgIssueHistoryHome.aspx`` becomes one
    issue record. The anchor text supplies the title (and, when it starts
    with a ``NN/NNNNN`` pattern, the decision code); the other fields are read
    from ``label: value`` paragraphs near the anchor.

    Args:
        url: Address of the plan listing page.

    Returns:
        A dict with ``plan_id``, ``plan_title``, ``plan_date``, ``plan_url``
        and ``issues`` (a list of issue dicts), or ``None`` when the server
        does not answer with HTTP 200. Exceptions raised by ``requests.get``
        are not caught here.
    """
    res = requests.get(url, timeout=10)
    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.text, "html.parser")
    plan_id = extract_plan_id(url)
    plan_title, plan_date = extract_plan_metadata(soup)

    issues = []

    # One issue per link to a Level 2 (issue history) page.
    for anchor in soup.select("a[href*='mgIssueHistoryHome.aspx']"):
        issue = {
            "decision_code": None,
            "title": None,
            "url": None,
            "decision_maker": None,
            "decision_due": None,
            "lead_officer": None,
            "status": None,
            "first_published": None,
            "restriction": None
        }

        full_title = anchor.get_text(strip=True)
        issue["title"] = full_title

        # The decision code, when present, leads the title (e.g. "24/00012").
        match = re.match(r"(\d{2}/\d{5})", full_title)
        if match:
            issue["decision_code"] = match.group(1)

        issue["url"] = urljoin(BASE_URL, anchor["href"])

        # Climb the ancestors until one contains <p> elements, then take the
        # text of every <p> inside that ancestor.
        # NOTE(review): if that ancestor wraps several issues, paragraphs of
        # neighbouring issues are picked up too - confirm against the markup.
        context_block = []
        parent = anchor.find_parent()
        while parent and len(context_block) < 12:
            ps = parent.find_all("p")
            if ps:
                for p in ps:
                    text = p.get_text(" ", strip=True)
                    if text:
                        context_block.append(text)
                break
            parent = parent.find_parent()

        # Map "label: value" lines onto issue fields. "decision maker" is
        # tested before the generic "decision..." prefix, which feeds
        # decision_due.
        for line in context_block:
            if ":" not in line:
                continue
            key, val = line.split(":", 1)
            key = key.lower().strip()
            val = val.strip()

            if "decision maker" in key:
                issue["decision_maker"] = val
            elif key.startswith("decision"):
                issue["decision_due"] = val
            elif "lead officer" in key:
                issue["lead_officer"] = val
            elif "status" in key:
                issue["status"] = val
            elif "first published" in key:
                issue["first_published"] = val
            elif "restriction" in key:
                issue["restriction"] = val

        # An anchor with empty text yields an empty title; skip those.
        if issue["title"] and issue["url"]:
            issues.append(issue)

    return {
        "plan_id": plan_id,
        "plan_title": plan_title,
        "plan_date": plan_date,
        "plan_url": url,
        "issues": issues
    }

def load_existing_jsonl(path):
    """Collect the ``plan_url`` values already stored in a JSON Lines file.

    Args:
        path: Location of the JSON Lines file; it does not have to exist.

    Returns:
        A set holding the ``plan_url`` of every line that parses as a JSON
        object (``None`` for objects without that key). Lines that are not
        valid JSON, or that hold something other than an object, are skipped.
        The set is empty when the file does not exist.
    """
    seen = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Malformed or blank line: ignore it, but let unrelated
                    # errors (e.g. KeyboardInterrupt) propagate.
                    continue
                if isinstance(data, dict):
                    seen.add(data.get("plan_url"))
    return seen

def save_jsonl_entry(path, data):
    """Append *data* to the file at *path* as one JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written unescaped.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(data, ensure_ascii=False)
        sink.write(f"{serialized}\n")

def main():
    """Scrape every Level 1 plan page that is not yet in the output file.

    URLs already recorded in ``OUTPUT_JSONL`` are skipped. Each successfully
    scraped page is appended as one JSON line, followed by a two-second
    pause. A failure on one URL is reported and does not stop the run.
    """
    seen = load_existing_jsonl(OUTPUT_JSONL)

    # save_jsonl_entry() opens the file itself for every record, so no handle
    # is held open here; only make sure the target directory exists.
    os.makedirs(os.path.dirname(OUTPUT_JSONL) or ".", exist_ok=True)

    for url in tqdm(INPUT_URLS, desc="Scraping Level 1 plans"):
        if url in seen:
            continue
        try:
            record = scrape_plan_page(url)
            if record:
                save_jsonl_entry(OUTPUT_JSONL, record)
                # Remember the URL so a duplicate later in INPUT_URLS is
                # not scraped and written a second time.
                seen.add(url)
                time.sleep(2)
        except Exception as e:
            # Top-level boundary: report and carry on with the next URL.
            print(f"❌ Error with {url}: {e}")

    print(f"✅ Done. Results saved to {OUTPUT_JSONL}")

# Entry point: run the Level 1 scrape when this code is executed directly.
if __name__ == "__main__":
    main()

Scraping Level 1 plans: 100%|██████████| 3/3 [00:12<00:00,  4.09s/it]

✅ Done. Results saved to ../data/jsons/level1_plans.jsonl





### Scraping Level 2 links

In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
import json
import os
import time
from tqdm import tqdm

# Scheme, host and port of the site; hrefs found in scraped pages are resolved against it.
BASE_URL = "https://democracy.kent.gov.uk:9071"
# JSON Lines file to which main() appends one record per scraped issue history page.
OUTPUT_JSONL = "../data/jsons/level2_issues_full.jsonl"
# Issue history pages (Level 2) that main() visits, in this order.
INPUT_URLS = [
    "https://democracy.kent.gov.uk:9071/mgIssueHistoryHome.aspx?IId=68660",
    "https://democracy.kent.gov.uk:9071/mgIssueHistoryHome.aspx?IId=65693"
]

def extract_issue_id_from_url(url):
    """Return the first ``IId`` query-string value of *url*, or ``None``."""
    params = parse_qs(urlparse(url).query)
    if "IId" in params:
        return params["IId"][0]
    return None

def clean_narrative(soup):
    """Extract the narrative text and its links from ``div.mgContent``.

    Args:
        soup: Parsed page offering ``select_one``.

    Returns:
        A ``(text, links)`` tuple. ``text`` joins the non-empty text of every
        ``p``, ``ul``, ``ol``, ``li``, ``h2``, ``h3`` and ``strong`` element
        in the content div with blank lines; ``links`` is the sorted list of
        distinct absolute URLs of anchors inside those elements. Returns
        ``("", [])`` when the page has no ``div.mgContent``.

    NOTE(review): containers and their children are both selected (e.g. a
    ``ul`` and each ``li`` in it), so the same text presumably appears more
    than once in the result - confirm whether that is intended.
    """
    container = soup.select_one("div.mgContent")
    if not container:
        return "", []

    blocks = container.find_all(["p", "ul", "ol", "li", "h2", "h3", "strong"])

    chunks = [
        chunk
        for chunk in (block.get_text(" ", strip=True) for block in blocks)
        if chunk
    ]
    links = {
        urljoin(BASE_URL, anchor["href"])
        for block in blocks
        for anchor in block.find_all("a", href=True)
    }
    return "\n\n".join(chunks), sorted(links)

def scrape_issue_history_page(url):
    """Fetch one issue history page (Level 2) and build its record.

    Args:
        url: Address of the issue history page.

    Returns:
        A dict with ``issue_id``, ``decision_code``, ``title``,
        ``description``, ``created``, ``modified``, ``subjects``,
        ``narrative``, ``narrative_links``, ``decision_links`` and
        ``page_url``, or ``None`` when the server does not answer with
        HTTP 200.
    """
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return None

    page = BeautifulSoup(response.text, "html.parser")

    def meta_content(name):
        # ``content`` of the named <meta> tag, or None when tag or attribute is absent.
        tag = page.find("meta", attrs={"name": name})
        return (tag or {}).get("content", None)

    # DC.title has the form "<decision code> - <title>" when a code is present.
    title_tag = page.find("meta", attrs={"name": "DC.title"})
    if title_tag:
        raw_title = title_tag["content"].strip()
    else:
        raw_title = "Unknown Title"

    code_part, dash, title_part = raw_title.partition(" - ")
    if dash:
        decision_code, title = code_part.strip(), title_part.strip()
    else:
        decision_code, title = None, raw_title

    description = meta_content("DC.description")
    created = meta_content("DC.date.created")
    modified = meta_content("DC.date.modified")
    subjects = [
        tag["content"]
        for tag in page.find_all("meta", attrs={"name": "DC.subject"})
    ]

    narrative_text, narrative_links = clean_narrative(page)

    # Links to decision detail pages, keeping only those that carry an Id.
    decision_links = []
    for anchor in page.select("a[href*='ieDecisionDetails.aspx?Id=']"):
        target = anchor.get("href")
        absolute = urljoin(BASE_URL, target)
        ids = parse_qs(urlparse(target).query).get("Id", [None])
        if ids[0]:
            decision_links.append({"decision_id": ids[0], "url": absolute})

    return {
        "issue_id": extract_issue_id_from_url(url),
        "decision_code": decision_code,
        "title": title,
        "description": description,
        "created": created,
        "modified": modified,
        "subjects": subjects,
        "narrative": narrative_text,
        "narrative_links": narrative_links,
        "decision_links": decision_links,
        "page_url": url,
    }

def load_existing_jsonl(path):
    """Collect the ``page_url`` values already stored in a JSON Lines file.

    Args:
        path: Location of the JSON Lines file; it does not have to exist.

    Returns:
        A set holding the ``page_url`` of every line that parses as a JSON
        object (``None`` for objects without that key). Lines that are not
        valid JSON, or that hold something other than an object, are skipped.
        The set is empty when the file does not exist.
    """
    seen = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Skip unparseable lines only; anything else (such as
                    # KeyboardInterrupt) must not be silenced.
                    continue
                if isinstance(data, dict):
                    seen.add(data.get("page_url"))
    return seen

def save_jsonl_entry(path, data):
    """Serialize *data* as JSON and append it as a single line to *path*.

    The file is opened in append mode as UTF-8; ``ensure_ascii=False`` keeps
    non-ASCII characters readable in the output.
    """
    with open(path, mode="a", encoding="utf-8") as handle:
        record_line = json.dumps(data, ensure_ascii=False)
        handle.write(f"{record_line}\n")

def main():
    """Scrape every Level 2 issue history page not yet in the output file.

    URLs already recorded in ``OUTPUT_JSONL`` are skipped. Each successfully
    scraped page is appended as one JSON line, followed by a two-second
    pause. A failure on one URL is reported and does not stop the run.
    """
    seen = load_existing_jsonl(OUTPUT_JSONL)

    # save_jsonl_entry() opens the file itself for every record, so no handle
    # is held open here; only make sure the target directory exists.
    os.makedirs(os.path.dirname(OUTPUT_JSONL) or ".", exist_ok=True)

    for url in tqdm(INPUT_URLS, desc="Scraping Level 2 (full)"):
        if url in seen:
            continue
        try:
            record = scrape_issue_history_page(url)
            if record:
                save_jsonl_entry(OUTPUT_JSONL, record)
                # Remember the URL so a duplicate later in INPUT_URLS is
                # not scraped and written a second time.
                seen.add(url)
                time.sleep(2)
        except Exception as e:
            # Top-level boundary: report and carry on with the next URL.
            print(f"❌ Error with {url}: {e}")

    print(f"✅ Done. Results saved to {OUTPUT_JSONL}")

# Entry point: run the Level 2 scrape when this code is executed directly.
if __name__ == "__main__":
    main()

Scraping Level 2 (full): 100%|██████████| 2/2 [00:06<00:00,  3.47s/it]

✅ Done. Results saved to ../data/jsons/level2_issues_full.jsonl





### Scraping Level 4 links

In [5]:

# Pages handed to main() for the Level 4 scrape. URLs containing
# "ieDecisionDetails.aspx" are skipped by scrape_issue_page().
INPUT_URLS = [
    "https://democracy.kent.gov.uk:9071/ieIssueDetails.aspx?IId=52995",
    "https://democracy.kent.gov.uk:9071/ieIssueDetails.aspx?IId=52840",
    "https://democracy.kent.gov.uk:9071/ieDecisionDetails.aspx?Id=2917",
    "https://democracy.kent.gov.uk:9071/ieIssueDetails.aspx?IId=67560&PlanId=0&Opt=3#AI66632"
]

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
import json
import os
import time
from tqdm import tqdm

# Scheme, host and port of the site; document hrefs are resolved against it.
BASE_URL = "https://democracy.kent.gov.uk:9071"
# JSON Lines file to which main() appends one record per scraped issue page.
OUTPUT_JSONL = "../data/jsons/level4_issues.jsonl"


def classify_doc_type(filename):
    """Label a document by keywords found in its file name.

    Matching is case-insensitive and the first rule that applies wins:
    both "decision" and "record" -> "Record of Decision"; "eqia" ->
    "Equality Impact Assessment"; "appendix" -> "Appendix"; "report" ->
    "Decision Report"; anything else -> "Other".
    """
    name = filename.lower()
    # (keywords that must all be present, resulting label), in priority order.
    rules = (
        (("decision", "record"), "Record of Decision"),
        (("eqia",), "Equality Impact Assessment"),
        (("appendix",), "Appendix"),
        (("report",), "Decision Report"),
    )
    for keywords, label in rules:
        if all(keyword in name for keyword in keywords):
            return label
    return "Other"

def extract_issue_id_from_url(url):
    """Return the first ``IId`` query-string value of *url*, or ``None`` if absent."""
    issue_ids = parse_qs(urlparse(url).query).get("IId")
    return issue_ids[0] if issue_ids else None

def scrape_issue_page(url):
    """Fetch one issue details page (Level 4) and list its documents.

    Args:
        url: Address of the issue details page.

    Returns:
        A dict with ``decision_code``, ``title``, ``issue_id``, ``page_url``,
        ``created``, ``subjects`` and ``documents`` (one entry per link whose
        href contains ``/documents/``). Returns ``None`` for URLs containing
        ``ieDecisionDetails.aspx`` (reported as Level 3 and skipped) and when
        the server does not answer with HTTP 200.
    """
    # Decision detail pages are not processed by this function.
    if "ieDecisionDetails.aspx" in url:
        print(f"⚠️ Skipping Level 3 page: {url}")
        return None

    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return None

    page = BeautifulSoup(response.text, "html.parser")

    # Prefer the DC.title meta tag; otherwise fall back to the page <title>.
    title_meta = page.find("meta", attrs={"name": "DC.title"})
    if title_meta:
        raw_title = title_meta["content"].strip()
    else:
        raw_title = page.title.text.strip()

    # "<decision code> - <title>" when a code is present.
    code_part, dash, title_part = raw_title.partition(" - ")
    if dash:
        decision_code, title = code_part.strip(), title_part.strip()
    else:
        decision_code, title = None, raw_title

    created_meta = page.find("meta", attrs={"name": "DC.date.created"})
    created = created_meta["content"] if created_meta else None
    subjects = [
        tag["content"]
        for tag in page.find_all("meta", attrs={"name": "DC.subject"})
    ]

    # Every link into /documents/ is recorded with a type guessed from its name.
    documents = []
    for link in page.select("a[href*='/documents/']"):
        target = link.get("href")
        name = os.path.basename(target)
        documents.append({
            "filename": name,
            "doc_type": classify_doc_type(name),
            "url": urljoin(BASE_URL, target),
        })

    return {
        "decision_code": decision_code,
        "title": title,
        "issue_id": extract_issue_id_from_url(url),
        "page_url": url,
        "created": created,
        "subjects": subjects,
        "documents": documents,
    }

def load_existing_jsonl(path):
    """Collect the ``page_url`` values already stored in a JSON Lines file.

    Args:
        path: Location of the JSON Lines file; it does not have to exist.

    Returns:
        A set holding the ``page_url`` of every line that parses as a JSON
        object (``None`` for objects without that key). Lines that are not
        valid JSON, or that hold something other than an object, are skipped.
        The set is empty when the file does not exist.
    """
    seen = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Tolerate corrupt or blank lines without hiding
                    # unrelated errors such as KeyboardInterrupt.
                    continue
                if isinstance(data, dict):
                    seen.add(data.get("page_url"))
    return seen

def save_jsonl_entry(path, data):
    """Append one JSON-encoded line for *data* to the file at *path*.

    Uses append mode and UTF-8; non-ASCII characters are not escaped.
    """
    with open(path, mode="a", encoding="utf-8") as out_file:
        encoded = json.dumps(data, ensure_ascii=False)
        out_file.write(f"{encoded}\n")

def main():
    """Scrape every Level 4 issue page that is not yet in the output file.

    URLs already recorded in ``OUTPUT_JSONL`` are skipped. Each successfully
    scraped page is appended as one JSON line, followed by a two-second
    pause. A failure on one URL is reported and does not stop the run.
    """
    seen = load_existing_jsonl(OUTPUT_JSONL)

    # save_jsonl_entry() opens the file itself for every record, so no handle
    # is held open here; only make sure the target directory exists.
    os.makedirs(os.path.dirname(OUTPUT_JSONL) or ".", exist_ok=True)

    for url in tqdm(INPUT_URLS, desc="Scraping Level 4 issues"):
        if url in seen:
            continue
        try:
            record = scrape_issue_page(url)
            if record:
                save_jsonl_entry(OUTPUT_JSONL, record)
                # Remember the URL so a duplicate later in INPUT_URLS is
                # not scraped and written a second time.
                seen.add(url)
                time.sleep(2)
        except Exception as e:
            # Top-level boundary: report and carry on with the next URL.
            print(f"❌ Error with {url}: {e}")

    print(f"✅ Done. Results saved to {OUTPUT_JSONL}")

# Entry point: run the Level 4 scrape when this code is executed directly.
if __name__ == "__main__":
    main()

Scraping Level 4 issues:   0%|          | 0/4 [00:00<?, ?it/s]

Scraping Level 4 issues:  50%|█████     | 2/4 [00:00<00:00,  4.57it/s]

⚠️ Skipping Level 3 page: https://democracy.kent.gov.uk:9071/ieDecisionDetails.aspx?Id=2917


Scraping Level 4 issues: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]

✅ Done. Results saved to ../data/jsons/level4_issues.jsonl



