### Code to scrape 

In [1]:
mother_urls = [
    "https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=113&Year=0"  # full council
]
topic='full_council'

In [40]:
mother_urls = ["https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=115&Year=0"
               ]
topic = 'cabinet'

In [11]:
mother_urls = ["https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=128&Year=0"
               ]
topic = 'pension'

In [None]:
mother_urls = ["https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=896&Year=0"
               ]
topic = 'asc'

### Final recovered version

In [41]:
import os
import re
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime

# === SETTINGS ===
download_base = f"/Users/lgfolder/github/council-assistant/data/council_documents/{topic}"
base_url = "https://democracy.kent.gov.uk/"


# === Unknown date tracker ===
unknown_date_counter = 0
def assign_unknown_folder():
    global unknown_date_counter
    unknown_date_counter += 1
    return f"unknown-{unknown_date_counter}"

# === FUNCTION: Extract meeting date from page ===
def extract_meeting_date(soup):
    text = soup.get_text(" ", strip=True)

    # Pattern 1: "Thursday, 14 September 2023"
    match1 = re.search(
        r"(?:Thursday|Tuesday|Monday|Wednesday|Friday),\s+(\d{1,2})(?:st|nd|rd|th)?(?:,)?\s+([A-Za-z]+)(?:,)?\s+(20\d{2})",
        text
    )

    # Pattern 2: "Meeting of County Council held on Thursday, 17 October 2019 at 10.00 am"
    match2 = re.search(
        r"held on (?:Thursday|Tuesday|Monday|Wednesday|Friday),\s+(\d{1,2})\s+([A-Za-z]+)\s+(20\d{2})",
        text
    )

    for match in [match1, match2]:
        if match:
            day, month, year = match.groups()
            raw_date = f"{day} {month} {year}"
            try:
                dt = datetime.strptime(raw_date, "%d %B %Y")
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                print(f"  ⚠️ Date parsing error: {raw_date}")

    return None

# === FUNCTION: Categorise document based on filename ===
def get_document_category(filename):
    lower = filename.lower()

    if "agenda" in lower and "front" in lower:
        return "agenda_frontsheet"
    elif "agenda" in lower or "additional agenda" in lower or "agenda item" in lower:
        return "agenda"
    elif "printed minutes" in lower or "cpp minutes" in lower or "minutes of previous" in lower:
        return "minutes"
    elif "minutes" in lower:
        return "minutes"
    elif "questions put" in lower or "answers to questions" in lower or "q&a" in lower:
        return "questions"
    elif "appendix" in lower or "annex" in lower:
        return "appendix"
    elif "motion" in lower or "mtld" in lower:
        return "motion"
    elif "amendment" in lower:
        return "amendment"
    elif "budget" in lower or "revenue plan" in lower:
        return "budget"
    elif "report" in lower or "covering report" in lower or "update" in lower:
        return "report"
    elif "response" in lower or "decision" in lower or "record of decision" in lower:
        return "decision_response"
    elif "strategy" in lower or "investment strategy" in lower or "capital strategy" in lower:
        return "strategy"
    elif "plan" in lower or "local plan" in lower or "delivery plan" in lower:
        return "plan"
    elif "policy" in lower or "statement" in lower:
        return "policy"
    elif "consultation" in lower:
        return "consultation"
    elif "performance" in lower or "quarterly performance" in lower or "qpr" in lower:
        return "performance"
    elif "terms of reference" in lower or "tor" in lower:
        return "terms_of_reference"
    elif "glossary" in lower or "note" in lower or "you said we did" in lower:
        return "supporting_material"
    else:
        return "other"

# === FUNCTION: Download file and collect metadata ===
def download_pdf_and_record_metadata(pdf_url, destination_folder, meeting_date, seen_urls):
    parsed = urlparse(pdf_url)

    # Skip if hostname is missing or clearly internal
    if not parsed.hostname or "kent.gov.uk" not in parsed.hostname:
        print(f"  ⚠️ Skipping invalid or internal link: {pdf_url}")
        return None

    # Skip if URL already processed for this meeting
    if pdf_url in seen_urls:
        print(f"    🔁 Skipping duplicate URL: {pdf_url}")
        return None
    seen_urls.add(pdf_url)

    # Handle duplicate filenames by appending _1, _2, etc.
    filename = os.path.basename(parsed.path)
    original_name = filename
    counter = 1
    while os.path.exists(os.path.join(destination_folder, "originals", filename)):
        filename_parts = os.path.splitext(original_name)
        filename = f"{filename_parts[0]}_{counter}{filename_parts[1]}"
        counter += 1

    path_rel = os.path.join("originals", filename)
    full_path = os.path.join(destination_folder, path_rel)

    # Download file if not already saved
    if not os.path.exists(full_path):
        print(f"    ⬇️ Downloading: {filename}")
        try:
            response = requests.get(pdf_url, timeout=20)
            response.raise_for_status()
        except Exception as e:
            print(f"  ❌ Failed to download {pdf_url}: {e}")
            return None  # Skip this entry on error

        with open(full_path, "wb") as f:
            f.write(response.content)
    else:
        print(f"    ⏩ Skipped (already exists): {filename}")

    # Return metadata
    return {
        "filename": filename,
        "path": path_rel,
        "type": "pdf",
        "committee": topic,
        "meeting_date": meeting_date,
        "document_category": get_document_category(filename),
        "url": pdf_url,
        "created": datetime.now().isoformat()
    }

# === MAIN SCRAPER LOOP ===
for mother_url in mother_urls:
    print(f"\n🔎 Loading mother page: {mother_url}")
    resp = requests.get(mother_url)
    soup = BeautifulSoup(resp.text, "html.parser")

    child_links = [
        urljoin(base_url, a["href"])
        for a in soup.select("a[href*='ieListDocuments.aspx']")
    ]

    print(f"  ➤ Found {len(child_links)} child pages.")

    for child_url in child_links:
        print(f"\n📄 Scraping child page: {child_url}")
        child_resp = requests.get(child_url)
        child_soup = BeautifulSoup(child_resp.text, "html.parser")

        meeting_date = extract_meeting_date(child_soup)
        if not meeting_date:
            meeting_date = assign_unknown_folder()

        meeting_folder = os.path.join(download_base, meeting_date)
        originals_folder = os.path.join(meeting_folder, "originals")
        os.makedirs(originals_folder, exist_ok=True)

        metadata = []
        seen_urls = set()  # Track URLs to avoid duplicates

        for a in child_soup.select("a[href]"):
            href = a['href']
            if '.pdf' in href.lower():
                pdf_url = urljoin(base_url, href)
                meta_entry = download_pdf_and_record_metadata(
                    pdf_url, meeting_folder, meeting_date, seen_urls
                )
                if meta_entry:
                    metadata.append(meta_entry)

        # === GRANDCHILD LINKS ===
        grandchild_links = [
            urljoin(base_url, a["href"])
            for a in child_soup.select("a")
            if "View the full list of documents" in a.get_text()
        ]

        for g_url in grandchild_links:
            print(f"    ↪️ Scraping grandchild: {g_url}")
            g_resp = requests.get(g_url)
            g_soup = BeautifulSoup(g_resp.text, "html.parser")

            g_date = extract_meeting_date(g_soup) or assign_unknown_folder()
            g_folder = os.path.join(download_base, g_date)
            g_originals = os.path.join(g_folder, "originals")
            os.makedirs(g_originals, exist_ok=True)

            seen_urls = set()  # Reset for each grandchild

            for a in g_soup.select("a[href]"):
                href = a['href']
                if '.pdf' in href.lower():
                    pdf_url = urljoin(base_url, href)
                    meta_entry = download_pdf_and_record_metadata(
                        pdf_url, g_folder, g_date, seen_urls
                    )
                    if meta_entry:
                        metadata.append(meta_entry)

        # === WRITE OR APPEND METADATA FILE (with deduplication) ===
        metadata_path = os.path.join(meeting_folder, "metadata.json")
        
        # Load existing metadata if present
        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                existing_metadata = json.load(f)
        except FileNotFoundError:
            existing_metadata = []

        # Remove None entries from new metadata
        metadata = [entry for entry in metadata if entry is not None]

        # Combine and deduplicate by URL
        combined = {entry["url"]: entry for entry in existing_metadata + metadata}
        metadata = list(combined.values())

        # Save updated metadata
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"  🗂️ Metadata updated at {metadata_path}")


🔎 Loading mother page: https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=115&Year=0
  ➤ Found 15 child pages.

📄 Scraping child page: https://democracy.kent.gov.uk/ieListDocuments.aspx?CId=115&MId=9471&Ver=4
  🗂️ Metadata updated at /Users/lgfolder/github/council-assistant/data/council_documents/cabinet/2025-06-26/metadata.json

📄 Scraping child page: https://democracy.kent.gov.uk/ieListDocuments.aspx?CId=115&MId=9472&Ver=4
  🗂️ Metadata updated at /Users/lgfolder/github/council-assistant/data/council_documents/cabinet/2025-06-05/metadata.json

📄 Scraping child page: https://democracy.kent.gov.uk/ieListDocuments.aspx?CId=115&MId=9761&Ver=4
    ⬇️ Downloading: Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf
    ⬇️ Downloading: Public reports pack 13th-Mar-2025 15.00 Cabinet.pdf
  🗂️ Metadata updated at /Users/lgfolder/github/council-assistant/data/council_documents/cabinet/2025-03-13/metadata.json

📄 Scraping child page: https://democracy.kent.gov.uk/ieListDocuments.aspx?CId=115&

KeyboardInterrupt: 

### Auduting

Your auditing and cleaning code is already quite solid and well thought out. It does four important things:
	1.	Checks for missing metadata.
	2.	Deduplicates entries in metadata.json.
	3.	Compares recorded metadata with actual PDFs in the originals/ folder.
	4.	Deletes truly empty folders and quarantines folders with discrepancies.

That said, here’s a refined and optimized version with improved structure, additional safety checks, and better separation of concerns. It avoids unnecessary memory usage and gives you a clean foundation for scaling:

In [7]:
from pathlib import Path
from difflib import SequenceMatcher
from collections import defaultdict
import os
import json
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

base_folder = Path("../data/council_documents")
deleted_files = []
renamed_files = []
keep_set = set()

for folder in base_folder.rglob("*/originals"):
    pdfs = list(folder.glob("*.pdf"))
    size_map = defaultdict(list)

    metadata_path = folder.parent / "metadata.json"
    url_lookup = {}
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            try:
                metadata = json.load(f)
                for entry in metadata:
                    url_lookup[entry.get("filename")] = entry.get("url")
            except:
                pass

    for f in pdfs:
        size_map[f.stat().st_size].append(f)

    for size, files in size_map.items():
        if len(files) > 1:
            checked_pairs = set()
            for i, f1 in enumerate(files):
                for f2 in files[i + 1:]:
                    pair_key = tuple(sorted([f1.name, f2.name]))
                    if pair_key in checked_pairs:
                        continue
                    checked_pairs.add(pair_key)

                    name1, name2 = f1.name, f2.name
                    ratio = SequenceMatcher(None, name1, name2).ratio()
                    if ratio > 0.85:
                        url1 = url_lookup.get(name1)
                        url2 = url_lookup.get(name2)

                        # Skip if either file is already marked to keep
                        if f1 in keep_set or f2 in keep_set:
                            continue

                        def ends_with_suffix(name):
                            return any(name.rstrip(".pdf").endswith(f"_{n}") for n in range(1, 10))

                        # Case 1: One has a URL, the other doesn't
                        if url1 and not url2:
                            try:
                                os.remove(f2)
                                deleted_files.append((str(folder), name2))
                                keep_set.add(f1)
                                if ends_with_suffix(name1):
                                    new_name = name1.replace(f"_{name1.split('_')[-1]}", ".pdf")
                                    new_path = f1.with_name(new_name)
                                    if not new_path.exists():
                                        try:
                                            f1.rename(new_path)
                                            renamed_files.append((str(folder), name1, new_name))
                                            keep_set.discard(f1)
                                            keep_set.add(new_path)
                                        except FileNotFoundError:
                                            pass
                            except FileNotFoundError:
                                pass

                        elif url2 and not url1:
                            try:
                                os.remove(f1)
                                deleted_files.append((str(folder), name1))
                                keep_set.add(f2)
                                if ends_with_suffix(name2):
                                    new_name = name2.replace(f"_{name2.split('_')[-1]}", ".pdf")
                                    new_path = f2.with_name(new_name)
                                    if not new_path.exists():
                                        try:
                                            f2.rename(new_path)
                                            renamed_files.append((str(folder), name2, new_name))
                                            keep_set.discard(f2)
                                            keep_set.add(new_path)
                                        except FileNotFoundError:
                                            pass
                            except FileNotFoundError:
                                pass

                        # Case 2: Both have URLs and they are equal → keep cleaner name
                        elif url1 == url2 and url1 is not None:
                            if ends_with_suffix(name1):
                                try:
                                    os.remove(f1)
                                    deleted_files.append((str(folder), name1))
                                    keep_set.add(f2)
                                except FileNotFoundError:
                                    pass
                            elif ends_with_suffix(name2):
                                try:
                                    os.remove(f2)
                                    deleted_files.append((str(folder), name2))
                                    keep_set.add(f1)
                                except FileNotFoundError:
                                    pass

print(f"✅ Deleted {len(deleted_files)} files and renamed {len(renamed_files)} files.")
# pd.DataFrame(deleted_files, columns=["Folder", "Deleted File"]).head(20)

✅ Deleted 18 files and renamed 18 files.


In [8]:
import os
import json

# === CONFIGURATION ===
base_path = "/Users/lgfolder/github/council-assistant/data/council_documents/"

# === TRACKING ===
quarantine_report = []

def load_metadata(metadata_path):
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data if isinstance(data, list) else []
    except Exception:
        return []

def deduplicate_metadata(metadata):
    seen = set()
    deduped = []
    for entry in metadata:
        if isinstance(entry, dict):
            fn = entry.get("filename")
            if fn and fn not in seen:
                seen.add(fn)
                deduped.append(entry)
    return deduped

def audit_folder(folder_name):
    folder_path = os.path.join(base_path, folder_name)
    if not os.path.isdir(folder_path) or folder_name == "quarantine":
        return

    metadata_path = os.path.join(folder_path, "metadata.json")
    originals_path = os.path.join(folder_path, "originals")

    metadata = load_metadata(metadata_path)
    metadata = deduplicate_metadata(metadata)

    # Re-save deduplicated metadata if needed
    if metadata:
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

    metadata_filenames = {entry["filename"] for entry in metadata if "filename" in entry}
    actual_files = set()

    if os.path.exists(originals_path):
        for f in os.listdir(originals_path):
            if f.lower().endswith(".pdf"):
                actual_files.add(f)

    # Report files that exist but are missing metadata entries
    missing_from_metadata = actual_files - metadata_filenames
    if missing_from_metadata:
        quarantine_report.append({
            "folder": folder_name,
            "missing_files": sorted(missing_from_metadata)
        })

# === MAIN LOOP ===
for folder in sorted(os.listdir(base_path)):
    audit_folder(folder)

# === REPORT ===
if quarantine_report:
    print(f"\n📋 Found {len(quarantine_report)} folders with missing metadata:")
    for entry in quarantine_report:
        print(f"\n📅 {entry['folder']}")
        for fname in entry["missing_files"]:
            print(f"   🔍 {fname}")
else:
    print("✅ All folders are consistent with metadata.")

✅ All folders are consistent with metadata.
