# Cleaning Post Scraping NOtebook

### Correcting URLs in metadata

1.  Walks through every file matching ../data/council_documents/*/*/metadata.json
2.  For each entry inside the metadata.json:
	•	Checks if the "url" key exists and contains a space
	•	If so:
	•	Encodes the URL safely using urllib.parse.quote
	•	Preserves the query string (?T=9, etc.)
	•	Sets the updated value back to entry["url"]
3.	If any URLs were changed in that file:
	•	It overwrites the entire metadata.json with the updated entries using:json.dump(data, f, indent=2)
4.	Logs which files were updated.

Careful: script modifies the file contents on disk

In [20]:
import json
from pathlib import Path
from urllib.parse import quote

# Set base path
base_path = Path("../data/council_documents")
metadata_paths = list(base_path.glob("*/*/metadata.json"))
print(f"🔍 Found {len(metadata_paths)} metadata.json files")

for meta_file in metadata_paths:
    try:
        with meta_file.open("r", encoding="utf-8") as f:
            data = json.load(f)

        modified = False
        for entry in data:
            url = entry.get("url")
            if url and " " in url:
                # Split to preserve any query string
                if "?" in url:
                    url_base, query = url.split("?", 1)
                    url_base_encoded = quote(url_base, safe="/:")
                    entry["url"] = f"{url_base_encoded}?{query}"
                else:
                    entry["url"] = quote(url, safe="/:")
                modified = True

        if modified:
            with meta_file.open("w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            print(f"✅ Updated: {meta_file}")

    except Exception as e:
        print(f"⚠️ Error processing {meta_file}: {e}")

🔍 Found 12 metadata.json files
✅ Updated: ../data/council_documents/cabinet/2025-03-13/metadata.json
✅ Updated: ../data/council_documents/cabinet/2025-01-30/metadata.json
✅ Updated: ../data/council_documents/cabinet/2025-01-09/metadata.json
✅ Updated: ../data/council_documents/cabinet/2025-03-04/metadata.json
✅ Updated: ../data/council_documents/full_council/2025-03-13/metadata.json
✅ Updated: ../data/council_documents/full_council/2025-02-13/metadata.json
✅ Updated: ../data/council_documents/full_council/2025-01-09/metadata.json


### Update and Overwrite "path" in All metadata.json Files (to make them unique)

What It Does:
* Rebuilds the "path" field using: committee/meeting_date/originals/filename
* Only overwrites the file if changes were made
* Uses json.dump(..., indent=2, ensure_ascii=False) for human-readable formatting

In [21]:
from pathlib import Path
import json

base_dir = Path("../data/council_documents")
metadata_files = list(base_dir.rglob("metadata.json"))

for meta_path in metadata_files:
    with meta_path.open("r", encoding="utf-8") as f:
        metadata = json.load(f)

    modified = False
    for entry in metadata:
        committee = entry.get("committee")
        date = entry.get("meeting_date")
        filename = entry.get("filename")

        if committee and date and filename:
            new_path = f"{committee}/{date}/originals/{filename}"
            if entry.get("path") != new_path:
                entry["path"] = new_path
                modified = True

    if modified:
        with meta_path.open("w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"✅ Updated: {meta_path}")
    else:
        print(f"➖ No changes: {meta_path}")

✅ Updated: ../data/council_documents/cabinet/2025-03-13/metadata.json
✅ Updated: ../data/council_documents/cabinet/2025-01-30/metadata.json
➖ No changes: ../data/council_documents/cabinet/2025-06-26/metadata.json
✅ Updated: ../data/council_documents/cabinet/2025-01-09/metadata.json
➖ No changes: ../data/council_documents/cabinet/2025-06-05/metadata.json
✅ Updated: ../data/council_documents/cabinet/2025-03-04/metadata.json
✅ Updated: ../data/council_documents/full_council/2025-03-13/metadata.json
✅ Updated: ../data/council_documents/full_council/2025-02-13/metadata.json
➖ No changes: ../data/council_documents/full_council/2025-02-24/metadata.json
➖ No changes: ../data/council_documents/full_council/2025-07-10/metadata.json
✅ Updated: ../data/council_documents/full_council/2025-01-09/metadata.json
➖ No changes: ../data/council_documents/full_council/2025-05-22/metadata.json


### Identifying and relocating Public Packs

In [22]:
#relocate_public_packs.py

import os
import shutil
import json
from PyPDF2 import PdfReader
from pathlib import Path

def relocate_public_packs(base_folder="../data/council_documents"):
    base_path = Path(base_folder)

    for committee in os.listdir(base_path):
        committee_path = base_path / committee
        if not committee_path.is_dir():
            continue

        for meeting_date in os.listdir(committee_path):
            meeting_path = committee_path / meeting_date
            originals_path = meeting_path / "originals"
            public_packs_path = meeting_path / "public_packs"
            metadata_path = meeting_path / "metadata.json"

            if not originals_path.exists() or not metadata_path.exists():
                continue

            with metadata_path.open("r", encoding="utf-8") as f:
                try:
                    metadata = json.load(f)
                except json.JSONDecodeError:
                    print(f"⚠️ Skipping bad metadata: {metadata_path}")
                    continue

            moved_filenames = []

            for filename in os.listdir(originals_path):
                if not filename.endswith(".pdf"):
                    continue

                file_path = originals_path / filename

                try:
                    reader = PdfReader(str(file_path))
                    title = reader.metadata.get("/Title", "")
                    if "(Public Pack)" in title:
                        os.makedirs(public_packs_path, exist_ok=True)
                        dest_path = public_packs_path / filename
                        shutil.move(str(file_path), str(dest_path))
                        moved_filenames.append(filename)
                        print(f"✅ Moved public pack: {file_path} → {dest_path}")
                except Exception as e:
                    print(f"⚠️ Could not process {file_path}: {e}")

            # Update metadata
            modified = False
            for entry in metadata:
                if entry.get("filename") in moved_filenames:
                    entry["path"] = f"{committee}/{meeting_date}/public_packs/{entry['filename']}"
                    modified = True

            if modified:
                with metadata_path.open("w", encoding="utf-8") as f:
                    json.dump(metadata, f, indent=2, ensure_ascii=False)
                print(f"📝 Metadata updated: {metadata_path}")

relocate_public_packs()

✅ Moved public pack: ../data/council_documents/cabinet/2025-03-13/originals/Public reports pack 13th-Mar-2025 15.00 Cabinet.pdf → ../data/council_documents/cabinet/2025-03-13/public_packs/Public reports pack 13th-Mar-2025 15.00 Cabinet.pdf
📝 Metadata updated: ../data/council_documents/cabinet/2025-03-13/metadata.json
✅ Moved public pack: ../data/council_documents/cabinet/2025-01-30/originals/24-00096 - PROD 30th-Jan-2025 10.00 Cabinet.pdf → ../data/council_documents/cabinet/2025-01-30/public_packs/24-00096 - PROD 30th-Jan-2025 10.00 Cabinet.pdf
✅ Moved public pack: ../data/council_documents/cabinet/2025-01-30/originals/Item 7 - Supplementary - 2400109 - Transfer the 18-25 section of the Strengthening Independence Ser.pdf → ../data/council_documents/cabinet/2025-01-30/public_packs/Item 7 - Supplementary - 2400109 - Transfer the 18-25 section of the Strengthening Independence Ser.pdf
📝 Metadata updated: ../data/council_documents/cabinet/2025-01-30/metadata.json
✅ Moved public pack: ../da

### Hashing and removing duplicates

In [23]:
import hashlib
import json
import shutil
from pathlib import Path

base_dir = Path("../data/council_documents")
duplicates_dir = base_dir / "duplicates"
duplicates_dir.mkdir(exist_ok=True)

def hash_pdf(file_path):
    with file_path.open("rb") as f:
        return hashlib.sha256(f.read()).hexdigest()

def clean_filename_priority(name):
    """Return True if filename does NOT end with _1, _2, etc."""
    return not any(name.stem.endswith(f"_{i}") for i in range(1, 10))

# Process each committee/meeting/originals folder
for originals_dir in base_dir.rglob("originals"):
    if not originals_dir.is_dir():
        continue

    metadata_path = originals_dir.parent / "metadata.json"
    if not metadata_path.exists():
        print(f"⚠️ No metadata found: {metadata_path}")
        continue

    with metadata_path.open("r", encoding="utf-8") as f:
        try:
            metadata = json.load(f)
        except json.JSONDecodeError:
            print(f"❌ Could not parse metadata: {metadata_path}")
            continue

    # Build lookup: filename ➝ metadata entry
    file_lookup = {entry["filename"]: entry for entry in metadata if entry.get("filename")}
    updated_metadata = []

    # Group files by hash
    hash_groups = {}
    for pdf_path in originals_dir.glob("*.pdf"):
        file_hash = hash_pdf(pdf_path)
        hash_groups.setdefault(file_hash, []).append(pdf_path)

    for file_hash, paths in hash_groups.items():
        if len(paths) == 1:
            # Keep as-is, just add hash
            path = paths[0]
            entry = file_lookup.get(path.name)
            if entry:
                entry["hash"] = file_hash
                updated_metadata.append(entry)
            continue

        # Choose file to keep
        candidates = []
        for path in paths:
            entry = file_lookup.get(path.name)
            if entry:
                has_url = bool(entry.get("url"))
                is_clean = clean_filename_priority(path)
                candidates.append((has_url, is_clean, path, entry))

        candidates.sort(reverse=True)  # highest priority first
        keep_path, keep_entry = candidates[0][2], candidates[0][3]
        keep_entry["hash"] = file_hash
        updated_metadata.append(keep_entry)

        # Remove the rest
        for _, _, dupe_path, dupe_entry in candidates[1:]:
            print(f"🗑 Removing duplicate: {dupe_path.name}")
            target_path = duplicates_dir / dupe_path.name
            shutil.move(str(dupe_path), str(target_path))

    # Save cleaned metadata
    with metadata_path.open("w", encoding="utf-8") as f:
        json.dump(updated_metadata, f, indent=2, ensure_ascii=False)
    print(f"✅ Updated metadata: {metadata_path}")

✅ Updated metadata: ../data/council_documents/cabinet/2025-03-13/metadata.json
✅ Updated metadata: ../data/council_documents/cabinet/2025-01-30/metadata.json
✅ Updated metadata: ../data/council_documents/cabinet/2025-06-26/metadata.json
✅ Updated metadata: ../data/council_documents/cabinet/2025-01-09/metadata.json
✅ Updated metadata: ../data/council_documents/cabinet/2025-06-05/metadata.json
✅ Updated metadata: ../data/council_documents/cabinet/2025-03-04/metadata.json
✅ Updated metadata: ../data/council_documents/full_council/2025-03-13/metadata.json
🗑 Removing duplicate: 1 - Annex 2 - Labour Group Alternative Budget 2025-28 - Appendix D_1.pdf
🗑 Removing duplicate: 1 - Annex 3 - Labour Group Alternative Budget 2025-28 - Appendix F_1.pdf
✅ Updated metadata: ../data/council_documents/full_council/2025-02-13/metadata.json
✅ Updated metadata: ../data/council_documents/full_council/2025-02-24/metadata.json
✅ Updated metadata: ../data/council_documents/full_council/2025-07-10/metadata.json


### Diagnostic tool to consolidate data from underlying meeting level metadata generated during scraping

This is a diagnostic or inspection tool — useful for:
* Finding missing URLs or metadata
* Auditing document categories
* Filtering or fixing records across the entire dataset

In [24]:
import json
from pathlib import Path
import pandas as pd

base_dir = Path("../data/council_documents")
metadata_files = list(base_dir.rglob("metadata.json"))

all_rows = []

for meta_path in metadata_files:
    with meta_path.open("r", encoding="utf-8") as f:
        try:
            entries = json.load(f)
        except json.JSONDecodeError:
            continue

    for entry in entries:
        all_rows.append({
            "committee": entry.get("committee"),
            "meeting_date": entry.get("meeting_date"),
            "filename": entry.get("filename"),
            "document_category": entry.get("document_category"),
            "url": entry.get("url"),
            "path": entry.get("path"),
            "source_file": str(meta_path)
        })

# Create DataFrame
metadata_df = pd.DataFrame(all_rows)
metadata_df

Unnamed: 0,committee,meeting_date,filename,document_category,url,path,source_file
0,cabinet,2025-03-13,Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf,agenda_frontsheet,https://democracy.kent.gov.uk/documents/g9767/Agenda%20frontsheet%2013th-Mar-2025%2015.00%20Cabinet.pdf?T=0,cabinet/2025-03-13/originals/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf,../data/council_documents/cabinet/2025-03-13/metadata.json
1,cabinet,2025-01-30,Item 7 - Supplementary - 24-00109 - Transfer the 18-25 section of the Strengthening Independence Ser.pdf,other,https://democracy.kent.gov.uk/documents/s128883/Item%207%20-%20Supplementary%20-%2024-00109%20-%20Transfer%20the%2018-25%20section%20of%20the%20Strengthening%20Independence%20Ser.pdf,cabinet/2025-01-30/originals/Item 7 - Supplementary - 24-00109 - Transfer the 18-25 section of the Strengthening Independence Ser.pdf,../data/council_documents/cabinet/2025-01-30/metadata.json
2,cabinet,2025-01-30,24-00096 - PRoD.pdf,other,https://democracy.kent.gov.uk/documents/s128835/24-00096%20-%20PRoD.pdf,cabinet/2025-01-30/originals/24-00096 - PRoD.pdf,../data/council_documents/cabinet/2025-01-30/metadata.json
3,cabinet,2025-01-30,Minutes of the meeting held on 9th January 2025.pdf,minutes,https://democracy.kent.gov.uk/documents/s128743/Minutes%20of%20the%20meeting%20held%20on%209th%20January%202025.pdf,cabinet/2025-01-30/originals/Minutes of the meeting held on 9th January 2025.pdf,../data/council_documents/cabinet/2025-01-30/metadata.json
4,cabinet,2025-01-30,Appendix B Capital Investment by Directorate_1.pdf,appendix,https://democracy.kent.gov.uk/documents/s128753/Appendix%20B%20Capital%20Investment%20by%20Directorate.pdf,cabinet/2025-01-30/originals/Appendix B Capital Investment by Directorate_1.pdf,../data/council_documents/cabinet/2025-01-30/metadata.json
...,...,...,...,...,...,...,...
145,full_council,2025-01-09,Devolution White Paper - County Council Report - FINAL.pdf,report,https://democracy.kent.gov.uk/documents/s128315/Devolution%20White%20Paper%20-%20County%20Council%20Report%20-%20FINAL.pdf,full_council/2025-01-09/originals/Devolution White Paper - County Council Report - FINAL.pdf,../data/council_documents/full_council/2025-01-09/metadata.json
146,full_council,2025-01-09,Agenda frontsheet 09th-Jan-2025 10.00 County Council.pdf,agenda_frontsheet,https://democracy.kent.gov.uk/documents/g9645/Agenda%20frontsheet%2009th-Jan-2025%2010.00%20County%20Council.pdf?T=0,full_council/2025-01-09/originals/Agenda frontsheet 09th-Jan-2025 10.00 County Council.pdf,../data/council_documents/full_council/2025-01-09/metadata.json
147,full_council,2025-01-09,Appendix A - 16.12.24 Ministerial Letter.pdf,appendix,https://democracy.kent.gov.uk/documents/s128319/Appendix%20A%20-%2016.12.24%20Ministerial%20Letter.pdf,full_council/2025-01-09/originals/Appendix A - 16.12.24 Ministerial Letter.pdf,../data/council_documents/full_council/2025-01-09/metadata.json
148,full_council,2025-01-09,Appendix B - Devolution Framework 2024.pdf,appendix,https://democracy.kent.gov.uk/documents/s128320/Appendix%20B%20-%20Devolution%20Framework%202024.pdf,full_council/2025-01-09/originals/Appendix B - Devolution Framework 2024.pdf,../data/council_documents/full_council/2025-01-09/metadata.json


### Auditing

The auditing and cleaning code below does 4 important things:
* 1.	Checks for missing metadata.
* 2.	Deduplicates entries in metadata.json.
* 3.	Compares recorded metadata with actual PDFs in the originals/ folder.
* 4.	Deletes truly empty folders and quarantines folders with discrepancies.

Improvements:
* Cleans up duplicate PDFs by URL and filename
* Automatically renames suffix files to cleaner names
* Keeps only one copy per duplicate group
* Fully updates metadata.json:
* Removes entries for deleted files
* Updates filename/path for renamed files

In [25]:
from pathlib import Path
from difflib import SequenceMatcher
from collections import defaultdict
import os
import json
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

base_folder = Path("../data/council_documents")
deleted_files = []
renamed_files = []
keep_set = set()

for folder in base_folder.rglob("*/originals"):
    pdfs = list(folder.glob("*.pdf"))
    size_map = defaultdict(list)

    metadata_path = folder.parent / "metadata.json"
    url_lookup = {}
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            try:
                metadata = json.load(f)
                for entry in metadata:
                    url_lookup[entry.get("filename")] = entry.get("url")
            except:
                pass

    for f in pdfs:
        size_map[f.stat().st_size].append(f)

    for size, files in size_map.items():
        if len(files) > 1:
            checked_pairs = set()
            for i, f1 in enumerate(files):
                for f2 in files[i + 1:]:
                    pair_key = tuple(sorted([f1.name, f2.name]))
                    if pair_key in checked_pairs:
                        continue
                    checked_pairs.add(pair_key)

                    name1, name2 = f1.name, f2.name
                    ratio = SequenceMatcher(None, name1, name2).ratio()
                    if ratio > 0.85:
                        url1 = url_lookup.get(name1)
                        url2 = url_lookup.get(name2)

                        # Skip if either file is already marked to keep
                        if f1 in keep_set or f2 in keep_set:
                            continue

                        def ends_with_suffix(name):
                            return any(name.rstrip(".pdf").endswith(f"_{n}") for n in range(1, 10))

                        # Case 1: One has a URL, the other doesn't
                        if url1 and not url2:
                            try:
                                os.remove(f2)
                                deleted_files.append((str(folder), name2))
                                keep_set.add(f1)
                                if ends_with_suffix(name1):
                                    new_name = name1.replace(f"_{name1.split('_')[-1]}", ".pdf")
                                    new_path = f1.with_name(new_name)
                                    if not new_path.exists():
                                        try:
                                            f1.rename(new_path)
                                            renamed_files.append((str(folder), name1, new_name))
                                            keep_set.discard(f1)
                                            keep_set.add(new_path)
                                        except FileNotFoundError:
                                            pass
                            except FileNotFoundError:
                                pass

                        elif url2 and not url1:
                            try:
                                os.remove(f1)
                                deleted_files.append((str(folder), name1))
                                keep_set.add(f2)
                                if ends_with_suffix(name2):
                                    new_name = name2.replace(f"_{name2.split('_')[-1]}", ".pdf")
                                    new_path = f2.with_name(new_name)
                                    if not new_path.exists():
                                        try:
                                            f2.rename(new_path)
                                            renamed_files.append((str(folder), name2, new_name))
                                            keep_set.discard(f2)
                                            keep_set.add(new_path)
                                        except FileNotFoundError:
                                            pass
                            except FileNotFoundError:
                                pass

                        # Case 2: Both have URLs and they are equal → keep cleaner name
                        elif url1 == url2 and url1 is not None:
                            if ends_with_suffix(name1):
                                try:
                                    os.remove(f1)
                                    deleted_files.append((str(folder), name1))
                                    keep_set.add(f2)
                                except FileNotFoundError:
                                    pass
                            elif ends_with_suffix(name2):
                                try:
                                    os.remove(f2)
                                    deleted_files.append((str(folder), name2))
                                    keep_set.add(f1)
                                except FileNotFoundError:
                                    pass

print(f"✅ Deleted {len(deleted_files)} files and renamed {len(renamed_files)} files.")
# pd.DataFrame(deleted_files, columns=["Folder", "Deleted File"]).head(20)

✅ Deleted 18 files and renamed 18 files.


In [26]:
import os
import json

# === CONFIGURATION ===
base_path = "/Users/lgfolder/github/council-assistant/data/council_documents/"

# === TRACKING ===
quarantine_report = []

def load_metadata(metadata_path):
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data if isinstance(data, list) else []
    except Exception:
        return []

def deduplicate_metadata(metadata):
    seen = set()
    deduped = []
    for entry in metadata:
        if isinstance(entry, dict):
            fn = entry.get("filename")
            if fn and fn not in seen:
                seen.add(fn)
                deduped.append(entry)
    return deduped

def audit_folder(folder_name):
    folder_path = os.path.join(base_path, folder_name)
    if not os.path.isdir(folder_path) or folder_name == "quarantine":
        return

    metadata_path = os.path.join(folder_path, "metadata.json")
    originals_path = os.path.join(folder_path, "originals")

    metadata = load_metadata(metadata_path)
    metadata = deduplicate_metadata(metadata)

    # Re-save deduplicated metadata if needed
    if metadata:
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

    metadata_filenames = {entry["filename"] for entry in metadata if "filename" in entry}
    actual_files = set()

    if os.path.exists(originals_path):
        for f in os.listdir(originals_path):
            if f.lower().endswith(".pdf"):
                actual_files.add(f)

    # Report files that exist but are missing metadata entries
    missing_from_metadata = actual_files - metadata_filenames
    if missing_from_metadata:
        quarantine_report.append({
            "folder": folder_name,
            "missing_files": sorted(missing_from_metadata)
        })

# === MAIN LOOP ===
for folder in sorted(os.listdir(base_path)):
    audit_folder(folder)

# === REPORT ===
if quarantine_report:
    print(f"\n📋 Found {len(quarantine_report)} folders with missing metadata:")
    for entry in quarantine_report:
        print(f"\n📅 {entry['folder']}")
        for fname in entry["missing_files"]:
            print(f"   🔍 {fname}")
else:
    print("✅ All folders are consistent with metadata.")

✅ All folders are consistent with metadata.


### Enriching metadata with documenst from pdfs + Collecting all metadata from subfolders into one warehouse

In [27]:
import json
import hashlib
from pathlib import Path
from PyPDF2 import PdfReader
import jsonlines
from slugify import slugify  # pip install python-slugify

# Define the root folder for PDFs and metadata
base_dir = Path("../data/council_documents")

# Define and create the output folder for the new warehouse
output_dir = Path("../data/document_metadata")
output_dir.mkdir(parents=True, exist_ok=True)

# Define the output path for the final metadata file
output_path = output_dir / "document_metadata.jsonl"
# Gather all metadata.json files
metadata_files = list(base_dir.rglob("metadata.json"))
all_metadata = []

def hash_pdf(pdf_path):
    try:
        with open(pdf_path, "rb") as f:
            return hashlib.sha256(f.read()).hexdigest()
    except:
        return None

for meta_path in metadata_files:
    with meta_path.open("r", encoding="utf-8") as f:
        try:
            entries = json.load(f)
        except json.JSONDecodeError:
            print(f"⚠️ Skipping: {meta_path}")
            continue

    for entry in entries:
        path = entry.get("path")
        if not path:
            continue

        pdf_path = base_dir / path
        if not pdf_path.exists():
            print(f"❌ PDF not found: {pdf_path}")
            continue

        try:
            reader = PdfReader(str(pdf_path))
            info = reader.metadata or {}
            num_pages = len(reader.pages)
        except Exception as e:
            print(f"⚠️ Failed to read PDF: {pdf_path.name} → {e}")
            info = {}
            num_pages = None

        # Build full record
        full_entry = dict(entry)

        full_entry.update({
            # doc_id will be added later
            "title": info.get("/Title", ""),
            "author": info.get("/Author", ""),
            "subject": info.get("/Subject", ""),
            "keywords": info.get("/Keywords", ""),
            "producer": info.get("/Producer", ""),
            "creator": info.get("/Creator", ""),
            "creation_date": info.get("/CreationDate", ""),
            "mod_date": info.get("/ModDate", ""),
            "num_pages": num_pages,
            "file_size_kb": round(pdf_path.stat().st_size / 1024, 1),
            "hash": entry.get("hash") or hash_pdf(pdf_path)
        })

        all_metadata.append(full_entry)

with jsonlines.open(output_path, mode='w') as writer:
    for record in all_metadata:
        writer.write(record)

❌ PDF not found: ../data/council_documents/cabinet/2025-01-30/originals/Appendix B Capital Investment by Directorate_1.pdf
❌ PDF not found: ../data/council_documents/cabinet/2025-01-30/originals/Appendix C - Potential Capital Projects_1.pdf
❌ PDF not found: ../data/council_documents/cabinet/2025-01-30/originals/Appendix A Capital Investment Summary_1.pdf
❌ PDF not found: ../data/council_documents/cabinet/2025-01-30/originals/Appendix D 21.01.25_1.pdf
❌ PDF not found: ../data/council_documents/cabinet/2025-01-30/originals/Draft Revenue Budget 2025-26 MTFP 2025-28 Draft Capital Programme 2025-35 and Treasury Management _1.pdf
❌ PDF not found: ../data/council_documents/cabinet/2025-01-30/originals/Draft Revenue Budget 2025-26 and 2025-28 MTFP and Draft Capital Programme 2025-35_1.pdf
❌ PDF not found: ../data/council_documents/full_council/2025-03-13/originals/Scrutiny Committee - call-in Family Hubs Report to County Council - Final_1.pdf
❌ PDF not found: ../data/council_documents/full_cou

Open the metadata warehouse and review it

In [40]:
import jsonlines
import pandas as pd
from pathlib import Path

records = []
with jsonlines.open(Path("../data/document_metadata/document_metadata.jsonl"), "r") as reader:
    for obj in reader:
        records.append(obj)

doc_metadata_df = pd.DataFrame(records)
#doc_metadata_df.head()

In [50]:
doc_metadata_df.sort_values(by="file_size_kb", ascending=False).head(10)
#doc_metadata_df.sample(5)

Unnamed: 0,filename,path,type,committee,meeting_date,document_category,url,created,hash,title,author,subject,keywords,producer,creator,creation_date,mod_date,num_pages,file_size_kb,doc_id
12,24-00096 - Commissioning Plan for Education Pr...,cabinet/2025-01-30/originals/24-00096 - Commis...,pdf,cabinet,2025-01-30,plan,https://democracy.kent.gov.uk/documents/s12874...,2025-05-07T23:46:53.229573,ea74aa0aec422d6cb2441a9274e8bd989de7aaf53403ee...,,Paul Wilson - CY EPA,,,Aspose.Words for .NET 22.5.0,Microsoft Office Word,D:20250115115800Z,D:20250115115800Z,139,12270.4,cabinet-2025-01-30-24-00096-commissioning-plan...
56,APPENDIX B - ADOPTION of KMWLP 2024-39.pdf,cabinet/2025-03-04/originals/APPENDIX B - ADOP...,pdf,cabinet,2025-03-04,appendix,https://democracy.kent.gov.uk/documents/s12966...,2025-05-07T23:46:21.657364,25a663df6b98713a0b8a9b31ce2444c7bfaaa500a24442...,KMWLP 2024-39 - March 2025,Alice Short - GT GC,,,Microsoft® Word for Microsoft 365,Microsoft® Word for Microsoft 365,D:20250218142302Z,D:20250218142526Z,209,10197.6,cabinet-2025-03-04-appendix-b-adoption-of-kmwl...
57,APPENDIX B - CLEAN VERSION for ADOPTION of KMW...,full_council/2025-03-13/originals/APPENDIX B -...,pdf,full_council,2025-03-13,appendix,https://democracy.kent.gov.uk/documents/s12997...,2025-05-07T23:38:13.211402,25a663df6b98713a0b8a9b31ce2444c7bfaaa500a24442...,KMWLP 2024-39 - March 2025,Alice Short - GT GC,,,Microsoft® Word for Microsoft 365,Microsoft® Word for Microsoft 365,D:20250218142302Z,D:20250218142526Z,209,10197.6,full-council-2025-03-13-appendix-b-clean-versi...
81,Appendix 3 - Statutory Scrutiny Guidance.pdf,full_council/2025-03-13/originals/Appendix 3 -...,pdf,full_council,2025-03-13,appendix,https://democracy.kent.gov.uk/documents/s12997...,2025-05-07T23:38:40.357818,79ad21cd2fa558918c80a8b8df734891e7895e7aaf8d4b...,Overview and scrutiny_ statutory guidan...d co...,Joel Cook - CED GLD,,,Microsoft: Print To PDF,,D:20250305231740+00'00',D:20250305231740+00'00',33,8108.0,full-council-2025-03-13-appendix-3-statutory-s...
67,Appendix 1 - Kent CMPB 2024 Annual Report.pdf,full_council/2025-03-13/originals/Appendix 1 -...,pdf,full_council,2025-03-13,appendix,https://democracy.kent.gov.uk/documents/s12994...,2025-05-07T23:38:22.523639,300b70bff49bdd6c5a48d566a14d6fb7174e82a6143623...,,Carol Cook - ST SPRCA,,,Microsoft® Word for Microsoft 365,Microsoft® Word for Microsoft 365,D:20250227182740+00'00',D:20250227182740+00'00',25,2599.6,full-council-2025-03-13-appendix-1-kent-cmpb-2...
118,1 - Annex 3 - Labour Group Alternative Budget ...,full_council/2025-02-13/originals/1 - Annex 3 ...,pdf,full_council,2025-02-13,appendix,https://democracy.kent.gov.uk/documents/s12915...,2025-05-07T23:39:35.068774,b740089ac24fdad5bd6859f4c896791835d4b1fe41e81f...,,Chris Headey - ST F,,,Microsoft® Excel® for Microsoft 365,Microsoft® Excel® for Microsoft 365,D:20250207140413+00'00',D:20250207140413+00'00',26,1411.7,full-council-2025-02-13-1-annex-3-labour-group...
41,Appendix 1 - Performance Report Quarter 2.pdf,cabinet/2025-01-09/originals/Appendix 1 - Perf...,pdf,cabinet,2025-01-09,appendix,https://democracy.kent.gov.uk/documents/s12818...,2025-05-07T23:47:28.372423,4ec7305df60287982b818c651724d6b2657e32a1d3bdf1...,,Graham Harlow - ST SC,,,Aspose.Words for .NET 22.5.0,Microsoft Office Word,D:20241121100900Z,D:20241231091300Z,68,1364.6,cabinet-2025-01-09-appendix-1-performance-repo...
93,Appendix M - Treasury Management Strategy 2025...,full_council/2025-02-13/originals/Appendix M -...,pdf,full_council,2025-02-13,appendix,https://democracy.kent.gov.uk/documents/s12906...,2025-05-07T23:39:26.748396,30641950d20147cf6f966602e6fbaae9439281bb43fa22...,,Mark Horsfield,,,Microsoft® Word for Microsoft 365,Microsoft® Word for Microsoft 365,D:20250120083838Z,D:20250205125523Z,22,1362.6,full-council-2025-02-13-appendix-m-treasury-ma...
105,Appendix L - Economic and Fiscal Context.pdf,full_council/2025-02-13/originals/Appendix L -...,pdf,full_council,2025-02-13,appendix,https://democracy.kent.gov.uk/documents/s12906...,2025-05-07T23:39:25.463144,3fce78c28d586d09bda6dc71b87fc3658b1972cc2683de...,,Simon Pleace,,,Microsoft® Word for Microsoft 365,Microsoft® Word for Microsoft 365,D:20250122085716Z,D:20250205125653Z,12,1208.4,full-council-2025-02-13-appendix-l-economic-an...
51,QPR Q3 Cabinet appendix 1.pdf,cabinet/2025-03-04/originals/QPR Q3 Cabinet ap...,pdf,cabinet,2025-03-04,appendix,https://democracy.kent.gov.uk/documents/s12943...,2025-05-07T23:46:16.021640,a466650a1a60d17a881425f049735c7aa3cf455ffe47dd...,,Graham Harlow - ST SC,,,Aspose.Words for .NET 22.5.0,Microsoft Office Word,D:20250211104100Z,D:20250213153400Z,71,1164.9,cabinet-2025-03-04-qpr-q3-cabinet-appendix-1-pdf


In [47]:
doc_metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   filename           132 non-null    object 
 1   path               132 non-null    object 
 2   type               132 non-null    object 
 3   committee          132 non-null    object 
 4   meeting_date       132 non-null    object 
 5   document_category  132 non-null    object 
 6   url                132 non-null    object 
 7   created            132 non-null    object 
 8   hash               132 non-null    object 
 9   title              132 non-null    object 
 10  author             132 non-null    object 
 11  subject            132 non-null    object 
 12  keywords           132 non-null    object 
 13  producer           132 non-null    object 
 14  creator            132 non-null    object 
 15  creation_date      132 non-null    object 
 16  mod_date           132 non

### Add doc_id to Document Metadata Warehouse

In [31]:
import jsonlines
import pandas as pd
from pathlib import Path
from slugify import slugify  # pip install python-slugify

# Load document metadata
metadata_path = Path("../data/document_metadata/document_metadata.jsonl")
records = []
with jsonlines.open(metadata_path, "r") as reader:
    for obj in reader:
        records.append(obj)

# Generate doc_id and update records
for record in records:
    committee = record.get("committee")
    date = record.get("meeting_date")
    filename = record.get("filename")

    if committee and date and filename:
        doc_id = slugify(f"{committee}__{date}__{filename}")
        record["doc_id"] = doc_id
    else:
        print(f"⚠️ Missing info for doc_id: {record.get('path')}")

# Overwrite with updated records
with jsonlines.open(metadata_path, "w") as writer:
    for record in records:
        writer.write(record)

print(f"✅ doc_id added to all valid records in: {metadata_path.resolve()}")

✅ doc_id added to all valid records in: /Users/lgfolder/github/council-assistant/data/document_metadata/document_metadata.jsonl


### Renaming categories of documents

In [32]:
import jsonlines
import pandas as pd
from pathlib import Path

# Load records
metadata_path = Path("../data/document_metadata/document_metadata.jsonl")
records = []
with jsonlines.open(metadata_path, "r") as reader:
    for obj in reader:
        records.append(obj)

# Apply rules
for entry in records:
    fn = entry.get("filename", "").lower()
    title = entry.get("title", "").lower()

    # Rule 1: Based on filename
    if "agenda frontsheet" in fn:
        entry["document_category"] = "agenda_frontsheet"

    # Rule 2: Based on title
    elif "executive decision" in title:
        entry["document_category"] = "decision"
    elif "agenda template" in title:
        entry["document_category"] = "agenda_frontsheet"
    elif "record of decision" in title:
        entry["document_category"] = "decision"
    elif "investment strategy" in title:
        entry["document_category"] = "strategy"
    elif "plans" in title:
        entry["document_category"] = "plan"
    elif "policy" in title:
        entry["document_category"] = "policy"


# Save updated jsonl
with jsonlines.open(metadata_path, mode='w') as writer:
    for entry in records:
        writer.write(entry)

print(f"✅ Updated and saved to: {metadata_path.resolve()}")

✅ Updated and saved to: /Users/lgfolder/github/council-assistant/data/document_metadata/document_metadata.jsonl
