In [None]:
import jsonlines
from collections import Counter

# Load the first 100 entries and collect all keys
path = "../data/document_metadata/raw_scraped_metadata.jsonl"
all_keys = []

with jsonlines.open(path, "r") as reader:
    for i, entry in enumerate(reader):
        all_keys.extend(entry.keys())
        if i >= 100:
            break

key_counts = Counter(all_keys)
print("🧩 Sample of available keys and their counts (from first 100 entries):")
for key, count in key_counts.items():
    print(f"{key}: {count}")

🧩 Sample of available keys and their counts (from first 100 entries):
filename: 101
path: 101
type: 101
committee: 101
meeting_date: 101
document_category: 101
url: 101
created: 101
source_metadata_file: 101
scraped: 101
doc_id: 101
title: 101
author: 101
subject: 101
keywords: 101
producer: 101
creator: 101
creation_date: 101
mod_date: 101
num_pages: 101
hash: 101
status: 101
redirect_to: 4


### Code to scrape 

In [None]:
import pandas as pd

# Define path
csv_path = "/Users/lgfolder/Downloads/rtw.csv"

# Load CSV
df = pd.read_csv(csv_path)

# Drop rows where required columns are missing (e.g., blank final row)
df.dropna(subset=["topic", "mother_url"], inplace=True)

# Add 'status' column if missing
if 'status' not in df.columns:
    df['status'] = 'not_started'

# Save cleaned file back (optional)
df.to_csv(csv_path, index=False)

# Display for review
df

In [None]:
import os
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime

# === Load and prepare CSV ===
csv_path = "/Users/lgfolder/Downloads/rtw.csv"
df = pd.read_csv(csv_path)
df.dropna(subset=["topic", "mother_url"], inplace=True)

if 'status' not in df.columns:
    df['status'] = 'not_started'

# === Shared utilities ===
base_url = "https://democracy.kent.gov.uk/"
unknown_date_counter = 0
def assign_unknown_folder():
    global unknown_date_counter
    unknown_date_counter += 1
    return f"unknown-{unknown_date_counter}"

def extract_meeting_date(soup):
    text = soup.get_text(" ", strip=True)
    match1 = re.search(r"(?:Monday|Tuesday|Wednesday|Thursday|Friday),\s+(\d{1,2})(?:st|nd|rd|th)?(?:,)?\s+([A-Za-z]+)(?:,)?\s+(20\d{2})", text)
    match2 = re.search(r"held on (?:Monday|Tuesday|Wednesday|Thursday|Friday),\s+(\d{1,2})\s+([A-Za-z]+)\s+(20\d{2})", text)
    for match in [match1, match2]:
        if match:
            day, month, year = match.groups()
            try:
                return datetime.strptime(f"{day} {month} {year}", "%d %B %Y").strftime("%Y-%m-%d")
            except ValueError:
                return None
    return None

def get_document_category(filename):
    lower = filename.lower()
    patterns = {
        "agenda_frontsheet": ["agenda", "front"],
        "agenda": ["agenda", "additional agenda", "agenda item"],
        "minutes": ["printed minutes", "cpp minutes", "minutes"],
        "questions": ["questions", "answers", "q&a"],
        "appendix": ["appendix", "annex"],
        "motion": ["motion", "mtld"],
        "amendment": ["amendment"],
        "budget": ["budget", "revenue plan"],
        "report": ["report", "covering report", "update"],
        "decision_response": ["response", "decision"],
        "strategy": ["strategy"],
        "plan": ["plan"],
        "policy": ["policy", "statement"],
        "consultation": ["consultation"],
        "performance": ["performance", "quarterly performance", "qpr"],
        "terms_of_reference": ["terms of reference", "tor"],
        "supporting_material": ["glossary", "note"]
    }
    for category, keys in patterns.items():
        if any(k in lower for k in keys):
            return category
    return "other"

def download_pdf_and_record_metadata(pdf_url, destination_folder, meeting_date, seen_urls, topic):
    if pdf_url in seen_urls:
        return None
    seen_urls.add(pdf_url)
    filename = os.path.basename(urlparse(pdf_url).path)
    original_name = filename
    counter = 1
    while os.path.exists(os.path.join(destination_folder, "originals", filename)):
        root, ext = os.path.splitext(original_name)
        filename = f"{root}_{counter}{ext}"
        counter += 1
    full_path = os.path.join(destination_folder, "originals", filename)
    if not os.path.exists(full_path):
        try:
            r = requests.get(pdf_url, timeout=15)
            r.raise_for_status()
            os.makedirs(os.path.dirname(full_path), exist_ok=True)
            with open(full_path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            print("❌ Download error:", e)
            return None
    return {
        "filename": filename,
        "path": f"originals/{filename}",
        "type": "pdf",
        "committee": topic,
        "meeting_date": meeting_date,
        "document_category": get_document_category(filename),
        "url": pdf_url,
        "created": datetime.now().isoformat()
    }

# === Loop through all tasks ===
for idx, row in df.iterrows():
    topic = row["topic"]
    mother_url = row["mother_url"]

    if row.get("status", "not_started") == "completed":
        print(f"✅ Skipping already completed: {topic}")
        continue

    try:
        print(f"\n🚀 Starting: {topic}")
        df.at[idx, "status"] = "in_progress"
        df.to_csv(csv_path, index=False)

        download_base = f"/Users/lgfolder/Downloads/rtw_council/{topic}"
        resp = requests.get(mother_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        child_links = [urljoin(base_url, a["href"]) for a in soup.select("a[href*='ieListDocuments.aspx']")]
        print(f"  ➤ Found {len(child_links)} child pages")

        for child_url in child_links:
            print(f"  📄 Scraping child: {child_url}")
            resp = requests.get(child_url)
            child_soup = BeautifulSoup(resp.text, "html.parser")
            meeting_date = extract_meeting_date(child_soup) or assign_unknown_folder()
            meeting_folder = os.path.join(download_base, meeting_date)
            os.makedirs(os.path.join(meeting_folder, "originals"), exist_ok=True)

            seen_urls = set()
            metadata = []

            for a in child_soup.select("a[href$='.pdf']"):
                pdf_url = urljoin(base_url, a["href"])
                meta = download_pdf_and_record_metadata(pdf_url, meeting_folder, meeting_date, seen_urls, topic)
                if meta:
                    metadata.append(meta)

            # Save metadata
            if metadata:
                metadata_path = os.path.join(meeting_folder, "metadata.json")
                with open(metadata_path, "w", encoding="utf-8") as f:
                    json.dump(metadata, f, indent=2)
                print(f"  ✅ Metadata saved: {metadata_path}")

        df.at[idx, "status"] = "completed"
        print(f"🎉 Completed: {topic}")

    except Exception as e:
        print(f"❌ Failed: {topic} - {e}")
        df.at[idx, "status"] = "failed"

    finally:
        df.to_csv(csv_path, index=False)

In [None]:
mother_urls = [
    "https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=113&Year=0"  # full council
]
topic='full_council'

In [None]:
mother_urls = ["https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=115&Year=0"
               ]
topic = 'cabinet'

In [None]:
mother_urls = ["https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=128&Year=0"
               ]
topic = 'pension'

In [None]:
mother_urls = ["https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=896&Year=0"
               ]
topic = 'asc'

In [None]:
mother_urls = ["https://democracy.kent.gov.uk/ieListMeetings.aspx?CId=894&Year=0"
               ]
topic = 'cype'

### Scraping

In [None]:
import os
import re
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime

# === SETTINGS ===
download_base = f"/Users/lgfolder/Downloads/data scrape full 1 page only/{topic}"
base_url = "https://democracy.kent.gov.uk/"


# === Unknown date tracker ===
unknown_date_counter = 0
def assign_unknown_folder():
    global unknown_date_counter
    unknown_date_counter += 1
    return f"unknown-{unknown_date_counter}"

# === FUNCTION: Extract meeting date from page ===
def extract_meeting_date(soup):
    text = soup.get_text(" ", strip=True)

    # Pattern 1: "Thursday, 14 September 2023"
    match1 = re.search(
        r"(?:Thursday|Tuesday|Monday|Wednesday|Friday),\s+(\d{1,2})(?:st|nd|rd|th)?(?:,)?\s+([A-Za-z]+)(?:,)?\s+(20\d{2})",
        text
    )

    # Pattern 2: "Meeting of County Council held on Thursday, 17 October 2019 at 10.00 am"
    match2 = re.search(
        r"held on (?:Thursday|Tuesday|Monday|Wednesday|Friday),\s+(\d{1,2})\s+([A-Za-z]+)\s+(20\d{2})",
        text
    )

    for match in [match1, match2]:
        if match:
            day, month, year = match.groups()
            raw_date = f"{day} {month} {year}"
            try:
                dt = datetime.strptime(raw_date, "%d %B %Y")
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                print(f"  ⚠️ Date parsing error: {raw_date}")

    return None

# === FUNCTION: Categorise document based on filename ===
def get_document_category(filename):
    lower = filename.lower()

    if "agenda" in lower and "front" in lower:
        return "agenda_frontsheet"
    elif "agenda" in lower or "additional agenda" in lower or "agenda item" in lower:
        return "agenda"
    elif "printed minutes" in lower or "cpp minutes" in lower or "minutes of previous" in lower:
        return "minutes"
    elif "minutes" in lower:
        return "minutes"
    elif "questions put" in lower or "answers to questions" in lower or "q&a" in lower:
        return "questions"
    elif "appendix" in lower or "annex" in lower:
        return "appendix"
    elif "motion" in lower or "mtld" in lower:
        return "motion"
    elif "amendment" in lower:
        return "amendment"
    elif "budget" in lower or "revenue plan" in lower:
        return "budget"
    elif "report" in lower or "covering report" in lower or "update" in lower:
        return "report"
    elif "response" in lower or "decision" in lower or "record of decision" in lower:
        return "decision_response"
    elif "strategy" in lower or "investment strategy" in lower or "capital strategy" in lower:
        return "strategy"
    elif "plan" in lower or "local plan" in lower or "delivery plan" in lower:
        return "plan"
    elif "policy" in lower or "statement" in lower:
        return "policy"
    elif "consultation" in lower:
        return "consultation"
    elif "performance" in lower or "quarterly performance" in lower or "qpr" in lower:
        return "performance"
    elif "terms of reference" in lower or "tor" in lower:
        return "terms_of_reference"
    elif "glossary" in lower or "note" in lower or "you said we did" in lower:
        return "supporting_material"
    else:
        return "other"

# === FUNCTION: Download file and collect metadata ===
def download_pdf_and_record_metadata(pdf_url, destination_folder, meeting_date, seen_urls):
    parsed = urlparse(pdf_url)

    # Skip if hostname is missing or clearly internal
    if not parsed.hostname or "kent.gov.uk" not in parsed.hostname:
        print(f"  ⚠️ Skipping invalid or internal link: {pdf_url}")
        return None

    # Skip if URL already processed for this meeting
    if pdf_url in seen_urls:
        print(f"    🔁 Skipping duplicate URL: {pdf_url}")
        return None
    seen_urls.add(pdf_url)

    # Handle duplicate filenames by appending _1, _2, etc.
    filename = os.path.basename(parsed.path)
    original_name = filename
    counter = 1
    while os.path.exists(os.path.join(destination_folder, "originals", filename)):
        filename_parts = os.path.splitext(original_name)
        filename = f"{filename_parts[0]}_{counter}{filename_parts[1]}"
        counter += 1

    path_rel = os.path.join("originals", filename)
    full_path = os.path.join(destination_folder, path_rel)

    # Download file if not already saved
    if not os.path.exists(full_path):
        print(f"    ⬇️ Downloading: {filename}")
        try:
            response = requests.get(pdf_url, timeout=20)
            response.raise_for_status()
        except Exception as e:
            print(f"  ❌ Failed to download {pdf_url}: {e}")
            return None  # Skip this entry on error

        with open(full_path, "wb") as f:
            f.write(response.content)
    else:
        print(f"    ⏩ Skipped (already exists): {filename}")

    # Return metadata
    return {
        "filename": filename,
        "path": path_rel,
        "type": "pdf",
        "committee": topic,
        "meeting_date": meeting_date,
        "document_category": get_document_category(filename),
        "url": pdf_url,
        "created": datetime.now().isoformat()
    }

# === MAIN SCRAPER LOOP ===
for mother_url in mother_urls:
    print(f"\n🔎 Loading mother page: {mother_url}")
    resp = requests.get(mother_url)
    soup = BeautifulSoup(resp.text, "html.parser")

    child_links = [
        urljoin(base_url, a["href"])
        for a in soup.select("a[href*='ieListDocuments.aspx']")
    ]

    print(f"  ➤ Found {len(child_links)} child pages.")

    for child_url in child_links:
        print(f"\n📄 Scraping child page: {child_url}")
        child_resp = requests.get(child_url)
        child_soup = BeautifulSoup(child_resp.text, "html.parser")

        meeting_date = extract_meeting_date(child_soup)
        if not meeting_date:
            meeting_date = assign_unknown_folder()

        meeting_folder = os.path.join(download_base, meeting_date)
        originals_folder = os.path.join(meeting_folder, "originals")
        os.makedirs(originals_folder, exist_ok=True)

        metadata = []
        seen_urls = set()  # Track URLs to avoid duplicates

        for a in child_soup.select("a[href]"):
            href = a['href']
            if '.pdf' in href.lower():
                pdf_url = urljoin(base_url, href)
                meta_entry = download_pdf_and_record_metadata(
                    pdf_url, meeting_folder, meeting_date, seen_urls
                )
                if meta_entry:
                    metadata.append(meta_entry)

        # === GRANDCHILD LINKS ===
        grandchild_links = [
            urljoin(base_url, a["href"])
            for a in child_soup.select("a")
            if "View the full list of documents" in a.get_text()
        ]

        for g_url in grandchild_links:
            print(f"    ↪️ Scraping grandchild: {g_url}")
            g_resp = requests.get(g_url)
            g_soup = BeautifulSoup(g_resp.text, "html.parser")

            g_date = extract_meeting_date(g_soup) or assign_unknown_folder()
            g_folder = os.path.join(download_base, g_date)
            g_originals = os.path.join(g_folder, "originals")
            os.makedirs(g_originals, exist_ok=True)

            seen_urls = set()  # Reset for each grandchild

            for a in g_soup.select("a[href]"):
                href = a['href']
                if '.pdf' in href.lower():
                    pdf_url = urljoin(base_url, href)
                    meta_entry = download_pdf_and_record_metadata(
                        pdf_url, g_folder, g_date, seen_urls
                    )
                    if meta_entry:
                        metadata.append(meta_entry)

        # === WRITE OR APPEND METADATA FILE (with deduplication) ===
        metadata_path = os.path.join(meeting_folder, "metadata.json")
        
        # Load existing metadata if present
        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                existing_metadata = json.load(f)
        except FileNotFoundError:
            existing_metadata = []

        # Remove None entries from new metadata
        metadata = [entry for entry in metadata if entry is not None]

        # Combine and deduplicate by URL
        combined = {entry["url"]: entry for entry in existing_metadata + metadata}
        metadata = list(combined.values())

        # Save updated metadata
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"  🗂️ Metadata updated at {metadata_path}")

### Correcting URLs in metadata

1.  Walks through every file matching ../data/council_documents/*/*/metadata.json
2.  For each entry inside the metadata.json:
	•	Checks if the "url" key exists and contains a space
	•	If so:
	•	Encodes the URL safely using urllib.parse.quote
	•	Preserves the query string (?T=9, etc.)
	•	Sets the updated value back to entry["url"]
3.	If any URLs were changed in that file:
	•	It overwrites the entire metadata.json with the updated entries using:json.dump(data, f, indent=2)
4.	Logs which files were updated.

Careful: script modifies the file contents on disk

In [None]:
import json
from pathlib import Path
from urllib.parse import quote

# Set base path
base_path = Path("../data/council_documents")
metadata_paths = list(base_path.glob("*/*/metadata.json"))
print(f"🔍 Found {len(metadata_paths)} metadata.json files")

for meta_file in metadata_paths:
    try:
        with meta_file.open("r", encoding="utf-8") as f:
            data = json.load(f)

        modified = False
        for entry in data:
            url = entry.get("url")
            if url and " " in url:
                # Split to preserve any query string
                if "?" in url:
                    url_base, query = url.split("?", 1)
                    url_base_encoded = quote(url_base, safe="/:")
                    entry["url"] = f"{url_base_encoded}?{query}"
                else:
                    entry["url"] = quote(url, safe="/:")
                modified = True

        if modified:
            with meta_file.open("w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            print(f"✅ Updated: {meta_file}")

    except Exception as e:
        print(f"⚠️ Error processing {meta_file}: {e}")

### Update and Overwrite "path" in All metadata.json Files (to make them unique)

What It Does:
* Rebuilds the "path" field using: committee/meeting_date/originals/filename
* Only overwrites the file if changes were made
* Uses json.dump(..., indent=2, ensure_ascii=False) for human-readable formatting

In [None]:
from pathlib import Path
import json

base_dir = Path("../data/council_documents")
metadata_files = list(base_dir.rglob("metadata.json"))

for meta_path in metadata_files:
    with meta_path.open("r", encoding="utf-8") as f:
        metadata = json.load(f)

    modified = False
    for entry in metadata:
        committee = entry.get("committee")
        date = entry.get("meeting_date")
        filename = entry.get("filename")

        if committee and date and filename:
            new_path = f"{committee}/{date}/originals/{filename}"
            if entry.get("path") != new_path:
                entry["path"] = new_path
                modified = True

    if modified:
        with meta_path.open("w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"✅ Updated: {meta_path}")
    else:
        print(f"➖ No changes: {meta_path}")

### Identifying and relocating Public Packs

In [None]:
#relocate_public_packs.py

import os
import shutil
import json
from PyPDF2 import PdfReader
from pathlib import Path

def relocate_public_packs(base_folder="../data/council_documents"):
    base_path = Path(base_folder)

    for committee in os.listdir(base_path):
        committee_path = base_path / committee
        if not committee_path.is_dir():
            continue

        for meeting_date in os.listdir(committee_path):
            meeting_path = committee_path / meeting_date
            originals_path = meeting_path / "originals"
            public_packs_path = meeting_path / "public_packs"
            metadata_path = meeting_path / "metadata.json"

            if not originals_path.exists() or not metadata_path.exists():
                continue

            with metadata_path.open("r", encoding="utf-8") as f:
                try:
                    metadata = json.load(f)
                except json.JSONDecodeError:
                    print(f"⚠️ Skipping bad metadata: {metadata_path}")
                    continue

            moved_filenames = []

            for filename in os.listdir(originals_path):
                if not filename.endswith(".pdf"):
                    continue

                file_path = originals_path / filename

                try:
                    reader = PdfReader(str(file_path))
                    title = reader.metadata.get("/Title", "")
                    if "(Public Pack)" in title:
                        os.makedirs(public_packs_path, exist_ok=True)
                        dest_path = public_packs_path / filename
                        shutil.move(str(file_path), str(dest_path))
                        moved_filenames.append(filename)
                        print(f"✅ Moved public pack: {file_path} → {dest_path}")
                except Exception as e:
                    print(f"⚠️ Could not process {file_path}: {e}")

            # Update metadata
            modified = False
            for entry in metadata:
                if entry.get("filename") in moved_filenames:
                    entry["path"] = f"{committee}/{meeting_date}/public_packs/{entry['filename']}"
                    modified = True

            if modified:
                with metadata_path.open("w", encoding="utf-8") as f:
                    json.dump(metadata, f, indent=2, ensure_ascii=False)
                print(f"📝 Metadata updated: {metadata_path}")

relocate_public_packs()

### Hashing and removing duplicates

In [None]:
import hashlib
import json
import shutil
from pathlib import Path

base_dir = Path("../data/council_documents")
duplicates_dir = base_dir / "duplicates"
duplicates_dir.mkdir(exist_ok=True)

def hash_pdf(file_path):
    with file_path.open("rb") as f:
        return hashlib.sha256(f.read()).hexdigest()

def clean_filename_priority(name):
    """Return True if filename does NOT end with _1, _2, etc."""
    return not any(name.stem.endswith(f"_{i}") for i in range(1, 10))

# Process each committee/meeting/originals folder
for originals_dir in base_dir.rglob("originals"):
    if not originals_dir.is_dir():
        continue

    metadata_path = originals_dir.parent / "metadata.json"
    if not metadata_path.exists():
        print(f"⚠️ No metadata found: {metadata_path}")
        continue

    with metadata_path.open("r", encoding="utf-8") as f:
        try:
            metadata = json.load(f)
        except json.JSONDecodeError:
            print(f"❌ Could not parse metadata: {metadata_path}")
            continue

    # Build lookup: filename ➝ metadata entry
    file_lookup = {entry["filename"]: entry for entry in metadata if entry.get("filename")}
    updated_metadata = []

    # Group files by hash
    hash_groups = {}
    for pdf_path in originals_dir.glob("*.pdf"):
        file_hash = hash_pdf(pdf_path)
        hash_groups.setdefault(file_hash, []).append(pdf_path)

    for file_hash, paths in hash_groups.items():
        if len(paths) == 1:
            # Keep as-is, just add hash
            path = paths[0]
            entry = file_lookup.get(path.name)
            if entry:
                entry["hash"] = file_hash
                updated_metadata.append(entry)
            continue

        # Choose file to keep
        candidates = []
        for path in paths:
            entry = file_lookup.get(path.name)
            if entry:
                has_url = bool(entry.get("url"))
                is_clean = clean_filename_priority(path)
                candidates.append((has_url, is_clean, path, entry))

        candidates.sort(reverse=True)  # highest priority first
        keep_path, keep_entry = candidates[0][2], candidates[0][3]
        keep_entry["hash"] = file_hash
        updated_metadata.append(keep_entry)

        # Remove the rest
        for _, _, dupe_path, dupe_entry in candidates[1:]:
            print(f"🗑 Removing duplicate: {dupe_path.name}")
            target_path = duplicates_dir / dupe_path.name
            shutil.move(str(dupe_path), str(target_path))

    # Save cleaned metadata
    with metadata_path.open("w", encoding="utf-8") as f:
        json.dump(updated_metadata, f, indent=2, ensure_ascii=False)
    print(f"✅ Updated metadata: {metadata_path}")

### Diagnostic tool to consolidate data from underlying meeting level metadata generated during scraping

This is a diagnostic or inspection tool — useful for:
* Finding missing URLs or metadata
* Auditing document categories
* Filtering or fixing records across the entire dataset

In [None]:
import json
from pathlib import Path
import pandas as pd

base_dir = Path("../data/council_documents")
metadata_files = list(base_dir.rglob("metadata.json"))

all_rows = []

for meta_path in metadata_files:
    with meta_path.open("r", encoding="utf-8") as f:
        try:
            entries = json.load(f)
        except json.JSONDecodeError:
            continue

    for entry in entries:
        all_rows.append({
            "committee": entry.get("committee"),
            "meeting_date": entry.get("meeting_date"),
            "filename": entry.get("filename"),
            "document_category": entry.get("document_category"),
            "url": entry.get("url"),
            "path": entry.get("path"),
            "source_file": str(meta_path)
        })

# Create DataFrame
metadata_df = pd.DataFrame(all_rows)
metadata_df

### Auditing

The auditing and cleaning code below does 4 important things:
* 1.	Checks for missing metadata.
* 2.	Deduplicates entries in metadata.json.
* 3.	Compares recorded metadata with actual PDFs in the originals/ folder.
* 4.	Deletes truly empty folders and quarantines folders with discrepancies.

Improvements:
* Cleans up duplicate PDFs by URL and filename
* Automatically renames suffix files to cleaner names
* Keeps only one copy per duplicate group
* Fully updates metadata.json:
* Removes entries for deleted files
* Updates filename/path for renamed files

In [1]:
from pathlib import Path
from difflib import SequenceMatcher
from collections import defaultdict
import os
import json
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

base_folder = Path("../data/council_documents")
deleted_files = []
renamed_files = []
keep_set = set()

for folder in base_folder.rglob("*/originals"):
    pdfs = list(folder.glob("*.pdf"))
    size_map = defaultdict(list)

    metadata_path = folder.parent / "metadata.json"
    url_lookup = {}
    if metadata_path.exists():
        with open(metadata_path, "r") as f:
            try:
                metadata = json.load(f)
                for entry in metadata:
                    url_lookup[entry.get("filename")] = entry.get("url")
            except:
                pass

    for f in pdfs:
        size_map[f.stat().st_size].append(f)

    for size, files in size_map.items():
        if len(files) > 1:
            checked_pairs = set()
            for i, row in enumerate(rows[:5], 1):
                for f2 in files[i + 1:]:
                    pair_key = tuple(sorted([f1.name, f2.name]))
                    if pair_key in checked_pairs:
                        continue
                    checked_pairs.add(pair_key)

                    name1, name2 = f1.name, f2.name
                    ratio = SequenceMatcher(None, name1, name2).ratio()
                    if ratio > 0.85:
                        url1 = url_lookup.get(name1)
                        url2 = url_lookup.get(name2)

                        # Skip if either file is already marked to keep
                        if f1 in keep_set or f2 in keep_set:
                            continue

                        def ends_with_suffix(name):
                            return any(name.rstrip(".pdf").endswith(f"_{n}") for n in range(1, 10))

                        # Case 1: One has a URL, the other doesn't
                        if url1 and not url2:
                            try:
                                os.remove(f2)
                                deleted_files.append((str(folder), name2))
                                keep_set.add(f1)
                                if ends_with_suffix(name1):
                                    new_name = name1.replace(f"_{name1.split('_')[-1]}", ".pdf")
                                    new_path = f1.with_name(new_name)
                                    if not new_path.exists():
                                        try:
                                            f1.rename(new_path)
                                            renamed_files.append((str(folder), name1, new_name))
                                            keep_set.discard(f1)
                                            keep_set.add(new_path)
                                        except FileNotFoundError:
                                            pass
                            except FileNotFoundError:
                                pass

                        elif url2 and not url1:
                            try:
                                os.remove(f1)
                                deleted_files.append((str(folder), name1))
                                keep_set.add(f2)
                                if ends_with_suffix(name2):
                                    new_name = name2.replace(f"_{name2.split('_')[-1]}", ".pdf")
                                    new_path = f2.with_name(new_name)
                                    if not new_path.exists():
                                        try:
                                            f2.rename(new_path)
                                            renamed_files.append((str(folder), name2, new_name))
                                            keep_set.discard(f2)
                                            keep_set.add(new_path)
                                        except FileNotFoundError:
                                            pass
                            except FileNotFoundError:
                                pass

                        # Case 2: Both have URLs and they are equal → keep cleaner name
                        elif url1 == url2 and url1 is not None:
                            if ends_with_suffix(name1):
                                try:
                                    os.remove(f1)
                                    deleted_files.append((str(folder), name1))
                                    keep_set.add(f2)
                                except FileNotFoundError:
                                    pass
                            elif ends_with_suffix(name2):
                                try:
                                    os.remove(f2)
                                    deleted_files.append((str(folder), name2))
                                    keep_set.add(f1)
                                except FileNotFoundError:
                                    pass

print(f"✅ Deleted {len(deleted_files)} files and renamed {len(renamed_files)} files.")
# pd.DataFrame(deleted_files, columns=["Folder", "Deleted File"]).head(20)

✅ Deleted 0 files and renamed 0 files.


In [2]:
import os
import json

# === CONFIGURATION ===
base_path = "/Users/lgfolder/github/council-assistant/data/council_documents/"

# === TRACKING ===
quarantine_report = []

def load_metadata(metadata_path):
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data if isinstance(data, list) else []
    except Exception:
        return []

def deduplicate_metadata(metadata):
    seen = set()
    deduped = []
    for entry in metadata:
        if isinstance(entry, dict):
            fn = entry.get("filename")
            if fn and fn not in seen:
                seen.add(fn)
                deduped.append(entry)
    return deduped

def audit_folder(folder_name):
    folder_path = os.path.join(base_path, folder_name)
    if not os.path.isdir(folder_path) or folder_name == "quarantine":
        return

    metadata_path = os.path.join(folder_path, "metadata.json")
    originals_path = os.path.join(folder_path, "originals")

    metadata = load_metadata(metadata_path)
    metadata = deduplicate_metadata(metadata)

    # Re-save deduplicated metadata if needed
    if metadata:
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

    metadata_filenames = {entry["filename"] for entry in metadata if "filename" in entry}
    actual_files = set()

    if os.path.exists(originals_path):
        for f in os.listdir(originals_path):
            if f.lower().endswith(".pdf"):
                actual_files.add(f)

    # Report files that exist but are missing metadata entries
    missing_from_metadata = actual_files - metadata_filenames
    if missing_from_metadata:
        quarantine_report.append({
            "folder": folder_name,
            "missing_files": sorted(missing_from_metadata)
        })

# === MAIN LOOP ===
for folder in sorted(os.listdir(base_path)):
    audit_folder(folder)

# === REPORT ===
if quarantine_report:
    print(f"\n📋 Found {len(quarantine_report)} folders with missing metadata:")
    for entry in quarantine_report:
        print(f"\n📅 {entry['folder']}")
        for fname in entry["missing_files"]:
            print(f"   🔍 {fname}")
else:
    print("✅ All folders are consistent with metadata.")

✅ All folders are consistent with metadata.


### Enriching metadata with documenst from pdfs + Collecting all metadata from subfolders into one warehouse

In [None]:
import json
import hashlib
from pathlib import Path
from PyPDF2 import PdfReader
import jsonlines
from slugify import slugify  # pip install python-slugify

# Define the root folder for PDFs and metadata
base_dir = Path("../data/council_documents")

# Define and create the output folder for the new warehouse
output_dir = Path("../data/document_metadata")
output_dir.mkdir(parents=True, exist_ok=True)

# Define the output path for the final metadata file
output_path = output_dir / "document_metadata.jsonl"
# Gather all metadata.json files
metadata_files = list(base_dir.rglob("metadata.json"))
all_metadata = []

def hash_pdf(pdf_path):
    try:
        with open(pdf_path, "rb") as f:
            return hashlib.sha256(f.read()).hexdigest()
    except:
        return None

for meta_path in metadata_files:
    with meta_path.open("r", encoding="utf-8") as f:
        try:
            entries = json.load(f)
        except json.JSONDecodeError:
            print(f"⚠️ Skipping: {meta_path}")
            continue

    for entry in entries:
        path = entry.get("path")
        if not path:
            continue

        pdf_path = base_dir / path
        if not pdf_path.exists():
            print(f"❌ PDF not found: {pdf_path}")
            continue

        try:
            reader = PdfReader(str(pdf_path))
            info = reader.metadata or {}
            num_pages = len(reader.pages)
        except Exception as e:
            print(f"⚠️ Failed to read PDF: {pdf_path.name} → {e}")
            info = {}
            num_pages = None

        # Build full record
        full_entry = dict(entry)

        full_entry.update({
            # doc_id will be added later
            "title": info.get("/Title", ""),
            "author": info.get("/Author", ""),
            "subject": info.get("/Subject", ""),
            "keywords": info.get("/Keywords", ""),
            "producer": info.get("/Producer", ""),
            "creator": info.get("/Creator", ""),
            "creation_date": info.get("/CreationDate", ""),
            "mod_date": info.get("/ModDate", ""),
            "num_pages": num_pages,
            "file_size_kb": round(pdf_path.stat().st_size / 1024, 1),
            "hash": entry.get("hash") or hash_pdf(pdf_path)
        })

        all_metadata.append(full_entry)

with jsonlines.open(output_path, mode='w') as writer:
    for record in all_metadata:
        writer.write(record)

Open the metadata warehouse and review it

In [None]:
import jsonlines
import pandas as pd
from pathlib import Path

records = []
with jsonlines.open(Path("../data/document_metadata/document_metadata.jsonl"), "r") as reader:
    for obj in reader:
        records.append(obj)

doc_metadata_df = pd.DataFrame(records)
#doc_metadata_df.head()

In [None]:
doc_metadata_df.sample(5)

In [None]:
doc_metadata_df.info()

### Add doc_id to Document Metadata Warehouse

In [None]:
import jsonlines
import pandas as pd
from pathlib import Path
from slugify import slugify  # pip install python-slugify

# Load document metadata
metadata_path = Path("../data/document_metadata/document_metadata.jsonl")
records = []
with jsonlines.open(metadata_path, "r") as reader:
    for obj in reader:
        records.append(obj)

# Generate doc_id and update records
for record in records:
    committee = record.get("committee")
    date = record.get("meeting_date")
    filename = record.get("filename")

    if committee and date and filename:
        doc_id = slugify(f"{committee}__{date}__{filename}")
        record["doc_id"] = doc_id
    else:
        print(f"⚠️ Missing info for doc_id: {record.get('path')}")

# Overwrite with updated records
with jsonlines.open(metadata_path, "w") as writer:
    for record in records:
        writer.write(record)

print(f"✅ doc_id added to all valid records in: {metadata_path.resolve()}")

### Renaming categories of documents

In [None]:
import jsonlines
import pandas as pd
from pathlib import Path

# Load records
metadata_path = Path("../data/document_metadata/document_metadata.jsonl")
records = []
with jsonlines.open(metadata_path, "r") as reader:
    for obj in reader:
        records.append(obj)

# Apply rules
for entry in records:
    fn = entry.get("filename", "").lower()
    title = entry.get("title", "").lower()

    # Rule 1: Based on filename
    if "agenda frontsheet" in fn:
        entry["document_category"] = "agenda_frontsheet"

    # Rule 2: Based on title
    elif "executive decision" in title:
        entry["document_category"] = "decision"
    elif "agenda template" in title:
        entry["document_category"] = "agenda_frontsheet"
    elif "record of decision" in title:
        entry["document_category"] = "decision"
    elif "investment strategy" in title:
        entry["document_category"] = "strategy"
    elif "plans" in title:
        entry["document_category"] = "plan"
    elif "policy" in title:
        entry["document_category"] = "policy"


# Save updated jsonl
with jsonlines.open(metadata_path, mode='w') as writer:
    for entry in records:
        writer.write(entry)

print(f"✅ Updated and saved to: {metadata_path.resolve()}")

In [None]:
import jsonlines

with jsonlines.open("../data/processed_register/document_manifest.jsonl", "r") as reader:
    entries = list(reader)

# How many are ready?
ready = [e for e in entries if e.get("status") == "ready_for_embedding"]
print(f"Found {len(ready)} entries with status = 'ready_for_embedding'")