## Scraping borough councils

### Scraping experiments

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs
import time
import json
import os
import re

# Load the CSV file
df = pd.read_csv("/Users/lgfolder/Downloads/rtw.csv")

# Set for visited URLs to prevent duplicates
visited = set()

# Path to results file
results_path = "/Users/lgfolder/Downloads/pdf_results_rtw.json"
if os.path.exists(results_path):
    with open(results_path, "r") as f:
        results_log = json.load(f)
else:
    results_log = []

# Helper: Check if a URL points to a PDF
def is_pdf(url):
    """Return True when ``url`` points at a ``.pdf`` resource.

    The comparison is case-insensitive. Besides the plain "URL ends in .pdf"
    case, the path component alone is checked, so links that carry a query
    string or fragment (``.../report.pdf?T=9``) are recognised as PDFs too.

    Args:
        url: Absolute URL string.

    Returns:
        bool: True if the whole URL or its path component ends in ``.pdf``.
    """
    if url.lower().endswith(".pdf"):
        return True
    # Ignore "?query" and "#fragment" parts when looking at the extension.
    return urlparse(url).path.lower().endswith(".pdf")

# Helper: Allow only ieListDocuments URLs with CId and MId (order-insensitive)
def is_allowed_ieListDocuments_url(url):
    """Tell whether ``url`` is a meeting-documents page that may be crawled.

    A URL qualifies when it is a string, its path contains
    ``ieListDocuments.aspx`` and its query string carries both a ``CId`` and
    an ``MId`` parameter, in any order.

    Returns:
        bool: True for an allowed URL, False otherwise (including non-strings).
    """
    if isinstance(url, str):
        parts = urlparse(url)
        if "ieListDocuments.aspx" in parts.path:
            params = parse_qs(parts.query)
            return all(name in params for name in ("CId", "MId"))
    return False

# Helper: Check if it's a valid starting page (meeting list)
def is_valid_start_url(url):
    """Return True when ``url`` is an http(s) string naming a meeting-list page.

    Non-string values (for example a missing CSV cell) are rejected.
    """
    if not isinstance(url, str):
        return False
    if not url.startswith("http"):
        return False
    return "ieListMeetings.aspx" in url

# Crawl PDFs from a specific meeting document page
def crawl_ieListDocuments_page(url, topic, source_url, depth=0, max_depth=2):
    """Collect the PDF links found on one ``ieListDocuments.aspx`` page.

    Every new PDF link is appended to the module-level ``results_log``
    together with the committee, date and time parsed from the page heading
    (all ``None`` when no heading matches). ``results_path`` is rewritten once
    per page that yielded at least one PDF, so progress is still persisted
    page by page without re-serialising the whole log for every link.

    Args:
        url: Absolute URL of the meeting documents page.
        topic: Topic label copied into every logged record.
        source_url: Meeting-list URL this page was reached from.
        depth: Current crawl depth; pages with ``depth > max_depth`` are skipped.
        max_depth: Deepest level that is still crawled.

    Returns:
        set: PDF URLs found on the page. Empty when the page is skipped (too
        deep, already in ``visited``, not an allowed URL) or cannot be fetched.
    """
    pdf_links = set()
    if depth > max_depth or url in visited:
        return pdf_links

    if not is_allowed_ieListDocuments_url(url):
        return pdf_links

    visited.add(url)
    print(f"Crawling: {url} (depth={depth})")

    try:
        resp = requests.get(url, timeout=10)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return pdf_links
    if resp.status_code != 200:
        # Report the skip instead of dropping the page silently.
        print(f"Skipping {url}: HTTP {resp.status_code}")
        return pdf_links

    soup = BeautifulSoup(resp.text, "html.parser")

    # Try to extract committee, date, and time from a heading
    committee = date = time_ = None
    heading_text = soup.find(string=re.compile(r" - .*\d{1,2}.*\d{4}"))
    if heading_text:
        match = re.match(r"(?P<committee>.+?) - .*?(?P<date>\d{1,2}(?:st|nd|rd|th)?\s+\w+,\s+\d{4})\s+(?P<time>\d{1,2}\.\d{2}\s*[ap]m)", heading_text.strip(), re.IGNORECASE)
        if match:
            committee = match.group("committee").strip()
            date = match.group("date").strip()
            time_ = match.group("time").strip()

    # Anchors are walked last-to-first, which fixes the order of logged records.
    # No sleep here: inspecting an anchor makes no network request.
    for a_tag in reversed(soup.find_all("a", href=True)):
        link = urljoin(url, a_tag['href'])
        if is_pdf(link) and link not in pdf_links:
            pdf_links.add(link)
            results_log.append({
                "topic": topic,
                "source_url": source_url,
                "page_url": url,
                "pdf_url": link,
                "committee": committee,
                "date": date,
                "time": time_
            })

    # Persist once per page rather than once per PDF.
    if pdf_links:
        with open(results_path, "w") as f:
            json.dump(results_log, f, indent=2)

    return pdf_links

# Top-level crawler: start from meeting list, extract child meeting document pages
def crawl_from_meeting_list(start_url, topic):
    """Crawl every allowed meeting-documents page linked from a meeting list.

    Args:
        start_url: URL of the meeting-list page to scan.
        topic: Topic label forwarded to ``crawl_ieListDocuments_page``.

    Returns:
        set: Union of the PDF URLs returned for each linked documents page;
        empty when the list page cannot be fetched or does not answer 200.
    """
    found = set()

    print(f"Scanning meeting list: {start_url}")
    try:
        listing = requests.get(start_url, timeout=10)
    except Exception as err:
        print(f"Failed to fetch meeting list page {start_url}: {err}")
        return found
    if listing.status_code != 200:
        return found

    page = BeautifulSoup(listing.text, "html.parser")
    for anchor in page.find_all("a", href=True):
        target = urljoin(start_url, anchor['href'])
        if not is_allowed_ieListDocuments_url(target):
            continue
        found |= crawl_ieListDocuments_page(target, topic, start_url)
        # Short pause between consecutive documents pages.
        time.sleep(0.25)

    return found

# Run crawler for each topic
pdf_map = {}
for _, row in df.iterrows():
    topic = row.get('topic')
    start_url = row.get('mother_url')

    if not is_valid_start_url(start_url):
        print(f"Skipping (not allowed): {start_url}")
        continue

    # Forget previously visited pages so each start URL is crawled in full.
    visited.clear()
    pdf_links = crawl_from_meeting_list(start_url, topic)
    # A topic can appear on several CSV rows: merge with what earlier rows
    # found instead of overwriting it.
    pdf_map[topic] = list(set(pdf_map.get(topic, [])) | pdf_links)
    print(f"Saved {len(pdf_links)} PDFs for topic: {topic}\n")

# The pdf_map now holds topic -> [list of PDF URLs]


In [None]:
import pandas as pd
import json

# Load the JSON file
with open(results_path, "r") as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Drop duplicate PDF URLs, keeping the first occurrence
df_deduped = df.drop_duplicates(subset=["pdf_url"])
pd.set_option('display.max_colwidth', None)  # Show full column content
# Display the deduplicated DataFrame
df_deduped  # or just df_deduped to see all

In [None]:

# Get unique page URLs, re-indexed from 0 for viewing.
# reset_index returns a new Series, so its result has to be kept.
unique_pages = df["page_url"].drop_duplicates().reset_index(drop=True)

# Display the count and the URLs
#print(f"Total unique page URLs: {len(unique_pages)}")
unique_pages

#### Downloading the texts and metadata from the pdfs without saving them

In [None]:
import requests
import pdfplumber
from io import BytesIO

url = "https://democracy.tunbridgewells.gov.uk/documents/s70214/15.%20Date%20of%20Next%20Meeting.pdf"
#url = "https://democracy.tunbridgewells.gov.uk/documents/s76856/Appendix B - Appointments to Committees.pdf"
#url = "https://democracy.tunbridgewells.gov.uk/documents/s76855/Appendix A Political balance and allocation of Committee seats.pdf"
#url = "https://democracy.tunbridgewells.gov.uk/documents/s76866/10 Motion on Notice from Cllr Mobbs.pdf"

def extract_text_from_pdf_url(url):
    """Download a PDF into memory and return its extracted text.

    Nothing is written to disk: the response body is wrapped in ``BytesIO``
    and handed to ``pdfplumber``.

    Args:
        url: URL of the PDF document.

    Returns:
        str: Text of all pages joined by newlines (pages without extractable
        text contribute an empty string), stripped of surrounding whitespace.
        On any failure a string of the form ``"Failed: <error>"`` is returned
        instead of raising.
    """
    try:
        # The whole body is read via .content, so streaming brings nothing.
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            page_texts = [page.extract_text() or '' for page in pdf.pages]
        # Separate pages so the last word of one page does not fuse with the
        # first word of the next.
        return '\n'.join(page_texts).strip()
    except Exception as e:
        return f"Failed: {e}"

# Run it
extract_text_from_pdf_url(url)

### Downloading pdf metadata from one url link

In [None]:
import requests
from PyPDF2 import PdfReader
from io import BytesIO
import pandas as pd


def extract_all_pdf_metadata(url):
    """Fetch a PDF and return its document metadata plus a few file facts.

    Returns:
        dict: The PDF metadata entries with the ``/`` stripped from their
        keys, plus ``Pages``, ``Encrypted``, ``FileSizeBytes`` and
        ``PDFHeader``; ``PageWidth``/``PageHeight`` are added when the first
        page's media box can be read. On failure: ``{"error": <message>}``.
    """
    try:
        resp = requests.get(url, stream=True, timeout=15)
        resp.raise_for_status()
        pdf_bytes = resp.content
        pdf = PdfReader(BytesIO(pdf_bytes))

        # "/Author" -> "Author", etc.
        info = {}
        for key, value in (pdf.metadata or {}).items():
            info[key.strip('/')] = value

        # General facts about the file
        info.update({
            "Pages": len(pdf.pages),
            "Encrypted": pdf.is_encrypted,
            "FileSizeBytes": len(pdf_bytes),
            "PDFHeader": pdf_bytes[:8].decode("utf-8", errors="ignore").strip(),
        })

        # Page size is optional: any problem reading it is ignored on purpose.
        try:
            first_box = pdf.pages[0].mediabox
            info["PageWidth"] = float(first_box.width)
            info["PageHeight"] = float(first_box.height)
        except Exception:
            pass

        return info
    except Exception as err:
        return {"error": str(err)}

# Example usage:
metadata = extract_all_pdf_metadata(url)
df_meta = pd.DataFrame([metadata])
df_meta

### Downloading metadata from all urls from the scraping results json

In [None]:
import json
import requests
from PyPDF2 import PdfReader
from io import BytesIO
import pandas as pd
from tqdm import tqdm  # progress bar

# Load results from your scraper
with open("/Users/lgfolder/Downloads/pdf_results.json", "r") as f:
    data = json.load(f)

# Extract unique PDF URLs
pdf_urls = list({entry["pdf_url"] for entry in data if entry.get("pdf_url", "").endswith(".pdf")})

def extract_all_pdf_metadata(url):
    """Fetch a PDF and return its metadata, tagged with the URL it came from.

    Returns:
        dict: The PDF metadata entries with the ``/`` stripped from their
        keys, plus ``Pages``, ``Encrypted``, ``FileSizeBytes``, ``PDFHeader``
        and ``SourceURL``; ``PageWidth``/``PageHeight`` are added when the
        first page's media box can be read. On failure:
        ``{"SourceURL": url, "error": <message>}``.
    """
    try:
        resp = requests.get(url, stream=True, timeout=15)
        resp.raise_for_status()
        pdf_bytes = resp.content
        pdf = PdfReader(BytesIO(pdf_bytes))

        info = {}
        for key, value in (pdf.metadata or {}).items():
            info[key.strip('/')] = value

        # General facts about the file
        info.update({
            "Pages": len(pdf.pages),
            "Encrypted": pdf.is_encrypted,
            "FileSizeBytes": len(pdf_bytes),
            "PDFHeader": pdf_bytes[:8].decode("utf-8", errors="ignore").strip(),
            "SourceURL": url,
        })

        # Page dimensions are optional: failures here are ignored on purpose.
        try:
            first_box = pdf.pages[0].mediabox
            info["PageWidth"] = float(first_box.width)
            info["PageHeight"] = float(first_box.height)
        except Exception:
            pass

        return info
    except Exception as err:
        return {"SourceURL": url, "error": str(err)}

# Run metadata extraction for each PDF
metadata_list = [extract_all_pdf_metadata(url) for url in tqdm(pdf_urls)]

# Display as DataFrame
df_meta = pd.DataFrame(metadata_list)
df_meta

### Scrape the full content of PDF files from links in JSONs

In [None]:
import json, time, hashlib, requests, os
from PyPDF2 import PdfReader
from io import BytesIO
import jsonlines
from tqdm import tqdm
import re
from datetime import datetime

# INPUT: Load results JSON
with open("/Users/lgfolder/Downloads/pdf_results.json", "r") as f:
    scraped_records = json.load(f)

# OUTPUT paths
jsonl_path = "/Users/lgfolder/Downloads/raw_scraped_metadata_rtw_test.jsonl"
id_register_path = "/Users/lgfolder/Downloads/document_ids_rtw_test.json"

# Convert to lookup by URL
scraped_by_url = {r["pdf_url"]: r for r in scraped_records if "pdf_url" in r}

# Load or initialize ID register
if os.path.exists(id_register_path):
    with open(id_register_path, "r") as f:
        doc_id_register = json.load(f)
else:
    doc_id_register = {}

# Track existing IDs to avoid duplication
existing_ids = set(doc["id"] for doc in doc_id_register.values())

# Utility: Generate short hash-based ID
def compute_doc_id(content):
    """Build a document ID from the first 8 hex digits of the SHA-256 of ``content``.

    Args:
        content: Raw bytes of the document.

    Returns:
        str: ``"doc_"`` followed by eight hexadecimal characters.
    """
    digest = hashlib.sha256(content).hexdigest()
    return "doc_" + digest[:8]

# Utility: Clean string to avoid json serialization issues
def clean_meta(meta):
    """Return a JSON-friendly copy of a PDF metadata mapping.

    Slashes are stripped from both ends of every key and every value other
    than ``None`` is converted with ``str``.
    """
    cleaned = {}
    for key, value in meta.items():
        cleaned[key.strip("/")] = None if value is None else str(value)
    return cleaned

# Utility: Extract meeting date from title or subject
def extract_meeting_date(text):
    """Find the first ``D/M/Y`` or ``D-M-Y`` date in ``text``.

    Day and month may have one or two digits. A two-digit year is assumed to
    lie in the 21st century.

    Args:
        text: Free text such as a PDF title or subject. Anything that is not
            a string yields ``None``.

    Returns:
        str | None: The date formatted as ``YYYY-MM-DD``, or ``None`` when no
        date pattern is present or the numbers do not form a real calendar
        date (e.g. ``31/02/2024``).
    """
    if not isinstance(text, str):
        return None
    match = re.search(r"(\d{1,2})[\-/](\d{1,2})[\-/](\d{2,4})", text)
    if not match:
        return None
    d, m, y = match.groups()
    if len(y) == 2:
        y = '20' + y  # assume 21st century
    try:
        return datetime(int(y), int(m), int(d)).strftime("%Y-%m-%d")
    except ValueError:
        # Pattern matched but is not a valid calendar date.
        return None

# Classify document type from title or URL
def classify_doc_type(title, url):
    """Label a document as Agenda, Minutes or Report from its title or URL.

    Keywords are matched case-insensitively and checked in that order, so a
    document mentioning both "agenda" and "minute" is an Agenda.

    Returns:
        str: ``"Agenda"``, ``"Minutes"``, ``"Report"`` or ``"Unknown"``.
    """
    title_lc = (title or "").lower()
    url_lc = url.lower()
    keyword_labels = (
        ("agenda", "Agenda"),
        ("minute", "Minutes"),
        ("report", "Report"),
    )
    for keyword, label in keyword_labels:
        if keyword in title_lc or keyword in url_lc:
            return label
    return "Unknown"

# Main extractor
def extract_pdf_metadata(url):
    """Download one PDF and build its metadata/text record.

    The record combines the PDF's own metadata, text statistics and the
    fields the scraper stored for this URL in ``scraped_by_url``.

    Args:
        url: URL of the PDF to process.

    Returns:
        tuple: ``(record, doc_id)`` on success; ``(None, None)`` when the
        content hash is already in ``existing_ids``; and
        ``({"source_url": url, "error": <message>}, None)`` on any failure.
    """
    try:
        response = requests.get(url, stream=True, timeout=20)
        response.raise_for_status()
        content = response.content

        doc_id = compute_doc_id(content)
        if doc_id in existing_ids:
            return None, None  # already processed

        reader = PdfReader(BytesIO(content))
        raw_meta = clean_meta(reader.metadata or {})
        page_count = len(reader.pages)
        # Join pages with a newline so words are not fused across page breaks
        # (which would also skew the word count).
        text = '\n'.join((p.extract_text() or '') for p in reader.pages)
        word_count = len(text.split())

        # Base record
        record = {
            "document_id": doc_id,
            "source_url": url,
            "title": raw_meta.get("Title"),
            "author": raw_meta.get("Author"),
            "creator": raw_meta.get("Creator"),
            "producer": raw_meta.get("Producer"),
            "created": raw_meta.get("CreationDate"),
            "modified": raw_meta.get("ModDate"),
            "subject": raw_meta.get("Subject"),
            "keywords": raw_meta.get("Keywords"),
            "pages": page_count,
            "filesize_bytes": len(content),
            "text": text.strip(),
            "word_count": word_count,
            "char_count": len(text),
            "avg_words_per_page": round(word_count / max(1, page_count), 2),
            "error": None  # placeholder for error tracking
        }

        # Scraping step 1 integration
        scraped = scraped_by_url.get(url, {})
        record.update({
            "page_url": scraped.get("page_url"),
            "topic": scraped.get("topic"),
            "committee": scraped.get("committee"),
            "meeting_date": scraped.get("date"),
            "meeting_time": scraped.get("time")
        })

        # Fallback date extraction. "title"/"subject" are always present in
        # the record but may be None, so a .get() default never applies;
        # drop the Nones explicitly before joining.
        if not record["meeting_date"]:
            title_and_subject = " ".join(
                part for part in (record["title"], record["subject"]) if part
            )
            record["meeting_date"] = extract_meeting_date(title_and_subject)

        # Document classification
        record["document_type"] = classify_doc_type(record.get("title"), url)

        return record, doc_id

    except Exception as e:
        # Boundary handler: the caller decides what to do with failed URLs.
        return {"source_url": url, "error": str(e)}, None

# Collect and process all unique PDF URLs (case-insensitive extension check,
# tolerant of records whose pdf_url is missing or None)
pdf_urls = list({
    entry["pdf_url"]
    for entry in scraped_records
    if (entry.get("pdf_url") or "").lower().endswith(".pdf")
})

try:
    with jsonlines.open(jsonl_path, mode="a") as writer:
        for url in tqdm(pdf_urls):
            record, doc_id = extract_pdf_metadata(url)
            if record and doc_id:
                writer.write(record)
                doc_id_register[url] = {"id": doc_id}
                # Remember the ID so identical content served under another
                # URL is not written twice within this run.
                existing_ids.add(doc_id)
            elif record and record.get("error"):
                # Surface failures instead of dropping them silently.
                print(f"Failed to process {url}: {record['error']}")
            time.sleep(2.0)
finally:
    # Save document ID register even if the loop is interrupted, so records
    # already appended to the JSONL file are not re-processed next time.
    with open(id_register_path, "w") as f:
        json.dump(doc_id_register, f, indent=2)


In [None]:
import pandas as pd
import jsonlines

# Path to your metadata file

# Load all lines
with jsonlines.open(jsonl_path) as reader:
    records = list(reader)

# Remove the 'text' field from each record
for r in records:
    r.pop("text", None)

# Create and display DataFrame
df = pd.DataFrame(records)

In [None]:
df.info()

In [None]:
df.sample(5)

### EDA of metadata

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import jsonlines
import re
from datetime import datetime

# Load enriched metadata
#jsonl_path = "/Users/lgfolder/Downloads/raw_scraped_metadata_rtw_test_enriched.jsonl"
jsonl_path = '../data/meetings/meetings_metadata.jsonl'
with jsonlines.open(jsonl_path) as reader:
    records = list(reader)

# Drop text field for performance
for r in records:
    r.pop("text", None)

df = pd.DataFrame(records)

In [None]:
df.info()

In [None]:

# === Fix PDF metadata date format (e.g., D:20240313121419+00'00') ===
def clean_pdf_date(date_str):
    """Parse a PDF metadata date such as ``D:20240313121419+00'00'``.

    Only the year is mandatory; month and day default to 1 and hour, minute
    and second default to 0 when the string is truncated (``D:20240313``).
    Any timezone suffix is ignored, so the result is a naive datetime.

    Args:
        date_str: Raw date string from the PDF metadata.

    Returns:
        datetime | None: The parsed timestamp, or ``None`` when the value is
        not a string, lacks the ``D:`` prefix, or is not a valid date.
    """
    if not isinstance(date_str, str) or not date_str.startswith("D:"):
        return None
    match = re.match(r"D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?", date_str)
    if not match:
        return None
    y, m, d, h, mi, s = match.groups()
    try:
        return datetime(
            int(y), int(m or 1), int(d or 1),
            int(h or 0), int(mi or 0), int(s or 0),
        )
    except ValueError:
        # Digits present but out of range (e.g. month 13).
        return None

df["created_parsed"] = df["created"].apply(clean_pdf_date)
df["year_month"] = df["created_parsed"].dt.to_period("M")

# === Parse meeting_date to YYYY-MM-DD ===
df["meeting_date_parsed"] = pd.to_datetime(df["meeting_date"], errors="coerce", dayfirst=True)

# === Additional prep ===
df["author_clean"] = df["author"].fillna("Unknown")
df["text_length"] = df["char_count"]  # Already computed earlier
df["pages"] = pd.to_numeric(df["pages"], errors="coerce")

# === Visualizations ===

# 1. Creation date distribution
if df["year_month"].notna().any():
    plt.figure(figsize=(10, 4))
    df["year_month"].value_counts().sort_index().plot(kind="bar")
    plt.title("PDFs by Creation Date (Year-Month)")
    plt.xlabel("Creation Date")
    plt.ylabel("Document Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# 2. Top 10 authors
plt.figure(figsize=(8, 4))
df["author_clean"].value_counts().head(10).plot(kind="bar")
plt.title("Top 10 Authors")
plt.xlabel("Author")
plt.tight_layout()
plt.show()

# 3. Text length distribution
plt.figure(figsize=(8, 4))
df["text_length"].plot(kind="hist", bins=30)
plt.title("Distribution of Text Lengths")
plt.xlabel("Characters")
plt.tight_layout()
plt.show()

# 4. Page count distribution
plt.figure(figsize=(8, 4))
df["pages"].plot(kind="hist", bins=20)
plt.title("Distribution of Document Page Counts")
plt.xlabel("Pages")
plt.tight_layout()
plt.show()

# 5. Document type
if "document_type" in df:
    plt.figure(figsize=(6, 4))
    df["document_type"].value_counts().plot(kind="bar")
    plt.title("Document Types")
    plt.tight_layout()
    plt.show()

# 6. Committee counts
if "committee" in df:
    plt.figure(figsize=(8, 4))
    df["committee"].value_counts().head(10).plot(kind="bar")
    plt.title("Top 10 Committees")
    plt.tight_layout()
    plt.show()

# 7. Numeric density
if "number_density" in df:
    plt.figure(figsize=(6, 4))
    df["number_density"].plot(kind="hist", bins=30)
    plt.title("Numeric Density Distribution")
    plt.xlabel("Numbers per Word")
    plt.tight_layout()
    plt.show()

# 8. Table-heavy flag
if "is_table_heavy" in df:
    plt.figure(figsize=(4, 4))
    df["is_table_heavy"].value_counts().plot(kind="bar")
    plt.title("Table-Heavy Documents")
    plt.xlabel("True / False")
    plt.tight_layout()
    plt.show()

In [None]:
# Group by parsed meeting date
if df["meeting_date_parsed"].notna().any():
    plt.figure(figsize=(10, 4))
    df["meeting_date_parsed"].value_counts().sort_index().plot(kind="bar")
    plt.title("Distribution of Documents by Meeting Date")
    plt.xlabel("Meeting Date")
    plt.ylabel("Number of Documents")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("⚠️ No valid meeting_date found.")

In [None]:
df["meeting_month"] = df["meeting_date_parsed"].dt.to_period("M")
df["meeting_month"].value_counts().sort_index().plot(kind="bar", figsize=(10, 4))
plt.title("Documents by Meeting Month")
plt.xlabel("Meeting Month")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Maidstone (encoded)

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs, unquote
import base64
import time
import json
import os
import re

# Load the CSV file
df = pd.read_csv("/Users/lgfolder/Downloads/maidstone.csv")

# Set for visited URLs to prevent duplicates
visited = set()

# Path to results file
results_path = "/Users/lgfolder/Downloads/pdf_results_maidstone.json"
if os.path.exists(results_path):
    with open(results_path, "r") as f:
        results_log = json.load(f)
else:
    results_log = []

# Helper: Check if a URL points to a PDF
def is_pdf(url):
    """Return True when ``url`` points at a ``.pdf`` resource.

    The comparison is case-insensitive. Besides the plain "URL ends in .pdf"
    case, the path component alone is checked, so links that carry a query
    string or fragment (``.../report.pdf?T=9``) are recognised as PDFs too.

    Args:
        url: Absolute URL string.

    Returns:
        bool: True if the whole URL or its path component ends in ``.pdf``.
    """
    if url.lower().endswith(".pdf"):
        return True
    # Ignore "?query" and "#fragment" parts when looking at the extension.
    return urlparse(url).path.lower().endswith(".pdf")

# Helper: Allow only ieListDocuments URLs with CId and MId (order-insensitive)
def is_allowed_ieListDocuments_url(url):
    """Tell whether ``url`` is a meeting-documents page that may be crawled.

    A URL qualifies when it is a string, its path contains
    ``ieListDocuments.aspx`` and its query string carries both a ``CId`` and
    an ``MId`` parameter, in any order.

    Returns:
        bool: True for an allowed URL, False otherwise (including non-strings).
    """
    if isinstance(url, str):
        parts = urlparse(url)
        if "ieListDocuments.aspx" in parts.path:
            params = parse_qs(parts.query)
            return all(name in params for name in ("CId", "MId"))
    return False

# Helper: Check if it's a valid starting page (meeting list or embedded list)
def is_valid_start_url(url):
    """Return True for an http(s) string naming a meeting list or an embedded list.

    Accepted are URLs containing ``ieListMeetings.aspx`` or the
    ``sq_content_src`` parameter name; non-strings are rejected.
    """
    if not (isinstance(url, str) and url.startswith("http")):
        return False
    return "ieListMeetings.aspx" in url or "sq_content_src" in url

# Decode Maidstone-style embedded links

def extract_embedded_url(url):
    """Decode the real target URL hidden in a ``sq_content_src`` parameter.

    The parameter value is expected to be a ``+`` marker followed by the
    base64 encoding of a query string whose ``url`` field holds the target.

    Args:
        url: URL that may carry a ``sq_content_src`` query parameter.

    Returns:
        str | None: The embedded URL, or ``None`` when the parameter is
        absent, cannot be decoded, or contains no ``url`` field.
    """
    parsed = urlparse(url)
    qs = parse_qs(parsed.query)
    sq_encoded = qs.get("sq_content_src", [None])[0]
    if not sq_encoded:
        return None

    # parse_qs turns every literal "+" into a space. Base64 never contains
    # spaces, so map them back before decoding.
    payload = sq_encoded.replace(" ", "+")
    # Drop the leading "+" marker only when it is actually there.
    if payload.startswith("+"):
        payload = payload[1:]
    # Tolerate values whose "=" padding was trimmed.
    payload += "=" * (-len(payload) % 4)

    try:
        decoded = base64.b64decode(payload).decode("utf-8")
    except ValueError:
        # Covers invalid base64 (binascii.Error) and non-UTF-8 payloads.
        return None
    real_url = parse_qs(decoded).get("url", [None])[0]
    return unquote(real_url) if real_url else None

# Crawl PDFs from a specific meeting document page
def crawl_ieListDocuments_page(url, topic, source_url, depth=0, max_depth=2):
    """Collect the PDF links found on one ``ieListDocuments.aspx`` page.

    Every new PDF link is appended to the module-level ``results_log``
    together with the committee, date and time parsed from the page heading
    (all ``None`` when no heading matches). ``results_path`` is rewritten once
    per page that yielded at least one PDF, so progress is still persisted
    page by page without re-serialising the whole log for every link.

    Args:
        url: Absolute URL of the meeting documents page.
        topic: Topic label copied into every logged record.
        source_url: Meeting-list URL this page was reached from.
        depth: Current crawl depth; pages with ``depth > max_depth`` are skipped.
        max_depth: Deepest level that is still crawled.

    Returns:
        set: PDF URLs found on the page. Empty when the page is skipped (too
        deep, already in ``visited``, not an allowed URL) or cannot be fetched.
    """
    pdf_links = set()
    if depth > max_depth or url in visited:
        return pdf_links

    if not is_allowed_ieListDocuments_url(url):
        return pdf_links

    visited.add(url)
    print(f"Crawling: {url} (depth={depth})")

    try:
        resp = requests.get(url, timeout=10)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return pdf_links
    if resp.status_code != 200:
        # Report the skip instead of dropping the page silently.
        print(f"Skipping {url}: HTTP {resp.status_code}")
        return pdf_links

    soup = BeautifulSoup(resp.text, "html.parser")

    # Try to extract committee, date, and time from a heading
    committee = date = time_ = None
    heading_text = soup.find(string=re.compile(r" - .*\d{1,2}.*\d{4}"))
    if heading_text:
        match = re.match(r"(?P<committee>.+?) - .*?(?P<date>\d{1,2}(?:st|nd|rd|th)?\s+\w+,\s+\d{4})\s+(?P<time>\d{1,2}\.\d{2}\s*[ap]m)", heading_text.strip(), re.IGNORECASE)
        if match:
            committee = match.group("committee").strip()
            date = match.group("date").strip()
            time_ = match.group("time").strip()

    # Anchors are walked last-to-first, which fixes the order of logged records.
    # No sleep here: inspecting an anchor makes no network request.
    for a_tag in reversed(soup.find_all("a", href=True)):
        link = urljoin(url, a_tag['href'])
        if is_pdf(link) and link not in pdf_links:
            pdf_links.add(link)
            results_log.append({
                "topic": topic,
                "source_url": source_url,
                "page_url": url,
                "pdf_url": link,
                "committee": committee,
                "date": date,
                "time": time_
            })

    # Persist once per page rather than once per PDF.
    if pdf_links:
        with open(results_path, "w") as f:
            json.dump(results_log, f, indent=2)

    return pdf_links

# Top-level crawler: start from meeting list, extract child meeting document pages
def crawl_from_meeting_list(start_url, topic):
    """Crawl every allowed meeting-documents page linked from a meeting list.

    When ``start_url`` wraps the real list in a ``sq_content_src`` parameter,
    the embedded URL is resolved first and used for the rest of the crawl.

    Args:
        start_url: URL of the meeting-list page (plain or embedded).
        topic: Topic label forwarded to ``crawl_ieListDocuments_page``.

    Returns:
        set: Union of the PDF URLs returned for each linked documents page;
        empty when the list page cannot be fetched or does not answer 200.
    """
    found = set()

    # Swap in the decoded target when the start URL is an embedding wrapper.
    resolved = extract_embedded_url(start_url)
    if resolved:
        print(f"Resolved embedded URL: {resolved}")
        start_url = resolved

    print(f"Scanning meeting list: {start_url}")
    try:
        listing = requests.get(start_url, timeout=10)
    except Exception as err:
        print(f"Failed to fetch meeting list page {start_url}: {err}")
        return found
    if listing.status_code != 200:
        return found

    page = BeautifulSoup(listing.text, "html.parser")
    for anchor in page.find_all("a", href=True):
        target = urljoin(start_url, anchor['href'])
        if not is_allowed_ieListDocuments_url(target):
            continue
        found |= crawl_ieListDocuments_page(target, topic, start_url)
        # Short pause between consecutive documents pages.
        time.sleep(0.25)

    return found

# Run crawler for each topic
pdf_map = {}
for _, row in df.iterrows():
    topic = row.get('topic')
    start_url = row.get('mother_url')

    if not is_valid_start_url(start_url):
        print(f"Skipping (not allowed): {start_url}")
        continue

    # Forget previously visited pages so each start URL is crawled in full.
    visited.clear()
    pdf_links = crawl_from_meeting_list(start_url, topic)
    # A topic can appear on several CSV rows: merge with what earlier rows
    # found instead of overwriting it.
    pdf_map[topic] = list(set(pdf_map.get(topic, [])) | pdf_links)
    print(f"Saved {len(pdf_links)} PDFs for topic: {topic}\n")

# The pdf_map now holds topic -> [list of PDF URLs]


## KCC Meeting metadata EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)

# Load JSONL file into a DataFrame
file_path = '../data/meetings/meetings_metadata.jsonl'
data = pd.read_json(file_path, lines=True)

# Show basic info
data.info()

#### Loading the metadata and examining the scraping results

In [None]:
# Show the first few rows
#data.sample(1)

Removing duplicates

In [None]:
# Find duplicated web_meeting_codes
duplicate_codes = data[data['web_meeting_code'].duplicated(keep=False)]

# Sort and select specific columns
duplicate_codes = duplicate_codes.sort_values('web_meeting_code')[
    ['web_meeting_code', 'scrape_timestamp', 'meeting_title', 'meeting_status', 'meeting_date']
]
duplicate_codes

In [None]:
# Drop duplicates by web_meeting_code, keeping the last entry
data = data.sort_values('scrape_timestamp')  # Ensure correct ordering
data = data.drop_duplicates(subset='web_meeting_code', keep='last')

In [None]:
assert data['web_meeting_code'].duplicated().sum() == 0  # Should be 0

Checking for missing values

In [None]:
# Column names
print("Columns:", data.columns.tolist())

# Null values
print("\nMissing values per column:\n", data.isnull().sum())

# Unique values per column
print("\nUnique values per column:")
for col in data.columns:
    try:
        unique_count = data[col].nunique()
        print(f"{col}: {unique_count}")
    except TypeError:
        print(f"{col}: ❌ Cannot count unique values (unhashable type like list or dict)")
        
# If 'meeting_date' exists and is string, convert to datetime
if 'meeting_date' in data.columns:
    data['meeting_date'] = pd.to_datetime(data['meeting_date'], errors='coerce')
    print("\nDate range:", data['meeting_date'].min(), "to", data['meeting_date'].max())

In [None]:
# Convert meeting_date to datetime if not already
data['meeting_date'] = pd.to_datetime(data['meeting_date'], errors='coerce')

# Drop rows where meeting_date is NaT
data = data.dropna(subset=['meeting_date'])

In [None]:
codes = data['web_meeting_code'].dropna().astype(int)
code_range = pd.Series(range(codes.min(), codes.max() + 1))

missing_codes = code_range[~code_range.isin(codes)]
print(f"Missing codes: {len(missing_codes)}")

plt.figure(figsize=(12, 4))
codes.plot(kind='hist', bins=100)
plt.title('Distribution of Scraped web_meeting_code')
plt.xlabel('web_meeting_code')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert to datetime
data['scrape_timestamp'] = pd.to_datetime(data['scrape_timestamp'], errors='coerce')

# Floor to minute. 'min' is the supported alias; 'T' is deprecated in
# pandas 2.2+ and emits a FutureWarning.
data['scrape_minute'] = data['scrape_timestamp'].dt.floor('min')

# Count scrapes per minute
scrape_counts = data['scrape_minute'].value_counts().sort_index()

# Create full range of minutes between first and last scrape
full_range = pd.date_range(start=data['scrape_minute'].min(),
                           end=data['scrape_minute'].max(),
                           freq='min')

# Reindex with full range, fill missing with 0
scrape_counts_full = scrape_counts.reindex(full_range, fill_value=0)

# Plot
plt.figure(figsize=(14, 5))
scrape_counts_full.plot()
plt.title('Scraped Meetings per Minute (with Gaps)')
plt.xlabel('Scrape Time (Minute)')
plt.ylabel('Number of Meetings Scraped')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert and extract meeting_month
data['meeting_date'] = pd.to_datetime(data['meeting_date'], errors='coerce')
data['meeting_month'] = data['meeting_date'].dt.to_period('M')

# Count meetings per month
monthly_counts = data['meeting_month'].value_counts().sort_index()

# Create full continuous monthly index
full_index = pd.period_range(start=data['meeting_month'].min(),
                             end=data['meeting_month'].max(),
                             freq='M')

# Reindex to ensure gaps are shown with 0s
monthly_counts_full = monthly_counts.reindex(full_index, fill_value=0)

# Plot
plt.figure(figsize=(12, 5))
monthly_counts_full.plot(kind='bar')
plt.title('Meetings per Month (Including Gaps)')
plt.xlabel('Meeting Month')
plt.ylabel('Number of Meetings')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
missing_months = data[data['meeting_month'].isna()]
#missing_months.sample(2)

In [None]:
missing_month_count = data['meeting_month'].isna().sum()
print(f"Missing meeting_month values: {missing_month_count}")

In [None]:
missing_dates = data[data['meeting_date'].isna()]
missing_dates[['meeting_id', 'meeting_title', 'committee_name', 'meeting_date']]

In [None]:
# Define the fields you consider essential
essential_fields = ['meeting_title', 'meeting_status', 'committee_name', 'meeting_date', 'meeting_time', 'agenda_items']

# Filter rows where web_meeting_code exists AND any essential field is missing
incomplete_rows = data[
    data['web_meeting_code'].notna() & 
    data[essential_fields].isnull().any(axis=1)
]

print(len(incomplete_rows))
incomplete_rows

In [None]:
rerun = incomplete_rows['web_meeting_code'].to_list()
rerun

In [None]:
import pandas as pd

# Integer meeting codes that were scraped, and every code between min and max
codes = data['web_meeting_code'].dropna().astype(int)
all_codes = pd.Series(range(codes.min(), codes.max() + 1))

# Codes inside that span that were never scraped (ascending, as all_codes is)
missing_codes = all_codes[~all_codes.isin(codes)]

# Collapse runs of consecutive missing codes into (first, last) pairs
gap_ranges = []
if not missing_codes.empty:
    missing_values = missing_codes.tolist()
    run_start = run_end = missing_values[0]
    for value in missing_values[1:]:
        if value != run_end + 1:
            # Run broken: store it and start a new one at this code
            gap_ranges.append((run_start, run_end))
            run_start = value
        run_end = value
    gap_ranges.append((run_start, run_end))  # close the last run

# One row per gap, with its length
missing_df = pd.DataFrame(gap_ranges, columns=['missing_start', 'missing_end'])
missing_df['missing_count'] = missing_df['missing_end'] - missing_df['missing_start'] + 1

missing_df.sort_values(by=['missing_count'], ascending=False).head(15)

In [None]:
# Ensure meeting_date is datetime
data['meeting_date'] = pd.to_datetime(data['meeting_date'], errors='coerce')

# Create a Year-Month string column
data['year_month'] = data['meeting_date'].dt.to_period('M').astype(str)

# Group by year_month and get min, max, and count of web_meeting_code
grouped = data.groupby('year_month')['web_meeting_code'].agg(['min', 'max', 'count']).reset_index()

# Sort by month
grouped = grouped.sort_values('year_month')
grouped.tail(10)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Ensure datetime is parsed
data['meeting_date'] = pd.to_datetime(data['meeting_date'], errors='coerce')

# Extract numeric month (1–12) and month name
data['month'] = data['meeting_date'].dt.month
data['month_name'] = data['meeting_date'].dt.month_name()

# Group by month number and count meetings
monthly_pattern = data.groupby(['month', 'month_name']).size().reset_index(name='count')

# Sort by calendar order
monthly_pattern = monthly_pattern.sort_values('month')

# Plot
plt.figure(figsize=(10, 5))
plt.bar(monthly_pattern['month_name'], monthly_pattern['count'], color='cornflowerblue')
plt.title('Seasonal Meeting Pattern by Calendar Month')
plt.xlabel('Month')
plt.ylabel('Number of Meetings')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Show a few non-empty agenda items
data['agenda_items'].dropna().iloc[0:3].tolist()

#### Cleanup of junk pdf scraping results

In [None]:
import re

def clean_item_title(title):
    """Remove a trailing-style "PDF <size> KB/MB" marker from an agenda item title.

    The marker is matched case-insensitively wherever it occurs, and the
    result is stripped. Non-string values are returned unchanged.
    """
    if not isinstance(title, str):
        return title
    size_marker = r'\s*PDF\s*\d+(\.\d+)?\s*(KB|MB)'
    without_marker = re.sub(size_marker, '', title, flags=re.IGNORECASE)
    return without_marker.strip()

def clean_agenda_items(items):
    """Clean the ``item_title`` of every agenda item dict, in place.

    Args:
        items: A list of agenda items; any other value is returned untouched.
            List entries that are not dicts, or lack ``item_title``, are
            left as they are.

    Returns:
        The same ``items`` object that was passed in.
    """
    if not isinstance(items, list):
        return items
    for entry in items:
        if not isinstance(entry, dict) or 'item_title' not in entry:
            continue
        entry['item_title'] = clean_item_title(entry['item_title'])
    return items

# Apply cleaning to the full DataFrame
data['agenda_items'] = data['agenda_items'].apply(clean_agenda_items)

#### Explore agendas

In [None]:
import pandas as pd

# One row per agenda item: explode the per-meeting lists and discard rows
# whose agenda_items value is missing
agenda_data = (
    data.explode('agenda_items')
    .dropna(subset=['agenda_items'])
    .copy()
)

# Expand each item dict into its own set of columns
agenda_flat = pd.json_normalize(agenda_data['agenda_items'])

# Attach the meeting-level context to every item row (copied by position)
for context_col in ('meeting_id', 'committee_name', 'meeting_date'):
    agenda_flat[context_col] = agenda_data[context_col].values

# Preview
agenda_flat.head()

In [None]:
# Number of agenda item rows per meeting_id, largest first
agenda_flat.groupby('meeting_id').size().sort_values(ascending=False)

In [None]:
# 20 most frequent first words of the (lower-cased) item titles
agenda_flat['item_title'].str.lower().str.extract(r'(\w+)')[0].value_counts().head(20)

In [None]:
# Collect every PDF URL into one flat list; cells that are not lists
# (e.g. missing values) contribute nothing
all_pdfs = []
for url_list in agenda_flat['pdf_urls']:
    if isinstance(url_list, list):
        all_pdfs.extend(url_list)

# A Series makes counting distinct values straightforward
pdf_series = pd.Series(all_pdfs)

# Totals: all URLs, distinct URLs, and the surplus caused by repeats
total_pdfs = len(pdf_series)
unique_pdfs = pdf_series.nunique()
duplicate_pdfs = total_pdfs - unique_pdfs

print(f"Total PDF URLs: {total_pdfs}")
print(f"Unique PDF URLs: {unique_pdfs}")
print(f"Duplicate PDF URLs: {duplicate_pdfs}")

### New committee cleanup approach

In [None]:
# Count occurrences of each committee_name; dropna=False keeps a separate
# count for rows where the name is missing
committee_counts = data['committee_name'].value_counts(dropna=False)

# Display the last ten entries (value_counts sorts by count, descending by
# default, so these are the least frequent names)
committee_counts.tail(10)

In [None]:
# One row per distinct, non-null committee name
committee_series = data['committee_name'].dropna().unique()
df_committee_parts = pd.DataFrame({'original_name': committee_series})

# Split every name at its LAST comma:
#   first_part -> text before that comma (None when the name has no comma)
#   last_part  -> stripped text after it (the whole name when there is no comma)
original_names = df_committee_parts['original_name']
df_committee_parts['first_part'] = original_names.apply(
    lambda name: name.rpartition(',')[0] if ',' in name else None
)
df_committee_parts['last_part'] = original_names.apply(
    lambda name: name.rpartition(',')[2].strip() if ',' in name else name
)

df_committee_parts.tail(3)

In [None]:
# Define a function to extract the new committee name
def extract_last_part(name):
    """Return the stripped text after the last comma in ``name``.

    Missing values (as judged by ``pd.isna``) and names without a comma are
    returned unchanged.
    """
    if pd.isna(name):
        return name
    _, comma, tail = name.rpartition(',')
    if not comma:
        return name
    return tail.strip()

# Overwrite committee_name in place with the text after its last comma;
# missing names and names without a comma are left as they are
data['committee_name'] = data['committee_name'].apply(extract_last_part)

In [None]:
# Inspect the dtype of every column (nested lists/dicts show up as object)
print(data.dtypes)

# Optional: dump one randomly selected row as a dict to see how its nested
# values are stored (the row differs on every run)
sample = data.sample(1)
print(sample.to_dict())

### New agenda cleanup approach

Cell 1: Clean PDF garbage from titles

In [None]:
import re

# Matches the "PDF <size> KB/MB" residue left in scraped titles (case-insensitive)
PDF_PATTERN = re.compile(r'\s*PDF\s*\d+(\.\d+)?\s*(KB|MB)', re.IGNORECASE)

# Missing titles become '' first so that every value handed to the regex is a string
raw_titles = agenda_flat['item_title'].fillna('')
agenda_flat['item_title'] = raw_titles.apply(
    lambda title: PDF_PATTERN.sub('', title).strip()
)

Cell 2: Tag low-value items instead of dropping

In [None]:
# Regex patterns that flag low-value or boilerplate agenda items.
# Each one is compiled once and reused by is_junk().
JUNK_PATTERNS = [
    re.compile(source)
    for source in (
        r'(?i)\bapologies\b',
        r'(?i)declaration[s]? of (interest|disclosable|inter)',
        r'(?i)date of next meeting',
        r'(?i)exempt items',
        r'(?i)^work programme',
        r'(?i)future meeting',
        r'(?i)^introduction',
        r'(?i)substitutes',
        r'(?i)questions',
        r'(?i)tributes',
        r'(?i)election of',
    )
]


def is_junk(title):
    """Return True when ``title`` matches any pattern in JUNK_PATTERNS.

    The value is converted with ``str()``, lower-cased and stripped before
    matching, so non-string input is matched on its string form.
    """
    normalized = str(title).lower().strip()
    for pattern in JUNK_PATTERNS:
        if pattern.search(normalized):
            return True
    return False

# Flag boilerplate rows in a boolean column instead of removing them
agenda_flat['is_junk'] = agenda_flat['item_title'].apply(is_junk)

Cell 3: Assign fallback item numbers

In [None]:
def _fill_blank_item_numbers(numbers):
    """Replace blank entries with their 1-based position in the group, zero-padded to 3 digits."""
    filled = []
    for i, item in enumerate(numbers):
        filled.append(item if item.strip() else f"{i+1:03d}")
    return filled


# Missing item numbers become '' so they are treated like blank ones
agenda_flat['item_number'] = agenda_flat['item_number'].fillna('')

# Within each meeting, give blank item numbers a positional fallback
agenda_flat['item_number'] = (
    agenda_flat.groupby('meeting_id')['item_number']
    .transform(_fill_blank_item_numbers)
)

Cell 4: Generate agenda_id

In [None]:
# First run of digits in item_number; '000' when there is none; left-padded
# with zeros to at least three characters
first_digits = agenda_flat['item_number'].str.extract(r'(\d+)')[0]
agenda_flat['item_num_clean'] = first_digits.fillna('000').str.zfill(3)

# agenda_id = "<meeting_id>_<item_num_clean>"
meeting_id_text = agenda_flat['meeting_id'].astype(str)
agenda_flat['agenda_id'] = meeting_id_text + '_' + agenda_flat['item_num_clean']

Reorder columns in agenda_flat

In [None]:
# Columns that should come first
lead_cols = ['agenda_id', 'item_num_clean']

# Every remaining column, in its current order
other_cols = [name for name in agenda_flat.columns if name not in lead_cols]

# Rebuild agenda_flat with the leading columns in front
agenda_flat = agenda_flat.loc[:, lead_cols + other_cols]

In [None]:
# Preview the first five rows of agenda_flat
agenda_flat.head(5)

In [None]:
# Assign machine-readable meeting time
#agenda_flat['meeting_date_ts'] = agenda_flat['meeting_date'].astype('int64') // 1_000_000  # milliseconds

In [None]:
# Sort with the most recent meeting_date first. ascending=False applies to both
# keys, so rows sharing a meeting_date are also ordered by agenda_id descending.
# NOTE(review): if items should run in agenda order within a meeting, this needs
# ascending=[False, True] — confirm the intended order.
agenda_flat = agenda_flat.sort_values(by=['meeting_date', 'agenda_id'], ascending=False).reset_index(drop=True)

Cell 5: Save to agenda_items.jsonl

In [None]:
# Write agenda_flat as JSON Lines: one JSON object per row, one row per line.
# NOTE(review): datetime columns are presumably written in pandas' default
# to_json date format — confirm downstream readers expect that.
output_path = '../data/metadata/agenda_items.jsonl'
agenda_flat.to_json(output_path, orient='records', lines=True)

# Report how many rows were written and where
print(f"✅ Saved {len(agenda_flat)} agenda items to: {output_path}")

### Committee and agenda cleaning code

In [None]:
import pandas as pd
import re
from functools import lru_cache
import numpy as np

# Constants and compiled patterns (compiled once at import, reused everywhere)

# "PDF <size> KB/MB" residue appended to scraped titles, case-insensitive
PDF_PATTERN = re.compile(r'\s*PDF\s*\d+(\.\d+)?\s*(KB|MB)', re.IGNORECASE)

# Titles that begin with "minutes of the meeting" or "minutes for"
MINUTES_PATTERN = re.compile(r'(?i)^minutes of the meeting|^minutes for')

# Titles matching any of these are treated as boilerplate agenda items
JUNK_PATTERNS = [
    re.compile(source)
    for source in (
        r'(?i)apologi(es|s)',
        r'(?i)declaration[s]? of (interest|disclosable|inter)',
        r'(?i)date of next meeting',
        r'(?i)exempt items',
        r'(?i)minutes',
        r'(?i)^work programme',
        r'(?i)future meeting',
        r'(?i)^introduction',
        r'(?i)substitutes',
        r'(?i)questions',
        r'(?i)tributes',
        r'(?i)election of',
    )
]

# Committee normalization setup.
# Maps a lower-case keyword to the canonical committee name. Lookup is by
# substring and the FIRST matching entry wins, so insertion order matters:
# e.g. 'standing advisory council on religious' must stay ahead of 'education'.
keyword_map = {
    'county council': 'County Council',
    'kent and medway police and crime panel': 'Kent and Medway Police and Crime Panel',
    'select committee': 'Select Committee',
    'member development': 'Member Development',
    'regulation committee appeal': 'Regulation Committee Appeal Panel (Transport)',
    'standing advisory council on religious': 'Standing Advisory Council on Religious Education (SACRE)',
    'adult social care': 'Adult Social Care Cabinet Committee',
    'pension': 'Pension Fund Committee',
    'education': "Children's, Young People and Education Cabinet Committee",
    'personnel': "Personnel Committee",
    'regulation committee member panel': 'Regulation Committee',
    "children's social care and health cabinet committee": "Children's, Young People and Education Cabinet Committee",
    'environment & transport cabinet committee': "Environment & Transport Cabinet Committee",
    'governance and audit committee': "Governance and Audit Committee",
    'nhs joint': "Kent and Medway NHS Joint Overview and Scrutiny Committee",
    'standards committee': "Standards Committee",
    'wellbeing board': "Kent Health and Wellbeing Board",
    'scrutiny committee': "Scrutiny Committee",
    'governor appointments': "Governor Appointments Panel",
    # Shadowed by 'regulation committee appeal' above (same canonical name)
    'regulation committee appeal panel': "Regulation Committee Appeal Panel (Transport)",
    "mental health guardianship": "Regulation Committee"
}


@lru_cache(maxsize=500)
def normalize_committee(name):
    """Map a raw committee name to its canonical form.

    The name is lower-cased and checked against the keys of ``keyword_map``
    in insertion order; the canonical value of the first key contained in the
    name is returned. Names without a match, and non-string values, are
    returned unchanged. Results are memoised, so ``name`` must be hashable.
    """
    if isinstance(name, str):
        lowered = name.lower()
        return next(
            (
                canonical
                for keyword, canonical in keyword_map.items()
                if keyword in lowered
            ),
            name,
        )
    return name

# Cleaning functions
def clean_single_item(item):
    """Return a copy of an agenda item with PDF size residue removed from its title.

    Args:
        item: One agenda item, normally a dict that may carry an
            'item_title' entry.

    Returns:
        ``item`` itself when it is not a dict. Otherwise a new dict with the
        same entries in which a string 'item_title' has every PDF_PATTERN
        match removed and surrounding whitespace stripped. A missing title
        becomes ''. A title that is present but not a string (for example a
        null from the scrape) is kept as-is rather than raising TypeError.
    """
    if not isinstance(item, dict):
        return item

    title = item.get('item_title', '')
    if not isinstance(title, str):
        # Pattern.sub only accepts strings; pass null/odd titles through
        # untouched, but still return a copy like the normal path does.
        return dict(item)

    return {**item, 'item_title': PDF_PATTERN.sub('', title).strip()}

def is_junk_item(item):
    """Decide whether an agenda item should be discarded as boilerplate.

    Args:
        item: One agenda item, normally a dict with an 'item_title' entry.

    Returns:
        bool: True when ``item`` is not a dict, when its title is missing or
        blank, or when the lower-cased title matches any of JUNK_PATTERNS or
        MINUTES_PATTERN; False otherwise. The result is always a real bool
        (never a match object or None), so it is safe to store or compare.
    """
    if not isinstance(item, dict):
        return True

    title = str(item.get('item_title', '')).lower()
    if not title.strip():
        return True
    if any(pattern.search(title) for pattern in JUNK_PATTERNS):
        return True
    # Pattern.search returns a match object or None; collapse it to a bool.
    return bool(MINUTES_PATTERN.search(title))

def filter_junk_items(items):
    """Return the agenda items that ``is_junk_item`` does not reject.

    Anything that is not a list yields an empty list. The input list itself
    is never modified; a new list is returned with the original order kept.
    """
    if not isinstance(items, list):
        return []
    kept = []
    for candidate in items:
        if is_junk_item(candidate):
            continue
        kept.append(candidate)
    return kept

def clean_data_chunk(chunk):
    """Return a copy of ``chunk`` whose agenda_items have junk entries filtered out.

    Written as a per-chunk helper; ``chunk`` itself is left unmodified because
    ``DataFrame.assign`` builds a new frame.
    """
    filtered_items = chunk['agenda_items'].apply(filter_junk_items)
    return chunk.assign(agenda_items=filtered_items)

# Main cleaning pipeline
def clean_data(file_path):
    """Load the meetings JSON Lines file and return a cleaned DataFrame.

    Steps, in order:
      1. Read ``file_path`` with one JSON record per line.
      2. Parse 'scrape_timestamp' and 'meeting_date' as datetimes
         (unparseable values become NaT).
      3. Strip PDF size residue from every agenda item title.
      4. Normalise committee names, then drop committees whose name
         contains "forum" (case-insensitive).
      5. Remove junk agenda items.
      6. Drop meetings left without agenda items and meetings without a
         committee name.

    Args:
        file_path: Path to the JSON Lines file to load.

    Returns:
        pandas.DataFrame: the cleaned meetings with a fresh 0..n-1 index.
    """
    # Load data
    df = pd.read_json(file_path, lines=True)

    # Convert dates
    df['scrape_timestamp'] = pd.to_datetime(df['scrape_timestamp'], errors='coerce')
    df['meeting_date'] = pd.to_datetime(df['meeting_date'], errors='coerce')

    # Clean agenda items (cells that are not lists are left as they are)
    df['agenda_items'] = df['agenda_items'].apply(
        lambda x: [clean_single_item(i) for i in x] if isinstance(x, list) else x
    )

    # Normalize committees, then exclude forums
    df['committee_name'] = df['committee_name'].apply(normalize_committee)
    is_forum = df['committee_name'].str.contains('forum', case=False, na=False)
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the loaded data (SettingWithCopyWarning).
    df = df[~is_forum].copy()

    # Filter junk items (sequential version)
    df['agenda_items'] = df['agenda_items'].apply(filter_junk_items)

    # Final filtering: keep meetings that still have items and a committee name
    has_items = df['agenda_items'].str.len() > 0
    has_committee = df['committee_name'].notna()
    return df[has_items & has_committee].reset_index(drop=True)

# View agenda items function
def get_committee_agenda_items(df, committee_name):
    """Return one row per agenda item for the meetings of a single committee.

    Args:
        df: Meetings DataFrame with the columns 'committee_name',
            'meeting_id', 'meeting_date', 'meeting_title' and 'agenda_items'
            (a list of item dicts per meeting).
        committee_name: Committee to select; compared case-insensitively
            against 'committee_name'.

    Returns:
        pandas.DataFrame: the flattened agenda items with the meeting context
        columns and a 'num_pdfs' count, sorted by 'meeting_date' and then
        'item_number'. An empty DataFrame is returned (and a message printed)
        when no meeting matches. Items without a 'pdf_urls' list get
        ``num_pdfs == 0``, also when no item has that key at all.
    """
    is_match = df['committee_name'].str.lower() == committee_name.lower()
    committee_meetings = df[is_match].copy()

    if committee_meetings.empty:
        print(f"No meetings found for committee: {committee_name}")
        return pd.DataFrame()

    agenda_data = committee_meetings.explode('agenda_items').dropna(subset=['agenda_items'])
    # json_normalize is given a plain list of dicts, so the flattened frame
    # gets a fresh positional index that lines up with agenda_data row by row.
    agenda_flat = pd.json_normalize(agenda_data['agenda_items'].tolist())

    # Add meeting context (positional copy, hence .values)
    for col in ('meeting_id', 'committee_name', 'meeting_date', 'meeting_title'):
        agenda_flat[col] = agenda_data[col].values

    # 'pdf_urls' only exists when at least one item carried that key.
    if 'pdf_urls' in agenda_flat.columns:
        agenda_flat['num_pdfs'] = agenda_flat['pdf_urls'].apply(
            lambda x: len(x) if isinstance(x, list) else 0
        )
    else:
        agenda_flat['num_pdfs'] = 0

    # Sort on the keys that are actually present; 'item_number' can be absent
    # for the same reason as 'pdf_urls'.
    # NOTE(review): if item_number holds strings, "10" sorts before "2" —
    # confirm that ordering is acceptable.
    sort_keys = [
        key for key in ('meeting_date', 'item_number') if key in agenda_flat.columns
    ]
    return agenda_flat.sort_values(sort_keys).reset_index(drop=True)

# Execute pipeline
if __name__ == "__main__":
    FILE_PATH = '../data/meetings/meetings_metadata.jsonl'
    # Committee used for the example query below
    EXAMPLE_COMMITTEE = "Governance and Audit Committee"

    # Clean the data
    cleaned_data = clean_data(FILE_PATH)

    # Example usage - view the items of one committee.
    # The result keeps the name `cabinet_items` because a later cell reads it,
    # even though the committee queried here is not the Cabinet.
    cabinet_items = get_committee_agenda_items(cleaned_data, EXAMPLE_COMMITTEE)
    # Report the committee that was actually queried
    print(f"Found {len(cabinet_items)} agenda items for {EXAMPLE_COMMITTEE}")

    # Show available committees
    print("\nAvailable committees:")
    print(cleaned_data['committee_name'].unique())

    # Uncomment to view the results
    #print(cabinet_items.head())

In [None]:
# Show two randomly selected rows of the example committee's agenda items
cabinet_items.sample(2)

### Code to delete error rows from meetings_metadata.jsonl (rows that are corrupted or contain errors)

In [None]:
import pandas as pd
import shutil

# Define paths
source_path = '../data/meetings/meetings_metadata.jsonl'
backup_path = '../data/meetings/meetings_metadata_backup.jsonl'

# Safety switch: leave False for a dry run; set True to rewrite source_path.
overwrite_source = False

# Step 1: Create backup copy before anything can modify the original.
# NOTE: shutil.copy replaces an existing backup, so re-running this cell after
# an overwrite would replace the backup with the already-cleaned file.
shutil.copy(source_path, backup_path)
print(f"Backup saved to: {backup_path}")

# Step 2: Load data
data = pd.read_json(source_path, lines=True)

# Step 3: Keep only rows without a recorded error. When no record carries an
# 'error' field the column does not exist and there is nothing to remove.
if 'error' in data.columns:
    cleaned_data = data[data['error'].isna()]
else:
    cleaned_data = data
removed_rows = len(data) - len(cleaned_data)

# Step 4: Overwrite the original file only when explicitly enabled, and report
# what actually happened.
if overwrite_source:
    cleaned_data.to_json(source_path, orient='records', lines=True)
    print(f"Cleaned file saved to: {source_path} ({removed_rows} error rows removed)")
else:
    print(f"Dry run: {removed_rows} error rows would be removed; {source_path} was not modified")