### Mass processing

In [22]:
import os
import json
import hashlib
from pathlib import Path
import openai
import pdfplumber
import toml
from tqdm import tqdm

# === CONFIG ===
DOCS_ROOT = Path("../data/council_documents")
OUTPUT_DIR = Path("../data/jsons")
PROMPT_PATH = Path("../prompts/mom_summarisation_v2.prompt")
MODEL = "gpt-4o"
TEMPERATURE = 0.2

# Load OpenAI API key from .streamlit/secrets.toml
secrets = toml.load("../.streamlit/secrets.toml")
openai.api_key = secrets["OPENAI_API_KEY"]

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def find_all_pdfs(base_dir):
    return list(base_dir.glob("**/originals/*.pdf"))

def hash_path(p):
    return hashlib.md5(str(p).encode("utf-8")).hexdigest()

def extract_text(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return "\n\n".join(page.extract_text() or "" for page in pdf.pages).strip()

def load_prompt(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n", 1)
        return lines[0], lines[1]  # system_prompt, user_template

def generate_summary(text, system_prompt, user_template):
    user_prompt = user_template.replace("{{TRANSCRIPT}}", text)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=TEMPERATURE
    )
    return response.choices[0].message.content

def strip_json_block(raw):
    return raw.strip().removeprefix("```json").removesuffix("```").strip()

def main():
    system_prompt, user_template = load_prompt(PROMPT_PATH)
    pdfs = find_all_pdfs(DOCS_ROOT)

    print(f"🗂 Found {len(pdfs)} PDFs in 'originals' subfolders")

    for pdf_path in tqdm(pdfs, desc="Processing"):
        # Generate doc_id and output path based on PDF hash
        doc_id = "doc_" + hash_path(pdf_path)[:8]
        doc_out_path = OUTPUT_DIR / f"{doc_id}.json"
        if doc_out_path.exists():
            continue  # skip already processed

        try:
            text = extract_text(pdf_path)
            if not text.strip():
                print(f"⚠️ Skipping empty PDF: {pdf_path}")
                continue
            raw_output = generate_summary(text, system_prompt, user_template)
            cleaned = strip_json_block(raw_output)
            parsed = json.loads(cleaned)
            parsed["doc_id"] = doc_id

            with open(doc_out_path, "w", encoding="utf-8") as f:
                json.dump(parsed, f, indent=2)
            # If desired, comment out or remove the old out_path block above to avoid duplicate saving
            # (Old out_path logic removed)
        except Exception as e:
            print(f"❌ Error in {pdf_path}: {e}")

if __name__ == "__main__":
    main()

🗂 Found 432 PDFs in 'originals' subfolders


Processing:   0%|          | 0/432 [00:00<?, ?it/s]CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
Processing:   0%|          | 1/432 [00:08<1:04:32,  8.99s/it]

❌ Error in ../data/council_documents/cabinet/2024-11-28/originals/Appendix 3 - EQIA.pdf: Expecting value: line 1 column 1 (char 0)


Processing:   0%|          | 2/432 [00:26<1:40:02, 13.96s/it]CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


❌ Error in ../data/council_documents/cabinet/2024-11-28/originals/Appendix 2 - Environmental Achievements 23-24.pdf: Expecting value: line 1 column 1 (char 0)


Processing:   0%|          | 2/432 [00:31<1:53:56, 15.90s/it]


KeyboardInterrupt: 

### Testing on one pdf

In [20]:
import pdfplumber
import openai
import json
import toml

# === CONFIG ===
PDF_PATH = "../data/council_documents/full_council/2025-03-13/originals/Printed minutes 13th-Mar-2025 10.00 County Council.pdf"
PROMPT_PATH = "../prompts/mom_summarisation_v2.prompt"
MODEL = "gpt-4o"
TEMPERATURE = 0.2

# === LOAD API KEY FROM secrets.toml ===
secrets = toml.load("../.streamlit/secrets.toml")
openai.api_key = secrets["OPENAI_API_KEY"]

# === EXTRACT TEXT FROM PDF ===
with pdfplumber.open(PDF_PATH) as pdf:
    text = "\n\n".join(page.extract_text() or "" for page in pdf.pages).strip()

# === LOAD PROMPT ===
with open(PROMPT_PATH, "r", encoding="utf-8") as f:
    lines = f.read().split("\n", 1)
    system_prompt = lines[0]
    user_prompt_template = lines[1]
    user_prompt = user_prompt_template.replace("{{TRANSCRIPT}}", text)

# === CALL OPENAI API ===
response = openai.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ],
    temperature=TEMPERATURE
)

# === PARSE RESPONSE ===
raw = response.choices[0].message.content
print("🔹 Full Raw Output:\n", raw)

try:
    parsed = json.loads(raw.strip().removeprefix("```json").removesuffix("```").strip())
    parsed  # This will display nicely in most notebooks
except Exception as e:
    print("❌ Failed to parse JSON:", e)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

🔹 Full Raw Output:
 # Kent County Council Meeting Summary - 13 March 2025

## Key Decisions and Discussions

- **Kent Minerals and Waste Local Plan 2024-2039**
  - The council adopted the updated plan, which outlines waste management and mineral extraction strategies for the next 15 years. This plan aims to ensure sustainable development and environmental protection. The motion passed with 52 votes in favor and 11 abstentions.

- **Commissioned Family Hubs**
  - A motion was passed to continue funding Seashells and Millmead from the Family Hub Grant to maximize service benefits. The council decided not to reconsider the decision, despite a call-in by the Scrutiny Committee.

- **Disposable Vapes**
  - The council agreed to support a comprehensive ban on disposable vapes, effective from June 2025. The motion emphasized the need for robust regulations to prevent environmental and health risks, and called for government action to close legal loopholes. The motion passed with 34 votes in f

In [21]:
# === PARSE RESPONSE ===
raw = response.choices[0].message.content
print("🔹 Full Raw Output:\n", raw)

# No JSON parsing required
# If needed, you can optionally write `raw` to a Markdown or text file
# For example:
# with open("council_summary.md", "w", encoding="utf-8") as f:
#     f.write(raw)

🔹 Full Raw Output:
 # Kent County Council Meeting Summary - 13 March 2025

## Key Decisions and Discussions

- **Kent Minerals and Waste Local Plan 2024-2039**
  - The council adopted the updated plan, which outlines waste management and mineral extraction strategies for the next 15 years. This plan aims to ensure sustainable development and environmental protection. The motion passed with 52 votes in favor and 11 abstentions.

- **Commissioned Family Hubs**
  - A motion was passed to continue funding Seashells and Millmead from the Family Hub Grant to maximize service benefits. The council decided not to reconsider the decision, despite a call-in by the Scrutiny Committee.

- **Disposable Vapes**
  - The council agreed to support a comprehensive ban on disposable vapes, effective from June 2025. The motion emphasized the need for robust regulations to prevent environmental and health risks, and called for government action to close legal loopholes. The motion passed with 34 votes in f