## Unified Search Page

In [None]:
import pandas as pd
import jsonlines

# Load all files
with jsonlines.open("../data/metadata/documents.jsonl", "r") as reader:
    documents = pd.DataFrame(reader)

with jsonlines.open("../data/pdf_metadata/scraped_pdf_metadata.jsonl", "r") as reader:
    scraped_meta = pd.DataFrame(reader)

with jsonlines.open("../data/pdf_summaries/summaries.jsonl", "r") as reader:
    summaries = pd.DataFrame(reader)

# Drop duplicates early based on doc_id
documents = documents.drop_duplicates(subset="doc_id")
scraped_meta = scraped_meta.drop_duplicates(subset="doc_id")
summaries = summaries.drop_duplicates(subset="doc_id")

# Merge
pdf_merged = documents.merge(scraped_meta, on="doc_id", how="left")
pdf_merged = pdf_merged.merge(summaries, on="doc_id", how="left")

# Optional: drop duplicates again if same doc_id had slight metadata duplication
pdf_merged = pdf_merged.drop_duplicates(subset="doc_id")

# Optional: ensure each URL is unique (if needed)
pdf_merged = pdf_merged.drop_duplicates(subset="url", keep="first")

In [None]:
pdf_merged.info()

In [None]:
pdf_merged["Meeting Date"] = pd.to_datetime(pdf_merged["meeting_date"], unit="ms", errors="coerce").dt.strftime("%-d %B %Y")

pdf_display = pdf_merged[[
    "Meeting Date",
    "committee_name",
    "item_title",
    "doc_category",
    "display_title",
    "summary",
]].rename(columns={
    "committee_name": "Committee",
    "item_title": "Agenda Item",
    "doc_category": "Type",
    "display_title": "Document Title",
    "summary": "Summary"
})

In [None]:
pdf_display

In [None]:
with jsonlines.open("../data/metadata/meetings.jsonl", "r") as reader:
    meetings = pd.DataFrame(reader)

In [None]:
meetings

In [None]:
meetings['meeting_status'].value_counts()

In [None]:
pdf_merged.info()

In [None]:
# Load meetings.jsonl and merge web_meeting_code into pdf_merged
with jsonlines.open("../data/metadata/meetings.jsonl", "r") as reader:
    meetings = pd.DataFrame(reader)

# Only keep columns we need
meetings_trimmed = meetings[["meeting_id", "web_meeting_code"]].drop_duplicates()

# Merge into pdf_merged
pdf_merged = pdf_merged.merge(meetings_trimmed, on="meeting_id", how="left")

In [None]:
import pandas as pd

# Assume your merged dataframe is called pdf_merged
# Make sure it has both meeting_date (converted) and web_meeting_code

# Step 1: Convert Unix timestamp to readable date
pdf_merged["Meeting Date"] = pd.to_datetime(
    pdf_merged["meeting_date"], unit="ms", errors="coerce"
).dt.strftime("%-d %B %Y")

# Step 2: Construct clickable URLs using web_meeting_code
pdf_merged["Meeting Date"] = pdf_merged.apply(
    lambda row: f'<a href="https://democracy.kent.gov.uk/ieListDocuments.aspx?MId={row["web_meeting_code"]}" target="_blank">{row["Meeting Date"]}</a>'
    if pd.notnull(row["web_meeting_code"]) and pd.notnull(row["Meeting Date"]) else row["Meeting Date"],
    axis=1
)

In [None]:
from IPython.display import display, HTML

display(HTML(pdf_merged[["Meeting Date", "doc_id"]].head(10).to_html(escape=False, index=False)))

In [None]:
import pandas as pd
import jsonlines

# Load agenda FAISS metadata
with jsonlines.open("../data/embeddings/agendas/metadata_agenda.jsonl", "r") as reader:
    agenda_hits = pd.DataFrame(reader)

# Load full agenda metadata
with jsonlines.open("../data/metadata/agendas.jsonl", "r") as reader:
    agenda_full = pd.DataFrame(reader)

# Confirm agenda_id join
agenda_hits["agenda_id"] = agenda_hits["chunk_id"]

# Force proper merge
agenda_full["meeting_date"] = pd.to_numeric(agenda_full["meeting_date"], errors="coerce")
merged = agenda_hits.merge(
    agenda_full[["agenda_id", "meeting_date", "committee_name", "item_title"]],
    how="left",
    on="agenda_id"
)
merged["meeting_date"] = merged["meeting_date_y"]
# Build preview row
for i, row in merged.head(5).iterrows():
    raw_date = row.get("meeting_date", None)
    print("Raw date:", raw_date)
    try:
        if isinstance(raw_date, (float, int)) and not pd.isnull(raw_date):
            meeting_dt = pd.to_datetime(int(raw_date), unit="ms", errors="coerce")
            meeting_str = meeting_dt.strftime("%-d %b %Y") if pd.notnull(meeting_dt) else "N/A"
        else:
            meeting_str = str(raw_date)
    except Exception:
        meeting_str = "EXCEPTION"
    print(f"{row['agenda_id']}: {meeting_str}")

In [None]:
print("Sample agenda_id from FAISS metadata:", agenda_hits["agenda_id"].head().tolist())
print("Sample agenda_id from full metadata:", agenda_full["agenda_id"].head().tolist())

# Check overlap
print("Matching agenda_ids:", agenda_hits["agenda_id"].isin(agenda_full["agenda_id"]).sum())

In [None]:
matched = merged[merged["meeting_date"].notna()]
for i, row in matched.head(5).iterrows():
    raw_date = row.get("meeting_date")
    meeting_dt = pd.to_datetime(int(raw_date), unit="ms", errors="coerce")
    meeting_str = meeting_dt.strftime("%-d %b %Y") if pd.notnull(meeting_dt) else "N/A"
    print(f"{row['agenda_id']}: {meeting_str}")

In [None]:
merged.columns.tolist()

In [None]:
import os
print("KEY?", os.getenv("OPENAI_API_KEY"))

In [67]:
import pandas as pd
import numpy as np
import jsonlines
import faiss
from openai import OpenAI
import os

import toml
from openai import OpenAI

import toml
from openai import OpenAI

secrets = toml.load("../.streamlit/secrets.toml")
api_key = secrets["OPENAI_API_KEY"]

client = OpenAI(api_key=api_key)


# Load metadata
with jsonlines.open("../data/metadata/documents.jsonl", "r") as reader:
    documents = pd.DataFrame(reader).drop_duplicates(subset="doc_id")
with jsonlines.open("../data/pdf_metadata/scraped_pdf_metadata.jsonl", "r") as reader:
    scraped_meta = pd.DataFrame(reader).drop_duplicates(subset="doc_id")
with jsonlines.open("../data/pdf_summaries/summaries.jsonl", "r") as reader:
    summaries = pd.DataFrame(reader).drop_duplicates(subset="doc_id")
with jsonlines.open("../data/metadata/meetings.jsonl", "r") as reader:
    meetings = pd.DataFrame(reader).drop_duplicates(subset="meeting_id")
with jsonlines.open("../data/embeddings/pdf_summaries/metadata_pdf_summaries.jsonl", "r") as reader:
    summaries_df = pd.DataFrame(reader).drop_duplicates(subset="doc_id")

# Query
query = "Is Kent county council worried about its debt levels?"
query_vector = client.embeddings.create(input=[query], model="text-embedding-3-small").data[0].embedding
query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)

# FAISS search
summary_index = faiss.read_index("data/embeddings/pdf_summaries/pdf_summary_index.faiss")
D, I = summary_index.search(query_vector, 100)
summary_hits = summaries_df.iloc[I[0]].copy()
summary_hits["score"] = D[0]

RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char *) at /Users/runner/work/faiss-wheels/faiss-wheels/faiss/faiss/impl/io.cpp:68: Error: 'f' failed: could not open data/embeddings/pdf_summaries/pdf_summary_index.faiss for reading: No such file or directory

In [68]:
from pathlib import Path
import faiss

index_path = Path("../data/embeddings/pdf_summaries/pdf_summary_index.faiss").resolve()
print("Resolved FAISS index path:", index_path)

summary_index = faiss.read_index(str(index_path))

Resolved FAISS index path: /Users/lgfolder/github/council-assistant/data/embeddings/pdf_summaries/pdf_summary_index.faiss


In [69]:
query = "Is Kent county council worried about its debt levels?"
query_vector = client.embeddings.create(input=[query], model="text-embedding-3-small").data[0].embedding
query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)

D, I = summary_index.search(query_vector, 100)
summary_hits = summaries_df.iloc[I[0]].copy()
summary_hits["score"] = D[0]

In [70]:
summary_hits[["doc_id", "score"]].head(10)

Unnamed: 0,doc_id,score
9097,kcc_cc_2024-03-13_fde0b9,0.638321
8832,kcc_cc_2023-11-01_9dbe66,0.650005
3009,kcc_cc_2019-07-24_2d4862,0.655137
1355,kcc_cc_2018-02-20_fbc391,0.684023
9894,kcc_cc_2023-10-05_161faa,0.686362
10304,kcc_cc_2024-12-12_249e7e,0.691618
472,kcc_cc_2017-10-19_073b72,0.704259
8469,kcc_cc_2023-11-02_e29584,0.705475
10021,kcc_cc_2023-12-19_82b7bf,0.709517
10367,kcc_cc_2024-11-27_6aa28a,0.72209


In [71]:
# Ensure types are clean
summary_hits["doc_id"] = summary_hits["doc_id"].astype(str).str.strip()
documents["doc_id"] = documents["doc_id"].astype(str).str.strip()
scraped_meta["doc_id"] = scraped_meta["doc_id"].astype(str).str.strip()
summaries["doc_id"] = summaries["doc_id"].astype(str).str.strip()

# Merge
filtered_pdf_hits = summary_hits.merge(documents, on="doc_id", how="left")
filtered_pdf_hits = filtered_pdf_hits.merge(scraped_meta, on="doc_id", how="left")
filtered_pdf_hits = filtered_pdf_hits.merge(summaries, on="doc_id", how="left")
filtered_pdf_hits = filtered_pdf_hits.merge(meetings[["meeting_id", "web_meeting_code"]], on="meeting_id", how="left")
filtered_pdf_hits = filtered_pdf_hits.drop_duplicates(subset="doc_id")

# Display
filtered_pdf_hits[["doc_id", "score", "title", "url", "meeting_id"]].head(10)

Unnamed: 0,doc_id,score,title,url,meeting_id
0,kcc_cc_2024-03-13_fde0b9,0.638321,,https://democracy.kent.gov.uk/documents/s12403...,kent_cc_9275
1,kcc_cc_2023-11-01_9dbe66,0.650005,,https://democracy.kent.gov.uk/documents/s12151...,kent_cc_9259
2,kcc_cc_2019-07-24_2d4862,0.655137,,https://democracy.kent.gov.uk/documents/s91513...,kent_cc_8222
3,kcc_cc_2018-02-20_fbc391,0.684023,Appendix F – Prudential Indicators,https://democracy.kent.gov.uk/documents/s82962...,kent_cc_7818
4,kcc_cc_2023-10-05_161faa,0.686362,,https://democracy.kent.gov.uk/documents/s12123...,kent_cc_9394
5,kcc_cc_2024-12-12_249e7e,0.691618,,https://democracy.kent.gov.uk/documents/s12796...,kent_cc_9476
6,kcc_cc_2017-10-19_073b72,0.704259,,https://democracy.kent.gov.uk/documents/s79227...,kent_cc_7567
7,kcc_cc_2023-11-02_e29584,0.705475,,https://democracy.kent.gov.uk/documents/s12155...,kent_cc_9220
8,kcc_cc_2023-12-19_82b7bf,0.709517,By:,https://democracy.kent.gov.uk/documents/s12251...,kent_cc_9422
9,kcc_cc_2024-11-27_6aa28a,0.72209,Proposed Granting of Formal Lease for Electric...,https://democracy.kent.gov.uk/documents/s12764...,kent_cc_9482


In [86]:
import numpy as np
import pandas as pd
import faiss
import jsonlines
from openai import OpenAI
import toml
from pathlib import Path

# === Load secrets ===
secrets = toml.load("../.streamlit/secrets.toml")
client = OpenAI(api_key=secrets["OPENAI_API_KEY"])

# === Load FAISS index and summary metadata ===
summary_index = faiss.read_index(str(Path("../data/embeddings/pdf_summaries/pdf_summary_index.faiss").resolve()))
with jsonlines.open("../data/embeddings/pdf_summaries/metadata_pdf_summaries.jsonl", "r") as reader:
    summary_meta = pd.DataFrame(reader).drop_duplicates(subset="doc_id")

# === Load full document metadata ===
with jsonlines.open("../data/metadata/documents.jsonl", "r") as reader:
    documents = pd.DataFrame(reader)
with jsonlines.open("../data/pdf_metadata/scraped_pdf_metadata.jsonl", "r") as reader:
    scraped = pd.DataFrame(reader)
with jsonlines.open("../data/pdf_summaries/summaries.jsonl", "r") as reader:
    summaries = pd.DataFrame(reader)

# === Merge into full PDF metadata ===
pdf_data = documents.merge(scraped, on="doc_id", how="left")
pdf_data = pdf_data.merge(summaries[["doc_id", "summary", "display_title", "keywords"]], on="doc_id", how="left")
pdf_data = pdf_data.drop_duplicates(subset="doc_id")

# === Your query here ===
query = "mermaid"
print(f"\n🔍 Searching for: {query}")

# === Generate query embedding and search
query_vector = client.embeddings.create(input=[query], model="text-embedding-3-small").data[0].embedding
query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)
D, I = summary_index.search(query_vector, 100)

# === Extract and join results
valid_indices = I[0][I[0] != -1]
if len(valid_indices) == 0:
    print("\n❌ No matches found.")
else:
    faiss_subset = summary_meta.iloc[valid_indices].copy()
    faiss_subset["score"] = D[0][:len(valid_indices)]
    results = faiss_subset.merge(pdf_data, on="doc_id", how="left").sort_values("score")

    # === Display top results
    for _, row in results.head(10).iterrows():
        title = (
            row.get("display_title") or
            row.get("title") or
            row.get("source_filename") or
            row["doc_id"]
        )
        print(f"\n🔗 {title} — Score: {row['score']:.3f}")
        print(row.get("summary", "No summary available."))
        print("---")


🔍 Searching for: mermaid


IndexError: positional indexers are out-of-bounds

In [87]:
!ls data/embeddings/pdf_summaries/

ls: data/embeddings/pdf_summaries/: No such file or directory


In [88]:
filtered_pdf_hits = faiss_subset.merge(pdf_merged, on="doc_id", how="left")

In [89]:
filtered_pdf_hits

Unnamed: 0,doc_id,chunk_id,text,display_title_x,timestamp,text_hash,source_type,score,url,meeting_id,...,metadata_title,source_filename,subject,keywords_x,encryption,error,summary,keywords_y,display_title_y,web_meeting_code
0,kcc_cc_2020-02-13_dc276c,kcc_cc_2020-02-13_dc276c,This document outlines the investment strategy...,Kent County Council Investment Strategy,2025-05-22T02:22:37.689549,efecbbc14b42a4a4ffd33dbeac24d9fb69fe2185ea33bb...,pdf_summary,0.647063,https://democracy.kent.gov.uk/documents/s95885...,kent_cc_8192,...,,Appendix 5 - Investment Strategy Final.pdf,,,,,This document outlines the investment strategy...,"[investment strategy, Kent County Council, loc...",Kent County Council Investment Strategy,8192
1,kcc_cc_2017-10-19_073b72,kcc_cc_2017-10-19_073b72,This document outlines Kent County Council's M...,Autumn Budget Statement 2017,2025-05-21T22:46:15.272234,6d06c6e5d3a316dccdb2e11cf2e3afe20a17005513569d...,pdf_summary,0.647253,https://democracy.kent.gov.uk/documents/s79227...,kent_cc_7567,...,,Item 7 - Autumn Budget Statement.pdf,,,,,This document outlines Kent County Council's M...,"[budget, financial plan, Kent County Council, ...",Autumn Budget Statement 2017,7567
2,kcc_cc_2023-11-01_9dbe66,kcc_cc_2023-11-01_9dbe66,This document outlines the financial recovery ...,Budget Recovery Report 2023-24,2025-05-23T01:39:39.602424,3c200ba1ffdf2658dc87369d6c1fdf9a28b3c9e3c456fe...,pdf_summary,0.663942,https://democracy.kent.gov.uk/documents/s12151...,kent_cc_9259,...,,Cabinet Report - Appendix 1 - Budget Recovery ...,,,,,This document outlines the financial recovery ...,"[budget recovery, financial plan, Kent County ...",Budget Recovery Report 2023-24,9259
3,kcc_cc_2020-07-16_fceefa,kcc_cc_2020-07-16_fceefa,This document outlines the need for a strategi...,Strategic Reset County Council Paper,2025-05-22T04:05:11.319819,bd6fe2b37ac96d86757e9a70afb56c48b76a75aeea5a6c...,pdf_summary,0.672677,https://democracy.kent.gov.uk/documents/s97421...,kent_cc_8456,...,,Strategic Reset County Council Paper v0.5.pdf,,,,,This document outlines the need for a strategi...,"[strategic reset, Kent County Council, COVID-1...",Strategic Reset County Council Paper,8456
4,kcc_cc_2024-03-04_b3f00d,kcc_cc_2024-03-04_b3f00d,This document contains the minutes from the AC...,"ACCESS Joint Committee Minutes - December 4, 2023",2025-05-23T05:15:38.858570,a3ee38463acb657e40a8cd94839a8dfc27c7b6d8220aef...,pdf_summary,0.676608,https://democracy.kent.gov.uk/documents/s12361...,kent_cc_9423,...,Minutes,20240304 ACCESS 4 December 2023 draft minutes ...,,,,,This document contains the minutes from the AC...,"[ACCESS Joint Committee, minutes, Business Pla...","ACCESS Joint Committee Minutes - December 4, 2023",9423
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,kcc_cc_2018-07-06_8924d9,kcc_cc_2018-07-06_8924d9,This report outlines Kent County Council's app...,Managing Highway Infrastructure in Kent,2025-05-21T23:59:42.316316,d31588c06c23f25cc2ddbd9b2f71fe39daf2c55f27de22...,pdf_summary,0.825955,https://democracy.kent.gov.uk/documents/s85043...,kent_cc_7907,...,Insurance Report 2014,Managing Highway Infrastructure in Kent.pdf,,,,,This report outlines Kent County Council's app...,"[highway infrastructure, asset management, Ken...",Managing Highway Infrastructure in Kent,7907
96,kcc_cc_2019-04-03_30de63,kcc_cc_2019-04-03_30de63,This document contains the minutes of the Kent...,Scrutiny Committee Meeting Minutes - January 2019,2025-05-22T00:00:22.538251,5aead2138a1d21650c20c8ee80cc07321fe68083046877...,pdf_summary,0.827701,https://democracy.kent.gov.uk/documents/s89156...,kent_cc_7913,...,Minutes,Minutes of Previous Meeting.pdf,,,,,This document contains the minutes of the Kent...,"[Scrutiny Committee, meeting minutes, budget, ...",Scrutiny Committee Meeting Minutes - January 2019,7913
97,kcc_cc_2018-12-13_85fffe,kcc_cc_2018-12-13_85fffe,This document contains questions raised during...,County Council Meeting Questions - December 2018,2025-05-21T23:54:11.710970,c6faaf9b25f4255f1b5e86876540ad66806e29689ceef0...,pdf_summary,0.827782,https://democracy.kent.gov.uk/documents/s88303...,kent_cc_7891,...,,CCQs in order 13 December.pdf,,,,,This document contains questions raised during...,"[Kent County Council, livestock movement, elec...",County Council Meeting Questions - December 2018,7891
98,kcc_cc_2022-01-25_026af5,kcc_cc_2022-01-25_026af5,This document outlines Kent County Council's A...,Anti-Fraud and Corruption Strategy,2025-05-22T19:52:42.424232,7da298c5ed00d8a0bc3e8d841ff82388d407c8fa14ea8b...,pdf_summary,0.828240,https://democracy.kent.gov.uk/documents/s10892...,kent_cc_8722,...,,Item 16 Appendix 3 - Anti-Fraud Corruption Str...,,,,,This document outlines Kent County Council's A...,"[fraud, corruption, strategy, local government...",Anti-Fraud and Corruption Strategy,8722


In [112]:
# Load summary metadata
with jsonlines.open("../data/embeddings/pdf_summaries/metadata_pdf_summaries.jsonl", "r") as reader:
    summaries_df = pd.DataFrame(reader).drop_duplicates(subset="doc_id")

# Load FAISS index
summary_index = faiss.read_index("../data/embeddings/pdf_summaries/pdf_summary_index.faiss")

# Run FAISS search
summary_D, summary_I = summary_index.search(query_vector, 100)

# Filter out invalid index values
valid_summary_indices = summary_I[0][summary_I[0] != -1]
valid_summary_indices = [i for i in valid_summary_indices if i < len(summaries_df)]

# Get top summary hits
summary_hits = summaries_df.iloc[valid_summary_indices].copy()
summary_hits["score"] = summary_D[0][:len(summary_hits)]
summary_hits["source_type"] = "pdf"

In [113]:
summary_hits

Unnamed: 0,doc_id,chunk_id,text,display_title,timestamp,text_hash,source_type,score
6837,kcc_cc_2022-05-19_cf65cf,kcc_cc_2022-05-19_cf65cf,This document discusses the potential developm...,Plan Sea: Coastal and Marine Strategy for Kent,2025-05-22T21:13:38.493150,a960806e21b755a3c4286a57ea8428025619e7f1d6005a...,pdf,1.534189
10168,kcc_cc_2024-11-06_0a1eb1,kcc_cc_2024-11-06_0a1eb1,This document outlines various policies and st...,Relevant Policies and Strategies Supporting Wo...,2025-05-23T05:23:59.633085,f7cef486372fad7c26e5c896a8600b706ff035cb79d70f...,pdf,1.550454
138,kcc_cc_2017-11-13_39ea49,kcc_cc_2017-11-13_39ea49,This document outlines the Medway Estuary and ...,Medway Estuary and Swale Shoreline Management ...,2025-05-21T22:03:21.809213,4b08a1783563b2721203d6782533037cdbc8ba9431410e...,pdf,1.567909
13277,kcc_cc_2018-09-26_bec06c,kcc_cc_2018-09-26_bec06c,This document outlines the themes and question...,Themes and Questions for the Loneliness and So...,2025-05-23T09:12:06.687976,f62ae0c9d06db88b71dc7a4dd58f0697625c9db5534d08...,pdf,1.585628
137,kcc_cc_2017-11-13_686e2b,kcc_cc_2017-11-13_686e2b,This document presents the Medway Estuary and ...,Medway Estuary and Swale Flood Risk Strategy P...,2025-05-21T22:03:16.588139,0c563feb9b4dce54f5867074e97d6d5761fa6cbef62de9...,pdf,1.588299
...,...,...,...,...,...,...,...,...
1462,kcc_cc_2018-07-16_56d683,kcc_cc_2018-07-16_56d683,This document contains the minutes from the Ke...,Minutes of Kent Flood Risk Management Committe...,2025-05-21T23:43:44.888465,ecdd3731d6eec45c43822b560769942c32fdff0df8a96b...,pdf,1.674482
3341,kcc_cc_2019-11-28_280305,kcc_cc_2019-11-28_280305,This report outlines the proposed reconfigurat...,Turner Contemporary Capital Project Record of ...,2025-05-22T03:21:52.470809,17863934fb4dfddf119a600d11538f09b650cc8af3db41...,pdf,1.674813
11229,kcc_cc_2025-03-18_24bee9,kcc_cc_2025-03-18_24bee9,This document outlines the work programme for ...,Scrutiny Committee Work Programme - March 2025,2025-05-23T06:26:13.212307,fd07093506dea0c4c0fc28130b0aad623e86f1cd44684b...,pdf,1.675125
8990,kcc_cc_2024-03-07_0f34c6,kcc_cc_2024-03-07_0f34c6,This report presents the Environment and Trans...,Mobility as a Service Pilot Report,2025-05-23T01:48:27.580528,9536e07a4c6eaa83b3a4d0151ae0da9be88930f3a04124...,pdf,1.675210


In [92]:
# Assume pdf_merged is already loaded globally at top of file
pdf_combined = summary_hits.copy()

# Ensure consistent ID format
pdf_combined["doc_id"] = pdf_combined["doc_id"].astype(str).str.strip()
pdf_merged["doc_id"] = pdf_merged["doc_id"].astype(str).str.strip()

# Merge extra info from pdf_merged
pdf_combined = pdf_combined.merge(
    pdf_merged[["doc_id", "display_title", "url", "meeting_date"]],
    on="doc_id",
    how="left"
)

In [94]:
pdf_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   doc_id           99 non-null     object 
 1   chunk_id         99 non-null     object 
 2   text             99 non-null     object 
 3   display_title_x  99 non-null     object 
 4   timestamp        99 non-null     object 
 5   text_hash        99 non-null     object 
 6   source_type      99 non-null     object 
 7   score            99 non-null     float32
 8   display_title_y  99 non-null     object 
 9   url              99 non-null     object 
 10  meeting_date     99 non-null     int64  
dtypes: float32(1), int64(1), object(9)
memory usage: 8.2+ KB


In [95]:
pdf_combined = pdf_combined.rename(columns={"display_title_y": "display_title"})
pdf_combined[["doc_id", "display_title", "url", "meeting_date", "score"]]

Unnamed: 0,doc_id,display_title,url,meeting_date,score
0,kcc_cc_2022-05-19_cf65cf,Plan Sea: Coastal and Marine Strategy for Kent,https://democracy.kent.gov.uk/documents/s11206...,1652918400000,1.534189
1,kcc_cc_2024-11-06_0a1eb1,Relevant Policies and Strategies Supporting Wo...,https://democracy.kent.gov.uk/documents/s12724...,1730851200000,1.550454
2,kcc_cc_2017-11-13_39ea49,Medway Estuary and Swale Shoreline Management ...,https://democracy.kent.gov.uk/documents/s80678...,1510531200000,1.567909
3,kcc_cc_2018-09-26_bec06c,Themes and Questions for the Loneliness and So...,https://democracy.kent.gov.uk/documents/s86396...,1537920000000,1.585628
4,kcc_cc_2017-11-13_686e2b,Medway Estuary and Swale Flood Risk Strategy P...,https://democracy.kent.gov.uk/documents/s81427...,1510531200000,1.588299
...,...,...,...,...,...
94,kcc_cc_2018-07-16_56d683,Minutes of Kent Flood Risk Management Committe...,https://democracy.kent.gov.uk/documents/s85295...,1531699200000,1.674482
95,kcc_cc_2019-11-28_280305,Turner Contemporary Capital Project Record of ...,https://democracy.kent.gov.uk/documents/s94438...,1574899200000,1.674813
96,kcc_cc_2025-03-18_24bee9,Scrutiny Committee Work Programme - March 2025,https://democracy.kent.gov.uk/documents/s13017...,1742256000000,1.675125
97,kcc_cc_2024-03-07_0f34c6,Mobility as a Service Pilot Report,https://democracy.kent.gov.uk/documents/s12372...,1709769600000,1.675210


In [97]:
combined = pd.concat([agenda_hits, summary_hits], ignore_index=True)

In [98]:
pdf_combined = combined[combined["source_type"] == "pdf"]

In [99]:
pdf_combined["doc_id"] = pdf_combined["doc_id"].astype(str).str.strip()
pdf_merged["doc_id"] = pdf_merged["doc_id"].astype(str).str.strip()
pdf_combined = pdf_combined.merge(
    pdf_merged[["doc_id", "display_title", "url", "meeting_date"]],
    on="doc_id",
    how="left"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdf_combined["doc_id"] = pdf_combined["doc_id"].astype(str).str.strip()


In [100]:
pdf_combined = pdf_combined.rename(columns={"display_title_y": "display_title"})

In [109]:
def get_meeting_str(raw_date):
    if pd.isnull(raw_date):
        return "N/A"
    try:
        return pd.to_datetime(int(raw_date), unit="ms").strftime("%b %Y")
    except Exception:
        try:
            return pd.to_datetime(raw_date).strftime("%b %Y")
        except Exception:
            return "N/A"

In [110]:
pdf_links = pdf_combined.head(6).copy()
pdf_links["meeting_str"] = pdf_links["meeting_date_y"].apply(get_meeting_str)

In [111]:
pdf_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   doc_id           6 non-null      object 
 1   chunk_id         6 non-null      object 
 2   text             6 non-null      object 
 3   meeting_id       0 non-null      object 
 4   committee_id     0 non-null      object 
 5   meeting_date_x   0 non-null      float64
 6   source_type      6 non-null      object 
 7   agenda_id        0 non-null      object 
 8   display_title_x  6 non-null      object 
 9   timestamp        6 non-null      object 
 10  text_hash        6 non-null      object 
 11  score            6 non-null      float32
 12  display_title    6 non-null      object 
 13  url              6 non-null      object 
 14  meeting_date_y   6 non-null      int64  
 15  meeting_str      6 non-null      object 
dtypes: float32(1), float64(1), int64(1), object(13)
memory usage: 872.

### PDF sumamry table

In [116]:
import jsonlines
import pandas as pd

# Load source files
with jsonlines.open("../data/metadata/documents.jsonl", "r") as reader:
    documents = pd.DataFrame(reader)

with jsonlines.open("../data/pdf_metadata/scraped_pdf_metadata.jsonl", "r") as reader:
    scraped = pd.DataFrame(reader)

with jsonlines.open("../data/pdf_summaries/summaries.jsonl", "r") as reader:
    summaries = pd.DataFrame(reader)

# Merge all together on doc_id
pdf_data = documents.merge(scraped, on="doc_id", how="left")
pdf_data = pdf_data.merge(summaries[["doc_id", "summary", "display_title", "keywords"]], on="doc_id", how="left")
pdf_data = pdf_data.drop_duplicates(subset="doc_id")

In [117]:
pdf_data

Unnamed: 0,url,meeting_id,agenda_id,committee_name,committee_id,meeting_date,item_title,doc_id,doc_category,status,...,num_tables_detected,metadata_title,source_filename,subject,keywords_x,encryption,error,summary,display_title,keywords_y
0,https://democracy.kent.gov.uk/documents/s67450...,kent_cc_7379,kent_cc_7379__0002,Electoral and Boundary Review Committee,electoral-and-boundary-review,1466553600000,Minutes - 22 May 2015,kcc_cc_2016-06-22_d5312b,minutes,complete,...,0.0,Minutes,Minutes of Previous Meeting.pdf,,,,,This document contains the minutes from the El...,Minutes of the Electoral and Boundary Review C...,"[Electoral Review, Kent County Council, Bounda..."
1,https://democracy.kent.gov.uk/documents/s69950...,kent_cc_7393,kent_cc_7393__0052,Planning Applications Committee,planning-applications,1468368000000,Minutes - 18 May 2016,kcc_cc_2016-07-13_0d5be3,minutes,complete,...,0.0,,Minutes.pdf,,,,,This document contains the minutes from the Pl...,Planning Applications Committee Minutes - 18 M...,"[planning, committee minutes, waste disposal, ..."
5,https://democracy.kent.gov.uk/documents/s69943...,kent_cc_7393,kent_cc_7393__0053,Planning Applications Committee,planning-applications,1468368000000,Application TM/15/1636 (KCC/TM/0141/2015) - Se...,kcc_cc_2016-07-13_2f8155,report,complete,...,2.0,Discussion,Item C1 Nepicar Report.pdf,,,,,This document discusses a section 73 applicati...,Nepicar Sand Quarry Section 73 Application Report,"[Nepicar Sand Quarry, section 73 application, ..."
6,https://democracy.kent.gov.uk/documents/s69944...,kent_cc_7393,kent_cc_7393__0054,Planning Applications Committee,planning-applications,1468368000000,Proposal TM/16/00853/CR3 (KCC/TM/0031/2016) - ...,kcc_cc_2016-07-13_c88d53,other,complete,...,1.0,Discussion,Itm D1 Land adjacent to Hall Road Wouldham.pdf,,,,,This document discusses the proposal for a new...,Proposed Primary School Development at Hall Ro...,"[primary school, Wouldham, planning applicatio..."
10,https://democracy.kent.gov.uk/documents/s69945...,kent_cc_7393,kent_cc_7393__0055,Planning Applications Committee,planning-applications,1468368000000,Proposal 16/503410/COUNTY (KCC/MA/0084/2016) -...,kcc_cc_2016-07-13_31517f,other,complete,...,1.0,Discussion,Item D2 South Borough CP School.pdf,,,,,This document discusses a planning application...,Planning Application for South Borough School ...,"[school expansion, planning application, Maids..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14209,https://democracy.kent.gov.uk/documents/s64959...,kent_cc_6357,kent_cc_6357__F610,Kent and Medway NHS Joint Overview and Scrutin...,kent-and-medway-nhs-joint-overview-and-scrutiny,1461888000000,Header-only document,kcc_cc_2016-04-29_973814,other,complete,...,2.0,,NHS England Paper.pdf,,,,,This document provides an update on the Kent a...,Kent and Medway Vascular Services Review Update,"[vascular services, Kent and Medway, NHS, heal..."
14210,https://democracy.kent.gov.uk/documents/s64964...,kent_cc_6357,kent_cc_6357__F611,Kent and Medway NHS Joint Overview and Scrutin...,kent-and-medway-nhs-joint-overview-and-scrutiny,1461888000000,Header-only document,kcc_cc_2016-04-29_5b61b6,other,complete,...,0.0,Background Note,Kent and Medway Hyper Acute and Acute Stroke S...,,"Council meetings;Government, politics and publ...",,,This document provides background information ...,Kent and Medway Hyper Acute and Acute Stroke S...,"[stroke services, Kent and Medway, NHS, health..."
14211,https://democracy.kent.gov.uk/documents/s64965...,kent_cc_6357,kent_cc_6357__F612,Kent and Medway NHS Joint Overview and Scrutin...,kent-and-medway-nhs-joint-overview-and-scrutiny,1461888000000,Header-only document,kcc_cc_2016-04-29_5a2f5b,other,complete,...,3.0,,CCGs Paper.pdf,,,,,This document provides an update on the Kent a...,Kent and Medway Stroke Services Review Update,"[stroke services, healthcare review, Kent and ..."
14212,https://democracy.kent.gov.uk/documents/s66420...,kent_cc_6374,kent_cc_6374__F613,Select Committee - Grammar Schools and Social ...,select-grammar-schools-and-social-mobility,1465171200000,Header-only document,kcc_cc_2016-06-06_00da22,report,complete,...,0.0,,Grammar schools and social mobility Select Com...,,,,,This document presents the final draft of the ...,Grammar Schools and Social Mobility Select Com...,"[grammar schools, social mobility, education, ..."


In [127]:
import faiss
import numpy as np
from openai import OpenAI
import os

# Load FAISS index and metadata
index = faiss.read_index("../data/embeddings/pdf_summaries/pdf_summary_index.faiss")
with jsonlines.open("../data/embeddings/pdf_summaries/metadata_pdf_summaries.jsonl", "r") as reader:
    summary_meta = pd.DataFrame(reader).drop_duplicates(subset="doc_id")

# Get query vector
# === Load secrets ===
secrets = toml.load("../.streamlit/secrets.toml")
client = OpenAI(api_key=secrets["OPENAI_API_KEY"])

query = "schools"
query_vector = client.embeddings.create(input=[query], model="text-embedding-3-small").data[0].embedding
query_vector = np.array(query_vector, dtype=np.float32).reshape(1, -1)

# Search index
D, I = index.search(query_vector, 100)
valid_indices = I[0][I[0] != -1]
valid_indices = [i for i in valid_indices if i < len(summary_meta)]

# Extract matching summaries
faiss_subset = summary_meta.iloc[valid_indices].copy()
faiss_subset["score"] = D[0][:len(faiss_subset)]

In [128]:
results = faiss_subset.merge(pdf_data, on="doc_id", how="left")

In [129]:
results

Unnamed: 0,doc_id,chunk_id,text,display_title_x,timestamp,text_hash,source_type,score,url,meeting_id,...,num_tables_detected,metadata_title,source_filename,subject,keywords_x,encryption,error,summary,display_title_y,keywords_y
0,kcc_cc_2024-09-26_b2d144,kcc_cc_2024-09-26_b2d144,This document contains the minutes from the Pe...,Draft Minutes of Personnel Committee Meeting -...,2025-05-23T05:51:10.203641,436414e480cbb41c90cf2004ff5a0aa302aec2f977e4b5...,pdf_summary,1.232895,https://democracy.kent.gov.uk/documents/s12690...,kent_cc_9516,...,0.0,Minutes,Draft Minutes - Personnel Committee - 12 June ...,,,,,This document contains the minutes from the Pe...,Draft Minutes of Personnel Committee Meeting -...,"[Personnel Committee, meeting minutes, employe..."
1,kcc_cc_2019-01-11_5b6d5d,kcc_cc_2019-01-11_5b6d5d,This document outlines the proposed coordinate...,Proposed Coordinated Scheme of Admissions 2020/21,2025-05-22T00:15:19.376564,b0e02d46fd90867fd6d2d61612cfb594ad065e6f481ccf...,pdf_summary,1.235690,https://democracy.kent.gov.uk/documents/s88556...,kent_cc_7951,...,0.0,,PROD Co-ordinated Scheme of Admissions 2019.pdf,,,,,This document outlines the proposed coordinate...,Proposed Coordinated Scheme of Admissions 2020/21,"[admissions, Kent County Council, education, s..."
2,kcc_cc_2020-01-10_a25621,kcc_cc_2020-01-10_a25621,This document outlines the proposed coordinate...,Kent Coordinated Admissions Scheme 2021-22,2025-05-22T03:33:17.006571,35bd6f6f0287460562b04ca3d2f57e4ad8134b7e23cc56...,pdf_summary,1.236384,https://democracy.kent.gov.uk/documents/s95222...,kent_cc_8276,...,0.0,,Item 11 - App 1 - PROD Co-ordinated Scheme of ...,,,,,This document outlines the proposed coordinate...,Kent Coordinated Admissions Scheme 2021-22,"[admissions, Kent, schools, education, coordin..."
3,kcc_cc_2024-12-04_516a75,kcc_cc_2024-12-04_516a75,This document outlines the savings targets and...,Appendix 2 - Savings Report 2024-25,2025-05-23T05:45:59.936990,f589e4431813c4281554a80518602a3890b8cb5524693e...,pdf_summary,1.239898,https://democracy.kent.gov.uk/documents/s12775...,kent_cc_9507,...,5.0,,Appendix 2 - Savings.pdf,,,,,This document outlines the savings targets and...,Appendix 2 - Savings Report 2024-25,"[savings, budget, council, Adult Social Care, ..."
4,kcc_cc_2017-01-12_cf2eb2,kcc_cc_2017-01-12_cf2eb2,This document contains the minutes from the En...,Minutes of the Environment & Transport Cabinet...,2025-05-23T08:11:46.022578,db9073789cadf1be991431bdd37f036562839279dac1c9...,pdf_summary,1.240515,https://democracy.kent.gov.uk/documents/s74445...,kent_cc_6243,...,0.0,Minutes,Item A4 minutes.pdf,,,,,This document contains the minutes from the En...,Minutes of the Environment & Transport Cabinet...,"[Environment, Transport, Kent County Council, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,kcc_cc_2017-03-27_6b6edf,kcc_cc_2017-03-27_6b6edf,This document outlines the revisions to Kent C...,Local Transport Plan 4 Consultation Revised Plan,2025-05-23T08:11:27.830201,5937183f80c6bb7f4e392390a92cb3464c57e8e6fb4de7...,pdf_summary,1.295903,https://democracy.kent.gov.uk/documents/s76309...,kent_cc_6242,...,1.0,THE REPORT,Item 6 - Cabinet March Report - LTP4 Consultat...,,,,,This document outlines the revisions to Kent C...,Local Transport Plan 4 Consultation Revised Plan,"[Local Transport Plan, Kent County Council, tr..."
90,kcc_cc_2021-01-15_ca1354,kcc_cc_2021-01-15_ca1354,This document outlines the Kent County Council...,Proposed Primary Admissions Scheme 2022,2025-05-22T04:38:03.686306,519c521ffa19762309679823edcb2ebc6f58876c9aba55...,pdf_summary,1.296131,https://democracy.kent.gov.uk/documents/s10135...,kent_cc_8523,...,4.0,,Appendix A - Proposed Primary Scheme 2022.pdf,,,,,This document outlines the Kent County Council...,Proposed Primary Admissions Scheme 2022,"[primary admissions, Kent County Council, co-o..."
91,kcc_cc_2021-06-30_e80576,kcc_cc_2021-06-30_e80576,This document outlines the updated admission a...,Updated Admission Arrangements for Kent Schools,2025-05-22T04:40:37.463106,36f68afc548dbe4be907ab0a4f219a3e156796b0a4b341...,pdf_summary,1.296331,https://democracy.kent.gov.uk/documents/s10435...,kent_cc_8525,...,0.0,,PROD.pdf,,,,,This document outlines the updated admission a...,Updated Admission Arrangements for Kent Schools,"[admission arrangements, Kent County Council, ..."
92,kcc_cc_2018-11-27_d8edb5,kcc_cc_2018-11-27_d8edb5,This document provides a comprehensive overvie...,Educational Terms Overview,2025-05-21T23:37:21.189032,5a1a55d7de8204232df5ed9ba8709ae29f2bc8004ac7ca...,pdf_summary,1.296533,https://democracy.kent.gov.uk/documents/s87767...,kent_cc_7805,...,4.0,,Item 9 - Educational terms.pdf,,,,,This document provides a comprehensive overvie...,Educational Terms Overview,"[education, academies, A Levels, GCSE, assessm..."


In [130]:
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   doc_id               94 non-null     object 
 1   chunk_id             94 non-null     object 
 2   text                 94 non-null     object 
 3   display_title_x      94 non-null     object 
 4   timestamp            94 non-null     object 
 5   text_hash            94 non-null     object 
 6   source_type          94 non-null     object 
 7   score                94 non-null     float32
 8   url                  94 non-null     object 
 9   meeting_id           94 non-null     object 
 10  agenda_id            94 non-null     object 
 11  committee_name       94 non-null     object 
 12  committee_id         94 non-null     object 
 13  meeting_date         94 non-null     int64  
 14  item_title           94 non-null     object 
 15  doc_category         94 non-null     objec

In [131]:
def get_meeting_str(raw_date):
    if pd.isnull(raw_date):
        return "N/A"
    try:
        return pd.to_datetime(int(raw_date), unit="ms").strftime("%b %Y")
    except Exception:
        return "N/A"

# Build a display label
def format_label(title, source_filename):
    if not isinstance(title, str) or pd.isnull(title):
        title = ""
    if not isinstance(source_filename, str) or pd.isnull(source_filename):
        return title or "Untitled"
    filename_simple = source_filename.rsplit(".", 1)[0]
    return f"{title} ({filename_simple})" if filename_simple.lower() not in title.lower() else title

# Construct rows
rows = []
for _, row in results.head(10).iterrows():
    label = format_label(row.get("display_title_y"), row.get("source_filename"))
    date_str = get_meeting_str(row.get("meeting_date"))
    rows.append({
        "Document": label,
        "Date": date_str,
        "Summary": row["summary"] if pd.notnull(row.get("summary")) else "No summary available."
    })

final_df = pd.DataFrame(rows)

In [132]:
final_df

Unnamed: 0,Document,Date,Summary
0,Draft Minutes of Personnel Committee Meeting -...,Sep 2024,This document contains the minutes from the Pe...
1,Proposed Coordinated Scheme of Admissions 2020...,Jan 2019,This document outlines the proposed coordinate...
2,Kent Coordinated Admissions Scheme 2021-22 (It...,Jan 2020,This document outlines the proposed coordinate...
3,Appendix 2 - Savings Report 2024-25,Dec 2024,This document outlines the savings targets and...
4,Minutes of the Environment & Transport Cabinet...,Jan 2017,This document contains the minutes from the En...
5,Kent County Council Admissions Arrangements 20...,Jan 2024,This document outlines the determined admissio...
6,Proposed Primary Admissions Arrangements 2019 ...,Jan 2018,This document outlines the determined admissio...
7,Engineering Guild Action Plan (Item C5 App1 Ac...,Mar 2017,This document outlines the action plan for the...
8,Proposed Secondary Admissions Arrangements 201...,Jan 2018,This document outlines the determined admissio...
9,Proposal for Education Services Company Implem...,Mar 2017,This document outlines a proposal to create a ...


In [None]:
import pandas as pd
import jsonlines
from pathlib import Path

# Load any JSONL file into a dataframe
def load_jsonl(path):
    with jsonlines.open(path, "r") as reader:
        return pd.DataFrame(reader)

# Paths to your source files
base = Path("../data")
documents = load_jsonl(base / "metadata" / "documents.jsonl")
scraped = load_jsonl(base / "pdf_metadata" / "scraped_pdf_metadata.jsonl")
summaries = load_jsonl(base / "pdf_summaries" / "summaries.jsonl")
manifest = load_jsonl(base / "processed_register" / "pdf_manifest.jsonl")

# Ensure doc_id is string and stripped
for df in [documents, scraped, summaries, manifest]:
    if "doc_id" in df.columns:
        df["doc_id"] = df["doc_id"].astype(str).str.strip()

# Merge step by step
merged = documents.merge(scraped, on="doc_id", how="outer", suffixes=("", "_scraped"))
merged = merged.merge(summaries[["doc_id", "summary", "display_title", "keywords"]], on="doc_id", how="left")
merged = merged.merge(manifest.drop(columns=["status"], errors="ignore"), on="doc_id", how="left", suffixes=("", "_manifest"))

# Drop any duplicates
merged = merged.drop_duplicates(subset="doc_id")

# Drop unused or redundant columns
columns_to_drop = ["creator", "producer", "metadata_path", 'encryption', 'modDate']
merged = merged.drop(columns=columns_to_drop, errors="ignore")
merged = merged.drop(columns=["keywords_x"], errors="ignore")
merged = merged.rename(columns={"keywords_y": "keywords"})

# Output path
output_path = base / "metadata" / "pdf_meta_warehouse.jsonl"
with jsonlines.open(output_path, mode="w") as writer:
    for record in merged.to_dict(orient="records"):
        writer.write(record)

print(f"✅ Merged {len(merged)} documents to {output_path}")

✅ Merged 14064 documents to ../data/metadata/pdf_meta_warehouse.jsonl


In [158]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14064 entries, 0 to 14213
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   url                  14064 non-null  object 
 1   meeting_id           14064 non-null  object 
 2   agenda_id            14064 non-null  object 
 3   committee_name       14064 non-null  object 
 4   committee_id         14064 non-null  object 
 5   meeting_date         14064 non-null  int64  
 6   item_title           14064 non-null  object 
 7   doc_id               14064 non-null  object 
 8   doc_category         14064 non-null  object 
 9   status               14064 non-null  object 
 10  format               14042 non-null  object 
 11  title                7532 non-null   object 
 12  author               12967 non-null  object 
 13  creationDate         13972 non-null  object 
 14  num_tables_detected  14043 non-null  float64
 15  metadata_title       14043 non-null  obje

In [153]:
merged.sample(3)

Unnamed: 0,url,meeting_id,agenda_id,committee_name,committee_id,meeting_date,item_title,doc_id,doc_category,status,...,summary,display_title,keywords,scraped,summarised,chunked,embedded,timestamp,embedding_small,text_hash
2775,https://democracy.kent.gov.uk/documents/s78337...,kent_cc_7777,kent_cc_7777__2679,Scrutiny Committee,scrutiny,1504656000000,Select Committee Work Programme,kcc_cc_2017-09-06_815047,appendix,complete,...,This document outlines the assessment of a sel...,Assessment of Affordable Housing Provision in ...,"[affordable housing, Kent, infrastructure, pla...",True,True,False,False,2025-05-21T23:35:29.773129,False,017b362c4b068d6add31432e950564ac4ee3f918ae70bf...
1171,https://democracy.kent.gov.uk/documents/s70005...,kent_cc_6182,kent_cc_6182__21832,Kent Health and Wellbeing Board,kent-health-and-wellbeing-board,1468972800000,Kent Health and Wellbeing Board Work Programme,kcc_cc_2016-07-20_90d16c,plan,complete,...,This document outlines the work programme for ...,Kent Health and Wellbeing Board Work Plan 2016/17,"[Kent, Health and Wellbeing Board, work progra...",True,True,False,False,2025-05-23T07:44:01.058725,False,718d2b4076e415dec5aa3e2dc07a116ebaddd71f0896ba...
12998,https://democracy.kent.gov.uk/documents/s12638...,kent_cc_9447,kent_cc_9447__18472,Pension Board,pension-board,1725321600000,Pensions Administration,kcc_cc_2024-09-03_04eafe,report,complete,...,This document presents the performance metrics...,May to July 2024 Performance Report - Pension ...,"[Pension Board, performance report, service le...",True,True,False,False,2025-05-23T05:19:54.662972,False,74d60adb93d0c7c402b167ddc7632bcd3010df72bd183e...


In [164]:
merged['doc_category'].value_counts().head(30)

doc_category
report                 4029
other                  3556
appendix               2068
minutes                1281
prod                    568
decision_response       494
plan                    463
terms_of_reference      328
strategy                270
policy                  268
budget                  258
performance             230
consultation             80
supporting_material      62
amendment                30
agenda_frontsheet        28
motion                   26
questions                14
eqia                     11
Name: count, dtype: int64

In [165]:
merged.columns.tolist()

['url',
 'meeting_id',
 'agenda_id',
 'committee_name',
 'committee_id',
 'meeting_date',
 'item_title',
 'doc_id',
 'doc_category',
 'status',
 'format',
 'title',
 'author',
 'creationDate',
 'num_tables_detected',
 'metadata_title',
 'source_filename',
 'subject',
 'error',
 'summary',
 'display_title',
 'keywords',
 'scraped',
 'summarised',
 'chunked',
 'embedded',
 'timestamp',
 'embedding_small',
 'text_hash']

In [None]:
import streamlit as st
import pandas as pd
import jsonlines
from pathlib import Path
import os
import numpy as np
import faiss
from openai import OpenAI
from dotenv import load_dotenv
from openai import OpenAI
import numpy as np
from typing import List, Dict


# --------------------------
# 1. CONFIGURE PATHS (EXACT TO YOUR SYSTEM)
# --------------------------
ROOT_FOLDER = Path("/Users/lgfolder/github/council-assistant")
DATA_FOLDER = ROOT_FOLDER / "data"

PATHS = {
    "meetings": DATA_FOLDER / "metadata/meetings.jsonl",
    "agendas": DATA_FOLDER / "metadata/agendas.jsonl",
    "pdf_warehouse": DATA_FOLDER / "metadata/pdf_warehouse.jsonl",  # Primary source
    "pdf_index": DATA_FOLDER / "embeddings/pdf_summaries/pdf_summary_index.faiss",
    "pdf_metadata": DATA_FOLDER / "embeddings/pdf_summaries/metadata_pdf_summaries.jsonl",
    "agenda_index": DATA_FOLDER / "embeddings/agendas/agenda_index.faiss",
    "agenda_metadata": DATA_FOLDER / "embeddings/agendas/metadata_agenda.jsonl"
}
