In [156]:
import os
import json
import pandas as pd
from openai import OpenAI
import faiss

# === SETTINGS ===
EMBEDDING_TYPE = "large"
METADATA_PATH = f"../data/embeddings/metadata_{EMBEDDING_TYPE}.jsonl"
INDEX_PATH = f"../data/embeddings/council_index_{EMBEDDING_TYPE}.faiss"
MANIFEST_PATH = "../data/processed_register/document_manifest.jsonl"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# === LOAD CHUNK METADATA ===
with open(METADATA_PATH, "r", encoding="utf-8") as f:
    chunk_data = [json.loads(line) for line in f]
chunk_df = pd.DataFrame(chunk_data)

# === LOAD VECTOR INDEX ===
index = faiss.read_index(INDEX_PATH)
assert index.ntotal == len(chunk_df), "Index and metadata length mismatch"

# === LOAD MANIFEST ===
with open(MANIFEST_PATH, "r", encoding="utf-8") as f:
    manifest_data = [json.loads(line) for line in f]
manifest_df = pd.DataFrame(manifest_data)

# === MERGE METADATA ===
merge_cols = ["doc_id", "committee", "meeting_date", "filename"]
chunk_df = chunk_df.merge(manifest_df[merge_cols], on="doc_id", how="left", suffixes=("_x", "_y"))

# === FIX COLUMN CLASH ===
chunk_df["committee"] = chunk_df["committee_y"]
chunk_df["meeting_date"] = chunk_df["meeting_date_y"]
chunk_df.drop(columns=["committee_x", "committee_y", "meeting_date_x", "meeting_date_y"], inplace=True)

# === DISPLAY SAMPLE ===
print("✅ Final cleaned columns:", chunk_df.columns.tolist())
display(chunk_df[["doc_id", "committee", "meeting_date", "filename"]].drop_duplicates().head(10))

✅ Final cleaned columns: ['doc_id', 'chunk_id', 'text', 'source_file', 'filename', 'committee', 'meeting_date']


Unnamed: 0,doc_id,committee,meeting_date,filename
0,doc_85f48edf,cabinet,2025-03-13,Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf
2,doc_21db8ba1,cabinet,2025-01-30,Item 7 - Supplementary - 24-00109 - Transfer t...
52,doc_446acb5e,cabinet,2025-01-30,24-00096 - PRoD.pdf
69,doc_28fdef90,cabinet,2025-01-30,Minutes of the meeting held on 9th January 202...
126,doc_3dc50afe,cabinet,2025-01-30,Appendix B Capital Investment by Directorate.pdf
142,doc_dacbe976,cabinet,2025-01-30,24-00115 - Appendix B - KCC ICS priorities mea...
216,doc_c6a4e655,cabinet,2025-01-30,24-00115 - Adoption of Integrated Care Strateg...
266,doc_8c64b8d7,cabinet,2025-01-30,Printed minutes 30th-Jan-2025 10.00 Cabinet.pdf
382,doc_184a37c4,cabinet,2025-01-30,24-00109 - Appendix C - DPIA Screening Tool ou...
426,doc_3d85ef70,cabinet,2025-01-30,24-00108 - Appendix - NOV 2024-25.pdf


In [158]:
import numpy as np

def embed_query(query: str, model: str = EMBEDDING_MODEL) -> np.ndarray:
    response = client.embeddings.create(
        model=model,
        input=query,
    )
    return np.array(response.data[0].embedding).astype("float32")

def search_and_display(query: str, top_k: int = 5):
    print(f"🔍 Query: '{query}'\n")

    query_vec = embed_query(query)
    D, I = index.search(query_vec.reshape(1, -1), top_k)

    for rank, i in enumerate(I[0]):
        row = chunk_df.iloc[i]
        print(f"— Rank {rank+1}")
        print(f"📄 File: {row['filename']}")
        print(f"📅 Date: {row['meeting_date']} — Committee: {row['committee']}")
        print(f"🧩 Chunk ID: {row['chunk_id']}")
        print(f"📝 Text: {row['text'][:800].strip()}...\n")

In [159]:
search_and_display("What was decided about SEN children in 2025?")

🔍 Query: 'What was decided about SEN children in 2025?'

— Rank 1
📄 File: 24-00096 - Commissioning Plan for Education Provision in Kent 2025-29.pdf
📅 Date: 2025-01-30 — Committee: cabinet
🧩 Chunk ID: 16
📝 Text: We will
monitor the impact of any displacement of children into maintained schools arising from the
Government’s decision to tax independent school fees.
As in previous years, the number of pupils identified as requiring a specialist place to meet their
educational needs remains a challenge. We will address the need for high quality, sustainable
SEN provision within the context of our Safety Valve Agreement with the DfE. Between the
academic years 2025-26 and 2028-29, we currently intend to commission 530 additional specialist
places.
The sector and the Local Authority continue to face challenges related to costs; for the County
Council the imbalance between the cost of providing additional places and the funding that we
receive remains. We will continue to ensure a sufficient s

In [217]:
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)

# Step 1: Gather top-K chunks with optional adjacency
def gather_context_chunks(results_df, top_k=5, adjacent=1):
    seen = set()
    context_chunks = []

    for _, row in results_df.head(top_k).iterrows():
        doc_id = row["doc_id"]
        center_chunk = row["chunk_id"]

        for offset in range(-adjacent, adjacent + 1):
            cid = center_chunk + offset
            key = (doc_id, cid)
            if key in seen:
                continue
            seen.add(key)

            match = chunk_df[
                (chunk_df["doc_id"] == doc_id) &
                (chunk_df["chunk_id"] == cid)
            ]
            if not match.empty:
                text = match.iloc[0]["text"]
                context_chunks.append(text.strip())

    return "\n\n---\n\n".join(context_chunks)

# Step 2: Prompt builder

system_prompt = """
You are Council Assistant, an AI designed to summarize and answer questions using official local government documents.
Your task is to provide a concise and factual answer based solely on the excerpts below.

Instructions:
- Base your answer only on the information provided in the documents.
- Do not make assumptions or use outside knowledge.
- If the information is insufficient or ambiguous, say: "The answer cannot be determined from the documents provided."
- Your answer should be clear, direct, and structured in full sentences.
"""

def build_user_prompt(query, context_text):
    return f"""Answer the following question using only the information in the provided context.

Question:
{query}

Relevant Documents:
{context_text}

Answer:"""

# Step 3: End-to-end function: search + assemble + GPT call
def answer_with_rag(query, top_k=5, adjacent=1):
    # Embed query
    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query
    )
    query_vector = np.array(response.data[0].embedding).astype("float32")

    # Search FAISS
    _, I = index.search(query_vector.reshape(1, -1), top_k)
    top_indices = I[0]

    # Get corresponding rows from metadata_df
    results_df = metadata_df.iloc[top_indices].copy()
    results_df["chunk_id"] = results_df["chunk_id"].astype(int)
    results_df["doc_id"] = results_df["doc_id"].astype(str)

    # Merge with full chunk_df to get complete chunk text
    results_df = results_df.merge(chunk_df, on=["doc_id", "chunk_id"], how="left")

    # Build context and prompt
    context_text = gather_context_chunks(results_df, top_k=top_k, adjacent=adjacent)
    user_prompt = build_user_prompt(query, context_text)

    # Get completion
    completion = client.chat.completions.create(
        model="gpt-4.1-nano", #"gpt-4o-mini",#"gpt-4-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        max_tokens=800
    )

    return completion.choices[0].message.content.strip()

In [218]:
import json

# ✅ Load the FAISS index
import faiss
INDEX_PATH = "../data/embeddings/council_index_large.faiss"
index = faiss.read_index(INDEX_PATH)

# ✅ Load the metadata from JSONL
METADATA_PATH = "../data/embeddings/metadata_large.jsonl"
with open(METADATA_PATH, "r", encoding="utf-8") as f:
    metadata = [json.loads(line) for line in f]

# ✅ Sanity check
assert len(metadata) == index.ntotal, f"Mismatch: {len(metadata)} metadata entries vs {index.ntotal} embeddings"

# ✅ Convert to DataFrame
metadata_df = pd.DataFrame(metadata)
metadata_df["chunk_id"] = metadata_df["chunk_id"].astype(int)
metadata_df["doc_id"] = metadata_df["doc_id"].astype(str)

In [219]:
question = "What was decided about SEN children in 2025?"

In [220]:
search_and_display(question + " (top 5)")    

🔍 Query: 'What was decided about SEN children in 2025? (top 5)'

— Rank 1
📄 File: 24-00115 - Appendix A - Current Health wellbeing activity within KCC.pdf
📅 Date: 2025-01-30 — Committee: cabinet
🧩 Chunk ID: 8
📝 Text: Children who are more level of development at the end of
Based Parenting
likely to experience poorer outcomes, including children in care and care leavers, refugees and those who the Early Years Foundation Stage
support
have offended, will receive more support. We will work together to help individuals, families, communities will have improved from 65.8% to at
• Ensure local access to
and schools build emotional resilience, tackle bullying and loneliness and provide opportunities for children, least 70%.
support for families
young people and families to form supportive networks and take part in social and leisure opportunities.
By 2028/29, the average attainment
Children and young people at most risk of significant and enduring mental health needs will receive timely
8 sco

In [221]:
print(answer_with_rag("How is SEN provision being funded in 2025?", top_k=10, adjacent=1))

In 2025, SEN provision is being funded through a combination of sources including contributions from the Department for Education (DfE), the Council's reserves, and adjustments within the Dedicated Schools’ Grant (DSG). Specifically, the Council expects to receive £9 million from the DfE as part of the Safety Valve Programme, and it will contribute a further £15.1 million from reserves. Additionally, the Schools’ Funding Forum has agreed to transfer 1.2% from the Schools Block to the High Needs Block to support SEN services in mainstream schools. These measures aim to reduce the high needs deficit and support sustainable SEN provision.


In [222]:
print(answer_with_rag("How is SEN provision being funded in 2025?", top_k=5, adjacent=2))

SEN provision in 2025 is being funded through a combination of sources, including a transfer of 1.2% from the Schools Block to the High Needs Block, approved by the Secretary of State, and contributions from the Department for Education (DfE) as part of the Safety Valve Programme. The Council is also working to improve efficiencies and reduce reliance on independent schools by reviewing commissioning strategies and arrangements, supporting inclusive practices in mainstream schools, and collaborating with health and social care partners. Additionally, some costs are met from the General Fund, and efforts are underway to bring the High Needs Block deficit to a breakeven position by 2027-28.


**How is SEN provision being funded in 2025?**

* gpt4-turbo

In 2025, funding for Special Educational Needs (SEN) provision in Kent is supported through a combination of local and national measures:

- **Safety Valve Programme**: Kent County Council and the Department for Education (DfE) are co-funding a plan to eliminate the historic deficit in the High Needs Block. This programme aims to achieve a balanced budget by 2027–28.

- **1.2% Schools Block Transfer**: The Schools’ Funding Forum approved a transfer of 1.2% from the Schools Block to the High Needs Block to enhance support for SEN in mainstream schools.

- **Efficiency and Commissioning Improvements**: The council is reviewing its SEN commissioning strategy to reduce reliance on expensive independent placements, increase consistency, and coordinate more closely with Health and Social Care partners.

- **Parental and School Support**: Efforts are underway to build capacity in mainstream schools and increase parental confidence through inclusive practices and systemic support.

Together, these strategies form the backbone of a sustainable funding approach for SEN provision in 2025.

* gpt4o-mini (10,2)

In 2025, SEN provision is being funded through several mechanisms. The High Needs Block (HNB) of the Dedicated Schools Grant (DSG) is the primary source, which supports payments to maintained schools, academies, independent schools, and other educational institutions for children with special educational needs (SEN). However, there is a significant financial challenge, with a forecasted deficit of £227 million by March 2025.

To address this, the Kent County Council (KCC) is participating in the Safety Valve Programme, which involves contributions from both the Department for Education (DfE) and the Council to help reduce the deficit. By the end of 2024-25, the deficit is expected to decrease to approximately £96 million due to these contributions.

Additionally, the Schools’ Funding Forum has agreed to a 1.2% transfer from the Schools Block to the High Needs Block to support SEN provision in mainstream schools. This transfer has been approved by the Secretary of State. Furthermore, KCC is expected to receive an additional £9 million from the DfE in 2024-25 as part of the Safety Valve commitment, alongside a required contribution of £15.1 million from reserves.

Overall, the funding for SEN provision in 2025 will rely on a combination of the High Needs Block, transfers from the Schools Block, and contributions from both the DfE and KCC as part of the ongoing efforts to manage the deficit and improve the sustainability of SEN services.

* gpt4o-mini (20,1)

In 2025, SEN provision is being funded through the High Needs Block of the Dedicated Schools Grant (DSG), which has been allocated £345.5 million, an increase of £22.8 million from the previous year. This funding supports the educational needs of children with special educational needs and disabilities (SEND) across various educational settings, including mainstream and special schools. Additionally, the Schools’ Funding Forum has agreed to a 1.2% transfer from the Schools Block to the High Needs Block to further support SEN provision in mainstream schools. The Council is also participating in the Safety Valve Programme, which involves contributions from both the Department for Education (DfE) and the Council to address historical deficits and ensure sustainable funding for SEN services.

* "chatgpt-4o-latest" (10,1)

In 2025, SEN provision in Kent is being funded through a combination of sources:

1. **Safety Valve Programme**: Kent County Council (KCC) is receiving contributions from the Department for Education (DfE) as part of the Safety Valve Programme to address the historic deficit in the High Needs Block. In 2024-25, KCC is receiving £9 million from the DfE and contributing £15.1 million from its own reserves.

2. **Schools Block Transfer**: The Schools’ Funding Forum has agreed to transfer 1.2% from the Schools Block to the High Needs Block to support SEN provision in mainstream schools.

3. **High Needs Block of the Dedicated Schools Grant (DSG)**: This block continues to fund SEN placements and support, although it is under significant financial pressure due to rising demand and costs.

4. **Efficiency Measures and Strategic Changes**: KCC is implementing actions outlined in the Safety Valve Agreement to improve sustainability, including reviewing commissioning strategies, increasing mainstream school capacity, and reducing reliance on independent schools.

Despite these funding sources, there remains a significant in-year funding shortfall and an accumulated deficit, indicating ongoing financial challenges in sustaining SEN provision.

* "chatgpt-4o-latest" (5,2)

In 2025, SEN provision is being funded through several sources and strategies:

1. **High Needs Block (HNB) of the Dedicated Schools Grant (DSG):** This remains the primary funding source for SEN provision, covering placements and support in maintained and independent schools, post-16 institutions, and other SEN support services.

2. **Safety Valve Programme:** Kent County Council (KCC) is participating in this Department for Education (DfE) initiative, which includes contributions from both the DfE and the Council to reduce the historic deficit on the High Needs Block. The deficit is forecast to be reduced to £96 million by the end of 2024-25.

3. **Schools Block Transfer:** A 1.2% transfer from the Schools Block to the High Needs Block has been approved by the Schools’ Funding Forum and the Secretary of State to support SEN provision in mainstream schools.

4. **General Fund:** Costs associated with EHCP assessments, placements, and annual reviews are funded from the Council’s General Fund.

5. **Efficiency Measures:** KCC is implementing actions under the Safety Valve Agreement to improve sustainability, including reviewing commissioning strategies, reducing reliance on independent schools, and building capacity in mainstream schools.

Despite these funding sources, there remains a forecast in-year funding shortfall of £54 million for High Needs placements and support in 2024-25 due to rising demand and costs.

* gbt3.5 -turbo (10,1)

SEN provision in Kent in 2025 is being funded through contributions from the Safety Valve Programme, where the Department for Education (DfE) and the Council are reducing the deficit on the High Needs Block. Additionally, there is a 1.2% transfer from the Schools Block to the High Needs Block to support SEN provision in mainstream schools. The Council is also receiving funding from the DfE and contributing from reserves to reduce the deficit further.

* gbt3.5 -turbo (5,2)

SEN provision in 2025 is being funded through various measures including contributions from the Department for Education (DfE) and the Council to reduce the deficit on the High Needs Block. Additionally, there is a 1.2% transfer from the Schools Block to the High Needs Block to support SEN provision in mainstream schools. Efforts are being made to identify efficiencies and improvements in high needs provision, such as reviewing commissioning strategies and arrangements with independent providers. The Council is working towards achieving a positive in-year balance on its central schools' Dedicated Schools Grant reserve by 2027-28 through the Safety Valve Agreement.

* 4.1- nano (10,1)

In 2025, SEN provision is being funded through a combination of sources including contributions from the Department for Education (DfE), the Council's reserves, and adjustments within the Dedicated Schools’ Grant (DSG). Specifically, the Council expects to receive £9 million from the DfE as part of the Safety Valve Programme, and it will contribute a further £15.1 million from reserves. Additionally, the Schools’ Funding Forum has agreed to transfer 1.2% from the Schools Block to the High Needs Block to support SEN services in mainstream schools. These measures aim to reduce the high needs deficit and support sustainable SEN provision.

* 4.1- nano (5,2)

SEN provision in 2025 is being funded through a combination of sources, including a transfer of 1.2% from the Schools Block to the High Needs Block, approved by the Secretary of State, and contributions from the Department for Education (DfE) as part of the Safety Valve Programme. The Council is also working to improve efficiencies and reduce reliance on independent schools by reviewing commissioning strategies and arrangements, supporting inclusive practices in mainstream schools, and collaborating with health and social care partners. Additionally, some costs are met from the General Fund, and efforts are underway to bring the High Needs Block deficit to a breakeven position by 2027-28.

### Maidstone

The documents provided do not mention specific details about Maidstone in the year 2025. The information available discusses birth rates up to 2023, the adoption of the Local Plan Review in 2024, and forecasts about school places without specifying the year 2025. Therefore, the answer cannot be determined from the documents provided.

The local elections originally scheduled for May 2025 may be postponed to May 2026. This postponement is contingent upon the area's participation in the Devolution Priority Programme or the necessity of reorganisation to unlock devolution opportunities. The decision to postpone will be made by central government following a request from the councils involved. This postponement is intended to facilitate the delivery of both reorganisation and devolution within the most ambitious timeframe. Requests for considering election postponement must be submitted by January 10, 2025, and any decision to postpone requires parliamentary approval through secondary legislation.

The documents provided do not specify which individuals or groups explicitly voted against the postponement of the local elections in 2025. They discuss the complexity of the decision, the unease among members, and the process involving a request to the government following a council decision, but they do not list specific votes or positions taken by individuals or groups during a vote. Therefore, the answer cannot be determined from the documents provided.

The key spending items in the 2025 budget include:

1. **Dedicated Schools Grant (DSG)**: There is an anticipated deficit of £23.9 million in 2025-26 for High Needs Education, which is expected to exceed the grant allocation. This deficit is projected to reduce to £8.3 million in 2026-27, with a surplus of £5.9 million forecasted for 2027-28.

2. **Public Health Contracts**: There are contractual and inflationary increases amounting to £197.9 thousand in 2025-26.

3. **Public Health - Sexual Health**: An increase in costs associated with long-acting reversible contraception for GP charges and prescribing costs is expected, totaling £148 thousand in 2025-26.

4. **Public Health - Stop Smoking Services**: There is an increase in Nicotine Replacement Therapy (NRT) due to demand, totaling £75 thousand in 2025-26.

5. **Public Health - Sexual Health**: There is an increased demand for sexual health services, totaling £27.3 thousand in 2025-26.

6. **Family Hubs**: A temporary extension and increase in the Family Hubs and Start for Life Grant is planned, totaling £423.9 thousand in 2025-26.

7. **Household Support Fund**: The government-funded Household Support Fund is extended for a further 6 months, with significant funding of £8,437 thousand allocated for the period up to March 2025.

These items represent a mix of externally funded initiatives and demand-driven costs in areas such as education, public health, and family support services.

5 chunks, 1 nearby, model 4-turbo

The most contentious issues during the 2025 budget debates were primarily related to the significant mismatch between forecast spending growth and available funding, which necessitated finding additional savings and income sources to balance the budget. Specific contentious points included:

1. The reliance on one-off financial solutions in the draft budget for 2025-26, which was not seen as a sustainable long-term strategy.
2. The uncertainty surrounding multi-year settlements from the government, which had not been announced at the time of the budget planning.
3. Significant forecast increases in spending due to factors such as price increases, changes in demand, and other non-inflationary factors affecting costs, including pay increases.
4. The need for compensating savings or temporary unbudgeted funding from reserves due to increased gate fees and contract inflation, particularly in waste management, which faced considerable volatility and pressure on the revenue budget.

These issues collectively contributed to a challenging budgetary environment where balancing the budget required stringent financial management and potentially unpopular fiscal measures.



10 chunks, 2 nearby, turbo 4 

The most contentious issues during the 2025 budget debates included:

1. **Reliance on One-off Measures**: The 2025-26 draft budget continued to rely on one-off financial solutions, which was a point of concern as it indicated a lack of sustainable, long-term financial planning.

2. **Unfunded Spending Growth**: There was significant spending growth that exceeded the forecasted funding, necessitating the need for additional savings and income to balance the budget. This included spending on pay increases and other non-inflationary cost factors.

3. **Insufficient Government Funding**: The administration highlighted that the multi-year settlements from the government had not been announced, which added uncertainty to the financial planning process.

4. **Net Zero Funding and Environmental Commitments**: There were concerns about the affordability of funding for Net Zero requirements, with potential implications for the council's ability to meet environmental targets.

5. **Waste Management Financial Pressures**: The volatility in income from waste management due to market conditions and increased gate fees led to unfunded pressures on the revenue budget.

6. **Capital Works and Infrastructure Risks**: Significant risks were associated with capital works, such as the collapse of the Galley Hill cliff, which required costly remedial measures that the council might need to fund.

7. **Special Educational Needs and Disabilities (SEND) Costs**: There were ongoing financial pressures from increased costs associated with Education & Health Care Plans (EHCP) and home-to-school transport for SEND students.

These issues collectively highlighted the challenges of balancing the need for essential public services against the backdrop of limited funding and rising costs.


model 4o-mini (10,2)


The most contentious issues during the budget debates in 2025 included significant spending growth, particularly in Adult Social Care, which accounted for £80 million of a total £150 million increase. Despite this growth, the council faced a shortfall of over £60 million in required savings/income for 2025-2026, compounded by insufficient government funding that only provided a net increase of £30 million. Additionally, the council's decision to raise council tax by 2.99% plus an additional 2% for the social care premium was contentious, as it highlighted the pressure to make further savings and the impact on discretionary services. The lack of government support for areas like Kent, which do not have high levels of deprivation, further exacerbated the situation, leading to concerns about the sustainability of funding for essential services.


model 4o-mini, (5,1)

The most contentious issues during the budget debates in 2025 included significant spending growth that continued to exceed forecast funding, necessitating savings and income to balance the budget. Key factors contributing to this contentiousness were the reliance on one-off funding, the impact of inflation on costs, particularly in waste management, and the need for compensating in-year savings due to unfunded pressures. Additionally, the uncertainty surrounding multi-year settlements from the government added to the complexity of the budget discussions.

model 4o-mini, (20,1)

The most contentious issues during the budget debates in 2025 included the significant spending growth that exceeded forecast funding, which necessitated savings and income to balance the budget. Specifically, there was a forecast spending growth of £150 million, with £80 million allocated to Adult Social Care, while the council faced a requirement for over £60 million in net savings/income for 2025-2026, in addition to the £90 million saved for 2024-25. 

Another contentious point was the insufficient net growth in funding from the government, which amounted to only £30 million, failing to cover the anticipated spending growth. This led to plans for a council tax increase of 2.99% plus an additional 2% for the social care premium, which was met with concerns about the impact on residents. Additionally, there were discussions regarding the allocation of government grants, as the council did not receive a new recovery grant intended for areas with high deprivation, which further highlighted the financial pressures faced by the council.

In [225]:
import json
chunk_path = "../data/council_documents/cabinet/2025-01-30/chunks/24-00096 - PROD 30th-Jan-2025 10.00 Cabinet_chunks.json"  # use real path to a chunk file
with open(chunk_path, "r") as f:
    data = json.load(f)
    print(data[0].keys())  # see if 'page' is among the keys

dict_keys(['text', 'doc_id', 'committee', 'meeting_date', 'page_num', 'chunk_id', 'char_start', 'char_end', 'source_file', 'source', 'filename'])


In [226]:
import pandas as pd
import json
import faiss
import numpy as np
from openai import OpenAI
import os

# Load API
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Load data
metadata_df = pd.read_pickle("../data/merged_metadata.pkl")
index = faiss.read_index("../data/embeddings/council_index_large.faiss")

print(f"✅ Loaded metadata with {len(metadata_df)} rows and FAISS index with dimension {index.d}")

✅ Loaded metadata with 11785 rows and FAISS index with dimension 3072


In [228]:
query = "What was decided about SEN provision in 2025?"

# Embed the query
embed_response = client.embeddings.create(
    model="text-embedding-3-large",  # gpt-4o-mini uses this embedding model
    input=[query]
)
query_vector = np.array(embed_response.data[0].embedding, dtype="float32").reshape(1, -1)

# Run FAISS search
k = 5  # number of top results
D, I = index.search(query_vector, k)
print("🔎 Top matches (indices):", I[0])

🔎 Top matches (indices): [ 721  720 1176 3278  379]


In [229]:
# Show top 5 matched metadata rows
top_k = 5
top_indices = I[0][:top_k]
top_matches = metadata_df.iloc[top_indices].copy()
top_matches[["doc_id", "committee", "meeting_date", "filename", "page_num", "source", "url"]]

Unnamed: 0,doc_id,committee,meeting_date,filename,page_num,source,url
721,doc_5a8c3d1f,cabinet,2025-01-30,24-00096 - Commissioning Plan for Education Pr...,63,data/council_documents/cabinet/2025-01-30/orig...,https://democracy.kent.gov.uk/documents/s12874...
720,doc_5a8c3d1f,cabinet,2025-01-30,24-00096 - Commissioning Plan for Education Pr...,63,data/council_documents/cabinet/2025-01-30/orig...,https://democracy.kent.gov.uk/documents/s12874...
1176,doc_fa62f954,cabinet,2025-01-30,Appendix L - Treasury Management Strategy 2025...,4,data/council_documents/cabinet/2025-01-30/orig...,https://democracy.kent.gov.uk/documents/s12876...
3278,doc_394f5308,cabinet,2025-03-04,APPENDIX B - ADOPTION of KMWLP 2024-39.pdf,56,data/council_documents/cabinet/2025-03-04/orig...,https://democracy.kent.gov.uk/documents/s12966...
379,doc_1ff16c89,cabinet,2025-01-30,Appendix H - Assessment of FInancial Resilienc...,1,data/council_documents/cabinet/2025-01-30/orig...,https://democracy.kent.gov.uk/documents/s12875...


In [230]:
# Show the text of the matched chunks
for i in top_indices:
    print(f"\n--- Chunk {i} ---\n")
    print(metadata_df.iloc[i]["text"])


--- Chunk 721 ---

7.5. Folkestone and Hythe
District commentary
• The birth rate in Folkestone and Hythe continued to fall in 2023 and it dropped below the
County average but is comparable to the National rate. The number of recorded births has
also fallen, with just 898 births recorded in 2023.
• We forecast around 23% of primary school places will be surplus across the District
throughout the Plan period.
• Within the secondary sector, we forecast a small deficit of non-selective secondary school
places in both Folkestone and Hythe and Romney Marsh at different points.
• The adopted Core Strategy (2022) sets out a long-term vision for the District from 2019/20
to 2036/37.

--- Chunk 720 ---

7.5. Folkestone and Hythe
District commentary
• The birth rate in Folkestone and Hythe continued to fall in 2023 and it dropped below the
County average but is comparable to the National rate.

--- Chunk 1176 ---

Link forecast that the Bank of England will reduce Bank Rate (in cuts of 25bps) t

In [231]:
import pandas as pd

metadata_df = pd.read_pickle("../data/merged_metadata.pkl")

# Check if 'document_type' column exists
if 'document_type' in metadata_df.columns:
    print("✅ 'document_type' column found")
    print("Unique values (sample):", metadata_df['document_type'].dropna().unique()[:10])
else:
    print("❌ 'document_type' column is missing")

❌ 'document_type' column is missing


In [233]:
query = "What was mentioned about Maidstone in 2025?"
query_vector = np.array(get_embedding(query)).astype("float32").reshape(1, -1)

# Run FAISS search
k = 5
D, I = index.search(query_vector, k)
indices = I  # store for later steps

print("🔎 Top matches (indices):", indices[0])

🔎 Top matches (indices): [1006 1005 2887 6024 2642]


In [234]:
selected_rows = metadata_df.iloc[indices[0]].copy()
selected_rows["chunk_id"] = selected_rows["chunk_id"].astype(int)
selected_rows[["doc_id", "chunk_id", "source_file", "filename", "committee", "meeting_date"]]

Unnamed: 0,doc_id,chunk_id,source_file,filename,committee,meeting_date
1006,doc_21db8ba1,2,cabinet/2025-01-30/originals/Item 7 - Suppleme...,Item 7 - Supplementary - 24-00109 - Transfer t...,cabinet,2025-01-30
1005,doc_21db8ba1,1,cabinet/2025-01-30/originals/Item 7 - Suppleme...,Item 7 - Supplementary - 24-00109 - Transfer t...,cabinet,2025-01-30
2887,doc_8c9da9ea,7,cabinet/2025-03-04/originals/APPENDIX A - Plan...,APPENDIX A - Planning Inspectors Report on the...,cabinet,2025-03-04
6024,doc_de20114f,0,full_council/2025-03-13/originals/Item 8 - Ken...,Item 8 - Kent Minerals and Waste Local Plan 20...,full_council,2025-03-13
2642,doc_19a76705,3,cabinet/2025-03-04/originals/Minutes of Previo...,Minutes of Previous Meeting.pdf,cabinet,2025-03-04


In [235]:
# Load the chunk JSONs
import json
from pathlib import Path

# Display matching file paths
selected_files = selected_rows["source_file"].tolist()
selected_chunks = []

for path in selected_files:
    json_path = Path("data/council_documents") / path.replace("/originals/", "/chunks/").replace(".pdf", "_chunks.json")
    if json_path.exists():
        with open(json_path) as f:
            chunks = json.load(f)
            for row in selected_rows[selected_rows["source_file"] == path].itertuples():
                selected_chunks.append(chunks[row.chunk_id]["text"])
    else:
        selected_chunks.append(f"❌ Missing chunk file: {json_path}")

# Show result
for i, chunk in enumerate(selected_chunks):
    print(f"\n--- Chunk {i+1} ---\n{chunk[:500]}...")


--- Chunk 1 ---
❌ Missing chunk file: data/council_documents/cabinet/2025-01-30/chunks/Item 7 - Supplementary - 24-00109 - Transfer the 18-25 section of the Strengthening Independence Ser_chunks.json...

--- Chunk 2 ---
❌ Missing chunk file: data/council_documents/cabinet/2025-01-30/chunks/Item 7 - Supplementary - 24-00109 - Transfer the 18-25 section of the Strengthening Independence Ser_chunks.json...

--- Chunk 3 ---
❌ Missing chunk file: data/council_documents/cabinet/2025-03-04/chunks/APPENDIX A - Planning Inspectors Report on the Examination of the KMWLP 2024-39_chunks.json...

--- Chunk 4 ---
❌ Missing chunk file: data/council_documents/full_council/2025-03-13/chunks/Item 8 - Kent Minerals and Waste Local Plan 2024-39 13th-Mar-2025 10.00 County Council_chunks.json...

--- Chunk 5 ---
❌ Missing chunk file: data/council_documents/cabinet/2025-03-04/chunks/Minutes of Previous Meeting_chunks.json...


In [237]:
import json
import glob

chunk_paths = glob.glob("../data/council_documents/**/chunks/*.json", recursive=True)
print(f"Found {len(chunk_paths)} chunk files")

# Load one example
with open(chunk_paths[0]) as f:
    chunks = json.load(f)

# Inspect structure
print(chunks[0].keys())
chunks[0]

Found 143 chunk files
dict_keys(['text', 'doc_id', 'committee', 'meeting_date', 'page_num', 'chunk_id', 'char_start', 'char_end', 'source_file', 'source', 'filename'])


{'text': 'CABINET\nThursday, 13th March, 2025, at 3.00 pm Ask for: Georgina Little\n(or on the rising of Full Council,\nwhichever is the later)\nCouncil Chamber, Sessions House, Telephone: Tel: 03000 414043\nCounty Hall, Maidstone Email:\ngeorgina.little@kent\n.gov.uk\n.\nNote – Meeting organised to allow for consideration, if required by County Council, of\nDecision 24/00093 – Commissioned Family Hubs.\nUNRESTRICTED ITEMS\n(During these items the meeting is likely to be open to the public)\n1. Apologies\n2.',
 'doc_id': 'doc_85f48edf',
 'committee': 'cabinet',
 'meeting_date': '2025-03-13',
 'page_num': 1,
 'chunk_id': 0,
 'char_start': 0,
 'char_end': 489,
 'source_file': 'cabinet/2025-03-13/originals/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf',
 'source': 'data/council_documents/cabinet/2025-03-13/originals/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf',
 'filename': 'Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf'}

In [238]:
from openai import OpenAI
import numpy as np

# ✅ Setup embedding model
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
model = "text-embedding-3-small"

def get_embedding(text):
    return client.embeddings.create(input=[text], model=model).data[0].embedding

# ✅ Create query vector
query = "What was mentioned about Maidstone in 2025?"
query_vector = np.array(get_embedding(query)).astype("float32").reshape(1, -1)

# ✅ Load FAISS index
import faiss, pickle
index = faiss.read_index("data/faiss_index.index")
with open("../data/merged_metadata.pkl", "rb") as f:
    metadata_df = pickle.load(f)

# ✅ Search
k = 5
D, I = index.search(query_vector, k)
selected_rows = metadata_df.iloc[I[0]].copy()
selected_rows["chunk_id"] = selected_rows["chunk_id"].astype(int)

# ✅ Show retrieved chunks
selected_rows[["filename", "committee", "meeting_date", "page_num"]]

RuntimeError: Error in faiss::FileIOReader::FileIOReader(const char *) at /Users/runner/work/faiss-wheels/faiss-wheels/faiss/faiss/impl/io.cpp:68: Error: 'f' failed: could not open data/faiss_index.index for reading: No such file or directory

In [240]:
import os
from glob import glob

chunk_files = glob("data/council_documents/**/chunks/*.json", recursive=True)
print("✅ Chunk files found:", len(chunk_files))

# Show a few paths if any
for path in chunk_files[:5]:
    print(path)

✅ Chunk files found: 0


In [239]:
import os
import json
from glob import glob

sample_chunks = glob("data/council_documents/**/chunks/*.json", recursive=True)
print("✅ Found", len(sample_chunks), "chunk files")

with open(sample_chunks[0], "r") as f:
    chunk_data = json.load(f)

# Display sample
for chunk in chunk_data[:2]:
    print("🔹 source:", chunk.get("source"))
    print("🔹 source_file:", chunk.get("source_file"))
    print("✅ exists:", os.path.exists(chunk.get("source")))
    print()

✅ Found 0 chunk files


IndexError: list index out of range

In [241]:
import pandas as pd

df = pd.read_pickle("../data/merged_metadata.pkl")
df.shape  # should show number of rows and columns

(0, 11)

In [243]:
import os

os.path.getsize("../data/merged_metadata.pkl")  # shows file size in bytes

899

In [244]:
import glob
import json

paths = glob.glob("../data/council_documents/**/originals/metadata.json", recursive=True)
print(f"Found {len(paths)} metadata files.")
print(paths[:3])  # show sample paths

Found 0 metadata files.
[]


In [245]:
from pathlib import Path

# Step 1: Locate all chunk files
BASE_DIR = Path("../data/council_documents")
chunk_files = list(BASE_DIR.rglob("chunks/*.json"))
print(f"🔍 Found {len(chunk_files)} chunk files")
chunk_files[:5]  # Show a few sample paths

🔍 Found 143 chunk files


[PosixPath('../data/council_documents/cabinet/2025-03-13/chunks/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet_chunks.json'),
 PosixPath('../data/council_documents/cabinet/2025-03-13/chunks/Public reports pack 13th-Mar-2025 15.00 Cabinet_chunks.json'),
 PosixPath('../data/council_documents/cabinet/2025-01-30/chunks/Appendix A Capital Investment Summary_chunks.json'),
 PosixPath('../data/council_documents/cabinet/2025-01-30/chunks/Printed minutes 30th-Jan-2025 10.00 Cabinet_chunks.json'),
 PosixPath('../data/council_documents/cabinet/2025-01-30/chunks/24-00115 - Appendix B - KCC ICS priorities measures_chunks.json')]

In [246]:
import json

# Try loading the first chunk file
sample_path = chunk_files[0]
with sample_path.open("r", encoding="utf-8") as f:
    sample_chunks = json.load(f)

print(f"✅ Loaded {len(sample_chunks)} chunks from:\n{sample_path}")
sample_chunks[0]  # Show the structure of the first chunk

✅ Loaded 2 chunks from:
../data/council_documents/cabinet/2025-03-13/chunks/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet_chunks.json


{'text': 'CABINET\nThursday, 13th March, 2025, at 3.00 pm Ask for: Georgina Little\n(or on the rising of Full Council,\nwhichever is the later)\nCouncil Chamber, Sessions House, Telephone: Tel: 03000 414043\nCounty Hall, Maidstone Email:\ngeorgina.little@kent\n.gov.uk\n.\nNote – Meeting organised to allow for consideration, if required by County Council, of\nDecision 24/00093 – Commissioned Family Hubs.\nUNRESTRICTED ITEMS\n(During these items the meeting is likely to be open to the public)\n1. Apologies\n2.',
 'doc_id': 'doc_85f48edf',
 'committee': 'cabinet',
 'meeting_date': '2025-03-13',
 'page_num': 1,
 'chunk_id': 0,
 'char_start': 0,
 'char_end': 489,
 'source_file': 'cabinet/2025-03-13/originals/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf',
 'source': 'data/council_documents/cabinet/2025-03-13/originals/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf',
 'filename': 'Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf'}

In [247]:
# Locate metadata.json for this chunk file
metadata_path = sample_path.parent.parent / "metadata.json"
print("🔍 Looking for metadata here:", metadata_path)

if metadata_path.exists():
    with metadata_path.open("r", encoding="utf-8") as f:
        meta = json.load(f)
    print(f"✅ Loaded {len(meta)} metadata records")
    meta[:2]  # show a few examples
else:
    print("❌ metadata.json not found")

🔍 Looking for metadata here: ../data/council_documents/cabinet/2025-03-13/metadata.json
✅ Loaded 2 metadata records


In [249]:
# Look at all the chunk filenames in this file
print("📄 Filenames in chunks:")
for c in chunks:
    fname = c.get("filename")
    found = fname in url_lookup
    print("✔️" if found else "❌", fname)

# Check a specific one
first_fname = chunks[0].get("filename")
print("\n🔍 First chunk filename:", first_fname)
print("➡️ Metadata match:", url_lookup.get(first_fname, "❌ Not found"))

📄 Filenames in chunks:
✔️ Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf
✔️ Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf

🔍 First chunk filename: Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf
➡️ Metadata match: {'url': 'https://democracy.kent.gov.uk/documents/g9767/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf?T=0', 'document_type': ''}


In [250]:
import json
from pathlib import Path

# Path to the metadata.json you just checked
meta_path = Path("../data/council_documents/cabinet/2025-03-13/metadata.json")

with meta_path.open("r", encoding="utf-8") as f:
    metadata_raw = json.load(f)

# Display cleanly
import pandas as pd
pd.DataFrame(metadata_raw)

Unnamed: 0,filename,path,type,committee,meeting_date,document_category,url,created
0,Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf,originals/Agenda frontsheet 13th-Mar-2025 15.0...,pdf,cabinet,2025-03-13,agenda_frontsheet,https://democracy.kent.gov.uk/documents/g9767/...,2025-05-07T00:35:46.746119
1,Public reports pack 13th-Mar-2025 15.00 Cabine...,originals/Public reports pack 13th-Mar-2025 15...,pdf,cabinet,2025-03-13,report,https://democracy.kent.gov.uk/documents/g9767/...,2025-05-07T00:35:47.721603


In [251]:
import json
from pathlib import Path

chunk_files = list(Path("../data/council_documents").rglob("*_chunks.json"))
print(f"Found {len(chunk_files)} chunk files")

# Load and inspect the first 3
for f in chunk_files[:3]:
    with open(f) as file:
        data = json.load(file)
    source_path = data[0].get("source", "MISSING")
    print(f"\n📄 File: {f}")
    print(f"🔗 source: {source_path}")

Found 143 chunk files

📄 File: ../data/council_documents/cabinet/2025-03-13/chunks/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet_chunks.json
🔗 source: data/council_documents/cabinet/2025-03-13/originals/Agenda frontsheet 13th-Mar-2025 15.00 Cabinet.pdf

📄 File: ../data/council_documents/cabinet/2025-03-13/chunks/Public reports pack 13th-Mar-2025 15.00 Cabinet_chunks.json
🔗 source: data/council_documents/cabinet/2025-03-13/originals/Public reports pack 13th-Mar-2025 15.00 Cabinet.pdf

📄 File: ../data/council_documents/cabinet/2025-01-30/chunks/Appendix A Capital Investment Summary_chunks.json
🔗 source: data/council_documents/cabinet/2025-01-30/originals/Appendix A Capital Investment Summary.pdf


In [252]:
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

BASE_DIR = Path("../data/council_documents")
OUTPUT_PATH = Path("../data/merged_metadata.pkl")

# Look for all chunk files under committee/meeting/chunks/
chunk_files = list(BASE_DIR.glob("*/*/chunks/*__chunks.json"))
print(f"🔍 Found {len(chunk_files)} chunk files")

rows = []

for path in tqdm(chunk_files):
    try:
        with open(path, "r", encoding="utf-8") as f:
            chunks = json.load(f)
    except Exception as e:
        print(f"❌ Error reading {path}: {e}")
        continue

    # Get committee and meeting date from path
    try:
        committee = path.parts[2]   # e.g. 'cabinet'
        meeting_date = path.parts[3]  # e.g. '2025-01-30'
    except IndexError:
        print(f"⚠️ Skipping malformed path: {path}")
        continue

    for chunk in chunks:
        row = {
            "chunk_id": chunk.get("chunk_id"),
            "doc_id": chunk.get("doc_id"),
            "source": chunk.get("source"),
            "hash": chunk.get("hash"),
            "url": chunk.get("url"),
            "committee": committee,
            "meeting_date": meeting_date,
            "source_path": str(path.relative_to("data"))
        }
        rows.append(row)

# Build and save DataFrame
data = pd.DataFrame(rows)
data.to_pickle(OUTPUT_PATH)
print(f"✅ Saved {len(data)} rows to {OUTPUT_PATH}")

🔍 Found 0 chunk files


0it [00:00, ?it/s]

✅ Saved 0 rows to ../data/merged_metadata.pkl



