In [2]:
import re
import fitz  # PyMuPDF
import json
from collections import defaultdict
import pandas as pd
import os
from nltk.tokenize import sent_tokenize

In [3]:
# one file:
pdf_folder = 'data/batch/'
metting_name = "gib_mcp_rgc_min__2024-04-09__01"
pdf_file = metting_name + ".pdf"
pdf_path = os.path.join(pdf_folder, pdf_file)
# read the info from a xlsx file
df = pd.read_excel("data/meetingmap.xlsx")
df = df.set_index('standard name')
# get the info of the meeting
meeting_info = df.loc[pdf_file]
# return all the columns for the meeting
meeting_info = meeting_info.to_dict()
meeting_info

{'location': 'gibsons',
 'location type': 'municipality',
 'meeting type': 'regular_council',
 'data type': 'minutes',
 'meeting date': Timestamp('2024-04-09 00:00:00'),
 'transcript': 'Yes',
 'comment': nan}

In [4]:
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text


# create a dictionary to store the text of each pdf and the metadata and if there is a txt with the same name
context = {}
    
text = extract_text_from_pdf(pdf_path)
context['pdf'] = text
# get the metadata
metadata = fitz.open(pdf_path).metadata
context['metadata'] = metadata
# check if there is a txt with the same name
txt_path = os.path.join(pdf_folder, pdf_file.replace('.pdf', '.txt'))
if os.path.exists(txt_path):
    with open(txt_path, 'r') as file:
        context['transcript'] = file.read()

else:
        context['transcript'] = None

In [5]:
context.keys()

dict_keys(['pdf', 'metadata', 'transcript'])

In [6]:
# load the theme keywords from themekeywordmap.xlsx
# convert it to a dictionary with the theme as key and different keywords as a list
# it has two columns: theme and single phrase
# combine all the phrases that have the same theme in  a list as follows
# category_keywords = {
#     "theme 1": ["phrase 1", "phrase 2", "phrase 3"],
#     "theme 2": ["phrase 4", "phrase 5", "phrase 6", "phrase 7"]
# }
df = pd.read_excel("data/themekeywordmap.xlsx")
category_keywords = {}
for theme, group in df.groupby('theme'):
    category_keywords[theme] = group['phrase'].tolist()
print("total number of themes: ", len(category_keywords.keys()))



total number of themes:  56


# Step 1: Segment using NLP

In [7]:
split_patterns = [
    r"MOVED by",
    r"SECONDED by",
    r"WHEREAS",
    r"THEREFORE BE IT RESOLVED THAT",
    r"CARRIED UNANIMOUSLY",
    r"REJECTED",
    r"THAT",
    r"APPROVED",
    r"ADOPTED",
    r"RESOLVED"
]

In [8]:
def segment_document(document, patterns):
    """
    Segment the document based on defined patterns.
    """
    combined_pattern = '|'.join(patterns)
    segments = re.split(combined_pattern, document, flags=re.IGNORECASE)
    
    # Filter out empty segments and strip whitespace
    segments = [seg.strip() for seg in segments if seg.strip()]
    return segments

In [9]:
def match_keywords(segment, category_keywords):
    """
    Match segments against category keywords.
    """
    matched_categories = []
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', segment, re.IGNORECASE):
                matched_categories.append(category)
                break  # Break after the first match to avoid redundant checks
    return matched_categories

In [10]:
def combine_segments(segments, category_keywords):
    """
    Combine nearby segments and filter them based on category keywords.
    """
    combined_segments = []
    current_segment = ""
    current_categories = set()

    for segment in segments:
        matched_categories = match_keywords(segment, category_keywords)
        if matched_categories:
            if current_segment:
                combined_segments.append({
                    "text": current_segment,
                    "categories": list(current_categories)
                })
                current_segment = segment
                current_categories = set(matched_categories)
            else:
                current_segment = segment
                current_categories.update(matched_categories)
        else:
            current_segment += " " + segment
    
    if current_segment:
        combined_segments.append({
            "text": current_segment,
            "categories": list(current_categories)
        })

    return combined_segments

In [12]:
segments = segment_document(context['pdf'], split_patterns)
combined_segments = combine_segments(segments, category_keywords)

In [13]:
# initial assesment results
print("Total initial segments:", len(combined_segments))

Total initial segments: 8


In [15]:
#output_nlp = json.dumps(combined_segments, indent=2)

In [14]:
len(segments)

42

In [11]:
def combine_segments_v2(segments, category_keywords):
    """
    Combine nearby segments and filter them based on category keywords.
    """
    combined_segments = []
    current_segment = ""
    current_categories = set()
    last_matched_categories = set()

    for segment in segments:
        matched_categories = match_keywords(segment, category_keywords)
        if matched_categories:
            # If current segment is not empty and the new segment has different categories,
            # add the current segment to combined_segments and start a new one
            if current_segment and matched_categories != last_matched_categories:
                combined_segments.append({
                    "text": current_segment,
                    "categories": list(current_categories)
                })
                current_segment = segment
                current_categories = set(matched_categories)
            else:
                current_segment += " " + segment
                current_categories.update(matched_categories)
            last_matched_categories = matched_categories
        else:
            current_segment += " " + segment
    
    # Append the last segment
    if current_segment:
        combined_segments.append({
            "text": current_segment,
            "categories": list(current_categories)
        })

    return combined_segments

In [16]:
Proposal_indicators = ["Moved", "Seconded", "Motion", "Carried", "Proposal", "Passed", "Adopted", "Adoption", "Rejected", "Lost", "Moved", "approve", "Seconded" , "Adopt", "Resolution", "rejected", "Ordinance", "defeated", "discussed", "withdrawn", "tabled", "Amendment", "Amendment", "Recommendation", "granted", "Petition", "denied", "Vote", "result"]
# lower case it
Proposal_indicators = [x.lower() for x in Proposal_indicators]

In [12]:

def combine_segments_v3(segments, category_keywords, Proposal_indicators):
    combined_proposals = []
    temp_segment = ""

    def contains_proposal_indicators(text):
            return any(indicator in text.lower() for indicator in Proposal_indicators)
    def match_keywords(segment, category_keywords):
        """
        Match segments against category keywords.
        """
        matched_categories = []
        for category, keywords in category_keywords.items():
            for keyword in keywords:
                if re.search(r'\b' + re.escape(keyword) + r'\b', segment, re.IGNORECASE):
                    matched_categories.append(category)
        return matched_categories
  
    
    for seg in segments:
        """Splits/combine the segments such that it contains one proposals indicator."""
        for line in seg.split('\n'):
            temp_segment += line + " "
            if contains_proposal_indicators(line):
                categories = match_keywords(temp_segment, category_keywords)
                combined_proposals.append({
                    "text": temp_segment.strip(),
                    "categories": list(set(categories))
                })
                temp_segment = ""
                category_set = set()
    if temp_segment.strip():
        # append it to the last proposal
        categories = match_keywords(temp_segment, category_keywords)
        combined_proposals[-1] = {
                    "text": combined_proposals[-1]["text"] + " " + temp_segment.strip(),
                    "categories": list(set(categories+ combined_proposals[-1]["categories"])),
                }
    for i in reversed(range(len(combined_proposals))):
        if len(combined_proposals[i]["text"]) < 50:
            combined_proposals[i-1]["text"] = combined_proposals[i-1]["text"] + " " + combined_proposals[i]["text"]
            # remove the current segment
            del combined_proposals[i]
    return combined_proposals

In [81]:
# combined_segments = combine_segments(segments, category_keywords)
#combined_segments = combine_segments_v2(segments, category_keywords)
combined_segments = combine_segments_v3(segments, category_keywords, Proposal_indicators)

In [82]:
len(combined_segments)

13

In [23]:
combined_segments[3]

{'text': '2024 Councillor Thompson Councillor Croal Development Permit Delegation Authority Amendment Bylaw No.1054- 04, 2024 be .  CARRIED',
 'categories': ['housing']}

# step 2: find category based on vector database

In [13]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')


# Generate embeddings for each category based on keywords
category_embeddings = {}
for category, keywords in category_keywords.items():
    category_embeddings[category] = model.encode(keywords, convert_to_tensor=True)

# Flatten category embeddings for FAISS indexing
flat_embeddings = []
category_indices = []
for category, embeddings in category_embeddings.items():
    for embedding in embeddings:
        flat_embeddings.append(embedding.cpu().detach().numpy())
        category_indices.append(category)

flat_embeddings = np.vstack(flat_embeddings)

# Create FAISS index
dimension = flat_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(flat_embeddings)

# Save FAISS index and data for later use
faiss.write_index(index, 'data/faiss_index.bin')
np.save('data/category_indices.npy', category_indices)


  from tqdm.autonotebook import tqdm, trange
  return torch._C._cuda_getDeviceCount() > 0


In [14]:
def find_elbow_point_indices(data):
    # Sort the data and keep track of the original indices
    sorted_data_with_indices = sorted((val, idx) for idx, val in enumerate(data))
    sorted_data = [val for val, idx in sorted_data_with_indices]
    sorted_indices = [idx for val, idx in sorted_data_with_indices]
    
    # Calculate the differences between consecutive elements
    differences = np.diff(sorted_data)
    
    # Find the index where the difference significantly increases
    elbow_index = np.argmax(differences)
    
    # Find the elbow point value
    elbow_point = sorted_data[elbow_index]
    
    # Find indices of elements to remove (smaller than or equal to the elbow point)
    to_remove_indices = [idx for idx, val in enumerate(data) if val <= elbow_point]
    return to_remove_indices

In [15]:
def query_vector_database(text, model, index, category_indices,num_categories_to_search=20):
    """
    Query the FAISS index with a text embedding and return the most relevant categories.
    Avoid returning repetitive categories and apply a similarity threshold.
    """
    # Generate embedding for the text
    text_embedding = model.encode([text], convert_to_tensor=True)
    text_embedding = text_embedding.cpu().detach().numpy()


    # Search the index for the most similar embeddings
    distances, indices = index.search(text_embedding, num_categories_to_search)
    to_remove = find_elbow_point_indices(distances[0])

    # Filter out categories based on the threshold and avoid repetitions
    seen_categories = set()
    categories_by_vd = []
    for idx in  indices[0]:
        if idx in to_remove:
            continue
        category = category_indices[idx]
        if category not in seen_categories:
            categories_by_vd.append(category)
            seen_categories.add(category)

    

    return categories_by_vd

In [None]:
# Load the FAISS index and category indices
index = faiss.read_index('data/faiss_index.bin')
category_indices = np.load('data/category_indices.npy', allow_pickle=True)


# Verify categories and update segments
for segment in combined_segments:
    suggested_categories = query_vector_database(segment['text'], model, index, category_indices)
    segment["categories_by_vd"] = suggested_categories
    segment["id"] = combined_segments.index(segment)

# Print the updated categorized segments
for segment in combined_segments:
    print(json.dumps(segment, indent=2))

In [88]:
len(combined_segments)

13

In [51]:
combined_segments[10]

{'text': 'Regular Council Meeting Minutes - Tuesday, April 9, 2024    R2024-71  Support for Community Emergency Preparedness Fund (CEPF) Councillor De Andrade Councillor Croal Council supports the Sunshine Coast Regional District applying for,  receiving, and managing Community Emergency Preparedness Fund (CEPF)  Evacuation Route Planning grant funding on behalf of the Town of Gibsons.  CARRIED',
 'categories': ['housing',
  'environmental_exposures__extreme_weather',
  'injury_prevention',
  'other__geographically-oriented',
  'other__emergency_management'],
 'categories_by_vd': ['housing',
  'environmental_exposures__extreme_weather',
  'injury_prevention',
  'youth_children',
  'other__emergency_management',
  'mental_health',
  'injury_prevention__youth_self-harm',
  'environmental_exposures__air_quality_'],
 'id': 'gib_mcp_rgc_min__2024-04-09__01|10'}

# step 3: lang-graph

In [17]:
local_llm = "llama3"

In [18]:
# load the LANGCHAIN_API_KEY from the environment
import os
from dotenv import load_dotenv
load_dotenv()
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

In [19]:
### Index

from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

In [20]:
# split the context['pdf'] and create vector store
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300, chunk_overlap=10
)
def filter_none_values(metadata):
    return {k: v for k, v in metadata.items() if v is not None}
filtered_metadata = filter_none_values(context['metadata'])

text_splits = text_splitter.split_text(context['pdf'])
metadata_list = [filtered_metadata] * len(text_splits)

# Add  text_splits to vectorDB with  nomic-embed-text-v1.5  and inference_mode="local
vectorstore = Chroma.from_texts(
        texts=text_splits,
        metadatas=metadata_list,
        # embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
        embedding=embeddings.OllamaEmbeddings(model="nomic-embed-text:v1.5"),
)
retriever = vectorstore.as_retriever()

In [146]:
len(text_splits)

6

In [27]:
prompt_count = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> 
You are an assitance that detect the number of unique proposals with a specific categoty (theme) in a suggested proposal of a council meeting note. \n
You are provided with suggested proposal and its category as user prompt. \n
The goal is to find count of unique proposal in the suggested proposal with the specific category. \n
Give a integer count of unique proposal as a JSON with single key 'count'. \n

example:
proposal: ### Minutes of the Council meeting of February 6, 2024, be approved.\n CARRIED UNANIMOUSLY \n Council Meeting\n Minutes, February 27, 2024 3\n 3. Council (City Finance and Services) \n MOVED by Councillor Dominato\n SECONDED by Councillor Carr\n THAT the Minutes of the Council meeting following the Standing Committee on City\n Finance and Services meeting of February 7, 2024, be approved.\n CARRIED UNANIMOUSLY\n 4. Court of Revision (Business Improvement Areas) - February 8, 2024\n MOVED by Councillor Bligh\n SECONDED by Councillor Zhou\n  THAT the Minutes of the Court of Revision (Business Improvement Areas) meeting of\n February 8, 2024, be approved.\n CARRIED UNANIMOUSLY\n MATTERS ADOPTED ON CONSENT 
categories: ["City Finance and Services", "Business Improvement Areas"]
output: "count": 2

Let's think step by step. Here are the steps to solve the task:
1. Validity check: A proposal should suggest some action or decision and it should have a unique decision. 
2. Proposal Count: See if the suggested proposal includes more than one proposal. calculate the count.
5. Write: Only return the count as integer in a JSON format with a single key 'count'.

RULES:
- your output MUST HAVE the exact JSON format.
- Your answer must not include any speculation or inference. Do not assume or change dates and times. Only provide information that is explicitly stated in the context.
- An answer is considered grounded if **all** information in **every** sentence in the answer is **explicitly** mentioned in the source context, **no** extra information is added and **no** inferred information is added.

     <|eot_id|><|start_header_id|>user<|end_header_id|>
    Category:\n {category} \n\n
    suggested_proposal:\n {suggested_proposal} \n\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["category", "suggested_proposal"],
)

In [28]:
num = 12
suggested_proposal = combined_segments[num]["text"]
llm = ChatOllama(model=local_llm, format="json", temperature=0)
retrieval_counter = prompt_count | llm | JsonOutputParser()
category = combined_segments[num]["categories_by_vd"]
print(retrieval_counter.invoke({"category": category, "suggested_proposal": suggested_proposal}))

{'count': 0}


In [29]:
import difflib

def find_closest_match(original_string, substring):
    # Initialize variables
    closest_match_index = -1
    highest_similarity = 0
    substring_length = len(substring)
    
    # Define a function to calculate similarity ratio
    def similarity(s1, s2):
        return difflib.SequenceMatcher(None, s1, s2).ratio()
    
    # Compare the substring against all possible substrings of the same length in the original string
    for i in range(len(original_string) - substring_length + 1):
        current_substring = original_string[i:i + substring_length]
        current_similarity = similarity(current_substring, substring)
        if current_similarity > highest_similarity:
            highest_similarity = current_similarity
            closest_match_index = i
    
    return closest_match_index

In [30]:
"""### Example:
#### User Input:  
# "context": "The meeting was called to order at 10:00 AM. Moved by John Doe, the proposal to increase the budget for the community park was discussed. The committee deliberated on various aspects. Motion Carried. Moved by Tom. The agenda item to increase wages. Motion Carried. The next item"\n
"suggested_proposal": "increase the budget for the community park was discussed." \n\n
#### Model correct Output:
"start": "Moved by John Doe, the proposal to increase",
"end": " deliberated on various aspects. Motion Carried."""

prompt_boundry = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> 
You are an AI assistance to identify the adjusted boundary of a suggested_proposal within context (meeting notes). \n
You are given the context and the suggested proposal (a chunk of the context that is not accurately bounded). \n
Your task is to first locate the exact chunk (suggested_proposal) in the context, then add or remove words from it such that the new chunk include all related information about the proposal. 
Write the first and last 5 to 10 words of the context that is about the suggested proposal. Do NOT write about another proposal. Only one proposal should be in the boundary. \n

### Instructions:
1. **Output Format**:
    - Provide the output in JSON format with the keys 'start' and 'end'.
    - Both 'start' and 'end' values should include atleast five and atmost ten words from the context. 

2. **Rules**:
    - **Use the suggested proposal as the basis**: Ensure the adjusted boundary in the context indeed is same as the original suggested_proposal.
    - Do not use speculations or inferences. Only provide information that is explicitly stated in the context and is about the suggested_proposal.
    - Maintain the coherence and logical flow of the proposal.
    - All necessary information such as the mover, the proposal title, the vote result, or note are considered related information and should be inside the boundary.
    - ** word count**: DO NOT return less than 5 words or more than 10 words for each boundary marker 'start' and 'end'.
    - only one proposal that is related to the suggested_proposal should be selected.

     <|eot_id|><|start_header_id|>user<|end_header_id|>
     Given the context and suggested proposal, follow the above instructions to determine the precise boundary. Ensure your output is in JSON with the required word count. Return ONLY and only one proposal. \n\n
    "context":\n {document} \n\n
    "suggested_proposal":\n {suggested_proposal} \n\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["document", "suggested_proposal"],
)




In [31]:
num = 0
suggested_proposal = combined_segments[num]["text"]

llm_boundry = ChatOllama(model=local_llm, temperature=0, keep_alive=0)
retrieval_boundary = prompt_boundry | llm_boundry | JsonOutputParser()
txt_chunk = retriever.invoke(suggested_proposal)[:3]
txt_chunk = [d.page_content for d in txt_chunk]
boundry = retrieval_boundary.invoke({"document": txt_chunk, "suggested_proposal": suggested_proposal})
print(boundry)

{'start': 'The Mayor called the meeting to order at 7:00pm. APPROVAL OF THE AGENDA R2024-63 Regular Council Agenda - April 9, 2024 Councillor De Andrade Councillor Lumley the Regular Business Agenda of April 9, 2024 be adopted.', 'end': 'CARRIED ADOPTION OF MINUTES'}


In [137]:
suggested_proposal

'Regular Council  MEETING MINUTES  Tuesday, April 9, 2024  Council Chambers, 7:00pm  Town Hall, 474 South Fletcher Road, Gibsons, BC      PRESENT:  Mayor Silas White   Councillor David Croal  Councillor Annemarie De Andrade  Councillor Stafford Lumley  Councillor Christi Thompson  Youth Representative Cael Read     STAFF:     Emanuel Machado, Chief Administrative Officer  Rebecca Anderson, Corporate Officer  Lorraine Coughlin, Director of Finance  Trevor Rutley, Director of Infrastructure Services  Lesley-Anne Staats, Director of Planning via Zoom   Noni Weitz, Manager of Financial Services  Heidi Siller, Executive Assistant (recorder)    CALL TO ORDER  The Mayor called the meeting to order at 7:00pm.    APPROVAL OF THE AGENDA      R2024-63  Regular Council Agenda - April 9, 2024 Councillor De Andrade Councillor Lumley the Regular Business Agenda of April 9, 2024 be .  CARRIED ADOPTION OF MINUTES'

In [32]:
def adjusted_proposal_boundary(txt_chunk, proposal, boundry):
    start = boundry["start"]
    end = boundry["end"]
    # check if the length of end or start is less 30
    if len(start) < 30:
        start = start + ' ' + end[:20] + ' ' + proposal[:20]
    elif len(end) < 30:
        end =end + ' ' +  proposal[-30:] + ' ' +  start[-30:] 
    txt = ''
    for txts in txt_chunk:
        txt += ' ' + txts.replace('\n', ' ').strip()
    start_index = find_closest_match(txt, start)
    end_index = find_closest_match(txt, end)
    # change the end_index to the closest . or ; or end of the sentence
    end_index1 = txt.find(' ', end_index)
    end_index2 = txt.find('.', end_index)
    # find smaller positive index and set as end_index
    if end_index1 >= 0 and end_index2 >= 0:
        end_index = min(end_index1, end_index2)
    elif end_index1 >=0:
        end_index = end_index1
    elif end_index2 >=0:
        end_index = end_index2

    return txt[start_index:end_index+len(end)]
# get the sentence
proposal_adj = adjusted_proposal_boundary(txt_chunk, suggested_proposal, boundry)

proposal_adj

'ayor called the meeting to order at 7:00pm.    APPROVAL OF THE AGENDA      R2024-63  Regular Council Agenda - April 9, 2024  MOVED by Councillor De Andrade   SECONDED by Councillor Lumley    THAT the Regular Business Agenda of April 9, 2024 be adopted. CARRIED      ADOPTION OF MINUTES      R2024-64  Minutes of the Regul'

## get suggested proposals

In [33]:
prompt_getprop = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> 
You are an AI assistance to all the unique proposals from a all_proposals. \n
You are given the all_proposals: string  and  count:int from the user that shows how many proposals are in all_proposals.\n
Your task is to find different proposals from all_proposals such that each include all required information including proposer, title, results and etc. 
Output a JSON with keys as int and value as each individual proposal. \n

**Output Format**:
    '1': <str, content of proposal 1>,
    '2': <str, content of proposal 2>,
    ...
    'count': <str, content of proposal count>,

### Instructions:
1. **approach**: 
    - Find count number of unique proposals from all_proposals.
    - Provide the output as a JSON with len equal to count and write unique proposals as values .

2. **Rules**:
    - **Use the all_proposals as the basis**: Ensure your individual proposals are indeed from  the original all_proposals.
    - Do not use speculations or inferences. Only provide information that is explicitly stated in the all_proposals.
    - Maintain the coherence and logical flow of the proposal.
    - All necessary information such as the mover, the proposal title, the vote result, or note are considered related information and should be included for one proposal.
    - ** word count**: DO NOT return less than 5 words or more than 20 words for proposals.

     <|eot_id|><|start_header_id|>user<|end_header_id|>
     Given the all_proposals and count, follow the above instructions to determine the individual proposals. Ensure your output is only JSON. size of JSON should be equal to count. Do NOT return anythings else other than the JSON. \n\n
    "all_proposals":\n {all_proposals} \n\n
    "count":\n {count} \n\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["all_proposals", "count"]
)

In [34]:
num = 3
all_proposals = combined_segments[num]["text"] + ' ' + combined_segments[num+1]["text"]

llm_getprop = ChatOllama(model=local_llm, temperature=0, keep_alive=0)
retrieval_prop = prompt_getprop | llm_getprop | JsonOutputParser()
prop_dict = retrieval_prop.invoke({"all_proposals": all_proposals, "count": 2})
print(prop_dict)

{'1': '2024 Councillor Thompson Councillor Croal Development Permit Delegation Authority Amendment Bylaw No.1054-04, 2024 be . CARRIED COMMITTEE REPORTS Committee-of-the-Whole Meeting - March 19, 2024 The minutes of the Committee-of-the-Whole Meeting held March 19, 2024 were received.', '2': 'R2024-66 2024-2028 Preliminary General Services 5-Year Capital Plan Councillor Croal Councillor De Andrade the revised preliminary 5-year capital plan for general services be integrated into the 2024-2028 Financial Plan with the exception of the Dog Park and Pickleball projects which are to be removed from the capital plan and'}


## feaure extractor

In [37]:
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from typing import Optional, List



# Schema for structured response
class ProposalFeatures(BaseModel):
    title: str = Field(description="A short title of the proposal", required=True)
    category_llm: List[str] = Field(description=f"category if the proposal choosen from suggested category by user", required=True)
    vote_result: str = Field(description="The result of the vote", required=True)
    future_date: Optional[str] = Field(description="The future date of the proposal")

In [38]:
# Prompt template
prompt_extraction =PromptTemplate(
    template= """ <|begin_of_text|><|start_header_id|>system<|end_header_id|> 
You are an AI assistance that extract related information about a suggested_proposal from the context (meeting notes). \n

Instructuctions:
The output should be in json format and structured with the following schema:

    title: str = Field(description="A short title of the proposal", required=True)
    category_llm: List[str] = Field(description=f"proposal category from Suggested_categories", required=True)
    vote_result: str = Field(description="The result of the vote", required=True)
    future_date: Optional[str] = Field(description="The future date of the proposal")
    
RULES: 
1. Do not use speculations or inferences. You are given the Suggested_proposal and the context. every information should be only related to the Suggested_proposal from the context. 
2. The category_llm should be choosed from the suggested category by the user. It can be all or some of it that you think is related to the proposal. 
3. If you did not find any future date, return None.
4. Your output should be in JSON format with the keys 'title', 'category_llm', 'vote_result', and 'future_date'. DO NOT change it.  \n


<|eot_id|><|start_header_id|>user<|end_header_id|>
    "context":\n {document} \n\n
    "suggested_proposal":\n {suggested_proposal} \n\n
    "Suggested_categories":\n {category} \n\n
     Given the context, find the Suggested_proposal and return title, category_llm, vote_result, and future_date in a JSON formart as mentioned in the instruction and rules. DO NOT return any other key. Output should be of class ProposalFeatures.  \n\n
     <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["document", "suggested_proposal", "category"],
)

In [39]:
num = 8
suggested_proposal = combined_segments[num]["text"]
category = combined_segments[num]["categories_by_vd"]
txt_chunk = retriever.invoke(suggested_proposal)[:1]
txt_chunk = [d.page_content for d in txt_chunk] 
llm_extraction  = OllamaFunctions(model=local_llm,format="json", temperature=0)

# Chain
structured_llm = llm_extraction.with_structured_output(ProposalFeatures)
chain_feature = prompt_extraction | structured_llm

result = chain_feature.invoke({'document':txt_chunk,'suggested_proposal': suggested_proposal, 'category': category})
result

ProposalFeatures(title='Parcel Tax Roll Review Panel – Water/Sewer/Community Recreation Parcel Taxes', category_llm=['environmental_exposures__liquid_waste/_wastewater/sewage', 'environmental_exposures__health_impact_assessments_/human_health_risk_assessments_/environmental_assessments'], vote_result='CARRIED', future_date='Tuesday, May 7, 2024')

In [None]:
# conver to the excel for all proposal
# automatically nodes

## langGraph control flow

In [40]:
# NLP
pdf_folder = 'data/batch/'
metting_name = "gib_mcp_rgc_min__2024-04-09__01"
pdf_file = metting_name + ".pdf"
pdf_path = os.path.join(pdf_folder, pdf_file)
# get the info of the meeting
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text

context = {}
text = extract_text_from_pdf(pdf_path)
context['pdf'] = text
# get the metadata
metadata = fitz.open(pdf_path).metadata
context['metadata'] = metadata
# check if there is a txt with the same name
txt_path = os.path.join(pdf_folder, pdf_file.replace('.pdf', '.txt'))
if os.path.exists(txt_path):
    with open(txt_path, 'r') as file:
        context['transcript'] = file.read()

else:
        context['transcript'] = None

df = pd.read_excel("data/themekeywordmap.xlsx")
category_keywords = {}
for theme, group in df.groupby('theme'):
    category_keywords[theme] = group['phrase'].tolist()

split_patterns = [
    r"MOVED by",
    r"SECONDED by",
    r"WHEREAS",
    r"THEREFORE BE IT RESOLVED THAT",
    r"CARRIED UNANIMOUSLY",
    r"REJECTED",
    r"THAT",
    r"APPROVED",
    r"ADOPTED",
    r"RESOLVED"
]
Proposal_indicators = ["Moved", "Seconded", "Motion", "Carried", "Proposal", "Passed", "Adopted", "Adoption", "Rejected", "Lost", "Moved", "approve", "Seconded" , "Adopt", "Resolution", "rejected", "Ordinance", "defeated", "discussed", "withdrawn", "tabled", "Amendment", "Amendment", "Recommendation", "granted", "Petition", "denied", "Vote", "result"]
# lower case it
Proposal_indicators = [x.lower() for x in Proposal_indicators]
segments = segment_document(context['pdf'], split_patterns)
combined_segments = combine_segments_v3(segments, category_keywords, Proposal_indicators)
len(combined_segments)

# Load the FAISS index and category indices
index = faiss.read_index('data/faiss_index.bin')
category_indices = np.load('data/category_indices.npy', allow_pickle=True)


# Verify categories and update segments
for segment in combined_segments:
    suggested_categories = query_vector_database(segment['text'], model, index, category_indices)
    segment["categories_by_vd"] = suggested_categories
    segment["id"] = combined_segments.index(segment)

# Print the updated categorized segments
for segment in combined_segments:
    print(json.dumps(segment, indent=2))


{
  "text": "Regular Council  MEETING MINUTES  Tuesday, April 9, 2024  Council Chambers, 7:00pm  Town Hall, 474 South Fletcher Road, Gibsons, BC      PRESENT:  Mayor Silas White   Councillor David Croal  Councillor Annemarie De Andrade  Councillor Stafford Lumley  Councillor Christi Thompson  Youth Representative Cael Read     STAFF:     Emanuel Machado, Chief Administrative Officer  Rebecca Anderson, Corporate Officer  Lorraine Coughlin, Director of Finance  Trevor Rutley, Director of Infrastructure Services  Lesley-Anne Staats, Director of Planning via Zoom   Noni Weitz, Manager of Financial Services  Heidi Siller, Executive Assistant (recorder)    CALL TO ORDER  The Mayor called the meeting to order at 7:00pm.    APPROVAL OF THE AGENDA      R2024-63  Regular Council Agenda - April 9, 2024 Councillor De Andrade Councillor Lumley the Regular Business Agenda of April 9, 2024 be .  CARRIED ADOPTION OF MINUTES",
  "categories": [
    "youth_children"
  ],
  "categories_by_vd": [
    "inj

In [41]:
# create a utility function to get the dictionary from combined_segments with specific id
# combined_segments = [{'id': <id1>, 'txt': <text 1>, 'category': <category 1>, "category_by_vd":< cat1>"},
#  {'id': <id2>, 'txt': <text 2>, 'category': <category 2>, "category_by_vd":< cat2>"}, ...]

def get_dict_by_id(combined_segments, id):
    for segment in combined_segments:
        if segment["id"] == id:
            return segment
    return None

In [42]:
from pprint import pprint

from langchain_core.documents import Document
from typing_extensions import TypedDict

from langgraph.graph import END, StateGraph

### State


class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        sprop: list of different suggested proposal dictionaries with key id, cat, prop by NER
        last_id: last visited ID
        count: number of unique proposal
        suggested_proposal: suggested_proposal
        output: dictionary of features and adjusted poposal
        document: list of related document
    """
    sprop: List[dict]
    last_id: int
    count: int
    suggested_proposal: str
    output: dict
    document: List[str]

In [43]:
### Nodes
final_dict = {0: {"title":"", "full proposal":"", "theme":[], "vote_result":"", "future_date":""}}

def iterate(state):
    """
    Iterate over the sprop dictionary and return the last_id.
    For the first time, call the iterate with input last_id as -1.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, sprop, that contains the next suggested proposal
    """
    print("-------************ITERATE NODE***************-------")
    last_id = state["last_id"]
    last_id += 1
    print("running for id: ", last_id)
    
    return {"last_id": last_id}

def proposal_count(state):
    """
    Determines whether the suggested_proposal is actually a proposal.
    update the count state

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): change the count of the proposal
    """

    print("---PROPOSAL COUNT NODE---")
    suggested_proposal = state["sprop"][state["last_id"]]["text"]
    category = get_dict_by_id(state["sprop"], state["last_id"])["categories_by_vd"]
    
    score = retrieval_counter.invoke({"category": category, "suggested_proposal": suggested_proposal})
        
    count = score["count"]
    return {"count": count,  "suggested_proposal": suggested_proposal}

def retrieve(state):
    """
    Retrieve documents from vectorstore

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, documents, that contains retrieved documents
    """
    print("---RETRIEVE NODE---")
    suggested_proposal = state["suggested_proposal"]

    # Retrieval
    document = retriever.invoke(suggested_proposal)[:3]
    document = [d.page_content for d in document]
    return {"document": document}



def get_proposal(state):
    """
    Get all the suggested proposal from the suggested proposal.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): New key added to state, suggested_proposal, that contains the suggested proposal
    """
    print("---GET PROPOSAL NODE---")
    all_proposals = state["suggested_proposal"]
    count = state["count"]


    prop_dict = retrieval_prop.invoke({"all_proposals": all_proposals, "count": count})
    
    state["sprop"].extend(prop_dict.values())
    return {"sprop": state["sprop"]}



def proposal_boundary(state):
    """
    Determines the currect bounday of the suggested_proposal.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): change the count of the proposal
    """

    print("---ADJUST BOUNDARY NODE---")
    suggested_proposal = state["suggested_proposal"]
    document = state["document"]
    boundry = retrieval_boundary.invoke({"document": document, "suggested_proposal": suggested_proposal})
    proposal_adj = adjusted_proposal_boundary(document, suggested_proposal, boundry)
    state["output"] = {"full proposal" :proposal_adj}
    return { "suggested_proposal": suggested_proposal, "document": document, "output": state["output"]}


def proposal_features(state):
    """
    Extract the features of the proposal.

    Args:
        state (dict): The current graph state

    Returns:
        state (dict): change the count of the proposal
    """

    print("---EXTRACT FEATURES NODE---")
    suggested_proposal = state["suggested_proposal"]
    prop = get_dict_by_id(state["sprop"], state["last_id"])
    document = state["document"][:1]
    result = chain_feature.invoke({'document':document ,'suggested_proposal': suggested_proposal, 'category': prop["categories_by_vd"]})
    # convert ProposalFeatures to dict
    state["output"].update(result.dict()) 

    # save to the final dict
    final_dict[state["last_id"]] = {k:v for k,v in state["output"].items() if k != "category_llm"}
    prop["categories_by_vd"].extend(prop["categories"])
    # check if state["output"]["category_llm"] is a list
    if type(state["output"]["category_llm"]) == list:
        prop["categories_by_vd"].extend(state["output"]["category_llm"])
    else:
        prop["categories_by_vd"].append(state["output"]["category_llm"])

    final_dict[state["last_id"]]['theme'] = list(set(prop["categories_by_vd"]))

    return { "suggested_proposal": suggested_proposal, "document": document, "output": state["output"]}

In [44]:
### Conditional edge
def route_count(state):
    """
    Route based on the value of count.
    0: Invalid proposal, go to iterate node
    1: Valid proposal, go to retrieve node
    >1: More than one proposal, go to get_suggested_proposal node

    Args:
        state (dict): The current graph state

    Returns:
        str: Next node to call
    """

    print("---ROUTING based on validity---")
    grade = state["count"]
    if grade == 0:
        print("Suggested proposal is NOT valid. Moving to the next one ... ")
        return "iterate"
    elif grade > 1:
        print("Suggested proposal is MORE than one. Calling get_proposal ...")
        return "get_proposal"
    else:
        print("Suggested proposal is valid. retrieve the document ...")
        return "retrieve"
    

def stop_iteration(state):
    """
    Stop the iteration if all the suggested proposal is visited.

    Args:
        state (dict): The current graph state

    Returns:
        str: Next node to call
    """

    print("---CHECK STOP ITERATION---")
    last_id = state["last_id"]
    if last_id == len(state["sprop"]):
        print("STOPPED!")
        return "END"
    print("Continue ...")
    return "proposal_count"

In [45]:
workflow = StateGraph(GraphState)

# Define the nodes
workflow.add_node("iterate", iterate)  # iterate loop
workflow.add_node("proposal_count", proposal_count)  # count the proposal
workflow.add_node("retrieve", retrieve)  # retrieve related documents
workflow.add_node("get_proposal", get_proposal)  # get new proposal
workflow.add_node("proposal_boundary", proposal_boundary)  # adjust proposal boundary
workflow.add_node("proposal_features", proposal_features)  # extract proposal features


In [46]:
# graph build

workflow.set_entry_point("iterate")
workflow.add_conditional_edges(
    "iterate",
    stop_iteration,
    {
        "proposal_count": "proposal_count",
        "END": END,
    },
)
workflow.add_conditional_edges(
    "proposal_count",
    route_count,
    {
        "iterate": "iterate",
        "get_proposal": "get_proposal",
        "retrieve": "retrieve",
    },
)
workflow.add_edge("get_proposal", "iterate")
workflow.add_edge("retrieve", "proposal_boundary")
workflow.add_edge("proposal_boundary", "proposal_features")
workflow.add_edge("proposal_features", "iterate")

In [47]:
# Compile
app = workflow.compile()

# Test
inputs = {"sprop": combined_segments, "last_id": -1} #-1
for output in app.stream(inputs,  {"recursion_limit": 1000}):
    for key, value in output.items():
        pprint(f"Finished running: {key}")

-------************ITERATE NODE***************-------
running for id:  0
---CHECK STOP ITERATION---
Continue ...
'Finished running: iterate'
---PROPOSAL COUNT NODE---
---ROUTING based on validity---
Suggested proposal is valid. retrieve the document ...
'Finished running: proposal_count'
---RETRIEVE NODE---
'Finished running: retrieve'
---ADJUST BOUNDARY NODE---
'Finished running: proposal_boundary'
---EXTRACT FEATURES NODE---
'Finished running: proposal_features'
-------************ITERATE NODE***************-------
running for id:  1
---CHECK STOP ITERATION---
Continue ...
'Finished running: iterate'
---PROPOSAL COUNT NODE---
---ROUTING based on validity---
Suggested proposal is valid. retrieve the document ...
'Finished running: proposal_count'
---RETRIEVE NODE---
'Finished running: retrieve'
---ADJUST BOUNDARY NODE---
'Finished running: proposal_boundary'
---EXTRACT FEATURES NODE---
'Finished running: proposal_features'
-------************ITERATE NODE***************-------
running 

In [48]:
len(combined_segments)

13

In [72]:
# convert final_dict to csv file with keys as row and each key of the value as column
# it should look like this
# meeting name, id ,title, full proposal, theme, vote_result, future_date
import pandas as pd
# first load the csv output if exists.


df = pd.DataFrame.from_dict(final_dict, orient='index')
df["meeting_name"] = metting_name   
df = df.reset_index()
df = df.rename(columns={"index": "id"})
df = df[["meeting_name", "id", "title", "full proposal", "theme", "vote_result", "future_date"]]
# replace None values with ""
df = df.replace({None: "NONE"})

# append df0 to end of df if meeting_name is not in df0, otherwize replace the row with the same meeting_name and append the rest

try:
    df0 = pd.read_csv("data/output")
    if metting_name in df0["meeting_name"].unique():
        # replace the row
        df0 = df0[df0["meeting_name"] != metting_name]
        df = pd.concat([df, df0], ignore_index=True)
    else:
        df = pd.concat([df, df0], ignore_index=True)
except:
    pass


df.to_csv("data/output", index=False)
