In [1]:
import re
import fitz  # PyMuPDF
import json
from collections import defaultdict
import pandas as pd
import os
from nltk.tokenize import sent_tokenize

In [2]:
# one file:
pdf_folder = 'data/batch/'
metting_name = "gib_mcp_rgc_min__2024-04-09__01"
pdf_file = metting_name + ".pdf"
pdf_path = os.path.join(pdf_folder, pdf_file)
# read the info from a xlsx file
df = pd.read_excel("data/meetingmap.xlsx")
df = df.set_index('standard name')
# get the info of the meeting
meeting_info = df.loc[pdf_file]
# return all the columns for the meeting
meeting_info = meeting_info.to_dict()
meeting_info

{'location': 'gibsons',
 'location type': 'municipality',
 'meeting type': 'regular_council',
 'data type': 'minutes',
 'meeting date': Timestamp('2024-04-09 00:00:00'),
 'transcript': 'Yes',
 'comment': nan}

In [3]:
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    return text


# create a dictionary to store the text of each pdf and the metadata and if there is a txt with the same name
contex = {}
    
text = extract_text_from_pdf(pdf_path)
contex['pdf'] = text
# get the metadata
metadata = fitz.open(pdf_path).metadata
contex['metadata'] = metadata
# check if there is a txt with the same name
txt_path = os.path.join(pdf_folder, pdf_file.replace('.pdf', '.txt'))
if os.path.exists(txt_path):
    with open(txt_path, 'r') as file:
        contex['transcript'] = file.read()

else:
        contex['transcript'] = None

In [4]:
contex.keys()

dict_keys(['pdf', 'metadata', 'transcript'])

In [5]:
# load the theme keywords from themekeywordmap.xlsx
# convert it to a dictionary with the theme as key and different keywords as a list
# it has two columns: theme and single phrase
# combine all the phrases that have the same theme in  a list as follows
# category_keywords = {
#     "theme 1": ["phrase 1", "phrase 2", "phrase 3"],
#     "theme 2": ["phrase 4", "phrase 5", "phrase 6", "phrase 7"]
# }
df = pd.read_excel("data/themekeywordmap.xlsx")
category_keywords = {}
for theme, group in df.groupby('theme'):
    category_keywords[theme] = group['phrase'].tolist()
print("total number of themes: ", len(category_keywords.keys()))



total number of themes:  56


# Step 1: Segment using NLP

In [6]:
split_patterns = [
    r"MOVED by",
    r"SECONDED by",
    r"WHEREAS",
    r"THEREFORE BE IT RESOLVED THAT",
    r"CARRIED UNANIMOUSLY",
    r"REJECTED",
    r"THAT",
    r"APPROVED",
    r"ADOPTED",
    r"RESOLVED"
]

In [7]:
def segment_document(document, patterns):
    """
    Segment the document based on defined patterns.
    """
    combined_pattern = '|'.join(patterns)
    segments = re.split(combined_pattern, document, flags=re.IGNORECASE)
    
    # Filter out empty segments and strip whitespace
    segments = [seg.strip() for seg in segments if seg.strip()]
    return segments

In [8]:
def match_keywords(segment, category_keywords):
    """
    Match segments against category keywords.
    """
    matched_categories = []
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', segment, re.IGNORECASE):
                matched_categories.append(category)
                break  # Break after the first match to avoid redundant checks
    return matched_categories

In [9]:
def combine_segments(segments, category_keywords):
    """
    Combine nearby segments and filter them based on category keywords.
    """
    combined_segments = []
    current_segment = ""
    current_categories = set()

    for segment in segments:
        matched_categories = match_keywords(segment, category_keywords)
        if matched_categories:
            if current_segment:
                combined_segments.append({
                    "text": current_segment,
                    "categories": list(current_categories)
                })
                current_segment = segment
                current_categories = set(matched_categories)
            else:
                current_segment = segment
                current_categories.update(matched_categories)
        else:
            current_segment += " " + segment
    
    if current_segment:
        combined_segments.append({
            "text": current_segment,
            "categories": list(current_categories)
        })

    return combined_segments

In [10]:
segments = segment_document(contex['pdf'], split_patterns)
combined_segments = combine_segments(segments, category_keywords)

In [11]:
# initial assesment results
print("Total initial segments:", len(combined_segments))

Total initial segments: 8


In [12]:
output_nlp = json.dumps(combined_segments, indent=2)

In [13]:
len(segments)

42

In [14]:
def combine_segments_v2(segments, category_keywords):
    """
    Combine nearby segments and filter them based on category keywords.
    """
    combined_segments = []
    current_segment = ""
    current_categories = set()
    last_matched_categories = set()

    for segment in segments:
        matched_categories = match_keywords(segment, category_keywords)
        if matched_categories:
            # If current segment is not empty and the new segment has different categories,
            # add the current segment to combined_segments and start a new one
            if current_segment and matched_categories != last_matched_categories:
                combined_segments.append({
                    "text": current_segment,
                    "categories": list(current_categories)
                })
                current_segment = segment
                current_categories = set(matched_categories)
            else:
                current_segment += " " + segment
                current_categories.update(matched_categories)
            last_matched_categories = matched_categories
        else:
            current_segment += " " + segment
    
    # Append the last segment
    if current_segment:
        combined_segments.append({
            "text": current_segment,
            "categories": list(current_categories)
        })

    return combined_segments

In [15]:
Proposal_indicators = ["Moved", "Seconded", "Motion", "Carried", "Proposal", "Passed", "Adopted", "Adoption", "Rejected", "Lost", "Moved", "approve", "Seconded" , "Adopt", "Resolution", "rejected", "Ordinance", "defeated", "discussed", "withdrawn", "tabled", "Amendment", "Amendment", "Recommendation", "granted", "Petition", "denied", "Vote", "result"]
# lower case it
Proposal_indicators = [x.lower() for x in Proposal_indicators]

In [16]:

def combine_segments_v3(segments, category_keywords, Proposal_indicators):
    combined_proposals = []
    temp_segment = ""

    def contains_proposal_indicators(text):
            return any(indicator in text.lower() for indicator in Proposal_indicators)
    def match_keywords(segment, category_keywords):
        """
        Match segments against category keywords.
        """
        matched_categories = []
        for category, keywords in category_keywords.items():
            for keyword in keywords:
                if re.search(r'\b' + re.escape(keyword) + r'\b', segment, re.IGNORECASE):
                    matched_categories.append(category)
        return matched_categories
  
    
    for seg in segments:
        """Splits/combine the segments such that it contains one proposals indicator."""
        for line in seg.split('\n'):
            temp_segment += line + " "
            if contains_proposal_indicators(line):
                categories = match_keywords(temp_segment, category_keywords)
                combined_proposals.append({
                    "text": temp_segment.strip(),
                    "categories": list(set(categories))
                })
                temp_segment = ""
                category_set = set()
    if temp_segment.strip():
        # append it to the last proposal
        categories = match_keywords(temp_segment, category_keywords)
        combined_proposals[-1] = {
                    "text": combined_proposals[-1]["text"] + " " + temp_segment.strip(),
                    "categories": list(set(categories+ combined_proposals[-1]["categories"]))
                }
    return combined_proposals

In [17]:
# combined_segments = combine_segments(segments, category_keywords)
#combined_segments = combine_segments_v2(segments, category_keywords)
combined_segments = combine_segments_v3(segments, category_keywords, Proposal_indicators)

In [18]:
# post assesment results
# if the len of the text is less than 50, add it to the previous segment
for i in reversed(range(len(combined_segments))):
    if len(combined_segments[i]["text"]) < 50:
        combined_segments[i-1]["text"] = combined_segments[i-1]["text"] + " " + combined_segments[i]["text"]
        # remove the current segment
        del combined_segments[i]

In [19]:
len(combined_segments)

13

# step 2: find category based on vector database

In [20]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')


# Generate embeddings for each category based on keywords
category_embeddings = {}
for category, keywords in category_keywords.items():
    category_embeddings[category] = model.encode(keywords, convert_to_tensor=True)

# Flatten category embeddings for FAISS indexing
flat_embeddings = []
category_indices = []
for category, embeddings in category_embeddings.items():
    for embedding in embeddings:
        flat_embeddings.append(embedding.cpu().detach().numpy())
        category_indices.append(category)

flat_embeddings = np.vstack(flat_embeddings)

# Create FAISS index
dimension = flat_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(flat_embeddings)

# Save FAISS index and data for later use
faiss.write_index(index, 'data/faiss_index.bin')
np.save('data/category_indices.npy', category_indices)


  from tqdm.autonotebook import tqdm, trange
  return torch._C._cuda_getDeviceCount() > 0


In [21]:
def find_elbow_point_indices(data):
    # Sort the data and keep track of the original indices
    sorted_data_with_indices = sorted((val, idx) for idx, val in enumerate(data))
    sorted_data = [val for val, idx in sorted_data_with_indices]
    sorted_indices = [idx for val, idx in sorted_data_with_indices]
    
    # Calculate the differences between consecutive elements
    differences = np.diff(sorted_data)
    
    # Find the index where the difference significantly increases
    elbow_index = np.argmax(differences)
    
    # Find the elbow point value
    elbow_point = sorted_data[elbow_index]
    
    # Find indices of elements to remove (smaller than or equal to the elbow point)
    to_remove_indices = [idx for idx, val in enumerate(data) if val <= elbow_point]
    return to_remove_indices

In [22]:
def query_vector_database(text, model, index, category_indices,num_categories_to_search=20):
    """
    Query the FAISS index with a text embedding and return the most relevant categories.
    Avoid returning repetitive categories and apply a similarity threshold.
    """
    # Generate embedding for the text
    text_embedding = model.encode([text], convert_to_tensor=True)
    text_embedding = text_embedding.cpu().detach().numpy()


    # Search the index for the most similar embeddings
    distances, indices = index.search(text_embedding, num_categories_to_search)
    to_remove = find_elbow_point_indices(distances[0])

    # Filter out categories based on the threshold and avoid repetitions
    seen_categories = set()
    categories_by_vd = []
    for idx in  indices[0]:
        if idx in to_remove:
            continue
        category = category_indices[idx]
        if category not in seen_categories:
            categories_by_vd.append(category)
            seen_categories.add(category)

    

    return categories_by_vd

In [23]:
# Load the FAISS index and category indices
index = faiss.read_index('data/faiss_index.bin')
category_indices = np.load('data/category_indices.npy', allow_pickle=True)


# Verify categories and update segments
for segment in combined_segments:
    suggested_categories = query_vector_database(segment['text'], model, index, category_indices)
    segment["categories_by_vd"] = suggested_categories

# Print the updated categorized segments
for segment in combined_segments:
    print(json.dumps(segment, indent=2))

{
  "text": "Regular Council  MEETING MINUTES  Tuesday, April 9, 2024  Council Chambers, 7:00pm  Town Hall, 474 South Fletcher Road, Gibsons, BC      PRESENT:  Mayor Silas White   Councillor David Croal  Councillor Annemarie De Andrade  Councillor Stafford Lumley  Councillor Christi Thompson  Youth Representative Cael Read     STAFF:     Emanuel Machado, Chief Administrative Officer  Rebecca Anderson, Corporate Officer  Lorraine Coughlin, Director of Finance  Trevor Rutley, Director of Infrastructure Services  Lesley-Anne Staats, Director of Planning via Zoom   Noni Weitz, Manager of Financial Services  Heidi Siller, Executive Assistant (recorder)    CALL TO ORDER  The Mayor called the meeting to order at 7:00pm.    APPROVAL OF THE AGENDA      R2024-63  Regular Council Agenda - April 9, 2024 Councillor De Andrade Councillor Lumley the Regular Business Agenda of April 9, 2024 be .  CARRIED ADOPTION OF MINUTES",
  "categories": [
    "youth_children"
  ],
  "categories_by_vd": [
    "inj

In [24]:
len(combined_segments)

13

# step 3: lang-graph

In [25]:
local_llm = "llama3"

In [26]:
# load the LANGCHAIN_API_KEY from the environment
import os
from dotenv import load_dotenv
load_dotenv()
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

In [27]:
### Index

from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community import embeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

In [28]:
# split the context['pdf'] and create vector store
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=50
)
def filter_none_values(metadata):
    return {k: v for k, v in metadata.items() if v is not None}
filtered_metadata = filter_none_values(contex['metadata'])

text_splits = text_splitter.split_text(contex['pdf'])
metadata_list = [filtered_metadata] * len(text_splits)

# Add  text_splits to vectorDB with  nomic-embed-text-v1.5  and inference_mode="local
vectorstore = Chroma.from_texts(
        texts=text_splits,
        metadatas=metadata_list,
        # embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
        embedding=embeddings.OllamaEmbeddings(model="nomic-embed-text:v1.5"),
)
retriever = vectorstore.as_retriever()

In [29]:
len(text_splits)

8

In [44]:
prompt_count = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> 
You are an assitance that detect the number of unique proposals with a specific categoty (theme) in a suggested proposal of a council meeting note. \n
You are provided with suggested proposal and its category as user prompt. \n
The goal is to find count of unique proposal in the suggested proposal with the specific category. \n
Give a integer count of unique proposal as a JSON with single key 'count'. \n

example:
proposal: ### Minutes of the Council meeting of February 6, 2024, be approved.\n CARRIED UNANIMOUSLY \n Council Meeting\n Minutes, February 27, 2024 3\n 3. Council (City Finance and Services) \n MOVED by Councillor Dominato\n SECONDED by Councillor Carr\n THAT the Minutes of the Council meeting following the Standing Committee on City\n Finance and Services meeting of February 7, 2024, be approved.\n CARRIED UNANIMOUSLY\n 4. Court of Revision (Business Improvement Areas) - February 8, 2024\n MOVED by Councillor Bligh\n SECONDED by Councillor Zhou\n  THAT the Minutes of the Court of Revision (Business Improvement Areas) meeting of\n February 8, 2024, be approved.\n CARRIED UNANIMOUSLY\n MATTERS ADOPTED ON CONSENT 
categories: ["City Finance and Services", "Business Improvement Areas"]
output: "count": 2

Let's think step by step. Here are the steps to solve the task:
1. Validity check: A proposal should suggest some action or decision and it should have a unique decision. 
2. Proposal Count: See if the suggested proposal includes more than one proposal. calculate the count.
5. Write: Only return the count as integer in a JSON format with a single key 'count'.

RULES:
- your output MUST HAVE the exact JSON format.
- Your answer must not include any speculation or inference. Do not assume or change dates and times. Only provide information that is explicitly stated in the context.
- An answer is considered grounded if **all** information in **every** sentence in the answer is **explicitly** mentioned in the source context, **no** extra information is added and **no** inferred information is added.

     <|eot_id|><|start_header_id|>user<|end_header_id|>
    Category:\n {category} \n\n
    Suggested proposal:\n {proposal} \n\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["category", "proposal"],
)

In [58]:
num = 12
proposal = combined_segments[num]["text"]
llm = ChatOllama(model=local_llm, format="json", temperature=0)
retrieval_grader = prompt_count | llm | JsonOutputParser()
category = combined_segments[num]["categories_by_vd"]
print(retrieval_grader.invoke({"category": category, "proposal": proposal}))

{'count': 0}


In [146]:
prompt_boundry = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> 
You are an assitance that detect boundry of a suggested proposal from chunk of a council meeting note. \n
You are provided with chunk of meeting note and suggested proposal as user prompt. \n
The goal is to find few words from beggining and end of the proposal in the originam context. \n
Give a integer count of unique proposal as a JSON with keys 'start', 'end' as string. \n

RULE: 
1- Do not use speculations or inferences. Only provide information that is explicitly stated in the context.
2- The boundry is part of context that starts from 'start' and end with 'end'.
2- The boundary should be selected such that anything outside that in the context, does not have any related information about the proposal (including the result, mover, ...).
3- Both of start and end MUST have atleast 10 and atmost 20 words. 
4- Provide JSON with  key 'start', 'end'  and no premable or explanation.

Example:
context: Minutes of the Council meeting of February 6, 2024, be approved.\n CARRIED UNANIMOUSLY \n Council Meeting\n Minutes, February 27, 2024 3\n 3. Council (City Finance and Services) \n MOVED by Councillor Dominato\n SECONDED by Councillor Carr\n THAT the Minutes of the Council meeting following the Standing Committee on City\n Finance and Services meeting of February 7, 2024, be approved.\n CARRIED UNANIMOUSLY\n 4. Court of Revision (Business Improvement Areas) - February 8, 2024
proposal: the Minutes of the Council meeting following the Standing Committee on City Finance 

AI output:
"start": "3. Council (City Finance and Services) MOVED by Councillor Dominato SECONDED by Councillor Carr"
"end": "and Services meeting of February 7, 2024, be approved. CARRIED UNANIMOUSLY"


Important: 'start' and 'end' should be atleast ten words and not more than twenty words.
     <|eot_id|><|start_header_id|>user<|end_header_id|>
    Chunk of meeting note as context:\n {document} \n\n
    Suggested proposal:\n {proposal} \n\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["document", "proposal"],
)

In [148]:
num = 1
proposal = combined_segments[num]["text"]

llm = ChatOllama(model=local_llm, format="json", temperature=0)
retrieval_grader = prompt_boundry | llm | JsonOutputParser()
txt_chunk = retriever.invoke(proposal)
boundry = retrieval_grader.invoke({"document": txt_chunk[num], "proposal": proposal})
print(boundry)

{'start': 'THAT the 2024-2028 Financial Plan Bylaw and the 2024 Annual Tax Rate Bylaw be prepared for Council approval.', 'end': 'Councillor De Andrade the minutes of the Regular Council meeting held March 19, 2024 be . CARRIED'}


In [149]:
import difflib

def find_closest_match(original_string, substring):
    # Initialize variables
    closest_match_index = -1
    highest_similarity = 0
    substring_length = len(substring)
    
    # Define a function to calculate similarity ratio
    def similarity(s1, s2):
        return difflib.SequenceMatcher(None, s1, s2).ratio()
    
    # Compare the substring against all possible substrings of the same length in the original string
    for i in range(len(original_string) - substring_length + 1):
        current_substring = original_string[i:i + substring_length]
        current_similarity = similarity(current_substring, substring)
        if current_similarity > highest_similarity:
            highest_similarity = current_similarity
            closest_match_index = i
    
    return closest_match_index

In [156]:
start = boundry["start"]
end = boundry["end"]
start_index = find_closest_match(txt_chunk[num].page_content.replace('\n', ' ').strip(), start)
end_index = find_closest_match(txt_chunk[num].page_content.replace('\n', ' ').strip(), end)
proposal = txt_chunk[num].page_content.replace('\n', ' ').strip()[start_index:end_index+len(end)]
proposal

'THAT the 2024-2028 Financial Plan Bylaw and the 2024 Annual Tax Rate  Bylaw be prepared for Council approval.  CARRIED      ADMINISTRATION REPORTS    Budget Presentation     The budget presentation was received for information.         R2024-69  Parcel Tax Roll Review Panel – Water/Sewer/Community Recreation  Parcel Taxes  MOVED by Councillor De Andrade   SECONDED by Councillor Thompson    THAT Council convene a Parcel Tax R'

In [159]:
end

'Councillor De Andrade the minutes of the Regular Council meeting held March 19, 2024 be . CARRIED'