In [1]:
# import fitz  # PyMuPDF
from PyPDF2 import PdfReader
import os
from nltk.tokenize import sent_tokenize

# def extract_text_from_pdf(pdf_path):
#     """Extracts text from a PDF file."""
#     doc = fitz.open(pdf_path)
#     text = ""
#     for page_num in range(len(doc)):
#         page = doc.load_page(page_num)
#         text += page.get_text()
#     return text

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    The file is opened with PyPDF2's ``PdfReader``; each page contributes
    whatever ``extract_text()`` returns, and the pieces are concatenated in
    page order without any separator.
    """
    pdf = PdfReader(pdf_path)
    page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)



# Example usage: load every PDF in the data folder together with its metadata
# and, when present, the transcript stored next to it as "<same name>.txt".
pdf_folder = 'data/'
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# contex maps each PDF file name to a dict with three keys:
#   'pdf'        -> text extracted from the whole document
#   'metadata'   -> value of PdfReader(pdf_path).metadata
#   'transcript' -> contents of the sibling .txt file, or None if there is none
contex = {}
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)

    # Swap only the trailing extension. str.replace('.pdf', '.txt') would also
    # rewrite a '.pdf' occurring earlier in the name (e.g. 'a.pdf.v2.pdf').
    txt_name = pdf_file[:-len('.pdf')] + '.txt'
    txt_path = os.path.join(pdf_folder, txt_name)

    if os.path.exists(txt_path):
        with open(txt_path, 'r') as file:
            transcript = file.read()
    else:
        transcript = None

    contex[pdf_file] = {
        'pdf': extract_text_from_pdf(pdf_path),
        # fitz alternative: fitz.open(pdf_path).metadata
        'metadata': PdfReader(pdf_path).metadata,
        'transcript': transcript,
    }



In [8]:
contex['DNV-RC-240422.pdf']

{'pdf': 'DISTRICT OF NORTH VANCOUVER \nREGULAR MEETING OF COUNCIL \nMinutes of the Regular Meeting of Council for the District of North Vancouver held at 7:02 p.m. \non Monday, April 22, 2024 in the Council Chamber of the District Hall, 355 West Queens Road, \nNorth Vancouver, British Columbia. \nPresent: Mayor Mike Little \nCouncillor Jordan Back \nCouncillor Betty Forbes \nCouncillor Jim Hanson \nCouncillor Herman Mah \nCouncillor Lisa Muri \nCouncillor Catherine Pope \nStaff: Saira Walker, Acting Chief Administrative Officer \nAlso in Rick Danyluk, Acting General Manager -Finance and Technology and Acting CFO \nGavin Joyce, General Manager -Engineering, Parks and Facilities \nDan Milburn, General Manager -Planning, Properties and Permits \nTina Atva, Director, Community Planning and Housing \nNicola Chevallier, Director -Engineering Operations and Facilities \nPeter Cohen, Director -Engineering Services \nGenevieve Lanz, Director -Legislative Services and Corporate Officer \nJacquel

In [2]:
def chunk_text(text, chunk_size=512, overlap=256, tokenizer=None):
    """Divide text into overlapping chunks made of whole sentences.

    Sentences are accumulated until the chunk holds at least ``chunk_size``
    whitespace-separated words. The chunk is then emitted and the next one is
    seeded with the trailing sentences of the emitted chunk whose combined
    length does not exceed ``overlap`` words.

    Args:
        text: The text to split.
        chunk_size: Word count at which the chunk being built is emitted.
        overlap: Maximum number of words carried from the end of one chunk
            into the start of the next. Use 0 for no overlap.
        tokenizer: Optional callable mapping ``text`` to a list of sentence
            strings. When omitted, the ``sent_tokenize`` imported by this
            notebook is used.

    Returns:
        A list of chunks, each the sentences joined by single spaces. The list
        is empty when ``text`` contains no words.
    """
    split_sentences = sent_tokenize if tokenizer is None else tokenizer

    chunks = []
    current = []        # (sentence, word_count) pairs of the chunk being built
    current_words = 0
    pending = 0         # sentences added since the last emitted chunk

    for sentence in split_sentences(text):
        n_words = len(sentence.split())
        if n_words == 0:
            # Whitespace-only "sentences" add nothing to a chunk.
            continue
        current.append((sentence, n_words))
        current_words += n_words
        pending += 1

        if current_words >= chunk_size:
            chunks.append(" ".join(s for s, _ in current))

            # Walk backwards over the emitted chunk and keep whole sentences
            # while they fit in the overlap budget. The first sentence is never
            # eligible, so the carried part is always shorter than the chunk
            # and the text cannot pile up across iterations.
            carried = []
            carried_words = 0
            for item in reversed(current[1:]):
                if carried_words + item[1] > overlap:
                    break
                carried.append(item)
                carried_words += item[1]
            carried.reverse()
            current, current_words, pending = carried, carried_words, 0

    # Emit the remainder only if it contains text that is not already part of
    # the previously emitted chunk (i.e. more than just the carried overlap).
    if pending:
        chunks.append(" ".join(s for s, _ in current))

    return chunks

# Example usage: chunk the extracted text of one meeting's minutes.
# `contex` must already hold an entry for this file name with a 'pdf' key.
input_text = contex['DNV-RC-240422.pdf']['pdf']
chunks = chunk_text(input_text)

In [2]:
# Better option: use LangChain's recursive splitter instead of the hand-written chunker
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the minutes of one meeting into chunks.
# NOTE(review): per the LangChain docs, from_tiktoken_encoder measures
# chunk_size / chunk_overlap in tiktoken tokens rather than characters - confirm
# against the installed version.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=50
)
input_txt = contex['DNV-RC-240422.pdf']['pdf']
# all_splits is the sequence of chunks that the later cells index and iterate over.
all_splits = text_splitter.split_text(input_txt)

In [9]:
len(all_splits)


5

In [None]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
embedding = GPT4AllEmbeddings()
# Index
# all_splits is produced by split_text(), i.e. it holds plain strings, so the
# collection is built with from_texts. from_documents expects Document objects
# and fails on bare strings.
vectorstore = Chroma.from_texts(
    texts=all_splits,
    collection_name="rag-chroma",
    embedding=embedding,
)
retriever = vectorstore.as_retriever()

In [3]:
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate


model = Ollama(model="phi3")

# Two-example variant of the extraction prompt (phi3 chat markers written out
# by hand). NOTE: PROMPT is assigned again right after this statement, so this
# variant is not the one the chain ends up using.
# {context} receives one chunk of the minutes; {latest_proposal} receives the
# title of the last proposal found in the previous chunk.
# The label of example 1 read "ast proposal:"; it now matches example 2.
PROMPT ="""<|system|>
You are an expert in finding proposals in a meeting note. You are provided with a context delimited by ### which is a chunk of a large meeting note. The goal is to find unique individual proposals in the meeting. Your task is to find the count of new proposal and write the title of the proposal(s). Seperate different proposals by  ||. You also will be given the title from the previous proposal, Do NOT count it as a new one if it was in the previous chunk. If a proposal is not complete (does not have the result), do not count it. Do NOT write more than 10 words for each proposal.  Your output style should be this: "<num proposal> || <proposal title> || <proposal title> ...". Here are two examples:
example 1 
Context:###Minutes of the Council meeting of February 6, 2024, be approved.\n CARRIED UNANIMOUSLY \n Council Meeting\n Minutes, February 27, 2024 3\n 3. Council (City Finance and Services) \n MOVED by Councillor Dominato\n SECONDED by Councillor Carr\n THAT the Minutes of the Council meeting following the Standing Committee on City\n Finance and Services meeting of February 7, 2024, be approved.\n CARRIED UNANIMOUSLY\n 4. Court of Revision (Business Improvement Areas) - February 8, 2024\n MOVED by Councillor Bligh\n SECONDED by Councillor Zhou\n  THAT the Minutes of the Court of Revision (Business Improvement Areas) meeting of\n February 8, 2024, be approved.\n CARRIED UNANIMOUSLY\n MATTERS ADOPTED ON CONSENT\n MOVED by Councillor Carr ###
last proposal: ###  the Minutes of the Council meeting of February 6, 2024, be approved. ###
assistant: 2 || the Minutes of the Council meeting following the Standing Committee on City Finance and Services, be approved || the Minutes of the Court of Revision (Business Improvement Areas) meeting of February 8, 2024, be approved.

example 2 
Context: ###  Tax Rates Bylaw”:\n THAT “Tax Rates Bylaw, 2024, No. 9017” be considered.\n CARRIED UNANIMOUSLY \n R2024-04-22/10\n BYLAW FIRST, SECOND AND THIRD READINGS \n 11. “Tax Rates Bylaw, 2024, No. 9017” \n Moved by Councillor Valente, seconded by Councillor Shahriari\n THAT “Tax Rates Bylaw, 2024, No. 9017” be given first and second readings;\n AND THAT “Tax Rates Bylaw, 2024, No. 9017” be given third reading.\n CARRIED UNANIMOUSLY\n R2024-04-22/11\n PUBLIC CLARIFICATION PERIOD\n Mayor Buchanan declared a recess at 9:44 pm for the Public Clarification Period and  \n Report: Manager, Public Realm Infrastructure, April 10, 2024\n Moved by Councillor Valente, seconded by Councillor Girard AND THAT staff be directed of the\n Lonsdale Highway Overpass Mobility Improvements with the Phase 1 concept of the\n Upper Levels Greenway to develop a coordinated  \n Business Licensing ###
last proposal: ### Tax Rates Bylaw, 2024, No. 9017” be considered.###
assistant: 1 || Tax Rates Bylaw, 2024, No. 9017” be given first and second readings
<|end|>
<|user|>
Context:
###
{context}
###

Last proposal:
###
{latest_proposal}
###
<|end|>
<|assistant|> """ 

# Single-example variant of the extraction prompt (phi3 chat markers written
# out by hand).
# {context} receives one chunk of the minutes; {latest_proposal} receives the
# title of the last proposal found in the previous chunk.
# The label of the worked example read "ast proposal:"; the missing letter is
# restored so the example names the same field as the user turn.
PROMPT ="""<|system|>
You are an expert in finding proposals in a meeting note. You are provided with a context delimited by ### which is a chunk of a large meeting note. The goal is to find unique individual proposals in the meeting. Your task is to find the count of new proposal and write the title of the proposal(s). Seperate different proposals by  ||. You also will be given the title from the previous proposal, Do NOT count it as a new one if it was in the previous chunk. If a proposal is not complete (does not have the result), do not count it. Do NOT write more than 10 words for each proposal.  Your output style should be this: "<num proposal> || <proposal title> || <proposal title> ...". Here is an example: 
Context:###Minutes of the Council meeting of February 6, 2024, be approved.\n CARRIED UNANIMOUSLY \n Council Meeting\n Minutes, February 27, 2024 3\n 3. Council (City Finance and Services) \n MOVED by Councillor Dominato\n SECONDED by Councillor Carr\n THAT the Minutes of the Council meeting following the Standing Committee on City\n Finance and Services meeting of February 7, 2024, be approved.\n CARRIED UNANIMOUSLY\n 4. Court of Revision (Business Improvement Areas) - February 8, 2024\n MOVED by Councillor Bligh\n SECONDED by Councillor Zhou\n  THAT the Minutes of the Court of Revision (Business Improvement Areas) meeting of\n February 8, 2024, be approved.\n CARRIED UNANIMOUSLY\n MATTERS ADOPTED ON CONSENT\n MOVED by Councillor Carr ###
last proposal: ###  the Minutes of the Council meeting of February 6, 2024, be approved. ###
assistant: 2 || the Minutes of the Council meeting following the Standing Committee on City Finance and Services, be approved || the Minutes of the Court of Revision (Business Improvement Areas) meeting of February 8, 2024, be approved.
<|end|>
<|user|>
Context:
###
{context}
###

Last proposal:
###
{latest_proposal}
###
<|end|>
<|assistant|> """ 
prompt = ChatPromptTemplate.from_template(PROMPT)

chain = prompt | model

# How many times one chunk is sent to the model before giving up on it. Without
# a bound, a chunk the model never answers in the expected format would keep
# the cell running forever.
MAX_ATTEMPTS = 3

latest_proposal = ""       # title of the most recent proposal; fed into the next prompt
proposal_dictionary = {}   # 0-based chunk index -> list of proposal titles found in that chunk
total_chunks = len(all_splits)
for iterator, context in enumerate(all_splits, start=1):
    print(f'Processing chunk {iterator} of {total_chunks} ...')

    fields = None
    for _attempt in range(MAX_ATTEMPTS):
        # Keep the raw reply in its own variable: latest_proposal has to stay a
        # plain string so that a retry sends exactly the same prompt again.
        response = chain.invoke({'context': context, 'latest_proposal': latest_proposal})
        # Expected reply format: "<count> || <title> || <title> ..."
        candidate = response.split('||')
        try:
            int(candidate[0])
        except ValueError:
            # Only a malformed count is treated as a bad reply; anything else
            # (including KeyboardInterrupt) propagates.
            print(f"There was an error in chunk {iterator}, running again")
            continue
        fields = candidate
        break

    if fields is None:
        print(f"Giving up on chunk {iterator} after {MAX_ATTEMPTS} attempts")
        proposal_dictionary[iterator-1] = []
        continue

    print(f"{fields[0]} new proposals was found in this chunk")
    titles = fields[1:]
    proposal_dictionary[iterator-1] = titles
    # A reply without titles (e.g. just "0") leaves the previous proposal in
    # place instead of replacing it with the count field.
    if titles:
        latest_proposal = titles[-1]
print(proposal_dictionary)

Processing chunk 1 of 5 ...
 2  new proposals was found in this chunk
Processing chunk 2 of 5 ...
 4  new proposals was found in this chunk
Processing chunk 3 of 5 ...
 1  new proposals was found in this chunk
Processing chunk 4 of 5 ...
 3  new proposals was found in this chunk
Processing chunk 5 of 5 ...
 3  new proposals was found in this chunk
[' Adoption of the Agenda for April 22, 2024 Regular Meeting | Recognition Awards and Awards of Merit for EBB AND FLOW, including L YNNMOUR APARTMENTS and various architectural firms. ', ' Heritage Advocacy at Capilano Suspension Bridge Park & Grouse Mountain.', ' The North Shore Pickleball Group seeking recognition and a presentation of their vision for a Pickleball hub on the North Shore; Increased need for more pickleball courts, with an example from North Vancouver showing disparity between pickleball and tennis courts; Residents facing parking issues due to no parking restrictions in some areas and inconsistent RPO permit distribution; C

In [45]:
# Flatten proposal_dictionary into two parallel lists: all_proposals holds every
# title, chunks holds the dictionary key each title was stored under.
all_proposals = [title for titles in proposal_dictionary.values() for title in titles]
chunks = [key for key, titles in proposal_dictionary.items() for _ in titles]
print(len(all_proposals))
print(chunks)

13
[0, 0, 1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4]


In [27]:
# save the proposal dictionary to a txt file
import json
# The dictionary is stored as JSON text; json turns the integer chunk keys into
# strings, so they come back as str when the file is loaded again.
with open('data/proposals_test.txt', 'w') as out_file:
    out_file.write(json.dumps(proposal_dictionary))

In [41]:
# The model can report the same proposal more than once, so check for duplicates.
# Cosine similarity between TF-IDF vectors is used as the duplicate score.

# Compute the cosine similarity between neighbouring proposals.
from sklearn.feature_extraction.text import TfidfVectorizer


# Adjacent proposals scoring above this value are reported as near-duplicates.
SIMILARITY_THRESHOLD = 0.5

vect = TfidfVectorizer(min_df=1, stop_words="english")
similar_pairs = []  # (index, next index, score) for every reported pair
# select two adjacent proposals and calculate the cosine similarity
for i in range(len(all_proposals)-1):
    first, second = all_proposals[i], all_proposals[i+1]
    try:
        tfidf = vect.fit_transform([first, second])
    except ValueError:
        # fit_transform raises when no term survives stop-word removal in
        # either title (e.g. both are empty); such a pair cannot be compared.
        continue
    # Rows are L2-normalised by TfidfVectorizer's default settings, so the
    # product of the two rows is their cosine similarity.
    score = (tfidf * tfidf.T).toarray()[0][1]
    if score > SIMILARITY_THRESHOLD:
        similar_pairs.append((i, i+1, score))
        print(f"Proposal {i} and {i+1} are similar with score {score}")
        print(first)
        print(second)
        print('-------------------')


Proposal 2 and 3 are similar with score 0.5201366063344381
 The North Shore Pickleball Group seeking recognition and a presentation of their vision for a Pickleball hub on the North Shore; Increased need for more pickleball courts, with an example from North Vancouver showing disparity between pickleball and tennis courts; Residents facing parking issues due to no parking restrictions in some areas and inconsistent RPO permit distribution; Confrontations and safety concerns related to traffic and parked vehicles on Rockcliff Road; Concerns about the installation costs and lack of land for new pickleball courts in Lynn Valley.
 The North Shore Pickleball Group seeking recognition and a presentation of their vision for a Pickleball hub on the North Shore 
-------------------
Proposal 9 and 10 are similar with score 0.972305585328247
 Pickleball Courts Expansion: Identify potential sites for new courts or expansion of existing ones in the District of North Vancouver; address resident exem

In [46]:
# Drop the duplicate proposals; to_remove holds positions in all_proposals / chunks.
# NOTE(review): the indices are hard-coded - presumably picked from the
# similarity report of one particular run; re-check them after re-running the
# extraction cell.
to_remove = [3,9]
unique_proposals = [title for idx, title in enumerate(all_proposals) if idx not in to_remove]
unique_chunks = [chunk_id for idx, chunk_id in enumerate(chunks) if idx not in to_remove]

In [48]:
# Regroup the remaining proposals by the chunk index they were found in.
unique_proposal_dictionary = {}
for chunk_id, title in zip(unique_chunks, unique_proposals):
    unique_proposal_dictionary.setdefault(chunk_id, []).append(title)

{0: [' Adoption of the Agenda for April 22, 2024 Regular Meeting | Recognition Awards and Awards of Merit for EBB AND FLOW, including L YNNMOUR APARTMENTS and various architectural firms. ',
  ' Heritage Advocacy at Capilano Suspension Bridge Park & Grouse Mountain.'],
 1: [' The North Shore Pickleball Group seeking recognition and a presentation of their vision for a Pickleball hub on the North Shore; Increased need for more pickleball courts, with an example from North Vancouver showing disparity between pickleball and tennis courts; Residents facing parking issues due to no parking restrictions in some areas and inconsistent RPO permit distribution; Confrontations and safety concerns related to traffic and parked vehicles on Rockcliff Road; Concerns about the installation costs and lack of land for new pickleball courts in Lynn Valley.',
  ' Increased need for more pickleball courts, with an example from North Vancouver showing disparity between pickleball and tennis courts ',
  ' R

In [5]:
#test
from langchain_core.output_parsers import StrOutputParser
from langchain_community.llms import Ollama
from langchain_core.prompts import ChatPromptTemplate

model = Ollama(model="phi3")

# Two-example variant of the extraction prompt without the ten-word limit on
# titles (phi3 chat markers written out by hand).
# {context} receives one chunk of the minutes; {latest_proposal} receives the
# title of the last proposal found in the previous chunk.
# The label of example 1 read "ast proposal:"; it now matches example 2.
PROMPT ="""<|system|>
You are an expert in finding proposals in a meeting note. You are provided with a context delimited by ### which is a chunk of a large meeting note. The goal is to find unique individual proposals in the meeting. Your task is to find the count of new proposal and write the title of the proposal(s). Seperate different proposals by  ||. You also will be given the title from the previous proposal, Do NOT count it as a new one if it was in the previous chunk. If a proposal is not complete (does not have the result), do not count it. Your output style should be this: "<num proposal> || <proposal title> || <proposal title> ...". Here are two examples:
example 1 
Context:###Minutes of the Council meeting of February 6, 2024, be approved.\n CARRIED UNANIMOUSLY \n Council Meeting\n Minutes, February 27, 2024 3\n 3. Council (City Finance and Services) \n MOVED by Councillor Dominato\n SECONDED by Councillor Carr\n THAT the Minutes of the Council meeting following the Standing Committee on City\n Finance and Services meeting of February 7, 2024, be approved.\n CARRIED UNANIMOUSLY\n 4. Court of Revision (Business Improvement Areas) - February 8, 2024\n MOVED by Councillor Bligh\n SECONDED by Councillor Zhou\n  THAT the Minutes of the Court of Revision (Business Improvement Areas) meeting of\n February 8, 2024, be approved.\n CARRIED UNANIMOUSLY\n MATTERS ADOPTED ON CONSENT\n MOVED by Councillor Carr ###
last proposal: ###  the Minutes of the Council meeting of February 6, 2024, be approved. ###
assistant: 2 || the Minutes of the Council meeting following the Standing Committee on City Finance and Services, be approved || the Minutes of the Court of Revision (Business Improvement Areas) meeting of February 8, 2024, be approved.

example 2 
Context: ###  Tax Rates Bylaw”:\n THAT “Tax Rates Bylaw, 2024, No. 9017” be considered.\n CARRIED UNANIMOUSLY \n R2024-04-22/10\n BYLAW FIRST, SECOND AND THIRD READINGS \n 11. “Tax Rates Bylaw, 2024, No. 9017” \n Moved by Councillor Valente, seconded by Councillor Shahriari\n THAT “Tax Rates Bylaw, 2024, No. 9017” be given first and second readings;\n AND THAT “Tax Rates Bylaw, 2024, No. 9017” be given third reading.\n CARRIED UNANIMOUSLY\n R2024-04-22/11\n PUBLIC CLARIFICATION PERIOD\n Mayor Buchanan declared a recess at 9:44 pm for the Public Clarification Period and  \n Report: Manager, Public Realm Infrastructure, April 10, 2024\n Moved by Councillor Valente, seconded by Councillor Girard AND THAT staff be directed of the\n Lonsdale Highway Overpass Mobility Improvements with the Phase 1 concept of the\n Upper Levels Greenway to develop a coordinated  \n Business Licensing ###
last proposal: ### Tax Rates Bylaw, 2024, No. 9017” be considered.###
assistant: 1 || Tax Rates Bylaw, 2024, No. 9017” be given first and second readings
<|end|>
<|user|>
Context:
###
{context}
###

Last proposal:
###
{latest_proposal}
###
<|end|>
<|assistant|> """ 

prompt = ChatPromptTemplate.from_template(PROMPT)
parser = StrOutputParser()

chain = prompt | model | parser

async for chunk in chain.astream({'context': all_splits[0], 'latest_proposal':latest_proposal}):
    print(chunk, end='', flush=True)

 1 || Tax Rates Bylaw, 2024, No. 9017" be given first and second readings ###
Example 2 assistant: 1 || Tax Rates Bylaw, 2024, No. 9017" be considered.

In [None]:
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
# Name of the model that the chat wrapper below is asked to use.
local_llm = "phi3"
# Chat wrapper configured with format="json" and temperature=0.
llm = ChatOllama(model=local_llm, format="json", temperature=0)


In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field


# Schema for structured response: the fields requested for one person.
# The Field descriptions are part of the schema itself, so they are left as written.
class Person(BaseModel):
    # Name of the person.
    name: str = Field(description="The person's name", required=True)
    # Height as a float; the schema does not state a unit.
    height: float = Field(description="The person's height", required=True)
    # Hair colour as free text.
    hair_color: str = Field(description="The person's hair color")


# Prompt template
prompt = PromptTemplate.from_template(
    """Alex is 5 feet tall. 
Claudia is 1 feet taller than Alex and jumps higher than him. 
Claudia is a brunette and Alex is blonde.

Human: {question}
AI: """
)

# Chain: prompt -> model constrained to answer with a Person object.
# NOTE(review): OllamaFunctions is not imported in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel. It presumably
# comes from langchain_experimental - confirm and add the import.
llm = OllamaFunctions(model="phi3", format="json", temperature=0)
structured_llm = llm.with_structured_output(Person)
chain = prompt | structured_llm

# Ask about one of the two people described in the prompt and display the result.
alex = chain.invoke("Describe Alex")
alex