In [11]:
import re
import fitz  # PyMuPDF
import json
from collections import defaultdict
import pandas as pd
import os
from nltk.tokenize import sent_tokenize

In [1]:
# Single-document run: name the meeting once and derive the PDF location from it.
pdf_folder = 'data/batch/'
metting_name = "gib_mcp_rgc_min__2024-04-09__01"
pdf_file = f"{metting_name}.pdf"
pdf_path = os.path.join(pdf_folder, pdf_file)

# Meeting map spreadsheet, re-indexed by its 'standard name' column so a
# meeting can be looked up by PDF file name.
df = pd.read_excel("data/meetingmap.xlsx").set_index('standard name')

# Pull this meeting's row and expose every column as a plain dictionary.
meeting_info = df.loc[pdf_file].to_dict()
meeting_info

{'location': 'gibsons',
 'location type': 'municipality',
 'meeting type': 'regular_council',
 'data type': 'minutes',
 'meeting date': Timestamp('2024-04-09 00:00:00'),
 'transcript': 'Yes',
 'comment': nan}

In [15]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string made of each page's ``get_text()`` output, joined with no
        separator between pages.
    """
    # The context manager closes the document even if a page fails to extract,
    # so repeated calls over a batch of PDFs do not leak open handles.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


# Collect everything known about this meeting in one dictionary:
#   'pdf'        -> text extracted from the PDF
#   'metadata'   -> the PDF's metadata as reported by PyMuPDF
#   'transcript' -> contents of a .txt file with the same base name, or None
contex = {}

text = extract_text_from_pdf(pdf_path)
contex['pdf'] = text

# Read the metadata inside a context manager so the document handle is closed
# afterwards instead of being left open.
with fitz.open(pdf_path) as doc:
    metadata = doc.metadata
contex['metadata'] = metadata

# Swap only the extension: a plain .replace('.pdf', '.txt') would leave a
# '.PDF' name untouched and make the PDF itself look like the transcript.
txt_path = os.path.join(pdf_folder, os.path.splitext(pdf_file)[0] + '.txt')
if os.path.exists(txt_path):
    # NOTE(review): no encoding is given, so the platform default is used —
    # confirm how the transcripts were written before pinning one.
    with open(txt_path, 'r') as file:
        contex['transcript'] = file.read()
else:
    contex['transcript'] = None

In [16]:
# Confirm which entries the context dictionary now holds.
contex.keys()

dict_keys(['pdf', 'metadata', 'transcript'])

In [55]:
# Build the theme -> phrases lookup from themekeywordmap.xlsx.
# The code reads two columns, 'theme' and 'phrase'; every phrase that shares a
# theme is collected into one list, giving a dictionary shaped like:
# category_keywords = {
#     "theme 1": ["phrase 1", "phrase 2", "phrase 3"],
#     "theme 2": ["phrase 4", "phrase 5", "phrase 6", "phrase 7"]
# }
df = pd.read_excel("data/themekeywordmap.xlsx")
category_keywords = {
    theme: group['phrase'].tolist()
    for theme, group in df.groupby('theme')
}
print("total number of themes: ", len(category_keywords))



total number of themes:  56


# Step 1: Segment using NLP

In [18]:
# Regex fragments marking procedural boundaries in the minutes. segment_document
# joins them with '|' and splits case-insensitively, so the matched marker text
# itself is dropped from the resulting segments.
# NOTE(review): the bare "THAT" entry, applied case-insensitively and without
# word boundaries, also matches every ordinary "that" in running prose —
# confirm this is intended.
split_patterns = [
    r"MOVED by",
    r"SECONDED by",
    r"WHEREAS",
    r"THEREFORE BE IT RESOLVED THAT",
    r"CARRIED UNANIMOUSLY",
    r"REJECTED",
    r"THAT",
    r"APPROVED",
    r"ADOPTED",
    r"RESOLVED"
]

In [12]:
def segment_document(document, patterns):
    """Split a document wherever one of the given regex patterns matches.

    The patterns are combined into a single case-insensitive alternation. The
    matched text is consumed by the split and does not appear in the result.

    Args:
        document: Text to segment. Empty or None yields an empty list.
        patterns: Iterable of regex pattern strings. Empty entries are ignored.

    Returns:
        List of non-empty segments with surrounding whitespace stripped. With
        no usable pattern the whole stripped document is returned as one
        segment.
    """
    if not document:
        return []

    usable_patterns = [pattern for pattern in patterns if pattern]
    if not usable_patterns:
        # An empty alternation would match between every character and shred
        # the document into single letters.
        whole = document.strip()
        return [whole] if whole else []

    splitter = re.compile('|'.join(usable_patterns), flags=re.IGNORECASE)

    # re.split yields None for capture groups that did not participate in a
    # match; drop those along with blank pieces.
    return [
        piece.strip()
        for piece in splitter.split(document)
        if piece and piece.strip()
    ]

In [13]:
def match_keywords(segment, category_keywords):
    """Return the categories that have at least one keyword in the segment.

    Matching is case-insensitive and whole-phrase: the keyword must not be
    directly preceded or followed by a word character.

    Args:
        segment: Text to search. Empty or None yields an empty list.
        category_keywords: Mapping of category -> iterable of keyword phrases.
            Entries that are not strings (e.g. NaN from blank spreadsheet
            cells) or are blank after stripping are skipped.

    Returns:
        List of matching categories, in the iteration order of
        ``category_keywords``; each category appears at most once.
    """
    matched_categories = []
    if not segment:
        return matched_categories

    for category, keywords in category_keywords.items():
        for keyword in keywords:
            # Blank spreadsheet cells arrive as NaN/None; re.escape would raise.
            if not isinstance(keyword, str):
                continue
            keyword = keyword.strip()
            # An empty keyword would reduce to a bare boundary test and match
            # every segment.
            if not keyword:
                continue
            # Lookarounds equal \b for keywords that start and end with a word
            # character, and still work for ones like "(OCP)" or "R&D.".
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            if re.search(pattern, segment, re.IGNORECASE):
                matched_categories.append(category)
                break  # one hit is enough for this category
    return matched_categories

In [14]:
def combine_segments(segments, category_keywords):
    """Group consecutive segments, starting a new group at each keyword match.

    Every segment for which ``match_keywords`` returns at least one category
    opens a new group; following segments without a match are appended to it,
    separated by a single space. Unmatched segments before the first match
    form a group of their own with no categories.

    Args:
        segments: Iterable of text segments in document order.
        category_keywords: Mapping of category -> keyword list, passed through
            to ``match_keywords``.

    Returns:
        List of ``{"text": str, "categories": list}`` dictionaries. Categories
        are listed in the order they were first matched.
    """
    combined_segments = []
    current_segment = ""
    # A dict is used as an insertion-ordered set so the emitted category order
    # is the same on every run (a plain set's order is not).
    current_categories = {}

    for segment in segments:
        matched_categories = match_keywords(segment, category_keywords)
        if matched_categories:
            if current_segment:
                combined_segments.append({
                    "text": current_segment,
                    "categories": list(current_categories)
                })
                current_categories = {}
            current_segment = segment
            current_categories.update(dict.fromkeys(matched_categories))
        elif current_segment:
            current_segment += " " + segment
        else:
            # No separator before the very first piece, so the text does not
            # begin with a stray space.
            current_segment = segment

    if current_segment:
        combined_segments.append({
            "text": current_segment,
            "categories": list(current_categories)
        })

    return combined_segments

In [21]:
# Split the PDF text on the procedural markers, then group the pieces by the
# theme keywords they match.
segments = segment_document(contex['pdf'], split_patterns)
combined_segments = combine_segments(segments, category_keywords)

In [26]:
# Initial assessment: how many combined segments were produced.
print("Total initial segments:", len(combined_segments))

Total initial segments: 8


In [29]:
# Serialise the combined segments as pretty-printed JSON.
output_nlp = json.dumps(combined_segments, indent=2)

In [33]:
# Number of raw segments before any combining.
len(segments)

42

In [45]:
def combine_segments_v2(segments, category_keywords):
    """Group consecutive segments, merging neighbours that match the same categories.

    A segment with keyword matches starts a new group only when text has
    already been collected AND its set of categories differs from that of the
    previous matched segment. Otherwise it is appended to the current group,
    as are all segments without a match.

    Args:
        segments: Iterable of text segments in document order.
        category_keywords: Mapping of category -> keyword list, passed through
            to ``match_keywords``.

    Returns:
        List of ``{"text": str, "categories": list}`` dictionaries. Pieces of
        a group are joined by single spaces; categories are listed in the
        order they were first matched.
    """
    combined_segments = []
    current_segment = ""
    # A dict is used as an insertion-ordered set so the emitted category order
    # is the same on every run (a plain set's order is not).
    current_categories = {}
    # Kept as a set so the comparison below is set-to-set.
    last_matched_categories = set()

    for segment in segments:
        matched_categories = match_keywords(segment, category_keywords)
        matched_set = set(matched_categories)

        if matched_categories and current_segment and matched_set != last_matched_categories:
            # The categories changed: close the running group and start a new one.
            combined_segments.append({
                "text": current_segment,
                "categories": list(current_categories)
            })
            current_segment = segment
            current_categories = dict.fromkeys(matched_categories)
        else:
            # Same categories, no match, or nothing collected yet: extend the
            # group, without a leading space on its first piece.
            current_segment = current_segment + " " + segment if current_segment else segment
            current_categories.update(dict.fromkeys(matched_categories))

        if matched_categories:
            last_matched_categories = matched_set

    # Append the last segment
    if current_segment:
        combined_segments.append({
            "text": current_segment,
            "categories": list(current_categories)
        })

    return combined_segments

In [46]:
# Re-run the grouping with the v2 rule; this replaces the earlier combined_segments.
combined_segments = combine_segments_v2(segments, category_keywords)

# Step 2: Find categories using the vector database

In [56]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')


# Generate embeddings for each category based on keywords:
# category_embeddings maps each category to the encoded form of its keyword list.
category_embeddings = {}
for category, keywords in category_keywords.items():
    category_embeddings[category] = model.encode(keywords, convert_to_tensor=True)

# Flatten category embeddings for FAISS indexing.
# The two lists grow in lockstep, so category_indices[i] is the category of
# the i-th row that ends up in flat_embeddings.
flat_embeddings = []
category_indices = []
for category, embeddings in category_embeddings.items():
    for embedding in embeddings:
        flat_embeddings.append(embedding.cpu().detach().numpy())
        category_indices.append(category)

# Stack the per-keyword vectors into one 2-D array (one row per keyword).
flat_embeddings = np.vstack(flat_embeddings)

# Create FAISS index (flat L2) sized to the embedding width
dimension = flat_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the index
index.add(flat_embeddings)

# Save FAISS index and data for later use.
# category_indices is a plain Python list here; np.save converts it to an array.
faiss.write_index(index, 'data/faiss_index.bin')
np.save('data/category_indices.npy', category_indices)




In [69]:
def find_elbow_point_indices(data):
    """Return the positions of the values at or below the largest gap in ``data``.

    The values are sorted ascending and the largest jump between consecutive
    sorted values is located. The value just below that jump is the elbow
    point. When several jumps tie for largest, the lowest one is used.

    Args:
        data: One-dimensional sequence or array of numbers.

    Returns:
        List of integer positions ``i`` (in the original order of ``data``)
        for which ``data[i] <= elbow point``. An input with fewer than two
        values has no gap, so an empty list is returned.
    """
    values = np.asarray(data)
    if values.size < 2:
        # np.argmax raises on an empty difference array.
        return []

    ordered = np.sort(values)
    # Index of the largest jump; the elbow is the value on its lower side.
    elbow_point = ordered[np.argmax(np.diff(ordered))]

    return np.flatnonzero(values <= elbow_point).tolist()

In [70]:
def query_vector_database(text, model, index, category_indices,num_categories_to_search=20):
    """Return the categories of the keyword embeddings closest to ``text``.

    The text is embedded, its nearest neighbours are fetched from the index,
    and only the neighbours on the near side of the largest gap in the
    distances are kept. Each category is reported once, nearest first.

    Args:
        text: Text to classify.
        model: Encoder whose ``encode([text], convert_to_tensor=True)`` result
            supports ``.cpu().detach().numpy()``.
        index: Vector index exposing ``search(query, k)`` that returns
            ``(distances, ids)``. If it has an ``ntotal`` attribute, ``k`` is
            capped at it.
        category_indices: Sequence mapping a vector id to its category.
        num_categories_to_search: Maximum number of neighbours to request.

    Returns:
        List of distinct categories in order of first appearance among the
        kept neighbours; empty when nothing can be searched.
    """
    # Generate embedding for the text
    text_embedding = model.encode([text], convert_to_tensor=True)
    text_embedding = text_embedding.cpu().detach().numpy()

    # Never ask for more neighbours than the index holds: the surplus slots
    # come back as id -1 with a huge distance, which would be taken for the
    # largest gap and, as a negative index, silently hit the last category.
    neighbour_count = min(
        num_categories_to_search,
        getattr(index, "ntotal", num_categories_to_search),
    )
    if neighbour_count <= 0:
        return []

    # Search the index for the most similar embeddings
    distances, indices = index.search(text_embedding, neighbour_count)
    neighbour_distances = distances[0]
    neighbour_ids = indices[0]

    # find_elbow_point_indices returns POSITIONS in the distance array whose
    # value is at or below the largest gap. Smaller distance means more
    # similar, so those are the neighbours to keep. A lone result has no gap
    # and is kept as is.
    if len(neighbour_distances) < 2:
        near_positions = set(range(len(neighbour_distances)))
    else:
        near_positions = set(find_elbow_point_indices(neighbour_distances))

    # Walk neighbours nearest-first, testing the rank position (not the vector
    # id) against the elbow result, and report each category once.
    seen_categories = set()
    categories_by_vd = []
    for position, idx in enumerate(neighbour_ids):
        if idx < 0 or position not in near_positions:
            continue
        category = category_indices[idx]
        if category not in seen_categories:
            categories_by_vd.append(category)
            seen_categories.add(category)

    return categories_by_vd

In [71]:
# Load the FAISS index and category indices
index = faiss.read_index('data/faiss_index.bin')
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so it
# is only safe for files this notebook wrote itself. If the saved array holds
# plain strings, the flag is presumably unnecessary — confirm and drop it.
category_indices = np.load('data/category_indices.npy', allow_pickle=True)


# Verify categories and update segments:
# each segment dict is modified in place, gaining a "categories_by_vd" entry
# with the categories suggested by the vector search.
for segment in combined_segments:
    suggested_categories = query_vector_database(segment['text'], model, index, category_indices)
    segment["categories_by_vd"] = suggested_categories

# Print the updated categorized segments
for segment in combined_segments:
    print(json.dumps(segment, indent=2))

{
  "text": " Regular Council \nMEETING MINUTES \nTuesday, April 9, 2024 \nCouncil Chambers, 7:00pm \nTown Hall, 474 South Fletcher Road, Gibsons, BC \n \n \nPRESENT: \nMayor Silas White  \nCouncillor David Croal \nCouncillor Annemarie De Andrade \nCouncillor Stafford Lumley \nCouncillor Christi Thompson \nYouth Representative Cael Read \n  \nSTAFF: \n  \nEmanuel Machado, Chief Administrative Officer \nRebecca Anderson, Corporate Officer \nLorraine Coughlin, Director of Finance \nTrevor Rutley, Director of Infrastructure Services \nLesley-Anne Staats, Director of Planning via Zoom  \nNoni Weitz, Manager of Financial Services \nHeidi Siller, Executive Assistant (recorder) \n \nCALL TO ORDER \nThe Mayor called the meeting to order at 7:00pm. \n \nAPPROVAL OF THE AGENDA \n \n \nR2024-63 \nRegular Council Agenda - April 9, 2024 Councillor De Andrade Councillor Lumley the Regular Business Agenda of April 9, 2024 be . \nCARRIED \n \n \nADOPTION OF MINUTES \n \n \nR2024-64 \nMinutes of the Re