In [11]:
import re
import fitz  # PyMuPDF
import json
from collections import defaultdict
import pandas as pd
import os
from nltk.tokenize import sent_tokenize

In [1]:
# Pick the single meeting to process and build the path to its PDF.
pdf_folder = 'data/batch/'
metting_name = "gib_mcp_rgc_min__2024-04-09__01"
pdf_file = f"{metting_name}.pdf"
pdf_path = os.path.join(pdf_folder, pdf_file)

# The meeting map spreadsheet is indexed by its 'standard name' column so the
# row for this meeting can be looked up by the PDF file name.
df = pd.read_excel("data/meetingmap.xlsx").set_index('standard name')

# Keep every column of that row as a plain dict and display it.
meeting_info = df.loc[pdf_file].to_dict()
meeting_info

{'location': 'gibsons',
 'location type': 'municipality',
 'meeting type': 'regular_council',
 'data type': 'minutes',
 'meeting date': Timestamp('2024-04-09 00:00:00'),
 'transcript': 'Yes',
 'comment': nan}

In [15]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order, or an empty string
        if the document has no pages.
    """
    # Open the document in a `with` block so it is closed even if reading a
    # page raises.
    with fitz.open(pdf_path) as doc:
        # Join once at the end instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)


# Collect everything known about this meeting in one dictionary:
#   'pdf'        - text extracted from the PDF
#   'metadata'   - the PDF's metadata
#   'transcript' - contents of a .txt file with the same base name, or None
contex = {}

text = extract_text_from_pdf(pdf_path)
contex['pdf'] = text

# Read the metadata inside a `with` block so this second document handle is
# closed instead of being left open.
with fitz.open(pdf_path) as doc:
    metadata = doc.metadata
contex['metadata'] = metadata

# Look for a transcript next to the PDF. Only the file extension is swapped,
# so a ".pdf" occurring elsewhere in the name is left alone.
txt_path = os.path.join(pdf_folder, os.path.splitext(pdf_file)[0] + '.txt')
if os.path.exists(txt_path):
    with open(txt_path, 'r') as file:
        contex['transcript'] = file.read()
else:
    contex['transcript'] = None

In [16]:
# Sanity check: show which entries were collected for this meeting.
contex.keys()

dict_keys(['pdf', 'metadata', 'transcript'])

In [10]:
# Build the theme -> phrases lookup from themekeywordmap.xlsx.
# The sheet is read through its 'theme' and 'phrase' columns; every phrase
# that shares a theme ends up in that theme's list, e.g.
# category_keywords = {
#     "theme 1": ["phrase 1", "phrase 2", "phrase 3"],
#     "theme 2": ["phrase 4", "phrase 5", "phrase 6", "phrase 7"]
# }
df = pd.read_excel("data/themekeywordmap.xlsx")
category_keywords = {
    theme: rows['phrase'].tolist()
    for theme, rows in df.groupby('theme')
}
print("total number of themes: ", len(category_keywords))



total number of themes:  56


In [18]:
# Regular-expression fragments that mark where the document is cut into
# segments. segment_document joins them with "|" and matches them with
# re.IGNORECASE and no word boundaries, so short entries such as "THAT" also
# match lower-case "that" and occurrences inside longer words.
split_patterns = [
    r"MOVED by",
    r"SECONDED by",
    r"WHEREAS",
    r"THEREFORE BE IT RESOLVED THAT",
    r"CARRIED UNANIMOUSLY",
    r"REJECTED",
    r"THAT",
    r"APPROVED",
    r"ADOPTED",
    r"RESOLVED"
]

In [12]:
def segment_document(document, patterns):
    """
    Split a document into text segments at every match of any pattern.

    Args:
        document: The text to split.
        patterns: Regular-expression strings. They are joined into a single
            alternation and matched case-insensitively, without word
            boundaries, so a pattern such as "THAT" also matches "that" and
            the "that" inside a longer word.

    Returns:
        A list of non-empty, whitespace-stripped segments in document order.
        The matched delimiter text is discarded unless a pattern contains a
        capturing group. When no usable pattern is given, the whole stripped
        document is returned as a single segment.
    """
    # Drop empty patterns: an empty alternative matches between every
    # character and would make re.split shred the document.
    usable_patterns = [pattern for pattern in (patterns or []) if pattern]
    if not usable_patterns:
        whole = document.strip()
        return [whole] if whole else []

    combined_pattern = '|'.join(usable_patterns)
    pieces = re.split(combined_pattern, document, flags=re.IGNORECASE)

    # re.split yields None for capture groups that took no part in a match,
    # so skip those as well as empty / whitespace-only pieces.
    return [piece.strip() for piece in pieces if piece and piece.strip()]

In [13]:
def match_keywords(segment, category_keywords):
    """
    Return the categories that have at least one keyword in the segment.

    Args:
        segment: Text to search.
        category_keywords: Mapping of category -> iterable of keywords.

    Returns:
        A list of matching categories in the iteration order of
        `category_keywords`; each category appears at most once.

    Keywords are matched literally, case-insensitively and as whole phrases
    (not preceded or followed by a word character). Entries that are not
    strings, or that are blank after stripping whitespace, are ignored.
    """
    matched_categories = []
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            # Skip non-string entries (e.g. NaN); re.escape would raise on them.
            if not isinstance(keyword, str):
                continue
            keyword = keyword.strip()
            # A blank keyword would compile to a pattern that matches almost
            # any text.
            if not keyword:
                continue
            # Lookarounds behave like \b for keywords that start and end with
            # a word character, and still work when a keyword starts or ends
            # with punctuation.
            pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
            if re.search(pattern, segment, re.IGNORECASE):
                matched_categories.append(category)
                break  # One hit is enough for this category
    return matched_categories

In [14]:
def combine_segments(segments, category_keywords):
    """
    Merge consecutive segments into chunks and tag each chunk with categories.

    A new chunk starts at every segment that matches at least one category;
    the segments that follow without a match are appended to that chunk,
    separated by single spaces. Segments that come before the first match are
    gathered into a leading chunk with an empty category list.

    Args:
        segments: Iterable of text segments in document order.
        category_keywords: Mapping of category -> keywords, passed through to
            `match_keywords`.

    Returns:
        A list of dicts with keys "text" (the joined chunk text) and
        "categories" (the categories matched by the chunk's first segment, in
        the order `match_keywords` returned them).
    """
    combined_segments = []
    chunk_parts = []       # segments gathered for the chunk being built
    chunk_categories = {}  # dict used as an ordered set of categories

    def flush():
        combined_segments.append({
            # Joining the parts avoids a stray leading space when the chunk
            # starts with an unmatched segment.
            "text": " ".join(chunk_parts),
            # Insertion order keeps the output identical from run to run.
            "categories": list(chunk_categories)
        })

    for segment in segments:
        matched_categories = match_keywords(segment, category_keywords)
        if matched_categories and chunk_parts:
            # A matching segment closes the chunk in progress and opens a new one.
            flush()
            chunk_parts = []
            chunk_categories = {}
        chunk_parts.append(segment)
        chunk_categories.update(dict.fromkeys(matched_categories))

    if chunk_parts:
        flush()

    return combined_segments

In [21]:
# Cut the extracted PDF text at the split_patterns markers, then merge the
# pieces into chunks tagged with the themes whose keywords they contain.
segments = segment_document(contex['pdf'], split_patterns)
combined_segments = combine_segments(segments, category_keywords)

In [26]:
# Initial assessment: how many combined segments were produced.
print("Total initial segments:", len(combined_segments))

Total initial segments: 8


In [29]:
# Serialise the combined segments as pretty-printed JSON (2-space indent).
output_nlp = json.dumps(combined_segments, indent=2)