In [96]:
import fitz  # PyMuPDF
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk
import pandas as pd

# Step 1: Extract text from the PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The result of ``page.get_text()`` for each page, joined with no
        separator.
    """
    # Open the document as a context manager so it is closed even if
    # extracting a page raises.
    with fitz.open(pdf_path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

from nltk.tokenize import RegexpTokenizer

def custom_tokenizer(text):
    """Split *text* into tokens made of word characters and apostrophes.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``RegexpTokenizer.tokenize``.
    """
    # Apostrophes are part of the character class, so they stay inside a
    # token instead of splitting it.
    apostrophe_word_pattern = r"\b[\w']+\b"
    return RegexpTokenizer(apostrophe_word_pattern).tokenize(text)


# Step 2: Clean the text
def clean_text(text):
    """Tokenize *text* and return the lowercased tokens worth counting.

    A token is kept only when all of the following hold:

    * its lowercase form is neither an NLTK French stopword nor one of the
      custom stopwords listed below;
    * it is not found in ``string.punctuation``;
    * it matches none of the noise patterns defined below;
    * it is longer than two characters.

    Args:
        text: Raw text to clean.

    Returns:
        list: The surviving tokens, lowercased, in their original order.

    Side effects:
        Calls ``nltk.download`` for the ``stopwords`` and ``punkt`` resources
        on every call.
    """
    import re

    nltk.download('stopwords')
    nltk.download('punkt')

    # French stopwords, extended with self-referential keywords offering
    # little to no insight.
    stop_words = set(stopwords.words("french"))
    stop_words.update({
        "québec", "québécois", "québécoise", "cinq", "dollars", "millions", "milliards", "budget", "ans", "années", "plan", "total", "notamment",
        "cette", "cet", "ainsi", "autres", "mars", "afin", "entre", "dont", "selon", "1er", "vers", "sous"
    })

    # A token matching any of these patterns is treated as noise and dropped.
    noise_patterns = (
        # Numbers with an optional sign (±, –, −, +, -) and decimal part.
        re.compile(r"^[\u00b1\u2013\u2212+-]?\d+([.,]\d+)?$"),
        # Years like 2022, 2022-2023, 2019-.
        # NOTE(review): `^` binds only to the first alternative and `$` only
        # to the second, so with .match() any token that starts with an
        # optional sign followed by four digits is dropped. Left unchanged to
        # keep the output identical; confirm whether that breadth is wanted.
        re.compile(r"^[\u00b1\u2013\u2212+-]?\d{4}([-\‐‑–—‒﹣－\u00AD]\d{4})?|[\u00b1\u2013\u2212+-]?\d{4}[-‐‑–—‒﹣－\u00AD]?$"),
        # Page numbers like b.25, c.17.
        re.compile(r"^[a-zA-Z]\.\d+$"),
        # Sequences of dots.
        re.compile(r"^\.+$"),
        # Tokens starting with ▪.
        re.compile(r"^▪.*$"),
        # Hierarchical numbers like 3.1.1.
        re.compile(r"^\d+(\.\d+)+$"),
        # Two-digit ranges like 06-07 or 17-18.
        re.compile(r"^\d{2}[-–—]\d{2}$"),
        # ISBN numbers like 978-2-550-83607-0.
        re.compile(r"^\d{3}-\d-\d{3}-\d{5}-\d$"),
        # Alphanumeric sequences: letters, an optional - or +, then a digit
        # (e.g. a-13, aa1).
        re.compile(r"^[a-zA-Z]+[-+]?\d+.*$"),
        # Tokens that start with a non-digit and end with digits.
        re.compile(r"^[^\d]+(?:-\d+)?\d+$"),
    )

    # Tokenize exactly once with word_tokenize.
    # NOTE(review): custom_tokenizer(text) used to be called here as well, but
    # its result was overwritten before being read, so word_tokenize output is
    # what has always been filtered. Switch tokenizers deliberately if
    # apostrophe-preserving tokens are wanted.
    # NOTE(review): no `language` argument is passed, so NLTK's default
    # applies; consider language="french" for this corpus.
    words = word_tokenize(text)

    cleaned_words = []
    for word in words:
        lowered = word.lower()
        # Cheap checks first; the patterns are matched against the token's
        # original casing.
        if len(word) <= 2 or lowered in stop_words or word in string.punctuation:
            continue
        if any(pattern.match(word) for pattern in noise_patterns):
            continue
        cleaned_words.append(lowered)
    return cleaned_words


# Step 3: Count word frequencies (every word is kept; single occurrences are not filtered out)
def count_word_frequencies(cleaned_words):
    """Count how many times each word occurs in *cleaned_words*.

    Args:
        cleaned_words: Iterable of hashable tokens.

    Returns:
        dict: Maps each distinct word to its number of occurrences, with keys
        in order of first appearance. Every word is kept, including words
        that occur only once.
    """
    from collections import Counter

    # iter() makes Counter always count elements one by one: a mapping
    # argument is iterated over its keys rather than read as ready-made
    # counts, and None still raises TypeError.
    return dict(Counter(iter(cleaned_words)))

# Step 4: Export to CSV
def export_to_csv(word_counts, output_path):
    """Write word/frequency pairs to a CSV file, highest frequency first.

    Args:
        word_counts: Mapping of word to frequency.
        output_path: Destination path of the CSV file.

    The file has the columns ``Word`` and ``Frequency``, no index column, and
    is written with the ``utf-8-sig`` encoding. A confirmation line is printed
    once the file has been written.
    """
    rows = list(word_counts.items())
    table = pd.DataFrame(rows, columns=["Word", "Frequency"])
    ranked = table.sort_values(by="Frequency", ascending=False)
    ranked.to_csv(
        output_path,
        index=False,
        encoding="utf-8-sig",
        quotechar='"',
    )
    print(f"Data exported to {output_path}")

# Main script
if __name__ == "__main__":
    # Hard-coded locations of the input PDF and the output CSV.
    pdf_path = "/Users/guillaumelagace/Downloads/Word-Cloud-BudgetQC/Budget1920_PlanBudgetaire.pdf"
    output_path = "/Users/guillaumelagace/Downloads/Word-Cloud-BudgetQC/BudgetQC2019-2020.csv"

    # Pipeline: extract -> clean -> count -> export. A progress message is
    # printed before each stage.
    print("Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)

    print("Cleaning text...")
    cleaned_words = clean_text(text)

    print("Counting word frequencies...")
    word_counts = count_word_frequencies(cleaned_words)

    print("Exporting to CSV...")
    export_to_csv(word_counts, output_path)

    print("Done!")

Extracting text from PDF...
Cleaning text...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guillaumelagace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guillaumelagace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Counting word frequencies...
Exporting to CSV...
Data exported to /Users/guillaumelagace/Downloads/Word-Cloud-BudgetQC/BudgetQC2019-2020.csv
Done!
