In [1]:
import re
from pathlib import Path
from typing import List

import fitz #PyMuPDF
import pandas as pd

#### CODE INCOMPLETE
This code is incomplete and requires further work; I intend to return to it in the future. Known limitations:
* Some PDF files cannot be converted to text properly because of font-encoding problems. For example, the words "The Board anal…" are extracted as "7 K H  % R D U G  D Q D O" (every character code appears to be shifted by a fixed offset);
* The output is not divided into chapters and paragraphs, which prevents applying Latent Dirichlet Allocation (LDA) to it;
* Footnotes are mixed into the middle of the text;
* The text is saved as UTF-8 rather than ASCII.

In [2]:
def pdf_2_text(file_path: str) -> List[str]:
    """
    Extract text from a PDF file using PyMuPDF (fitz).

    Parameters:
     - file_path (str): Path to the PDF file.

    Returns:
     - List[str]: A list of strings, each representing the text extracted from a single page of the PDF.
    """
    # The context manager closes the document even when text extraction
    # raises on one of the pages, so the file handle is never leaked.
    with fitz.open(file_path) as doc:
        return [page.get_text() for page in doc]

In [3]:
# Minute format version -> text that ends the page header.
# Everything up to and including this text is discarded from each page.
NOISE_HEADER_MARKERS = {
    4: "Copom Meeting",
    5: "bcb.gov.br",
}

# Minute format version -> candidate opening sentences (upper case) of the
# minute. The candidates are tried in order against the first page.
INITIAL_TEXT_MARKERS = {
    1: [
        "THE BOARD ANALYZED THE RECENT PERFORMANCE ",
        "THE BOARD ANALYZED THE RECENT EVOLUTION ",
    ],
    2: [
        "THE BOARD ANALYZED THE RECENT PERFORMANCE ",
    ],
    3: [
        "THE MEMBERS OF THE COPOM ANALYZED ",
        "THE MEMBERS OF THE MONETARY POLICY ",
    ],
}

In [4]:
def _remove_page_header_by_noise_marker(page: str, marker: str) -> str:
    """
    Remove the page header, using a noise text marker, and returning all the content after the header.

    Args:
     - page (str): The page text.
     - marker (str): The text marker that shows the end of the header.
     
    Return:
     - str: Page text without the header.
    """
    start_index = page.find(marker)
    if start_index == -1:
        raise ValueError(f"ERROR finding the marker: '{marker}' on the page.")
    return page[start_index + len(marker):]

In [5]:
def _find_start_of_content(page: str) -> str:
    """
    Finds the start of the main content on a page, assuming there is no noise header above header.
    The logic assumes that the header ends with the first line that is not followed by a whitespace, indicating the start of a paragraph.
    This method is not perfect, but does most of the job.
    
    Args:
     - page (str): The page text.
     
    Retorna:
     - str: Page text without the header.
    """
    current_pos = 0

    while current_pos < len(page):
        newline_pos = page.find('\n', current_pos) # Finds the next line break.
        if newline_pos == -1:
            return page[current_pos:]

        # If the next character is not a whitespace, it is the start of the content.
        if page[newline_pos:newline_pos + 2] != "\n ":
            return page[newline_pos + 1:]
        
        # Else, continues loop.
        current_pos = newline_pos + 1
        
    return "" # Error

In [6]:
def _extract_for_perfect_segregation(texts_pages: List[str], minute_version: int) -> List[str]:
    """Extract text from pages, when there is perfect segregation."""

    # No need for processing. Only for version 1 with perfect segregation.
    if minute_version == 1:
        return texts_pages

    # Get 'noise header marker'.
    noise_marker = NOISE_HEADER_MARKERS.get(minute_version)
    if not noise_marker:
        raise ValueError(f"Noise header marker not found for this file version: {minute_version}")

    processed_pages = []

    # Process.
    for page in texts_pages:
        cleaned_page = page.replace('\xa0', ' ')
        if minute_version == 4:
            header_removed = _remove_page_header_by_noise_marker(cleaned_page, noise_marker)
            content = _find_start_of_content(header_removed)
        else:
            content = _remove_page_header_by_noise_marker(cleaned_page, noise_marker)
        
        processed_pages.append(content)
        
    return processed_pages

In [7]:
def _extract_for_imperfect_segregation(texts_pages: List[str], minute_version: int) -> List[str]:
    """Extract text from pages, when there is IMperfect segregation."""
    
    processed_pages = []
    
    # Process first page.
    first_page = texts_pages[0].replace('\xa0', ' ')
    first_page_upper = first_page.upper()
    start_index = -1

    # Get 'initial text markers' for first page.
    initial_text_markers = INITIAL_TEXT_MARKERS.get(minute_version)
    if not initial_text_markers:
        raise ValueError(f"Initial text marker not found FOR this file version: {minute_version}")

    # Search 'initial text marker' in the first page.
    for marker in initial_text_markers:
        start_index = first_page_upper.find(marker)
        if start_index != -1:
            break
    if start_index == -1:
        raise ValueError(f"Initial text marker not found IN this file version: {minute_version}")
        
    processed_pages.append(first_page[start_index:])
    
    # Process next pages.
    for page in texts_pages[1:]:
        cleaned_page = page.replace('\xa0', ' ')

        content = _find_start_of_content(cleaned_page)
        processed_pages.append(content)
        
    return processed_pages

In [8]:

def extract_minute_from_text(texts_pages: List[str], minute_version: int, perfect_segregation: bool, initial_page: int) -> str:
    """
    Extract the Copom minute text from the list of text of each page of the .pdf file.

    Parameters:
     - texts_pages (List[str]): List of strings, each representing the text of a page.
     - minute_version (int): The format version of the Copom minute.
     - perfect_segregation (bool): If True, consider that the minute starts at a specific page, without unrelated content above it.
     - initial_page (int): The 1-based page number where the minute starts.

    Returns:
     - str: The extracted Copom minute text, with pages joined by newlines.
       An empty string is returned when initial_page is beyond the last page.

    Raises:
     - ValueError: If initial_page is smaller than 1.
    """
    # initial_page is 1-based. It is coerced to int because a page number read
    # from a spreadsheet may arrive as a float (e.g. 3.0), which cannot be used
    # as a slice bound.
    first_index = int(initial_page) - 1
    if first_index < 0:
        # A negative slice start would silently count pages from the END of the document.
        raise ValueError(f"initial_page must be >= 1, got {initial_page}")

    relevant_pages = texts_pages[first_index:]
    if not relevant_pages:  # Nothing to extract.
        return ""

    # Run the extraction that matches the layout of the document.
    if perfect_segregation:
        processed_pages = _extract_for_perfect_segregation(relevant_pages, minute_version)
    else:
        processed_pages = _extract_for_imperfect_segregation(relevant_pages, minute_version)

    # Join all the pages into a single string, separated by newlines.
    return "\n".join(processed_pages)

In [None]:
raw_folder_path = "./data/copom_minutes_raw"
output_folder_path = "./data/copom_minutes_texts"

# Create the output folder up front. Without it every write below fails, and
# only after the extraction of each PDF has already been done.
Path(output_folder_path).mkdir(parents=True, exist_ok=True)

# One row per minute: file title, format version, segregation flag and first page.
df_minutes_format = pd.read_excel("minutes_format.xlsx")

for _index, row in df_minutes_format.iterrows():
    # Rows flagged with NeedAdjust == 1 are skipped.
    if row.NeedAdjust == 1:
        continue

    # Spreadsheet numbers may be loaded as floats; normalise them to the types
    # the extraction functions are annotated with.
    minute_version = int(row.Version)
    perfect_segregation = bool(row.PerfSeg)
    initial_page = int(row.InitialPage)

    texts_pages = pdf_2_text(f"{raw_folder_path}/{row.Titulo}.pdf")

    minute_text = extract_minute_from_text(texts_pages, minute_version, perfect_segregation, initial_page)

    # A failed write is reported and the loop moves on to the next minute.
    try:
        with open(f"{output_folder_path}/{row.Titulo}.txt", 'w', encoding='utf-8') as f:
            f.write(minute_text)

    except IOError as e:
        print(f"[INFO] Error saving {row.Titulo}: {e}")