The data used for the **Proof of Concept (PoC)** of the Timeline Generation idea is sourced from the **FDI Moot, an international competition focused on investment arbitration** that simulates investment treaty arbitration proceedings.

The preprocessing of the FDI Moot document involves **separating the concatenated document into individual files to simulate a real-life user experience**, where users typically upload documents separately rather than as a single concatenated file.

In [1]:
import os
import fitz

In [2]:
# File name of the source PDF and its path inside the shared data directory.
pdf_filename = 'fdi_moot_case_2024.pdf'
pdf_file_path = os.path.join('../Data/pdfs', pdf_filename)

In [3]:
def extract_table_of_contents(pdf_path):
    """
    Extracts the text of the second and third pages of a PDF document to capture the table of contents.

    Pages are addressed by 0-based index, so the indices read are 1 and 2; the
    first page (index 0) is skipped. Presumably the first page is a cover page
    in the source document -- confirm against the PDF being processed.

    Args:
        pdf_path (str): The file path to the PDF document.

    Returns:
        str: The concatenated text of the second and third pages, in page order.

    Note:
        The document must have at least three pages; otherwise the page lookup
        fails (PyMuPDF is expected to raise for an out-of-range page index).
    """
    with fitz.open(pdf_path) as pdf_document:
        # Join once instead of growing a string with += inside the loop.
        return "".join(
            pdf_document[page_index].get_text() for page_index in range(1, 3)
        )

In [4]:
# Read the table-of-contents text and break it into lines, discarding the
# first two lines and the final line of the extracted text.
table_of_contents = extract_table_of_contents(pdf_file_path).split('\n')[2:-1]

In [5]:
# Collect the strictly increasing page numbers found in the table-of-contents lines.
table_of_contents_pages = []
for line in table_of_contents:
    tokens = line.split(' ')
    # The page number is read from the second-to-last space-separated token.
    # Lines with fewer than two tokens (e.g. blank lines) cannot carry one and
    # would otherwise raise IndexError on the [-2] lookup.
    if len(tokens) < 2:
        continue
    candidate = tokens[-2]
    # isdecimal() instead of isdigit(): isdigit() also accepts characters such
    # as superscript digits, which int() rejects with ValueError.
    if not candidate.isdecimal():
        continue
    page_number = int(candidate)
    # Keep a number only if it is larger than the last one kept, so the
    # resulting list is strictly increasing.
    if not table_of_contents_pages or page_number > table_of_contents_pages[-1]:
        table_of_contents_pages.append(page_number)

In [6]:
def split_pdf_by_page_ranges(pdf_path, page_ranges):
    """
    Splits a PDF into multiple PDFs at the given start pages.

    Each part is saved as `../Data/<name>/<name>_part_<k>.pdf`, where `<name>` is
    the input file name without its extension. The output folder is created if
    it does not exist.

    Args:
        pdf_path (str): The file path to the input PDF.
        page_ranges (list): A flat list of 1-indexed page numbers, each marking the first
                            page of a part. A part runs from its start page up to, but not
                            including, the next entry in the list. The list passed in is
                            not modified.

    Raises:
        ValueError: If the input PDF has zero pages, if `page_ranges` is empty, or if it
                    contains a page number smaller than 1.

    Note:
        If the last entry is not `page_count + 1`, that value is added (to an internal copy)
        as an end marker so that the final part extends to the last page of the PDF.
        A pair of consecutive entries that does not increase is skipped; a skipped pair
        still uses up its part number, so part numbers may have gaps.

    Example:
        For a 15-page PDF and `page_ranges` of [1, 6, 11], the PDF is split into three parts:
        - Part 1 will contain pages 1 to 5.
        - Part 2 will contain pages 6 to 10.
        - Part 3 will contain pages 11 to 15.
    """
    with fitz.open(pdf_path) as pdf_document:
        page_count = len(pdf_document)
        if page_count == 0:
            raise ValueError("Cannot split PDF with zero pages")

        # Work on a copy so the caller's list is left untouched; this also makes
        # re-running the function with the same list behave identically.
        boundaries = list(page_ranges)
        if not boundaries:
            raise ValueError("page_ranges must contain at least one start page")
        if any(page < 1 for page in boundaries):
            raise ValueError("page_ranges must contain 1-indexed page numbers (>= 1)")

        # End marker: one past the last page, so the final part reaches the end.
        if boundaries[-1] != page_count + 1:
            boundaries.append(page_count + 1)

        folder_name = os.path.splitext(os.path.basename(pdf_path))[0]
        folder_path = f'../Data/{folder_name}'
        # makedirs also creates a missing parent folder and tolerates an existing one.
        os.makedirs(folder_path, exist_ok=True)

        for i, (start_page, end_page) in enumerate(zip(boundaries, boundaries[1:])):
            if end_page <= start_page:
                continue  # Skip empty or decreasing page ranges
            output_pdf_path = f'{folder_path}/{folder_name}_part_{i + 1}.pdf'
            # The context manager closes the output document once it is saved.
            with fitz.open() as new_pdf:
                # Convert the 1-indexed half-open range [start_page, end_page)
                # to the 0-indexed inclusive range insert_pdf expects.
                new_pdf.insert_pdf(
                    pdf_document,
                    from_page=start_page - 1,
                    to_page=end_page - 2,
                )
                new_pdf.save(output_pdf_path)

In [7]:
# Split the source PDF at the page numbers collected from the table of contents.
split_pdf_by_page_ranges(pdf_file_path, page_ranges=table_of_contents_pages)