In [8]:
from langchain_community.document_loaders import PDFMinerLoader

FILEPATH = "data/Gross Domestic Product.pdf"

loader = PDFMinerLoader(FILEPATH)

loader.load()


[Document(page_content="Gross Domestic Product (GDP) and Its \nImpact \n\nIntroduction to GDP: \nGross Domestic Product (GDP) serves as a vital economic indicator, offering insight into the total \neconomic output of a country within a specified timeframe. It encapsulates the total market value of \nall goods and services produced within a nation's borders, encompassing consumption, investment, \ngovernment spending, and net exports. As a quantitative measure, GDP provides a foundational \nunderstanding of an economy's overall health and performance. \n\nComponents of GDP: \nGDP is dissected into several components that reflect different facets of economic activity: \n\n1.  Consumer Spending (C): This component gauges the expenditures made by households on a \nwide array of goods and services, such as durable goods (cars, appliances), non-durable \ngoods (food, clothing), and services (healthcare, entertainment). Consumer spending is \ninfluenced by factors such as disposable income, c

In [9]:
from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
from bs4 import BeautifulSoup
import re

loader = PDFMinerPDFasHTMLLoader(FILEPATH)
data = loader.load()[0]

soup = BeautifulSoup(data.page_content, "html.parser")
content = soup.find_all("div")

cur_fs = None
cur_text = ""
snippets = []  # first collect all snippets that have the same font size
for c in content:
    sp = c.find("span")
    if not sp:
        continue
    st = sp.get("style")
    if not st:
        continue
    fs = re.findall("font-size:(\d+)px", st)
    if not fs:
        continue
    fs = int(fs[0])
    if not cur_fs:
        cur_fs = fs
    if fs == cur_fs:
        cur_text += c.text
    else:
        snippets.append((cur_text, cur_fs))
        cur_fs = fs
        cur_text = c.text
snippets.append((cur_text, cur_fs))
# Note: The above logic is very straightforward. One can also add more strategies such as removing duplicate snippets (as
# headers/footers in a PDF appear on multiple pages so if we find duplicates it's safe to assume that it is redundant info)


  fs = re.findall("font-size:(\d+)px", st)


In [10]:
from langchain.docstore.document import Document

cur_idx = -1
semantic_snippets = []
# Assumption: headings have higher font size than their respective content
for s in snippets:
    # if current snippet's font size > previous section's heading => it is a new heading
    if (
        not semantic_snippets
        or s[1] > semantic_snippets[cur_idx].metadata["heading_font"]
    ):
        metadata = {"heading": s[0], "content_font": 0, "heading_font": s[1]}
        metadata.update(data.metadata)
        semantic_snippets.append(Document(page_content="", metadata=metadata))
        cur_idx += 1
        continue

    # if current snippet's font size <= previous section's content => content belongs to the same section (one can also create
    # a tree like structure for sub sections if needed but that may require some more thinking and may be data specific)
    if (
        not semantic_snippets[cur_idx].metadata["content_font"]
        or s[1] <= semantic_snippets[cur_idx].metadata["content_font"]
    ):
        semantic_snippets[cur_idx].page_content += s[0]
        semantic_snippets[cur_idx].metadata["content_font"] = max(
            s[1], semantic_snippets[cur_idx].metadata["content_font"]
        )
        continue

    # if current snippet's font size > previous section's content but less than previous section's heading than also make a new
    # section (e.g. title of a PDF will have the highest font size but we don't want it to subsume all sections)
    metadata = {"heading": s[0], "content_font": 0, "heading_font": s[1]}
    metadata.update(data.metadata)
    semantic_snippets.append(Document(page_content="", metadata=metadata))
    cur_idx += 1
semantic_snippets


[Document(page_content="Introduction to GDP: \nGross Domestic Product (GDP) serves as a vital economic indicator, offering insight into the total \neconomic output of a country within a specified timeframe. It encapsulates the total market value of \nall goods and services produced within a nation's borders, encompassing consumption, investment, \ngovernment spending, and net exports. As a quantitative measure, GDP provides a foundational \nunderstanding of an economy's overall health and performance. \nComponents of GDP: \nGDP is dissected into several components that reflect different facets of economic activity: \n1. Consumer Spending (C): This component gauges the expenditures made by households on a \nwide array of goods and services, such as durable goods (cars, appliances), non-durable \ngoods (food, clothing), and services (healthcare, entertainment). Consumer spending is \ninfluenced by factors such as disposable income, consumer sentiment, and borrowing rates. \n2. Business I