In [2]:
!pip install pymupdf
import pymupdf

pdf_1 = "/content/drive/MyDrive/Colab Notebooks/pdfparser/1.pdf"
pdf_2 = "/content/drive/MyDrive/Colab Notebooks/pdfparser/2.pdf"
pdf_3 = "/content/drive/MyDrive/Colab Notebooks/pdfparser/3.pdf"
pdf_4 = "/content/drive/MyDrive/Colab Notebooks/pdfparser/4.pdf"



In [3]:
import fitz  # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF.
    """
    doc = fitz.open(pdf_path)
    pdf_text = ''
    for page_num, page in enumerate(doc, start=1):
        page_text = page.get_text("text")
        pdf_text += f"\n\n--- Page {page_num} ---\n\n{page_text}"
    return pdf_text

In [5]:
pdf_1_text = extract_text_from_pdf(pdf_1)
print(pdf_1_text[:5000])



--- Page 1 ---

Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results
MOUNTAIN VIEW, Calif. – January 30, 2024 – Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced 
financial results for the quarter and fiscal year ended December 31, 2023.
Sundar Pichai, CEO, said: “We are pleased with the ongoing strength in Search and the growing contribution from 
YouTube and Cloud. Each of these is already benefiting from our AI investments and innovation. As we enter the 
Gemini era, the best is yet to come.”
Ruth Porat, President and Chief Investment Officer; CFO said: “We ended 2023 with very strong fourth quarter 
financial results, with Q4 consolidated revenues of $86 billion, up 13% year over year. We remain committed to our 
work to durably re-engineer our cost base as we invest to support our growth opportunities.”
Q4 2023 Financial Highlights
The following table summarizes our consolidated financial results for the quarters and years ended December 31, 
2022 and 2023 (in millio

In [6]:
print(len(pdf_1_text))

1846


In [3]:
from operator import itemgetter
import fitz
import json
import numpy as np
import math


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool

    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def get_dev(inp_data, p_size):
    print(inp_data)
    print(p_size)
    data = np.array(inp_data)
    n = len(data)
    # mean = sum(data) / n
    deviations = [(x - p_size) ** 2 for x in data]
    variance = sum(deviations) / n
    std_dev = round(math.sqrt(variance),2)
    print(std_dev)
    return std_dev

def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)
    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para

def font_tags_new(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)
    dev_font_sizes = get_dev(font_sizes, p_size)
    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = 'para'
        if size > p_size:
          if size - p_size < dev_font_sizes :
              size_tag[size] = 'para'
          else:
              if idx in (0,1):
                  size_tag[size] = 'header/title'
              else:
                size_tag[size] = 'sub-title'
        elif size < p_size:
            if p_size - size < dev_font_sizes :
                size_tag[size] = 'para'
            else:
                size_tag[size] = 's'
    return size_tag


def check_footers(blocks, size_tag):
    footers = []
    len_blocks = len(blocks)
    for index,b in enumerate(blocks[::-1]):
        if b['type'] == 0:
            for l in b["lines"]:
                for s in l["spans"]:
                    if s['text'].strip():
                        if "s" in size_tag[s['size']]:
                            footers.append(len_blocks - index - 1)
                        else:
                            return footers
                    else:
                        return footers


def headers_para_new(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: texts with pre-prended element tags
    """
    # header_para = []  # list with headers and paragraphs
    header_para_dict = {}
    first = True  # boolean operator for first header
    previous_s = {}  # previous span
    # images_found = {}
    # type_3_found = {}
    # type_4_found = {}
    for page_number, page in enumerate(doc):
        blocks = page.get_text("dict")["blocks"]
        # checking if footer is present
        footer_indexes = check_footers(blocks, size_tag)
        for index,b in enumerate(blocks):  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_size_tag = size_tag[s['size']]
                                block_string = s['text']
                            else:
                                if s['size'] == previous_s['size']:
                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_size_tag = size_tag[s['size']]
                                        if "s" in block_size_tag and index in footer_indexes:
                                            block_size_tag = "footer"
                                        elif block_size_tag in ("sub-title", "header/title") and len(s['text']) > 60:
                                            block_size_tag = "para"
                                        elif block_size_tag == "para" and len(s['text']) < 60:
                                            block_size_tag = "sub-title"
                                        block_string = s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_size_tag = size_tag[s['size']]
                                        if "s" in block_size_tag and index in footer_indexes:
                                            block_size_tag = "footer"
                                        elif block_size_tag in ("sub-title", "header/title") and len(s['text']) > 60:
                                            block_size_tag = "para"
                                        block_string = s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    if len(block_string) >= 4:
                                        if block_size_tag in header_para_dict:
                                            header_para_dict[block_size_tag].append(block_string)
                                        else:
                                          header_para_dict[block_size_tag] = [block_string]
                                    # header_para.append(block_string)
                                    block_size_tag = size_tag[s['size']]
                                    if "s" in block_size_tag and index in footer_indexes:
                                        block_size_tag = "footer"
                                    elif block_size_tag in ("sub-title", "header/title") and len(s['text']) > 60:
                                            block_size_tag = "para"
                                    block_string = s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    # block_string += "|"
                if len(block_string) >= 4:
                    if block_size_tag in header_para_dict:
                        header_para_dict[block_size_tag].append(block_string)
                    else:
                        header_para_dict[block_size_tag] = [block_string]
                # header_para.append(block_string)
            # elif b['type'] == 1:
            #     if page_number in images_found:
            #       images_found[page_number] += 1
            #     else:
            #       images_found[page_number] = 1
            # elif b['type'] == 2:
            #   if page_number in type_3_found:
            #     type_3_found[page_number] += 1
            #   else:
            #     type_3_found[page_number] = 1
            # else:
            #   if page_number in type_4_found:
            #     type_4_found[page_number] += 1
            #   else:
            #     type_4_found[page_number] = 1
    return header_para_dict

In [8]:
# document = 'nationale nederlanden.pdf'
doc = fitz.open(pdf_1)

font_counts, styles = fonts(doc, granularity=False)

size_tag = font_tags(font_counts, styles)

elements = headers_para(doc, size_tag)

print(elements)

# with open("doc.json", 'w') as json_out:
#     json.dump(elements, json_out)

['<h1>Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results|', '', '<p>MOUNTAIN VIEW, Calif. – January 30, 2024 – Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced | financial results for the quarter and fiscal year ended December 31, 2023.|', '<p>Sundar Pichai, CEO, said: “We are pleased with the ongoing strength in Search and the growing contribution from | YouTube and Cloud. Each of these is already benefiting from our AI investments and innovation. As we enter the | Gemini era, the best is yet to come.”|', '<p>Ruth Porat, President and Chief Investment Officer; CFO said: “We ended 2023 with very strong fourth quarter | financial results, with Q4 consolidated revenues of $86 billion, up 13% year over year. We remain committed to our | work to durably re-engineer our cost base as we invest to support our growth opportunities.”|', '<p>Q4 2023 Financial Highlights|', '<p>The following table summarizes our consolidated financial results for the quarters and years ended Decemb

In [10]:
import pprint

elements_json = json.dumps(elements)
pprint.pprint(elements_json)

('["<h1>Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results|", "", '
 '"<p>MOUNTAIN VIEW, Calif. \\u2013 January 30, 2024 \\u2013 Alphabet Inc. '
 '(NASDAQ: GOOG, GOOGL) today announced | financial results for the quarter '
 'and fiscal year ended December 31, 2023.|", "<p>Sundar Pichai, CEO, said: '
 '\\u201cWe are pleased with the ongoing strength in Search and the growing '
 'contribution from | YouTube and Cloud. Each of these is already benefiting '
 'from our AI investments and innovation. As we enter the | Gemini era, the '
 'best is yet to come.\\u201d|", "<p>Ruth Porat, President and Chief '
 'Investment Officer; CFO said: \\u201cWe ended 2023 with very strong fourth '
 'quarter | financial results, with Q4 consolidated revenues of $86 billion, '
 'up 13% year over year. We remain committed to our | work to durably '
 're-engineer our cost base as we invest to support our growth '
 'opportunities.\\u201d|", "<p>Q4 2023 Financial Highlights|", "<p>The '
 'following t

In [33]:
doc = fitz.open(pdf_1)

font_counts, styles = fonts(doc, granularity=False)

size_tag = font_tags_new(font_counts, styles)

elements_dict = headers_para_new(doc, size_tag)

print(elements_dict)

elements_dict_json = json.dumps(elements_dict)
pprint.pprint(elements_dict_json)

{'header/title': ['Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results'], 'para': ['MOUNTAIN VIEW, Calif. – January 30, 2024 – Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced  financial results for the quarter and fiscal year ended December 31, 2023.', 'Sundar Pichai, CEO, said: “We are pleased with the ongoing strength in Search and the growing contribution from  YouTube and Cloud. Each of these is already benefiting from our AI investments and innovation. As we enter the  Gemini era, the best is yet to come.”', 'Ruth Porat, President and Chief Investment Officer; CFO said: “We ended 2023 with very strong fourth quarter  financial results, with Q4 consolidated revenues of $86 billion, up 13% year over year. We remain committed to our  work to durably re-engineer our cost base as we invest to support our growth opportunities.”', 'Q4 2023 Financial Highlights', 'The following table summarizes our consolidated financial results for the quarters and years ended December 31,

In [34]:
doc = fitz.open(pdf_1)

font_counts, styles = fonts(doc, granularity=False)

size_tag = font_tags_new(font_counts, styles)

elements_dict = headers_para_new(doc, size_tag)

print(elements_dict)
print("----------------------------------------------------------------\n"*5)
print(elements_dict.keys())
print("----------------------------------------------------------------\n"*5)
elements_dict_json = json.dumps(elements_dict)
pprint.pprint(elements_dict_json)

[16.0, 10.0, 9.0, 6.5, 5.849999904632568]
10.0
3.65
{'header/title': ['Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results'], 'para': ['MOUNTAIN VIEW, Calif. – January 30, 2024 – Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced  financial results for the quarter and fiscal year ended December 31, 2023.', 'Sundar Pichai, CEO, said: “We are pleased with the ongoing strength in Search and the growing contribution from  YouTube and Cloud. Each of these is already benefiting from our AI investments and innovation. As we enter the  Gemini era, the best is yet to come.”', 'Ruth Porat, President and Chief Investment Officer; CFO said: “We ended 2023 with very strong fourth quarter  financial results, with Q4 consolidated revenues of $86 billion, up 13% year over year. We remain committed to our  work to durably re-engineer our cost base as we invest to support our growth opportunities.”', 'Q4 2023 Financial Highlights', 'The following table summarizes our consolidated financial r

In [29]:
doc = fitz.open(pdf_2)

font_counts, styles = fonts(doc, granularity=False)

size_tag = font_tags_new(font_counts, styles)

elements_dict = headers_para_new(doc, size_tag)

print(elements_dict)
print("----------------------------------------------------------------\n"*5)
print(elements_dict.keys())
print("----------------------------------------------------------------\n"*5)
elements_dict_json = json.dumps(elements_dict)
pprint.pprint(elements_dict_json)

[16.079999923706055, 11.039999961853027, 10.079999923706055, 9.119999885559082, 8.816884994506836, 7.919999599456787, 6.480000019073486]
7.919999599456787
3.49
{'header/title': ['Saudi Basic Industries Corporation (SABIC) '], 'para': ['Business Overview  SABIC is the largest Petrochemical company based out in Riyadh, Saudi Arabia. 70% of the company’s shares are owned by  Saudi Aramco, with the remaining 30% publicly traded on the Saudi stock exchange. The company operates through three  Strategic Business Units – Petrochemicals, Specialities, and Agri-Nutrients and caters to businesses in key end markets  such as F&B, Building & Construction, Agriculture, Consumer Goods, transportation, Industrial Automotive, Electrical and  Electronics, & Pharmaceuticals.  ', 'The key strategic regions SABIC caters to are Saudi Arabia, China, Asia Pacific, and Europe contributing 15% to 20% each  as of Q3 2023. As of 2022, the company has generated revenues amounting to SAR 198 Bn and an annual produ

In [6]:
doc = fitz.open(pdf_1)
import pprint

font_counts, styles = fonts(doc, granularity=False)

size_tag = font_tags_new(font_counts, styles)

elements_dict = headers_para_new(doc, size_tag)

print(elements_dict)

elements_dict_json = json.dumps(elements_dict)
pprint.pprint(elements_dict_json)

{'header/title': ['Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results'], 'para': ['MOUNTAIN VIEW, Calif. – January 30, 2024 – Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced  financial results for the quarter and fiscal year ended December 31, 2023.', 'Sundar Pichai, CEO, said: “We are pleased with the ongoing strength in Search and the growing contribution from  YouTube and Cloud. Each of these is already benefiting from our AI investments and innovation. As we enter the  Gemini era, the best is yet to come.”', 'Ruth Porat, President and Chief Investment Officer; CFO said: “We ended 2023 with very strong fourth quarter  financial results, with Q4 consolidated revenues of $86 billion, up 13% year over year. We remain committed to our  work to durably re-engineer our cost base as we invest to support our growth opportunities.”', 'Q4 2023 Financial Highlights', 'The following table summarizes our consolidated financial results for the quarters and years ended December 31,

In [11]:
doc = fitz.open(pdf_2)
import pprint

font_counts, styles = fonts(doc, granularity=False)

size_tag = font_tags_new(font_counts, styles)

elements_dict = headers_para_new(doc, size_tag)

# print(elements_dict)

# elements_dict_json = json.dumps(elements_dict)
# pprint.pprint(elements_dict_json)

{4: 3}
{}
{}


In [43]:
!apt install ghostscript python3-tk
!pip install camelot-py ghostscript

import camelot
import ghostscript

tables = camelot.read_pdf(pdf_1, process_background=True)
print(len(tables))
if tables:
  print(tables[0])

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ghostscript is already the newest version (9.55.0~dfsg1-0ubuntu5.9).
python3-tk is already the newest version (3.10.8-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
0


In [48]:
tables = camelot.read_pdf(pdf_1, flavor="stream", split_text=True)
print(len(tables))
# if tables:
#   print(tables[0])
for table in tables:
  print(table)

2
<Table shape=(12, 1)>
<Table shape=(12, 9)>




In [54]:
tables = camelot.read_pdf(pdf_2, flavor="stream", split_text=True)
print(len(tables))
# if tables:
#   print(tables[0])
for table in tables:
  print(table)



1
<Table shape=(28, 3)>


In [8]:
doc = fitz.open(pdf_2)
for page_number,page in enumerate(doc):
    print(f"page: {page_number}")
    images = page.get_images(full=True)
    print(len(images))
    # for image in images:
    #     print(images)
    for img_index, img in enumerate(images):
        xref = img[0]  # The image reference
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]  # Image format (e.g., 'png', 'jpeg')

        # Save the extracted image
        image_filename = f"pdf_2_page_{page_number+1}_{img_index}.{image_ext}"
        with open(image_filename, "wb") as image_file:
            image_file.write(image_bytes)
        print(f"Extracted {image_filename}")

page: 0
0
page: 1
0
page: 2
0
page: 3
0
page: 4
3
Extracted pdf_2_page_5_0.png
Extracted pdf_2_page_5_1.png
Extracted pdf_2_page_5_2.png
page: 5
0
page: 6
0


In [14]:
!pip install pdfminer.six
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTImage, LTTextBox, LTTextLine

def extract_images_from_pdf(pdf_path):
    # Iterate through the pages of the PDF
    for page_num, page_layout in enumerate(extract_pages(pdf_path)):
        for element in page_layout:
            if isinstance(element, LTImage):  # Check if the element is an image
                image_bytes = element.stream.get_data()  # Extract the image data

                # Save the extracted image
                image_filename = f"pdf_miner_pdf_2_page_{page_num+1}_{element.name}.jpg"
                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)
                print(f"Extracted {image_filename}")
            else:
                continue
extract_images_from_pdf(pdf_2)



In [52]:
doc = fitz.open(pdf_3)
for page_number,page in enumerate(doc):
    print(f"page: {page_number}")
    images = page.get_images()
    print(len(images))
    for image in images:
        print(images)


page: 0
1
[(10, 0, 568, 150, 8, 'DeviceRGB', '', 'Image10', 'DCTDecode')]
page: 1
0
page: 2
0
page: 3
0
page: 4
0
page: 5
0
page: 6
0
page: 7
0


In [53]:
doc = fitz.open(pdf_4)
for page_number,page in enumerate(doc):
    print(f"page: {page_number}")
    images = page.get_images()
    print(len(images))
    for image in images:
        print(images)


page: 0
1
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode')]
page: 1
1
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode')]
page: 2
1
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode')]
page: 3
1
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode')]
page: 4
2
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode'), (31, 0, 1343, 976, 8, 'DeviceRGB', '', 'Im2', 'FlateDecode')]
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode'), (31, 0, 1343, 976, 8, 'DeviceRGB', '', 'Im2', 'FlateDecode')]
page: 5
2
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode'), (15, 0, 1338, 1160, 8, 'DeviceRGB', '', 'Im3', 'DCTDecode')]
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode'), (15, 0, 1338, 1160, 8, 'DeviceRGB', '', 'Im3', 'DCTDecode')]
page: 6
2
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode'), (108, 0, 1329, 1240, 8, 'DeviceRGB', '', 'Im4', 'DCTDecode')]
[(95, 0, 75, 71, 1, '', '', 'Im1', 'CCITTFaxDecode'), (108, 0, 1329, 1240, 8, 'DeviceRGB', '', 'Im4', 'DCTDecode')]
page: 7
3
[(95, 0, 7

In [5]:
!pip install pdfplumber
import pdfplumber
import pandas as pd

def extract_tables_from_pdf_with_pdfplumber(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            tables = page.extract_tables()
            for i, table in enumerate(tables):
                # Convert the table to a DataFrame
                df = pd.DataFrame(table)
                print(f"page_num: {page_num}, table_num: {i}")
                print(df.head())



In [6]:
extract_tables_from_pdf_with_pdfplumber(pdf_1)

page_num: 0, table_num: 0
                                                   0
0     Revenues $ 76,048 $ 86,310 $ 282,836 $ 307,394
1  Change in revenues year over year 1 % 13 % 10 ...
2  Change in constant currency revenues year over...
3                                                   
4  Operating income $ 18,160 $ 23,697 $ 74,842 $ ...


In [7]:
extract_tables_from_pdf_with_pdfplumber(pdf_2)

page_num: 1, table_num: 0
      0 1
0  85.5  
page_num: 4, table_num: 0
                          0     1     2     3     4     5             6   \
0  Income Statement (SAR MM)  2018  2019  2020  2021  2022  LTM - Sep 23   

     7     8     9     10    11  
0  2023  2024  2025  2026  2027  
page_num: 4, table_num: 1
       0      1      2      3
0    0.1      -      -      -
1  (1.4)  (1.3)  (1.2)  (1.1)
page_num: 5, table_num: 0
                       0     1     2     3     4     5       6     7     8   \
0  Balance Sheet (SAR MM)  2018  2019  2020  2021  2022  Sep-23  2023  2024   

     9     10    11  
0  2025  2026  2027  
page_num: 5, table_num: 1
       0      1      2      3
0  122.8  117.6  110.1  101.9
1    7.8    8.8   10.0   11.1
2   19.4   18.4   17.5   16.5
page_num: 5, table_num: 2
     0    1    2    3
0    -    -    -    -
1  3.6  3.3  2.9  2.5
2  1.2  1.3  1.5  1.7
page_num: 5, table_num: 3
      0     1     2     3
0  23.3  20.9  18.6  16.3
1   6.6   7.5   8.5   9.

In [8]:
extract_tables_from_pdf_with_pdfplumber(pdf_3)

page_num: 1, table_num: 0
     0
0  31%
1  62%
2   8%
page_num: 1, table_num: 1
     0
0  10%
1  77%
2  13%
page_num: 1, table_num: 2
     0
0  10%
1  60%
2  30%
page_num: 1, table_num: 3
     0
0  23%
1  62%
2  15%
page_num: 2, table_num: 0
      0     1
0        None
1  None      
page_num: 2, table_num: 1
      0     1
0  None      
1        None
page_num: 2, table_num: 2
      0     1
0  None      
1        None
page_num: 2, table_num: 3
      0     1     2
0        None  None
1  None  None      
2  None        None
page_num: 2, table_num: 4
      0     1     2
0        None  None
1  None  None      
2  None        None
page_num: 2, table_num: 5
  0 1
0    
page_num: 2, table_num: 6
      0     1     2
0  None  None      
1              None
page_num: 3, table_num: 0
      0     1     2
0     1  None  None
1  None        None
2  None  None      
page_num: 3, table_num: 1
      0     1     2
0  None     1  None
1        None  None
2  None  None      
page_num: 3, table_num: 2
      

In [17]:
!pip install tabula-py
import tabula

def extract_tables_from_pdf(pdf_path, pages="all"):
    # Extract all tables from the PDF
    tables = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True)

    for i, table in enumerate(tables):
        # Save each table as a CSV file
        # table.to_csv(f"{output_folder}/table_{i+1}.csv", index=False)
        # print(f"Extracted table {i+1} to {output_folder}/table_{i+1}.csv")
        print(f"Found table number {i}")
        print(table.head())





In [12]:
extract_tables_from_pdf(pdf_1)

Found table number 0
            Revenues$76,048$86,310$ 282,836$ 307,394
0    Change in revenues year over year1 %13 %10 %9 %
1  Change in constant currency revenues year over...
2                                                NaN
3       Operating income$18,160$23,697$74,842$84,293
4                   Operating margin24 %27 %26 %27 %


In [13]:
extract_tables_from_pdf(pdf_2)

Found table number 0
              Key Financials Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3  \
0  Income Statement (SAR MM)       2018       2019       2020       2021   
1                    Revenue      169.1      135.4      116.9      174.9   
2               YoY Growth %        NaN    (19.9%)    (13.6%)      49.5%   
3                       COGS    (111.3)    (106.0)     (94.3)    (127.0)   
4               Gross Profit       57.8       29.4       22.6       47.9   

  Unnamed: 4    Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9  \
0       2022  LTM - Sep 23       2023       2024       2025       2026   
1      198.5         161.4      145.7      149.5      160.6      172.0   
2      13.5%       (18.7%)     (9.7%)       2.6%       7.4%       7.1%   
3    (155.8)       (136.9)    (131.7)    (135.6)    (144.1)    (150.6)   
4       42.7          24.6       14.0       13.9       16.5       21.4   

  Unnamed: 10  
0        2027  
1       179.0  
2        4.0%  
3     (154.4)

In [22]:
extract_tables_from_pdf(pdf_2, pages=(5,))

Found table number 0
              Key Financials Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3  \
0  Income Statement (SAR MM)       2018       2019       2020       2021   
1                    Revenue      169.1      135.4      116.9      174.9   
2               YoY Growth %        NaN    (19.9%)    (13.6%)      49.5%   
3                       COGS    (111.3)    (106.0)     (94.3)    (127.0)   
4               Gross Profit       57.8       29.4       22.6       47.9   

  Unnamed: 4    Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9  \
0       2022  LTM - Sep 23       2023       2024       2025       2026   
1      198.5         161.4      145.7      149.5      160.6      172.0   
2      13.5%       (18.7%)     (9.7%)       2.6%       7.4%       7.1%   
3    (155.8)       (136.9)    (131.7)    (135.6)    (144.1)    (150.6)   
4       42.7          24.6       14.0       13.9       16.5       21.4   

  Unnamed: 10  
0        2027  
1       179.0  
2        4.0%  
3     (154.4)

# Package the code here

In [26]:
!pip install pymupdf
!pip install tabula-py

from operator import itemgetter
import fitz
import json
import numpy as np
import math
import tabula
import re
import pprint



In [43]:
pattern_heading = "^\d+[)\.\sa-zA-Z0-9]+\\n"  # Like 1. Header, 1) Header, 1 header, etc
pattern_subheading = "^(\d+\.\d+[\)\.]?\s*)(.+)\n"  # Like 1.1 Header, 1.2) Header, 1.4. header, etc
def extract_tables_from_pdf(pdf_path, pages="all"):
    # Extract all tables from the PDF
    tables = []
    extracted_tables = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True)
    if extracted_tables:
      tables = extracted_tables
    return tables

def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool

    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def get_dev(inp_data, p_size):
    print(inp_data)
    print(p_size)
    data = np.array(inp_data)
    n = len(data)
    # mean = sum(data) / n
    deviations = [(x - p_size) ** 2 for x in data]
    variance = sum(deviations) / n
    std_dev = round(math.sqrt(variance),2)
    print(std_dev)
    return std_dev


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)
    dev_font_sizes = get_dev(font_sizes, p_size)
    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = 'para'
        if size > p_size:
          if size - p_size < dev_font_sizes :
              size_tag[size] = 'para'
          else:
              if idx in (0,1):
                  size_tag[size] = 'header/title'
              else:
                size_tag[size] = 'sub-title'
        elif size < p_size:
            if p_size - size < dev_font_sizes :
                size_tag[size] = 'para'
            else:
                size_tag[size] = 's'
    return size_tag


def check_footers(blocks, size_tag):
    footers = []
    len_blocks = len(blocks)
    for index,b in enumerate(blocks[::-1]):
        if b['type'] == 0:
            for l in b["lines"]:
                for s in l["spans"]:
                    if s['text'].strip():
                        if "s" in size_tag[s['size']]:
                            footers.append(len_blocks - index - 1)
                        else:
                            return footers
                    else:
                        return footers


def extract_elements(doc, size_tag, doc_name, doc_path):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: texts with pre-prended element tags
    """
    elements = []
    first = True  # boolean operator for first header
    previous_s = {}  # previous span
    for page_number, page in enumerate(doc):

        # Extracting images
        images = page.get_images(full=True)
        for img_index, img in enumerate(images):
            xref = img[0]  # The image reference
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]  # Image format (e.g., 'png', 'jpeg')

            # Save the extracted image
            image_filename = f"{doc_name}_page_{page_number+1}_image_{img_index}.{image_ext}"
            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)
            element= {"type": "image",
                       "data": image_filename,
                       "page": page_number + 1}
            elements.append(element)

        # Extracting tables
        extracted_tables = extract_tables_from_pdf(doc_path, pages=(page_number + 1,))
        for i, table in enumerate(extracted_tables):
            # Save each table as a CSV file
            table.to_csv(f"{doc_name}_page_{page_number+1}_table_{i+1}.csv", index=False)
            # print(f"Extracted table {i+1}")
            element= {"type": "table",
                       "data": f"{doc_name}_page_{page_number+1}_table_{i+1}.csv",
                       "page": page_number + 1}
            elements.append(element)

        # Extracting text elements
        blocks = page.get_text("dict")["blocks"]
        # checking if footer is present
        footer_indexes = check_footers(blocks, size_tag)
        for index,b in enumerate(blocks):  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text
                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_size_tag = size_tag[s['size']]
                                block_string = s['text']
                            else:
                                if s['size'] == previous_s['size']:
                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_size_tag = size_tag[s['size']]
                                        if "s" in block_size_tag and index in footer_indexes:
                                            block_size_tag = "footer"
                                        elif block_size_tag in ("sub-title", "header/title"):
                                            if len(s['text']) > 60:
                                                block_size_tag = "para"
                                            elif re.match(pattern_subheading, s['text']):
                                                block_size_tag = "sub-heading"
                                            elif re.match(pattern_heading, s['text']):
                                                block_size_tag = "heading"
                                            else:
                                                block_size_tag = "para"
                                        elif block_size_tag == "para" and len(s['text']) < 60:
                                            block_size_tag = "sub-title"
                                        block_string = s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_size_tag = size_tag[s['size']]
                                        if "s" in block_size_tag and index in footer_indexes:
                                            block_size_tag = "footer"
                                        elif block_size_tag in ("sub-title", "header/title"):
                                            if len(s['text']) > 60:
                                                block_size_tag = "para"
                                            elif re.match(pattern_subheading, s['text']):
                                                block_size_tag = "sub-heading"
                                            elif re.match(pattern_heading, s['text']):
                                                block_size_tag = "heading"
                                            else:
                                                block_size_tag = "para"
                                        elif block_size_tag == "para" and len(s['text']) < 60:
                                            block_size_tag = "other"
                                        block_string = s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    if len(block_string) >= 4:
                                        element = {"type": block_size_tag,
                                                   "data": block_string,
                                                   "page": page_number + 1}
                                        elements.append(element)
                                        # if block_size_tag in element_dict:
                                        #     element_dict[block_size_tag].append(block_string)
                                        # else:
                                        #   element_dict[block_size_tag] = [block_string]
                                    # header_para.append(block_string)
                                    block_size_tag = size_tag[s['size']]
                                    if "s" in block_size_tag and index in footer_indexes:
                                            block_size_tag = "footer"
                                    elif block_size_tag in ("sub-title", "header/title"):
                                        if len(s['text']) > 60:
                                            block_size_tag = "para"
                                        elif re.match(pattern_subheading, s['text']):
                                            block_size_tag = "sub-heading"
                                        elif re.match(pattern_heading, s['text']):
                                            block_size_tag = "heading"
                                        else:
                                            block_size_tag = "para"
                                    elif block_size_tag == "para" and len(s['text']) < 60:
                                        block_size_tag = "other"
                                    block_string = s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    # block_string += "|"
                if len(block_string) >= 4:
                    element = {"type": block_size_tag,
                                "data": block_string,
                                "page": page_number + 1}
                    elements.append(element)
    return elements

In [44]:
doc = fitz.open(pdf_1)
font_counts, styles = fonts(doc, granularity=False)
size_tag = font_tags(font_counts, styles)
elements = extract_elements(doc, size_tag, "pdf_1", pdf_1)
print(len(elements))
print(elements)

# print(elements_dict)

# elements_dict_json = json.dumps(elements_dict)
# pprint.pprint(elements_dict_json)



[16.0, 10.0, 9.0, 6.5, 5.849999904632568]
10.0
3.65
23
[{'type': 'table', 'data': 'pdf_1_page_1_table_1.csv', 'page': 1}, {'type': 'header/title', 'data': 'Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results', 'page': 1}, {'type': 'para', 'data': 'MOUNTAIN VIEW, Calif. – January 30, 2024 – Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced  financial results for the quarter and fiscal year ended December 31, 2023.', 'page': 1}, {'type': 'para', 'data': 'Sundar Pichai, CEO, said: “We are pleased with the ongoing strength in Search and the growing contribution from  YouTube and Cloud. Each of these is already benefiting from our AI investments and innovation. As we enter the  Gemini era, the best is yet to come.”', 'page': 1}, {'type': 'para', 'data': 'Ruth Porat, President and Chief Investment Officer; CFO said: “We ended 2023 with very strong fourth quarter  financial results, with Q4 consolidated revenues of $86 billion, up 13% year over year. We remain committed to our  w

In [45]:
print(json.dumps(elements, indent=4))

[
    {
        "type": "table",
        "data": "pdf_1_page_1_table_1.csv",
        "page": 1
    },
    {
        "type": "header/title",
        "data": "Alphabet Announces Fourth Quarter and Fiscal Year 2023 Results",
        "page": 1
    },
    {
        "type": "para",
        "data": "MOUNTAIN VIEW, Calif. \u2013 January 30, 2024 \u2013 Alphabet Inc. (NASDAQ: GOOG, GOOGL) today announced  financial results for the quarter and fiscal year ended December 31, 2023.",
        "page": 1
    },
    {
        "type": "para",
        "data": "Sundar Pichai, CEO, said: \u201cWe are pleased with the ongoing strength in Search and the growing contribution from  YouTube and Cloud. Each of these is already benefiting from our AI investments and innovation. As we enter the  Gemini era, the best is yet to come.\u201d",
        "page": 1
    },
    {
        "type": "para",
        "data": "Ruth Porat, President and Chief Investment Officer; CFO said: \u201cWe ended 2023 with very strong fourt