<a href="https://colab.research.google.com/github/nguyenkien1402/llamaindex-practices/blob/main/unstructureio/pdf_with_tables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
from google.colab import drive
MOUNTPOINT = '/content/gdrive'
DATADIR = os.path.join(MOUNTPOINT, 'MyDrive', 'llm-poc')
drive.mount(MOUNTPOINT)

Mounted at /content/gdrive


# Install libraries

In [None]:
!pip install "unstructured[all-docs]" llama-index==0.8.49 chromadb==0.4.14 transformers

In [None]:
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [None]:
!pip install paddleocr

In [11]:
!git clone https://github.com/Belval/pdf2image

Cloning into 'pdf2image'...
remote: Enumerating objects: 892, done.[K
remote: Counting objects: 100% (265/265), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 892 (delta 238), reused 211 (delta 210), pack-reused 627[K
Receiving objects: 100% (892/892), 4.69 MiB | 11.20 MiB/s, done.
Resolving deltas: 100% (528/528), done.


In [13]:
!pip install poppler-utils
!sudo apt-get install -y poppler-utils
!sudo apt-get install python-poppler

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package python-poppler


In [9]:
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 18 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 3s (1,729 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

## Import modules

In [27]:
from unstructured.partition.pdf import partition_pdf
from pydantic import BaseModel
from typing import Any, Optional

In [16]:
# Get elements
raw_pdf_elements = partition_pdf(filename="gdrive/MyDrive/llm-doc/data/The World's Billionaires - Wikipedia.pdf",
                                 # Unstructured first finds embedded image blocks
                                 extract_images_in_pdf=False,
                                 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
                                 # Titles are any sub-section of the document
                                 infer_table_structure=True,
                                 # Post processing to aggregate text once we have the title
                                 chunking_strategy="by_title",
                                 # Chunking params to aggregate text blocks
                                 # Attempt to create a new chunk 3800 chars
                                 # Attempt to keep chunks > 2000 chars
                                #  max_characters=4000,
                                #  new_after_n_chars=3800,
                                #  combine_text_under_n_chars=2000
                                 )

In [18]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 248,
 "<class 'unstructured.documents.elements.Table'>": 12,
 "<class 'unstructured.documents.elements.TableChunk'>": 58}

In [23]:
tables = [el for el in raw_pdf_elements if el.category == "Table"]
tables_chunk = [el for el in raw_pdf_elements if el.category == "TableChunk"]

print(tables[0].text)
print(tables[2].metadata.text_as_html)

Publication details Publisher Whale Media Investments Forbes family Publication Forbes March 1987[1] First published Latest publication April 4, 2023 Current list details (2023)[2] Wealthiest Bernard Arnault Net worth (1st) US$211 billion Number of billionaires 2,640 (from 2668) Total list net worth value US$12.2 trillion (from US$ 12.7 trillion) Number of women 337 New members to the list 150
<table><thead><th>1A</th><th>Bernard Arnault &amp; | family</th><th>$211 billion &amp;</th><th>74</th><th>Bf § France</th><th>LVMH</th></thead><tr><td>2</td><td>| Elon Musk</td><td>$180 billion W</td><td>54</td><td>= United States</td><td>Tesla, SpaceX</td></tr><tr><td>3</td><td>| Jeff Bezos</td><td>$114 billion W</td><td>59</td><td>BS United States</td><td>Amazon</td></tr><tr><td>4a |</td><td>Larry Ellison</td><td>$107 billion &amp;</td><td>78</td><td>5 United States</td><td>Oracle Corporation</td></tr><tr><td>5</td><td>| Warren Buffett</td><td>$106 billion W</td><td>92</td><td>= United States</

In [28]:
class Element(BaseModel):
    type: str
    text: Any

# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

70
248
