In [1]:
from pdf_ingest import PdfIngest
import json
from io import StringIO
from lxml import etree
from IPython.core.display import HTML
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain.chains.summarize import load_summarize_chain

In [2]:
def print_elements(els: list) -> None:
    """Print one line per element in the form ``TYPE :  text``.

    Args:
        els: Iterable of element objects. Each must expose ``to_dict()``
            returning a mapping with a ``'type'`` key, and a ``text``
            attribute.
    """
    # Iterate over the argument rather than the notebook-global `test`, so
    # the function works on any element list (e.g. a filtered subset) and
    # does not depend on hidden kernel state.
    for el in els:
        print(el.to_dict()['type'].upper(), ': ', el.text)

In [3]:
def print_element_json(element):
    """Pretty-print a single element's dictionary form as JSON.

    Args:
        element: Object exposing ``to_dict()``; the returned mapping is
            serialized with a two-space indent and written to stdout.
    """
    element_dict = element.to_dict()
    rendered = json.dumps(element_dict, indent=2)
    print(rendered)

In [4]:
def print_table_html(table_html:str) -> None:
    """Pretty-print a table's HTML markup to stdout.

    The markup is read through lxml's XML parser with blank text between
    tags removed, then re-serialized with ``pretty_print=True``.

    Args:
        table_html: Markup string to format.
            NOTE(review): parsed as XML, so it presumably needs to be
            well-formed — confirm for the tables being passed in.
    """
    xml_parser = etree.XMLParser(remove_blank_text=True)
    document = etree.parse(StringIO(table_html), xml_parser)
    pretty_bytes = etree.tostring(document, pretty_print=True)
    print(pretty_bytes.decode())

In [26]:
def summarize_table(table_html, llm=None):
    """Summarize an HTML table with an LLM "stuff" summarization chain.

    Args:
        table_html: HTML markup of the table; it is wrapped in a single
            ``Document`` and handed to the chain.
        llm: Optional chat model to reuse across calls. When omitted, a
            ``ChatOpenAI`` client (``gpt-3.5-turbo-1106``, temperature 0)
            is created for this call, which matches the previous behavior.

    Returns:
        The chain's ``'output_text'`` value.
    """
    # Building a client per call is wasteful when summarizing many tables
    # in a loop; callers may pass a shared one instead.
    if llm is None:
        llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
    chain = load_summarize_chain(llm, chain_type='stuff')
    out_dict = chain.invoke([Document(page_content=table_html)])
    return out_dict['output_text']

In [6]:
# Location of the PDF ingested by the cells below (relative path).
file_path = "./data/pdf/KEM_T2018_T528.pdf"

In [7]:
# Create the ingestion helper for the PDF; later cells call its
# pdf2elements / clean_elements / elements2chunks / print_chunks methods.
test = PdfIngest(file_path)

In [8]:
# Run the PDF-to-elements step (presumably fills `test.elements`, which the
# following cells read). The recorded output shows the
# microsoft/table-transformer-structure-recognition checkpoint being loaded.
test.pdf2elements()

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Run PdfIngest's element clean-up step before inspecting `test.elements`.
# NOTE(review): what is cleaned is defined in pdf_ingest, not visible here.
test.clean_elements()

In [10]:
# Quick scan of the extracted elements, one "TYPE :  text" line each.
print_elements(test.elements)

UNCATEGORIZEDTEXT :  KEMET Organic Capacitor (KO-CAP®) – Low ESR T528 Low ESL Polymer Electrolytic for CPU/GPU Decoupling
IMAGE :  KEMET 
NARRATIVETEXT :  a  company
TITLE :  Overview
TITLE :  KO-CAP EIN
NARRATIVETEXT :  The KEMET Organic Capacitor (KO-CAP) is a solid electrolytic capacitor with a conductive polymer cathode capable of delivering very low ESR and improved capacitance retention at high frequencies. KO-CAP combines the low ESR of multilayer ceramic, the high capacitance of aluminum electrolytic, and the volumetric efficiency of tantalum into a single surface mount package. Unlike liquid electrolyte-based capacitors, KO-CAP has a very long operational life and high ripple current capabilities.
NARRATIVETEXT :  The T528 low ESL Facedown Terminal Polymer Electrolytic combines ultra-low ESR and high capacitance in a package design that offers the lowest ESL in the market. This series offers exceptional performance for high-speed microprocessor, FPGA, or ASIC decoupling design

In [11]:
# Dump every element's full dictionary (type, element_id, text, metadata)
# as indented JSON. This prints one JSON document per element.
for el in test.elements:
    print_element_json(el)

{
  "type": "UncategorizedText",
  "element_id": "b2d5e9c22175d09c917c9a91abcfe388",
  "text": "KEMET Organic Capacitor (KO-CAP\u00ae) \u2013 Low ESR T528 Low ESL Polymer Electrolytic for CPU/GPU Decoupling",
  "metadata": {
    "coordinates": {
      "points": [
        [
          100.0,
          99.73309999999995
        ],
        [
          100.0,
          198.55000000000018
        ],
        [
          1232.695000000001,
          198.55000000000018
        ],
        [
          1232.695000000001,
          99.73309999999995
        ]
      ],
      "system": "PixelSpace",
      "layout_width": 1700,
      "layout_height": 2200
    },
    "last_modified": "2024-06-29T16:44:09",
    "filetype": "application/pdf",
    "languages": [
      "eng"
    ],
    "page_number": 1,
    "file_directory": "./data/pdf",
    "filename": "KEM_T2018_T528.pdf"
  }
}
{
  "type": "Image",
  "element_id": "b633da905202d83164752aec640123f2",
  "text": "KEMET ",
  "metadata": {
    "detection_cla

In [17]:
# Keep every element except those whose category is 'Image'.
element_wo_images = [
    element
    for element in test.elements
    if element.category != 'Image'
]

In [21]:
# Number of elements left after dropping the Image elements.
len(element_wo_images)

182

In [22]:
# Convert the elements into chunks (read back via `test.chunks` below).
# NOTE(review): `element_wo_images` is not passed in, so the Image filter
# above does not appear to affect chunking — confirm this is intended.
test.elements2chunks()

In [23]:
# Print the chunks; in the recorded output they are separated by dashed rules.
test.print_chunks()

KEMET Organic Capacitor (KO-CAP®) – Low ESR T528 Low ESL Polymer Electrolytic for CPU/GPU Decoupling

KEMET 

a  company


--------------------------------------------------------------------------------
Overview

KO-CAP EIN

The KEMET Organic Capacitor (KO-CAP) is a solid electrolytic capacitor with a conductive polymer cathode capable of delivering very low ESR and improved capacitance retention at high frequencies. KO-CAP combines the low ESR of multilayer ceramic, the high capacitance of aluminum electrolytic, and the volumetric efficiency of tantalum into a single surface mount package. Unlike liquid electrolyte-based capacitors, KO-CAP has a very long operational life and high ripple current capabilities.

The T528 low ESL Facedown Terminal Polymer Electrolytic combines ultra-low ESR and high capacitance in a package design that offers the lowest ESL in the market. This series offers exceptional performance for high-speed microprocessor, FPGA, or ASIC decoupling designs. The T528

In [24]:
# Select only the chunks whose category is 'Table'.
table_chunks = [
    chunk
    for chunk in test.chunks
    if chunk.category == 'Table'
]

In [25]:
# Inspect the selection; the recorded repr lists both Table and TableChunk objects.
table_chunks

[<unstructured.documents.elements.Table at 0x71ced85ab4d0>,
 <unstructured.documents.elements.Table at 0x71ced85aa7b0>,
 <unstructured.documents.elements.Table at 0x71ced85ab7a0>,
 <unstructured.documents.elements.Table at 0x71ced85abad0>,
 <unstructured.documents.elements.Table at 0x71ced85abc80>,
 <unstructured.documents.elements.Table at 0x71cebe9d4650>,
 <unstructured.documents.elements.TableChunk at 0x71cebe9d4a10>,
 <unstructured.documents.elements.TableChunk at 0x71ced85a3680>,
 <unstructured.documents.elements.Table at 0x71ced859ebd0>,
 <unstructured.documents.elements.Table at 0x71cebe9d5040>,
 <unstructured.documents.elements.Table at 0x71cebe9d5460>,
 <unstructured.documents.elements.Table at 0x71cebe9d57f0>,
 <unstructured.documents.elements.Table at 0x71cebe9d5d30>,
 <unstructured.documents.elements.Table at 0x71cebe9d6390>,
 <unstructured.documents.elements.Table at 0x71cebe9d6840>,
 <unstructured.documents.elements.Table at 0x71ced85ab440>,
 <unstructured.documents.eleme

In [27]:
# Summarize each table chunk from its HTML rendering and print the result.
# Every iteration goes through summarize_table(), i.e. one LLM chain
# invocation per table.
for ch in table_chunks:
    summary = summarize_table(ch.metadata.text_as_html)
    print(summary, '\n')

The table provides information about different specifications for a tantalum capacitor, including the capacitor class, case size, capacitance code, capacitance tolerance, rated voltage, termination finish, failure rate/design, ESR code, and packaging. It also explains the codes and numbers used to represent these specifications. 

The table provides performance characteristics for a specific item, including operating temperature, capacitance range, capacitance tolerance, rated voltage range, DF, ESR, and leakage current. The item has an operating temperature range of -55°C to 105°C, a capacitance range of 150 - 470 pF at 120 Hz/25°C, and a capacitance tolerance of M Tolerance (20%). The rated voltage range is 2-6.3V, with a DF of <10% at 120 Hz and an ESR specified at 100 kHz. The leakage current is <0.1 CV (pA) at rated voltage after 5 minutes. 

The table provides specifications for the performance of a component at various temperature, voltage, and environmental conditions. It inclu

In [None]:
# Render one table's HTML inline in the notebook.
# NOTE(review): index 7 is hard-coded; in the recorded `table_chunks` listing
# that slot is a TableChunk — confirm it is the intended table.
HTML(table_chunks[7].metadata.text_as_html)

In [None]:
# Ad-hoc run of the same steps as summarize_table() on a single table, but
# also printing the 'input_documents' entry of the chain result so the
# input can be inspected next to the summary.
# NOTE(review): index 6 is hard-coded — confirm it is the intended table.
table_html = table_chunks[6].metadata.text_as_html
llm = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo-1106')
chain = load_summarize_chain(llm, chain_type='stuff')
out_dict = chain.invoke([Document(page_content=table_html)])
print(out_dict['input_documents'])
print(out_dict['output_text'])