In [1]:
import os
import re
from html.parser import HTMLParser
from io import BytesIO

from langchain.prompts import ChatPromptTemplate
from langchain.schema.document import Document
from langchain_community.llms.ollama import Ollama
from langchain_core.output_parsers import StrOutputParser
from unstructured.partition.pdf import partition_pdf
# from unstructured.staging.base import elements_from_base64_gzipped_json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the Tesseract OCR binaries (Windows install location).
TESSERACT_DIR = r"C:\Program Files\Tesseract-OCR"


def ensure_on_path(directory: str) -> None:
    """Append ``directory`` to the PATH environment variable if it is not already listed.

    The check makes the cell safe to re-run: the original ``+=`` grew PATH by one
    duplicate entry on every execution. A missing PATH variable is treated as empty
    instead of raising ``KeyError``.

    Args:
        directory: Directory to make discoverable through PATH.
    """
    current = os.environ.get("PATH", "")
    if directory in current.split(os.pathsep):
        return
    # os.pathsep is ";" on Windows, so the resulting value matches the original there.
    os.environ["PATH"] = f"{current}{os.pathsep}{directory}" if current else directory


ensure_on_path(TESSERACT_DIR)

def clean_text(text):
    """Insert a space at boundaries where words and numbers were run together.

    Three substitutions are applied one after another, each over the result of
    the previous one:

    1. a lowercase letter directly followed by an uppercase letter,
    2. a digit directly followed by a letter,
    3. a letter directly followed by a digit.

    Args:
        text: The string to process.

    Returns:
        The string with a single space inserted at every matched boundary.
    """
    boundary_patterns = (
        r'([a-z])([A-Z])',
        r'(\d)([a-zA-Z])',
        r'([a-zA-Z])(\d)',
    )
    for boundary in boundary_patterns:
        text = re.sub(boundary, r'\1 \2', text)
    return text

def get_chunks(
    file_path: str,
    *,
    max_characters: int = 10000,
    combine_text_under_n_chars: int = 2000,
    new_after_n_chars: int = 6000,
):
    """Partition a PDF into title-based chunks, with table structure inferred.

    The three size limits used to be hard-coded in the call; they are now
    keyword-only parameters whose defaults are the previous values, so
    ``get_chunks(file_path=...)`` behaves exactly as before.

    Args:
        file_path: Path of the PDF to partition.
        max_characters: Forwarded to ``partition_pdf`` as ``max_characters``.
        combine_text_under_n_chars: Forwarded to ``partition_pdf`` as
            ``combine_text_under_n_chars``.
        new_after_n_chars: Forwarded to ``partition_pdf`` as ``new_after_n_chars``.

    Returns:
        Whatever ``partition_pdf`` returns for these settings (the rest of the
        notebook iterates over it and sorts the items by type name).
    """
    return partition_pdf(
        filename=file_path,
        infer_table_structure=True,   # Extract tables
        strategy="hi_res",            # Required for table extraction
        chunking_strategy="by_title",
        max_characters=max_characters,
        combine_text_under_n_chars=combine_text_under_n_chars,
        new_after_n_chars=new_after_n_chars,
    )

# NOTE(review): absolute, user-specific path; the notebook only runs on this machine as written.
file_path = "C:/Users/nandi/OneDrive/Documents/battery_components_extractor/battery_components_extractor/data/f5/27.pdf"
chunks = get_chunks(file_path=file_path)

# Sort the partitioned elements by type name: anything whose type mentions
# "Table" is a table; otherwise a "CompositeElement" is a text chunk.
# Elements matching neither test are dropped.
tables = []
texts = []
for chunk in chunks:
    type_repr = str(type(chunk))
    if "Table" in type_repr:
        tables.append(chunk)
        continue
    if "CompositeElement" in type_repr:
        texts.append(chunk)

# Despite the name, no summarization happens here: this is the raw text of every text chunk.
text_summaries = [text_chunk.text for text_chunk in texts]

class _TableRowParser(HTMLParser):
    """Collects the text of every <td>/<th> cell, grouped by <tr> row."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.rows = []      # finished rows, each a list of cell strings
        self._row = None    # cells of the row being read, None outside a row
        self._cell = None   # text fragments of the cell being read, None outside a cell

    def handle_starttag(self, tag, attrs):
        if tag == "tr":
            self._end_row()
            self._row = []
        elif tag in ("td", "th"):
            self._end_cell()
            if self._row is None:
                # Cell without an enclosing <tr>: start an implicit row.
                self._row = []
            self._cell = []

    def handle_endtag(self, tag):
        if tag in ("td", "th"):
            self._end_cell()
        elif tag in ("tr", "table"):
            self._end_row()

    def handle_data(self, data):
        if self._cell is not None:
            self._cell.append(data)

    def close(self):
        super().close()
        # Flush a row/cell that was never explicitly closed in the markup.
        self._end_row()

    def _end_cell(self):
        """Store the current cell's text with its whitespace collapsed."""
        if self._cell is not None:
            self._row.append(" ".join("".join(self._cell).split()))
            self._cell = None

    def _end_row(self):
        """Store the current row; rows that contain no cells are skipped."""
        self._end_cell()
        if self._row:
            self.rows.append(self._row)
        self._row = None


def convert_table_to_text(table_chunk):
    """Convert a table chunk's HTML into pipe-delimited text, one row per line.

    The original implementation split the HTML on the literal ``<tr>`` and only
    replaced plain ``<td>``/``</td>``, so ``</tr>``, ``</table>``, ``<th>`` and
    any tag carrying attributes leaked into the output, and a chunk whose
    ``text_as_html`` was ``None`` raised ``AttributeError``. The HTML is now
    parsed properly; the output layout is otherwise unchanged.

    Args:
        table_chunk: Object exposing ``metadata.text_as_html`` (an HTML string
            or ``None``) and, optionally, ``text`` used as a fallback.

    Returns:
        A string that starts with ``"Table Data:\\n"``, followed by one line per
        table row in which every cell is followed by ``|``. When no HTML rows
        are available the chunk's plain ``text`` (if any) is appended instead.
    """
    table_html = getattr(table_chunk.metadata, "text_as_html", None)
    formatted_table = ["Table Data:\n"]

    rows = []
    if table_html:
        parser = _TableRowParser()
        parser.feed(table_html)
        parser.close()
        rows = parser.rows

    if rows:
        formatted_table.extend("".join(f"{cell}|" for cell in row) for row in rows)
    else:
        # No usable HTML: fall back to the chunk's plain text rather than failing.
        fallback = (getattr(table_chunk, "text", "") or "").strip()
        if fallback:
            formatted_table.append(fallback)

    return "\n".join(formatted_table)

def generate_chunk_id(source, page, index):
    """Build a chunk identifier of the form ``source:page:index``.

    Args:
        source: Name of the originating document.
        page: Page number associated with the chunk.
        index: Position of the chunk in the processed sequence.

    Returns:
        The three values joined with colons, as a string.
    """
    return "{}:{}:{}".format(source, page, index)

In [3]:
# Render every extracted table as pipe-delimited text.
table_texts = list(map(convert_table_to_text, tables))

# Quick look at the first text chunk: its source filename, then its content.
first_text = texts[0]
print(first_text.metadata.to_dict().get("filename"))
print(first_text.text)


27.pdf
nature COMMUNICATIONS:

ARTICLE

i/o 01038

146-020-7976, MOE

Interface chemistry of an amide electrolyte for highly reversible lithium metal batteries

Qidi Wang @i 12, Zhenpeng Yao@: 3, Chenglong Zhao@: 4 Tomas Verhallen®, Daniel P. Tabor @ 3, Ming Liu, Frans Ooms®, Feiyu Kang!2, Alan Aspuru-Guzik@: 26, Yong-Sheng Hu@:*, Marnix Wagemaker@i >" & Baohua Li@ '2™4

Metallic lithium is a promising anode to increase the energy density of rechargeable lithium batteries. Despite extensive efforts, detrimental reactivity of lithium metal with electrolytes and uncontrolled dendrite growth remain challenging interconnected issues hindering highly reversible Li-metal batteries. Herein, we report a rationally designed amide-based electrolyte based on the desired interface products. This amide electrolyte achieves a high average Coulombic efficiency during cycling, resulting in an outstanding capacity retention with a 3.5 mAh cm~2 high-mass-loaded LiNig.gCo9;Mno1O2 cathode. The interface r

In [6]:
# Wrap every text chunk in a LangChain Document, carrying its metadata along as a plain dict.
d = [
    Document(page_content=element.text, metadata=element.metadata.to_dict())
    for element in texts
]

In [9]:
# Inspection only: confirm that `d` is a plain Python list.
type(d)

list

In [10]:
# Inspection only: show the first Document. At this point its metadata still
# contains the 'languages' and 'orig_elements' entries.
d[0]

Document(metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2025-02-12T16:26:41', 'page_number': 1, 'orig_elements': 'eJzNWml32ziW/SuwP8zYOSLDfUnN6YrjJGVPe8nETs3prqqTA5KghA5FaLhYUXq6fnvfB5CyZDk1iefYMx+qYkBYHt5y8d4Ff/n7vqjEXNTdR1nsv2D7YZbGofB9i5dFagUO/peWPLBCnrh5kQRuUYr9Cdufi44XvOOY8/f9XKmmkDXvRKvbFV+pvvs4E3I669DjuXGAOUP3UhbdDL1uFFLvQsm6o3m//OI6kR1NWOjYwW8TNjY9J7YTagdRYCe7bT0c7f121XZiTqd4Jz+L6mrBc7H/D/xQiE7knVT1x7zibftx0agMwxw78X0XO+yXshLdaiH03Hfn+1rYetrzqT7RL/uinu7/pnvb7uNcFbKUQuvLc7zQcjzL9a7d6IUXvQhcmr3AzI91P89EQyclITrxmXSxDzX1jWDHl+fnHy5Oj4+uTy8vrl7QpFGC0zlmk+B3jZOVcRlHSWzxoEysQGDrRJTCKmOeRLHjO1kWPp5xPMeOJ8yHukNtnaGdRnZMbS8Kbeeeth7/MPNEied6T2yeo/fXp8dnbzbtcS276l57RIUXIypCKxMegsVNCivz09wK40SEQRE6aeQ/oj1crd8Eajb20G0KCt0OU/++thn/h/Z4SnXL54o5ruMnmwr/UOdQz1Q18osormnkfcrP/cxPnNIqwtyzAi9xrKQUhZVGXuGWWZql/BGRyvfsEMp1owGqhnbgmeCI/cRO72nr8Q8LBt8LHfeJreMGESY4VpzGgMnzy62wuOBNwzt5I75moSwuMwidWCIJOMLDg3F8EVtBXAruRUVZ8vLxLOTG2v1hAQ1HQzvCnWLCxYdJ4ns6zIyH2SgMA++pAeu07kRT

In [14]:
# Metadata entries removed from every document before further use.
# ('orig_elements' holds a very long encoded string, as the d[0] output shows.)
DROPPED_METADATA_KEYS = ("languages", "orig_elements")

new_documents = []
chunk_ids = []

for idx, element in enumerate(d):
    metadata = element.metadata
    # NOTE: this edits the Document's metadata in place, so the entries of `d`
    # change as well. pop() gets a default so that re-running this cell, or a
    # document lacking one of the keys, no longer raises KeyError.
    for key in DROPPED_METADATA_KEYS:
        metadata.pop(key, None)
    source = metadata.get('filename', 'unknown')
    page = metadata.get('page_number', 0)
    # The id is "<filename>:<page_number>:<position in d>".
    chunk_id = generate_chunk_id(source, page, idx)
    new_documents.append(element)
    chunk_ids.append(chunk_id)

In [15]:
# Inspection only: displays the entire list of documents (output can be very long).
new_documents

[Document(metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'last_modified': '2025-02-12T16:26:41', 'page_number': 1, 'orig_elements': 'eJzNWml32ziW/SuwP8zYOSLDfUnN6YrjJGVPe8nETs3prqqTA5KghA5FaLhYUXq6fnvfB5CyZDk1iefYMx+qYkBYHt5y8d4Ff/n7vqjEXNTdR1nsv2D7YZbGofB9i5dFagUO/peWPLBCnrh5kQRuUYr9Cdufi44XvOOY8/f9XKmmkDXvRKvbFV+pvvs4E3I669DjuXGAOUP3UhbdDL1uFFLvQsm6o3m//OI6kR1NWOjYwW8TNjY9J7YTagdRYCe7bT0c7f121XZiTqd4Jz+L6mrBc7H/D/xQiE7knVT1x7zibftx0agMwxw78X0XO+yXshLdaiH03Hfn+1rYetrzqT7RL/uinu7/pnvb7uNcFbKUQuvLc7zQcjzL9a7d6IUXvQhcmr3AzI91P89EQyclITrxmXSxDzX1jWDHl+fnHy5Oj4+uTy8vrl7QpFGC0zlmk+B3jZOVcRlHSWzxoEysQGDrRJTCKmOeRLHjO1kWPp5xPMeOJ8yHukNtnaGdRnZMbS8Kbeeeth7/MPNEied6T2yeo/fXp8dnbzbtcS276l57RIUXIypCKxMegsVNCivz09wK40SEQRE6aeQ/oj1crd8Eajb20G0KCt0OU/++thn/h/Z4SnXL54o5ruMnmwr/UOdQz1Q18osormnkfcrP/cxPnNIqwtyzAi9xrKQUhZVGXuGWWZql/BGRyvfsEMp1owGqhnbgmeCI/cRO72nr8Q8LBt8LHfeJreMGESY4VpzGgMnzy62wuOBNwzt5I75moSwuMwidWCIJOMLDg3F8EVtBXAruRUVZ8vLxLOTG2v1hAQ1HQzvCnWLCxYdJ4ns6zIyH2SgMA++pAeu07kR