In [65]:
def performance_decorator(func):
    """
    A decorator that measures the performance metrics of a function and saves them to an Excel file.

    Args:
        func (function): The function to be decorated.

    Returns:
        function: The decorated function.

    """
    import time
    import psutil
    import pandas as pd
    import numpy as np
    import os

    def wrapper(*args, **kwargs):
        file_path = 'output_data/performance_metrics.xlsx'
        print(args[0])
        file_name = args[0].split('/')[1].split('\\')[1].split('.')[0]
        tool_name = func.__name__
        start_time = time.time()
        start_cpu = psutil.cpu_percent(interval=None)
        start_memory = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2

        result = func(*args, **kwargs)

        end_time = time.time()
        end_cpu = psutil.cpu_percent(interval=None)
        end_memory = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2

        if result is None: # is from the rest of them 
            # the additional metrics is a dict that have that can be llm_tokens, embedding_tokens and pages calls or just one of them
            llm_tokens = np.nan
            embedding_tokens = np.nan
            pages_calls = np.nan

            metrics = {
                'Tool': [tool_name],
                'file_name': [file_name],
                'Execution Time (seconds)': [end_time - start_time],
                'CPU Usage (percent)': [end_cpu - start_cpu],
                'Memory Usage (MB)': [end_memory - start_memory],
                'llm_tokens': [llm_tokens],
                'embedding_tokens': [embedding_tokens],
                'pages_calls': [pages_calls]
            }
            df = pd.DataFrame(metrics)

            # Check if the file already exists
            if os.path.exists(file_path):
                with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
                    df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
            else:
                df.to_excel(file_path, index=False)

        elif isinstance(result, list): # is from llama tesseract
            for item in result:
                df = pd.DataFrame(item, index=[0])  # Add index=[0] if item is a dictionary with scalar values
                df.insert(0, 'file_name', file_name)
                df.insert(0, 'Tool', tool_name)

                # Check if the file already exists
                if os.path.exists(file_path):
                    with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
                        df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
                else:
                    df.to_excel(file_path, index=False)
    
        else: # is from llama index 
                        # the additional metrics is a dict that have that can be llm_tokens, embedding_tokens and pages calls or just one of them
            llm_tokens = result.get('llm_tokens', 'N/A')
            embedding_tokens = result.get('embedding_tokens', 'N/A')
            pages_calls = result.get('pages_calls', 'N/A')

            metrics = {
                'Tool': [tool_name],
                'file_name': [file_name],
                'Execution Time (seconds)': [end_time - start_time],
                'CPU Usage (percent)': [end_cpu - start_cpu],
                'Memory Usage (MB)': [end_memory - start_memory],
                'llm_tokens': [llm_tokens],
                'embedding_tokens': [embedding_tokens],
                'pages_calls': [pages_calls]
            }
            df = pd.DataFrame(metrics)

            # Check if the file already exists
            if os.path.exists(file_path):
                with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
                    df.to_excel(writer, index=False, header=False, startrow=writer.sheets['Sheet1'].max_row)
            else:
                df.to_excel(file_path, index=False)
        
        return result

    return wrapper

In [66]:
import PyPDF2

def get_number_of_pages(file_path):
    # Open the PDF file
    with open(file_path, "rb") as file:
        # Create PDF reader object
        pdf_reader = PyPDF2.PdfReader(file)
        # Get the number of pages
        number_of_pages = len(pdf_reader.pages)
        return number_of_pages

In [67]:
@performance_decorator
def process_pdf_file_PyPDF(file_path, output_file):
    from langchain_community.document_loaders import PyPDFLoader
    
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()
    
    # Save the data to a text file for inspection
    with open(output_file, "w", encoding='utf-8') as f:
        for i, page in enumerate(pages):
            f.write(page.page_content)
            f.write("\n")  # Optional: add a newline between pages

In [68]:
@performance_decorator
def process_pdf_file_UnstructuredPDF_default_strategy(file_path, output_file):
    from langchain_community.document_loaders import UnstructuredPDFLoader
    loader = UnstructuredPDFLoader(file_path, mode="elements")
    pages = loader.load_and_split()
    
    # Save the data to a text file for inspection
    with open(output_file, "w", encoding='utf-8') as f:
        for i, page in enumerate(pages):
            f.write(page.page_content)
            f.write("\n")  # Optional: add a newline between pages

In [69]:

@performance_decorator
def process_pdf_file_UnstructuredPDF_OCR_only_strategy(file_path, output_file):
    from langchain_community.document_loaders import UnstructuredPDFLoader
    loader = UnstructuredPDFLoader(file_path, mode="elements", strategy='ocr_only')
    pages = loader.load_and_split()
    
    # Save the data to a text file for inspection
    with open(output_file, "w", encoding='utf-8') as f:
        for i, page in enumerate(pages):
            f.write(page.page_content)
            f.write("\n")

In [70]:
@performance_decorator
def process_pdf_file_UnstructuredPDF_hig_res_strategy(file_path, output_file):
    from langchain_community.document_loaders import UnstructuredPDFLoader
    loader = UnstructuredPDFLoader(file_path, mode="elements", strategy='hi_res')
    pages = loader.load_and_split()
    
    # Save the data to a text file for inspection
    with open(output_file, "w", encoding='utf-8') as f:
        for i, page in enumerate(pages):
            f.write(page.page_content)
            f.write("\n")

In [71]:
@performance_decorator
def process_pdf_file_PDFMiner(file_path, output_file):
    from langchain_community.document_loaders import PDFMinerLoader
    loader = PDFMinerLoader(file_path)
    data = loader.load()

    # Save the data to a text file for inspection
    with open(output_file, "w", encoding='utf-8') as f:
        for i, page in enumerate(data):
            f.write(page.page_content)
            f.write("\n")

In [72]:
@performance_decorator
def process_pdf_file_PDFMiner_as_HTML(file_path, output_file):
    from langchain_community.document_loaders import PDFMinerPDFasHTMLLoader
    from bs4 import BeautifulSoup
    import re
    
    loader = PDFMinerPDFasHTMLLoader(file_path)
    data = loader.load()[0]   # entire PDF is loaded as a single Document
    # print(data)
    soup = BeautifulSoup(data.page_content,'html.parser')
    content = soup.find_all('div')

    # save the content to a html file 
    with open(output_file, "w", encoding='utf-8') as f:
        f.write(str(content))

    cur_fs = None
    cur_text = ''
    snippets = []   # first collect all snippets that have the same font size
    for c in content:
        sp = c.find('span')
        if not sp:
            continue
        st = sp.get('style')
        if not st:
            continue
        fs = re.findall('font-size:(\d+)px',st)
        if not fs:
            continue
        fs = int(fs[0])
        if not cur_fs:
            cur_fs = fs
        if fs == cur_fs:
            cur_text += c.text
        else:
            snippets.append((cur_text,cur_fs))
            cur_fs = fs
            cur_text = c.text
    snippets.append((cur_text,cur_fs))

    # print the snippets
    for s in snippets:
        # print(s)
        pass

In [73]:
@performance_decorator
def process_pdf_file_PyMuPDF(file_path, output_file):
    import fitz as PyMuPDF
    # TODO Add the ocr 

    doc = PyMuPDF.open(file_path) # open a document
    
    out = open(output_file, "wb") # create a text output
    
    for page in doc: # iterate the document pages
        text = page.get_text().encode("utf8") # get plain text (is in UTF-8)
        out.write(text) # write text of page
        out.write(bytes((12,))) # write page delimiter (form feed 0x0C)
    
    out.close()

In [74]:
@performance_decorator
def process_pdf_file_pdfminerSix(file_path, output_file):
    from io import StringIO
    from pdfminer.high_level import extract_text_to_fp
    from pdfminer.layout import LAParams

    output_string = StringIO()
    with open(file_path, 'rb') as f:
        extract_text_to_fp(f, output_string, laparams=LAParams())

    # Save the data to a text file for inspection
    with open(output_file, "w", encoding='utf-8') as f:
        f.write(output_string.getvalue().strip())

In [75]:
@performance_decorator
def process_pdf_file_textract_with_correction(file_path, output_file):
    import tesseract_with_llama2_corrections as tesseract_with_llama2
    raw_ocr, corrected_text, filter_text, performance_metrics  = tesseract_with_llama2.tesseract_with_llm_correction(file_path)
    with open(output_file + "_raw_ocr.md", "w", encoding='utf-8') as f:
        f.write(raw_ocr)
    with open(output_file + "_corrected", "w", encoding='utf-8') as f:
        f.write(corrected_text)
    with open(output_file + "_fileted.md", "w", encoding='utf-8') as f:
        f.write(filter_text)
    
    return performance_metrics

In [76]:
@performance_decorator
def process_pdf_file_llama_index_md(file_path, output_file):
    import nest_asyncio 
    from llama_parse import LlamaParse
    from os import getenv

    nest_asyncio.apply()

    key = getenv("LlamaIndex")

    parser = LlamaParse(
        api_key=key,
        result_type="markdown",
        num_workers=4,
        verbose=True,
        language="en"
    )

    # sync 
    document = parser.load_data(file_path)

    with open(output_file, "w", encoding='utf-8') as f:
        f.write(document[0].text)
    
    return {'pages_calls': get_number_of_pages(file_path)}

In [77]:
@performance_decorator
def process_pdf_file_llama_index_txt(file_path, output_file):
    import nest_asyncio 
    from llama_parse import LlamaParse
    from os import getenv

    nest_asyncio.apply()

    key = getenv("LlamaIndex")

    parser = LlamaParse(
        api_key=key,
        result_type="text",
        num_workers=4,
        verbose=True,
        language="en"
    )

    # sync 
    document = parser.load_data(file_path)
    
    with open(output_file, "w", encoding='utf-8') as f:
        f.write(document[0].text)

    return {'pages_calls': get_number_of_pages(file_path)}

In [78]:
def reset_performance_metrics_file():
    """
    Resets the performance metrics file by clearing its contents and adding a header.
    """
    import os
    import pandas as pd

    with open('output_data/performance_metrics.txt', 'w') as f:
        f.write("Performance metrics for each function:\n\n")

    # clear the excel file
    if os.path.exists('output_data/performance_metrics.xlsx'):
        os.remove('output_data/performance_metrics.xlsx')

    import pandas as pd

    df = pd.DataFrame(columns=['Tool', 'File', 'Execution Time (seconds)', 'CPU Usage (percent)', 'Memory Usage (MB)', 'llm_tokens', 'embedding_tokens', 'pages_calls'])
    df.to_excel('output_data/performance_metrics.xlsx', index=False)


In [79]:
def clear_output_files(path):
    """
    Clears all the files and folders in path directory with extensions '.txt', '.html', and '.md'.
    """
    import os
    import shutil
    for file in os.listdir(path):
        if file.endswith(".txt") or file.endswith(".html") or file.endswith(".md"):
            os.remove(os.path.join(path, file))
        elif os.path.isdir(os.path.join(path, file)):
            shutil.rmtree(os.path.join(path, file))

In [80]:
def run_all(input_folder_path, output_folder_path) -> None:
    """
    Runs all the PDF processing functions and saves the output to respective files.
    """
    import os
    from tqdm import tqdm
    
    # get all the files in the folder
    files = os.listdir(input_folder_path)
    
    clear_output_files(output_folder_path)
    reset_performance_metrics_file()

    for file in files:
        print("file: ", file)
        output_path = os.path.join(output_folder_path, file.split(".")[0])
        input_path = os.path.join(input_folder_path, file)
        print("output_path: ", output_path)
        # create a folder for the output data in the output folder
        if not os.path.exists(file.split(".")[0]):
            path = os.path.join(output_folder_path, file.split(".")[0])
            os.makedirs(path)

        tasks = [
            (process_pdf_file_PyPDF, (input_path, os.path.join(output_path, "PyPDF.txt"))),
            (process_pdf_file_UnstructuredPDF_hig_res_strategy, (input_path, os.path.join(output_path, "Unstructured_hi_res.txt"))),
            (process_pdf_file_UnstructuredPDF_default_strategy, (input_path, os.path.join(output_path, "Unstructured.txt"))),
            (process_pdf_file_UnstructuredPDF_OCR_only_strategy, (input_path, os.path.join(output_path, "Unstructured_OCR.txt"))),
            (process_pdf_file_PDFMiner, (input_path, os.path.join(output_path, "PDFMiner.txt"))),
            (process_pdf_file_PDFMiner_as_HTML, (input_path, os.path.join(output_path, "PDFMiner_HTML.html"))),
            (process_pdf_file_PyMuPDF, (input_path, os.path.join(output_path, "PyMuPDF.txt"))),
            (process_pdf_file_pdfminerSix, (input_path, os.path.join(output_path, "pdfminerSix.txt"))),
            (process_pdf_file_textract_with_correction, (input_path, os.path.join(output_path, "textract"))),       
            (process_pdf_file_llama_index_md, (input_path, os.path.join(output_path, "llama_index_md.md"))),
            (process_pdf_file_llama_index_txt, (input_path, os.path.join(output_path, "llama_index:txt.txt")))
        ]

        with tqdm(total=len(tasks)) as pbar:
            for task in tasks:
                func, args = task
                pbar.set_description(f"Processing {args[1]}")
                try:
                    func(*args)
                except Exception as e:
                    print(f"\nAn error occurred while processing {args[1]}: {str(e)}")
                    print("Press any key to continue...")
                    input()
                pbar.update()


run_all("input_data/pdf", "output_data" )

file:  aidonHanInterface.pdf
output_path:  output_data\aidonHanInterface


Processing output_data\aidonHanInterface\PyPDF.txt:   0%|          | 0/11 [00:00<?, ?it/s]

input_data/pdf\aidonHanInterface.pdf


Processing output_data\aidonHanInterface\Unstructured_hi_res.txt:   9%|▉         | 1/11 [00:01<00:11,  1.19s/it]

input_data/pdf\aidonHanInterface.pdf


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Processing output_data\aidonHanInterface\Unstructured.txt:  18%|█▊        | 2/11 [00:32<02:49, 18.88s/it]       

input_data/pdf\aidonHanInterface.pdf


Processing output_data\aidonHanInterface\Unstructured_OCR.txt:  27%|██▋       | 3/11 [00:34<01:27, 10.97s/it]

input_data/pdf\aidonHanInterface.pdf


Processing output_data\aidonHanInterface\PDFMiner.txt:  36%|███▋      | 4/11 [01:06<02:15, 19.40s/it]        

input_data/pdf\aidonHanInterface.pdf


Processing output_data\aidonHanInterface\PDFMiner_HTML.html:  45%|████▌     | 5/11 [01:07<01:17, 12.96s/it]

input_data/pdf\aidonHanInterface.pdf


Processing output_data\aidonHanInterface\pdfminerSix.txt:  64%|██████▎   | 7/11 [01:09<00:36,  9.02s/it]   

input_data/pdf\aidonHanInterface.pdf
input_data/pdf\aidonHanInterface.pdf


Processing output_data\aidonHanInterface\textract:  73%|███████▎  | 8/11 [01:10<00:14,  4.90s/it]       

input_data/pdf\aidonHanInterface.pdf
Now converting all pages of PDF file input_data/pdf\aidonHanInterface.pdf to images...
Done converting pages from PDF file input_data/pdf\aidonHanInterface.pdf to images.
Tesseract version: 5.3.3.20231005
Extracting text from converted pages...
Processing page 1 with LLM...
Processing page 2 with LLM...
Processing page 3 with LLM...
Processing page 4 with LLM...
Processing page 5 with LLM...
Processing page 6 with LLM...
Processing page 7 with LLM...
Processing page 8 with LLM...
Processing page 9 with LLM...
Now filtering out hallucinations from corrected text...
No existing database found at ./sentence_embeddings.sqlite. Creating a new one.


Processing output_data\aidonHanInterface\llama_index.md:  82%|████████▏ | 9/11 [02:53<01:00, 30.30s/it]

Done filtering out hallucinations.
input_data/pdf\aidonHanInterface.pdf
Started parsing the file under job_id 461bb80d-52b3-4861-815b-2e977e95f3c2
.

Processing output_data\aidonHanInterface\llama_index.txt:  91%|█████████ | 10/11 [03:28<00:31, 31.70s/it]

input_data/pdf\aidonHanInterface.pdf
Started parsing the file under job_id 5c1fadc8-ad1c-40f8-a6aa-4b9edcbec19c


Processing output_data\aidonHanInterface\llama_index.txt: 100%|██████████| 11/11 [03:33<00:00, 19.41s/it]


file:  images.pdf
output_path:  output_data\images


Processing output_data\images\Unstructured_hi_res.txt:   9%|▉         | 1/11 [00:00<00:01,  7.63it/s]This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name


input_data/pdf\images.pdf
input_data/pdf\images.pdf


Processing output_data\images\Unstructured.txt:  18%|█▊        | 2/11 [00:13<01:12,  8.11s/it]       

input_data/pdf\images.pdf


Processing output_data\images\Unstructured_OCR.txt:  27%|██▋       | 3/11 [00:28<01:30, 11.33s/it]

input_data/pdf\images.pdf


Processing output_data\images\textract:  73%|███████▎  | 8/11 [00:44<00:14,  4.76s/it]            

input_data/pdf\images.pdf
input_data/pdf\images.pdf
input_data/pdf\images.pdf
input_data/pdf\images.pdf
input_data/pdf\images.pdf
Now converting all pages of PDF file input_data/pdf\images.pdf to images...
Done converting pages from PDF file input_data/pdf\images.pdf to images.
Tesseract version: 5.3.3.20231005
Extracting text from converted pages...
Processing page 1 with LLM...
Processing page 2 with LLM...
Processing page 3 with LLM...
Processing page 4 with LLM...


Processing output_data\images\textract:  73%|███████▎  | 8/11 [01:03<00:14,  4.76s/it]

Now filtering out hallucinations from corrected text...


Processing output_data\images\llama_index.md:  82%|████████▏ | 9/11 [01:22<00:20, 10.27s/it]

Done filtering out hallucinations.
input_data/pdf\images.pdf
Started parsing the file under job_id fcd31e9c-c61e-4259-8b69-987bfb652fa8
.

Processing output_data\images\llama_index.txt:  91%|█████████ | 10/11 [02:14<00:18, 18.88s/it]

input_data/pdf\images.pdf
Started parsing the file under job_id d45ed79d-5ba0-478c-b33c-ad2e0e9b0488


Processing output_data\images\llama_index.txt: 100%|██████████| 11/11 [02:19<00:00, 12.65s/it]


file:  mixed1.pdf
output_path:  output_data\mixed1


Processing output_data\mixed1\PyPDF.txt:   0%|          | 0/11 [00:00<?, ?it/s]

input_data/pdf\mixed1.pdf


Processing output_data\mixed1\Unstructured_hi_res.txt:   9%|▉         | 1/11 [00:00<00:02,  3.53it/s]

input_data/pdf\mixed1.pdf


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Processing output_data\mixed1\Unstructured.txt:  18%|█▊        | 2/11 [00:10<00:55,  6.13s/it]       

input_data/pdf\mixed1.pdf


Processing output_data\mixed1\Unstructured_OCR.txt:  27%|██▋       | 3/11 [00:10<00:27,  3.49s/it]

input_data/pdf\mixed1.pdf


Processing output_data\mixed1\PDFMiner.txt:  36%|███▋      | 4/11 [00:23<00:48,  6.95s/it]        

input_data/pdf\mixed1.pdf


Processing output_data\mixed1\PDFMiner_HTML.html:  45%|████▌     | 5/11 [00:23<00:27,  4.59s/it]

input_data/pdf\mixed1.pdf


Processing output_data\mixed1\pdfminerSix.txt:  64%|██████▎   | 7/11 [00:23<00:12,  3.14s/it]   

input_data/pdf\mixed1.pdf
input_data/pdf\mixed1.pdf


Processing output_data\mixed1\textract:  73%|███████▎  | 8/11 [00:24<00:05,  1.69s/it]       

input_data/pdf\mixed1.pdf
Now converting all pages of PDF file input_data/pdf\mixed1.pdf to images...
Done converting pages from PDF file input_data/pdf\mixed1.pdf to images.
Tesseract version: 5.3.3.20231005
Extracting text from converted pages...
Processing page 1 with LLM...
Processing page 2 with LLM...
Now filtering out hallucinations from corrected text...


Processing output_data\mixed1\llama_index.md:  82%|████████▏ | 9/11 [01:47<00:45, 22.81s/it]

Done filtering out hallucinations.
input_data/pdf\mixed1.pdf
Started parsing the file under job_id 50fa6c63-36ec-4b8b-92e9-d452abfddf7a


Processing output_data\mixed1\llama_index.txt:  91%|█████████ | 10/11 [02:09<00:22, 22.74s/it]

input_data/pdf\mixed1.pdf
Started parsing the file under job_id ecd41655-2314-4b2e-8f0c-243c66992d16


Processing output_data\mixed1\llama_index.txt: 100%|██████████| 11/11 [02:14<00:00, 12.26s/it]


file:  tables.pdf
output_path:  output_data\tables


Processing output_data\tables\PyPDF.txt:   0%|          | 0/11 [00:00<?, ?it/s]

input_data/pdf\tables.pdf


Processing output_data\tables\Unstructured_hi_res.txt:   9%|▉         | 1/11 [00:01<00:12,  1.28s/it]

input_data/pdf\tables.pdf


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Processing output_data\tables\Unstructured.txt:  18%|█▊        | 2/11 [00:15<01:20,  8.99s/it]       

input_data/pdf\tables.pdf


Processing output_data\tables\Unstructured_OCR.txt:  27%|██▋       | 3/11 [00:17<00:45,  5.71s/it]

input_data/pdf\tables.pdf


Processing output_data\tables\PDFMiner.txt:  36%|███▋      | 4/11 [00:30<00:59,  8.57s/it]        

input_data/pdf\tables.pdf


Processing output_data\tables\PDFMiner_HTML.html:  45%|████▌     | 5/11 [00:31<00:36,  6.01s/it]

input_data/pdf\tables.pdf


Processing output_data\tables\pdfminerSix.txt:  64%|██████▎   | 7/11 [00:33<00:17,  4.48s/it]   

input_data/pdf\tables.pdf
input_data/pdf\tables.pdf


Processing output_data\tables\textract:  73%|███████▎  | 8/11 [00:34<00:07,  2.56s/it]       

input_data/pdf\tables.pdf
Now converting all pages of PDF file input_data/pdf\tables.pdf to images...
Done converting pages from PDF file input_data/pdf\tables.pdf to images.
Tesseract version: 5.3.3.20231005
Extracting text from converted pages...
Processing page 1 with LLM...
Processing page 2 with LLM...
Processing page 3 with LLM...
Processing page 4 with LLM...
Now filtering out hallucinations from corrected text...
Done filtering out hallucinations.


Processing output_data\tables\llama_index.md:  82%|████████▏ | 9/11 [01:16<00:25, 12.78s/it]

input_data/pdf\tables.pdf
Started parsing the file under job_id ae51b559-9fd4-4d18-bfdb-16e593a120f6


Processing output_data\tables\llama_index.txt:  91%|█████████ | 10/11 [01:45<00:17, 17.15s/it]

input_data/pdf\tables.pdf
Started parsing the file under job_id c5eb2a83-af41-473e-8c63-a0bb0acb6765


Processing output_data\tables\llama_index.txt: 100%|██████████| 11/11 [01:51<00:00, 10.09s/it]


file:  texts.pdf
output_path:  output_data\texts


Processing output_data\texts\PyPDF.txt:   0%|          | 0/11 [00:00<?, ?it/s]

input_data/pdf\texts.pdf


Processing output_data\texts\Unstructured_hi_res.txt:   9%|▉         | 1/11 [00:00<00:03,  2.60it/s]

input_data/pdf\texts.pdf


This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Processing output_data\texts\Unstructured.txt:  18%|█▊        | 2/11 [00:16<01:25,  9.53s/it]       

input_data/pdf\texts.pdf


Processing output_data\texts\Unstructured_OCR.txt:  27%|██▋       | 3/11 [00:16<00:43,  5.41s/it]

input_data/pdf\texts.pdf


Processing output_data\texts\PDFMiner.txt:  36%|███▋      | 4/11 [00:34<01:12, 10.39s/it]        

input_data/pdf\texts.pdf


Processing output_data\texts\PDFMiner_HTML.html:  45%|████▌     | 5/11 [00:35<00:40,  6.80s/it]

input_data/pdf\texts.pdf


Processing output_data\texts\pdfminerSix.txt:  64%|██████▎   | 7/11 [00:35<00:18,  4.68s/it]   

input_data/pdf\texts.pdf
input_data/pdf\texts.pdf


Processing output_data\texts\textract:  73%|███████▎  | 8/11 [00:36<00:07,  2.51s/it]       

input_data/pdf\texts.pdf
Now converting all pages of PDF file input_data/pdf\texts.pdf to images...
Done converting pages from PDF file input_data/pdf\texts.pdf to images.
Tesseract version: 5.3.3.20231005
Extracting text from converted pages...
Processing page 1 with LLM...
Processing page 2 with LLM...
Processing page 3 with LLM...
Processing page 4 with LLM...
Now filtering out hallucinations from corrected text...


Processing output_data\texts\llama_index.md:  82%|████████▏ | 9/11 [01:57<00:46, 23.08s/it]

Done filtering out hallucinations.
input_data/pdf\texts.pdf
Started parsing the file under job_id d5de99f3-dfaa-4d74-9ae6-d1e4649bc930


Processing output_data\texts\llama_index.txt:  91%|█████████ | 10/11 [02:02<00:18, 18.01s/it]

input_data/pdf\texts.pdf
Started parsing the file under job_id 224cf00a-6f85-4bc9-b29d-0a4f87d48301


Processing output_data\texts\llama_index.txt: 100%|██████████| 11/11 [02:07<00:00, 11.59s/it]
