In [None]:
from unstructured.partition.docx import partition_docx
# Load the sample document fully into memory as raw bytes.
# `binary_file` stays in the kernel namespace after this cell runs.
file_path = "/home/naufal/file_extractor/source/pdfs/Doc1.pdf"
with open(file_path, "rb") as file:
    binary_file = file.read()

In [None]:
# Write the bytes held in `binary_file` to a second file (Doc.pdf).
# `binary_file` is not defined in this cell; it must already exist in the kernel.
with open("/home/naufal/file_extractor/source/pdfs/Doc.pdf", "wb") as file:
    file.write(binary_file)

In [None]:
import pymupdf
# OCR the first page (index 0) and keep its text, sorted into reading order.
# The document is opened as a context manager so it is closed even when the
# OCR step raises; the manual open()/close() pair leaked the handle in that case.
with pymupdf.open("/home/naufal/file_extractor/source/pdfs/Doc1.pdf") as file:
    extracted_text = file.load_page(0).get_textpage_ocr().extractTEXT(sort=True)


In [None]:
import tempfile
import os
import uuid
# Random per-run identifiers: `.hex` already yields a 32-character string.
USERID = uuid.uuid4().hex
CHATID = uuid.uuid4().hex
FILENAME = "Doc1.pdf"

# Scratch layout: <mkdtemp root>/<USERID>/<CHATID>/<FILENAME>.
# mkdtemp() creates the root immediately and never removes it on its own.
TEMP_STORAGE_PATH = tempfile.mkdtemp(prefix="files_")
FILEDIR = os.path.join(TEMP_STORAGE_PATH, USERID, CHATID)
INPUT_PATH = os.path.join(FILEDIR, FILENAME)

# Only the directories are created here; INPUT_PATH itself is not written.
os.makedirs(FILEDIR, exist_ok=True)



In [None]:
import shutil
print(FILEDIR)
# FILEDIR is TEMP_STORAGE_PATH/USERID/CHATID, so deleting only FILEDIR leaves
# the mkdtemp() root and the USERID directory behind. Remove the whole root.
if os.path.exists(TEMP_STORAGE_PATH):
    shutil.rmtree(TEMP_STORAGE_PATH)
    print("TEMP FILE HAS BEEN REMOVED")

In [None]:

# Repeat of the cleanup step. FILEDIR is TEMP_STORAGE_PATH/USERID/CHATID, so the
# mkdtemp() root is removed rather than only the leaf, to leave nothing on disk.
# Relies on `shutil` having been imported by an earlier cell.
if os.path.exists(TEMP_STORAGE_PATH):
    shutil.rmtree(TEMP_STORAGE_PATH)
    print("TEMP FILE HAS BEEN REMOVED")

In [None]:
# Verify the cleanup: prints False once FILEDIR no longer exists on disk.
print(os.path.exists(FILEDIR))

In [None]:
# Draft of the ocrmypdf invocation, built while a scratch directory exists.
# The list was previously indented deeper than the `with` body, which made the
# whole cell fail with an IndentationError before anything ran.
with tempfile.TemporaryDirectory(prefix="file_") as tmpdirname:
    print('created temporary directory', tmpdirname)

    # NOTE(review): CORES_PER_REQUEST, OCR_TIMEOUT, `self` and `output_path`
    # are not defined in any visible cell (this looks pasted from a class
    # method); they must be supplied before this cell can execute.
    ocr_command = [
        "ocrmypdf", "--output-type", "pdf", "-j", str(CORES_PER_REQUEST),
        "--tesseract-timeout", str(OCR_TIMEOUT), "--skip-big", "4",
        "-f", self.input_path, output_path
    ]


In [1]:
import os
import sys

print(sys.path)
# sys.path entries are used verbatim by the import system: "~" is a shell
# convention and is never expanded there, so the literal "~/file_extractor"
# would be a dead entry. Expand it to the real home directory first.
sys.path.append(os.path.expanduser("~/file_extractor"))

['/home/naufal/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python311.zip', '/home/naufal/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11', '/home/naufal/.local/share/uv/python/cpython-3.11.13-linux-x86_64-gnu/lib/python3.11/lib-dynload', '', '/home/naufal/file_extractor/.venv/lib/python3.11/site-packages']


In [None]:
# Show the module search path as it currently stands in this kernel.
print(sys.path)

In [None]:
# Experiment: copy the sample PDF into a self-cleaning temporary directory.
file_path = "/home/naufal/file_extractor/source/pdfs/Doc1.pdf"
with open(file_path, "rb") as file:
    binary_file = file.read()

# TemporaryDirectory deletes the directory and everything in it when the
# `with` block exits, so the copy written below does not outlive this cell.
with tempfile.TemporaryDirectory(prefix="file_") as temp_dir:
    # Nested layout <temp_dir>/userId/chatId (literal placeholder names here).
    file_dir = os.path.join(temp_dir, "userId", "chatId")
    os.makedirs(file_dir, exist_ok=True)
    print(f"Path exists? {os.path.exists(file_dir)}")
    
    file_save_path = os.path.join(file_dir, "Doc1.pdf")
    
    with open(file_save_path, "wb") as file:
        file.write(binary_file)

In [8]:
import os
import uuid
import tempfile
import asyncio
import subprocess
from io import BytesIO
from pathlib import Path
from loguru import logger
from typing import Dict, Any
from file_extractor.tools.pdf_extractor import PDFExtractor
from file_extractor.tools.word_extractor import WordDocumentExtractor
from concurrent.futures import ProcessPoolExecutor

# Worker count: sizes the process pool below and is also read inside main().
MAX_PROCESS_WORKERS = 4

# Process pool created once, when this cell runs, and passed by main() to the
# extractors' extract_async() calls.
process_executor = ProcessPoolExecutor(max_workers=MAX_PROCESS_WORKERS)

# Sample input document.
file_path = "/home/naufal/file_extractor/source/pdfs/Doc1.pdf"

async def _run_ocr(input_path: str, output_path: str) -> None:
    """Run ``ocrmypdf`` on ``input_path`` and write the result to ``output_path``.

    Raises:
        RuntimeError: if ``ocrmypdf`` exits with a non-zero status; the message
            carries the tool's stderr output.
    """
    ocr_command = [
        "ocrmypdf", "--output-type", "pdf",
        "--jobs", str(MAX_PROCESS_WORKERS),
        "--language", "eng+ind",
        "-q", "-f", input_path, output_path,
    ]
    process = await asyncio.create_subprocess_exec(
        *ocr_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    _, stderr = await process.communicate()

    if process.returncode != 0:
        # errors="replace": a non-UTF-8 byte in stderr must not mask the real failure.
        message = f"OCR extraction failed: {stderr.decode(errors='replace')}"
        logger.error(message)
        # A bare `raise` here has no active exception and would only surface as
        # "No active exception to reraise", losing the OCR error text.
        raise RuntimeError(message)


async def main(filePath: str, userId: str, chatId: str) -> Dict[str, Any]:
    """Extract the content of a PDF or Word document.

    Files whose extension is ``.pdf`` (case-insensitive) go through
    ``PDFExtractor``; if that reports a falsy ``status`` the file is OCR'd with
    ``ocrmypdf`` and extraction is retried once on the OCR output. Every other
    file is handed to ``WordDocumentExtractor``.

    Args:
        filePath: Path of the document to read.
        userId: Request identifier; logged and used as a scratch sub-directory name.
        chatId: Request identifier; logged and used as a scratch sub-directory name.

    Returns:
        The extractor's result dict when its ``"status"`` entry is truthy,
        otherwise ``{"error": ...}``.

    Raises:
        RuntimeError: when ``ocrmypdf`` exits with a non-zero status.
        Exception: anything raised by the extractors or file I/O propagates.

    Note:
        The module-level ``process_executor`` is shut down on every exit path
        after the input has been read. A shut-down executor accepts no new
        work, so ``process_executor`` must be recreated before ``main`` is
        called again.
    """
    logger.info(f"Processing request from ({userId},{chatId}).")

    # Both extractors consume the document from memory.
    with open(filePath, "rb") as f:
        binary_file = f.read()

    filename = Path(filePath).name

    try:
        # Compare the real extension: endswith("pdf") also matched names such
        # as "notespdf" that have no ".pdf" suffix at all.
        if Path(filename).suffix.lower() != ".pdf":
            extractor = WordDocumentExtractor(infer_table_structure=True)
            result = await extractor.extract_async(file=BytesIO(binary_file), filename=filename, executor=process_executor)

            if result["status"]:
                logger.info("Extraction successful.")
                return result

            logger.error("No content found")
            return {"error": "NO content found"}

        # First attempt: extract directly from the original bytes.
        extractor = PDFExtractor(max_workers=MAX_PROCESS_WORKERS)
        result = await extractor.extract_async(file=binary_file, filename=filename, extract_tables=False, executor=process_executor)

        if result["status"]:
            logger.info("Extraction successful.")
            return result

        # Fall back to OCR. ocrmypdf works on files, so the input is only
        # written to disk now that it is actually needed.
        logger.info("Fall back to OCR")
        with tempfile.TemporaryDirectory(prefix="file_") as temp_dir:
            file_dir = os.path.join(temp_dir, userId, chatId)
            os.makedirs(file_dir, exist_ok=True)

            input_path = os.path.join(file_dir, filename)
            with open(input_path, "wb") as file:
                file.write(binary_file)

            ocr_output_path = os.path.join(file_dir, f"ocr_{filename}")
            await _run_ocr(input_path, ocr_output_path)
            logger.info(f"OCR extraction completed successfully. Output saved to {ocr_output_path}")

            # Read the OCR output before the temporary directory is deleted.
            with open(ocr_output_path, "rb") as file:
                ocr_bytes = file.read()

        logger.info("Retry extraction again.")
        result = await extractor.extract_async(file=ocr_bytes, filename=filename, extract_tables=False, executor=process_executor)

        if result["status"]:
            logger.info("Extraction successful.")
            return result

        logger.error("No content found after being ocrd.")
        return {"error": "No content found"}

    finally:
        # Single cleanup point replacing the shutdown call that was repeated
        # on each return and except branch.
        process_executor.shutdown(wait=True)

In [9]:
# Fresh random hex identifiers for this run; main() logs them and uses them as
# scratch sub-directory names.
USERID = str(uuid.uuid4().hex)
CHATID = str(uuid.uuid4().hex)
file_path="/home/naufal/file_extractor/source/pdfs/Doc1.pdf"
# Top-level `await` works here because the notebook kernel runs an event loop;
# in a plain script this call would need asyncio.run(...).
extraction_result = await main(filePath=file_path, userId=USERID, chatId=CHATID)

[32m2025-09-03 14:47:35.251[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m22[0m - [1mProcessing request from (47aab16d8d8b458d8feaa27a6539bf64,c62076ca88ac464fbc49dd0cdf30e4e7).[0m
[32m2025-09-03 14:47:35.253[0m | [1mINFO    [0m | [36mfile_extractor.tools.pdf_extractor[0m:[36m__init__[0m:[36m156[0m - [1mInitialized PDFExtractor with 4 max workers[0m
[32m2025-09-03 14:47:35.254[0m | [1mINFO    [0m | [36mfile_extractor.tools.pdf_extractor[0m:[36mextract_async[0m:[36m231[0m - [1mStarting text extraction for Doc1.pdf (1 pages) with 4 workers[0m
[32m2025-09-03 14:47:35.296[0m | [1mINFO    [0m | [36mfile_extractor.tools.pdf_extractor[0m:[36mextract_async[0m:[36m276[0m - [1mExtraction completed for Doc1.pdf: 0/1 pages successful[0m
[32m2025-09-03 14:47:35.299[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m55[0m - [1mFall back to OCR[0m
[32m2025-09-03 14:47:41.340[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0

In [4]:
# Dump the full extraction result returned by main().
print(extraction_result)

{'filename': 'Kompilasi Tabel HC.docx', 'total_pages': 10, 'pages': [{'page_index': 0, 'text': "Kompilasi Tabel (Versi Refaktorisasi)\nA. Nama tabel: employees\nSkema tabel:\n- employee_id (INTEGER/UUID) (Primary Key)\n- name (VARCHAR)\n- age (INTEGER)\n- gender (VARCHAR)\n- role (VARCHAR)\n- department (VARCHAR)\n- address_state (VARCHAR)\n- address_municipality (VARCHAR)\n- work_email (VARCHAR)\n- phone_number (VARCHAR)\n- proficiency_level (VARCHAR)\n- kpi_score (NUMERIC)\n\nDeskripsi fields (variabel):\n- employee_id: ID unik untuk setiap karyawan, berfungsi sebagai primary key.\n- Contoh Nilai: 101, 205, b5e3f2a1-c7d9-4b1e-8a0f-9d2c6e3b8a1d\n- name: Nama lengkap karyawan.\n- Contoh Nilai: 'Dewi Anggraini', 'Budi Santoso'\n- age: Usia karyawan dalam angka.\n- Contoh Nilai: 28, 45\n- gender: Jenis kelamin karyawan.\n- Contoh Nilai: 'Perempuan', 'Laki-laki'\n- role: Jabatan atau peran karyawan.\n- Contoh Nilai: 'Manajer Proyek', 'Analis Data Senior'", 'status': True}, {'page_index': 

In [None]:
from unstructured.partition.docx import partition_docx
from io import BytesIO
# Read the document at `file_path` into an in-memory buffer and parse it with
# unstructured's DOCX partitioner.
# NOTE(review): `file_path` is taken from kernel state; every visible assignment
# points at Doc1.pdf, so confirm it refers to a .docx before running this cell.
with open(file_path, "rb") as file:
    bytes_file = BytesIO(file.read())
    elements = partition_docx(file=bytes_file)


In [None]:
# Run ocrmypdf on test.pdf and write the result to ocrd_pdfs/test.pdf.
# "-f" is ocrmypdf's short form of --force-ocr.
ocr_command = [
            "ocrmypdf", "--output-type", "pdf", "-f", "/home/naufal/file_extractor/source/pdfs/test.pdf", "/home/naufal/file_extractor/source/pdfs/ocrd_pdfs/test.pdf"
        ]
# No check=True: a non-zero exit status does not raise here. It is only visible
# in the CompletedProcess that the notebook displays as the cell's result.
subprocess.run(ocr_command)

In [None]:
import pymupdf

# Read the text of the first page (index 0) of the OCR'd PDF; the context
# manager closes the document when the block exits.
with pymupdf.open("/home/naufal/file_extractor/source/pdfs/ocrd_pdfs/test.pdf") as file:
    result = file.load_page(0).get_text()

print(result)

In [30]:
from pydantic import BaseModel, Field
from dataclasses import dataclass

@dataclass
class SampleData:
    """Two-field dataclass used to probe how dataclasses treat annotations.

    ``param_2`` is annotated ``str`` but defaults to the int ``5``. The
    recorded output of ``print(type(SampleData.param_2))`` in this notebook is
    ``<class 'int'>``, i.e. the default is stored as written and is not
    converted to the annotated type.
    """

    param_1: int = 5
    # Annotation and default disagree on purpose (see class docstring).
    param_2: str = 5


In [29]:
# Inspect the runtime type of the class-level default for `param_2`.
print(type(SampleData.param_2))

<class 'int'>
