In [1]:
from indoxMiner import (
    ExtractorSchema,
    Field,
    FieldType,
    ValidationRule,
    OutputFormat,
    Extractor,
    DocumentProcessor,
    ProcessingConfig, OpenAi
)

In [2]:
# Set your OpenAI API key
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [3]:
# ollama_extractor = Ollama(model="llama3")

In [4]:
llm_extractor = OpenAi(api_key=OPENAI_API_KEY, model="gpt-4o-mini")

In [5]:
# Example 1: Quick text extraction
# -------------------------------
# Define what information to extract
schema = ExtractorSchema(
    fields=[
        Field(
            name="product_name",
            description="Product name",
            field_type=FieldType.STRING,
            rules=ValidationRule(min_length=2)
        ),
        Field(
            name="price",
            description="Price in USD",
            field_type=FieldType.FLOAT,
            rules=ValidationRule(min_value=0)
        )
    ]
)

extractor = Extractor(llm=ollama_extractor, schema=schema)

In [6]:
# Sample text
text = """
MacBook Pro 16-inch with M2 chip
Price: $2,399.99
In stock: Yes
"""

In [7]:
# Extract information
result = await extractor.extract(text)

Task: Extract structured information from the given text according to the following schema.

        Fields to extract:
        - product_name (string*): Product name
    Validation: minimum length: 2
- price (float*): Price in USD
    Validation: minimum value: 0

        Output Requirements:
        1. Extract ONLY the specified fields
        2. Follow the exact field names provided
        3. Use json format
        4. Format as a JSON object. Use null for missing values.
        5. If a required field cannot be found, use null/empty values
        6. Validate all values against provided rules
        7. For dates, use ISO format (YYYY-MM-DD)
        8. For lists, provide values in a consistent format
        9. CRITICAL: Return ONLY the json output - no explanations, comments, or additional text before or after
        10. CRITICAL: Do not include explanation of what was extracted
        11. CRITICAL: Do not include ```json tags or backticks

        Text to analyze:
        
Mac

In [12]:
result

ExtractionResult(data={'product_name': 'MacBook Pro 16-inch with M2 chip', 'price': 2399.99}, raw_response='{\n    "product_name": "MacBook Pro 16-inch with M2 chip",\n    "price": 2399.99\n}', validation_errors=[])

In [8]:
extractor.to_dataframe(result)

Unnamed: 0,product_name,price
0,,


In [8]:
import nltk

nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ASHKAN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [8]:
# Example 2: Process a PDF and extract information
# ----------------------------------------------
# Initialize document processor
processor = DocumentProcessor(["invoice_Aaron Hawkins_36652.pdf", "invoice_Aaron Smayling_35876.pdf"])

# Process the document
documents = processor.process(
    config=ProcessingConfig(
        hi_res_pdf=True
    )
)



In [9]:
documents

{'invoice_Aaron Hawkins_36652.pdf': [Document(page_content='SuperStore INVOICE # 36652 Ship To: 90004, Los Angeles, California, United States Date: Ship Mode: Balance Due: May 12 2012 Standard Class $17.15 Bill To: Aaron Hawkins Item EcoTones Memo Sheets Paper, Office Supplies, OFF-PA-4014 Quantity 2 Rate Amount $8.00 $16.00 Subtotal: Subtotal: $16.00 Shipping: Shipping: $1.15 Total: Total: $17.15 Notes: Thanks for your business! Terms: Order ID : CA-2012-AH10030140-41041', metadata={'filename': 'invoice_Aaron Hawkins_36652.pdf', 'filetype': 'application/pdf', 'page_number': 1, 'source': 'invoice_Aaron Hawkins_36652.pdf'})],
 'invoice_Aaron Smayling_35876.pdf': [Document(page_content='SuperStore INVOICE # 35876 Ship To: 10035, New York City, New York, United States Date: Ship Mode: Balance Due: Jul 26 2012 Standard Class $727.94 Bill To: Aaron Smayling Item Xerox 1956 Paper, Office Supplies, OFF-PA-6512 Quantity 11 Rate $65.78 Amount $723.58 Subtotal: Subtotal: $723.58 Shipping: Shippi

In [10]:
# Example 1: Quick text extraction
# -------------------------------
# Define what information to extract
schema = ExtractorSchema(
    fields=[
        Field(
            name="bill_to",
            description="Bill To",
            field_type=FieldType.STRING,
            rules=ValidationRule(min_length=2)
        ),
        Field(
            name="ship_to",
            description="Ship To",
            field_type=FieldType.STRING,
            rules=ValidationRule(min_length=2)
        ),
        Field(
            name="date",
            description="date",
            field_type=FieldType.DATE,
        )
        ,
        Field(
            name="ship_mode",
            description="Ship Mode",
            field_type=FieldType.STRING,
        )
        ,
        Field(
            name="item",
            description="item",
            field_type=FieldType.STRING,
        )
        ,
        Field(
            name="quantity",
            description="Quantity",
            field_type=FieldType.INTEGER,
        )
        ,
        Field(
            name="rate",
            description="price in usd",
            field_type=FieldType.FLOAT,
        )
        ,
        Field(
            name="amount",
            description="price in usd",
            field_type=FieldType.FLOAT,
        )
        ,

    ],
    output_format= OutputFormat.JSON
)

extractor = Extractor(llm=llm_extractor, schema=schema)

In [11]:
documents

{'invoice_Aaron Hawkins_36652.pdf': [Document(page_content='SuperStore INVOICE # 36652 Ship To: 90004, Los Angeles, California, United States Date: Ship Mode: Balance Due: May 12 2012 Standard Class $17.15 Bill To: Aaron Hawkins Item EcoTones Memo Sheets Paper, Office Supplies, OFF-PA-4014 Quantity 2 Rate Amount $8.00 $16.00 Subtotal: Subtotal: $16.00 Shipping: Shipping: $1.15 Total: Total: $17.15 Notes: Thanks for your business! Terms: Order ID : CA-2012-AH10030140-41041', metadata={'filename': 'invoice_Aaron Hawkins_36652.pdf', 'filetype': 'application/pdf', 'page_number': 1, 'source': 'invoice_Aaron Hawkins_36652.pdf'})],
 'invoice_Aaron Smayling_35876.pdf': [Document(page_content='SuperStore INVOICE # 35876 Ship To: 10035, New York City, New York, United States Date: Ship Mode: Balance Due: Jul 26 2012 Standard Class $727.94 Bill To: Aaron Smayling Item Xerox 1956 Paper, Office Supplies, OFF-PA-6512 Quantity 11 Rate $65.78 Amount $723.58 Subtotal: Subtotal: $723.58 Shipping: Shippi

In [12]:
results = await extractor.extract(documents)

In [13]:
results

ExtractionResults(combined_data=[{'bill_to': 'Aaron Hawkins', 'ship_to': '90004, Los Angeles, California, United States', 'date': '2012-05-12', 'ship_mode': 'Standard Class', 'item': 'EcoTones Memo Sheets Paper, Office Supplies, OFF-PA-4014', 'quantity': 2, 'rate': 8.0, 'amount': 16.0}, {'bill_to': 'Aaron Smayling', 'ship_to': '10035, New York City, New York, United States', 'date': '2012-07-26', 'ship_mode': 'Standard Class', 'item': 'Xerox 1956 Paper, Office Supplies, OFF-PA-6512', 'quantity': 11, 'rate': 65.78, 'amount': 723.58}], raw_responses=['```json\n{\n    "bill_to": "Aaron Hawkins",\n    "ship_to": "90004, Los Angeles, California, United States",\n    "date": "2012-05-12",\n    "ship_mode": "Standard Class",\n    "item": "EcoTones Memo Sheets Paper, Office Supplies, OFF-PA-4014",\n    "quantity": 2,\n    "rate": 8.00,\n    "amount": 16.00\n}\n```', '```json\n{\n    "bill_to": "Aaron Smayling",\n    "ship_to": "10035, New York City, New York, United States",\n    "date": "2012

In [14]:
# Get all valid results
valid_data = results.get_valid_results()

# Check if any chunks had validation errors
if not results.is_valid:
    for chunk_idx, errors in results.validation_errors.items():
        print(f"Chunk {chunk_idx} has errors: {errors}")

In [15]:
valid_data

[{'bill_to': 'Aaron Hawkins',
  'ship_to': '90004, Los Angeles, California, United States',
  'date': '2012-05-12',
  'ship_mode': 'Standard Class',
  'item': 'EcoTones Memo Sheets Paper, Office Supplies, OFF-PA-4014',
  'quantity': 2,
  'rate': 8.0,
  'amount': 16.0},
 {'bill_to': 'Aaron Smayling',
  'ship_to': '10035, New York City, New York, United States',
  'date': '2012-07-26',
  'ship_mode': 'Standard Class',
  'item': 'Xerox 1956 Paper, Office Supplies, OFF-PA-6512',
  'quantity': 11,
  'rate': 65.78,
  'amount': 723.58}]

In [16]:
# Display results in a pandas DataFrame for better visualization
import pandas as pd

df = pd.DataFrame(valid_data)

In [17]:
df

Unnamed: 0,bill_to,ship_to,date,ship_mode,item,quantity,rate,amount
0,Aaron Hawkins,"90004, Los Angeles, California, United States",2012-05-12,Standard Class,"EcoTones Memo Sheets Paper, Office Supplies, O...",2,8.0,16.0
1,Aaron Smayling,"10035, New York City, New York, United States",2012-07-26,Standard Class,"Xerox 1956 Paper, Office Supplies, OFF-PA-6512",11,65.78,723.58


In [18]:
schema = ExtractorSchema(
    fields=[
        Field(
            name="amount",
            description="The quantity or hours of service/product (e.g., 2.25h, 40.3h)",
            field_type=FieldType.FLOAT,
        ),
        Field(
            name="description",
            description="Description of the service or product provided",
            field_type=FieldType.STRING,
        ),
        Field(
            name="price_per_unit",
            description="Price per unit in euro (e.g., 135.00)",
            field_type=FieldType.FLOAT,
        ),
        Field(
            name="total_price",
            description="Total price for this line item in euro (amount * price_per_unit)",
            field_type=FieldType.FLOAT,        ),
        Field(
            name="invoice_id",
            description="id of invoice",
            field_type=FieldType.INTEGER
        ),
        Field(
            name="tax_number",
            description="tax number",
            field_type=FieldType.INTEGER
        )
    ],
)

# Create the extractor
extractor = Extractor(llm=llm_extractor, schema=schema)



In [19]:
text = """
Denk Timo Denk Some Street 82 10000 Berlin, Germany Development Recipient Name Invoice ID; 4736 Another Avenue 18 Issue date: August 9, 2019 12345 Berlin Tax number: 35061/00029 Germany Invoice Item Amount Description Price per unit Total price 2.25 h Development of in project Character Grid 135.00 € 301.50 € 40.3h Research on 2D embedding 145.00 € 5,843.50 € 8.85 h Research on ID embedding 145.00 € 283.25 € Shipping 9.99 € 9.99 € Sum 7,438.24 € Total price 7,438.24 € Date of delivery service provision: July through August 2019 This invoice has no turnover tax due to Kleinunternehmerregelung according to $ 19 UStG_ Bank account Timo Denk IBAN DEO2 1421 1422 7293 1738 99 BIC DJENKSUXXX Transfer details Denk Development Invoice 4736 Timo Denk Email-address development @timodenkcom Website_ development timodenkcom
"""

In [20]:
resultss = await extractor.extract(text)

In [21]:
resultss.data

{'items': [{'amount': 2.25,
   'description': 'Development of in project Character Grid',
   'price_per_unit': 135.0,
   'total_price': 303.75,
   'invoice_id': 4736,
   'tax_number': 3506100029},
  {'amount': 40.3,
   'description': 'Research on 2D embedding',
   'price_per_unit': 145.0,
   'total_price': 5843.5,
   'invoice_id': 4736,
   'tax_number': 3506100029},
  {'amount': 8.85,
   'description': 'Research on ID embedding',
   'price_per_unit': 145.0,
   'total_price': 1287.25,
   'invoice_id': 4736,
   'tax_number': 3506100029},
  {'amount': None,
   'description': 'Shipping',
   'price_per_unit': 9.99,
   'total_price': 9.99,
   'invoice_id': 4736,
   'tax_number': 3506100029}]}

In [22]:
extractor.to_dataframe(resultss)

Unnamed: 0,amount,description,price_per_unit,total_price,invoice_id,tax_number
0,2.25,Development of in project Character Grid,135.0,303.75,4736,3506100029
1,40.3,Research on 2D embedding,145.0,5843.5,4736,3506100029
2,8.85,Research on ID embedding,145.0,1287.25,4736,3506100029
3,,Shipping,9.99,9.99,4736,3506100029


In [24]:
passport_processor = DocumentProcessor(["passport1.jpg", "passport2.jpg"])

# Process the document
doc_passport = passport_processor.process()
doc_passport

{'passport2.jpg': [Document(page_content='eed STATES OF 160 * PASSPORT CARD x jas ality. See “© Surname ee seed Names HAPPY Passport Card nos <€03005988 EXEMPLAR *** = Sex Date of Birth: M1 JAN 1981 Place of Birth NEWYORK, U.S.A. a Expires On. “at SNOV 2009.29 NOV2 BTATES DEPARTMENT OF STARE? he D', metadata={'filename': 'passport2.jpg', 'filetype': 'image/jpeg', 'page_number': 1, 'source': 'passport2.jpg'})],
 'passport1.jpg': [Document(page_content='PASPO PASSEPORT athe! Type jos hee Orca de FEtwt ne 1 # cenettour REG,,.. CHERUBIN Nea) Priam NIRKA Maun bs reve habongate HAITIENNE Dat bs 41) Date Oe mpistonce 6 MAI 1962 fi casper gasen,/ Seer FEMININ Cat pseps > ttt Dots emission 17 DECEMBRE, 1992 fH paepo » tn Dots Sespirotica 16 DECEMBRE 1997 HTI SPECIMEN AYITI? HAITI PASPD elerean FASSERIRT No 920000018 Kole fit Lee ee reccernes PORT-AU-PRINCE : hi SIYATI MET PASP0 & MSISNATURE DU TITULAIRE 920000018', metadata={'filename': 'passport1.jpg', 'filetype': 'image/jpeg', 'page_number': 

In [11]:
schema = ExtractorSchema(
    fields=[
        Field(
            name="passport_number",
            description="The unique identification number of the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="full_name",
            description="Full name of the passport holder as shown in the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="date_of_birth",
            description="Date of birth of the passport holder (e.g., 1980-01-01)",
            field_type=FieldType.DATE,
        ),
        Field(
            name="place_of_birth",
            description="Place of birth of the passport holder as mentioned in the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="nationality",
            description="Nationality of the passport holder",
            field_type=FieldType.STRING,
        ),
        Field(
            name="date_of_issue",
            description="Date when the passport was issued (e.g., 2020-05-15)",
            field_type=FieldType.DATE,
        ),
        Field(
            name="date_of_expiry",
            description="Expiry date of the passport (e.g., 2030-05-15)",
            field_type=FieldType.DATE,
        ),
        Field(
            name="issuing_country",
            description="Country that issued the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="sex",
            description="Sex of the passport holder (e.g., M, F, X)",
            field_type=FieldType.STRING,
        ),
        Field(
            name="personal_number",
            description="Additional identification number or personal number if present",
            field_type=FieldType.STRING,
        ),
    ],
)


In [12]:
extractor = Extractor(llm=llm_extractor, schema=schema)
resultss = await extractor.extract(results)


In [15]:
extractor.to_dataframe(resultss)

Unnamed: 0,passport_number,full_name,date_of_birth,place_of_birth,nationality,date_of_issue,date_of_expiry,issuing_country,sex,personal_number
0,1-02781-0,CHERUBIN HAPPY,1992-12-17,,USA,,,UNITED STATES,M,920000018.0
1,920000018,HAPPY CHERUBIN KELLI,1962-05-06,PORT-AU-PRINCE,HAITIENNE,,,,,


In [6]:
!pip install paddlepaddle paddleocr


Collecting paddlepaddle
  Using cached paddlepaddle-2.6.2-cp312-cp312-win_amd64.whl.metadata (8.7 kB)
Collecting astor (from paddlepaddle)
  Using cached astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting protobuf<=3.20.2,>=3.1.0 (from paddlepaddle)
  Using cached protobuf-3.20.2-py2.py3-none-any.whl.metadata (720 bytes)
Using cached paddlepaddle-2.6.2-cp312-cp312-win_amd64.whl (81.1 MB)
Using cached protobuf-3.20.2-py2.py3-none-any.whl (162 kB)
Using cached astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: protobuf, astor, paddlepaddle
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.27.3
    Uninstalling protobuf-5.27.3:
      Successfully uninstalled protobuf-5.27.3
Successfully installed astor-0.8.1 paddlepaddle-2.6.2 protobuf-3.20.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-health-checking 1.65.4 requires grpcio>=1.65.4, but you have grpcio 1.63.0 which is incompatible.
grpcio-health-checking 1.65.4 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 3.20.2 which is incompatible.
grpcio-status 1.62.2 requires protobuf>=4.21.6, but you have protobuf 3.20.2 which is incompatible.
grpcio-tools 1.65.4 requires grpcio>=1.65.4, but you have grpcio 1.63.0 which is incompatible.
grpcio-tools 1.65.4 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 3.20.2 which is incompatible.
tensorflow-intel 2.16.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.

[notice] A new release of pip is available: 24.1.1 -> 24.3.1
[notice] To update, run: C:\Users\ASHKAN\AppData\Local\Programs\Python\P

In [9]:
# Initialize processor with config
config = ProcessingConfig(
    ocr_for_images=True,
    ocr_model='paddle'  # or 'tesseract'
)

# Create processor instance
processor = DocumentProcessor(["passport1.jpg", "passport2.jpg"])

# Process documents
results = processor.process(config)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to C:\Users\ASHKAN/.paddleocr/whl\det\en\en_PP-OCRv3_det_infer\en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:27<00:00, 143.46it/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to C:\Users\ASHKAN/.paddleocr/whl\rec\en\en_PP-OCRv4_rec_infer\en_PP-OCRv4_rec_infer.tar


100%|██████████| 10000/10000 [00:44<00:00, 227.20it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to C:\Users\ASHKAN/.paddleocr/whl\cls\ch_ppocr_mobile_v2.0_cls_infer\ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:18<00:00, 116.69it/s]


[2024/10/28 19:54:26] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ASHKAN/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ASHKAN/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_nu

In [10]:
results

{'passport2.jpg': [Document(page_content='KelLi c Dt Dte d *** 16DECEM8RE1997 Nationality USA 920000018 Surname Maun h rNsbnyit CHERUBIN Given Names 3 HAPPY Sex Date of Birth M 17DECE8RE1992 wie 920000018 HAISA SISNATURE D TITULAIRE ssued On Expires On FASSEPORT NO FASPONm HAITIENNE 1-02781-0 AICUNITEDSTATES.DEPARTMENT OF STATE', metadata={'filename': 'passport2.jpg', 'filetype': 'image/jpeg', 'page_number': 1, 'source': 'passport2.jpg'})],
 'passport1.jpg': [Document(page_content='Given Names HAPPY Expires On CrEt FASPONm 1-02781-0 ssued On Nationality FASSEPORT NO HAISA *** 920000018 CHERUBIN KelLi c Surname PORT-AU-PRINCE Maun h rNsbnyit HAITIENNE Dt Dte d 6MAI1962 y gn/S Date of Birth Cat pp  tDt demissin SIYATI METPASOA 17DECE8RE1992 SISNATURE D TITULAIRE C sponDos dprolicn 16DECEM8RE1997 920000018 SPECIMEN', metadata={'filename': 'passport1.jpg', 'filetype': 'image/jpeg', 'page_number': 1, 'source': 'passport1.jpg'})]}