In [1]:
import sys
import os
sys.path.append(os.path.abspath(r'E:\Codes\IndoxMiner'))



In [2]:

from indoxMiner import (
    ExtractorSchema,
    Field,
    FieldType,
    ValidationRule,
    OutputFormat,
    Extractor,
    DocumentProcessor,
    ProcessingConfig, 
    AsyncIndoxApi,
    AsyncOpenAi,
    OpenAi,
    IndoxApi
)


In [3]:
# Set your OpenAI API key
import os
from dotenv import load_dotenv

load_dotenv()
INDOX_API_KEY = os.environ['INDOX_API_KEY']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [4]:
# ollama_extractor = Ollama(model="llama3")

In [5]:
indox_api_extractor = IndoxApi(api_key=INDOX_API_KEY)

In [6]:
openai_async_extractor = AsyncOpenAi(api_key=OPENAI_API_KEY, model="gpt-4o-mini")
openai_extractor = OpenAi(api_key=OPENAI_API_KEY, model="gpt-4o-mini")

In [7]:
# Example 1: Quick text extraction
# -------------------------------
# Define what information to extract
schema = ExtractorSchema(
    fields=[
        Field(
            name="product_name",
            description="Product name",
            field_type=FieldType.STRING,
            rules=ValidationRule(min_length=2)
        ),
        Field(
            name="price",
            description="Price in USD",
            field_type=FieldType.FLOAT,
            rules=ValidationRule(min_value=0)
        ),
        Field(
            name="in_stock",
            description="Availability status of the product (Yes/No)",
            field_type=FieldType.STRING,
            rules=ValidationRule(allowed_values=["Yes", "No"])  # Ensures only "Yes" or "No" is valid
        )
    ]
)


extractor_async = Extractor(llm=openai_async_extractor, schema=schema)
extractor = Extractor(llm=indox_api_extractor, schema=schema) 

In [8]:
# Sample text
text = """
MacBook Pro 16-inch with M2 chip
Price: $2,399.99
In stock: Yes
"""

In [9]:
result = extractor.extract(text)

In [10]:
result

ExtractionResult(data={'items': [{'product_name': 'MacBook Pro 16-inch with M2 chip', 'price': 2399.99, 'in_stock': 'Yes'}]}, raw_response='{\n    "product_name": "MacBook Pro 16-inch with M2 chip",\n    "price": 2399.99,\n    "in_stock": "Yes"\n}', validation_errors=[])

In [11]:
extractor.to_dataframe(result)

Unnamed: 0,product_name,price,in_stock
0,MacBook Pro 16-inch with M2 chip,2399.99,Yes


In [14]:
# # Example 2: Process a PDF and extract information
# # ----------------------------------------------
# # Initialize document processor
# processor = DocumentProcessor(["data/invoice_Aaron Hawkins_36652.pdf", "data/invoice_Aaron Smayling_35876.pdf"])

# # Process the document
# documents = processor.process(
#     config=ProcessingConfig(
#         hi_res_pdf=True
#     )
# )



In [15]:
documents

NameError: name 'documents' is not defined

In [16]:
# Example 1: Quick text extraction
# -------------------------------
# Define what information to extract
schema = ExtractorSchema(
    fields=[
        Field(
            name="bill_to",
            description="Bill To",
            field_type=FieldType.STRING,
            rules=ValidationRule(min_length=2)
        ),
        Field(
            name="ship_to",
            description="Ship To",
            field_type=FieldType.STRING,
            rules=ValidationRule(min_length=2)
        ),
        Field(
            name="date",
            description="date",
            field_type=FieldType.DATE,
        )
        ,
        Field(
            name="ship_mode",
            description="Ship Mode",
            field_type=FieldType.STRING,
        )
        ,
        Field(
            name="item",
            description="item",
            field_type=FieldType.STRING,
        )
        ,
        Field(
            name="quantity",
            description="Quantity",
            field_type=FieldType.INTEGER,
        )
        ,
        Field(
            name="rate",
            description="price in usd",
            field_type=FieldType.FLOAT,
        )
        ,
        Field(
            name="amount",
            description="price in usd",
            field_type=FieldType.FLOAT,
        )
        ,

    ],
    output_format= OutputFormat.JSON
)

extractor = Extractor(llm=indox_api_extractor, schema=schema)

In [17]:
results = await extractor.extract(documents)

In [18]:
results

ExtractionResults(data=[{'items': [{'bill_to': 'Aaron Smayling', 'ship_to': '10035, New York City, New York, United States', 'date': '2012-07-26', 'ship_mode': 'Standard Class', 'item': 'Xerox 1956 Paper, Office Supplies, OFF-PA-6512', 'quantity': 11, 'rate': 65.78, 'amount': 723.58}]}, {'items': [{'bill_to': 'Aaron Hawkins', 'ship_to': '90004, Los Angeles, California, United States', 'date': '2012-05-12', 'ship_mode': 'Standard Class', 'item': 'EcoTones Memo Sheets Paper, Office Supplies, OFF-PA-4014', 'quantity': 2, 'rate': 8.0, 'amount': 16.0}]}], raw_responses=['{\n    "bill_to": "Aaron Smayling",\n    "ship_to": "10035, New York City, New York, United States",\n    "date": "2012-07-26",\n    "ship_mode": "Standard Class",\n    "item": "Xerox 1956 Paper, Office Supplies, OFF-PA-6512",\n    "quantity": 11,\n    "rate": 65.78,\n    "amount": 723.58\n}', '{\n    "bill_to": "Aaron Hawkins",\n    "ship_to": "90004, Los Angeles, California, United States",\n    "date": "2012-05-12",\n   

In [19]:
results.data

[{'items': [{'bill_to': 'Aaron Smayling',
    'ship_to': '10035, New York City, New York, United States',
    'date': '2012-07-26',
    'ship_mode': 'Standard Class',
    'item': 'Xerox 1956 Paper, Office Supplies, OFF-PA-6512',
    'quantity': 11,
    'rate': 65.78,
    'amount': 723.58}]},
 {'items': [{'bill_to': 'Aaron Hawkins',
    'ship_to': '90004, Los Angeles, California, United States',
    'date': '2012-05-12',
    'ship_mode': 'Standard Class',
    'item': 'EcoTones Memo Sheets Paper, Office Supplies, OFF-PA-4014',
    'quantity': 2,
    'rate': 8.0,
    'amount': 16.0}]}]

In [20]:
df = extractor.to_dataframe(results)
df

Unnamed: 0,bill_to,ship_to,date,ship_mode,item,quantity,rate,amount
0,Aaron Smayling,"10035, New York City, New York, United States",2012-07-26,Standard Class,"Xerox 1956 Paper, Office Supplies, OFF-PA-6512",11,65.78,723.58
1,Aaron Hawkins,"90004, Los Angeles, California, United States",2012-05-12,Standard Class,"EcoTones Memo Sheets Paper, Office Supplies, O...",2,8.0,16.0


In [21]:
results.validation_errors

{}

In [22]:
# Get all valid results
valid_data = results.get_valid_results()

# Check if any chunks had validation errors
if not results.is_valid:
    for chunk_idx, errors in results.validation_errors.items():
        print(f"Chunk {chunk_idx} has errors: {errors}")

In [16]:
from indoxMiner import Schema
schema = ExtractorSchema(
    fields=[
        Field(
            name="amount",
            description="The quantity or hours of service/product (e.g., 2.25, 40.3)",
            field_type=FieldType.FLOAT,
        ),
        Field(
            name="description",
            description="Description of the service or product provided",
            field_type=FieldType.STRING,
        ),
        Field(
            name="price_per_unit",
            description="Price per unit in euro (e.g., 135.00)",
            field_type=FieldType.FLOAT,
        ),
        Field(
            name="total_price",
            description="Total price for this line item in euro (amount * price_per_unit)",
            field_type=FieldType.FLOAT,
        ),
        Field(
            name="invoice_id",
            description="ID of the invoice",
            field_type=FieldType.INTEGER,
        ),
    ],
)


# Create the extractor
extractor = Extractor(llm=openai_async_extractor, schema=schema)
extractor_sync = Extractor(llm=openai_extractor,schema=schema)


In [17]:
text = """
Denk Timo Denk Some Street 82 10000 Berlin, Germany Development Recipient Name Invoice ID; 4736 Another Avenue 18 Issue date: August 9, 2019 12345 Berlin Tax number: 35061/00029 Germany Invoice Item Amount Description Price per unit Total price 2.25 h Development of in project Character Grid 135.00 € 301.50 € 40.3h Research on 2D embedding 145.00 € 5,843.50 € 8.85 h Research on ID embedding 145.00 € 283.25 € Shipping 9.99 € 9.99 € Sum 7,438.24 € Total price 7,438.24 € Date of delivery service provision: July through August 2019 This invoice has no turnover tax due to Kleinunternehmerregelung according to $ 19 UStG_ Bank account Timo Denk IBAN DEO2 1421 1422 7293 1738 99 BIC DJENKSUXXX Transfer details Denk Development Invoice 4736 Timo Denk Email-address development @timodenkcom Website_ development timodenkcom
"""

In [18]:
result_sync = extractor_sync.extract(text)
resultss = await extractor.extract(text)

In [19]:
result_sync.data

{'items': [{'amount': 2.25,
   'description': 'Development of in project Character Grid',
   'price_per_unit': 135.0,
   'total_price': 301.5,
   'invoice_id': 4736},
  {'amount': 40.3,
   'description': 'Research on 2D embedding',
   'price_per_unit': 145.0,
   'total_price': 5834.5,
   'invoice_id': 4736},
  {'amount': 8.85,
   'description': 'Research on ID embedding',
   'price_per_unit': 145.0,
   'total_price': 283.25,
   'invoice_id': 4736},
  {'amount': None,
   'description': 'Shipping',
   'price_per_unit': 9.99,
   'total_price': 9.99,
   'invoice_id': 4736}]}

In [20]:
resultss.data

{'items': [{'amount': 2.25,
   'description': 'Development of in project Character Grid',
   'price_per_unit': 135.0,
   'total_price': 301.5,
   'invoice_id': 4736},
  {'amount': 40.3,
   'description': 'Research on 2D embedding',
   'price_per_unit': 145.0,
   'total_price': 5843.5,
   'invoice_id': 4736},
  {'amount': 8.85,
   'description': 'Research on ID embedding',
   'price_per_unit': 145.0,
   'total_price': 283.25,
   'invoice_id': 4736},
  {'amount': None,
   'description': 'Shipping',
   'price_per_unit': 9.99,
   'total_price': 9.99,
   'invoice_id': 4736}]}

In [21]:
extractor.to_dataframe(resultss)

Unnamed: 0,amount,description,price_per_unit,total_price,invoice_id
0,2.25,Development of in project Character Grid,135.0,301.5,4736
1,40.3,Research on 2D embedding,145.0,5843.5,4736
2,8.85,Research on ID embedding,145.0,283.25,4736
3,,Shipping,9.99,9.99,4736


In [10]:
passport_processor = DocumentProcessor(["data/passport1.jpg", "data/passport2.jpg"])

# Process the document
doc_passport = passport_processor.process()
doc_passport

Error processing data/passport2.jpg: cannot import name 'TextRegions' from 'unstructured_inference.inference.elements' (c:\Users\ASHKAN\AppData\Local\Programs\Python\Python312\Lib\site-packages\unstructured_inference\inference\elements.py)
Error processing data/passport1.jpg: cannot import name 'TextRegions' from 'unstructured_inference.inference.elements' (c:\Users\ASHKAN\AppData\Local\Programs\Python\Python312\Lib\site-packages\unstructured_inference\inference\elements.py)


{'passport2.jpg': [], 'passport1.jpg': []}

In [22]:
schema = ExtractorSchema(
    fields=[
        Field(
            name="passport_number",
            description="The unique identification number of the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="full_name",
            description="Full name of the passport holder as shown in the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="date_of_birth",
            description="Date of birth of the passport holder (e.g., 1980-01-01)",
            field_type=FieldType.DATE,
        ),
        Field(
            name="place_of_birth",
            description="Place of birth of the passport holder as mentioned in the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="nationality",
            description="Nationality of the passport holder",
            field_type=FieldType.STRING,
        ),
        Field(
            name="date_of_issue",
            description="Date when the passport was issued (e.g., 2020-05-15)",
            field_type=FieldType.DATE,
        ),
        Field(
            name="date_of_expiry",
            description="Expiry date of the passport (e.g., 2030-05-15)",
            field_type=FieldType.DATE,
        ),
        Field(
            name="issuing_country",
            description="Country that issued the passport",
            field_type=FieldType.STRING,
        ),
        Field(
            name="sex",
            description="Sex of the passport holder (e.g., M, F, X)",
            field_type=FieldType.STRING,
        ),
        Field(
            name="personal_number",
            description="Additional identification number or personal number if present",
            field_type=FieldType.STRING,
        ),
    ],
)


In [23]:
extractor = Extractor(llm=openai_extractor, schema=schema)
result = extractor.extract(doc_passport)


In [24]:
extractor.to_dataframe(result)

Unnamed: 0,passport_number,full_name,date_of_birth,place_of_birth,nationality,date_of_issue,date_of_expiry,issuing_country,sex,personal_number
0,3005988,HAPPY,1981-01-01,"NEWYORK, U.S.A.",,,2009-11-29,STATES DEPARTMENT OF STARE,M,
1,920000018,CHERUBIN Nea Priam NIRKA,1962-05-06,,HAITIENNE,1992-12-17,1997-12-16,HAITI,F,


In [23]:
# Initialize processor with config
config = ProcessingConfig(
    ocr_for_images=True,
    ocr_model='paddle'  # or 'tesseract'
)

# Create processor instance
processor = DocumentProcessor(["data/passport1.jpg", "data/passport2.jpg"])

# Process documents
results = processor.process(config)

[2024/11/02 17:56:11] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ASHKAN/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ASHKAN/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_nu

In [24]:
extractor = Extractor(llm=openai_async_extractor, schema=schema)
result = await extractor.extract(results)


In [25]:
df = extractor.to_dataframe(result)
df

Unnamed: 0,passport_number,full_name,date_of_birth,place_of_birth,nationality,date_of_issue,date_of_expiry,issuing_country,sex,personal_number
0,C03005988,Hok k. NIRKA,,HAISA,USA,2009-11-30,2011-11-29,UNITED STATES,M,
1,HTI 920000018,CHERUBIN Nee/Prian KelLi c NIRKA,1962-05-06,PORT-AU-PRINCE,HAITIENNE,1997-12-16,1992-12-17,AYITI/HAITI,F,
