In [17]:
import pdfplumber


# Location of the sample invoice, relative to the notebook's working directory.
pdf_path = "../Foresango USD Sample Invoice.pdf"

# Collect the text of every page and join the pages with newlines; a page whose
# extract_text() result is falsy contributes an empty string instead.
with pdfplumber.open(pdf_path) as pdf:
    text = "\n".join([page.extract_text() or '' for page in pdf.pages])
    print(text)

FORESANGO INVESTMENTS (Pvt) Ltd
3247 CMED Complex
Cnr Rekayi Tangwena Ave
Workington,Harare
Fiscal Tax Invoice
Date 17/10/2024
Document No. FUR57
VAT No. 220089190
TIN No. 2001365880
CLIENT DETAILS
Registered Name Lazio
T/A Mashwede Investments
Address, Street 8Ferguson,Greendale
City, Province Harare
Contact No. 0772754660
Customer E-mail Lazio@gmail.co.zw
Customer TIN. 2000181693
Customer VAT. 220572879
Description Qty Unit Price Excl Total VAT Excl VAT
Coke Can 2 1.50 3.00 15%
Fanta 500Ml 3 2.00 6.00 0%
Sprite 2 1.00 2.00 15%
Banking Details Subtotal Excl VAT USD 11.00
Account Name: Foresango Investments
VAT Total USD 0.75
Bank: CBZ Bank
Acc No: 26107970011
Bank Code: 011 Total Incl VAT USD 11.75
Branch: Kwame Nkrumah


In [None]:
import re
from datetime import datetime

# One invoice item row: "<description> <qty> <unit price> <line total> <VAT %>".
# - The description is matched lazily with "." so punctuation ("Coca-Cola",
#   "Bread (Loaf)") no longer causes the whole row to be dropped.
# - Amounts may carry thousands separators ("1,250.00").
# - The VAT rate may be fractional ("14.5%").
_ITEM_ROW_RE = re.compile(
    r'(.+?)\s+(\d+)\s+(\d[\d,]*\.\d{2})\s+(\d[\d,]*\.\d{2})\s+(\d+(?:\.\d+)?%)'
)


def _parse_receipt_lines(pdf_text):
    """Return the item rows found in ``pdf_text`` as a list of dicts.

    Each dict has the keys ``"Description"`` (str), ``"Qty"`` (int),
    ``"Unit Price"`` (float) and ``"VAT (%)"`` (str, e.g. ``"15%"``).
    Lines that do not look like an item row are ignored.
    """
    items = []
    # Split at every newline that is followed by a non-blank character, so a
    # row beginning with punctuation (e.g. "(Promo) ...") becomes its own
    # candidate instead of being glued to the previous line. Lines that start
    # with whitespace stay attached to the line before them.
    for line in re.split(r'\n(?=\S)', pdf_text):
        line = line.strip()
        # Skip the table header row (column names).
        if re.match(r'^(Description|Qty|Unit Price|Excl Total VAT)', line):
            continue

        match = _ITEM_ROW_RE.match(line)
        if match:
            items.append({
                "Description": match[1].strip(),
                "Qty": int(match[2]),
                # Drop thousands separators before converting to float.
                "Unit Price": float(match[3].replace(',', '')),
                "VAT (%)": match[5],
            })
    return items


def extract_data(pdf_text):
    """Build a receipt payload dict from the plain text of an invoice PDF.

    Args:
        pdf_text: Full text of the invoice as a single string, pages joined
            by newlines.

    Returns:
        A dict with the keys ``deviceID``, ``receiptType``,
        ``receiptCurrency``, ``receiptCounter``, ``receiptGlobalNo``,
        ``invoiceNo``, ``receiptDate`` and ``receiptLines``. Fields that
        cannot be found in the text are ``None``; ``receiptLines`` is an
        empty list when no item rows are recognised.

    Raises:
        ValueError: If a ``dd/mm/yyyy`` date is found but is not a valid
            calendar date.
    """
    # Identify document type (case-insensitive; first match wins).
    document_type = None
    if re.search(r'(?i)(Fiscal Tax Invoice)', pdf_text):
        document_type = "FISCALINVOICE"
    elif re.search(r'(?i)(Credit Note)', pdf_text):
        document_type = "CREDITNOTE"
    elif re.search(r'(?i)(Debit Note)', pdf_text):
        document_type = "DEBITNOTE"

    # Extract Document No. (first run of non-whitespace after the label).
    document_no = re.search(r'Document No\.\s*(\S+)', pdf_text)
    document_no = document_no.group(1) if document_no else None

    # Extract the invoice date. The invoice carries no time of day, so the
    # current wall-clock time is attached to produce a full timestamp.
    date_match = re.search(r'Date\s+(\d{2}/\d{2}/\d{4})', pdf_text)
    formatted_date = None
    if date_match:
        extracted_date = datetime.strptime(date_match.group(1), '%d/%m/%Y')
        combined_datetime = datetime.combine(
            extracted_date.date(), datetime.now().time()
        )
        formatted_date = combined_datetime.strftime('%Y-%m-%dT%H:%M:%S')

    # Extract currency (USD takes precedence when both codes appear).
    currency = None
    if re.search(r'\bUSD\b', pdf_text):
        currency = "USD"
    elif re.search(r'\bZWG\b', pdf_text):
        currency = "ZWG"

    return {
        "deviceID": 19250,
        "receiptType": document_type,
        "receiptCurrency": currency,
        "receiptCounter": 2,  # Placeholder for dynamic value
        "receiptGlobalNo": 2,  # Placeholder for dynamic value
        "invoiceNo": document_no,
        "receiptDate": formatted_date,
        "receiptLines": _parse_receipt_lines(pdf_text),
        #"receiptPayments": [{"moneyTypeCode": 1, "paymentAmount": total_payment}]
    }



# Run the extractor on the invoice text and print the resulting payload
receipt_payload = extract_data(text)
print(receipt_payload)


{'deviceID': 19250, 'receiptType': 'FISCALINVOICE', 'receiptCurrency': 'USD', 'receiptCounter': 2, 'receiptGlobalNo': 2, 'invoiceNo': 'FUR57', 'receiptDate': '2024-10-17T11:47:21', 'receiptLines': [{'Description': 'Coke Can', 'Qty': 2, 'Unit Price': 1.5, 'VAT (%)': '15%'}, {'Description': 'Fanta 500Ml', 'Qty': 3, 'Unit Price': 2.0, 'VAT (%)': '0%'}, {'Description': 'Sprite', 'Qty': 2, 'Unit Price': 1.0, 'VAT (%)': '15%'}]}
