In [5]:
!pip install pdfplumber




In [6]:
import zipfile
import os

# Path to the ZIP file in Colab (for example, from your Google Drive)
zip_file_path = '/content/sample_data/zolvit.zip'  # Replace with your actual ZIP file path

# Directory where you want to extract the contents
extract_to_dir = '/content/extracted_folder/'  # Replace with your desired extraction path

# Create the directory if it doesn't exist
os.makedirs(extract_to_dir, exist_ok=True)

# Extract the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_dir)

print(f"Files extracted to: {extract_to_dir}")


Files extracted to: /content/extracted_folder/


In [7]:
import os
import csv
import re
import pdfplumber

# Directory containing your PDFs
pdf_directory = '/content/extracted_folder'
output_csv = 'invoice_data_2.csv'

def extract_invoice_data(pdf_path):
    extracted_data = ""

    # Use pdfplumber to read PDF
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted_data += page.extract_text() + "\n"  # Extract text from each page

    # Patterns for extracting invoice details
    patterns = {
        "invoice_number": r"Invoice #:\s*(INV-\d+)",  # Invoice number
        "invoice_date": r"Invoice Date:\s*(\d{2}\s[A-Za-z]{3}\s\d{4})",  # Invoice date
        "due_date": r"Due Date:\s*(\d{2}\s[A-Za-z]{3}\s\d{4})",  # Due date
        "taxable_amount": r"Taxable Amount ₹([\d,.]+)",  # Taxable amount
        "cgst": r"CGST\s+\d+\.\d+\%\s+₹([\d,.]+)",  # CGST amount
        "sgst": r"SGST\s+\d+\.\d+\%\s+₹([\d,.]+)",  # SGST amount
        "total_amount": r"Total ₹([\d,.]+)",  # Total amount
        "total_discount": r"Total Discount ₹([\d,.]+)",  # Total discount
        "total_items_qty": r"Total Items / Qty\s*:\s*(\d+)\s*/\s*(\d+)",  # Total items / qty
        "amount_in_words": r"Total amount \(in words\):\s*(.*)",  # Amount in words
    }

    invoice_data = {}

    # Extract key fields using the defined patterns
    for key, pattern in patterns.items():
        match = re.search(pattern, extracted_data, re.MULTILINE)
        if match:
            invoice_data[key] = match.group(1).strip()

        else:
            # Set default values for CGST and SGST to 0 if not found
            if key in ["cgst", "sgst"]:
                invoice_data[key] = 0
            else:
                invoice_data[key] = None


    # Regex pattern for customer details
    customer_details_pattern = r"Customer Details:\s*(.*?)\n"

    # Apply the pattern to the text
    match = re.search(customer_details_pattern, extracted_data, re.DOTALL)

    if match:
        customer_details_line = match.group(1).strip()  # Capture the customer details line

        # Check if the captured customer name is "Shipping Address:"
               # Check if the captured customer details are "Shipping Address:" or "Billing Address:"
        if customer_details_line == "Shipping Address:" or customer_details_line == "Billing Address:":
            # Create a regex pattern to match either "Shipping Address:" or "Billing Address:"
            address_pattern = r"(Shipping Address:|Billing Address:)\s*(\S+\s+\S+)"
            address_match = re.search(address_pattern, extracted_data)
            if address_match:
                customer_name = address_match.group(2)  # Get the next two words after either address
            else:
                customer_name = ""  # Default to empty if not found
        else:
            customer_name = customer_details_line  # Otherwise, use the captured name

        invoice_data["customer_name"] = customer_name
        invoice_data["shipping_address"] = ""  # Assuming you don't need to extract shipping address





    return invoice_data

# Function to process all PDF files in the directory
def process_all_pdfs(pdf_directory):
    extracted_data_list = []

    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_directory, filename)
            print(f"Processing {filename}...")

            # Extract invoice data from the PDF
            invoice_data = extract_invoice_data(pdf_path)

            # Extract invoice number from the filename (if not found in text)
            if not invoice_data.get("invoice_number"):
                invoice_data["invoice_number"] = extract_invoice_number_from_filename(filename)

            extracted_data_list.append(invoice_data)

    return extracted_data_list

# Function to save extracted data to a CSV file
def save_data_to_csv(extracted_data_list, output_csv):
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([  # CSV header
            "Invoice Number", "Invoice Date", "Due Date", "Customer Name",
             "Taxable Amount", "CGST", "SGST", "Total Amount",
            "Total Discount", "Total Items / Qty", "Amount in Words"
        ])

        for data in extracted_data_list:
            writer.writerow([
                data["invoice_number"],
                data["invoice_date"],
                data["due_date"],
                data["customer_name"],
                data["taxable_amount"],
                data["cgst"],
                data["sgst"],
                data["total_amount"],
                data["total_discount"],
                data["total_items_qty"],
                data["amount_in_words"],
            ])

# Function to calculate and print accuracy for each column
def print_accuracy(extracted_data_list):
    total_invoices = len(extracted_data_list)
    column_names = [
        "invoice_number", "invoice_date", "due_date", "customer_name",
       "taxable_amount", "cgst", "sgst", "total_amount",
        "total_discount", "total_items_qty", "amount_in_words"
    ]

    print("\nAccuracy of each column:")
    for column in column_names:
        filled_count = sum(1 for data in extracted_data_list if data[column] is not None)
        accuracy = (filled_count / total_invoices) * 100 if total_invoices > 0 else 0
        print(f"{column.replace('_', ' ').title()}: {filled_count}/{total_invoices}  ({accuracy:.2f}%)")

# Main execution
if __name__ == "__main__":
    extracted_data = process_all_pdfs(pdf_directory)
    save_data_to_csv(extracted_data, output_csv)
    print_accuracy(extracted_data)

print(f"Data extraction complete. Check {output_csv} for results.")


Processing INV-136_Rishabh Ramola.pdf...
Processing INV-100_Agrani Kandele.pdf...
Processing INV-112_Gauri.pdf...
Processing INV-138_Agrani Kandele.pdf...
Processing INV-148_harshit rathore.pdf...
Processing INV-141_Kasturi Kalwar.pdf...
Processing INV-106_Kamakshi Thakkar.pdf...
Processing INV-129_Divya Suhane.pdf...
Processing INV-142_Urmila Jangam.pdf...
Processing INV-147_Divya Suhane.pdf...
Processing INV-128_Atia Latif.pdf...
Processing INV-149_Karishma Bande.pdf...
Processing INV-135_Mohith Saragur.pdf...
Processing INV-133_Sheetal Kapur.pdf...
Processing INV-123_Asit.pdf...
Processing INV-104_Joseph Wincet.pdf...
Processing INV-113_Raghvendra.pdf...
Processing INV-114_Vaibhav Bhagat.pdf...
Processing INV-143_Prashant.pdf...
Processing INV-140_Ankit.pdf...
Processing INV-150_Bhusan Naresh.pdf...
Processing INV-115_Akhil Abhay.pdf...
Processing INV-107_Prashant.pdf...
Processing INV-99_Indraja.pdf...
Processing INV-73_Avik Mallick.pdf...
Processing INV-144_Atia Latif.pdf...
Proce