In [6]:
import fitz  # PyMuPDF
import pandas as pd
import re

# Function to capitalize the first letter of each word
def title_case(text):
    """Return ``text`` with every whitespace-separated word capitalized.

    The input is split on runs of whitespace and re-joined with single
    spaces, so leading/trailing/repeated whitespace is collapsed.
    ``str.capitalize`` upper-cases the first character of each word and
    lower-cases the rest.
    """
    words = text.split()
    return ' '.join(map(str.capitalize, words))

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text("text")`` output.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string with += for every page.
        return "".join(page.get_text("text") for page in doc)
    finally:
        # Release the document even if a page fails to extract.
        doc.close()

# Extract and format fields from text
def extract_fields(text):
    """Pull the labelled bill fields out of the text extracted from the PDF.

    Each field is located with a label-anchored regular expression; only the
    first occurrence of a label is used. A field whose pattern does not match
    is left out of the result.

    Args:
        text: Full text of the bill.

    Returns:
        dict mapping field name to its extracted string. Date fields are
        returned as written in the bill; every other field is lower-cased and
        then passed through ``title_case``.
    """
    patterns = {
        "customer": r"CUSTOMER:?\s+([A-Z\s]+?)(?:\sACCOUNT|\n)",
        "account_number": r"ACCOUNT NUMBER:?\s+([0-9\-]+)",
        "service_address": r"SERVICE ADDRESS:?\s+([A-Z0-9\s,.-]+)\n",
        "billing_date": r"BILLING DATE:?\s+([A-Za-z0-9,\s]+)\n",
        "autopay_date": r"AUTOPAY DATE:?\s+([A-Za-z0-9,\s]+)\n",
        "amount_due": r"AMOUNT DUE:?\s+\$?([0-9,.\s]+)"
    }
    date_fields = ("billing_date", "autopay_date")
    # The character classes for these fields contain \s, which also matches
    # newlines, so a greedy capture can run on into the lines that follow the
    # value. Only the first captured line is kept for them.
    single_line_fields = date_fields + ("amount_due",)

    fields = {}
    for field, pattern in patterns.items():
        # No pattern uses ^ or $, so no re.MULTILINE flag is needed.
        match = re.search(pattern, text)
        if not match:
            continue
        value = match.group(1).strip()
        if field in single_line_fields:
            value = value.split("\n")[0].strip()
        if field in date_fields:
            fields[field] = value  # Keep the original date format
        else:
            fields[field] = title_case(value.lower())

    return fields

# Write extracted fields to CSV
def write_to_csv(fields, csv_path):
    """Save the extracted fields as a single-row CSV file.

    Column headers are derived from the field names by replacing underscores
    with spaces and title-casing the result (``account_number`` becomes
    ``Account Number``). The index column is not written.

    Args:
        fields: dict of field name -> value; becomes the one data row.
        csv_path: Destination path handed to ``DataFrame.to_csv``.
    """
    def _as_header(key):
        return key.replace("_", " ").title()

    frame = pd.DataFrame([fields]).rename(columns=_as_header)
    frame.to_csv(csv_path, index=False)

# Main process: read the bill PDF, extract the labelled fields, and save them
# as a one-row CSV.
# NOTE: these are absolute Windows paths; adjust them before running on
# another machine.
pdf_path = 'D:\\Python Practice files\\bill.pdf'
csv_path = 'D:\\Python Practice files\\bill.csv'

text = extract_text_from_pdf(pdf_path)
fields = extract_fields(text)
write_to_csv(fields, csv_path)

# Only fields whose pattern matched are present in `fields`, so this list
# shows which fields ended up in the CSV.
print("Extraction and saving to CSV complete. Fields extracted:", list(fields.keys()))


Extraction and saving to CSV complete. Fields extracted: ['customer', 'account_number', 'billing_date', 'autopay_date', 'amount_due']


In [6]:
# Date fields are reformatted as MM/DD/YYYY

import fitz  # PyMuPDF
import pandas as pd
import re
from datetime import datetime

# Function to capitalize the first letter of each word
def title_case(text):
    """Capitalize each whitespace-separated word of ``text``.

    Splitting on whitespace and re-joining with a single space collapses any
    repeated, leading or trailing whitespace. ``str.capitalize`` upper-cases
    the first character of a word and lower-cases the remainder.
    """
    capitalized_words = (word.capitalize() for word in text.split())
    return ' '.join(capitalized_words)

# Convert date from "Jan 12, 2024" to "MM/DD/YYYY"
def convert_date_format(date_str):
    """Convert a date such as "Jan 12, 2024" to "MM/DD/YYYY".

    Only the first line of ``date_str`` is parsed. Both abbreviated
    ("Jan 12, 2024") and full ("January 12, 2024") month names are accepted.

    Args:
        date_str: Text whose first line is the date.

    Returns:
        The date formatted with ``%m/%d/%Y``, e.g. "01/12/2024".

    Raises:
        ValueError: If the first line matches neither accepted format.
    """
    # Strip again after taking the first line: whitespace left at the end of
    # that line (e.g. "Jan 12, 2024 \n...") would otherwise make strptime fail
    # with "unconverted data remains".
    first_line = date_str.strip().split('\n')[0].strip()

    for date_format in ('%b %d, %Y', '%B %d, %Y'):
        try:
            date_obj = datetime.strptime(first_line, date_format)
        except ValueError:
            continue
        return date_obj.strftime('%m/%d/%Y')

    raise ValueError(
        "unrecognised date {!r}; expected a form like 'Jan 12, 2024'".format(first_line)
    )


# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text("text")`` output.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = [page.get_text("text") for page in doc]
    finally:
        # Close the document whether or not extraction succeeded.
        doc.close()
    # One join instead of repeated string concatenation.
    return "".join(page_texts)

# Extract and format fields from text
def extract_fields(text):
    """Pull the labelled bill fields out of the text extracted from the PDF.

    Each field is located with a label-anchored regular expression; only the
    first occurrence of a label is used. A field whose pattern does not match
    is left out of the result.

    Args:
        text: Full text of the bill.

    Returns:
        dict mapping field name to its extracted string. Date fields are
        passed through ``convert_date_format``; every other field is
        lower-cased and then passed through ``title_case``.

    Raises:
        Whatever ``convert_date_format`` raises for a date it cannot parse.
    """
    patterns = {
        "customer": r"CUSTOMER:?\s+([A-Z\s]+?)(?:\sACCOUNT|\n)",
        "account_number": r"ACCOUNT NUMBER:?\s+([0-9\-]+)",
        "service_address": r"SERVICE ADDRESS:?\s+([A-Z0-9\s,.-]+)\n",
        "billing_date": r"BILLING DATE:?\s+([A-Za-z0-9,\s]+)\n",
        "autopay_date": r"AUTOPAY DATE:?\s+([A-Za-z0-9,\s]+)\n",
        "amount_due": r"AMOUNT DUE:?\s+\$?([0-9,.\s]+)"
    }
    date_fields = ("billing_date", "autopay_date")
    # The character classes for these fields contain \s, which also matches
    # newlines, so a greedy capture can run on into the lines that follow the
    # value. Only the first captured line is kept for them.
    single_line_fields = date_fields + ("amount_due",)

    fields = {}
    for field, pattern in patterns.items():
        # No pattern uses ^ or $, so no re.MULTILINE flag is needed.
        match = re.search(pattern, text)
        if not match:
            continue
        value = match.group(1).strip()
        if field in single_line_fields:
            value = value.split("\n")[0].strip()
        if field in date_fields:
            fields[field] = convert_date_format(value)  # Convert date format
        else:
            fields[field] = title_case(value.lower())

    return fields

# Write extracted fields to CSV
def write_to_csv(fields, csv_path):
    """Write the extracted fields to ``csv_path`` as a one-row CSV.

    Headers are the field names with underscores turned into spaces and then
    title-cased (``billing_date`` becomes ``Billing Date``). No index column
    is written.

    Args:
        fields: dict of field name -> value; becomes the single data row.
        csv_path: Destination path handed to ``DataFrame.to_csv``.
    """
    row_frame = pd.DataFrame([fields])
    pretty_headers = {name: name.replace("_", " ").title() for name in row_frame.columns}
    row_frame.rename(columns=pretty_headers).to_csv(csv_path, index=False)

# Main process: read the bill PDF, extract the labelled fields (dates are
# converted by extract_fields), and save them as a one-row CSV.
# NOTE: these are absolute Windows paths; adjust them before running on
# another machine.
pdf_path = 'D:\\Python Practice files\\bill.pdf'
csv_path = 'D:\\Python Practice files\\bill.csv'


text = extract_text_from_pdf(pdf_path)
fields = extract_fields(text)
write_to_csv(fields, csv_path)

# Only fields whose pattern matched are present in `fields`, so this list
# shows which fields ended up in the CSV.
print("Extraction and saving to CSV complete. Fields extracted:", list(fields.keys()))


Extraction and saving to CSV complete. Fields extracted: ['customer', 'account_number', 'billing_date', 'autopay_date', 'amount_due']
