In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image, ImageEnhance
import io
import re

# Extract the text of an invoice PDF into `text`.
# The embedded text layer is used when the PDF has one; otherwise every page
# is rendered to an image and run through Tesseract OCR.
pdf_path = "Invoices/AlphaImportInvoice.pdf"

# Resolution used when rasterizing pages for OCR.
OCR_DPI = 400

# Punctuation kept by the OCR path in addition to letters and digits.
# Without these, amounts ("$9.99" -> "999"), dates ("10/10/2025" ->
# "10102025") and phone numbers lose their separators and the OCR result
# no longer matches what the embedded-text path would return.
# Quotes, backslashes and spaces are left out on purpose: the config string
# is tokenized by pytesseract before being passed to Tesseract.
OCR_PUNCTUATION = ".,$#/:()-"

# Loop-invariant Tesseract options: default engine, "single uniform block of
# text" page segmentation, and a restricted character set.
OCR_CONFIG = (
    "--oem 3 --psm 6 "
    "-c tessedit_char_whitelist="
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    + OCR_PUNCTUATION
)

# Anything outside the whitelist (other than whitespace) is dropped from the
# OCR result as a second line of defense.
OCR_DISALLOWED = re.compile(r"[^A-Za-z0-9\s" + re.escape(OCR_PUNCTUATION) + "]")

# The context manager closes the document even if extraction or OCR raises.
with fitz.open(pdf_path) as pdf_document:
    text = "".join(page.get_text("text") for page in pdf_document)

    if text.strip():
        print("Embedded text found, skipping OCR.")
    else:
        print("No embedded text found, performing OCR...")
        ocr_pages = []
        for page in pdf_document:
            pix = page.get_pixmap(dpi=OCR_DPI, alpha=False)

            # Grayscale + contrast/sharpness boost before handing to Tesseract.
            img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("L")
            img = ImageEnhance.Contrast(img).enhance(2.0)
            img = ImageEnhance.Sharpness(img).enhance(2.0)

            page_text = pytesseract.image_to_string(img, config=OCR_CONFIG)
            ocr_pages.append(OCR_DISALLOWED.sub("", page_text))

        # One trailing newline per page, as a page separator.
        text = "".join(page_text + "\n" for page_text in ocr_pages)

print(text)


Embedded text found, skipping OCR.
11 First St.
Charleston, SC 29405
DATE
10/10/2025
Phone: 843-987-9876
INVOICE #
25-14234
Website: alphaimports.com
CUSTOMER ID
2323
DUE DATE
11/9/2025
BILL TO
John Smith
RetailCo
123 Main St
Charlotte, NC 28205
(704) 123-1234
QUANTITY
ITEM PRICE
AMOUNT
Guinness Draught Stout – 4 Pack (14.9 fl oz Cans)
55
$9.99
$549.45
Stella Artois Belgian Lager – 6 Pack (11.2 fl oz Bottles)
45
$10.99
$494.55
White Claw Hard Seltzer Variety Pack – 12 Pack
40
$18.49
$739.60
Jack Daniel’s Tennessee Whiskey – 750 ml Bottle
45
$28.99
$1,304.55
Tito’s Handmade Vodka – 1 Liter Bottle
45
$24.99
$1,124.55
Josh Cellars Cabernet Sauvignon – 750 ml Bottle
30
$15.99
$479.70
Marlboro Red Cigarettes – Carton (10 Packs)
30
$89.99
$2,699.70
Camel Blue Cigarettes – Carton (10 Packs)
23
$87.99
$2,023.77
Newport Menthol Cigarettes – Carton (10 Packs)
20
$94.49
$1,889.80
American Spirit Original – Carton (10 Packs)
19
$95.99
1,823.81
          
Copenhagen Long Cut – 5 Cans
60
$38.49
2,30

In [39]:
import csv
import json

# Convert a CSV file into a flat JSON object.
# For every data row, the second column (index 1) becomes the key and the
# third column (index 2) becomes the value; both are stripped of surrounding
# whitespace and kept as strings. If a key appears more than once, the last
# row wins.
csv_file = "tax_rate_by_category.csv"
json_file = "output.json"

data = {}

# newline='' lets the csv module handle line endings itself (required for
# quoted fields containing newlines). utf-8-sig reads plain UTF-8 and also
# swallows a leading byte-order mark, so the input encoding no longer depends
# on the machine's locale and matches the UTF-8 output below.
with open(csv_file, mode='r', newline='', encoding='utf-8-sig') as f:
    reader = csv.reader(f)

    # Skip header if present — remove this line if your CSV has no header
    next(reader, None)

    for row in reader:
        # Rows without a third column cannot supply a value; skip them.
        if len(row) < 3:
            continue

        key = row[1].strip()
        if not key:
            # Ignore rows whose key column is blank.
            continue

        data[key] = row[2].strip()

# Write to JSON (non-ASCII characters are written as-is, not \u-escaped)
with open(json_file, mode='w', encoding='utf-8') as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print(f"✅ Successfully converted '{csv_file}' → '{json_file}'")


✅ Successfully converted 'tax_rate_by_category.csv' → 'output.json'
