In [17]:
import os
import cv2
import pytesseract
import numpy as np
from pytesseract import Output
from PIL import Image
import pandas as pd

In [18]:
def load_image(image_path):
    """Load an image from disk as a BGR array, trying OpenCV then PIL.

    OpenCV is tried first. If ``cv2.imread`` returns ``None``, the file is
    opened with PIL, normalised to RGB and converted to OpenCV's BGR channel
    order.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The decoded image as a NumPy array in BGR channel order.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If both OpenCV and PIL fail to load the file.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"File not found: {image_path}")

    # Try OpenCV first
    img = cv2.imread(image_path)

    # If OpenCV fails, try PIL as fallback
    if img is None:
        try:
            # The context manager releases the file handle PIL keeps open.
            with Image.open(image_path) as pil_img:
                # Force a 3-channel RGB layout before the colour swap:
                # grayscale, palette and RGBA images would otherwise be
                # rejected by COLOR_RGB2BGR, which expects 3 channels.
                rgb_pixels = np.array(pil_img.convert("RGB"))
            img = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError(f"Both OpenCV and PIL failed to load {image_path}: {str(e)}") from e

    return img

In [19]:
def extract_text_from_image(image_path):
    """Run Tesseract OCR on a single image file.

    The image is loaded with ``load_image``, converted to grayscale and
    binarised with an Otsu threshold before being handed to
    ``pytesseract.image_to_data``.

    Args:
        image_path: Path of the image to read.

    Returns:
        A ``(text, data)`` tuple. ``data`` is the dict returned by
        ``pytesseract.image_to_data`` and ``text`` is its non-blank ``"text"``
        entries joined with single spaces. If any exception is raised, the
        error is printed and ``(None, None)`` is returned instead.
    """
    try:
        bgr_image = load_image(image_path)

        # Preprocessing: grayscale, then binary threshold with the Otsu flag.
        grayscale = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
        threshold_result = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        binarised = threshold_result[1]

        # Tesseract options: engine mode 3, page segmentation mode 6, and a
        # whitelist limited to digits, ASCII letters and a few currency and
        # punctuation symbols.
        custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ₹$,.-'
        data = pytesseract.image_to_data(binarised, config=custom_config, output_type=Output.DICT)

        # Keep only entries that still contain something after stripping whitespace.
        non_blank_words = []
        for token in data["text"]:
            if token.strip():
                non_blank_words.append(token)

        return " ".join(non_blank_words), data
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None, None

In [20]:
def process_folder(folder_path):
    """OCR every ``.png``/``.jpg``/``.jpeg`` file directly inside a folder.

    Files are visited in sorted filename order so the progress log and the
    row order of the result are reproducible (``os.listdir`` returns entries
    in arbitrary order).

    Args:
        folder_path: Directory containing the images. Its final path
            component is used as the ``document_type`` label; a trailing
            path separator is tolerated.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully processed image
        and the columns ``document_type``, ``filename``, ``raw_text`` and
        ``words``. Images for which no text was extracted are reported as
        failed and omitted.
    """
    # normpath strips a trailing separator; without it basename("dir/") is ""
    # and every row would get a blank document_type.
    document_type = os.path.basename(os.path.normpath(folder_path))

    records = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
            continue

        image_path = os.path.join(folder_path, filename)
        print(f"Processing {filename}...", end=" ")
        text, data = extract_text_from_image(image_path)

        # A None or empty text both count as failure.
        if text:
            records.append({
                "document_type": document_type,
                "filename": filename,
                "raw_text": text,
                "words": data["text"]
            })
            print("✅ Success")
        else:
            print("❌ Failed")

    return pd.DataFrame(records)

In [22]:
# 1. Verify paths
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is run from MoneyMatic/notebook/).
base_dir = os.path.split(os.getcwd())[0]
data_dir = os.path.join(*(base_dir, "data", "raw"))

# Report both locations so a wrong working directory is obvious up front.
print(f"Base directory: {base_dir}")
print(f"Data directory exists: {os.path.exists(data_dir)}")


Base directory: /home/mush/git/github/MoneyMatic
Data directory exists: True


In [23]:
# 2. Process all folders
# Each entry names a sub-directory of data_dir holding one document type.
raw_folders = ["Bank Statement", "Check", "ITR_Form 16", "Salary Slip", "Utility"]
df_list = []  # one DataFrame per folder that was found and processed

for folder in raw_folders:
    folder_path = os.path.join(data_dir, folder)
    # isdir rather than exists: a plain file with this name cannot be listed.
    if os.path.isdir(folder_path):
        print(f"\nProcessing {folder}...")
        df = process_folder(folder_path)
        df_list.append(df)
    else:
        # Report the skip so a missing document type does not go unnoticed.
        print(f"\nSkipping {folder}: {folder_path} is not a directory")


Processing Bank Statement...
Processing 1.jpg... ✅ Success
Processing 10.jpg... ✅ Success
Processing 11.jpg... ✅ Success
Processing 12.jpg... ❌ Failed
Processing 13.jpg... ✅ Success
Processing 14.jpg... ✅ Success
Processing 15.jpg... ✅ Success
Processing 16.jpg... ✅ Success
Processing 17.jpg... ✅ Success
Processing 18.jpg... ✅ Success
Processing 19.jpg... ✅ Success
Processing 2.jpg... ✅ Success
Processing 20.jpg... ✅ Success
Processing 21.jpg... ✅ Success
Processing 22.jpg... ✅ Success
Processing 23.jpg... ✅ Success
Processing 24.jpg... ✅ Success
Processing 25.jpg... ✅ Success
Processing 26.jpg... ✅ Success
Processing 27.jpg... ✅ Success
Processing 28.jpg... ✅ Success
Processing 29.jpg... ✅ Success
Processing 3.jpg... ✅ Success
Processing 30.jpg... ✅ Success
Processing 31.jpg... ✅ Success
Processing 32.jpg... ✅ Success
Processing 33.jpg... ✅ Success
Processing 34.jpg... ✅ Success
Processing 35.jpg... ✅ Success
Processing 37.jpg... ✅ Success
Processing 39.jpg... ✅ Success
Processing 41



✅ Success
Processing 31.jpg... ✅ Success
Processing 32.jpg... ✅ Success
Processing 33.jpg... ✅ Success
Processing 34.jpg... ✅ Success
Processing 35.jpg... ✅ Success
Processing 36.jpg... ✅ Success
Processing 37.jpg... ✅ Success
Processing 38.jpg... ✅ Success
Processing 39.jpg... ✅ Success
Processing 4.jpg... ✅ Success
Processing 40.jpg... ✅ Success
Processing 41.jpg... ✅ Success
Processing 42.jpg... ✅ Success
Processing 43.jpg... ✅ Success

Processing Salary Slip...
Processing 1.jpg... ✅ Success
Processing 10.jpg... ✅ Success
Processing 100.jpg... ✅ Success
Processing 101.jpg... ✅ Success
Processing 102.jpg... ✅ Success
Processing 11.jpg... ✅ Success
Processing 12.jpg... ✅ Success
Processing 13.jpg... ✅ Success
Processing 14.jpg... ✅ Success
Processing 15.jpg... ✅ Success
Processing 16.jpg... ✅ Success
Processing 18.jpg... ✅ Success
Processing 19.jpg... ✅ Success
Processing 2.jpg... ✅ Success
Processing 20.jpg... ✅ Success
Processing 21.jpg... ✅ Success
Processing 22.jpg... ✅ Success
Pr



✅ Success
Processing 52.jpg... ✅ Success
Processing 53.jpg... ✅ Success
Processing 54.jpg... ✅ Success
Processing 55.jpg... ✅ Success
Processing 57.jpg... ✅ Success
Processing 58.jpg... ✅ Success
Processing 59.jpg... ✅ Success
Processing 60.jpg... ✅ Success
Processing 61.jpg... ✅ Success
Processing 62.jpg... ✅ Success
Processing 63.jpg... ✅ Success
Processing 64.jpg... ✅ Success
Processing 65.jpg... ✅ Success
Processing 66.jpg... ✅ Success
Processing 67.jpg... ✅ Success
Processing 68.jpg... ✅ Success
Processing 69.jpg... ✅ Success
Processing 7.jpg... ✅ Success
Processing 70.jpg... ✅ Success
Processing 71.jpg... ✅ Success
Processing 72.jpg... ✅ Success
Processing 73.jpg... ✅ Success
Processing 74.jpg... ✅ Success
Processing 75.jpg... ✅ Success
Processing 76.jpg... ✅ Success
Processing 77.jpg... ✅ Success
Processing 78.jpg... ✅ Success
Processing 79.jpg... ✅ Success
Processing 8.jpg... ✅ Success
Processing 80.jpg... ✅ Success
Processing 81.jpg... ✅ Success
Processing 82.jpg... 



✅ Success
Processing 83.jpg... ❌ Failed
Processing 84.jpg... ✅ Success
Processing 85.jpg... ✅ Success
Processing 86.jpg... ✅ Success
Processing 87.jpg... ✅ Success
Processing 88.jpg... ✅ Success
Processing 89.jpg... ✅ Success
Processing 9.jpg... ✅ Success
Processing 90.jpg... ✅ Success
Processing 91.jpg... ✅ Success
Processing 92.jpg... ✅ Success
Processing 93.jpg... ✅ Success
Processing 95.jpg... ✅ Success
Processing 96.jpg... ✅ Success
Processing 97.jpg... ✅ Success
Processing 98.jpg... ✅ Success
Processing 99.jpg... ✅ Success

Processing Utility...
Processing 1.jpg... ✅ Success
Processing 10.jpg... ✅ Success
Processing 100.jpg... ✅ Success
Processing 11.jpg... ✅ Success
Processing 12.jpg... ✅ Success
Processing 13.jpg... ✅ Success
Processing 14.jpg... ✅ Success
Processing 15.jpg... ✅ Success
Processing 16.jpg... ✅ Success
Processing 17.jpg... ✅ Success
Processing 18.jpg... ✅ Success
Processing 19.jpg... ✅ Success
Processing 2.jpg... ✅ Success
Processing 20.jpg... ✅ Success
Processin



✅ Success
Processing 54.jpg... ✅ Success
Processing 55.jpg... ✅ Success
Processing 56.jpg... ✅ Success
Processing 57.jpg... ✅ Success
Processing 58.jpg... ✅ Success
Processing 59.jpg... ✅ Success
Processing 6.jpg... ✅ Success
Processing 60.jpg... ✅ Success
Processing 61.jpg... ✅ Success
Processing 62.jpg... ✅ Success
Processing 63.jpg... ✅ Success
Processing 64.jpg... ✅ Success
Processing 65.jpg... ✅ Success
Processing 66.jpg... ✅ Success
Processing 67.jpg... ✅ Success
Processing 68.jpg... ✅ Success
Processing 69.jpg... ✅ Success
Processing 7.jpg... ✅ Success
Processing 70.jpg... ✅ Success
Processing 71.jpg... ✅ Success
Processing 72.jpg... ✅ Success
Processing 73.jpg... ✅ Success
Processing 74.jpg... ✅ Success
Processing 75.jpg... ✅ Success
Processing 76.jpg... ✅ Success
Processing 77.jpg... ✅ Success
Processing 78.jpg... ✅ Success
Processing 79.jpg... ✅ Success
Processing 8.jpg... ✅ Success
Processing 80.jpg... ✅ Success
Processing 81.jpg... ✅ Success
Processing 83.jpg... ✅ Success
P



✅ Success
Processing 87.jpg... ✅ Success
Processing 88.jpg... ✅ Success
Processing 89.jpg... ✅ Success
Processing 9.jpg... ✅ Success
Processing 90.jpg... ✅ Success
Processing 91.jpg... ✅ Success
Processing 92.jpg... ✅ Success
Processing 93.jpg... ✅ Success
Processing 94.jpg... ✅ Success
Processing 95.jpg... ✅ Success
Processing 96.jpg... ✅ Success
Processing 97.jpg... ✅ Success
Processing 98.jpg... ✅ Success
Processing 99.jpg... ✅ Success


In [24]:
# 3. Combine and save results
# Drop empty per-folder frames first: a list of only-empty frames is still
# truthy, which would write an empty CSV and report success.
non_empty_frames = [frame for frame in df_list if not frame.empty]

if non_empty_frames:
    final_df = pd.concat(non_empty_frames, ignore_index=True)

    # Build the output location once and reuse it.
    processed_dir = os.path.join(base_dir, "data", "processed")
    os.makedirs(processed_dir, exist_ok=True)

    # NOTE(review): the "words" column holds lists, which to_csv writes as
    # their string representation — confirm downstream readers expect that.
    final_df.to_csv(os.path.join(processed_dir, "ocr_results.csv"), index=False)
    print("\nOCR completed. Results saved to data/processed/ocr_results.csv")
else:
    print("\nNo valid documents processed. Check your data/raw/ folder.")


OCR completed. Results saved to data/processed/ocr_results.csv
