In [1]:
import spacy
import pandas as pd
from collections import defaultdict

In [2]:
# Load the small English spaCy pipeline once; the functions and the entity-ruler
# cell below all share this module-level `nlp` object.
# Requires the model package to be installed
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample financial categories (expand as needed).
# Maps category name -> list of vendor/keyword strings.
# categorize_with_nlp() lowercases the input and compares each individual token
# against these lists, so entries must be lowercase, single-token strings to match.
FINANCIAL_CATEGORIES = {
    "shopping": ["amazon", "flipkart", "myntra"],
    "food": ["zomato", "swiggy", "dominos", "restaurant"],
    "travel": ["uber", "ola", "irctc"],
    "utilities": ["electricity", "water", "broadband"]
}

In [4]:
def categorize_with_nlp(text):
    """Assign a spending category to a piece of transaction text.

    The text is lowercased and tokenized with the shared spaCy pipeline. The
    first token found in a ``FINANCIAL_CATEGORIES`` keyword list decides the
    category. If no keyword matches, any noun chunk containing the substring
    "payment" or "bill" maps the text to "utilities".

    Args:
        text: Raw transaction / OCR text. Non-string values (for example the
            NaN pandas produces for an empty CSV cell, or None) and blank
            strings are treated as having no content.

    Returns:
        str: A key of ``FINANCIAL_CATEGORIES``, or "uncategorized" when
        nothing matches.
    """
    # Guard against missing values: calling .lower() on NaN/None used to raise
    # AttributeError and abort a whole batch run.
    if not isinstance(text, str) or not text.strip():
        return "uncategorized"

    doc = nlp(text.lower())

    # Flatten keyword -> category once per call. setdefault keeps the first
    # category that lists a keyword, matching dict-order precedence.
    keyword_to_category = {}
    for category, keywords in FINANCIAL_CATEGORIES.items():
        for keyword in keywords:
            keyword_to_category.setdefault(keyword, category)

    # Match against known vendors/keywords, earliest token wins.
    for token in doc:
        category = keyword_to_category.get(token.text)
        if category is not None:
            return category

    # Fallback: use noun chunks for unknown vendors.
    # NOTE(review): this is a substring test, so any text mentioning a
    # "payment" or "bill" is filed under utilities — confirm this is intended.
    for chunk in doc.noun_chunks:
        if "payment" in chunk.text or "bill" in chunk.text:
            return "utilities"

    return "uncategorized"


In [5]:
# Example usage: the lowercased token "amazon" is listed under "shopping",
# so the keyword match returns before the noun-chunk fallback is reached.
text = "Payment to Amazon Pay ₹1,499 for order #12345"
print(categorize_with_nlp(text))  # Prints: shopping

shopping


In [6]:
def extract_financial_entities(text):
    """Extract vendor, amount and date strings from transaction text.

    Runs the shared spaCy pipeline over ``text`` and reads its entities:

    * ``amount`` - text of the last ``MONEY`` entity in the document.
    * ``date``   - text of the last ``DATE`` entity in the document.
    * ``vendor`` - text of the first ``VENDOR`` entity when the pipeline
      produces that label (e.g. via an entity ruler); otherwise the first
      token tagged as a proper noun.

    Args:
        text: Raw transaction / OCR text. Non-string values (for example the
            NaN pandas produces for an empty CSV cell, or None) and blank
            strings yield a result with every field set to None.

    Returns:
        dict: ``{"vendor": ..., "amount": ..., "date": ...}`` where each value
        is a string, or None when nothing was found.
    """
    entities = {
        "vendor": None,
        "amount": None,
        "date": None
    }

    # Guard against missing values: nlp() rejects non-string input, which
    # would otherwise abort a whole batch run on one empty OCR cell.
    if not isinstance(text, str) or not text.strip():
        return entities

    doc = nlp(text)

    # Rule 1: read labelled entities. Amount/date keep the last occurrence
    # (later matches overwrite earlier ones); vendor keeps the first.
    for ent in doc.ents:
        if ent.label_ == "MONEY":
            entities["amount"] = ent.text
        elif ent.label_ == "DATE":
            entities["date"] = ent.text
        elif ent.label_ == "VENDOR" and entities["vendor"] is None:
            entities["vendor"] = ent.text

    # Rule 2: without an explicit VENDOR entity, fall back to the first
    # proper noun. next() stops scanning at the first hit.
    if entities["vendor"] is None:
        entities["vendor"] = next(
            (token.text for token in doc if token.pos_ == "PROPN"), None
        )

    return entities

# Test: the amount is the text of the MONEY entity; in the recorded run that
# span does not include the "₹" symbol.
text = "Invoice from Uber ₹425.50 on 15th May 2024"
print(extract_financial_entities(text))
# Output: {'vendor': 'Uber', 'amount': '425.50', 'date': '15th May 2024'}

{'vendor': 'Uber', 'amount': '425.50', 'date': '15th May 2024'}


In [7]:
def process_ocr_results(df):
    """Run entity extraction and categorization over a frame of OCR results.

    Args:
        df: DataFrame with the columns ``filename``, ``document_type`` and
            ``raw_text``; one row per OCR'd document.

    Returns:
        pandas.DataFrame: One row per input row, carrying the three input
        columns plus the keys returned by ``extract_financial_entities``
        and a ``category`` column from ``categorize_with_nlp``. For an empty
        input the result is an empty frame that still has the standard
        columns.

    Raises:
        KeyError: If a required column is missing from a non-empty ``df``.
    """
    results = []
    for _, row in df.iterrows():
        raw_text = row["raw_text"]

        # Empty OCR cells arrive from read_csv as NaN (a float) and purely
        # numeric cells as numbers; the NLP helpers need a string.
        if isinstance(raw_text, str):
            nlp_text = raw_text
        elif pd.isna(raw_text):
            nlp_text = ""
        else:
            nlp_text = str(raw_text)

        record = {
            "filename": row["filename"],
            "document_type": row["document_type"],
            "raw_text": raw_text,  # keep the original value in the output
        }

        # NLP extraction
        record.update(extract_financial_entities(nlp_text))

        # Categorization
        record["category"] = categorize_with_nlp(nlp_text)

        results.append(record)

    if not results:
        # pd.DataFrame([]) has no columns at all; keep the schema so that
        # downstream column access and the saved CSV header still work.
        return pd.DataFrame(
            columns=[
                "filename", "document_type", "raw_text",
                "vendor", "amount", "date", "category",
            ]
        )

    return pd.DataFrame(results)

In [8]:
from spacy.pipeline import EntityRuler  # NOTE(review): not referenced below; add_pipe() takes the factory name


# Rule-based entities for Indian financial documents.
# Keep the cell re-runnable: add_pipe() raises if a component with the same
# name is already in the pipeline, so drop any previous ruler first.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")
# Inserted before "ner" so these rule-based spans are set before the
# statistical NER component runs.
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Per-token regexes (raw strings, so "\d" is not an invalid escape sequence).
# A token never contains whitespace, so multi-word dates need one regex per token.
_DAY_ORDINAL = r"^\d{1,2}(th|st|nd|rd)?$"
_DAY_OR_MONTH_NUM = r"^\d{1,2}$"
_MONTH_NAME = r"^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*$"
_YEAR = r"^\d{4}$"
_AMOUNT = r"^\d[\d,]*(\.\d+)?$"

patterns = [
    # Indian vendors (case-insensitive, like the Airtel pattern, because OCR
    # output casing is unreliable)
    {"label": "VENDOR", "pattern": [{"LOWER": "zomato"}]},
    {"label": "VENDOR", "pattern": [{"LOWER": "swiggy"}]},
    {"label": "VENDOR", "pattern": [{"LOWER": "irctc"}]},
    {"label": "VENDOR", "pattern": [{"LOWER": "amazon"}, {"LOWER": "pay"}]},
    {"label": "VENDOR", "pattern": [{"LOWER": "flipkart"}]},
    {"label": "VENDOR", "pattern": [{"LOWER": "airtel"}, {"LOWER": "payments"}]},

    # Date formats (Indian conventions: day first).
    # SHAPE maps every digit to "d", so shapes such as "mm" or "yyyy" can never
    # match a numeric date; use digit regexes instead.
    # dd-mm-yyyy / dd/mm/yyyy kept as a single token by the tokenizer:
    {"label": "DATE", "pattern": [{"TEXT": {"REGEX": r"^\d{1,2}[-/]\d{1,2}[-/]\d{4}$"}}]},
    # ...or split into separate tokens around the separators:
    {"label": "DATE", "pattern": [
        {"TEXT": {"REGEX": _DAY_OR_MONTH_NUM}}, {"ORTH": "-"},
        {"TEXT": {"REGEX": _DAY_OR_MONTH_NUM}}, {"ORTH": "-"},
        {"TEXT": {"REGEX": _YEAR}},
    ]},
    {"label": "DATE", "pattern": [
        {"TEXT": {"REGEX": _DAY_OR_MONTH_NUM}}, {"ORTH": "/"},
        {"TEXT": {"REGEX": _DAY_OR_MONTH_NUM}}, {"ORTH": "/"},
        {"TEXT": {"REGEX": _YEAR}},
    ]},
    # "15th May 2024": three tokens, one regex each
    {"label": "DATE", "pattern": [
        {"TEXT": {"REGEX": _DAY_ORDINAL}},
        {"TEXT": {"REGEX": _MONTH_NAME}},
        {"TEXT": {"REGEX": _YEAR}},
    ]},

    # Indian currency. A regex accepts "425.50", "1,499" and "1,499.00"; a
    # single SHAPE string would only accept one exact digit layout.
    {"label": "MONEY", "pattern": [{"ORTH": "₹"}, {"TEXT": {"REGEX": _AMOUNT}}]},
    {"label": "MONEY", "pattern": [{"TEXT": {"REGEX": r"^₹\d[\d,]*(\.\d+)?$"}}]},
    # "Rs." may be one token or "Rs" followed by a separate "."
    {"label": "MONEY", "pattern": [
        {"LOWER": {"IN": ["rs", "rs.", "inr"]}},
        {"ORTH": ".", "OP": "?"},
        {"TEXT": {"REGEX": _AMOUNT}},
    ]},

    # Invoice/receipt IDs. SHAPE truncates runs of the same character class to
    # four, so "XXXXXX" is never produced; match the token text instead.
    {"label": "REF_NO", "pattern": [{"ORTH": "#"}, {"TEXT": {"REGEX": r"^[A-Z0-9]{4,}$"}}]},
    {"label": "REF_NO", "pattern": [{"TEXT": {"REGEX": r"^[A-Z]{2}\d{6}$"}}]},  # E.g., "IN123456"
]

ruler.add_patterns(patterns)



In [9]:
# Load OCR output and run the NLP pipeline over every row.
# process_ocr_results() reads the columns "filename", "document_type" and
# "raw_text" from this frame.
# NOTE(review): the relative path presumably assumes the notebook's own
# directory is the working directory — confirm.
df_ocr = pd.read_csv("../data/processed/ocr_results.csv")
df_processed = process_ocr_results(df_ocr)

In [11]:
# Save NLP-processed data next to the OCR results.
# index=False keeps the DataFrame's row index out of the CSV.
df_processed.to_csv("../data/processed/nlp_processed.csv", index=False)