In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from document_processor import DocumentProcessor
from langchain_community.document_loaders import PyPDFLoader
import logging
from tqdm import tqdm

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize processor
processor = DocumentProcessor()

# Project paths
PROJECT_ROOT = Path().absolute()
DATA_DIR = PROJECT_ROOT / 'data' / 'raw'

def inspect_document_store():
    """Inspect current state of processed documents"""
    print("\nCurrently processed files:")
    for file_path, file_hash in processor.processed_files.items():
        print(f"- {file_path}")
    
    print("\nChromaDB collection info:")
    total_docs = processor.collection.count()
    print(f"Total documents: {total_docs}")
    
    # Get collection stats
    results = processor.collection.get()
    if results['metadatas']:
        doc_types = {}
        sources = {}
        for metadata in results['metadatas']:
            doc_type = metadata.get('type', 'unknown')
            source = metadata.get('source', 'unknown')
            doc_types[doc_type] = doc_types.get(doc_type, 0) + 1
            sources[Path(source).name] = sources.get(Path(source).name, 0) + 1
        
        print("\nDocument types distribution:")
        for doc_type, count in doc_types.items():
            print(f"- {doc_type}: {count} chunks")
            
        print("\nSource files distribution:")
        for source, count in sources.items():
            print(f"- {source}: {count} chunks")

# Inspect current state
inspect_document_store()

INFO:config:✓ Environment variables validated
INFO:config:✓ ChromaDB client initialized at chroma_db
INFO:config:✓ Collection 'loan_documents' ready
INFO:document_processor:✓ Document processor initialized



Currently processed files:
- data\raw\loan_applications\Personal-Loan-Application-with-Disclosures-Fill-Form.pdf
- data\raw\loan_applications\URLA_2019_Borrower_v28.pdf
- data\raw\policy_documents\mill-citiescommunity-investments-loan-policies.pdf
- data\raw\policy_documents\section3-2.pdf
- data\raw\training_data\accepted_2007_to_2018Q4.csv
- data\raw\training_data\rejected_2007_to_2018Q4.csv

ChromaDB collection info:
Total documents: 8101

Document types distribution:
- pdf: 1939 chunks
- csv: 6162 chunks

Source files distribution:
- Personal-Loan-Application-with-Disclosures-Fill-Form.pdf: 26 chunks
- URLA_2019_Borrower_v28.pdf: 64 chunks
- mill-citiescommunity-investments-loan-policies.pdf: 163 chunks
- section3-2.pdf: 1686 chunks
- accepted_2007_to_2018Q4.csv: 5152 chunks
- rejected_2007_to_2018Q4.csv: 1010 chunks


In [2]:
from pytesseract import image_to_string
from pdf2image import convert_from_path

def ocr_pdf(file_path: Path):
    """Perform OCR on PDF pages"""
    try:
        print("\nAttempting OCR...")
        images = convert_from_path(file_path)
        ocr_text = "\n".join(image_to_string(img) for img in images)
        return ocr_text
    except Exception as e:
        logger.error(f"OCR failed: {str(e)}")
        return None

In [3]:
def analyze_pdf_quality(file_path: Path):
    """Analyze PDF document quality and content structure with OCR fallback"""
    try:
        # Try standard text extraction first
        from pdfminer.high_level import extract_text
        from pdfplumber import open as pdfplumber_open
        from PyPDF2 import PdfReader
        
        print(f"\nAnalyzing {file_path.name}:")
        
        # Method 1: PyPDFLoader
        loader = PyPDFLoader(str(file_path))
        pages = loader.load()
        print(f"PyPDFLoader pages: {len(pages)}")
        
        # Method 2: pdfminer
        pdfminer_text = extract_text(file_path)
        print(f"pdfminer text length: {len(pdfminer_text)}")
        
        # Method 3: pdfplumber
        with pdfplumber_open(file_path) as pdf:
            pdfplumber_text = "\n".join(page.extract_text() for page in pdf.pages)
            print(f"pdfplumber text length: {len(pdfplumber_text)}")
        
        # Method 4: PyPDF2
        reader = PdfReader(file_path)
        pypdf2_text = "\n".join(page.extract_text() for page in reader.pages)
        print(f"PyPDF2 text length: {len(pypdf2_text)}")
        
        # Choose the method with most content
        best_text = max([pdfminer_text, pdfplumber_text, pypdf2_text], key=len)
        
        if not best_text.strip():
            print("\nWARNING: No text could be extracted from this PDF")
            print("Attempting OCR...")
            
            # OCR fallback
            ocr_text = ocr_pdf(file_path)
            if ocr_text:
                print(f"OCR extracted {len(ocr_text)} characters")
                best_text = ocr_text
            else:
                print("OCR failed to extract text")
                return None
            
        # Content analysis
        content_stats = {
            'total_chars': len(best_text),
            'form_fields': 0,
            'pages': []
        }
        
        # Split text by pages (approximate)
        page_texts = best_text.split('\f') if '\f' in best_text else [best_text]
        
        for i, page_text in enumerate(page_texts):
            # Check for potential form fields
            form_fields = [line.strip() for line in page_text.split('\n') 
                         if ':' in line and len(line.split(':')[0]) < 50]
            
            # Page statistics
            page_stats = {
                'page_num': i + 1,
                'char_count': len(page_text),
                'form_fields': form_fields,
                'text_preview': page_text[:200]
            }
            content_stats['pages'].append(page_stats)
            content_stats['form_fields'] += len(form_fields)
        
        print("\nContent Statistics:")
        print(f"Total characters: {content_stats['total_chars']}")
        print(f"Form fields detected: {content_stats['form_fields']}")
        
        print("\nPage Analysis:")
        for page in content_stats['pages']:
            print(f"\nPage {page['page_num']}:")
            print(f"Characters: {page['char_count']}")
            if page['form_fields']:
                print("Form fields found:")
                for field in page['form_fields'][:5]:
                    print(f"  - {field}")
                if len(page['form_fields']) > 5:
                    print(f"  ... and {len(page['form_fields'])-5} more fields")
            
        return content_stats
        
    except Exception as e:
        logger.error(f"Error analyzing PDF {file_path}: {str(e)}")
        return None

In [4]:
def analyze_csv_quality(file_path: Path):
    """Analyze CSV data quality and structure"""
    try:
        # Read CSV with better error handling
        df = pd.read_csv(file_path, low_memory=False)
        
        print(f"\nAnalyzing {file_path.name}:")
        print(f"Shape: {df.shape}")
        
        # Data quality analysis
        quality_stats = {
            'missing_values': {},
            'unique_values': {},
            'numeric_stats': {},
            'categorical_stats': {},
            'potential_issues': []
        }
        
        # Analyze each column
        for column in df.columns:
            # Missing values
            missing = df[column].isnull().sum()
            missing_pct = (missing / len(df)) * 100
            quality_stats['missing_values'][column] = missing_pct
            
            if missing_pct > 20:
                quality_stats['potential_issues'].append(
                    f"High missing values in {column}: {missing_pct:.1f}%"
                )
            
            # Unique values analysis
            unique_count = df[column].nunique()
            unique_pct = (unique_count / len(df)) * 100
            quality_stats['unique_values'][column] = unique_count
            
            # Identify potential ID columns
            if unique_pct > 90:
                quality_stats['potential_issues'].append(
                    f"{column} might be an ID column ({unique_pct:.1f}% unique values)"
                )
            
            # Analyze by data type
            if pd.api.types.is_numeric_dtype(df[column]):
                try:
                    stats = {
                        'mean': df[column].mean(),
                        'std': df[column].std(),
                        'min': df[column].min(),
                        'max': df[column].max(),
                        'zeros_pct': (df[column] == 0).mean() * 100
                    }
                    quality_stats['numeric_stats'][column] = stats
                    
                    # Check for outliers using IQR
                    Q1 = df[column].quantile(0.25)
                    Q3 = df[column].quantile(0.75)
                    IQR = Q3 - Q1
                    outliers = df[column][(df[column] < (Q1 - 1.5 * IQR)) | 
                                        (df[column] > (Q3 + 1.5 * IQR))]
                    if len(outliers) > 0:
                        pct_outliers = (len(outliers) / len(df)) * 100
                        if pct_outliers > 5:
                            quality_stats['potential_issues'].append(
                                f"High outliers in {column}: {pct_outliers:.1f}% of values"
                            )
                except Exception as e:
                    logger.warning(f"Error analyzing numeric column {column}: {str(e)}")
            else:
                # Categorical analysis
                value_counts = df[column].value_counts()
                quality_stats['categorical_stats'][column] = {
                    'top_values': value_counts.head().to_dict(),
                    'unique_count': unique_count
                }
        
        # Print analysis results
        print("\nData Quality Summary:")
        
        print("\nMissing Values (%):")
        for col, pct in quality_stats['missing_values'].items():
            if pct > 0:
                print(f"- {col}: {pct:.1f}%")
        
        print("\nNumeric Columns Statistics:")
        for col, stats in quality_stats['numeric_stats'].items():
            print(f"\n{col}:")
            for stat_name, value in stats.items():
                print(f"  {stat_name}: {value:.2f}")
        
        print("\nCategorical Columns:")
        for col, stats in quality_stats['categorical_stats'].items():
            print(f"\n{col}:")
            print(f"  Unique values: {stats['unique_count']}")
            if len(stats['top_values']) < 10:
                print("  Value distribution:")
                for val, count in stats['top_values'].items():
                    print(f"    - {val}: {count}")
        
        if quality_stats['potential_issues']:
            print("\nPotential Data Quality Issues:")
            for issue in quality_stats['potential_issues']:
                print(f"- {issue}")
                
        return quality_stats
        
    except Exception as e:
        logger.error(f"Error analyzing CSV {file_path}: {str(e)}")
        return None

In [5]:
# Analyze documents
print("Analyzing Documents...")

# Process PDFs
print("\nAnalyzing PDF Documents:")
for pdf_file in Path(DATA_DIR).rglob('*.pdf'):
    pdf_stats = analyze_pdf_quality(pdf_file)

# Process CSVs
print("\nAnalyzing CSV Documents:")
for csv_file in Path(DATA_DIR).rglob('*.csv'):
    csv_stats = analyze_csv_quality(csv_file)

# Verify processed data
print("\nVerifying processed data...")
inspect_document_store()

# Save analysis results
print("\nProcessing complete. Check the analysis output above for data quality issues.")

Analyzing Documents...

Analyzing PDF Documents:

Analyzing Personal-Loan-Application-with-Disclosures-Fill-Form.pdf:
PyPDFLoader pages: 4
pdfminer text length: 9932
pdfplumber text length: 9191
PyPDF2 text length: 9396

Content Statistics:
Total characters: 9932
Form fields detected: 20

Page Analysis:

Page 1:
Characters: 6480
Form fields found:
  - IMPORTANT: Read these Directions before completing this Application. Check the Appropriate Box.
  - ADDRESS:
  - ADDRESS:
  - ADDRESS:
  - IS:
  ... and 9 more fields

Page 2:
Characters: 630
Form fields found:
  - What this means for you:

Page 3:
Characters: 1789
Form fields found:
  - type:
  - Dated:
  - Dated:
  - By:

Page 4:
Characters: 1029
Form fields found:
  - with the above Financial Institution:

Page 5:
Characters: 0

Analyzing URLA_2019_Borrower_v28.pdf:
PyPDFLoader pages: 9
pdfminer text length: 24397
pdfplumber text length: 23038
PyPDF2 text length: 23452

Content Statistics:
Total characters: 24397
Form fields detected: 

In [6]:
def view_pdf_data(file_path: Path):
    """View extracted data from a PDF document"""
    try:
        # Extract text using the most reliable method
        from pdfminer.high_level import extract_text
        from pdfplumber import open as pdfplumber_open
        from PyPDF2 import PdfReader
        
        print(f"\nAnalyzing {file_path.name}:")
        
        # Try multiple extraction methods
        methods = {
            'pdfminer': extract_text(file_path),
            'pdfplumber': "\n".join(page.extract_text() for page in pdfplumber_open(file_path).pages),
            'PyPDF2': "\n".join(page.extract_text() for page in PdfReader(file_path).pages)
        }
        
        # Choose the method with most content
        best_text = max(methods.values(), key=len)
        
        if not best_text.strip():
            print("No text extracted. Attempting OCR...")
            from pytesseract import image_to_string
            from pdf2image import convert_from_path
            
            images = convert_from_path(file_path)
            best_text = "\n".join(image_to_string(img) for img in images)
            
            if not best_text.strip():
                print("OCR failed to extract text")
                return None
        
        # Display extracted data
        print("\nExtracted Text Preview:")
        print(best_text[:2000])  # Show first 2000 characters
        
        # Analyze document structure
        print("\nDocument Structure Analysis:")
        lines = best_text.split('\n')
        sections = [line.strip() for line in lines if line.strip().isupper() or ':' in line]
        
        print("\nDetected Sections/Fields:")
        for i, section in enumerate(sections[:20]):  # Show first 20 sections
            print(f"{i+1}. {section}")
            if i == 19:
                print("... (truncated)")
        
        # Save extracted data
        output_file = file_path.with_suffix('.txt')
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(best_text)
        print(f"\nFull text saved to: {output_file}")
        
        return best_text
        
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

# Process all PDFs in directory
pdf_dir = Path('data/raw')  # Update with your directory
for pdf_file in pdf_dir.rglob('*.pdf'):
    print("\n" + "="*50)
    print(f"Processing: {pdf_file.name}")
    print("="*50)
    extracted_data = view_pdf_data(pdf_file)
    
    if extracted_data:
        print("\nExtraction successful!")
    else:
        print("\nExtraction failed for this document")


Processing: Personal-Loan-Application-with-Disclosures-Fill-Form.pdf

Analyzing Personal-Loan-Application-with-Disclosures-Fill-Form.pdf:

Extracted Text Preview:
PERSONAL  LOAN  APPLICATION
[SEE REVERSE SIDE FOR IMPORTANT INFORMATION ABOUT PROCEDURES FOR OPENING A NEW 

ACCOUNT]

IMPORTANT: Read these Directions before completing this Application. Check the Appropriate Box. 

you are applying for individual credit or an individual account, in your own name, and are relying on your own income or assets and not the income or assets of another person as the basis for repayment

fI
of the credit requested, complete only Sections A-D. If the requested credit or account is to be secured, also complete the first part of Section F.

If you are applying for joint credit with another person or for a joint account or an account that you and another person will use, complete all Sections, providing information in Section E about the joint
applicant.

We intend to apply for joint credit.

Applica