# Legal Document Analysis

This notebook analyzes a PDF document to identify its legal document type using best practices.

In [1]:
# Install required packages if not already installed
!pip install PyPDF2 nltk spacy pandas numpy scikit-learn

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
     ---------------------------------------- 0.0/232.6 kB ? eta -:--:--
     ---- -------------------------------- 30.7/232.6 kB 640.0 kB/s eta 0:00:01
     --------- --------------------------- 61.4/232.6 kB 544.7 kB/s eta 0:00:01
     --------------- -------------------- 102.4/232.6 kB 653.6 kB/s eta 0:00:01
     ---------------------------- ------- 184.3/232.6 kB 926.0 kB/s eta 0:00:01
     -------------------------------------- 232.6/232.6 kB 1.1 MB/s eta 0:00:00
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ----- ---------------------------------- 0.2/1.5 MB 6.3 MB/s eta 0:00:01
     ------------ --------------------------- 0.5/1.5 MB 5.9 MB/s eta 0:00:01
     -------------------------- ------------- 1.0/1.5 MB 6.9 MB/s eta 0:00:01
     -------------------------------------- - 1.4/1.5 MB 7.6 MB/s eta 0:00:01
     -


[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import PyPDF2
import nltk
import spacy
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

# Download required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB 1.3 MB/s eta 0:00:10
     --------------------------------------- 0.1/12.8 MB 787.7 kB/s eta 0:00:17
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
      --------------------------------------- 0.3/12.8 MB 1.7 MB/s eta 0:00:08
     - -------------------------------------- 0.5/12.8 MB 2.4 MB/s eta 0:00:06
     -- ------------------------------------- 0.7/12.8 MB 2.6 MB/s eta 0:00:05
     -- ------------------------------------- 0.8/12.8 MB 2.8 MB/s eta 0:00:05
     ---- ----------------------------------- 1.3/12.8 MB 3.7 MB/s eta 0:00:04
     ------ --------------------------------- 2.1/12.8 MB 5.1 MB/s eta 0:00:03
     -------- --------------------------


[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [7]:
def extract_text_from_pdf(pdf_path):
    """Extract text from PDF, handling potential encryption."""
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Check if PDF is encrypted
            if pdf_reader.is_encrypted:
                print("PDF is encrypted. Please provide password.")
                return None
            
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
            
            return text
    except Exception as e:
        print(f"Error reading PDF: {str(e)}")
        return None

In [8]:
def analyze_legal_document(text):
    """Analyze the document to identify its type and key characteristics."""
    if not text:
        return None
    
    # Process with spaCy
    doc = nlp(text)
    
    # Common legal document keywords
    document_types = {
        'contract': ['agreement', 'contract', 'party', 'parties', 'terms', 'conditions'],
        'affidavit': ['affidavit', 'sworn', 'depose', 'oath', 'declare'],
        'will': ['will', 'testament', 'bequest', 'executor', 'heir', 'estate'],
        'power_of_attorney': ['power of attorney', 'attorney-in-fact', 'principal'],
        'lease': ['lease', 'tenant', 'landlord', 'premises', 'rent'],
        'deed': ['deed', 'property', 'grantor', 'grantee', 'convey'],
        'court_filing': ['court', 'plaintiff', 'defendant', 'jurisdiction', 'petition']
    }
    
    # Count occurrences of keywords
    type_scores = {doc_type: 0 for doc_type in document_types}
    
    # Analyze text for each document type
    text_lower = text.lower()
    for doc_type, keywords in document_types.items():
        for keyword in keywords:
            type_scores[doc_type] += text_lower.count(keyword)
    
    # Get the most likely document type
    likely_type = max(type_scores.items(), key=lambda x: x[1])
    
    # Extract key entities
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    return {
        'document_type': likely_type[0],
        'confidence_score': likely_type[1],
        'type_scores': type_scores,
        'key_entities': entities,
        'document_length': len(text),
        'paragraph_count': len(text.split('\n\n'))
    }


In [9]:
# Path to the PDF file
pdf_path = Path('encrypted_name.pdf')

# Extract text from PDF
print("Extracting text from PDF...")
text = extract_text_from_pdf(pdf_path)

if text:
    print("\nAnalyzing document...")
    analysis_results = analyze_legal_document(text)
    
    print("\nAnalysis Results:")
    print(f"Document Type: {analysis_results['document_type'].replace('_', ' ').title()}")
    print(f"Confidence Score: {analysis_results['confidence_score']}")
    print("\nType Scores:")
    for doc_type, score in analysis_results['type_scores'].items():
        print(f"{doc_type.replace('_', ' ').title()}: {score}")
    
    print("\nKey Entities Found:")
    for entity, label in analysis_results['key_entities'][:10]:  # Show first 10 entities
        print(f"{label}: {entity}")
    
    print(f"\nDocument Statistics:")
    print(f"Length: {analysis_results['document_length']} characters")
    print(f"Paragraphs: {analysis_results['paragraph_count']}")

Extracting text from PDF...

Analyzing document...

Analysis Results:
Document Type: Contract
Confidence Score: 122

Type Scores:
Contract: 122
Affidavit: 0
Will: 10
Power Of Attorney: 0
Lease: 2
Deed: 1
Court Filing: 10

Key Entities Found:
CARDINAL: 1
CARDINAL: 4
DATE: August 19th, 2024
ORG: the “Effective Date
ORG: ASIA PTE LTD
DATE: 202238741E
GPE: Singapore
CARDINAL: 328
FAC: North Bridge Road, Raffles Arcade
DATE: 188719

Document Statistics:
Length: 11059 characters
Paragraphs: 1
