# Legal Document Analysis

This notebook extracts the text of a PDF document and identifies its legal document type (for example NDA, contract, will, or deed), first with keyword scoring and then with the Gemini API.

In [None]:
# Install required packages if not already installed
!pip install PyPDF2 nltk spacy pandas numpy scikit-learn

In [None]:
import PyPDF2
import nltk
import spacy
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

# Download required NLTK data. Looping over one tuple keeps the resource list
# in a single place and avoids echoing the last call's return value as the
# cell's output.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(nltk_resource)

In [None]:
# Download the English model through spaCy's own API so it is installed into
# the environment of the running kernel; a shell `python` may be a different
# interpreter than the one this notebook imports spaCy from.
spacy.cli.download("en_core_web_sm")

In [None]:
# Load the spaCy pipeline "en_core_web_sm" once at module level; `nlp` is the
# object analyze_legal_document() calls on the document text to get entities.
nlp = spacy.load("en_core_web_sm")

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path (str or path-like) of the PDF to read.

    Returns:
        The text of all pages concatenated in page order, or None when the
        PDF is encrypted or cannot be opened/parsed. In both failure cases a
        message is printed instead of an exception being raised.
    """
    try:
        # PDFs are binary files, so the mode must be 'rb'. An invalid mode
        # string makes open() raise ValueError before the file is even touched.
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Encrypted PDFs are not decrypted here; report it and give up.
            if pdf_reader.is_encrypted:
                print("PDF is encrypted. Please provide password.")
                return None

            # Join once instead of growing a string page by page.
            return "".join(page.extract_text() for page in pdf_reader.pages)
    except Exception as e:
        # Best-effort by design: any failure is reported and mapped to None.
        print(f"Error reading PDF: {str(e)}")
        return None

In [None]:
def analyze_legal_document(text):
    """Classify a document by keyword counts and list its named entities.

    Args:
        text: Full document text. Falsy values (None or "") are rejected.

    Returns:
        None when ``text`` is falsy; otherwise a dict with:
        - 'document_type': key of the highest-scoring type, or 'unknown'
          when no keyword occurs at all
        - 'confidence_score': the highest raw keyword-occurrence count
        - 'type_scores': occurrence count for every document type
        - 'key_entities': (text, label) pairs taken from ``nlp(text).ents``
        - 'document_length': number of characters in ``text``
        - 'paragraph_count': number of chunks separated by a blank line
    """
    if not text:
        return None

    # Process with spaCy (module-level pipeline `nlp`)
    doc = nlp(text)

    # Common legal document keywords
    document_types = {
        'NDA': ['nondisclosure', 'secret', 'parties'],
        'contract': ['agreement', 'contract', 'party', 'parties', 'terms', 'conditions'],
        'affidavit': ['affidavit', 'sworn', 'depose', 'oath', 'declare'],
        'will': ['will', 'testament', 'bequest', 'executor', 'heir', 'estate'],
        'power_of_attorney': ['power of attorney', 'attorney-in-fact', 'principal'],
        'lease': ['lease', 'tenant', 'landlord', 'premises', 'rent'],
        'deed': ['deed', 'property', 'grantor', 'grantee', 'convey'],
        'court_filing': ['court', 'plaintiff', 'defendant', 'jurisdiction', 'petition']
    }

    # Score each type by summing substring counts on the lower-cased text.
    # These are plain substring counts, so a keyword also matches inside a
    # longer word (e.g. 'lease' inside 'leases').
    text_lower = text.lower()
    type_scores = {
        doc_type: sum(text_lower.count(keyword) for keyword in keywords)
        for doc_type, keywords in document_types.items()
    }

    # Highest score wins; ties go to the type listed first.
    best_type, best_score = max(type_scores.items(), key=lambda item: item[1])
    if best_score == 0:
        # Without this, a text with no keyword at all would be reported as
        # the first dictionary key, which is misleading.
        best_type = 'unknown'

    # Extract key entities
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    return {
        'document_type': best_type,
        'confidence_score': best_score,
        'type_scores': type_scores,
        'key_entities': entities,
        'document_length': len(text),
        'paragraph_count': len(text.split('\n\n'))
    }


In [None]:
# Location of the PDF to classify
pdf_path = Path('encrypted_name.pdf')

# Read the PDF; extract_text_from_pdf() gives back None when reading fails
print("Extracting text from PDF...")
text = extract_text_from_pdf(pdf_path)

# Nothing is analyzed or reported unless some text came back
if text:
    print("\nAnalyzing document...")
    analysis_results = analyze_legal_document(text)

    # Headline: best-scoring type and its score, then the per-type heading
    print(
        "\nAnalysis Results:",
        f"Document Type: {analysis_results['document_type'].replace('_', ' ').title()}",
        f"Confidence Score: {analysis_results['confidence_score']}",
        "\nType Scores:",
        sep="\n",
    )
    for doc_type, score in analysis_results['type_scores'].items():
        print(f"{doc_type.replace('_', ' ').title()}: {score}")

    # Only the first 10 entities are listed to keep the output short
    print("\nKey Entities Found:")
    for entity, label in analysis_results['key_entities'][:10]:
        print(f"{label}: {entity}")

    print(
        "\nDocument Statistics:",
        f"Length: {analysis_results['document_length']} characters",
        f"Paragraphs: {analysis_results['paragraph_count']}",
        sep="\n",
    )

In [1]:
# Install required packages
!pip install PyPDF2 google-cloud-aiplatform python-dotenv google.generativeai

: 

In [None]:
import PyPDF2
import google.generativeai as genai
from pathlib import Path
import os
from dotenv import load_dotenv
import re
from collections import Counter

# Load environment variables (python-dotenv's load_dotenv)
load_dotenv()

# Configure Gemini API from the environment only. An API key must never be
# written into the notebook itself: notebooks are shared and committed, and a
# key that has appeared in one should be treated as leaked and revoked.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
else:
    # Warn instead of raising so the keyword analysis can still be run.
    print("GOOGLE_API_KEY is not set; Gemini analysis will fail until it is provided.")

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Returns None, after printing a message, when the PDF is encrypted or when
    any exception is raised while opening or reading it.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)

            # No decryption is attempted: encrypted files are reported and skipped
            if reader.is_encrypted:
                print("PDF is encrypted. Please provide password.")
                return None

            # Accumulate page text in page order
            collected = ""
            for current_page in reader.pages:
                collected = collected + current_page.extract_text()
            return collected
    except Exception as e:
        # Any failure is reported on stdout and signalled to the caller as None
        print(f"Error reading PDF: {str(e)}")
        return None

In [None]:
# Enhanced keyword patterns for different document types.
# Each entry maps a document type to:
#   'keywords'       - regular expressions searched in the lower-cased text
#   'required_count' - minimum number of matches for the type to be considered
# Patterns use only non-capturing groups so that re.findall() returns the
# whole matched text rather than a group.
DOCUMENT_PATTERNS = {
    'NDA': {
        'keywords': [
            r'\bnondisclosure\b',
            # Hyphenated / spaced spelling. A bare r'\bnon\b' would also hit
            # unrelated words such as "non-refundable".
            r'\bnon[-\s]disclosure\b',
            # "party" or "parties" (r'parties?' only matches "partie(s)")
            r'\bpart(?:y|ies)\b',
            r'\bsecrets?\b',
        ],
        'required_count': 2
    },
    'contract': {
        'keywords': [
            r'\b(?:this\s+)?agreement\b',
            r'\bcontract\b',
            # "party" or "parties" (r'parties?' only matches "partie(s)")
            r'\bpart(?:y|ies)\b',
            r'\bhereby\s+agree\b',
            r'\bterms\s+and\s+conditions\b',
            r'\bin\s+witness\s+whereof\b'
        ],
        'required_count': 2
    },
    'power_of_attorney': {
        'keywords': [
            r'\bpower\s+of\s+attorney\b',
            r'\battorney[-\s]in[-\s]fact\b',
            r'\bprincipal\b',
            r'\bhereby\s+appoint\b',
            r'\bauthorize\s+and\s+empower\b'
        ],
        'required_count': 2
    },
    'will': {
        'keywords': [
            r'\blast\s+will\s+and\s+testament\b',
            r'\btestator\b',
            r'\bexecutor\b',
            r'\bbequest\b',
            r'\bdevise\b',
            r'\binherit\b',
            r'\bestate\b'
        ],
        'required_count': 3
    },
    'deed': {
        'keywords': [
            r'\bdeed\b',
            r'\bgrantor\b',
            r'\bgrantee\b',
            r'\bconvey\b',
            r'\bparcel\b',
            r'\breal\s+property\b'
        ],
        'required_count': 2
    },
    'court_filing': {
        'keywords': [
            r'\bin\s+the\s+court\b',
            r'\bplaintiff\b',
            r'\bdefendant\b',
            r'\bcause\s+(?:no|number)\b',
            r'\bmotion\b',
            r'\bpetition\b',
            r'\bcomplaint\b'
        ],
        'required_count': 2
    }
}

def analyze_with_keywords(text):
    """Score a document against every entry of DOCUMENT_PATTERNS.

    Args:
        text: Document text; None or "" is treated as an empty document.

    Returns:
        A dict with one entry per document type, each a dict of:
        - 'match_count': total number of regex matches over all patterns
        - 'confidence': fraction of the type's patterns that matched at
          least once, always within [0.0, 1.0]
        - 'meets_threshold': True when match_count >= the type's
          'required_count'
        - 'matches': list of the matched strings
        plus the key 'most_likely_type': the type meeting its threshold with
        the highest confidence (ties broken by match_count), or 'unknown'
        when no type meets its threshold.
    """
    text_lower = (text or "").lower()
    results = {}

    for doc_type, pattern_info in DOCUMENT_PATTERNS.items():
        keywords = pattern_info['keywords']
        matches = []
        patterns_hit = 0
        for pattern in keywords:
            found = re.findall(pattern, text_lower)
            if found:
                patterns_hit += 1
                matches.extend(found)

        match_count = len(matches)
        results[doc_type] = {
            'match_count': match_count,
            # Ratio of distinct patterns seen. Dividing the total occurrence
            # count by the pattern count instead would give values above 1.0
            # for any document that repeats a keyword.
            'confidence': patterns_hit / len(keywords) if keywords else 0.0,
            'meets_threshold': match_count >= pattern_info['required_count'],
            'matches': matches
        }

    # Determine most likely type among those that meet their threshold
    valid_types = {k: v for k, v in results.items() if v['meets_threshold']}
    if valid_types:
        results['most_likely_type'] = max(
            valid_types,
            key=lambda k: (valid_types[k]['confidence'], valid_types[k]['match_count']),
        )
    else:
        results['most_likely_type'] = 'unknown'

    return results

In [None]:
def analyze_with_gemini(text, model_name='gemini-pro', max_chars=1500):
    """Ask a Gemini model to name the legal document type of ``text``.

    Args:
        text: Document text. Only its first ``max_chars`` characters are
            placed in the prompt.
        model_name: Model name passed to ``genai.GenerativeModel``.
        max_chars: Maximum number of characters of ``text`` to send.

    Returns:
        The text of the model's response, or a string starting with
        "Error using Gemini API:" when anything in the call raises.
    """
    try:
        model = genai.GenerativeModel(model_name)

        excerpt = text[:max_chars]
        # Mark the excerpt as cut off only when something was actually dropped.
        if len(text) > max_chars:
            excerpt += "..."

        # Built from plain lines so that no source-code comment or indentation
        # ends up inside the text sent to the model.
        prompt = (
            "Analyze the following legal document text and determine its type.\n"
            "Consider common legal document types. Provide your analysis and confidence level.\n"
            "\n"
            "Text to analyze:\n"
            f"{excerpt}\n"
            "\n"
            "Please provide your response in the following format:\n"
            "Document Type: [type]\n"
            "Confidence: [high/medium/low]\n"
            "Reasoning: [brief explanation]\n"
        )

        response = model.generate_content(prompt)
        return response.text

    except Exception as e:
        # Best-effort by design: the error is returned as text, not raised.
        return f"Error using Gemini API: {str(e)}"

In [None]:
# Main analysis: keyword scoring first, then the Gemini model
pdf_path = Path('encrypted_name.pdf')

# Read the PDF; extract_text_from_pdf() gives back None when reading fails
print("Extracting text from PDF...")
text = extract_text_from_pdf(pdf_path)

# Both analyses are skipped when no text was extracted
if text:
    print("\nPerforming keyword analysis...")
    keyword_results = analyze_with_keywords(text)

    print(
        "\nKeyword Analysis Results:",
        f"Most Likely Document Type: {keyword_results['most_likely_type'].replace('_', ' ').title()}",
        "\nDetailed Results:",
        sep="\n",
    )
    for doc_type, info in keyword_results.items():
        # 'most_likely_type' is the summary entry, not a per-type record
        if doc_type == 'most_likely_type':
            continue
        print(f"\n{doc_type.replace('_', ' ').title()}:")
        print(f"Confidence: {info['confidence']:.2f}")
        print(f"Matches Found: {info['match_count']}")
        print(f"Meets Threshold: {'Yes' if info['meets_threshold'] else 'No'}")

    # Second opinion from the Gemini model on the same text
    print("\nPerforming Gemini AI analysis...")
    gemini_results = analyze_with_gemini(text)
    print("\nGemini Analysis Results:", gemini_results, sep="\n")

### Summarization

In [1]:
from document_processor import process_legal_documents
from pathlib import Path
import json
from datetime import datetime

# Folder handed to process_legal_documents()
data_path = Path('data')
print(f"Starting document processing from: {data_path}")

results = process_legal_documents(data_path)

# A timestamp in the file name keeps each run's results in a separate file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"analysis_results_{timestamp}.json"

# ensure_ascii=False writes non-ASCII characters as-is into the UTF-8 file
with open(output_file, mode='w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"\nProcessed {len(results)} documents.", f"Results saved to: {output_file}", sep="\n")

# Per-document summary, read from each result's 'file_path', 'doc_type' and
# 'languages' entries ('languages' is unpacked as (language, confidence) pairs)
print("\nDocument Summary:")
for result in results:
    print(f"\nFile: {result['file_path']}")
    print(f"Type: {result['doc_type']}")
    print(f"Languages: {', '.join(f'{lang} ({conf:.2f})' for lang, conf in result['languages'])}")

  from .autonotebook import tqdm as notebook_tqdm


Starting document processing from: data

Processing: data\20240609 OLT Daftar Permintaan Dokumen - PT REGENE.docx
Analyzing document type: default, language: id
Analysis complete for: 20240609 OLT Daftar Permintaan Dokumen - PT REGENE.docx

--------------------------------------------------------------------------------
Analysis results:
**1. Jenis dan Tujuan Dokumen**

Daftar Permintaan Dokumen dan Informasi

Tujuan: Untuk mengumpulkan dokumen dan informasi yang diperlukan dari pihak lain untuk peninjauan atau audit.

**2. Pihak-Pihak Utama yang Terlibat**

* PT REGENE ARTIFISIAL INTELIGEN (PT RAI)
* Pihak lain yang meminta dokumen dan informasi

**3. Ketentuan dan Syarat Utama**

Tidak ada ketentuan atau syarat utama yang disebutkan dalam dokumen yang disediakan.

**4. Tanggal atau Tenggat Waktu Penting**

Tidak ada tanggal atau tenggat waktu yang disebutkan dalam dokumen yang disediakan.

**5. Klausul atau Kondisi Khusus yang Penting**

* Catatan bahwa dokumen tambahan dapat diminta