# Legal Document Comparison and Anonymization
A notebook to compare two legal documents and remove sensitive information like real names.

## Import Required Libraries
We'll import the necessary Python libraries for text processing and comparison.

In [None]:
import re
import difflib
import pandas as pd
from typing import List, Tuple
import spacy
import numpy as np

## Load Legal Documents
Load the two documents from text files so they can be compared.

In [None]:
def load_document(file_path: str) -> str:
    """
    Load a document from a UTF-8 encoded text file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full contents of the file, or an empty string if no file
        exists at ``file_path``.

    Warns:
        UserWarning: If the file does not exist. The empty-string fallback
        is kept for backward compatibility, but it is reported because two
        missing files would otherwise compare as identical documents.
    """
    # Local import so this function does not depend on edits to the import cell
    import warnings

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        warnings.warn(
            f"Document not found: {file_path!r}; using empty text instead",
            stacklevel=2,
        )
        return ""

# Example usage: read the two documents that will be compared
document1, document2 = (
    load_document(path) for path in ("document1.txt", "document2.txt")
)

## Preprocess Text
Clean and normalize the text to ensure consistent comparison.

In [None]:
def preprocess_text(text: str) -> str:
    """
    Normalize and clean text for comparison.

    The text is lowercased, every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed, and
    each run of whitespace (including newlines) is collapsed to one space.

    Args:
        text: Raw document text.

    Returns:
        The normalized text on a single line, without leading or trailing
        whitespace.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove special characters first: deleting a symbol that sits between
    # two spaces ("a - b") would otherwise leave a double space behind
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse whitespace last so the result never contains repeated spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Normalize each raw document before comparing them
doc1_processed, doc2_processed = map(preprocess_text, (document1, document2))

## Compare Documents for Matching
Use difflib to compare the documents and identify differences.

In [None]:
def compare_documents(text1: str, text2: str) -> Tuple[float, List[str]]:
    """
    Compare two documents and return similarity score and differences.

    Args:
        text1: Text of the first document.
        text2: Text of the second document.

    Returns:
        A ``(similarity, diff)`` tuple. ``similarity`` is the character-level
        ``difflib.SequenceMatcher`` ratio between 0.0 and 1.0. ``diff`` is
        the line-by-line ``difflib.Differ`` output, where each entry is
        prefixed with ``'  '``, ``'- '``, ``'+ '`` or ``'? '``.

    Note:
        The diff is computed per line, so text whose newlines were already
        collapsed into spaces is diffed as one single line.
    """
    # autojunk must be off for character-level comparison: with the default
    # heuristic, any character occurring in more than ~1% of a 200+ character
    # text is ignored when seeding matches, which can drive the ratio of two
    # nearly identical documents towards 0. This costs more time on very
    # large inputs but keeps the score meaningful.
    matcher = difflib.SequenceMatcher(None, text1, text2, autojunk=False)
    similarity = matcher.ratio()

    # Get line-level differences
    differ = difflib.Differ()
    diff = list(differ.compare(text1.splitlines(), text2.splitlines()))

    return similarity, diff

# Score the normalized documents against each other and report the result
similarity_score, differences = compare_documents(
    text1=doc1_processed,
    text2=doc2_processed,
)
print(f"Document similarity: {similarity_score:.2%}")

## Identify and Remove Real Names
Use spaCy named entity recognition to find people, organizations, and places, and replace each one with a placeholder.

In [None]:
# Load the spaCy pipeline used for named entity recognition.
# The "en_core_web_sm" model package has to be installed in the environment
# (e.g. `python -m spacy download en_core_web_sm`) for this call to succeed.
nlp = spacy.load("en_core_web_sm")

def anonymize_text(text: str) -> str:
    """
    Replace named entities in the text with label placeholders.

    Entities that the spaCy pipeline labels PERSON, ORG or GPE are replaced
    by their bracketed label (for example "[PERSON]"). All other text,
    including entities with any other label, is left unchanged.

    Args:
        text: Raw document text.

    Returns:
        The text with every matching entity replaced by its placeholder.
    """
    sensitive_labels = ['PERSON', 'ORG', 'GPE']
    parsed = nlp(text)

    # Walk the entities in document order, copying the untouched text between
    # them and substituting a placeholder for each sensitive entity
    pieces = []
    cursor = 0
    for entity in parsed.ents:
        if entity.label_ not in sensitive_labels:
            continue
        pieces.append(text[cursor:entity.start_char])
        pieces.append(f"[{entity.label_}]")
        cursor = entity.end_char

    # Keep whatever follows the last replaced entity
    pieces.append(text[cursor:])
    return "".join(pieces)

# Anonymize the raw (unprocessed) text of both documents
doc1_anonymized, doc2_anonymized = (
    anonymize_text(raw_text) for raw_text in (document1, document2)
)