# Legal Document Comparison and Anonymization
## Tenancy Agreement Processing System
This notebook processes and compares tenancy agreements during tenant onboarding. It helps ensure consistency across agreements while protecting personal information.

In [None]:
# Import Required Libraries
import re
import difflib
import pandas as pd
import numpy as np
import spacy
from pathlib import Path

# Load Legal Documents
Read the input documents from files and prepare them for processing.

In [None]:
def load_document(file_path):
    """Return the complete text of the UTF-8 encoded file at ``file_path``."""
    # Read mode is open()'s default, so only the encoding is spelled out.
    with open(file_path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Example usage: paths of the two agreements to compare
doc1_path = "document1.txt"
doc2_path = "document2.txt"

try:
    document1 = load_document(doc1_path)
    document2 = load_document(doc2_path)
except FileNotFoundError:
    print("Please ensure the document files exist in the correct location.")
    # Re-raise after the hint: the following cells read document1/document2,
    # so carrying on here would only fail later with a confusing NameError.
    raise

# Preprocess Text
Clean and normalize the text to ensure consistent comparison.

In [None]:
def preprocess_text(text):
    """Clean and normalize text so two documents can be compared consistently.

    The text is lowercased, every character other than word characters,
    whitespace, periods and commas is dropped, and each run of whitespace
    (newlines included) is collapsed to a single space.

    Args:
        text: Raw document text.

    Returns:
        The normalized text without leading or trailing whitespace.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove special characters but keep periods and commas. This runs before
    # the whitespace pass: deleting a symbol that sits between two spaces
    # ("rent - deposit") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s.,]', '', text)

    # Collapse whitespace runs to a single space
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Normalize each loaded agreement, first document first
doc1_processed, doc2_processed = (
    preprocess_text(raw) for raw in (document1, document2)
)

# Compare Documents for Matching
Implement document comparison using difflib to identify similarities and differences.

In [None]:
def compare_documents(text1, text2):
    """Return ``(similarity, diff)`` for two texts.

    ``similarity`` is the character-level ``difflib.SequenceMatcher`` ratio of
    the two strings; ``diff`` is the list of lines produced by
    ``difflib.Differ`` when comparing the texts line by line.
    """
    # Character-level similarity score
    matcher = difflib.SequenceMatcher(None, text1, text2)

    # Line-level differences
    line_diff = list(
        difflib.Differ().compare(text1.splitlines(), text2.splitlines())
    )

    return matcher.ratio(), line_diff

# Compare the normalized documents. `similarity_ratio` is the overall score;
# `differences` holds the line diff, which is not printed in this cell.
similarity_ratio, differences = compare_documents(doc1_processed, doc2_processed)
# Show the score as a percentage with two decimals.
print(f"Document similarity: {similarity_ratio:.2%}")

# Identify and Remove Real Names
Use spaCy named entity recognition to find people, organisations and places in the documents and replace them with generic labels such as `[PERSON]`.

In [None]:
# Load the spaCy pipeline once at module level; anonymize_text() below reuses
# this `nlp` object for named entity recognition.
# NOTE(review): presumably "en_core_web_sm" has to be installed separately — confirm.
nlp = spacy.load("en_core_web_sm")

def anonymize_text(text):
    """Return ``text`` with PERSON, ORG, GPE and LOC entities replaced by labels.

    Entities are found with the module-level spaCy pipeline ``nlp``; each
    matching entity is swapped for its label in square brackets, e.g.
    ``[PERSON]``.
    """
    masked = text

    # Walk the entities back to front so that a substitution never shifts the
    # character offsets of the entities that are still to be processed.
    for entity in reversed(nlp(text).ents):
        if entity.label_ not in ("PERSON", "ORG", "GPE", "LOC"):
            continue
        placeholder = f"[{entity.label_}]"
        masked = "".join(
            (masked[:entity.start_char], placeholder, masked[entity.end_char:])
        )

    return masked

# Mask personal details in the raw (not yet normalized) documents
doc1_anonymous, doc2_anonymous = (
    anonymize_text(original) for original in (document1, document2)
)

# Normalize the masked texts and score how similar they are
anon_similarity, anon_diff = compare_documents(
    *(preprocess_text(masked) for masked in (doc1_anonymous, doc2_anonymous))
)
print(f"Anonymized document similarity: {anon_similarity:.2%}")

# Legal Document Comparison Tool
This notebook provides functionality to compare two legal documents and ensure sensitive information is removed.

In [None]:
# Import Required Libraries
import re
import difflib
import spacy
import pandas as pd
from typing import List, Tuple

# Load Legal Documents
We'll create functions to load and initially process the legal documents.

In [None]:
def load_document(file_path: str) -> str:
    """Read the UTF-8 text file at ``file_path`` and return everything in it."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        text = source.read()
    return text

def split_into_lines(text: str) -> List[str]:
    """Return the lines of ``text`` with surrounding whitespace stripped.

    Lines that are empty after stripping are left out.
    """
    stripped_lines = map(str.strip, text.split('\n'))
    return [line for line in stripped_lines if line]

# Preprocess Text
Clean and normalize the text to ensure consistent comparison.

In [None]:
def preprocess_text(text: str) -> str:
    """Clean and normalize text for comparison.

    The text is lowercased, every character other than word characters,
    whitespace, periods and commas is dropped, and each run of whitespace
    (newlines included) is collapsed to a single space.

    Args:
        text: Raw document text.

    Returns:
        The normalized text without leading or trailing whitespace.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters but keep periods and commas. This runs before
    # the whitespace pass so that a deleted symbol cannot leave a double,
    # leading or trailing space behind ("- a" used to become " a").
    text = re.sub(r'[^\w\s.,]', '', text)
    # Collapse whitespace; split()/join also trims both ends
    return ' '.join(text.split())

# Compare Documents for Similarity
Implement document comparison using difflib to identify differences.

In [None]:
def compare_documents(doc1: str, doc2: str) -> Tuple[float, List[str]]:
    """Compare two documents and return similarity ratio and differences.

    Args:
        doc1: Raw text of the first document.
        doc2: Raw text of the second document.

    Returns:
        A ``(similarity, diff)`` tuple. ``similarity`` is the
        ``difflib.SequenceMatcher`` ratio of the raw, unnormalized texts.
        ``diff`` is the ``difflib.Differ`` output over the normalized,
        non-empty lines of both documents.
    """
    # Split first, normalize second: preprocess_text collapses every run of
    # whitespace (newlines included), so normalizing the whole document up
    # front would leave a single line and make a line diff meaningless.
    # The `if line` filter drops lines that consisted only of removed symbols.
    lines1 = [line for line in map(preprocess_text, split_into_lines(doc1)) if line]
    lines2 = [line for line in map(preprocess_text, split_into_lines(doc2)) if line]

    # Calculate similarity on the texts exactly as they were passed in
    matcher = difflib.SequenceMatcher(None, doc1, doc2)
    similarity = matcher.ratio()

    # Get line-level differences
    differ = difflib.Differ()
    diff = list(differ.compare(lines1, lines2))

    return similarity, diff

# Identify and Remove Real Names
Use spaCy named entity recognition to find people, organisations and places and replace them with placeholders such as `[PERSON]`.

In [None]:
# Load the spaCy pipeline once at module level; remove_personal_info() below
# reuses this `nlp` object for named entity recognition.
# NOTE(review): presumably "en_core_web_sm" has to be installed separately — confirm.
nlp = spacy.load("en_core_web_sm")

def remove_personal_info(text: str) -> str:
    """Return ``text`` with PERSON, ORG, GPE and LOC entities replaced by labels.

    Entities come from the module-level spaCy pipeline ``nlp``. Each matching
    entity is replaced by its label in square brackets, e.g. ``[ORG]``.
    """
    sensitive_labels = ('PERSON', 'ORG', 'GPE', 'LOC')
    result = text

    # Last entity first: replacing from the end keeps the character offsets of
    # the earlier entities valid.
    for entity in reversed(nlp(text).ents):
        if entity.label_ not in sensitive_labels:
            continue
        head = result[:entity.start_char]
        tail = result[entity.end_char:]
        result = f"{head}[{entity.label_}]{tail}"

    return result

# Example Usage
Here's how to use the functions defined above.

In [None]:
# Two sample agreements that differ only in the amount owed
sample_text1 = """
John Smith agrees to pay $1000 to ABC Corp
located at 123 Main St, New York.
"""

sample_text2 = """
John Smith agrees to pay $1500 to ABC Corp
located at 123 Main St, New York.
"""

# Score how similar the two samples are (the diff itself is not shown here)
similarity, differences = compare_documents(sample_text1, sample_text2)
print(f"Document similarity: {similarity:.2%}")

# Mask the named entities in the first sample and show the result
anonymized_text = remove_personal_info(sample_text1)
print("\nAnonymized text:", anonymized_text, sep="\n")

# Legal Document Comparison and Anonymization

This notebook compares two legal documents for similarity and removes sensitive information like real names and addresses.

## Import Required Libraries

We'll import necessary libraries for text processing, comparison, and pattern matching.

In [None]:
import re
import difflib
import spacy
import pandas as pd
from typing import List, Tuple

# Load the English spaCy pipeline once at module level; anonymize_document()
# further down reuses this `nlp` object for named entity recognition (NER).
# NOTE(review): presumably "en_core_web_sm" has to be installed separately — confirm.
nlp = spacy.load("en_core_web_sm")

## Load Legal Documents

Load the two legal documents that need to be compared and anonymized.

In [None]:
def load_document(file_path: str) -> str:
    """Open ``file_path`` as UTF-8 text and return the whole file as one string."""
    with open(file_path, encoding='utf-8') as document_file:
        whole_text = document_file.read()
    return whole_text

# Example usage with real files (uncomment and adjust the paths):
# doc1 = load_document('path_to_first_document.txt')
# doc2 = load_document('path_to_second_document.txt')

# For demonstration, use two inline sample texts instead. They share the same
# first line and differ only in the wording after "Daniel Hibbard".
doc1 = """This agreement between Jake Lander residing at 145 Ladyshot
and Daniel Hibbard states the following terms..."""

doc2 = """This agreement between Jake Lander residing at 145 Ladyshot
and Daniel Hibbard includes the following conditions..."""

## Preprocess Text

Clean and normalize the text to ensure consistent comparison.

In [None]:
def preprocess_text(text: str) -> str:
    """Clean and normalize text for comparison.

    The text is lowercased, reduced to letters, digits, whitespace and the
    basic punctuation ``. , ! ? -``, and each run of whitespace (newlines
    included) is collapsed to a single space.

    Args:
        text: Raw document text.

    Returns:
        The normalized text without leading or trailing whitespace.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters but keep basic punctuation. \w is used rather
    # than a-z0-9 so that letters such as "é" or "ü" in names survive; the
    # underscore, which \w also covers, is still removed as before. This runs
    # before the whitespace pass so a deleted symbol cannot leave a double
    # space behind.
    text = re.sub(r'[^\w\s.,!?-]|_', '', text)
    # Collapse whitespace runs to a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalize the two sample documents, first document first
doc1_processed, doc2_processed = (
    preprocess_text(sample) for sample in (doc1, doc2)
)

## Compare Documents for Similarity

Use difflib to compare the documents and calculate similarity ratio.

In [None]:
def compare_documents(text1: str, text2: str) -> Tuple[float, List[str]]:
    """Return the similarity ratio and the line diff of two texts.

    The ratio comes from a character-level ``difflib.SequenceMatcher``; the
    diff is the ``difflib.Differ`` output for the texts split into lines.
    """
    # Similarity score of the two strings
    ratio = difflib.SequenceMatcher(a=text1, b=text2).ratio()

    # Line-by-line differences
    diff_lines = [
        *difflib.Differ().compare(text1.splitlines(), text2.splitlines())
    ]

    return ratio, diff_lines

# Score the normalized documents and report the ratio as a percentage
similarity_ratio, differences = compare_documents(doc1_processed, doc2_processed)
print(f"Document similarity: {similarity_ratio:.2%}")

# Print only the diff entries that carry a change marker
print("\nDifferences found:")
for d in differences:
    if not d.startswith(('+ ', '- ', '? ')):
        continue  # entry without a change marker
    print(d)

## Identify and Remove Real Names

Use spaCy Named Entity Recognition (NER) to replace people, organisations and places with generic labels, and regular expressions to mask phone numbers and street addresses.

In [None]:
# Entity labels that are replaced by a placeholder.
_SENSITIVE_ENTITY_LABELS = frozenset({'PERSON', 'GPE', 'LOC', 'ORG'})

# Regex fallbacks for sensitive data, applied after the entity replacement.
_SENSITIVE_PATTERNS = (
    # Ten-digit phone numbers. The digit guards on both sides stop the pattern
    # from firing inside longer digit runs such as account numbers.
    (re.compile(r'(?<!\d)\d{3}[\s-]?\d{3}[\s-]?\d{4}(?!\d)'), '[PHONE]'),
    # Street addresses. The trailing \b keeps a suffix such as "St" from
    # matching the beginning of an ordinary word ("States", "Starting").
    (
        re.compile(
            r'\d+\s+[A-Za-z\s]+(?:Street|St|Avenue|Ave|Road|Rd|Lane|Ln)\b'
        ),
        '[ADDRESS]',
    ),
)


def anonymize_document(text: str) -> str:
    """Remove sensitive information from text using NER and regex.

    Entities labelled PERSON, GPE, LOC or ORG by the module-level spaCy
    pipeline ``nlp`` are replaced by their label in square brackets, wherever
    their text occurs as a whole word. Phone numbers and street addresses are
    then masked as ``[PHONE]`` and ``[ADDRESS]``.

    Args:
        text: Document text to anonymize.

    Returns:
        The text with the detected sensitive spans replaced by placeholders.
    """
    # Use spaCy for Named Entity Recognition
    doc = nlp(text)

    # Map each distinct entity text to its placeholder. When the same text is
    # seen with different labels, the first label wins.
    placeholders = {}
    for ent in doc.ents:
        entity_text = str(ent)
        if ent.label_ in _SENSITIVE_ENTITY_LABELS and entity_text:
            placeholders.setdefault(entity_text, f"[{ent.label_}]")

    anonymized_text = text
    if placeholders:
        # One pass over the text with every entity as an alternative:
        # - longest first, so "New York Times" is not broken up by "New York";
        # - guarded against adjacent word characters, so an entity "Ann" does
        #   not rewrite "Annual";
        # - a single pass, so inserted placeholders are never matched again.
        alternatives = '|'.join(
            re.escape(entity_text)
            for entity_text in sorted(placeholders, key=len, reverse=True)
        )
        entity_pattern = re.compile(rf'(?<!\w)(?:{alternatives})(?!\w)')
        anonymized_text = entity_pattern.sub(
            lambda match: placeholders[match.group(0)], anonymized_text
        )

    # Additional regex patterns for specific sensitive information
    for pattern, replacement in _SENSITIVE_PATTERNS:
        anonymized_text = pattern.sub(replacement, anonymized_text)

    return anonymized_text

# Mask sensitive details in both sample documents
doc1_anonymized, doc2_anonymized = (
    anonymize_document(sample) for sample in (doc1, doc2)
)

# Show the first document before and after anonymization
print(
    "Original Document 1:",
    doc1,
    "\nAnonymized Document 1:",
    doc1_anonymized,
    sep="\n",
)