In [1]:
import re
import sys
import os

def mask_patterns(text):
    """
    Mask URLs, email addresses, and @usernames so later formatting passes leave them intact.

    Each match is replaced *at its own position* with a placeholder of the form
    ``__URL_0__``, ``__EMAIL_0__`` or ``__USERNAME_0__`` (numbered per kind, in
    order of appearance). Replacing by position rather than by a global string
    replace means a match that is a prefix of another one (``@john`` versus
    ``@john.doe``) cannot corrupt the longer match.

    Parameters:
    - text: The text to mask.

    Returns a tuple ``(masked_text, placeholders)`` where ``placeholders`` maps
    every placeholder to the original text it stands for.
    """
    # Order matters: URLs and emails are masked first so the '@' inside an
    # email address is gone before the username pattern runs.
    patterns = {
        'URL': r'https?://[^\s]+',
        'EMAIL': r'\b[\w\.-]+@[\w\.-]+\.\w{2,}\b',
        'USERNAME': r'@[\w\.]+'
    }

    placeholders = {}
    masked_text = text

    for key, pattern in patterns.items():
        pieces = []
        last_end = 0
        for i, match in enumerate(re.finditer(pattern, masked_text)):
            placeholder = f"__{key}_{i}__"
            placeholders[placeholder] = match.group(0)
            # Keep the untouched text between matches, then the placeholder.
            pieces.append(masked_text[last_end:match.start()])
            pieces.append(placeholder)
            last_end = match.end()
        pieces.append(masked_text[last_end:])
        masked_text = "".join(pieces)

    return masked_text, placeholders

def unmask_patterns(text, placeholders):
    """
    Restore the original URLs, emails, and usernames in previously masked text.

    Parameters:
    - text: Text containing placeholders.
    - placeholders: Mapping of placeholder -> original text.

    Substitutions are applied one after another in the mapping's iteration
    order, and every occurrence of a placeholder is replaced.
    """
    restored = text
    for token in placeholders:
        restored = restored.replace(token, placeholders[token])
    return restored

def process_legal_document(input_file_path, output_file_path):
    """
    Clean up spacing and formatting in a legal document and save the result.

    The whole file is read, URLs/emails/usernames are masked, the text is
    flattened to a single line, numbered points ("1. ", "2. ", ...) are moved
    onto their own paragraphs, spacing around ``. , ; :`` is normalized, the
    masked items are restored, and the result is written out. Repeated closing
    parentheses are deliberately not handled here.

    Parameters:
    - input_file_path: Path to the input text file (read as UTF-8).
    - output_file_path: Path to save the processed text file (written as UTF-8).

    Returns None. If the input file does not exist, an error is printed and the
    function returns without writing anything. Any exception raised during
    processing is printed and the interpreter exits with status 1.
    """
    try:
        # Check if input file exists
        if not os.path.isfile(input_file_path):
            print(f"Error: The input file '{input_file_path}' does not exist.")
            return

        # Read the entire content of the input file with proper encoding
        with open(input_file_path, 'r', encoding='utf-8') as input_file:
            content = input_file.read()

        print("Successfully read the input file.")

        # Step 1: Mask URLs, emails, and usernames to protect them during processing
        masked_content, placeholders = mask_patterns(content)
        print("Masked URLs, emails, and usernames.")

        # Step 2: Replace all newline characters with a space to consolidate the text
        masked_content = masked_content.replace('\n', ' ')
        print("Replaced all newline characters with spaces.")

        # Step 3: Replace multiple spaces with a single space
        masked_content = re.sub(r'\s{2,}', ' ', masked_content)
        print("Replaced multiple spaces with single spaces.")

        # Step 4: Insert two newlines before numbered bullet points (e.g., '1.', '2.', etc.)
        # - (?<!\d): the number is not the tail of a larger number
        # - \b(\d+\.\s): word boundary, digits, a period, and one whitespace character
        # - (?=[A-Z]): the point's text starts with a capital letter
        masked_content = re.sub(r'(?<!\d)\b(\d+\.\s)(?=[A-Z])', r'\n\n\1', masked_content)
        print("Inserted two newlines before numbered bullet points.")

        # Step 5: Remove any space before punctuation marks (periods, commas, semicolons, colons)
        masked_content = re.sub(r'\s+([.,;:])', r'\1', masked_content)
        print("Removed unnecessary spaces before punctuation marks.")

        # Step 6: Ensure a single space after punctuation marks if not followed by
        # whitespace or the end of the text. URLs and emails are masked, so they
        # are not affected. One pass over a character class is equivalent to
        # handling each mark in turn, because every decision depends only on the
        # character that follows the mark.
        # The trailing lookahead skips '.', ',' and ':' that sit between two
        # digits so numbers such as "1,000.50" or "10:30" are not split apart.
        masked_content = re.sub(
            r'([.,;:])(?!\s|$)(?!(?<=\d[.,:])\d)',
            r'\1 ',
            masked_content,
        )
        print("Ensured single space after punctuation marks where necessary.")

        # Step 7: Normalize the spacing after a period that directly follows a
        # masked username. Only horizontal whitespace is matched so that the
        # paragraph breaks inserted in Step 4 are preserved.
        masked_content = re.sub(r'(__USERNAME_\d+)__\.[^\S\n]+', r'\1__. ', masked_content)

        # Step 8: Unmask URLs, emails, and usernames
        final_content = unmask_patterns(masked_content, placeholders)
        print("Unmasked URLs, emails, and usernames.")

        # Step 9: Fix spacing in usernames after unmasking
        # Example: "@narctok_royalty.<tab>Korslund" -> "@narctok_royalty. Korslund".
        # As in Step 7, newlines are excluded so a numbered point that follows a
        # sentence ending in a username keeps its paragraph break.
        final_content = re.sub(r'@([\w\.]+)\.[^\S\n]+', r'@\1. ', final_content)
        print("Fixed spacing in usernames after unmasking.")

        # Step 10: Trim leading and trailing whitespace
        final_content = final_content.strip()

        # Write the modified content to the output file
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(final_content)

        print(f"Document processed successfully and saved to '{output_file_path}'.")

    except Exception as e:
        # Top-level boundary for this script: report the failure and stop.
        print(f"An error occurred: {e}")
        sys.exit(1)

# Source document and destination for the cleaned text; replace both with your own paths.
input_file_path, output_file_path = 'JasonGoodman.txt', 'JasonGoodmanCleaned.txt'

# Run the clean-up pipeline on the configured files.
process_legal_document(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
)


Successfully read the input file.
Masked URLs, emails, and usernames.
Replaced all newline characters with spaces.
Replaced multiple spaces with single spaces.
Inserted two newlines before numbered bullet points.
Removed unnecessary spaces before punctuation marks.
Ensured single space after punctuation marks where necessary.
Unmasked URLs, emails, and usernames.
Fixed spacing in usernames after unmasking.
Document processed successfully and saved to 'JasonGoodmanCleaned.txt'.
