## **Asterisk sequence remover**

#### **Script to remove asterisk sequences such as `**nee**` from the Copiale manuscript plaintext**

In [None]:
import json
import re

In [None]:
def remove_asterisk_sequences(text):
    """
    Strip every asterisk-delimited span from ``text``.

    A span is one or more asterisks, any run of non-asterisk characters
    (possibly empty, possibly containing newlines), and one or more closing
    asterisks. The delimiters are removed together with the enclosed content,
    so ``*text*``, ``**text**`` and ``***text***`` all disappear entirely.
    An asterisk run with no later asterisk run to pair with is left as is.

    Args:
        text (str): Text to clean.

    Returns:
        str: ``text`` with every matched span deleted. Surrounding whitespace
        is not touched.
    """
    # Opening asterisk run, content without asterisks, closing asterisk run.
    asterisk_span = re.compile(r'\*+[^*]*\*+')
    return asterisk_span.sub('', text)

def process_json_file(input_file, output_file=None):
    """
    Process a JSON file to remove asterisk sequences from plaintext fields.

    The file must contain a JSON object at the top level. For every value of
    that object that is itself an object with a string 'plaintext' field, the
    field is replaced by its cleaned, whitespace-stripped version and a
    before/after report is printed. Entries without a string 'plaintext'
    field are left untouched instead of aborting the whole run.

    Errors are reported on stdout and never raised; on a read or format
    error nothing is written.

    Args:
        input_file (str): Path to input JSON file
        output_file (str): Path to output JSON file (optional). When omitted
            or empty, the input file is overwritten.
    """
    # Read phase: kept in its own try so that a failure while opening the
    # output file can never be reported as a problem with the input file.
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{input_file}'.")
        return
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error: {str(e)}")
        return

    if not isinstance(data, dict):
        print(f"Error: Expected a JSON object at the top level of '{input_file}'.")
        return

    # Process each entry in the JSON
    for key, value in data.items():
        if not isinstance(value, dict):
            continue
        original_text = value.get('plaintext')
        if not isinstance(original_text, str):
            # Missing or non-string (e.g. null) plaintext: nothing to clean.
            continue

        cleaned_text = remove_asterisk_sequences(original_text).strip()
        value['plaintext'] = cleaned_text

        print(f"Entry: {key}")
        print(f"Original: {original_text}")
        print(f"Cleaned:  {cleaned_text}")
        print("-" * 50)

    # Serialize before opening the target so the file (possibly the input
    # itself) is only truncated once the full output text exists.
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    target = output_file or input_file
    try:
        with open(target, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except (OSError, UnicodeError) as e:
        print(f"Error: Could not write '{target}': {str(e)}")
        return

    if output_file:
        print(f"Processed data saved to: {output_file}")
    else:
        print(f"Original file updated: {input_file}")

def process_json_string(json_string):
    """
    Process a JSON string to remove asterisk sequences from plaintext fields.

    The string must encode a JSON object. For every value of that object that
    is itself an object with a string 'plaintext' field, the field is replaced
    by its cleaned, whitespace-stripped version. Entries without a string
    'plaintext' field (missing, null, numeric, ...) are left unchanged.

    Args:
        json_string (str): JSON data as string

    Returns:
        str: Processed JSON string (2-space indent, non-ASCII characters kept
        as is), or None after printing an error when the input is not valid
        JSON or its top-level value is not an object.
    """
    # Only parsing can raise JSONDecodeError, so only parsing is guarded.
    try:
        data = json.loads(json_string)
    except json.JSONDecodeError:
        print("Error: Invalid JSON format.")
        return None

    if not isinstance(data, dict):
        # Lists/scalars have no keyed entries to walk; report instead of
        # failing with an AttributeError on .items().
        print("Error: Expected a JSON object at the top level.")
        return None

    for value in data.values():
        if not isinstance(value, dict):
            continue
        original_text = value.get('plaintext')
        if isinstance(original_text, str):
            value['plaintext'] = remove_asterisk_sequences(original_text).strip()

    return json.dumps(data, indent=2, ensure_ascii=False)

### Main

In [None]:
if __name__ == "__main__":
    # Path of the JSON file to clean; it is overwritten in place because no
    # output path is passed. While this stays empty, running the script
    # does nothing.
    json_file_path = ""

    if json_file_path:
        # Header line followed by a 60-character rule, then the actual run.
        print(f"Processing file: {json_file_path}", "=" * 60, sep="\n")
        process_json_file(json_file_path)