In [26]:
import json
import re

In [27]:
def add_sentence_spacing(text):
    """Insert a space after `.`, `!` or `?` when an uppercase letter follows directly.

    Uppercase detection relies on ``str.isupper()`` instead of a hand-written
    character class, so capitals such as ``Œ``, ``Æ``, ``Ë``, ``Ï``, ``Ü`` and
    ``Ÿ`` are recognised in addition to A-Z and the common accented capitals.

    Punctuation followed by a lowercase letter, a digit, whitespace or the end
    of the string is left untouched, so decimals such as ``3.14`` survive.

    Note: dotted acronyms are affected as well (``U.S.A`` becomes ``U. S. A``).

    Args:
        text: The string to fix.

    Returns:
        A new string with the missing spaces inserted.
    """
    def _space_if_upper(match):
        # group(1) is the punctuation mark; group(2) is the character captured
        # inside the lookahead (it is inspected but not consumed).
        mark, following = match.group(1), match.group(2)
        return mark + ' ' if following.isupper() else mark

    # `[^\W\d_]` = a word character that is neither a digit nor an underscore.
    return re.sub(r'([.!?])(?=([^\W\d_]))', _space_if_upper, text)

In [28]:
def add_space_after_parenthesis(text):
    """Insert a space after `)` when it is glued to the text that follows.

    No space is added when the `)` is followed by whitespace, a period, a
    comma, a closing bracket (`)`, `]`, `}`), or when it ends the string.
    This avoids output such as ``(a) , b`` or ``((a) )``.

    Characters such as `?`, `!`, `:` and `;` still get a space before them.

    Args:
        text: The string to fix.

    Returns:
        A new string with the missing spaces inserted.
    """
    # A single lookahead is enough: it already requires a following character,
    # so end-of-string never matches.
    return re.sub(r'\)(?=[^\s.,)\]}])', ') ', text)

In [29]:
def strip_numbering(text):
    """Remove section-style numbering from the start of every sentence.

    The text is split into sentences wherever `.`, `!` or `?` is followed by
    whitespace. A leading number such as ``6``, ``6.3``, ``6.3.1`` or ``6.``
    (with surrounding whitespace) is removed from each sentence, and the
    sentences are re-joined with single spaces.

    A trailing dot on the numbering is consumed too, and a fragment that
    consisted only of numbering is dropped. This way ``"6. Introduction"``
    yields ``"Introduction"`` rather than ``". Introduction"``.

    Caveats:
        * Any sentence that starts with a number loses it, even when the
          number is real content (``"3 apples."`` becomes ``"apples."``).
        * Whitespace between sentences, including newlines, is collapsed to
          a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Numbering at the beginning of a sentence, optionally ending with a dot
    number_start_pattern = r'^\s*\d+(?:\.\d+)*\.?\s*'

    # Sentence boundary: end punctuation followed by whitespace
    split_pattern = r'(?<=[.!?])\s+'

    kept = []
    for sentence in re.split(split_pattern, text):
        stripped = re.sub(number_start_pattern, '', sentence)
        # Drop fragments that were nothing but numbering. Fragments that were
        # already empty (e.g. from trailing whitespace) are kept.
        if stripped or not sentence:
            kept.append(stripped)

    return ' '.join(kept)

In [30]:
def match_french_case_to_english(item):
    """Capitalize the first letter of the French text when the English text is capitalized.

    Leading whitespace is ignored when looking at the first character of each
    string. If the English text starts with an uppercase character and the
    French text starts with a lowercase one, only that first French character
    is uppercased; the rest of the string is left unchanged. The French
    string's leading whitespace is kept exactly as it was, so tabs and
    newlines are not converted to spaces.

    Args:
        item: A dict that may contain the string keys ``'en'`` and ``'fr'``.
            It is modified in place.

    Returns:
        The same ``item`` object, for convenience.
    """
    if 'en' not in item or 'fr' not in item:
        return item

    fr_raw = item['fr']
    en = item['en'].lstrip()
    fr = fr_raw.lstrip()

    if en and fr and en[0].isupper() and fr[0].islower():
        # Reuse the original whitespace prefix rather than rebuilding it
        prefix = fr_raw[:len(fr_raw) - len(fr)]
        item['fr'] = prefix + fr[0].upper() + fr[1:]

    return item

In [31]:
# Read the list of translation pairs from disk
with open('output/all_pairs.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Run the same cleaning pipeline over the English and French text of each pair.
# Numbering is stripped before and after the spacing fixes.
for item in data:
    for lang in ('en', 'fr'):
        if lang not in item:
            continue
        cleaned = strip_numbering(item[lang])
        cleaned = add_sentence_spacing(cleaned)
        cleaned = add_space_after_parenthesis(cleaned)
        item[lang] = strip_numbering(cleaned)

    # Updates item['fr'] in place when its capitalization should follow item['en']
    match_french_case_to_english(item)


# Write the cleaned pairs out as human-readable UTF-8 JSON
with open('cleaned_output.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)