In [35]:
import json
import re
import os

In [36]:
def add_sentence_spacing(text):
    """Insert a space between sentence punctuation and a following capital.

    Every ``.``, ``!`` or ``?`` that is immediately followed by an uppercase
    letter (ASCII ``A-Z`` or one of the accented capitals listed in the
    pattern) gets a single space appended. Punctuation followed by anything
    else, including whitespace or the end of the string, is left untouched.

    Note: the rule is purely positional, so dotted abbreviations are split
    as well (``"U.S.A"`` becomes ``"U. S. A"``).
    """
    # The capital letter is only looked at (lookahead), never consumed, so
    # consecutive hits such as "A.B.C" are all handled in a single pass.
    punctuation_before_capital = re.compile(r'([.!?])(?=[A-ZÉÈÊÀÂÎÔÙÛÇ])')
    return punctuation_before_capital.sub(r'\1 ', text)

In [37]:
def add_space_after_parenthesis(text):
    """Insert a space after ``)`` when it is glued to the text that follows.

    A space is added only when the closing parenthesis is directly followed
    by a character that starts new content (a letter, a digit, an opening
    bracket, ...). Nothing is inserted:

    * at the end of the string or before whitespace;
    * before closing punctuation (``. , ; : ! ?``);
    * before another closing bracket or a quote character.

    This keeps ``"(a),"`` and ``"((a))"`` intact instead of producing
    ``"(a) ,"`` or ``"((a) )"``.

    Args:
        text: The string to normalize.

    Returns:
        The string with the missing spaces inserted.
    """
    # The lookahead requires one following character that is not in the
    # exclusion set, so the end-of-string case needs no separate check.
    return re.sub(r'\)(?=[^\s.,;:!?)\]}»"\'’”])', ') ', text)

In [38]:
def strip_numbering(text):
    """
    Drops a leading dotted number (e.g., 8.4.14) from every line of `text`.

    Each line loses its leading whitespace, the number itself and the
    whitespace right after it. Lines that do not start with a number are kept
    unchanged. The result is re-joined with '\\n', so a trailing newline in
    the input is not preserved.
    """
    numbering = re.compile(r'^\s*\d+(?:\.\d+)*\s*')

    result = []
    for raw_line in text.splitlines():
        prefix = numbering.match(raw_line)
        # Keep whatever follows the matched prefix; untouched when no match.
        result.append(raw_line[prefix.end():] if prefix else raw_line)
    return '\n'.join(result)

In [39]:
def add_spaces_on_concat(item):
    """Split words that were concatenated without a space.

    Two rewrites are applied in order:

    1. A space is inserted between a lowercase letter or digit and a
       following uppercase letter (``"helloWorld"`` -> ``"hello World"``).
       Uppercase-to-uppercase boundaries are left alone so acronyms such as
       ``"NASA"`` are not torn apart, and accented lowercase letters count as
       lowercase so French text is handled like English text.
    2. A space is inserted after an apostrophe that sits between two
       ASCII alphanumeric/letter characters.

    Args:
        item: The string to normalize.

    Returns:
        The normalized string.
    """
    # Left side: ASCII lowercase, Latin-1 lowercase (à-ö, ø-ÿ) and digits.
    item = re.sub(r'([a-zà-öø-ÿ0-9])([A-ZÁÀÂÉÈÊÍÎÓÔÙÛÇ])', r'\1 \2', item)
    # NOTE(review): this rule also fires on ordinary contractions and
    # elisions ("don't" -> "don' t", "l'avion" -> "l' avion"). Behavior is
    # kept as it was; confirm that this is really what the data needs.
    item = re.sub(r'([A-Za-z0-9])([\'’])([A-Za-z])', r'\1\2 \3', item)
    return item

In [40]:
def clean_address(item):
    """Ensure a comma that separates two words is followed by a space.

    A space is inserted after a comma when the character before it is a
    letter or digit and the character after it is a letter. Commas followed
    by a digit are left alone so numbers such as ``1,000`` stay intact.

    Letters are matched Unicode-aware, so accented characters
    (``"café,bar"``) are handled, and the neighbours are only inspected,
    not consumed, so every comma in ``"a,b,c"`` is fixed in one pass.

    Args:
        item: The string to normalize.

    Returns:
        The string with the missing spaces inserted.
    """
    # [^\W_]   -> any Unicode letter or digit
    # [^\W\d_] -> any Unicode letter
    return re.sub(r'(?<=[^\W_]),(?=[^\W\d_])', ', ', item)

In [41]:
def standardize_dash(item):
    """Turn a space-padded en dash (' – ') into a space-padded hyphen (' - ').

    En dashes without a space on both sides are left as they are.
    """
    # The pattern is a fixed string, so a literal replacement is sufficient.
    return item.replace(' – ', ' - ')

In [42]:
def match_french_case_to_english(item):
    """Capitalize the French text when the English text starts capitalized.

    If ``item`` has both an ``'en'`` and a ``'fr'`` entry, and the first
    non-whitespace character of the English text is uppercase while the first
    non-whitespace character of the French text is lowercase, that French
    character is uppercased. Everything else in the French text, including
    its original leading whitespace (tabs, newlines, ...), is preserved
    exactly.

    Args:
        item: A mapping that may contain ``'en'`` and ``'fr'`` strings. It is
            modified in place.

    Returns:
        The same ``item`` object, for convenience.
    """
    if 'en' not in item or 'fr' not in item:
        return item

    en = item['en'].lstrip()
    fr_raw = item['fr']
    fr = fr_raw.lstrip()

    if en and fr and en[0].isupper() and fr[0].islower():
        # Re-use the original prefix instead of rebuilding it from spaces,
        # which would turn tabs or newlines into plain spaces.
        leading = fr_raw[:len(fr_raw) - len(fr)]
        # Capitalize only the first character, keep the rest unchanged.
        item['fr'] = leading + fr[0].upper() + fr[1:]

    return item

In [43]:
def remove_phone_numbers(text):
    """Remove 10-digit phone numbers from ``text``.

    Recognized shapes are three digits (optionally wrapped in parentheses),
    three digits and four digits, with at most one whitespace or hyphen
    character between the groups, e.g. ``(613) 946-3061``, ``613-946-3061``
    or ``613 946 3061``.

    The match must not be directly preceded or followed by another digit, so
    longer digit runs (identifiers, reference numbers) are left intact
    instead of losing ten of their digits.

    Args:
        text: The string to clean.

    Returns:
        The text without the matched numbers, stripped of leading and
        trailing whitespace. Whitespace that surrounded a removed number in
        the middle of the text is not collapsed.
    """
    # (?<!\d) / (?!\d) keep the pattern from matching inside a longer number.
    phone_number_pattern = r'(?<!\d)\(?\d{3}\)?[\s\-‐]?\d{3}[\s\-‐]?\d{4}(?!\d)'

    text = re.sub(phone_number_pattern, '', text)

    # Remove leading/trailing spaces (in case the number was at either end).
    return text.strip()

In [44]:
def clean_text(text: str):
    """Run `text` through every cleaning step of this file and return the result.

    The steps are applied one after another, each receiving the output of the
    previous one. Their order matters and is fixed below.
    """
    steps = (
        strip_numbering,
        add_sentence_spacing,
        add_space_after_parenthesis,
        add_spaces_on_concat,
        clean_address,
        standardize_dash,
        remove_phone_numbers,
    )
    for step in steps:
        text = step(text)
    return text

In [45]:
# Read the records to clean from disk.
with open('output/all_pairs.json', 'r', encoding='utf-8') as infile:
    data = json.load(infile)

# Clean both language fields of every record, then align the French
# capitalization with the English one.
# (assumes each record is a dict that may hold 'en' and/or 'fr' strings)
for pair in data:
    for lang in ('en', 'fr'):
        if lang in pair:
            pair[lang] = clean_text(pair[lang])

    # Updates the record in place; the returned object is the record itself.
    match_french_case_to_english(pair)

# # Add files from the static folder
# static_folder = 'output/static'
# if os.path.exists(static_folder) and os.path.isdir(static_folder):
#     for filename in os.listdir(static_folder):
#         if filename.endswith('.json'):
#             with open(os.path.join(static_folder, filename), 'r', encoding='utf-8') as sf:
#                 static_data = json.load(sf)
#                 if isinstance(static_data, list):
#                     data.extend(static_data)
#                 else:
#                     data.append(static_data)
# else:
#     print(f"Warning: The folder '{static_folder}' does not exist. Skipping static folder processing.")


# Write the cleaned records out and report how many there are.
with open('train.json', 'w', encoding='utf-8') as outfile:
    json.dump(data, outfile, ensure_ascii=False, indent=2)

    print(len(data))

8128
