# Data Cleaning Functions

In [3]:
import os
import re

In [4]:
def load_file_as_string(filename):
    """Return the entire contents of ``texts/keeps/<filename>`` as one string.

    The path is resolved relative to the current working directory and the
    file is decoded as UTF-8.
    """
    source_path = os.path.join('texts', 'keeps', filename)
    with open(source_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def save_cleaned_file(text, original_filename):
    """Write *text* into the 'cleaned_data' folder and return the path written.

    Only the base name of *original_filename* is kept, so any directory
    components are dropped. The folder is created if it does not exist yet,
    and an existing file of the same name is overwritten. Output is UTF-8.
    """
    output_dir = 'cleaned_data'
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, os.path.basename(original_filename))
    with open(destination, mode='w', encoding='utf-8') as out:
        out.write(text)
    return destination

def process_file(filename, processing_function):
    """Load *filename*, run *processing_function* on its text, and save the result.

    The input is read via ``load_file_as_string`` and the transformed text is
    written via ``save_cleaned_file``; the output path is printed afterwards.
    """
    raw_text = load_file_as_string(filename)
    cleaned_file_path = save_cleaned_file(processing_function(raw_text), filename)
    print(f"Processed file saved to: {cleaned_file_path}")

In [5]:
def transform_1(text):
    """Return *text* with every bracketed number (e.g. ``[12]``) removed.

    Only square brackets enclosing one or more digits are stripped;
    surrounding whitespace and any other bracketed content are left as-is.
    """
    bracketed_number = re.compile(r'\[\d+\]')
    return bracketed_number.sub('', text)

In [6]:
# Run the first cleaning pass on the Emerson essay: process_file reads it
# from texts/keeps/, applies transform_1 (drops '[num]' markers), and writes
# the result under cleaned_data/ with the same file name.
input_filename = 'emerson_selfreliance.txt'
process_file(input_filename, transform_1)

Processed file saved to: cleaned_data\emerson_selfreliance.txt
