# Data Cleaning Functions

In [5]:
import os
import re

In [6]:
def load_file_as_string(filename):
    """Return the entire contents of a raw Wikipedia text file as one string.

    Args:
        filename: Name of the file, resolved relative to the
            ``raw_data/wikipedia`` directory.

    Returns:
        The file's text, decoded as UTF-8.
    """
    source_path = os.path.join('raw_data', 'wikipedia', filename)
    with open(source_path, encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def save_cleaned_file(text, original_filename):
    """Write cleaned text under ``raw_data/cleaned_data`` and return its path.

    The output directory is created if it does not exist yet. Only the final
    path component of ``original_filename`` is used for the output name, so
    any directory part of the input is dropped.

    Args:
        text: The cleaned text to write (encoded as UTF-8).
        original_filename: Name or path of the source file.

    Returns:
        The path of the file that was written.
    """
    output_dir = os.path.join('raw_data', 'cleaned_data')
    os.makedirs(output_dir, exist_ok=True)

    target_path = os.path.join(output_dir, os.path.basename(original_filename))
    with open(target_path, mode='w', encoding='utf-8') as out:
        out.write(text)
    return target_path

def process_file(filename, processing_function):
    """Load a file, run it through a transform, save the result, and report.

    Args:
        filename: Name of the input file, as accepted by
            ``load_file_as_string``.
        processing_function: Callable that takes the file's text and returns
            the processed text.

    The path of the written file is printed; nothing is returned.
    """
    # Evaluated inside-out: load, then transform, then save.
    cleaned_file_path = save_cleaned_file(
        processing_function(load_file_as_string(filename)),
        filename,
    )
    print(f"Processed file saved to: {cleaned_file_path}")

In [7]:
def transform_1(text):
    """Strip bracketed numeric markers such as ``[12]`` from the text.

    Only brackets that contain one or more digits and nothing else are
    removed; everything else is left as is.
    """
    return re.sub(r'\[\d+\]', '', text)

def transform_2(text):
    """Drop the repeated book header and collapse runs of newlines.

    Every occurrence of 'Albert Camus  THE STRANGER' is removed, then each
    run of consecutive newlines is reduced to a single newline.
    """
    header = 'Albert Camus  THE STRANGER'
    without_header = text.replace(header, '')
    return re.sub(r'\n+', '\n', without_header)

def transform_3(text):
    """Remove '*' and '>' characters and collapse runs of newlines.

    Args:
        text: The text to clean.

    Returns:
        The text with every '*' and '>' deleted and each run of consecutive
        newlines reduced to a single newline.
    """
    # One translate pass deletes both characters instead of chaining
    # a separate .replace() call per character.
    text = text.translate(str.maketrans('', '', '*>'))
    text = re.sub(r'\n+', '\n', text)
    return text

In [8]:
# Files (looked up under raw_data/wikipedia) queued for cleaning.
input_files = ['whypivot.txt']

# Clean each queued file with transform_3, which strips '*' and '>'
# and collapses repeated newlines.
for filename in input_files:
    process_file(filename, transform_3)

Processed file saved to: raw_data\cleaned_data\whypivot.txt
