# 01_preprocessing.ipynb

## Objective
Preprocess Cusanus' sermons. The steps are:
- Loading raw XML files.
- Cleaning and extracting relevant text using BeautifulSoup.
- Saving cleaned text for analysis.


In [None]:
import os
from bs4 import BeautifulSoup

In [None]:
# Define input and output directories.
# `__file__` only exists when this code is executed as a script; inside a
# Jupyter kernel the name is undefined and would raise NameError. In that
# case fall back to the current working directory (assumed to be the
# notebook's own folder -- adjust if the kernel is started elsewhere).
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
xml_folder = os.path.join(current_dir, '../data/Sermones_XML')
output_folder = os.path.join(current_dir, '../data/Preprocessed')

# Create output directory if it doesn't exist. `exist_ok=True` makes this a
# no-op when the directory is already present and avoids the gap between a
# separate existence check and the creation call.
os.makedirs(output_folder, exist_ok=True)

In [None]:
# Function to preprocess a single XML file
def preprocess_tei_data(file_path):
    """Return the whitespace-normalised text of an XML file's <body>.

    The file at ``file_path`` is read as UTF-8 and parsed with
    BeautifulSoup's ``lxml-xml`` parser. The text of the first ``body``
    element is extracted and every run of whitespace is collapsed to a
    single space. If no ``body`` element is found, the string
    ``'No Content Found'`` is returned instead.
    """
    # Load the raw markup into memory
    with open(file_path, 'r', encoding='utf-8') as xml_handle:
        raw_markup = xml_handle.read()

    parsed = BeautifulSoup(raw_markup, 'lxml-xml')

    # Pull the text out of the body, or use the placeholder when absent
    body = parsed.find('body')
    if body:
        raw_text = body.get_text()
    else:
        raw_text = 'No Content Found'

    # Splitting on whitespace and re-joining squeezes out extra spaces,
    # tabs and newlines in one pass
    words = raw_text.split()
    return ' '.join(words)

In [None]:
# Process all XML files
for entry in os.listdir(xml_folder):
    # Ignore anything in the input folder that is not an .xml file
    if not entry.endswith('.xml'):
        continue

    source_path = os.path.join(xml_folder, entry)
    sermon = preprocess_tei_data(source_path)

    # Save the processed text to the output folder, named after the
    # source file with its extension replaced by "_processed.txt"
    stem = os.path.splitext(entry)[0]
    target_path = os.path.join(output_folder, f"{stem}_processed.txt")
    with open(target_path, 'w', encoding='utf-8') as sink:
        sink.write(sermon)

print("All files have been successfully preprocessed and saved.")