# 01_preprocessing.ipynb

## Objective

Preprocess Cusanus' sermons for topic modeling:

- Load and parse TEI XML files.
- Clean, normalize, and lemmatize text.
- Save cleaned text for further analysis.


In [1]:
!pip install beautifulsoup4



In [2]:
!pip install lxml




In [3]:
from bs4 import BeautifulSoup
import os


In [4]:
def preprocess_text(xml_content):
    """
    Process the XML content and extract the word-level text, ensuring to handle line breaks and special characters.
    """
    # Parse the XML content using BeautifulSoup
    soup = BeautifulSoup(xml_content, 'lxml-xml')
    
    # Extract text within <w> tags, handling <lb> (line breaks) as well
    words = []
    for w in soup.find_all('w'):
        words.append(w.get_text())
    
    # Handle line breaks represented by <lb>
    processed_text = ' '.join(words).replace('\n', ' ').strip()
    
    return processed_text


In [5]:
# Example XML content for testing
test_xml = '''
<TEI.2>
  <text>
    <body>
      <p>
        <w id="C160375504" lemma_l="10242">Verbum</w> 
        <w id="C160375505" lemma_l="1433">caro</w> 
        <w id="C160375506" lemma_l="3761">factum</w> 
        <w id="C160375507" lemma_l="9483">est</w>.
      </p>
    </body>
  </text>
</TEI.2>
'''

# Testing the preprocess_text function
processed_test_text = preprocess_text(test_xml)
print("Processed test text:", processed_test_text)


Processed test text: Verbum caro factum est


In [6]:
# Directory setup for input&output
input_dir = '../data/raw/'
output_dir = '../data/processed/'

os.makedirs(output_dir, exist_ok=True)


In [7]:
print(os.getcwd())


/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/notebooks


In [8]:
# Continue with processing logic
for filename in os.listdir(input_dir):
    if filename.endswith('.xml'):
        input_path = os.path.join(input_dir, filename)
        with open(input_path, 'r', encoding='utf-8') as file:
            xml_content = file.read()

        # Preprocess the content
        processed_text = preprocess_text(xml_content)

        # Save the processed text to the output directory
        output_filename = f"{os.path.splitext(filename)[0]}_processed.txt"
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(processed_text)

        print(f'Preprocessed text saved to: {output_path}')

Preprocessed text saved to: ../data/processed/v170_048_processed.txt
Preprocessed text saved to: ../data/processed/v170_060_processed.txt
Preprocessed text saved to: ../data/processed/v170_074_processed.txt
Preprocessed text saved to: ../data/processed/h180_134_processed.txt
Preprocessed text saved to: ../data/processed/v180_135_processed.txt
Preprocessed text saved to: ../data/processed/h170_075_processed.txt
Preprocessed text saved to: ../data/processed/h170_061_processed.txt
Preprocessed text saved to: ../data/processed/h170_049_processed.txt
Preprocessed text saved to: ../data/processed/h190_281_processed.txt
Preprocessed text saved to: ../data/processed/v160_024_1_processed.txt
Preprocessed text saved to: ../data/processed/h190_256_processed.txt
Preprocessed text saved to: ../data/processed/h170_101_processed.txt
Preprocessed text saved to: ../data/processed/h170_115_processed.txt
Preprocessed text saved to: ../data/processed/h190_242_processed.txt
Preprocessed text saved to: ../d