# 01_preprocessing.ipynb

## Objective

Preprocess Cusanus' sermons for topic modeling:

- Load and parse TEI XML files.
- Clean, normalize, and lemmatize text.
- Save cleaned text for further analysis.


In [1]:
import os
from bs4 import BeautifulSoup
import tqdm
from collections import defaultdict
import yaml

In [2]:
# Change the working directory to the project root so that relative paths
# used by later cells (e.g. 'config.yaml') resolve against it.
os.chdir('/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling')
print("当前工作目录为: ", os.getcwd())


当前工作目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [13]:
# Load the project configuration file (resolved relative to the current
# working directory).
CONFIG_PATH = 'config.yaml'

try:
    with open(CONFIG_PATH, 'r', encoding='utf-8') as config_file:
        # safe_load returns None for an empty document; fall back to an empty
        # mapping so the lookup below cannot raise AttributeError.
        config = yaml.safe_load(config_file) or {}
    # A missing or null BASE_DIR entry falls back to the working directory.
    BASE_DIR = config.get('BASE_DIR') or os.getcwd()
    print("配置文件加载成功，项目根目录为: ", BASE_DIR)
except FileNotFoundError:
    print(f"配置文件未找到: {CONFIG_PATH}，使用当前工作目录。")
    BASE_DIR = os.getcwd()

# Derive the data directories from BASE_DIR.
input_dir = os.path.join(BASE_DIR, 'data/raw')
output_dir = os.path.join(BASE_DIR, 'data/processed')

print(f"输入目录: {input_dir}")
print(f"输出目录: {output_dir}")

# List the files in the input directory as a quick sanity check.
print("输入目录中的文件: ", os.listdir(input_dir))

配置文件加载成功，项目根目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling
输入目录: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/raw
输出目录: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed
输入目录中的文件:  ['v170_048.xml', 'v170_060.xml', 'v170_074.xml', 'h180_134.xml', 'v180_135.xml', 'h170_075.xml', 'h170_061.xml', 'h170_049.xml', 'h190_281.xml', 'v160_024_1.xml', 'h190_256.xml', 'h170_101.xml', 'h170_115.xml', 'h190_242.xml', 'v190_243.xml', 'v170_114.xml', 'v170_100.xml', 'v190_257.xml', 'v190_280.xml', 'v190_281.xml', 'v190_256.xml', 'v170_101.xml', 'v170_115.xml', 'v190_242.xml', 'h190_243.xml', 'h170_114.xml', 'h170_100.xml', 'h190_257.xml', 'h190_280.xml', 'h170_048.xml', 'h170_060.xml', 'h170_074.xml', 'v180_134.xml', 'h180_135.xml', 'v170_075.xml', 'v170_061.xml', 'v170_049.xml', 'v170_077.xml', 'v170_063.xml', 'h180_123.xml', 'v170_088.xml', 'h180_137.xml', 'v180_136.xml', 'v180_122.xml', 'h170_089.xml', 'h170_062.xml', 'h170_076.xml', 'h190_282.xm

# 数据预处理部分

In [4]:
# Stopword list loader
def load_stopwords(filepath):
    """Load a stopword list from a plain-text file with one word per line.

    Surrounding whitespace is stripped from every line. Blank lines and
    comment lines (lines whose first non-blank character is ``#``) are
    ignored.

    Args:
        filepath: Path to the stopword file, read as UTF-8.

    Returns:
        A set of stopword strings, or an empty set when the file does not
        exist (a message is printed in that case).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            # Strip first, then test for '#', so indented comments are
            # recognised as comments instead of being kept as stopwords.
            stripped_lines = (line.strip() for line in f)
            stopwords = {
                word for word in stripped_lines
                if word and not word.startswith('#')
            }
    except FileNotFoundError:
        print(f"停用词文件未找到: {filepath}")
        return set()
    print("停用词列表加载成功。")
    return stopwords

In [5]:
# Lemma table loader
def load_lemmas(filepath):
    """Build a form-to-lemma lookup table from an XML lemma file.

    Every ``<lemma name="...">`` element maps its own name to itself, and
    every nested ``<variant name="...">`` maps to the enclosing lemma's name.
    Elements without a ``name`` attribute are skipped.

    Args:
        filepath: Path to the XML lemma file.

    Returns:
        A ``defaultdict(str)`` mapping surface forms to lemma names. It is
        empty when the file does not exist (a message is printed). Because
        it is a defaultdict, indexing a missing key inserts and returns
        ``''``; use ``.get()`` for lookups that must not modify the table.
    """
    lemmas = defaultdict(str)
    try:
        # Binary mode lets the XML parser work from the raw bytes and detect
        # the document's encoding itself, instead of decoding with the
        # platform default first.
        with open(filepath, 'rb') as file:
            soup = BeautifulSoup(file, 'xml')
    except FileNotFoundError:
        print(f"词形文件未找到: {filepath}")
        return lemmas

    for lemma in soup.find_all('lemma'):
        base_form = lemma.get('name')
        if not base_form:
            continue
        lemmas[base_form] = base_form
        for variant in lemma.find_all('variant'):
            variant_form = variant.get('name')
            if variant_form:
                lemmas[variant_form] = base_form
    print("词形表加载成功。")
    return lemmas

In [7]:
# Process every XML file in a directory and save the results
def process_directory(input_dir, output_dir, lemmas, latin_stopwords):
    """Preprocess each ``.xml`` file in *input_dir* into a ``.txt`` file.

    For every file name ending in ``.xml`` the preprocessing function is
    called and its result is written, UTF-8 encoded, to *output_dir* under
    the same base name with a ``.txt`` extension. Files whose output already
    exists are skipped. An error on one file is printed and does not stop
    the remaining files.

    Args:
        input_dir: Directory containing the source XML files.
        output_dir: Directory for the processed text files; created if
            missing.
        lemmas: Lemma lookup table, forwarded to the preprocessing function.
        latin_stopwords: Stopword collection, forwarded to the preprocessing
            function.
    """
    # NOTE(review): preprocess_text_with_pos is not defined in the notebook
    # cells visible here (only preprocess_text(xml_file) is). If it is not
    # defined elsewhere, the NameError is caught below and reported for every
    # file. Confirm which function is intended.
    os.makedirs(output_dir, exist_ok=True)

    xml_files = [f for f in os.listdir(input_dir) if f.endswith('.xml')]

    for xml_file in tqdm.tqdm(xml_files, desc="Processing files"):
        input_path = os.path.join(input_dir, xml_file)
        # splitext swaps only the final extension; str.replace would rewrite
        # every '.xml' occurring anywhere in the file name.
        base_name = os.path.splitext(xml_file)[0]
        output_path = os.path.join(output_dir, base_name + '.txt')

        # Skip files that have already been processed.
        if os.path.exists(output_path):
            print(f"文件已存在，跳过处理: {output_path}")
            continue

        try:
            processed_text = preprocess_text_with_pos(input_path, lemmas, latin_stopwords)

            # Explicit encoding so the output does not depend on the
            # platform's default locale.
            with open(output_path, 'w', encoding='utf-8') as out_file:
                out_file.write(processed_text)

            print(f"成功处理文件: {xml_file}")

        except Exception as e:
            # Best-effort batch run: report the failure and continue.
            print(f"处理文件 {xml_file} 时出错: {e}")

In [8]:
# XML preprocessing function
# Extracts the <w> tokens of an XML file, lemmatizes them (preferring the
# lemma_l attribute when present) and removes stopwords.
def preprocess_text(xml_file):
    """Return the lemmatized, stopword-filtered text of one XML file.

    For each ``<w>`` element the lowercased ``lemma_l`` attribute is used when
    present and non-empty. Otherwise the element's lowercased text is looked
    up in the lemma table and used unchanged when it has no entry. Tokens
    that are empty after stripping whitespace, or that are stopwords, are
    dropped.

    This function reads the notebook-level globals ``lemmas`` and
    ``latin_stopwords``, so both must be defined before it is called.

    Args:
        xml_file: Path to the XML file.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when the file
        does not exist (a message is printed).
    """
    try:
        # Binary mode lets the XML parser work from the raw bytes and detect
        # the document's encoding itself, instead of decoding with the
        # platform default first.
        with open(xml_file, 'rb') as file:
            soup = BeautifulSoup(file, 'xml')
    except FileNotFoundError:
        print(f"XML 文件未找到: {xml_file}")
        return ""

    words = []
    for w in soup.find_all('w'):
        lemma_attr = w.get('lemma_l')
        if lemma_attr:
            # An annotated lemma takes precedence over the lookup table.
            word = lemma_attr.strip().lower()
        else:
            # No annotation: fall back to the lemma table, keeping the raw
            # form when it has no entry.
            raw_word = w.get_text().strip().lower()
            word = lemmas.get(raw_word, raw_word)
        # Drop empty tokens as well as stopwords.
        if word and word not in latin_stopwords:
            words.append(word)
    return ' '.join(words)

In [14]:
# 主程序：加载停用词和词形表，并处理指定目录
# 更改工作目录到项目根目录
os.chdir('/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling')
print("当前工作目录为: ", os.getcwd())
BASE_DIR = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'  # 请设置为实际项目路径
stopwords_path = os.path.join(BASE_DIR, 'data/external/stopwords_latin.txt')
lemmas_path = os.path.join(BASE_DIR, 'data/external/lemma.xml')
input_dir = os.path.join(BASE_DIR, 'data/raw')
output_dir = os.path.join(BASE_DIR, 'data/processed')

当前工作目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [16]:
# Load the stopword set and the lemma table from the paths defined in the
# setup cell; both are later read as globals by preprocess_text.
latin_stopwords = load_stopwords(stopwords_path)
lemmas = load_lemmas(lemmas_path)

停用词文件未找到: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/../data/external/stopwords_latin.txt
词形文件未找到: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/../data/external/lemma.xml


In [None]:
# Run the preprocessing over the raw corpus.
# process_directory takes four required arguments; the lemma table and the
# stopword set must be passed along with the input and output directories.
process_directory(input_dir, output_dir, lemmas, latin_stopwords)