In [None]:
import os
from bs4 import BeautifulSoup
from cltk.lemmatize.lat import LatinBackoffLemmatizer
import logging
import re
from tqdm import tqdm
import requests

In [24]:
# 设置工作目录到项目根目录
project_root = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
os.chdir(project_root)
print("当前工作目录为: ", os.getcwd())

当前工作目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [25]:
# 配置日志记录，将其记录到 notebooks 文件夹中的 data_processing.log
log_file_path = 'notebooks/data_processing.log'
log_dir = os.path.dirname(log_file_path)

# 检查并创建日志文件目录
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# 设置文件处理程序，确保日志写入文件
file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logging.getLogger().addHandler(file_handler)



In [26]:
# 从 URL 加载停用词列表
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
response = requests.get(stopwords_url)
response.encoding = 'utf-8'
latin_stopwords = set(line.strip() for line in response.text.splitlines() if line.strip())

# 添加额外的停用词
additional_stopwords = {
    'ego', 'mei', 'mihi', 'me', 'tu', 'tui', 'tibi', 'te',
    'nos', 'noster', 'nobis', 'vos', 'vester',
    'sui', 'sibi', 'se',
    'ab', 'ex', 'ad', 'in', 'de', 'per', 'cum', 'sub', 'pro',
    'ante', 'post', 'supra', 'et', 'ac', 'aut', 'nec', 'sed',
    'ut', 'si', 'atque', 'qui', 'quae', 'quod', 'quis', 'quid', 'non', 'ne'
}
latin_stopwords.update(additional_stopwords)

In [27]:
# 初始化 CLTK 的词形还原器
lemmatizer = LatinBackoffLemmatizer()

In [28]:
def preprocess_text(words):
    # 过滤停用词
    filtered_words = [word for word in words if word.lower() not in latin_stopwords]
    return filtered_words


In [29]:
def process_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
        words = []
        stopwords_not_removed = []
        words_not_lemmatized = []

        for w in soup.find_all('w'):
            original_word = w.get_text().lower()
            
            # 排除德语内容
            if re.search(r'[äöüß]', original_word):
                continue

            # 去除带有数字或特殊符号的“cum”变体
            if re.match(r'^cum\W*\d*$', original_word):
                continue

            # 使用 CLTK 进行词形还原
            lemma = lemmatizer.lemmatize([original_word])[0][1]
            if not lemma:
                lemma = original_word
                words_not_lemmatized.append(original_word)

            # 将词形还原后的词汇添加到 `words`
            words.append(lemma)

        logging.info(f"原始词汇: {words[:10]}")  # 调试信息

        # 使用 preprocess_text 过滤停用词
        filtered_words = preprocess_text(words)

        logging.info(f"过滤后的词汇: {filtered_words[:10]}")  # 调试信息
        logging.info(f"未去除的停用词: {stopwords_not_removed[:10]}")  # 未去除的停用词调试信息
        logging.info(f"未还原的词汇: {words_not_lemmatized[:10]}")  # 未还原词汇调试信息

        # 返回处理结果
        return filtered_words, stopwords_not_removed, words_not_lemmatized

In [30]:
# 处理输入目录中的所有 v 系列文件
input_dir = 'data/raw'
output_dir = 'data/processed'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for file_name in tqdm(os.listdir(input_dir), desc="Processing files"):
    if file_name.endswith('.xml') and file_name.startswith('v'):
        input_path = os.path.join(input_dir, file_name)
        
        # 调用处理函数
        processed_words, stopwords_not_removed, words_not_lemmatized = process_file(input_path)

        # 保存处理后的文本
        output_file_name = file_name.replace('.xml', '.txt')
        output_path = os.path.join(output_dir, output_file_name)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(' '.join(processed_words))

        logging.info(f"Processed file: {file_name}")
        if stopwords_not_removed:
            logging.warning(f"File: {file_name} - Stopwords not removed (sample): {list(stopwords_not_removed)[:5]}...")
        if words_not_lemmatized:
            logging.warning(f"File: {file_name} - Words not lemmatized (sample): {list(words_not_lemmatized)[:5]}...")

Processing files:   0%|          | 0/617 [00:00<?, ?it/s]2024-11-10 01:08:36,361 - INFO - 原始词汇: ['dies', 'sanctifico', 'etc', 'sum', 'octavus', 're¬sponsorium', 'ex', 'evangelium', 'duo', 'noto']
2024-11-10 01:08:36,363 - INFO - 过滤后的词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-10 01:08:36,364 - INFO - 未去除的停用词: []
2024-11-10 01:08:36,364 - INFO - 未还原的词汇: []
2024-11-10 01:08:36,365 - INFO - Processed file: v170_048.xml
2024-11-10 01:08:36,395 - INFO - 原始词汇: ['ego', 'resuscito', 'is', 'in', 'novus', 'dies', 'jo¬hannis', 'vis', 'et', 'in']
2024-11-10 01:08:36,396 - INFO - 过滤后的词汇: ['resuscito', 'novus', 'dies', 'jo¬hannis', 'vis', 'evangelium', 'praesum', 'dies', 'loquor', 'hiis']
2024-11-10 01:08:36,396 - INFO - 未去除的停用词: []
2024-11-10 01:08:36,396 - INFO - 未还原的词汇: []
2024-11-10 01:08:36,397 - INFO - Processed file: v170_060.xml
Processing files:   0%|          | 2/617 [00:00<00:30, 19.94it/s]2024-11-10 01:08:36,426 

In [31]:
for file_name in tqdm(os.listdir(input_dir), desc="Processing files"):
    if file_name.endswith('.xml') and file_name.startswith('v'):
        input_path = os.path.join(input_dir, file_name)
        
        # 调用处理函数
        processed_words, stopwords_not_removed, words_not_lemmatized = process_file(input_path)

        # 保存处理后的文本
        output_file_name = file_name.replace('.xml', '.txt')
        output_path = os.path.join(output_dir, output_file_name)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(' '.join(processed_words))

        logging.info(f"Processed file: {file_name}")
        if stopwords_not_removed:
            logging.warning(f"File: {file_name} - Stopwords not removed (sample): {list(stopwords_not_removed)[:5]}...")
        if words_not_lemmatized:
            logging.warning(f"File: {file_name} - Words not lemmatized (sample): {list(words_not_lemmatized)[:5]}...")

Processing files:   0%|          | 0/617 [00:00<?, ?it/s]2024-11-10 01:08:57,844 - INFO - 原始词汇: ['dies', 'sanctifico', 'etc', 'sum', 'octavus', 're¬sponsorium', 'ex', 'evangelium', 'duo', 'noto']
2024-11-10 01:08:57,846 - INFO - 过滤后的词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-10 01:08:57,846 - INFO - 未去除的停用词: []
2024-11-10 01:08:57,847 - INFO - 未还原的词汇: []
2024-11-10 01:08:57,848 - INFO - Processed file: v170_048.xml
Processing files:   0%|          | 1/617 [00:00<02:01,  5.07it/s]2024-11-10 01:08:57,879 - INFO - 原始词汇: ['ego', 'resuscito', 'is', 'in', 'novus', 'dies', 'jo¬hannis', 'vis', 'et', 'in']
2024-11-10 01:08:57,880 - INFO - 过滤后的词汇: ['resuscito', 'novus', 'dies', 'jo¬hannis', 'vis', 'evangelium', 'praesum', 'dies', 'loquor', 'hiis']
2024-11-10 01:08:57,880 - INFO - 未去除的停用词: []
2024-11-10 01:08:57,881 - INFO - 未还原的词汇: []
2024-11-10 01:08:57,882 - INFO - Processed file: v170_060.xml
2024-11-10 01:08:57,912 