In [27]:
import os
from bs4 import BeautifulSoup
from cltk.lemmatize.lat import LatinBackoffLemmatizer
import logging
import re
from tqdm import tqdm
import requests

In [28]:
# 设置工作目录到项目根目录
project_root = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
os.chdir(project_root)
print("当前工作目录为: ", os.getcwd())

当前工作目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [29]:
# 配置日志记录，将其记录到 notebooks 文件夹中的 data_processing.log
log_file_path = 'log/data_processing.log'
log_dir = os.path.dirname(log_file_path)

# 检查并创建日志文件目录
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# 设置文件处理程序，确保日志写入文件
file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logging.getLogger().addHandler(file_handler)



In [30]:
# 从URL加载停用词列表
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
response = requests.get(stopwords_url)
latin_stopwords = set(response.text.splitlines())  # 将每一行作为一个停用词

In [31]:
# 初始化 CLTK 的词形还原器
lemmatizer = LatinBackoffLemmatizer()

In [32]:

def preprocess_text(words):
    # 过滤停用词
    filtered_words = [word for word in words if word.lower() not in latin_stopwords]
    return filtered_words

In [33]:
def process_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
        words = []
        stopwords_not_removed = []
        words_not_lemmatized = []

        for w in soup.find_all('w'):
            original_word = w.get_text().lower()
            # 排除德语内容：如果检测到德语内容，则跳过处理
            if re.search(r'[äöüß]', original_word):
                continue

            # 使用 CLTK 进行词形还原
            lemma = lemmatizer.lemmatize([original_word])[0][1]
            if not lemma:
                lemma = original_word  # 如果未找到还原形则保留原始单词

            # 检查停用词是否被去除
            if lemma in latin_stopwords:
                stopwords_not_removed.append(lemma)
                continue

            words.append(lemma)

        logging.info(f"原始词汇: {words[:10]}")  # 调试信息

        # 过滤停用词
        filtered_words = preprocess_text(words)

        logging.info(f"过滤后的词汇: {filtered_words[:10]}")  # 调试信息

        # 返回处理结果
        return filtered_words, stopwords_not_removed, words_not_lemmatized

In [34]:
# 处理输入目录中的所有 v 系列文件
input_dir = 'data/raw'
output_dir = 'data/processed_test'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for file_name in tqdm(os.listdir(input_dir), desc="Processing files"):
    if file_name.endswith('.xml') and file_name.startswith('v'):
        input_path = os.path.join(input_dir, file_name)
        
        # 调用处理函数
        processed_words, stopwords_not_removed, words_not_lemmatized = process_file(input_path)

        # 保存处理后的文本
        output_file_name = file_name.replace('.xml', '.txt')
        output_path = os.path.join(output_dir, output_file_name)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(' '.join(processed_words))

        logging.info(f"Processed file: {file_name}")
        if stopwords_not_removed:
            logging.warning(f"File: {file_name} - Stopwords not removed (sample): {list(stopwords_not_removed)[:5]}...")
        if words_not_lemmatized:
            logging.warning(f"File: {file_name} - Words not lemmatized (sample): {list(words_not_lemmatized)[:5]}...")

  soup = BeautifulSoup(file, 'lxml')
2024-11-07 16:51:56,042 - INFO - 原始词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-07 16:51:56,044 - INFO - 过滤后的词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-07 16:51:56,045 - INFO - Processed file: v170_048.xml
2024-11-07 16:51:56,076 - INFO - 原始词汇: ['resuscito', 'novus', 'dies', 'jo¬hannis', 'vis', 'evangelium', 'praesum', 'dies', 'loquor', 'hiis']
2024-11-07 16:51:56,077 - INFO - 过滤后的词汇: ['resuscito', 'novus', 'dies', 'jo¬hannis', 'vis', 'evangelium', 'praesum', 'dies', 'loquor', 'hiis']
2024-11-07 16:51:56,079 - INFO - Processed file: v170_060.xml
Processing files:   0%|          | 2/617 [00:00<00:34, 17.80it/s]2024-11-07 16:51:56,111 - INFO - 原始词汇: ['tertius', 'dies', 'resurgo', 'secundus', 'consuetudo', 'pri¬mam', 'pars', 'doceo', 'resur¬rectio', 'tertius']
2024-11-07 16:51:56,112 - INFO - 过滤后的词汇: ['

In [35]:
# 仅处理1-2个文件以检查POS标注结果
for file_name in tqdm(os.listdir(input_dir), desc="Processing files"):
    if file_name.endswith('.xml') and file_name.startswith('v'):
        input_path = os.path.join(input_dir, file_name)
        
        # 调用处理函数
        processed_words, stopwords_not_removed, words_not_lemmatized = process_file(input_path)

        # 保存处理后的文本
        output_file_name = file_name.replace('.xml', '.txt')
        output_path = os.path.join(output_dir, output_file_name)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(' '.join(processed_words))

        logging.info(f"Processed file: {file_name}")
        if stopwords_not_removed:
            logging.warning(f"File: {file_name} - Stopwords not removed (sample): {list(stopwords_not_removed)[:5]}...")
        if words_not_lemmatized:
            logging.warning(f"File: {file_name} - Words not lemmatized (sample): {list(words_not_lemmatized)[:5]}...")
        
        # 仅处理1-2个文件
        break

  soup = BeautifulSoup(file, 'lxml')
2024-11-07 16:52:19,823 - INFO - 原始词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-07 16:52:19,824 - INFO - 过滤后的词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-07 16:52:19,825 - INFO - Processed file: v170_048.xml
Processing files:   0%|          | 0/617 [00:00<?, ?it/s]


In [36]:
for file_name in tqdm(os.listdir(input_dir), desc="Processing files"):
    if file_name.endswith('.xml') and file_name.startswith('v'):
        input_path = os.path.join(input_dir, file_name)
        
        # 调用处理函数
        processed_words, stopwords_not_removed, words_not_lemmatized = process_file(input_path)

        # 保存处理后的文本
        output_file_name = file_name.replace('.xml', '.txt')
        output_path = os.path.join(output_dir, output_file_name)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(' '.join(processed_words))

        logging.info(f"Processed file: {file_name}")
        if stopwords_not_removed:
            logging.warning(f"File: {file_name} - Stopwords not removed (sample): {list(stopwords_not_removed)[:5]}...")
        if words_not_lemmatized:
            logging.warning(f"File: {file_name} - Words not lemmatized (sample): {list(words_not_lemmatized)[:5]}...")

  soup = BeautifulSoup(file, 'lxml')
2024-11-07 16:52:19,907 - INFO - 原始词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-07 16:52:19,908 - INFO - 过滤后的词汇: ['dies', 'sanctifico', 'octavus', 're¬sponsorium', 'evangelium', 'duo', 'noto', 'primus', 'puer', 'nascor']
2024-11-07 16:52:19,909 - INFO - Processed file: v170_048.xml
2024-11-07 16:52:19,942 - INFO - 原始词汇: ['resuscito', 'novus', 'dies', 'jo¬hannis', 'vis', 'evangelium', 'praesum', 'dies', 'loquor', 'hiis']
2024-11-07 16:52:19,943 - INFO - 过滤后的词汇: ['resuscito', 'novus', 'dies', 'jo¬hannis', 'vis', 'evangelium', 'praesum', 'dies', 'loquor', 'hiis']
2024-11-07 16:52:19,943 - INFO - Processed file: v170_060.xml
Processing files:   0%|          | 2/617 [00:00<00:33, 18.42it/s]2024-11-07 16:52:19,979 - INFO - 原始词汇: ['tertius', 'dies', 'resurgo', 'secundus', 'consuetudo', 'pri¬mam', 'pars', 'doceo', 'resur¬rectio', 'tertius']
2024-11-07 16:52:19,979 - INFO - 过滤后的词汇: ['