In [1]:
import os
from bs4 import BeautifulSoup
import tqdm
import yaml
import pickle

In [2]:
# 更改工作目录到项目根目录
os.chdir('/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling')
print("当前工作目录为: ", os.getcwd())

当前工作目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [3]:
# 加载停用词表
stopwords_path = 'data/external/stopwords_latin.txt'
if not os.path.exists(stopwords_path):
    raise FileNotFoundError(f"停用词文件未找到: {stopwords_path}")

with open(stopwords_path, 'r', encoding='utf-8') as f:
    latin_stopwords = set(line.strip().lower() for line in f if line.strip())

# 加载词形还原字典
lemma_dict_path = 'lemma_dict.pkl'
if not os.path.exists(lemma_dict_path):
    raise FileNotFoundError(f"词形还原字典文件未找到: {lemma_dict_path}")

with open(lemma_dict_path, 'rb') as f:
    lemmas = pickle.load(f)

In [4]:
def process_file(file_path, has_lemma):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'xml')
        words = []
        for w in soup.find_all('w'):
            if has_lemma:
                lemma_attr = w.get('lemma_l')
                if lemma_attr and lemma_attr.isdigit():
                    word = lemmas.get(lemma_attr, w.get_text().lower())
                else:
                    word = w.get_text().lower()
            else:
                raw_word = w.get_text().lower()
                word = lemmas.get(raw_word, raw_word)
            if word not in latin_stopwords:
                words.append(word)
        return ' '.join(words)

In [5]:
def save_processed_text(text, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(text)

In [6]:
input_dir = 'data/raw'
output_dir = 'data/processed'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
  

In [7]:
for file_name in os.listdir(input_dir):
    if file_name.endswith('.xml'):
        input_path = os.path.join(input_dir, file_name)
        if file_name.startswith('h'):
            processed_text = process_file(input_path, has_lemma=True)
        elif file_name.startswith('v'):
            processed_text = process_file(input_path, has_lemma=False)
        else:
            continue
        output_file_name = file_name.replace('.xml', '.txt')
        output_path = os.path.join(output_dir, output_file_name)
        save_processed_text(processed_text, output_path)
        print(f"Processed file saved to: {output_path}")
        

Processed file saved to: data/processed/v170_048.txt
Processed file saved to: data/processed/v170_060.txt
Processed file saved to: data/processed/v170_074.txt
Processed file saved to: data/processed/h180_134.txt
Processed file saved to: data/processed/v180_135.txt
Processed file saved to: data/processed/h170_075.txt
Processed file saved to: data/processed/h170_061.txt
Processed file saved to: data/processed/h170_049.txt
Processed file saved to: data/processed/h190_281.txt
Processed file saved to: data/processed/v160_024_1.txt
Processed file saved to: data/processed/h190_256.txt
Processed file saved to: data/processed/h170_101.txt
Processed file saved to: data/processed/h170_115.txt
Processed file saved to: data/processed/h190_242.txt
Processed file saved to: data/processed/v190_243.txt
Processed file saved to: data/processed/v170_114.txt
Processed file saved to: data/processed/v170_100.txt
Processed file saved to: data/processed/v190_257.txt
Processed file saved to: data/processed/v190