# 01_preprocessing.ipynb

## Objective

Preprocess Cusanus' sermons for topic modeling:

- Load and parse TEI XML files.
- Clean, normalize, and lemmatize text.
- Save cleaned text for further analysis.


In [1]:
import os
from bs4 import BeautifulSoup
import tqdm
from collections import defaultdict
import yaml

In [2]:
# 更改工作目录到项目根目录
os.chdir('/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling')
print("当前工作目录为: ", os.getcwd())


当前工作目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [3]:
# 加载配置文件
CONFIG_PATH = 'config.yaml'

try:
    with open(CONFIG_PATH, 'r') as config_file:
        config = yaml.safe_load(config_file)
    BASE_DIR = config.get('BASE_DIR', os.getcwd())
    print("配置文件加载成功，项目根目录为: ", BASE_DIR)
except FileNotFoundError:
    print(f"配置文件未找到: {CONFIG_PATH}，使用当前工作目录。")
    BASE_DIR = os.getcwd()

# 示例使用 BASE_DIR
input_dir = os.path.join(BASE_DIR, 'data/raw')
output_dir = os.path.join(BASE_DIR, 'data/processed')

print(f"输入目录: {input_dir}")
print(f"输出目录: {output_dir}")

# 列出输入目录中的文件
print("输入目录中的文件: ", os.listdir(input_dir))

配置文件加载成功，项目根目录为:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling
输入目录: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/raw
输出目录: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed
输入目录中的文件:  ['v170_048.xml', 'v170_060.xml', 'v170_074.xml', 'h180_134.xml', 'v180_135.xml', 'h170_075.xml', 'h170_061.xml', 'h170_049.xml', 'h190_281.xml', 'v160_024_1.xml', 'h190_256.xml', 'h170_101.xml', 'h170_115.xml', 'h190_242.xml', 'v190_243.xml', 'v170_114.xml', 'v170_100.xml', 'v190_257.xml', 'v190_280.xml', 'v190_281.xml', 'v190_256.xml', 'v170_101.xml', 'v170_115.xml', 'v190_242.xml', 'h190_243.xml', 'h170_114.xml', 'h170_100.xml', 'h190_257.xml', 'h190_280.xml', 'h170_048.xml', 'h170_060.xml', 'h170_074.xml', 'v180_134.xml', 'h180_135.xml', 'v170_075.xml', 'v170_061.xml', 'v170_049.xml', 'v170_077.xml', 'v170_063.xml', 'h180_123.xml', 'v170_088.xml', 'h180_137.xml', 'v180_136.xml', 'v180_122.xml', 'h170_089.xml', 'h170_062.xml', 'h170_076.xml', 'h190_282.xm

# 数据预处理部分

In [4]:
# 停用词加载函数，过滤掉空行和注释行
def load_stopwords(filepath):
    try:
        with open(filepath, 'r') as f:
            stopwords = {line.strip() for line in f if line.strip() and not line.startswith('#')}
        print("停用词列表加载成功。")
        return stopwords
    except FileNotFoundError:
        print(f"停用词文件未找到: {filepath}")
        return set()

# 加载词形表和停用词
stopwords_path = os.path.join(BASE_DIR, config.get('STOPWORDS_PATH', 'data/external/stopwords_latin.txt'))
latin_stopwords = load_stopwords(stopwords_path)

停用词列表加载成功。


In [13]:
# 词形表加载函数
# 从 lemma.xml 文件中加载词形，还原词汇的标准形式
# 解析 lemma.xml 文件并创建 lemma_dict
def load_lemma_dict(lemma_file):
    with open(lemma_file, 'r') as file:
        soup = BeautifulSoup(file, 'xml')
        lemma_dict = {}
        for lemma in soup.find_all('lemma'):
            id_lemma = lemma.get('id_lemma')
            name = lemma.get('name')
            if id_lemma and name:
                lemma_dict[int(id_lemma)] = name
        return lemma_dict


In [14]:
# 加载词典
lemma_dict = load_lemma_dict("data/raw/lemma.xml")

In [15]:
# XML 文件预处理函数
# 处理 XML 文件，检查是否包含 lemma_l 属性，并根据情况进行词形还原和停用词去除
def preprocess_text(xml_file):
    try:
        with open(xml_file, 'r') as file:
            soup = BeautifulSoup(file, 'xml')
            words = []

            for w in soup.find_all('w'):
                lemma_attr = w.get('lemma_l')
                
                # 若 lemma_l 存在且为数字，用 lemma_dict 词典获取还原词，否则用原始文本
                if lemma_attr and lemma_attr.isdigit():
                    word = lemma_dict.get(int(lemma_attr), w.get_text().lower())
                else:
                    raw_word = w.get_text().lower()
                    word = lemmas.get(raw_word, raw_word)

                # 若非停用词则添加到 words 列表
                if word not in latin_stopwords:
                    words.append(word)

            # 返回处理后的文本字符串
            return ' '.join(words)

    # 文件不存在时，输出错误信息
    except FileNotFoundError:
        print(f"XML 文件未找到: {xml_file}")
        return ""


In [16]:
# 处理目录中的文件，保存结果
# 读取原始文件并处理，生成经过词形还原和停用词去除的文本文件
def process_directory(input_dir, output_dir):
    input_dir = os.path.join(BASE_DIR, input_dir)
    output_dir = os.path.join(BASE_DIR, output_dir)

    # 如果输出目录不存在，则创建
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # 获取所有 XML 文件
    xml_files = [f for f in os.listdir(input_dir) if f.endswith('.xml')]

    for xml_file in tqdm.tqdm(xml_files, desc="Processing files"):
        input_path = os.path.join(input_dir, xml_file)
        output_path = os.path.join(output_dir, xml_file.replace('.xml', '.txt'))
        
        # 如果文件已处理则跳过
        if os.path.exists(output_path):
            print(f"文件已存在，跳过处理: {output_path}")
            continue
        
        try:
            # 调用预处理函数处理文本
            processed_text = preprocess_text(input_path)
            
            # 写入处理结果到输出文件
            with open(output_path, 'w') as out_file:
                out_file.write(processed_text)
            
            print(f"成功处理文件: {xml_file}")
        
        except Exception as e:
            print(f"处理文件 {xml_file} 时出错: {e}")


In [17]:
# 运行数据预处理
process_directory('data/raw', 'data/processed')

Processing files:   1%|          | 7/615 [00:00<00:14, 43.23it/s]

文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_048.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_060.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_074.txt
成功处理文件: h180_134.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_135.txt
成功处理文件: h170_075.xml
成功处理文件: h170_061.xml
成功处理文件: h170_049.xml


Processing files:   5%|▍         | 28/615 [00:00<00:07, 77.44it/s]

成功处理文件: h190_281.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_024_1.txt
成功处理文件: h190_256.xml
成功处理文件: h170_101.xml
成功处理文件: h170_115.xml
成功处理文件: h190_242.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_243.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_114.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_100.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_257.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_280.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_281.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_256.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_101.txt
文件已存在，跳过处理: /Users/jessie/Doc

Processing files:   6%|▌         | 37/615 [00:00<00:10, 57.65it/s]

成功处理文件: h190_280.xml
成功处理文件: h170_048.xml
成功处理文件: h170_060.xml
成功处理文件: h170_074.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_134.txt
成功处理文件: h180_135.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_075.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_061.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_049.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_077.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_063.txt
成功处理文件: h180_123.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_088.txt
成功处理文件: h180_137.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_136.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v1

Processing files:   9%|▉         | 55/615 [00:00<00:09, 62.05it/s]

成功处理文件: h170_076.xml
成功处理文件: h190_282.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_024_2.txt
成功处理文件: h170_116.xml
成功处理文件: h190_241.xml
成功处理文件: h190_255.xml
成功处理文件: h170_102.xml
成功处理文件: h190_269.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_268.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_103.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_254.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_240.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_117.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_283.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_282.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/

Processing files:  11%|█         | 68/615 [00:01<00:06, 78.66it/s]

成功处理文件: h190_268.xml
成功处理文件: h170_103.xml
成功处理文件: h190_254.xml
成功处理文件: h190_240.xml
成功处理文件: h170_117.xml
成功处理文件: h510_epalb.xml


Processing files:  14%|█▍        | 87/615 [00:01<00:07, 69.22it/s]

成功处理文件: h190_283.xml
成功处理文件: h170_077.xml
成功处理文件: h170_063.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_123.txt
成功处理文件: h170_088.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_137.txt
成功处理文件: h180_136.xml
成功处理文件: h180_122.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_089.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_062.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_076.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_072.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_066.txt
成功处理文件: h180_126.xml
成功处理文件: h180_132.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_099.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Model

Processing files:  15%|█▌        | 95/615 [00:01<00:08, 60.07it/s]

成功处理文件: h170_067.xml
成功处理文件: h170_073.xml
成功处理文件: h190_287.xml
成功处理文件: h190_293.xml
成功处理文件: h190_278.xml
成功处理文件: h170_113.xml
成功处理文件: h190_244.xml
成功处理文件: h190_250.xml


Processing files:  19%|█▉        | 116/615 [00:01<00:06, 75.41it/s]

成功处理文件: h170_107.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_106.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_251.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_245.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_112.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_279.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_292.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_286.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_287.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_293.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_278.txt
成功处理文件: h160_024_2.

Processing files:  23%|██▎       | 140/615 [00:02<00:05, 81.84it/s]

成功处理文件: h190_292.xml
成功处理文件: h190_286.xml
成功处理文件: h170_072.xml
成功处理文件: h170_066.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_126.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_132.txt
成功处理文件: h170_099.xml
成功处理文件: h180_133.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_098.txt
成功处理文件: h180_127.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_067.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_073.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_065.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_071.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_059.txt
成功处理文件: h180_131.xml
成功处理文件: h180_125.xml
文件已存在，跳过处理: /Users/jessie/Documents/Project

Processing files:  24%|██▍       | 149/615 [00:02<00:06, 69.10it/s]

成功处理文件: h190_290.xml
成功处理文件: h190_284.xml
成功处理文件: h190_253.xml
成功处理文件: h170_104.xml
成功处理文件: h170_110.xml
成功处理文件: h190_247.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_246.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_111.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_105.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_252.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_285.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_291.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_290.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_284.txt


Processing files:  27%|██▋       | 165/615 [00:02<00:06, 70.85it/s]

成功处理文件: h160_024_1.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_253.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_104.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_110.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_247.txt
成功处理文件: h190_246.xml
成功处理文件: h170_111.xml
成功处理文件: h170_105.xml
成功处理文件: h190_252.xml
成功处理文件: h190_285.xml
成功处理文件: h190_291.xml
成功处理文件: h170_065.xml


Processing files:  28%|██▊       | 173/615 [00:02<00:07, 58.15it/s]

成功处理文件: h170_071.xml
成功处理文件: h170_059.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_131.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_125.txt
成功处理文件: h180_124.xml
成功处理文件: h180_130.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_058.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_070.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_064.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_037_b.txt
成功处理文件: h180_194.xml
成功处理文件: h180_180.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_017.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_003.txt
成功处理文件: h180_157.xml
成功处理文件: h180_195_a.xml
成功处理文件: h180_143.xml
文件已存在，跳过处理: /Users/jessie/Documents/Pro

Processing files:  32%|███▏      | 196/615 [00:02<00:05, 69.89it/s]

成功处理文件: h160_002.xml
成功处理文件: h160_016.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_181.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_195.txt
成功处理文件: h190_235.xml
成功处理文件: h190_221.xml
成功处理文件: h190_209.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_208.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_220.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_234.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_235.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_221.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_209.txt
成功处理文件: h190_208.xml
成功处理文件: h190_220.xml
成功处理文件: h190_234.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Model

Processing files:  35%|███▍      | 214/615 [00:03<00:05, 74.65it/s]

成功处理文件: h160_017.xml
成功处理文件: h180_197_b.xml
成功处理文件: h160_003.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_157.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_143.txt
成功处理文件: h180_142.xml
成功处理文件: h180_156.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_002.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_016.txt
成功处理文件: h180_181.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_037_c.txt
成功处理文件: h180_195.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_028.txt
成功处理文件: h180_183.xml
成功处理文件: h180_197.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_037_a.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_014.txt
成功处理文件: h180_195_b.xml
成功处理文件: h180_1

Processing files:  38%|███▊      | 233/615 [00:03<00:05, 72.62it/s]

成功处理文件: h180_154.xml
成功处理文件: h180_168.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_169.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_155.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_141.txt
成功处理文件: h160_015.xml
成功处理文件: h160_001.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_196.txt
成功处理文件: h170_029.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_182.txt
成功处理文件: h190_222.xml
成功处理文件: h190_236.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_237.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_223.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_222.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v1

Processing files:  40%|████      | 249/615 [00:03<00:06, 60.75it/s]

成功处理文件: h190_223.xml
成功处理文件: h170_028.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_183.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_197.txt
成功处理文件: h160_014.xml
成功处理文件: h180_197_a.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_140.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_154.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_168.txt
成功处理文件: h180_169.xml


Processing files:  42%|████▏     | 256/615 [00:03<00:07, 50.76it/s]

成功处理文件: h180_155.xml
成功处理文件: h180_141.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_015.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_001.txt
成功处理文件: h180_196.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_029.txt
成功处理文件: h180_182.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_005.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_011.txt
成功处理文件: h180_186.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_039.txt


Processing files:  44%|████▍     | 271/615 [00:04<00:06, 54.13it/s]

成功处理文件: h180_192.xml
成功处理文件: h180_179.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_197_a.txt
成功处理文件: h180_145.xml
成功处理文件: h180_151.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_150.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_144.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_178.txt
成功处理文件: h170_038.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_193.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_187.txt


Processing files:  45%|████▌     | 277/615 [00:04<00:07, 45.84it/s]

成功处理文件: h160_010.xml
成功处理文件: h160_004.xml
成功处理文件: h190_227.xml
成功处理文件: h190_233.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_232.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_226.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_227.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_233.txt
成功处理文件: h190_232.xml
成功处理文件: h190_226.xml


Processing files:  48%|████▊     | 298/615 [00:04<00:05, 55.58it/s]

成功处理文件: h160_005.xml
成功处理文件: h160_011.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_186.txt
成功处理文件: h170_039.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_195_b.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_192.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_179.txt
成功处理文件: h170_037_a.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_145.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_151.txt
成功处理文件: h180_150.xml
成功处理文件: h180_144.xml
成功处理文件: h180_178.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_038.txt
成功处理文件: h180_193.xml


Processing files:  50%|████▉     | 305/615 [00:04<00:05, 55.49it/s]

成功处理文件: h180_187.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_010.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_004.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_012.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_006.txt
成功处理文件: h180_191.xml
成功处理文件: h180_185.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_197_b.txt
成功处理文件: h180_152.xml
成功处理文件: h180_146.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_147.txt
成功处理文件: h170_037_c.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_153.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_184.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_190.txt


Processing files:  53%|█████▎    | 328/615 [00:05<00:04, 57.50it/s]

成功处理文件: h160_007.xml
成功处理文件: h160_013.xml
成功处理文件: h190_218.xml
成功处理文件: h190_230.xml
成功处理文件: h190_224.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_225.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_231.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_219.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_218.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_230.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_224.txt
成功处理文件: h190_225.xml
成功处理文件: h190_231.xml
成功处理文件: h190_219.xml
成功处理文件: h160_012.xml


Processing files:  56%|█████▌    | 342/615 [00:05<00:04, 55.72it/s]

成功处理文件: h160_006.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_191.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_195_a.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_185.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_152.txt
成功处理文件: h170_037_b.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_146.txt
成功处理文件: h180_147.xml
成功处理文件: h180_153.xml
成功处理文件: h180_184.xml
成功处理文件: h180_190.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_007.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_013.txt
成功处理文件: h180_189.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_022.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processe

Processing files:  59%|█████▉    | 362/615 [00:05<00:03, 70.51it/s]

成功处理文件: h180_176.xml
成功处理文件: h180_162.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_163.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_177.txt
成功处理文件: h160_023.xml
成功处理文件: h170_037.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_188.txt
成功处理文件: h180_200.xml
成功处理文件: h190_214.xml
成功处理文件: h190_228.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_229.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_201.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_215.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_182_b.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_200.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/

Processing files:  62%|██████▏   | 380/615 [00:06<00:03, 78.07it/s]

成功处理文件: h190_215.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_189.txt
成功处理文件: h160_022.xml
成功处理文件: h170_036.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_176.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_162.txt
成功处理文件: h180_163.xml
成功处理文件: h180_177.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_023.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_037.txt
成功处理文件: h180_188.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_009.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_035.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_021.txt
成功处理文件: h180_161.xml
成功处理文件: h180_175.xml
成功处理文件: h180_149.xml
文件已存在，跳过处理: /Users/jessie/Documents/Project

Processing files:  63%|██████▎   | 389/615 [00:06<00:05, 41.45it/s]

成功处理文件: h190_217.xml
成功处理文件: h180_203.xml
成功处理文件: h500_ck.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_216.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_202.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_182_a.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_217.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_203.txt


Processing files:  64%|██████▍   | 396/615 [00:06<00:05, 41.40it/s]

成功处理文件: h190_216.xml
成功处理文件: h180_202.xml


Processing files:  66%|██████▌   | 407/615 [00:07<00:06, 30.37it/s]

成功处理文件: h160_009.xml
成功处理文件: h170_035.xml
成功处理文件: h160_021.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_161.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_175.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_149.txt
成功处理文件: h180_148.xml
成功处理文件: h180_174.xml
成功处理文件: h180_160.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_034.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_020.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_008.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_024.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_030.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_018.txt
成功处理文件: h1

Processing files:  69%|██████▉   | 425/615 [00:07<00:04, 45.57it/s]

成功处理文件: h180_164.xml
成功处理文件: h180_170.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_171.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_165.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_159.txt
成功处理文件: h160_019.xml
成功处理文件: h160_025.xml
成功处理文件: h170_031.xml
成功处理文件: h190_206.xml
成功处理文件: h190_212.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_213.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_207.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_206.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_212.txt
成功处理文件: h190_213.xml


Processing files:  70%|███████   | 431/615 [00:07<00:04, 38.46it/s]

成功处理文件: h190_207.xml
成功处理文件: h180_182_a.xml
成功处理文件: h160_024.xml
成功处理文件: h170_030.xml


Processing files:  72%|███████▏  | 445/615 [00:07<00:03, 47.58it/s]

成功处理文件: h160_018.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_158.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_164.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_170.txt
成功处理文件: h180_171.xml
成功处理文件: h180_165.xml
成功处理文件: h180_159.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_019.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_025.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_031.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_027.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_033.txt
成功处理文件: h180_198.xml
成功处理文件: h180_173.xml
成功处理文件: h180_167.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v1

Processing files:  75%|███████▌  | 462/615 [00:08<00:02, 60.79it/s]

成功处理文件: h190_239.xml
成功处理文件: h190_211.xml
成功处理文件: h190_205.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_204.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_210.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_238.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_239.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_211.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_205.txt
成功处理文件: h190_204.xml
成功处理文件: h190_210.xml
成功处理文件: h190_238.xml


Processing files:  77%|███████▋  | 471/615 [00:08<00:02, 59.20it/s]

成功处理文件: h180_182_b.xml
成功处理文件: h170_027.xml
成功处理文件: h170_033.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_198.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_173.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_167.txt
成功处理文件: h180_166.xml
成功处理文件: h180_172.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_032.txt
成功处理文件: h180_199.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v160_026.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_069.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_041.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_055.txt
成功处理文件: h180_129.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/

Processing files:  79%|███████▊  | 484/615 [00:08<00:01, 70.25it/s]

成功处理文件: h170_054.xml
成功处理文件: h170_040.xml
成功处理文件: h170_068.xml
成功处理文件: h190_288.xml
成功处理文件: h170_120.xml


Processing files:  80%|████████  | 492/615 [00:08<00:02, 48.31it/s]

成功处理文件: h190_277.xml
成功处理文件: h190_263.xml
成功处理文件: h170_108.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_109.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_262.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_276.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_121.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_289.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_288.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_120.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_277.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_263.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/

Processing files:  83%|████████▎ | 511/615 [00:09<00:02, 49.29it/s]

成功处理文件: h190_276.xml
成功处理文件: h170_121.xml
成功处理文件: h190_289.xml
成功处理文件: h170_069.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_079_b.txt
成功处理文件: h170_041.xml
成功处理文件: h170_055.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_129.txt


Processing files:  86%|████████▌ | 526/615 [00:09<00:01, 65.94it/s]

成功处理文件: h170_082.xml
成功处理文件: h170_096.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_097.txt
成功处理文件: h180_128.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_083.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_054.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_040.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_068.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_056.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_042.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_095.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_081.txt
成功处理文件: h170_080.xml
成功处理文件: h170_094.xml
成功处理文件: h170_043.xml
成功处理文件: h1

Processing files:  87%|████████▋ | 534/615 [00:09<00:01, 53.97it/s]

成功处理文件: h190_260.xml
成功处理文件: h190_274.xml
成功处理文件: h190_248.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_249.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_275.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_261.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_260.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_274.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_248.txt
成功处理文件: h190_249.xml


Processing files:  90%|█████████ | 556/615 [00:09<00:00, 71.12it/s]

成功处理文件: h190_275.xml
成功处理文件: h190_261.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_079_a.txt
成功处理文件: h170_056.xml
成功处理文件: h170_042.xml
成功处理文件: h170_095.xml
成功处理文件: h170_081.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_080.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_094.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_043.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_057.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_053.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_047.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_090.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_084.txt
成功处理文件: 

Processing files:  92%|█████████▏| 565/615 [00:10<00:01, 39.51it/s]

成功处理文件: h190_265.xml
成功处理文件: h190_271.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_270.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_264.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_258.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_259.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_265.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v190_271.txt
成功处理文件: h190_270.xml
成功处理文件: h190_264.xml
成功处理文件: h190_258.xml
成功处理文件: h170_053.xml


Processing files:  96%|█████████▋| 593/615 [00:10<00:00, 60.90it/s]

成功处理文件: h170_047.xml
成功处理文件: h170_090.xml
成功处理文件: h170_084.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_085.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_091.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_046.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_052.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_044.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_050.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_078.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_087.txt
成功处理文件: h180_138.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_093.txt
成功处理文件: h170_079_b.xml
文件已存在，跳过处理: /Users/jessie/Doc

Processing files: 100%|██████████| 615/615 [00:10<00:00, 57.14it/s]

成功处理文件: h190_267.xml
成功处理文件: h190_273.xml
成功处理文件: h170_118.xml
成功处理文件: h170_044.xml
成功处理文件: h170_050.xml
成功处理文件: h170_078.xml
成功处理文件: h170_087.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v180_138.txt
成功处理文件: h170_093.xml
成功处理文件: h180_139.xml
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_092.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_086.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_079.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_051.txt
文件已存在，跳过处理: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling/data/processed/v170_045.txt



