In [1]:
import os
from bs4 import BeautifulSoup
from tqdm import tqdm
import csv
import json
import stanza
import re
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set working directory to project root
project_root = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
os.chdir(project_root)
print("Current working directory: ", os.getcwd())

Current working directory:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [3]:
input_dir = 'data/h_variant'
preprocessed_dir = 'data/preprocessed/cusanus_lemma'
result_dir = 'results/preprocessing_result'
os.makedirs(preprocessed_dir, exist_ok=True)
os.makedirs(result_dir, exist_ok=True)

In [4]:
lemmatized_output_path = os.path.join(result_dir, "cusanus_lemmatized.json")
pos_statistics_path = os.path.join(result_dir, "cusanus_statistics.csv")
word_freq_statistics_path = os.path.join(result_dir, "cusanus_word_frequency.csv")


In [5]:
# Load Latin stopwords from URL
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
response = requests.get(stopwords_url)
response.encoding = 'utf-8'
latin_stopwords = set(line.strip() for line in response.text.splitlines() if line.strip())

# Add additional stopwords
additional_stopwords = {
    'ego', 'mei', 'mihi', 'me', 'tu', 'tui', 'tibi', 'te',
    'nos', 'noster', 'nobis', 'vos', 'vester',
    'sui', 'sibi', 'se',
    'ab', 'ex', 'ad', 'in', 'de', 'per', 'cum', 'sub', 'pro',
    'ante', 'post', 'supra', 'et', 'ac', 'aut', 'nec', 'sed',
    'ut', 'si', 'atque', 'qui', 'quae', 'quod', 'quis', 'quid', 'non', 'ne'
}
latin_stopwords.update(additional_stopwords)

In [6]:
# Initialize Stanza NLP model for Latin
stanza.download('la')  # Download Latin model
nlp = stanza.Pipeline('la')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 142MB/s]                     
2024-11-22 23:02:07 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-22 23:02:07 INFO: Downloading default packages for language: la (Latin) ...
2024-11-22 23:02:07 INFO: File exists: /Users/jessie/stanza_resources/la/default.zip
2024-11-22 23:02:09 INFO: Finished downloading models and saved to /Users/jessie/stanza_resources
2024-11-22 23:02:09 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 27.8MB/s]                    
2024-11-22 23:02:09 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-22 23:02:10 INFO: Loading these model

In [7]:
# Load lemma mapping from external file
lemma_mapping_path = os.path.join(project_root, 'data/external/lemma.xml')
lemma_mapping = {}

with open(lemma_mapping_path, 'r', encoding='utf-8') as lemma_file:
    lemma_soup = BeautifulSoup(lemma_file, 'lxml')
    for lemma_entry in lemma_soup.find_all('lemma'):
        lemma_id = lemma_entry.get('id_lemma')
        lemma_name = lemma_entry.get('name')

        # Improved extraction logic for lemma name
        if lemma_name:
            # Step 1: Remove any leading characters like "*" or other notes
            lemma_name = re.sub(r'^\*.*?\s', '', lemma_name).strip()

            # Step 2: If there are parentheses, prioritize the content before them
            if '(' in lemma_name:
                lemma_value = lemma_name.split('(')[0].strip().lower()
            else:
                lemma_value = lemma_name.strip().lower()

            # Step 3: Handle multi-word lemmas by selecting the first valid word
            if lemma_value:
                lemma_value_parts = lemma_value.split()
                if lemma_value_parts:
                    lemma_value = lemma_value_parts[0]

            # Step 4: Remove grammatical category suffixes (e.g., "cj.", "adv.")
            lemma_value = re.split(r'\b(?:cj\.|adv\.|praep\.|f\.|m\.|n\.|pl\.|sg\.|dat\.|acc\.|nom\.|gen\.|abl\.)\b', lemma_value)[0].strip()

            # Step 5: Remove unnecessary descriptive content like place names
            lemma_value = re.sub(r'\b(?:provincia|region|place|saec\.|asia minor|africa|italia|hispania)\b.*', '', lemma_value).strip()

            # Step 6: Ensure the final lemma value is valid and has an ID
            if lemma_id and lemma_value:
                lemma_mapping[lemma_id] = lemma_value

print(f"Lemma mapping completed. Number of entries: {len(lemma_mapping)}")



Lemma mapping completed. Number of entries: 11784


In [8]:
# Preprocess text function
def preprocess_text(words):
    return [word for word in words if word.lower() not in latin_stopwords]


In [9]:
def pos_tag_text(text):
    doc = nlp(text)
    return [(word.text, word.upos) for sentence in doc.sentences for word in sentence.words]


In [10]:
def lemmatize_with_cusanus(words):
    lemmatized_words = []
    for word in words:
        lemma_id = word.get('lemma_l', '').lower()  # 从 lemma_l 获取 lemma id
        if lemma_id in lemma_mapping:
            lemmatized_words.append(lemma_mapping[lemma_id])
        else:
            lemmatized_words.append(word.get('rend', word.get_text()).lower())  # 默认使用原始词
    return lemmatized_words

In [11]:
def process_file_with_cusanus(file_path, preprocessed_path):
    """
    使用 Cusanus lemma 处理单个文件，提取段落、清理内容、去停用词和词形还原，返回段落列表。
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
        paragraphs = []

        for fw_tag in soup.find_all('fw', {'type': 'n'}):
            section_content = []
            next_sibling = fw_tag.find_next_sibling()

            while next_sibling and next_sibling.name != 'fw':
                if next_sibling.name == 'p':
                    words = []
                    for w in next_sibling.find_all('w'):
                        # 跳过包含德文字母或特殊符号的单词
                        if re.search(r'[äöüß]', w.get_text()) or re.match(r'^cum\W*\d*$', w.get_text()):
                            continue

                        words.append(w)

                    # 使用 Cusanus lemma 进行词形还原
                    lemmatized_words = lemmatize_with_cusanus(words)

                    # 去除停用词
                    filtered_words = preprocess_text(lemmatized_words)
                    section_content.append(' '.join(filtered_words))

                next_sibling = next_sibling.find_next_sibling()

            paragraphs.append({'content': ' '.join(section_content)})

        # 保存到 lemmatized.txt
        with open(preprocessed_path, 'w', encoding='utf-8') as preprocessed_file:
            for idx, paragraph in enumerate(paragraphs, start=1):
                preprocessed_file.write(f"Paragraph {idx}:\n")
                preprocessed_file.write(f"{paragraph['content']}\n\n")

        return paragraphs

In [12]:
# 统计信息计算
def calculate_statistics_with_documents(paragraphs, document_id, overall_stats):
    doc_stats = {
        "document_id": document_id,
        "total_paragraphs": len(paragraphs),
        "total_words": 0,
        "total_types": 0,
        "pos_distribution": {},
        "lemmatized_content": []
    }
    unique_words = set()

    for paragraph in paragraphs:
        words = paragraph['content'].split()
        doc_stats["total_words"] += len(words)
        unique_words.update(words)

        # POS 标注
        pos_tags = pos_tag_text(paragraph['content'])
        for _, pos in pos_tags:
            doc_stats["pos_distribution"][pos] = doc_stats["pos_distribution"].get(pos, 0) + 1
            overall_stats["pos_distribution"][pos] = overall_stats["pos_distribution"].get(pos, 0) + 1

        # 保存段落内容
        doc_stats["lemmatized_content"].append(paragraph['content'])

    # 更新文档级统计
    doc_stats["total_types"] = len(unique_words)
    doc_stats["unique_words"] = unique_words

    # 更新总体统计
    overall_stats["total_words"] += doc_stats["total_words"]
    overall_stats["unique_words"].update(unique_words)

    # 更新词频
    for word in unique_words:
        overall_stats["word_frequencies"][word] = overall_stats["word_frequencies"].get(word, 0) + 1

    return doc_stats

In [13]:
# 保存统计信息
def save_overall_statistics(overall_stats, document_stats, result_dir):
    statistics_json_path = os.path.join(result_dir, "cusanus_statistics_summary.json")
    overall_stats["total_types"] = len(overall_stats["unique_words"])
    overall_stats.pop("unique_words", None)
    data_to_save = {
        "overall_statistics": overall_stats,
        "document_statistics": document_stats
    }
    with open(statistics_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(data_to_save, json_file, ensure_ascii=False, indent=4)
    print(f"统计信息已保存到 {statistics_json_path}")

In [14]:
# 主流程
def main_cusanus_pipeline(input_dir, preprocessed_dir, result_dir):
    overall_stats = {
        "total_words": 0,
        "unique_words": set(),
        "pos_distribution": {},
        "word_frequencies": {}
    }
    document_stats = []

    for file_name in tqdm(os.listdir(input_dir), desc="文件处理进度"):
        if file_name.endswith('.xml'):
            input_path = os.path.join(input_dir, file_name)
            preprocessed_path = os.path.join(preprocessed_dir, file_name.replace('.xml', '_lemmatized.txt'))
            try:
                paragraphs = process_file_with_cusanus(input_path, preprocessed_path)
                doc_stat = calculate_statistics_with_documents(paragraphs, file_name, overall_stats)
                document_stats.append(doc_stat)
            except Exception as e:
                print(f"处理文件 {file_name} 时出错: {e}")

    save_overall_statistics(overall_stats, document_stats, result_dir)
    print("所有文件处理完成并生成统计信息！")

In [15]:
# 执行主流程
if __name__ == "__main__":
    main_cusanus_pipeline(input_dir, preprocessed_dir, result_dir)

文件处理进度: 100%|██████████| 308/308 [22:03<00:00,  4.30s/it]


TypeError: Object of type set is not JSON serializable

In [4]:
import os
import json
import csv
import re
from tqdm import tqdm
import stanza

# 设置路径
lemmatized_dir = 'data/preprocessed/cusanus_lemma'  # 已存在的 lemmatized 文件目录
result_dir = 'results/preprocessing_result'  # 输出结果目录
json_path = os.path.join(result_dir, 'cusanus_lemma_statistics_summary.json')  # JSON 输出路径
csv_word_freq_path = os.path.join(result_dir, 'cusanus_lemma_word_frequencies.csv')  # 词频 CSV 输出路径
csv_pos_stats_path = os.path.join(result_dir, 'cusanus_lemma_pos_statistics.csv')  # POS 统计 CSV 输出路径

os.makedirs(result_dir, exist_ok=True)

# 初始化 Stanza 模型
stanza.download('la')
nlp = stanza.Pipeline('la', processors='tokenize,pos')

# 步骤 1: 加载所有 lemmatized.txt 文件并去除标点符号
def load_and_clean_lemmatized_files(directory):
    """
    加载目录中的所有 lemmatized.txt 文件并清理标点符号
    """
    all_paragraphs = {}
    punctuation_pattern = re.compile(r'[^\w\s]', re.UNICODE)  # 匹配所有非单词字符和非空白字符
    for file_name in os.listdir(directory):
        if file_name.endswith('_lemmatized.txt'):
            file_path = os.path.join(directory, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                paragraphs = []
                for line in file:
                    if line.startswith("Paragraph"):
                        continue  # 跳过段落标题
                    clean_line = punctuation_pattern.sub('', line.strip())  # 去除标点符号
                    if clean_line:  # 只保留非空行
                        paragraphs.append(clean_line)
                all_paragraphs[file_name] = paragraphs
    return all_paragraphs

# 步骤 2: 统计所有文件的词频和 POS 分布
def calculate_statistics_for_all(files_data):
    """
    统计所有文件的词频和 POS 分布
    """
    overall_statistics = {
        "total_words": 0,
        "unique_words": set(),
        "word_frequencies": {},
        "pos_distribution": {},
        "document_statistics": []
    }

    for file_name, paragraphs in tqdm(files_data.items(), desc="统计文件数据"):
        file_stats = {
            "document_id": file_name,
            "total_words": 0,
            "total_types": 0,
            "pos_distribution": {}
        }
        unique_words = set()

        for paragraph in paragraphs:
            words = paragraph.split()
            file_stats["total_words"] += len(words)
            overall_statistics["total_words"] += len(words)
            unique_words.update(words)

            # 更新总体词频
            for word in words:
                overall_statistics["word_frequencies"][word] = overall_statistics["word_frequencies"].get(word, 0) + 1

            # POS 标注
            pos_tags = pos_tag_text(paragraph)
            for _, pos in pos_tags:
                file_stats["pos_distribution"][pos] = file_stats["pos_distribution"].get(pos, 0) + 1
                overall_statistics["pos_distribution"][pos] = overall_statistics["pos_distribution"].get(pos, 0) + 1

        file_stats["total_types"] = len(unique_words)
        file_stats["pos_distribution"] = dict(file_stats["pos_distribution"])
        file_stats["unique_words"] = list(unique_words)

        overall_statistics["unique_words"].update(unique_words)
        overall_statistics["document_statistics"].append(file_stats)

    overall_statistics["unique_words"] = list(overall_statistics["unique_words"])
    return overall_statistics

# POS 标注函数
def pos_tag_text(text):
    """
    对段落进行 POS 标注
    """
    doc = nlp(text)
    return [(word.text, word.upos) for sentence in doc.sentences for word in sentence.words]

# 步骤 3: 保存统计结果到文件
def save_statistics_to_files(statistics, json_path, csv_word_freq_path, csv_pos_stats_path):
    # 保存到 JSON 文件
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(statistics, json_file, ensure_ascii=False, indent=4)
    print(f"统计信息已保存到 JSON 文件: {json_path}")

    # 保存词频到 CSV
    with open(csv_word_freq_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Word", "Frequency"])  # 写入表头
        for word, freq in statistics["word_frequencies"].items():
            writer.writerow([word, freq])
    print(f"词频统计已保存到 CSV 文件: {csv_word_freq_path}")

    # 保存 POS 分布到 CSV
    with open(csv_pos_stats_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["POS Tag", "Frequency"])  # 写入表头
        for pos, freq in statistics["pos_distribution"].items():
            writer.writerow([pos, freq])
    print(f"POS 统计已保存到 CSV 文件: {csv_pos_stats_path}")

# 主流程
if __name__ == "__main__":
    # 步骤 1: 加载文件
    print("开始加载所有 lemmatized 文件...")
    files_data = load_and_clean_lemmatized_files(lemmatized_dir)
    print(f"已加载 {len(files_data)} 个文件，共包含 {sum(len(v) for v in files_data.values())} 个段落。")
    print(f"示例文件内容: {list(files_data.items())[0] if files_data else '无文件'}")

    # 询问是否继续
    if input("是否继续统计词频和 POS 数据？(y/n): ").strip().lower() != 'y':
        print("已退出程序。")
        exit()

    # 步骤 2: 统计数据
    print("开始统计词频和 POS 分布...")
    overall_statistics = calculate_statistics_for_all(files_data)
    print(f"总词数: {overall_statistics['total_words']}")
    print(f"独特词数: {len(overall_statistics['unique_words'])}")

    # 询问是否保存
    if input("是否保存统计结果到文件？(y/n): ").strip().lower() != 'y':
        print("已退出程序。")
        exit()

    # 步骤 3: 保存结果
    print("开始保存统计结果...")
    save_statistics_to_files(overall_statistics, json_path, csv_word_freq_path, csv_pos_stats_path)
    print("所有步骤已完成！")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 27.9MB/s]                    
2024-11-22 23:58:14 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-22 23:58:14 INFO: Downloading default packages for language: la (Latin) ...
2024-11-22 23:58:15 INFO: File exists: /Users/jessie/stanza_resources/la/default.zip
2024-11-22 23:58:16 INFO: Finished downloading models and saved to /Users/jessie/stanza_resources
2024-11-22 23:58:16 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 45.0MB/s]                    
2024-11-22 23:58:16 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-22 23:58:16 INFO: Loading these model

开始加载所有 lemmatized 文件...
已加载 308 个文件，共包含 4645 个段落。
示例文件内容: ('h180_197_b_lemmatized.txt', ['epistula dominicus debitor sum plato constantius romanos concludo praemitto paulus sum vita sum osdroena ismael sum vita sum spiritus', 'aristoteles dico ethicus vita voluptuosus vita politicus vita contemplativus omnis philosophus concors paulo voluptas osdroena fugio doceo insector passio voluptas recedo sapientia vita rationalis fio adriel maneo homo propheta dico homo honos sum intellego comparo sum iumentum similis facio sum ille vita politicus bonus sum consisto calabria facio fio volo christus matthaei doceo lex propheta vita loquor complico is dico praeceptum vita contemplativus sum verus vita spiritus noster intellectualis sum omnis contrarius delectabilis annas vita osdroena contrarior vita spiritus voluptas contemplatio osdroena vivo morior spiritus vergo ratio bartholomaeus spiritus vivo mortifico vita osdroena compositio uterque vivo humanus politicus duco vita omnis tempus suus facio

统计文件数据: 100%|██████████| 308/308 [05:46<00:00,  1.12s/it]


总词数: 394248
独特词数: 11503
开始保存统计结果...
统计信息已保存到 JSON 文件: results/preprocessing_result/cusanus_lemma_statistics_summary.json
词频统计已保存到 CSV 文件: results/preprocessing_result/cusanus_lemma_word_frequencies.csv
POS 统计已保存到 CSV 文件: results/preprocessing_result/cusanus_lemma_pos_statistics.csv
所有步骤已完成！


In [5]:
import os
import re
import json
import csv
from tqdm import tqdm
import stanza

# 设置路径
lemmatized_dir = 'data/preprocessed/cusanus_lemma'
processed_dir = 'data/preprocessed/cusanus'
result_dir = 'results'
os.makedirs(processed_dir, exist_ok=True)
os.makedirs(result_dir, exist_ok=True)

json_path = os.path.join(result_dir, 'cusanus_statistics_summary.json')
csv_word_freq_path = os.path.join(result_dir, 'cusanus_word_frequencies.csv')
csv_pos_stats_path = os.path.join(result_dir, 'cusanus_pos_statistics.csv')

# 初始化 Stanza 模型
stanza.download('la')
nlp = stanza.Pipeline('la', processors='tokenize,pos', tokenize_pretokenized=True)

# 定义要移除的标点符号和停用词
punctuation_pattern = r'[^\w\s]'
stopwords_to_remove = {'sum', 'qui'}

# 加载所有 lemmatized 文件
def load_lemmatized_files(directory):
    all_files = {}
    for file_name in os.listdir(directory):
        if file_name.endswith('_lemmatized.txt'):
            file_path = os.path.join(directory, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                paragraphs = [line.strip() for line in file if line.strip() and not line.startswith("Paragraph")]
                all_files[file_name] = paragraphs
    return all_files

# 预处理单个段落
def preprocess_paragraph(paragraph):
    # 移除标点符号
    paragraph = re.sub(punctuation_pattern, '', paragraph)
    # 按空格分词并移除停用词
    words = [word for word in paragraph.split() if word.lower() not in stopwords_to_remove]
    return ' '.join(words)

# 预处理所有文件并保存到新目录
def preprocess_files(files, output_dir):
    for file_name, paragraphs in tqdm(files.items(), desc="预处理文件"):
        output_path = os.path.join(output_dir, file_name)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            for idx, paragraph in enumerate(paragraphs, start=1):
                processed_paragraph = preprocess_paragraph(paragraph)
                output_file.write(f"Paragraph {idx}:\n{processed_paragraph}\n\n")

# POS 标注
def pos_tag_text(text):
    doc = nlp(text)
    return [(word.text, word.upos) for sentence in doc.sentences for word in sentence.words]

# 统计文件的词频和 POS 分布
def calculate_statistics(files):
    overall_statistics = {
        "total_words": 0,
        "unique_words": set(),
        "word_frequencies": {},
        "pos_distribution": {},
        "document_statistics": []
    }

    for file_name, paragraphs in tqdm(files.items(), desc="统计文件数据"):
        file_stats = {
            "document_id": file_name,
            "total_words": 0,
            "total_types": 0,
            "pos_distribution": {}
        }
        unique_words = set()

        for paragraph in paragraphs:
            words = paragraph.split()
            file_stats["total_words"] += len(words)
            overall_statistics["total_words"] += len(words)
            unique_words.update(words)

            # 更新词频统计
            for word in words:
                overall_statistics["word_frequencies"][word] = overall_statistics["word_frequencies"].get(word, 0) + 1

            # POS 标注
            pos_tags = pos_tag_text(paragraph)
            for _, pos in pos_tags:
                file_stats["pos_distribution"][pos] = file_stats["pos_distribution"].get(pos, 0) + 1
                overall_statistics["pos_distribution"][pos] = overall_statistics["pos_distribution"].get(pos, 0) + 1

        file_stats["total_types"] = len(unique_words)
        file_stats["unique_words"] = list(unique_words)

        overall_statistics["unique_words"].update(unique_words)
        overall_statistics["document_statistics"].append(file_stats)

    overall_statistics["unique_words"] = list(overall_statistics["unique_words"])
    return overall_statistics

# 保存统计结果到文件
def save_statistics_to_files(statistics, json_path, csv_word_freq_path, csv_pos_stats_path):
    # 保存到 JSON 文件
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(statistics, json_file, ensure_ascii=False, indent=4)
    print(f"统计信息已保存到 JSON 文件: {json_path}")

    # 保存词频到 CSV
    with open(csv_word_freq_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Word", "Frequency"])
        for word, freq in statistics["word_frequencies"].items():
            writer.writerow([word, freq])
    print(f"词频统计已保存到 CSV 文件: {csv_word_freq_path}")

    # 保存 POS 分布到 CSV
    with open(csv_pos_stats_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["POS Tag", "Frequency"])
        for pos, freq in statistics["pos_distribution"].items():
            writer.writerow([pos, freq])
    print(f"POS 统计已保存到 CSV 文件: {csv_pos_stats_path}")

# 主流程
if __name__ == "__main__":
    print("开始加载 lemmatized 文件...")
    lemmatized_files = load_lemmatized_files(lemmatized_dir)

    print(f"加载完成，文件数: {len(lemmatized_files)}")
    print("示例段落: ", lemmatized_files[list(lemmatized_files.keys())[0]][:3])

    if input("是否继续进行预处理？(y/n): ").strip().lower() == 'y':
        preprocess_files(lemmatized_files, processed_dir)
        print(f"预处理完成，文件已保存到 {processed_dir}")

    if input("是否继续统计词频和 POS 数据？(y/n): ").strip().lower() == 'y':
        processed_files = load_lemmatized_files(processed_dir)  # 加载处理后的文件
        statistics = calculate_statistics(processed_files)
        print(f"总词数: {statistics['total_words']}")
        print(f"独特词数: {len(statistics['unique_words'])}")
        save_statistics_to_files(statistics, json_path, csv_word_freq_path, csv_pos_stats_path)
        print("所有步骤已完成！")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 48.0MB/s]                    
2024-11-23 00:23:55 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-23 00:23:55 INFO: Downloading default packages for language: la (Latin) ...
2024-11-23 00:23:55 INFO: File exists: /Users/jessie/stanza_resources/la/default.zip
2024-11-23 00:23:56 INFO: Finished downloading models and saved to /Users/jessie/stanza_resources
2024-11-23 00:23:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 37.7MB/s]                    
2024-11-23 00:23:57 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-23 00:23:57 INFO: Loading these model

开始加载 lemmatized 文件...
加载完成，文件数: 308
示例段落:  ['epistula, dominicus, debitor, sum, plato constantius romanos concludo, praemitto, paulus sum, vita, sum, osdroena ismael sum, vita, sum, spiritus,', 'aristoteles dico, ethicus, vita, voluptuosus, vita, politicus, vita, contemplativus, omnis, philosophus, concors, paulo voluptas, osdroena fugio, doceo, insector, passio, voluptas, recedo, sapientia, vita, rationalis, fio, adriel maneo, homo, propheta, dico, homo, honos, sum, intellego, comparo, sum, iumentum, similis, facio, sum, ille, vita, politicus, bonus, sum, consisto, calabria facio, fio, volo, christus, matthaei doceo, lex, propheta, vita, loquor, complico, is, dico, praeceptum, vita, contemplativus, sum, verus, vita, spiritus, noster, intellectualis, sum, omnis, contrarius, delectabilis, annas vita, osdroena contrarior, vita, spiritus, voluptas, contemplatio, osdroena vivo, morior, spiritus, vergo, ratio, bartholomaeus spiritus, vivo, mortifico, vita, osdroena compositio, uterque, vivo

预处理文件: 100%|██████████| 308/308 [00:00<00:00, 1956.94it/s]


预处理完成，文件已保存到 data/preprocessed/cusanus


统计文件数据: 100%|██████████| 308/308 [02:39<00:00,  1.93it/s]

总词数: 360434
独特词数: 11501
统计信息已保存到 JSON 文件: results/cusanus_statistics_summary.json
词频统计已保存到 CSV 文件: results/cusanus_word_frequencies.csv
POS 统计已保存到 CSV 文件: results/cusanus_pos_statistics.csv
所有步骤已完成！



