In [2]:
import os
from bs4 import BeautifulSoup
import stanza
import re
from tqdm import tqdm
import csv
import json
import requests

In [None]:
# Working directory for the notebook; the relative data/ and results/ paths used
# later are resolved against it. The CUSANUS_PROJECT_ROOT environment variable
# overrides the default, which is the original author's local checkout.
project_root = os.environ.get(
    'CUSANUS_PROJECT_ROOT',
    '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling',
)
os.chdir(project_root)
print("Current working directory: ", os.getcwd())

In [4]:
# Relative locations of the XML input, the lemmatized text output and the
# statistics output.
input_dir = 'data/v_variant'
preprocessed_dir = 'data/preprocessed/stanza'
result_dir = 'results/preprocessing_result'

# Make sure both output directories exist; existing ones are left untouched.
for _required_dir in (preprocessed_dir, result_dir):
    os.makedirs(_required_dir, exist_ok=True)

In [5]:
# Load the stop-word list: every non-blank line of the downloaded file becomes one entry.
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
# Bound the wait and fail loudly on an HTTP error; without the status check the
# body of an error page would silently be used as the stop-word list.
response = requests.get(stopwords_url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
latin_stopwords = set(line.strip() for line in response.text.splitlines() if line.strip())

# Extra words removed in addition to the downloaded list.
additional_stopwords = {
    'ego', 'mei', 'mihi', 'me', 'tu', 'tui', 'tibi', 'te',
    'nos', 'noster', 'nobis', 'vos', 'vester',
    'sui', 'sibi', 'se',
    'ab', 'ex', 'ad', 'in', 'de', 'per', 'cum', 'sub', 'pro',
    'ante', 'post', 'supra', 'et', 'ac', 'aut', 'nec', 'sed',
    'ut', 'si', 'atque', 'qui', 'quae', 'quod', 'quis', 'quid', 'non', 'ne'
}
latin_stopwords.update(additional_stopwords)


In [None]:
# Initialise the Stanza NLP model for Latin ('la') with tokenization,
# POS tagging and lemmatization.
stanza.download(lang='la')
nlp = stanza.Pipeline(lang='la', processors='tokenize,pos,lemma')

In [7]:
# Text preprocessing helper
def preprocess_text(words):
    """Return the items of ``words`` whose lowercase form is not in ``latin_stopwords``.

    The order and the original casing of the surviving words are preserved.
    """
    kept = []
    for token in words:
        if token.lower() in latin_stopwords:
            continue
        kept.append(token)
    return kept

In [8]:
# POS tagging helper
def pos_tag_text(text):
    """Run ``text`` through the ``nlp`` pipeline and return ``(word text, UPOS)`` pairs.

    Pairs are listed sentence by sentence, in the order the pipeline yields them.
    """
    tagged = []
    for sentence in nlp(text).sentences:
        for token in sentence.words:
            tagged.append((token.text, token.upos))
    return tagged


In [9]:
# Stanza lemmatization helper
def lemmatize_with_stanza(text):
    """Lemmatize ``text`` with the ``nlp`` pipeline and return one string per word.

    Args:
        text: The text to analyse (callers in this file pass space-joined words).

    Returns:
        list[str]: One entry per word in pipeline order; an empty list when
        ``text`` is empty or contains only whitespace.
    """
    # A paragraph without usable words produces an empty string; there is
    # nothing to lemmatize, so do not invoke the pipeline at all.
    if not text or not text.strip():
        return []

    doc = nlp(text)
    # Callers lower-case and ' '.join the result, which breaks on None.
    # Fall back to the surface form whenever a word carries no lemma.
    return [
        word.lemma or word.text
        for sentence in doc.sentences
        for word in sentence.words
    ]


In [10]:
def process_file_with_stanza(file_path, preprocessed_path):
    """
    Process a single file with Stanza and return its sections.

    Every ``<fw type="n">`` element opens a section consisting of the ``<p>``
    siblings that follow it, up to the next ``<fw>`` element. For each ``<p>``
    the ``<w>`` elements are read (``rend`` attribute when present, element
    text otherwise, lower-cased), filtered, lemmatized with
    ``lemmatize_with_stanza`` and stripped of stop words with
    ``preprocess_text``. The sections are written to ``preprocessed_path`` as
    numbered "Paragraph N:" entries and returned as a list of
    ``{'content': str}`` dicts.
    """
    german_letters = re.compile(r'[äöüß]')
    cum_marker = re.compile(r'^cum\W*\d*$')

    with open(file_path, 'r', encoding='utf-8') as source_file:
        soup = BeautifulSoup(source_file, 'lxml')
        sections = []

        for marker in soup.find_all('fw', {'type': 'n'}):
            lemmatized_parts = []
            node = marker.find_next_sibling()
            while node and node.name != 'fw':
                if node.name == 'p':
                    surface_forms = (
                        w_tag.get('rend', w_tag.get_text()).lower()
                        for w_tag in node.find_all('w')
                    )
                    # Drop words containing ä/ö/ü/ß (German) and tokens matching
                    # the "cum" + non-word characters + digits pattern.
                    kept_forms = [
                        form for form in surface_forms
                        if not (german_letters.search(form) or cum_marker.match(form))
                    ]

                    # Lemmatize first, then remove stop words from the lemmas.
                    lemmas = lemmatize_with_stanza(' '.join(kept_forms))
                    lemmatized_parts.append(' '.join(preprocess_text(lemmas)))

                node = node.find_next_sibling()

            sections.append({'content': ' '.join(lemmatized_parts)})

        # Persist the sections as numbered paragraphs.
        with open(preprocessed_path, 'w', encoding='utf-8') as out_file:
            for number, section in enumerate(sections, start=1):
                out_file.write(f"Paragraph {number}:\n")
                out_file.write(f"{section['content']}\n\n")

        return sections


In [11]:
# Statistics computation (word frequencies and per-document statistics)
def calculate_statistics_with_documents(paragraphs, document_id, overall_stats):
    """
    Compute the statistics of one document and fold them into ``overall_stats``.

    Args:
        paragraphs: List of ``{'content': str}`` dicts; each content string is
            split on whitespace to obtain its words.
        document_id: Identifier stored under ``"document_id"`` in the result.
        overall_stats: Accumulator dict, updated in place. The keys used are
            ``"total_words"`` (int), ``"unique_words"`` (set),
            ``"pos_distribution"`` (dict) and ``"word_frequencies"`` (dict).

    Returns:
        dict: Per-document statistics with the keys ``document_id``,
        ``total_paragraphs``, ``total_words``, ``total_types``,
        ``pos_distribution``, ``lemmatized_content`` and ``unique_words``
        (a sorted list, so the result can be passed to ``json.dump``).
    """
    doc_stats = {
        "document_id": document_id,
        "total_paragraphs": len(paragraphs),
        "total_words": 0,
        "total_types": 0,
        "pos_distribution": {},
        "lemmatized_content": []  # lemmatized text of every paragraph
    }
    unique_words = set()
    word_frequencies = overall_stats["word_frequencies"]

    for paragraph in paragraphs:
        content = paragraph['content']
        words = content.split()
        doc_stats["total_words"] += len(words)
        unique_words.update(words)

        # Count every occurrence. Iterating the de-duplicated set instead would
        # record in how many documents a word appears, not how often it occurs.
        for word in words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

        # POS-tag the paragraph and accumulate the tag distribution. Paragraphs
        # without words are skipped: there is nothing to tag.
        if words:
            for _, pos in pos_tag_text(content):
                doc_stats["pos_distribution"][pos] = doc_stats["pos_distribution"].get(pos, 0) + 1
                overall_stats["pos_distribution"][pos] = overall_stats["pos_distribution"].get(pos, 0) + 1

        # Keep the paragraph text alongside the counts.
        doc_stats["lemmatized_content"].append(content)

    # Document-level totals. The vocabulary is stored as a sorted list because a
    # set cannot be serialized by json.dump.
    doc_stats["total_types"] = len(unique_words)
    doc_stats["unique_words"] = sorted(unique_words)

    # Corpus-level totals.
    overall_stats["total_words"] += doc_stats["total_words"]
    overall_stats["unique_words"].update(unique_words)

    return doc_stats

In [12]:
# Save the per-document statistics together with a totals row
def save_document_statistics_with_totals(document_stats, result_dir):
    """Write ``stanza_statistics.csv`` into ``result_dir``.

    The file holds a header, one row per entry of ``document_stats``
    (document id, paragraph/word/type counts and the POS distribution as a
    JSON string) and a final "ALL DOCUMENTS" row. That row carries the summed
    paragraph and word counts and the number of distinct words across every
    document's ``unique_words``.
    """
    csv_path = os.path.join(result_dir, "stanza_statistics.csv")

    total_words = 0
    for doc in document_stats:
        total_words += doc["total_words"]

    vocabulary = set()
    for doc in document_stats:
        vocabulary.update(doc.get("unique_words", []))
    total_types = len(vocabulary)

    header = ["Document ID", "Total Paragraphs", "Total Words", "Total Types", "POS Distribution"]
    plain_columns = ("document_id", "total_paragraphs", "total_words", "total_types")

    with open(csv_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)

        # One row per document.
        for entry in document_stats:
            row = [entry[column] for column in plain_columns]
            row.append(json.dumps(entry["pos_distribution"]))
            writer.writerow(row)

        # Totals row.
        paragraph_total = sum(doc["total_paragraphs"] for doc in document_stats)
        writer.writerow(["ALL DOCUMENTS", paragraph_total, total_words, total_types, "N/A"])

    print(f"文档级统计信息已保存到 {csv_path}")

In [13]:
def save_word_frequency_with_pos(overall_stats, result_dir, word_pos=None):
    """
    Save the word frequencies to ``stanza_word_frequency.csv`` in ``result_dir``
    with the columns Word, Frequency and POS.

    Args:
        overall_stats: Dict read for ``"word_frequencies"`` (word -> count) and,
            when ``word_pos`` is not given, for an optional ``"word_pos"``
            mapping (word -> POS tag).
        result_dir: Directory the CSV file is written to.
        word_pos: Optional mapping word -> POS tag; takes precedence over
            ``overall_stats["word_pos"]``.

    Words without an entry in the mapping are written with POS "UNKNOWN".
    """
    word_freq_csv_path = os.path.join(result_dir, "stanza_word_frequency.csv")
    word_frequencies = overall_stats.get("word_frequencies", {})

    # "pos_distribution" only holds corpus-wide tag counts and says nothing about
    # individual words. Deriving a per-word tag from it labelled every word with
    # whichever tag happened to come first. Use a real word -> tag mapping when
    # one is supplied and report "UNKNOWN" otherwise.
    if word_pos is None:
        word_pos = overall_stats.get("word_pos", {})

    with open(word_freq_csv_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Word", "Frequency", "POS"])  # header
        for word, freq in word_frequencies.items():
            writer.writerow([word, freq, word_pos.get(word, "UNKNOWN")])
    print(f"词频统计信息已保存到 {word_freq_csv_path}")

In [21]:
# Save the statistics (Stanza-specific)
def save_overall_statistics_for_stanza(overall_stats, document_stats, result_dir):
    """
    Save the overall and per-document statistics as
    ``stanza_statistics_summary.json`` in ``result_dir``.

    Args:
        overall_stats: Accumulator dict; ``"unique_words"`` must support
            ``len()``. A ``"total_types"`` entry is added to it.
        document_stats: List of per-document statistic dicts.
        result_dir: Directory the JSON file is written to.

    Sets found anywhere in the data are written as sorted lists. The
    ``"unique_words"`` container of ``overall_stats`` itself is left unchanged.

    Raises:
        TypeError: If the data contains a value json cannot serialize; in that
            case no file is written.
    """
    statistics_json_path = os.path.join(result_dir, "stanza_statistics_summary.json")

    def _serialize_sets(value):
        # json "default" hook: sets become sorted lists (key=str keeps the sort
        # from failing on mixed element types); anything else stays an error.
        if isinstance(value, (set, frozenset)):
            return sorted(value, key=str)
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    overall_stats["total_types"] = len(overall_stats["unique_words"])

    data_to_save = {
        "overall_statistics": overall_stats,
        "document_statistics": document_stats
    }

    # Serialize completely before touching the file: a failure must not leave a
    # truncated summary on disk that a later run would take for a finished one.
    payload = json.dumps(data_to_save, ensure_ascii=False, indent=4, default=_serialize_sets)

    with open(statistics_json_path, 'w', encoding='utf-8') as json_file:
        json_file.write(payload)

    print(f"Stanza 统计信息已保存到 {statistics_json_path}")


# Main flow (Stanza only)
def _read_preprocessed_paragraphs(preprocessed_path):
    """Rebuild the paragraph list from a file written by process_file_with_stanza.

    Each "Paragraph N:" header line opens a new ``{'content': str}`` entry and
    the non-blank lines that follow it become its content, so paragraphs with
    empty content are kept.
    """
    paragraphs = []
    with open(preprocessed_path, 'r', encoding='utf-8') as preprocessed_file:
        for raw_line in preprocessed_file:
            line = raw_line.strip()
            if re.fullmatch(r'Paragraph \d+:', line):
                paragraphs.append({'content': ''})
            elif line and paragraphs:
                current = paragraphs[-1]
                current['content'] = f"{current['content']} {line}".strip()
    return paragraphs


def main_stanza_pipeline(input_dir, preprocessed_dir, result_dir):
    """
    Main processing flow for Stanza.

    Every ``.xml`` file in ``input_dir`` (in sorted order) is preprocessed into
    ``preprocessed_dir`` unless its ``*_lemmatized.txt`` output already exists.
    Unless ``stanza_statistics_summary.json`` is already present in
    ``result_dir``, statistics are computed for every document and saved.
    Documents whose preprocessed output was reused are included by reading
    that output back in.

    Args:
        input_dir: Directory containing the ``.xml`` input files.
        preprocessed_dir: Directory for the ``*_lemmatized.txt`` files.
        result_dir: Directory for the statistics summary.
    """
    overall_stats = {
        "total_words": 0,
        "unique_words": set(),
        "pos_distribution": {},
        "word_frequencies": {}
    }
    document_stats = []

    # Decide up front whether statistics are needed, so no tagging work is done
    # for a summary that is going to be skipped anyway.
    statistics_json_path = os.path.join(result_dir, "stanza_statistics_summary.json")
    need_statistics = not os.path.exists(statistics_json_path)

    # Sorted for a reproducible processing and output order.
    xml_files = sorted(name for name in os.listdir(input_dir) if name.endswith('.xml'))

    for file_name in tqdm(xml_files, desc="文件处理进度"):
        input_path = os.path.join(input_dir, file_name)
        preprocessed_path = os.path.join(preprocessed_dir, file_name.replace('.xml', '_lemmatized.txt'))

        try:
            if os.path.exists(preprocessed_path):
                # Reuse the earlier output instead of lemmatizing again. The
                # document must still take part in the statistics, otherwise a
                # rerun produces a summary that is missing every cached file.
                print(f"{preprocessed_path} 已存在，跳过预处理...")
                paragraphs = _read_preprocessed_paragraphs(preprocessed_path) if need_statistics else []
            else:
                paragraphs = process_file_with_stanza(input_path, preprocessed_path)

            if need_statistics and paragraphs:
                doc_stat = calculate_statistics_with_documents(paragraphs, file_name, overall_stats)
                document_stats.append(doc_stat)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"处理文件 {file_name} 时出错: {e}")

    if need_statistics:
        save_overall_statistics_for_stanza(overall_stats, document_stats, result_dir)
    else:
        print(f"{statistics_json_path} 已存在，跳过统计...")
    print("Stanza 所有文件处理完成并生成统计信息！")


In [None]:
# Run the main pipeline with the directories configured above
if __name__ == "__main__":
    main_stanza_pipeline(input_dir, preprocessed_dir, result_dir)

In [24]:
import os
import json
import csv
from tqdm import tqdm
import stanza
import re

# Path configuration
processed_dir = 'data/preprocessed/cusanus'  # directory with the already processed files
updated_dir = 'data/preprocessed/cusanus_cleaned'  # directory for the files after the second stop-word pass
result_dir = 'results'  # directory for the statistics output

# Make sure both output directories exist; existing ones are left untouched.
for _required_dir in (updated_dir, result_dir):
    os.makedirs(_required_dir, exist_ok=True)

# Output files, all located inside result_dir.
json_path, csv_word_freq_path, csv_pos_stats_path = (
    os.path.join(result_dir, base_name)
    for base_name in (
        'cusanus_cleaned_statistics_summary.json',
        'cusanus_cleaned_word_frequencies.csv',
        'cusanus_cleaned_pos_statistics.csv',
    )
)

# Load the Stanza model for Latin ('la'); this pipeline only tokenizes and
# POS-tags, and is configured with tokenize_pretokenized=True.
stanza.download(lang='la')
nlp = stanza.Pipeline(lang='la', processors='tokenize,pos', tokenize_pretokenized=True)

# Load the stop-word list: every non-blank line of the downloaded file becomes one entry.
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
# Bound the wait and fail loudly on an HTTP error; without the status check the
# body of an error page would silently be used as the stop-word list.
response = requests.get(stopwords_url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
latin_stopwords = set(line.strip() for line in response.text.splitlines() if line.strip())

# Extra stop words, if needed
additional_stopwords = {'sum', 'qui'}  # words that were processed earlier but not removed
latin_stopwords.update(additional_stopwords)

# Load the already processed files
def load_processed_files(directory):
    """
    Read every ``*_lemmatized.txt`` file in ``directory``.

    Returns a dict mapping file name -> list of stripped lines, leaving out
    blank lines and lines that start with "Paragraph".
    """
    loaded = {}
    for entry in os.listdir(directory):
        if not entry.endswith('_lemmatized.txt'):
            continue

        kept_lines = []
        with open(os.path.join(directory, entry), 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                if raw_line.startswith("Paragraph"):
                    continue
                stripped = raw_line.strip()
                if stripped:
                    kept_lines.append(stripped)
        loaded[entry] = kept_lines
    return loaded

# Second stop-word pass
def clean_stopwords(paragraph):
    """
    Remove the stop words from a paragraph.

    The paragraph is split on whitespace; words whose lowercase form is in
    ``latin_stopwords`` are dropped and the rest is re-joined with single spaces.
    """
    kept = []
    for token in paragraph.split():
        if token.lower() not in latin_stopwords:
            kept.append(token)
    return ' '.join(kept)

def clean_files(files_data, output_dir):
    """
    Remove stop words from every file and save the result in ``output_dir``.

    ``files_data`` maps file name -> list of paragraph strings. Each paragraph
    is passed through ``clean_stopwords`` and written under its original file
    name as a numbered "Paragraph N:" entry.
    """
    for file_name, paragraphs in tqdm(files_data.items(), desc="清理停用词"):
        target_path = os.path.join(output_dir, file_name)
        with open(target_path, 'w', encoding='utf-8') as sink:
            sink.writelines(
                f"Paragraph {number}:\n{clean_stopwords(text)}\n\n"
                for number, text in enumerate(paragraphs, start=1)
            )

# POS tagging
def pos_tag_text(text):
    """
    POS-tag ``text`` with the ``nlp`` pipeline.

    Returns a list of ``(word text, UPOS)`` tuples, sentence by sentence.
    """
    pairs = []
    for sent in nlp(text).sentences:
        pairs.extend((item.text, item.upos) for item in sent.words)
    return pairs

# Word-frequency and POS statistics over the files
def calculate_statistics(files_data):
    """
    Compute word frequencies and the POS distribution of the files.

    ``files_data`` maps file name -> list of paragraph strings. The returned
    dict holds the corpus totals (``total_words``, ``unique_words`` as a list,
    ``word_frequencies``, ``pos_distribution``) and, under
    ``document_statistics``, one dict per file with its own word/type counts,
    POS distribution and vocabulary list.
    """
    def bump(counter, key):
        # Increment counter[key], starting from zero for unseen keys.
        counter[key] = counter.get(key, 0) + 1

    corpus_total = 0
    corpus_vocabulary = set()
    corpus_word_counts = {}
    corpus_pos_counts = {}
    per_document = []

    for file_name, paragraphs in tqdm(files_data.items(), desc="统计文件数据"):
        doc_total = 0
        doc_vocabulary = set()
        doc_pos_counts = {}

        for paragraph in paragraphs:
            tokens = paragraph.split()
            doc_total += len(tokens)
            doc_vocabulary.update(tokens)

            # Word frequencies: one increment per occurrence.
            for token in tokens:
                bump(corpus_word_counts, token)

            # POS tags count towards both the file and the corpus.
            for _, tag in pos_tag_text(paragraph):
                bump(doc_pos_counts, tag)
                bump(corpus_pos_counts, tag)

        corpus_total += doc_total
        corpus_vocabulary.update(doc_vocabulary)
        per_document.append({
            "document_id": file_name,
            "total_words": doc_total,
            "total_types": len(doc_vocabulary),
            "pos_distribution": doc_pos_counts,
            "unique_words": list(doc_vocabulary),
        })

    return {
        "total_words": corpus_total,
        "unique_words": list(corpus_vocabulary),
        "word_frequencies": corpus_word_counts,
        "pos_distribution": corpus_pos_counts,
        "document_statistics": per_document,
    }

# Save the statistics
def save_statistics(statistics, json_path, csv_word_freq_path, csv_pos_stats_path):
    """
    Save the statistics to one JSON file and two CSV files.

    The complete ``statistics`` dict goes to ``json_path``,
    ``statistics["word_frequencies"]`` to ``csv_word_freq_path`` and
    ``statistics["pos_distribution"]`` to ``csv_pos_stats_path``; each CSV has
    a header row followed by one ``key, count`` row per entry.
    """
    def write_counts(path, header, section):
        # Header first, then one row per item of statistics[section].
        with open(path, 'w', encoding='utf-8', newline='') as handle:
            table = csv.writer(handle)
            table.writerow(header)
            table.writerows([key, count] for key, count in statistics[section].items())

    # JSON dump of everything
    with open(json_path, 'w', encoding='utf-8') as handle:
        json.dump(statistics, handle, ensure_ascii=False, indent=4)
    print(f"统计信息已保存到: {json_path}")

    # Word frequencies as CSV
    write_counts(csv_word_freq_path, ["Word", "Frequency"], "word_frequencies")
    print(f"词频已保存到: {csv_word_freq_path}")

    # POS distribution as CSV
    write_counts(csv_pos_stats_path, ["POS Tag", "Frequency"], "pos_distribution")
    print(f"POS 统计已保存到: {csv_pos_stats_path}")

# Main flow: load the processed files, then optionally run the stop-word
# cleaning and the statistics step, each after an interactive y/n prompt.
if __name__ == "__main__":
    def _confirmed(prompt):
        # True only when the answer, trimmed and lower-cased, is exactly 'y'.
        return input(prompt).strip().lower() == 'y'

    print("加载已处理的文件...")
    processed_files = load_processed_files(processed_dir)
    print(f"加载完成，文件数: {len(processed_files)}")

    if _confirmed("是否继续清理停用词？(y/n): "):
        clean_files(processed_files, updated_dir)
        print(f"停用词清理完成，文件已保存到 {updated_dir}")

    if _confirmed("是否继续统计词频和 POS 分布？(y/n): "):
        # Statistics are always computed from what is on disk in updated_dir.
        cleaned_files = load_processed_files(updated_dir)
        statistics = calculate_statistics(cleaned_files)
        print(f"总词数: {statistics['total_words']}")
        print(f"独特词数: {len(statistics['unique_words'])}")
        save_statistics(statistics, json_path, csv_word_freq_path, csv_pos_stats_path)
        print("所有统计任务已完成！")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 19.8MB/s]                    
2024-11-23 00:33:00 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-23 00:33:01 INFO: Downloading default packages for language: la (Latin) ...
2024-11-23 00:33:02 INFO: File exists: /Users/jessie/stanza_resources/la/default.zip
2024-11-23 00:33:03 INFO: Finished downloading models and saved to /Users/jessie/stanza_resources
2024-11-23 00:33:03 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 26.3MB/s]                    
2024-11-23 00:33:03 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-23 00:33:03 INFO: Loading these model

加载已处理的文件...
加载完成，文件数: 308


清理停用词: 100%|██████████| 308/308 [00:00<00:00, 2875.47it/s]


停用词清理完成，文件已保存到 data/preprocessed/cusanus_cleaned


统计文件数据: 100%|██████████| 308/308 [02:17<00:00,  2.24it/s]

总词数: 307521
独特词数: 11433
统计信息已保存到: results/cusanus_cleaned_statistics_summary.json
词频已保存到: results/cusanus_cleaned_word_frequencies.csv
POS 统计已保存到: results/cusanus_cleaned_pos_statistics.csv
所有统计任务已完成！



