In [None]:
import os
from bs4 import BeautifulSoup
from cltk.lemmatize.lat import LatinBackoffLemmatizer
import stanza
import re
from tqdm import tqdm
import csv
from collections import Counter
import requests
import json

In [None]:
# Set working directory to project root so that the relative data/ and results/
# paths defined in the following cells resolve against it.
# NOTE(review): this is an absolute, machine-specific path; it has to be adjusted
# before the notebook can be run on another machine.
project_root = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
os.chdir(project_root)
print("Current working directory: ", os.getcwd())

In [3]:
# Input and output paths (relative to the current working directory)
input_dir = 'data/v_variant'
preprocessed_dir = 'data/preprocessed/cltk'
result_dir = 'results/preprocessing_result'
# Create both output directories up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(preprocessed_dir, exist_ok=True)
os.makedirs(result_dir, exist_ok=True)

In [4]:
# Load Latin stopwords from URL
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(stopwords_url, timeout=30)
# Fail loudly on an HTTP error; otherwise the body of an error page would
# silently be parsed into the stopword set.
response.raise_for_status()
response.encoding = 'utf-8'
# One stopword per non-empty line, surrounding whitespace removed.
latin_stopwords = set(line.strip() for line in response.text.splitlines() if line.strip())

# Add additional stopwords
additional_stopwords = {
    'ego', 'mei', 'mihi', 'me', 'tu', 'tui', 'tibi', 'te',
    'nos', 'noster', 'nobis', 'vos', 'vester',
    'sui', 'sibi', 'se',
    'ab', 'ex', 'ad', 'in', 'de', 'per', 'cum', 'sub', 'pro',
    'ante', 'post', 'supra', 'et', 'ac', 'aut', 'nec', 'sed',
    'ut', 'si', 'atque', 'qui', 'quae', 'quod', 'quis', 'quid', 'non', 'ne'
}
latin_stopwords.update(additional_stopwords)

In [None]:
# Initialize CLTK lemmatizer
lemmatizer = LatinBackoffLemmatizer()

# Initialize Stanza NLP model for Latin
stanza.download('la')  # Download Latin model
# No `processors` argument is given here; a later cell rebuilds `nlp` with
# processors='tokenize,pos'. Only the `upos` of each word is read from this pipeline.
nlp = stanza.Pipeline('la')

In [6]:
# Stopword filter used by the preprocessing pipeline
def preprocess_text(words):
    """
    Return the words that are not Latin stopwords.

    The comparison is case-insensitive (each word is lower-cased before the
    lookup in ``latin_stopwords``); surviving words keep their original
    casing and order.

    Parameters:
        words (iterable of str): Tokens to filter.

    Returns:
        list[str]: The tokens whose lower-cased form is not a stopword.
    """
    kept = []
    for token in words:
        if token.lower() in latin_stopwords:
            continue
        kept.append(token)
    return kept


In [7]:
def pos_tag_text(text):
    """
    Tag ``text`` with the module-level Stanza pipeline ``nlp``.

    Parameters:
        text (str): Text to analyse.

    Returns:
        list[tuple]: One ``(word.text, word.upos)`` pair per word, sentence by
        sentence, in pipeline order.
    """
    tagged = []
    for sentence in nlp(text).sentences:
        for word in sentence.words:
            tagged.append((word.text, word.upos))
    return tagged


In [8]:
# CLTK lemmatization helper
def lemmatize_with_cltk(words):
    """
    Lemmatize each word on its own with the module-level CLTK ``lemmatizer``.

    Every word is passed to ``lemmatizer.lemmatize`` as a one-element list and
    the lemma of the first returned pair is taken; when that lemma is empty
    (falsy) the original word is kept instead.

    Parameters:
        words (iterable of str): Tokens to lemmatize.

    Returns:
        list[str]: One lemma (or the unchanged word) per input token, in order.
    """
    def _lemma_or_surface(token):
        candidate = lemmatizer.lemmatize([token])[0][1]
        return candidate if candidate else token

    return [_lemma_or_surface(token) for token in words]

In [25]:
def process_file_with_cltk(file_path, preprocessed_path):
    """
    Extract, clean, lemmatize and stopword-filter the text of one XML file.

    The file is parsed with BeautifulSoup's ``lxml`` parser. Every
    ``<fw type="n">`` element opens a section: the ``<p>`` elements among its
    following siblings, up to the next ``<fw>`` sibling, are each reduced to a
    space-joined string of lemmas, and those strings are joined into the
    section's content. All sections are written to ``preprocessed_path`` as
    numbered ``Paragraph N:`` blocks and are also returned.

    For each ``<w>`` element the ``rend`` attribute is used as the word when
    present, otherwise the element text; the word is lower-cased, lemmatized
    through ``lemmatize_with_cltk`` (which keeps the word itself when no lemma
    is produced) and finally filtered with ``preprocess_text``.

    Parameters:
        file_path (str): Path of the XML file to read (UTF-8).
        preprocessed_path (str): Path of the text file to write (UTF-8).

    Returns:
        list[dict]: One ``{'content': str}`` entry per ``<fw type="n">``
        element, in document order.
    """
    # BeautifulSoup reads the whole file while building the tree, so the
    # handle can be released before the tree is walked.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    paragraphs = []

    for fw_tag in soup.find_all('fw', {'type': 'n'}):
        section_content = []
        next_sibling = fw_tag.find_next_sibling()

        while next_sibling and next_sibling.name != 'fw':
            if next_sibling.name == 'p':
                words = []
                for w in next_sibling.find_all('w'):
                    surface = w.get_text()
                    # Skip words containing German letters (upper- or lower-case)
                    # and tokens of the form "cum" + optional non-word chars/digits.
                    if re.search(r'[äöüß]', surface, re.IGNORECASE) or re.match(r'^cum\W*\d*$', surface):
                        continue

                    original_word = w.get('rend', surface).lower()
                    # Nothing to lemmatize; avoids feeding empty tokens to CLTK
                    # and emitting empty entries into the output.
                    if not original_word.strip():
                        continue

                    # Shared helper: same per-word CLTK call, plus the fallback
                    # to the word itself when the lemma comes back empty.
                    words.extend(lemmatize_with_cltk([original_word]))

                # Remove stopwords
                filtered_words = preprocess_text(words)
                section_content.append(' '.join(filtered_words))

            next_sibling = next_sibling.find_next_sibling()

        paragraphs.append({'content': ' '.join(section_content)})

    # Save to the *_lemmatized.txt file: a "Paragraph N:" header line, the
    # content line, then a blank line.
    with open(preprocessed_path, 'w', encoding='utf-8') as preprocessed_file:
        for idx, paragraph in enumerate(paragraphs, start=1):
            preprocessed_file.write(f"Paragraph {idx}:\n")
            preprocessed_file.write(f"{paragraph['content']}\n\n")

    return paragraphs


In [26]:
# Per-document statistics
def calculate_statistics_with_documents(paragraphs, document_id, overall_stats):
    """
    Compute the statistics of one document and fold them into ``overall_stats``.

    Parameters:
        paragraphs (list[dict]): Entries with a ``'content'`` string of
            whitespace-separated words.
        document_id: Identifier stored under ``"document_id"``.
        overall_stats (dict): Corpus-level accumulator, updated in place. It
            must provide ``"total_words"`` (int), ``"unique_words"`` (set),
            ``"pos_distribution"`` (dict) and ``"word_frequencies"`` (dict).

    Returns:
        dict: ``document_id``, ``total_paragraphs``, ``total_words``,
        ``total_types``, ``pos_distribution``, ``lemmatized_content`` and
        ``unique_words``. ``unique_words`` is a sorted list (not a set) so the
        result can be passed to ``json.dump`` directly.
    """
    doc_stats = {
        "document_id": document_id,
        "total_paragraphs": len(paragraphs),
        "total_words": 0,
        "total_types": 0,
        "pos_distribution": {},
        "lemmatized_content": []
    }
    unique_words = set()
    word_counts = Counter()

    for paragraph in paragraphs:
        content = paragraph['content']
        words = content.split()
        doc_stats["total_words"] += len(words)
        unique_words.update(words)
        # Count every occurrence, not just presence in the document.
        word_counts.update(words)

        # POS tagging and distribution; an empty paragraph has nothing to tag.
        if words:
            for _, pos in pos_tag_text(content):
                doc_stats["pos_distribution"][pos] = doc_stats["pos_distribution"].get(pos, 0) + 1
                overall_stats["pos_distribution"][pos] = overall_stats["pos_distribution"].get(pos, 0) + 1

        # Keep the paragraph text
        doc_stats["lemmatized_content"].append(content)

    # Document-level totals
    doc_stats["total_types"] = len(unique_words)
    # A set is not JSON serializable; a sorted list also gives stable output.
    doc_stats["unique_words"] = sorted(unique_words)

    # Corpus-level totals
    overall_stats["total_words"] += doc_stats["total_words"]
    overall_stats["unique_words"].update(unique_words)

    # Corpus-level word frequencies (number of occurrences across documents)
    frequencies = overall_stats["word_frequencies"]
    for word, count in word_counts.items():
        frequencies[word] = frequencies.get(word, 0) + count

    return doc_stats

In [27]:
# Save the statistics (CLTK pipeline)
def save_overall_statistics_for_cltk(overall_stats, document_stats, result_dir):
    """
    Write the corpus and per-document statistics to
    ``<result_dir>/cltk_statistics_summary.json``.

    The caller's ``overall_stats`` is not modified: a shallow copy receives
    ``"total_types"`` and a sorted-list version of ``"unique_words"``. Any set
    found elsewhere (for example ``"unique_words"`` inside ``document_stats``)
    is serialized as a sorted list as well, so the output is valid JSON and its
    ordering is reproducible.

    Parameters:
        overall_stats (dict): Corpus-level statistics; must contain
            ``"unique_words"`` (a set or list).
        document_stats (list[dict]): Per-document statistics.
        result_dir (str): Existing directory that receives the JSON file.

    Raises:
        TypeError: If a value is neither JSON serializable nor a set.
    """
    statistics_json_path = os.path.join(result_dir, "cltk_statistics_summary.json")

    def _sets_to_sorted_lists(value):
        # json.dump calls this only for objects it cannot encode itself.
        if isinstance(value, (set, frozenset)):
            return sorted(value, key=str)
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Work on a copy so the caller keeps its set and can go on updating it.
    serializable_overall = dict(overall_stats)
    serializable_overall["total_types"] = len(overall_stats["unique_words"])
    serializable_overall["unique_words"] = sorted(overall_stats["unique_words"], key=str)

    data_to_save = {
        "overall_statistics": serializable_overall,
        "document_statistics": document_stats
    }

    with open(statistics_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(data_to_save, json_file, ensure_ascii=False, indent=4,
                  default=_sets_to_sorted_lists)

    print(f"CLTK 统计信息已保存到 {statistics_json_path}")


# Main pipeline (CLTK only)
def _load_preprocessed_paragraphs(preprocessed_path):
    """Rebuild the ``[{'content': ...}]`` list from an existing ``*_lemmatized.txt`` file."""
    with open(preprocessed_path, 'r', encoding='utf-8') as handle:
        lines = handle.read().split('\n')

    paragraphs = []
    expect_content = False
    for line in lines:
        if expect_content:
            # The line right after a "Paragraph N:" header is the content,
            # even when it is empty.
            paragraphs.append({'content': line})
            expect_content = False
        elif re.fullmatch(r'Paragraph \d+:', line):
            expect_content = True
    return paragraphs


def main_cltk_pipeline(input_dir, preprocessed_dir, result_dir):
    """
    Preprocess every ``.xml`` file of ``input_dir`` and write the statistics summary.

    Files are handled in sorted name order. A file whose ``*_lemmatized.txt``
    output already exists is not preprocessed again; its paragraphs are read
    back from that output so that the statistics still cover the whole corpus.
    Statistics are only computed when ``cltk_statistics_summary.json`` does not
    exist yet in ``result_dir``. An error in one file is reported and the
    remaining files are still processed.

    Parameters:
        input_dir (str): Directory containing the XML sources.
        preprocessed_dir (str): Directory receiving the ``*_lemmatized.txt`` files.
        result_dir (str): Directory receiving the statistics JSON.
    """
    overall_stats = {
        "total_words": 0,
        "unique_words": set(),
        "pos_distribution": {},
        "word_frequencies": {}
    }
    document_stats = []

    # Decide once whether statistics are wanted, so the POS tagging done by
    # calculate_statistics_with_documents is not run for a result that would
    # be thrown away.
    statistics_json_path = os.path.join(result_dir, "cltk_statistics_summary.json")
    statistics_needed = not os.path.exists(statistics_json_path)

    for file_name in tqdm(sorted(os.listdir(input_dir)), desc="文件处理进度"):
        if not file_name.endswith('.xml'):
            continue

        input_path = os.path.join(input_dir, file_name)
        # Swap only the trailing extension; str.replace would also rewrite
        # a '.xml' occurring earlier in the name.
        output_name = file_name[:-len('.xml')] + '_lemmatized.txt'
        preprocessed_path = os.path.join(preprocessed_dir, output_name)

        try:
            if os.path.exists(preprocessed_path):
                # Skip files that were already preprocessed
                print(f"{preprocessed_path} 已存在，跳过预处理...")
                if not statistics_needed:
                    continue
                paragraphs = _load_preprocessed_paragraphs(preprocessed_path)
            else:
                paragraphs = process_file_with_cltk(input_path, preprocessed_path)

            if statistics_needed and paragraphs:
                doc_stat = calculate_statistics_with_documents(paragraphs, file_name, overall_stats)
                document_stats.append(doc_stat)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"处理文件 {file_name} 时出错: {e}")

    # Never overwrite an existing statistics file
    if statistics_needed:
        save_overall_statistics_for_cltk(overall_stats, document_stats, result_dir)
    else:
        print(f"{statistics_json_path} 已存在，跳过统计...")
    print("CLTK 所有文件处理完成并生成统计信息！")


In [None]:
# Run the main pipeline with the directories configured in the cells above
if __name__ == "__main__":
    main_cltk_pipeline(input_dir, preprocessed_dir, result_dir)

In [None]:
import os
import json
import csv

# Input and output paths
json_path = 'results/preprocessing_result/cltk_statistics_summary.json'  # path of the already existing JSON file
csv_word_freq_path = 'results/preprocessing_result/cltk_word_frequencies.csv'  # output path of the word-frequency CSV
csv_pos_stats_path = 'results/preprocessing_result/cltk_pos_statistics.csv'  # output path of the POS-statistics CSV

# Load the statistics from a JSON file
def load_statistics_from_json(json_path):
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Parameters:
        json_path (str): Path of the JSON file.

    Returns:
        dict: The parsed JSON data.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

# Save the word frequencies to a CSV file
def save_word_frequencies_to_csv(word_frequencies, csv_path):
    """
    Write the word frequencies as a two-column CSV file.

    The file starts with the header row ``Word,Frequency`` followed by one row
    per dictionary entry, in the dictionary's iteration order.

    Parameters:
        word_frequencies (dict): Mapping of word to frequency.
        csv_path (str): Path of the CSV file to write.
    """
    with open(csv_path, 'w', encoding='utf-8', newline='') as out:
        table = csv.writer(out)
        table.writerow(["Word", "Frequency"])  # header row
        table.writerows(word_frequencies.items())
    print(f"词频统计已保存到: {csv_path}")

# Save the POS statistics to a CSV file
def save_pos_statistics_to_csv(pos_distribution, csv_path):
    """
    Write the POS tag counts as a two-column CSV file.

    The file starts with the header row ``POS Tag,Frequency`` followed by one
    row per dictionary entry, in the dictionary's iteration order.

    Parameters:
        pos_distribution (dict): Mapping of POS tag to count.
        csv_path (str): Path of the CSV file to write.
    """
    with open(csv_path, 'w', encoding='utf-8', newline='') as out:
        table = csv.writer(out)
        table.writerow(["POS Tag", "Frequency"])  # header row
        for tag_and_count in pos_distribution.items():
            table.writerow(list(tag_and_count))
    print(f"POS 统计已保存到: {csv_path}")

# Main flow: export the word-frequency and POS tables of an existing statistics JSON to CSV.
# NOTE(review): the keys are looked up under "overall_statistics", the layout written by
# save_overall_statistics_for_cltk. save_statistics_to_files (last cell) writes a flat
# layout to the same path, so after that cell has run this lookup ends in the KeyError branch.
if __name__ == "__main__":
    try:
        # Load the statistics
        statistics = load_statistics_from_json(json_path)

        # Extract the required tables
        word_frequencies = statistics["overall_statistics"]["word_frequencies"]
        pos_distribution = statistics["overall_statistics"]["pos_distribution"]

        # Save them as CSV files
        save_word_frequencies_to_csv(word_frequencies, csv_word_freq_path)
        save_pos_statistics_to_csv(pos_distribution, csv_pos_stats_path)

        print("统计数据已成功保存到 CSV 文件。")
    except FileNotFoundError:
        # The JSON file does not exist
        print(f"文件 {json_path} 不存在，请检查路径是否正确。")
    except KeyError as e:
        # The JSON file lacks one of the expected keys
        print(f"JSON 文件格式有误，缺少关键字段: {e}")
    except Exception as e:
        # Any other failure is reported instead of raised
        print(f"发生错误: {e}")


In [38]:
import os
import json
import csv
from tqdm import tqdm
import stanza

# Paths
lemmatized_dir = 'data/preprocessed/cltk'  # directory of the already existing lemmatized files
result_dir = 'results/preprocessing_result'  # output directory for the results
json_path = os.path.join(result_dir, 'cltk_statistics_summary.json')  # JSON output path
csv_word_freq_path = os.path.join(result_dir, 'cltk_word_frequencies.csv')  # word-frequency CSV output path
csv_pos_stats_path = os.path.join(result_dir, 'cltk_pos_statistics.csv')  # POS-statistics CSV output path

os.makedirs(result_dir, exist_ok=True)

# Initialise the Stanza model. This rebinds the global `nlp` created in an earlier cell,
# this time restricted to the tokenize and pos processors.
stanza.download('la')
nlp = stanza.Pipeline('la', processors='tokenize,pos')

# Step 1: load every *_lemmatized.txt file
def load_all_lemmatized_files(directory):
    """
    Load the paragraphs of all ``*_lemmatized.txt`` files in ``directory``.

    Files are read in sorted name order so that the returned dict, and
    everything derived from its iteration order, is reproducible between runs
    (``os.listdir`` returns entries in arbitrary order).

    Lines starting with ``"Paragraph"`` (the paragraph headers) and blank lines
    are dropped; every remaining line is stripped and kept as one paragraph.

    Parameters:
        directory (str): Directory containing the lemmatized text files.

    Returns:
        dict[str, list[str]]: File name mapped to its list of paragraph strings.
    """
    all_paragraphs = {}
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('_lemmatized.txt'):
            continue
        file_path = os.path.join(directory, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            all_paragraphs[file_name] = [
                line.strip()
                for line in file
                # skip paragraph headers, keep only non-empty lines
                if not line.startswith("Paragraph") and line.strip()
            ]
    return all_paragraphs

# Step 2: word-frequency and POS statistics over all files
def calculate_statistics_for_all(files_data):
    """
    Compute word counts, vocabulary and POS distribution for every file and
    for the corpus as a whole.

    Parameters:
        files_data (dict): File name mapped to a list of paragraph strings
            (whitespace-separated words).

    Returns:
        dict: ``total_words``, ``unique_words`` (list), ``word_frequencies``
        (occurrences per word), ``pos_distribution`` and
        ``document_statistics`` (one dict per file with ``document_id``,
        ``total_words``, ``total_types``, ``pos_distribution`` and
        ``unique_words``).
    """
    corpus_word_total = 0
    corpus_vocabulary = set()
    corpus_word_counts = {}
    corpus_pos_counts = {}
    per_document = []

    for document_name, paragraph_texts in tqdm(files_data.items(), desc="统计文件数据"):
        document_word_total = 0
        document_vocabulary = set()
        document_pos_counts = {}

        for text in paragraph_texts:
            tokens = text.split()
            document_word_total += len(tokens)
            document_vocabulary.update(tokens)

            # corpus-wide occurrence counts
            for token in tokens:
                corpus_word_counts[token] = corpus_word_counts.get(token, 0) + 1

            # POS tags count towards both the file and the corpus
            for _, tag in pos_tag_text(text):
                document_pos_counts[tag] = document_pos_counts.get(tag, 0) + 1
                corpus_pos_counts[tag] = corpus_pos_counts.get(tag, 0) + 1

        corpus_word_total += document_word_total
        corpus_vocabulary.update(document_vocabulary)
        per_document.append({
            "document_id": document_name,
            "total_words": document_word_total,
            "total_types": len(document_vocabulary),
            "pos_distribution": document_pos_counts,
            "unique_words": list(document_vocabulary),
        })

    # sets become lists so the result can be dumped as JSON
    return {
        "total_words": corpus_word_total,
        "unique_words": list(corpus_vocabulary),
        "word_frequencies": corpus_word_counts,
        "pos_distribution": corpus_pos_counts,
        "document_statistics": per_document,
    }

# POS tagging function
def pos_tag_text(text):
    """
    Tag ``text`` with the module-level Stanza pipeline ``nlp``.

    Parameters:
        text (str): Text to analyse.

    Returns:
        list[tuple]: One ``(word.text, word.upos)`` pair per word, sentence by
        sentence, in pipeline order.
    """
    pairs = []
    for sentence in nlp(text).sentences:
        pairs.extend((word.text, word.upos) for word in sentence.words)
    return pairs

# Step 3: save the statistics to files
def save_statistics_to_files(statistics, json_path, csv_word_freq_path, csv_pos_stats_path):
    """
    Persist the statistics as one JSON file and two CSV tables.

    Parameters:
        statistics (dict): JSON-serializable statistics containing the dicts
            ``"word_frequencies"`` and ``"pos_distribution"``.
        json_path (str): Path of the JSON file holding the full statistics.
        csv_word_freq_path (str): Path of the ``Word,Frequency`` CSV file.
        csv_pos_stats_path (str): Path of the ``POS Tag,Frequency`` CSV file.
    """
    def _write_frequency_table(path, first_column, table_key):
        # Header first, then one row per entry of statistics[table_key].
        with open(path, 'w', encoding='utf-8', newline='') as out:
            table = csv.writer(out)
            table.writerow([first_column, "Frequency"])
            table.writerows(statistics[table_key].items())

    # Full statistics as JSON
    with open(json_path, 'w', encoding='utf-8') as json_out:
        json.dump(statistics, json_out, ensure_ascii=False, indent=4)
    print(f"统计信息已保存到 JSON 文件: {json_path}")

    # Word frequencies as CSV
    _write_frequency_table(csv_word_freq_path, "Word", "word_frequencies")
    print(f"词频统计已保存到 CSV 文件: {csv_word_freq_path}")

    # POS distribution as CSV
    _write_frequency_table(csv_pos_stats_path, "POS Tag", "pos_distribution")
    print(f"POS 统计已保存到 CSV 文件: {csv_pos_stats_path}")

# Main flow: load the lemmatized files, compute the statistics, then save them.
# Each of the two later steps only runs after the user confirms with "y".
# NOTE(review): input() and exit() make this cell interactive, so it cannot complete
# in an unattended "Run All"; exit() is also expected to end the whole session rather
# than just this cell — confirm that this is intended.
if __name__ == "__main__":
    # Step 1: load the files
    print("开始加载所有 lemmatized 文件...")
    files_data = load_all_lemmatized_files(lemmatized_dir)
    print(f"已加载 {len(files_data)} 个文件，共包含 {sum(len(v) for v in files_data.values())} 个段落。")
    # Shows the first (file name, paragraphs) pair in full as a sample
    print(f"示例文件内容: {list(files_data.items())[0] if files_data else '无文件'}")

    # Ask whether to continue
    if input("是否继续统计词频和 POS 数据？(y/n): ").strip().lower() != 'y':
        print("已退出程序。")
        exit()

    # Step 2: compute the statistics
    print("开始统计词频和 POS 分布...")
    overall_statistics = calculate_statistics_for_all(files_data)
    print(f"总词数: {overall_statistics['total_words']}")
    print(f"独特词数: {len(overall_statistics['unique_words'])}")

    # Ask whether to save
    if input("是否保存统计结果到文件？(y/n): ").strip().lower() != 'y':
        print("已退出程序。")
        exit()

    # Step 3: save the results
    print("开始保存统计结果...")
    save_statistics_to_files(overall_statistics, json_path, csv_word_freq_path, csv_pos_stats_path)
    print("所有步骤已完成！")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 11.4MB/s]                    
2024-11-22 15:26:45 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-22 15:26:45 INFO: Downloading default packages for language: la (Latin) ...
2024-11-22 15:26:45 INFO: File exists: /Users/jessie/stanza_resources/la/default.zip
2024-11-22 15:26:47 INFO: Finished downloading models and saved to /Users/jessie/stanza_resources
2024-11-22 15:26:47 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 47.0MB/s]                    
2024-11-22 15:26:47 INFO: Downloaded file to /Users/jessie/stanza_resources/resources.json
2024-11-22 15:26:47 INFO: Loading these model

开始加载所有 lemmatized 文件...
已加载 306 个文件，共包含 4515 个段落。
示例文件内容: ('v160_007_lemmatized.txt', ['remitto pecco diligo mare peccatrix peccator sordes tenebrae circumvoluta sanctus spiritus receptaculis purus caelum1 gaudium gloriosissimo triumphus proveho diligo careo operio multitudo peccator petri publicus peccatrix lucas nimietas peccator superbus pharisaeo contemno christus gratia sumo diligo mulier secundus marcus1 septem daemonium christus eicio secundus gregorium homilia universus peccator genus reconcilio deus diligo christus verus samaritanus vulnero mariam vinum oleum lavo sanitas restituo ezechielis conculcaris sanguis sanguis vivo paucus expando amicio operio ignominia lavo aqua emundo sanguis ungo oleum vestio discoloribus mare magdalena peccatrix primus pecco effectus diligo secundus dilectio dimitto pecco gratia remissio brevis tres pecco dilectio dimissio', 'secundus ambrosius1 peccatum praevaricatio lex divina caelestis inoboedio mandator secundus augustinum sperno incommutabili

统计文件数据: 100%|██████████| 306/306 [04:13<00:00,  1.21it/s]


总词数: 281540
独特词数: 16959
开始保存统计结果...
统计信息已保存到 JSON 文件: results/preprocessing_result/cltk_statistics_summary.json
词频统计已保存到 CSV 文件: results/preprocessing_result/cltk_word_frequencies.csv
POS 统计已保存到 CSV 文件: results/preprocessing_result/cltk_pos_statistics.csv
所有步骤已完成！
