In [None]:
import os
from bs4 import BeautifulSoup
from tqdm import tqdm
import csv
from collections import Counter
import requests
import json
import stanza
import re

In [None]:
# Set working directory to project root.
# The root can be overridden through the CUSANUS_PROJECT_ROOT environment variable so the
# notebook also runs on machines where the default path below does not exist; without the
# variable the behaviour is unchanged.
project_root = os.environ.get(
    'CUSANUS_PROJECT_ROOT',
    '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling',
)
os.chdir(project_root)
print("Current working directory: ", os.getcwd())

In [3]:
# Directories used by the notebook. All paths are relative, so they are resolved against
# the current working directory.
input_dir = 'data/h_variant'
output_dir = 'data/processed/h_variant'
statistics_dir = 'data/statistics'

# Output files are derived from the directories above so that changing a directory
# cannot leave a file path pointing at the old location. '/' is used (rather than
# os.path.join) to keep the resulting strings identical on every platform.
lemmatized_output_path = f'{output_dir}/h_lemmatized.json'
pos_statistics_path = f'{statistics_dir}/h_statistics.csv'
word_freq_statistics_path = f'{statistics_dir}/h_word_frequency.csv'

In [4]:
# Create every directory the notebook writes into. statistics_dir has to exist as well:
# the statistics CSV files are opened for writing inside it and open() does not create
# missing parent directories.
for directory in (output_dir, statistics_dir):
    os.makedirs(directory, exist_ok=True)

In [5]:
# Load Latin stopwords from URL
stopwords_url = 'https://raw.githubusercontent.com/aurelberra/stopwords/master/stopwords_latin.txt'
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a stopword list.
response = requests.get(stopwords_url, timeout=30)
response.raise_for_status()
response.encoding = 'utf-8'
# One stopword per non-empty line. Lines starting with '#' are treated as comments and
# skipped. NOTE(review): this is harmless if the list contains no such lines, since a
# token beginning with '#' can never match a word.
latin_stopwords = {
    line.strip()
    for line in response.text.splitlines()
    if line.strip() and not line.lstrip().startswith('#')
}

# Add additional stopwords
additional_stopwords = {
    'ego', 'mei', 'mihi', 'me', 'tu', 'tui', 'tibi', 'te',
    'nos', 'noster', 'nobis', 'vos', 'vester',
    'sui', 'sibi', 'se',
    'ab', 'ex', 'ad', 'in', 'de', 'per', 'cum', 'sub', 'pro',
    'ante', 'post', 'supra', 'et', 'ac', 'aut', 'nec', 'sed',
    'ut', 'si', 'atque', 'qui', 'quae', 'quod', 'quis', 'quid', 'non', 'ne'
}
latin_stopwords.update(additional_stopwords)

In [None]:
# Set up the Stanza pipeline for Latin ('la').
stanza.download(lang='la')  # fetch the Latin model
nlp = stanza.Pipeline(lang='la')

In [32]:
# Build the lookup table {id_lemma -> normalised lemma} from the external lemma list.
lemma_mapping_path = os.path.join(project_root, 'data/external/lemma.xml')
lemma_mapping = {}
with open(lemma_mapping_path, 'r', encoding='utf-8') as lemma_file:
    lemma_soup = BeautifulSoup(lemma_file, 'lxml')
    for lemma_entry in lemma_soup.find_all('lemma'):
        lemma_id = lemma_entry.get('id_lemma')
        # A <lemma> element without a `name` attribute yields None; fall back to an
        # empty string so the entry is skipped below instead of raising AttributeError.
        lemma_name = lemma_entry.get('name') or ''
        # Keep only the headword, i.e. the text before the first comma.
        lemma_value = lemma_name.split(',')[0].lower()
        # Remove anything enclosed in square brackets or parentheses.
        lemma_value = re.sub(r'[\[\(].*?[\]\)]', '', lemma_value)
        # Strip only after the bracket removal: "amor (1)" must become "amor", not
        # "amor " -- a trailing space would defeat the stopword lookup later on.
        lemma_value = lemma_value.strip()
        if lemma_id and lemma_value:
            lemma_mapping[lemma_id] = lemma_value

In [17]:
# Stopword filter applied to a list of words
def preprocess_text(words):
    """Return the entries of ``words`` whose lowercase form is not in ``latin_stopwords``.

    The relative order and the original casing of the kept words are preserved.
    """
    kept = []
    for token in words:
        if token.lower() in latin_stopwords:
            continue
        kept.append(token)
    return kept

In [33]:
def _lemma_for_token(w):
    """Return the lemma for a single ``<w>`` element, or ``None`` if the token is skipped."""
    # Surface form: the `rend` attribute when present, otherwise the element text.
    # Surrounding whitespace is stripped so it can neither end up in the output nor
    # prevent the stopword lookup from matching.
    original_word = w.get('rend', w.get_text()).strip().lower()

    # Skip words that contain German letters.
    if re.search(r'[äöüß]', original_word):
        return None

    # Skip `cum` variants that only carry trailing symbols or digits.
    if re.match(r'^cum\W*\d*$', original_word):
        return None

    # Lemmatisation: look the `lemma_l` attribute up in the mapping table and fall
    # back to the surface form when there is no entry for it.
    lemma_l = w.get('lemma_l', '').strip().lower()
    lemma = lemma_mapping.get(lemma_l, original_word)

    # An empty lemma carries no information; report it as skipped.
    return lemma or None


def process_file(file_path):
    """Split one XML file into sections and return their lemmatised, filtered text.

    Every ``<fw type="n">`` element starts a section. The ``<p>`` siblings that follow
    it, up to the next ``<fw>`` sibling of any type, are reduced to their ``<w>``
    tokens; each token is lemmatised, stopwords are removed with ``preprocess_text``
    and the remaining words are joined with single spaces.

    Args:
        file_path: Path of the XML file to read (opened as UTF-8).

    Returns:
        A list with one dict per ``<fw type="n">`` element, in document order, of the
        form ``{'content': <space-separated lemmas>, 'topic': None}``. A section
        without any remaining words has an empty ``content`` string.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    paragraphs = []

    # Walk over all matching `fw` tags to split the document into sections.
    for fw_tag in soup.find_all('fw', {'type': 'n'}):
        section_content = []
        next_sibling = fw_tag.find_next_sibling()

        # Continue until the next `fw` tag (or the end of the sibling list).
        while next_sibling and next_sibling.name != 'fw':
            if next_sibling.name == 'p':
                # Lemmatise every `w` token of the paragraph, dropping skipped ones.
                words = [
                    lemma
                    for lemma in map(_lemma_for_token, next_sibling.find_all('w'))
                    if lemma
                ]

                # Remove stopwords; paragraphs left without words are not added, so
                # they cannot introduce stray spaces into the section text.
                filtered_words = preprocess_text(words)
                if filtered_words:
                    section_content.append(' '.join(filtered_words))

            next_sibling = next_sibling.find_next_sibling()

        # Merge the section's paragraphs into one entry.
        paragraphs.append({'content': ' '.join(section_content), 'topic': None})

    return paragraphs

In [34]:
# Persist the collected data as a JSON file
def save_to_json(output_path, data):
    """Serialise ``data`` as JSON into the file at ``output_path``.

    The file is written as UTF-8, non-ASCII characters are kept as they are and
    nested structures are indented by four spaces.
    """
    with open(output_path, mode='w', encoding='utf-8') as sink:
        json.dump(obj=data, fp=sink, indent=4, ensure_ascii=False)

In [35]:
# Run counters (files seen / files that raised an error) and the list that collects
# the processed documents.
processing_summary = dict(total_files=0, failed_files=0)
all_documents = list()


In [None]:
# Stopword removal and lemmatisation for every matching input file.

# Start from a clean slate so that re-running this cell neither double-counts files nor
# appends the same documents to all_documents a second time.
processing_summary['total_files'] = 0
processing_summary['failed_files'] = 0
all_documents.clear()

# os.listdir gives no ordering guarantee; sorting keeps the document order reproducible.
for file_name in tqdm(sorted(os.listdir(input_dir)), desc="处理文件"):
    if not (file_name.endswith('.xml') and file_name.startswith('h')):
        continue

    input_path = os.path.join(input_dir, file_name)
    processing_summary['total_files'] += 1

    try:
        # Run the per-file processing.
        paragraphs = process_file(input_path)

        # Collect the document for the combined JSON output.
        document_data = {
            "document_id": file_name,
            "paragraphs": paragraphs
        }
        all_documents.append(document_data)

        # Write the processed text to a .txt file. Only the trailing extension is
        # swapped; str.replace would rewrite every '.xml' occurring in the name.
        output_file_name = file_name[:-len('.xml')] + '_processed.txt'
        output_path = os.path.join(output_dir, output_file_name)
        with open(output_path, 'w', encoding='utf-8') as output_file:
            for idx, paragraph in enumerate(paragraphs, start=1):
                output_file.write(f"Paragraph {idx}:\n")
                output_file.write(f"{paragraph['content']}\n\n")

    except Exception as e:
        # Best effort: report the failing file, count it and carry on with the rest.
        print(f"处理文件 {file_name} 时发生错误: {str(e)}")
        processing_summary['failed_files'] += 1

# Save the preprocessing result of all documents to JSON.
save_to_json(lemmatized_output_path, {
    "processing_summary": processing_summary,
    "documents": all_documents
})

# Report the path that was actually written (the message used to name a different file).
print(f"去除停用词和词形还原步骤完成，已保存结果到 {lemmatized_output_path}")

# Pause so the user can inspect the processed files before the POS step.
# NOTE(review): input() needs an interactive stdin, so this line blocks a headless run.
input("请检查处理后的文件是否符合要求，按下 Enter 键继续进行 POS 标注和统计工作...")

In [37]:
# POS tagging with the Stanza pipeline
def pos_tag_text(text):
    """Run the ``nlp`` pipeline on ``text`` and return ``(word text, UPOS tag)`` pairs.

    The pairs follow the order of the sentences, and of the words within each
    sentence, in the document returned by the pipeline.
    """
    tagged = []
    for sentence in nlp(text).sentences:
        for token in sentence.words:
            tagged.append((token.text, token.upos))
    return tagged

In [38]:
# Step 5: Calculate token, type and POS statistics (nothing is written to disk here)
def calculate_statistics(paragraphs):
    """Count words, distinct words and POS tags over a list of paragraphs.

    Args:
        paragraphs: Iterable of dicts whose ``'content'`` value is a string of
            whitespace-separated words.

    Returns:
        A tuple ``(total_words, total_types, word_freq, pos_freq)`` where
        ``total_words`` is the number of whitespace-separated words,
        ``total_types`` the number of distinct words, ``word_freq`` a dict mapping
        each word to its count and ``pos_freq`` a dict mapping each POS tag
        returned by ``pos_tag_text`` to its count.
    """
    word_counts = Counter()
    pos_counts = Counter()

    # Progress bar over the paragraphs (POS tagging and frequency counting).
    for paragraph in tqdm(paragraphs, desc="POS标注和统计词频进度"):
        content = paragraph['content']
        words = content.split()
        if not words:
            # Nothing to count, and no reason to invoke the tagger on empty text.
            continue
        word_counts.update(words)

        # POS tagging for the paragraph; only the tag of each pair is counted.
        pos_counts.update(pos for _, pos in pos_tag_text(content))

    total_words = sum(word_counts.values())
    # The distinct words are exactly the keys of the frequency table.
    total_types = len(word_counts)

    # Plain dicts are returned so callers receive the same types as before.
    return total_words, total_types, dict(word_counts), dict(pos_counts)

In [None]:
# POS tagging and statistics across all documents
all_word_freq = {}
all_pos_freq = {}
total_words_all = 0
total_types_all = set()

for document in tqdm(all_documents, desc="统计文档进度"):
    doc_paragraphs = document['paragraphs']
    # NOTE(review): total_paragraphs (like total_types below) is computed but not read
    # by any cell visible in this notebook.
    total_paragraphs = len(doc_paragraphs)
    total_words, total_types, word_freq, pos_freq = calculate_statistics(doc_paragraphs)

    total_words_all += total_words
    total_types_all.update(word_freq)  # iterating a dict yields its keys

    # Fold this document's word and POS counts into the corpus-wide tables.
    for combined, per_document in ((all_word_freq, word_freq), (all_pos_freq, pos_freq)):
        for key, count in per_document.items():
            combined[key] = combined.get(key, 0) + count

In [None]:
# Write the statistics CSV: a header row followed by one aggregate row.
with open(pos_statistics_path, 'w', encoding='utf-8', newline='') as csv_file:
    summary_writer = csv.writer(csv_file)

    header_row = ['Document ID', 'Total Paragraphs', 'Total Words', 'Total Types', 'POS Frequencies']
    summary_writer.writerow(header_row)

    # Statistics for the whole corpus. The paragraph column stays empty because no
    # paragraph count is reported for all documents together; the POS counts are
    # stored as a JSON string in the last column.
    corpus_row = [
        'ALL DOCUMENTS',
        '',
        total_words_all,
        len(total_types_all),
        json.dumps(all_pos_freq),
    ]
    summary_writer.writerow(corpus_row)

In [None]:
# Save the frequency of every word and the count of every POS tag to a separate CSV file.
with open(word_freq_statistics_path, 'w', encoding='utf-8', newline='') as csv_file:
    freq_writer = csv.writer(csv_file)

    # First table: one row per word.
    freq_writer.writerow(['Word', 'Frequency'])
    freq_writer.writerows(all_word_freq.items())

    # An empty row separates the two tables.
    freq_writer.writerow([])

    # Second table: one row per POS tag.
    freq_writer.writerow(['POS Tag', 'Frequency'])
    freq_writer.writerows(all_pos_freq.items())

print("POS 标注和统计步骤完成，结果已保存到统计文件中。")