In [3]:
import os
import re
import json
import csv
from bs4 import BeautifulSoup
from tqdm import tqdm
import spacy
import logging
from collections import Counter
import pandas as pd

# 设置工作目录
project_root = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
os.chdir(project_root)
print("Current working directory: ", os.getcwd())

# 设置日志
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# 设置路径
project_root = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
input_dir = os.path.join(project_root, 'data/h_variant')
output_dir = os.path.join(project_root, 'experiments/lda/cusanus/preprocessed')
stats_dir = os.path.join(output_dir, 'stats')

os.makedirs(output_dir, exist_ok=True)
os.makedirs(stats_dir, exist_ok=True)

# 加载spacy模型
try:
    nlp = spacy.load('la_core_web_lg')
    logging.info("成功加载Spacy拉丁语模型")
except:
    logging.error("无法加载Spacy拉丁语模型，请确保已安装")
    raise

# 加载停用词
with open('data/external/stopwords_latin.txt', 'r', encoding='utf-8') as f:
    LATIN_STOPS = set(f.read().splitlines())
logging.info(f"成功加载 {len(LATIN_STOPS)} 个停用词")

# 加载lemma映射
lemma_mapping_path = os.path.join(project_root, 'data/external/lemma.xml')
lemma_mapping = {}

with open(lemma_mapping_path, 'r', encoding='utf-8') as lemma_file:
    lemma_soup = BeautifulSoup(lemma_file, 'lxml')
    for lemma_entry in lemma_soup.find_all('lemma'):
        lemma_id = lemma_entry.get('id_lemma')
        lemma_name = lemma_entry.get('name')

        # Improved extraction logic for lemma name
        if lemma_name:
            # Step 1: Remove any leading characters like "*" or other notes
            lemma_name = re.sub(r'^\*.*?\s', '', lemma_name).strip()

            # Step 2: If there are parentheses, prioritize the content before them
            if '(' in lemma_name:
                lemma_value = lemma_name.split('(')[0].strip().lower()
            else:
                lemma_value = lemma_name.strip().lower()

            # Step 3: Handle multi-word lemmas by selecting the first valid word
            if lemma_value:
                lemma_value_parts = lemma_value.split()
                if lemma_value_parts:
                    lemma_value = lemma_value_parts[0]

            # Step 4: Remove grammatical category suffixes
            lemma_value = re.split(r'\b(?:cj\.|adv\.|praep\.|f\.|m\.|n\.|pl\.|sg\.|dat\.|acc\.|nom\.|gen\.|abl\.)\b', lemma_value)[0].strip()

            # Step 5: Remove unnecessary descriptive content
            lemma_value = re.sub(r'\b(?:provincia|region|place|saec\.|asia minor|africa|italia|hispania)\b.*', '', lemma_value).strip()

            # Step 6: Ensure the final lemma value is valid and has an ID
            if lemma_id and lemma_value:
                lemma_mapping[lemma_id] = lemma_value

logging.info(f"成功加载 {len(lemma_mapping)} 个lemma映射")

class CusanusPreprocessor:
    def __init__(self):
        self.nlp = nlp
        
        # 统计数据
        self.word_freq = Counter()
        self.pos_stats = Counter()
        self.word_pos = {}
        self.total_paragraphs = 0
        self.total_words = 0
        
        # 加载停用词
        self.stopwords = LATIN_STOPS
        
        # 词性过滤配置
        self.VALID_POS = {'NOUN', 'ADJ', 'VERB', 'PROPN'}
        
        # 加载lemma映射
        self.lemma_mapping = lemma_mapping
        
    def process_word(self, word_tag):
        """处理单个词标签"""
        try:
            # 获取原始词和lemma_id
            original_word = word_tag.get('rend', word_tag.get_text())
            lemma_id = word_tag.get('lemma_l', '').lower()
            
            # 清理词形
            word = original_word.lower()
            word = re.sub(r'\d+$', '', word)
            word = re.sub(r'[^a-zāēīōūȳĀĒĪŌŪȲ]', '', word)
            
            # 跳过德语词
            if re.search(r'[äöüß]', word) or re.match(r'^cum\W*\d*$', word):
                return None
            
            if not word or word in self.stopwords:
                return None
            
            # 词性标注
            doc = self.nlp(word)
            if not doc:
                return None
            
            token = doc[0]
            pos = token.pos_
            
            # 更新词性统计
            self.pos_stats[pos] += 1
            
            # 过滤无效词性
            if pos not in self.VALID_POS:
                return None
            
            # 使用Cusanus lemma映射或spacy lemma
            if lemma_id in self.lemma_mapping:
                lemma = self.lemma_mapping[lemma_id]
            else:
                lemma = token.lemma_
            
            if lemma:
                self.word_freq[lemma] += 1
                self.word_pos[lemma] = pos
                self.total_words += 1
                return lemma
                
        except Exception as e:
            logging.warning(f"处理词 '{word}' 时出错: {str(e)}")
            return None
            
        return None

    def process_file(self, file_path, output_path):
        """处理单个文件"""
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')
            paragraphs = []
            
            for fw_tag in soup.find_all('fw', {'type': 'n'}):
                section_content = []
                next_sibling = fw_tag.find_next_sibling()
                
                while next_sibling and next_sibling.name != 'fw':
                    if next_sibling.name == 'p':
                        words = []
                        for w in next_sibling.find_all('w'):
                            processed = self.process_word(w)
                            if processed:
                                words.append(processed)
                        
                        if words:
                            section_content.append(' '.join(words))
                    
                    next_sibling = next_sibling.find_next_sibling()
                
                if section_content:
                    paragraphs.append({'content': ' '.join(section_content)})
                    self.total_paragraphs += 1
            
            # 保存处理后的文本
            with open(output_path, 'w', encoding='utf-8') as f:
                for idx, paragraph in enumerate(paragraphs, 1):
                    if paragraph['content'].strip():
                        f.write(f"Paragraph {idx}:\n")
                        f.write(f"{paragraph['content']}\n\n")
            
            return paragraphs

    def save_statistics(self, stats_dir):
        """保存统计信息"""
        stats = {
            'total_words': self.total_words,
            'unique_words': len(self.word_freq),
            'total_paragraphs': self.total_paragraphs,
            'pos_distribution': dict(self.pos_stats)
        }
        
        # 保存基本统计信息
        stats_file = os.path.join(stats_dir, 'cusanus_statistics_summary.json')
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, ensure_ascii=False, indent=2)
        
        # 保存词频统计
        word_freq_df = pd.DataFrame([
            {'word': word, 'frequency': freq, 'pos': self.word_pos.get(word, 'UNKNOWN')}
            for word, freq in self.word_freq.most_common()
        ])
        word_freq_file = os.path.join(stats_dir, 'cusanus_word_frequencies.csv')
        word_freq_df.to_csv(word_freq_file, index=False, encoding='utf-8')
        
        # 保存词性统计
        pos_stats_file = os.path.join(stats_dir, 'cusanus_pos_statistics.csv')
        pd.DataFrame(self.pos_stats.most_common(), columns=['pos', 'frequency']).to_csv(
            pos_stats_file, index=False, encoding='utf-8'
        )
        
        logging.info(f"统计信息已保存到 {stats_dir}")
        logging.info(f"总词数: {self.total_words}")
        logging.info(f"独特词数: {len(self.word_freq)}")
        logging.info(f"总段落数: {self.total_paragraphs}")
        logging.info("词性分布:")
        for pos, count in self.pos_stats.most_common():
            logging.info(f"  {pos}: {count}")

def main():
    processor = CusanusPreprocessor()
    
    # 获取所有XML文件
    xml_files = [f for f in os.listdir(input_dir) if f.endswith('.xml')]
    logging.info(f"找到 {len(xml_files)} 个XML文件")
    
    # 处理所有文件
    for xml_file in tqdm(xml_files, desc="处理文件"):
        input_file = os.path.join(input_dir, xml_file)
        output_file = os.path.join(output_dir, xml_file.replace('.xml', '_lemmatized.txt'))
        
        try:
            paragraphs = processor.process_file(input_file, output_file)
            logging.info(f"处理文件 {xml_file}: 提取了 {len(paragraphs)} 个段落")
        except Exception as e:
            logging.error(f"处理文件 {xml_file} 时出错: {str(e)}")
            continue
    
    # 保存统计信息
    processor.save_statistics(stats_dir)

if __name__ == "__main__":
    main()

Current working directory:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


2024-11-24 19:44:09,675 - INFO - 成功加载Spacy拉丁语模型
2024-11-24 19:44:09,683 - INFO - 成功加载 4022 个停用词
2024-11-24 19:44:09,870 - INFO - 成功加载 11784 个lemma映射
2024-11-24 19:44:09,873 - INFO - 找到 308 个XML文件
处理文件:   0%|          | 0/308 [00:00<?, ?it/s]2024-11-24 19:44:11,656 - INFO - 处理文件 h180_134.xml: 提取了 10 个段落
处理文件:   0%|          | 1/308 [00:01<08:53,  1.74s/it]2024-11-24 19:44:12,672 - INFO - 处理文件 h170_075.xml: 提取了 5 个段落
处理文件:   1%|          | 2/308 [00:02<06:42,  1.31s/it]2024-11-24 19:44:17,932 - INFO - 处理文件 h170_061.xml: 提取了 26 个段落
处理文件:   1%|          | 3/308 [00:08<15:50,  3.12s/it]2024-11-24 19:44:22,776 - INFO - 处理文件 h170_049.xml: 提取了 12 个段落
处理文件:   1%|▏         | 4/308 [00:12<19:14,  3.80s/it]2024-11-24 19:44:27,438 - INFO - 处理文件 h190_281.xml: 提取了 25 个段落
处理文件:   2%|▏         | 5/308 [00:17<20:45,  4.11s/it]2024-11-24 19:44:31,436 - INFO - 处理文件 h190_256.xml: 提取了 12 个段落
处理文件:   2%|▏         | 6/308 [00:21<20:29,  4.07s/it]2024-11-24 19:44:32,055 - INFO - 处理文件 h170_101.xml: 提取了 5 个段落
处理