In [1]:
import os
import spacy
import pandas as pd
import numpy as np
from gensim import corpora, models
from sklearn.model_selection import KFold
from collections import Counter
from tqdm import tqdm
import logging
import re
from datetime import datetime

# Working directory and paths.
# The project root can be overridden with the CUSANUS_BASE_DIR environment
# variable so the notebook is not tied to a single machine; the original
# absolute path is kept as the default.
base_dir = os.environ.get(
    "CUSANUS_BASE_DIR",
    "/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling",
)
os.chdir(base_dir)

# Logging: one timestamped log file per run, mirrored to the cell output.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_dir = 'experiments/lda/spacy/results'
os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so logged corpus tokens are written consistently
        # with how the corpus files themselves are read.
        logging.FileHandler(f'{log_dir}/lda_experiment_{timestamp}.log', encoding='utf-8'),
        logging.StreamHandler()
    ],
    # basicConfig() does nothing when the root logger already has handlers.
    # force=True replaces them, so re-running this cell really switches to the
    # log file that matches the new `timestamp` used for the CSV outputs.
    force=True,
)

In [2]:
def prepare_corpus_and_dictionary():
    """Build the gensim dictionary and bag-of-words corpus from the preprocessed files.

    Every ``.txt`` file in ``experiments/lda/spacy/preprocessed`` is split on
    ``Paragraph N:`` markers, and each non-empty paragraph becomes one
    document. Paragraphs are tokenised with spaCy; tokens are lower-cased and
    dropped when they are stop words, punctuation, contain a digit, or consist
    only of punctuation characters. Afterwards only words whose total
    frequency over all paragraphs lies in ``[2, 200]`` are kept.

    Side effects:
        Writes ``paragraph_info_<timestamp>.csv`` into the module-level
        ``log_dir``.

    Returns:
        tuple: ``(dictionary, corpus, filtered_texts, paragraph_info)``.
        ``corpus[i]``, ``filtered_texts[i]`` and ``paragraph_info[i]`` all
        refer to the same paragraph. ``paragraph_info[i]['words']`` holds the
        tokens *before* frequency filtering, so it can be longer than
        ``filtered_texts[i]`` (which may even be empty).
    """
    logging.info("Preparing corpus and dictionary...")

    # 1. Load the spaCy model.
    # NOTE(review): the first attempt passes a wheel file path to spacy.load;
    # confirm this is a loadable model location, otherwise the fallback below
    # is what actually runs.
    try:
        nlp = spacy.load('models/la_core_web_lg-0.1.0-py3-none-any.whl')
    except Exception:
        # `except Exception` instead of a bare `except`, so that
        # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
        logging.info("Trying to load la_core_web_lg model directly...")
        nlp = spacy.load('la_core_web_lg')

    # 2. Directory holding the preprocessed text files.
    source_dir = 'experiments/lda/spacy/preprocessed'

    def is_valid_word(word):
        """Return True for a non-blank word that has no digit and is not pure punctuation."""
        return (not bool(re.search(r'\d', word)) and
                not all(char in '.,;:!?"\'()[]{}' for char in word) and
                len(word.strip()) > 0)

    # 3. Tokenise every paragraph and collect corpus-wide word frequencies.
    word_stats = Counter()
    all_texts = []
    paragraph_info = []  # per-paragraph metadata, index-aligned with all_texts

    logging.info("Processing documents and building vocabulary...")
    # sorted(): os.listdir() returns entries in arbitrary order. A fixed order
    # keeps document indices stable between runs, which matters because
    # run_lda_experiment splits the corpus by position with a seeded KFold.
    for file in tqdm(sorted(os.listdir(source_dir))):
        if not file.endswith('.txt'):
            continue

        with open(os.path.join(source_dir, file), 'r', encoding='utf-8') as f:
            content = f.read()

        # Split on the paragraph markers; element 0 is whatever precedes the
        # first marker and is skipped.
        paragraphs = re.split(r'Paragraph \d+:\n', content)[1:]

        for para_idx, para in enumerate(paragraphs, 1):
            para = para.strip()
            if not para:  # ignore empty paragraphs
                continue

            paragraph_words = []
            for token in nlp(para):
                if token.is_stop or token.is_punct:
                    continue
                lowered = token.text.lower()  # lower-case once per token
                if is_valid_word(lowered):
                    paragraph_words.append(lowered)

            if not paragraph_words:  # nothing usable left in this paragraph
                continue

            all_texts.append(paragraph_words)
            word_stats.update(paragraph_words)
            paragraph_info.append({
                'file': file,
                'paragraph_idx': para_idx,
                'text': para,
                'words': paragraph_words
            })

    # 4. Keep only words that occur between 2 and 200 times overall.
    kept_vocabulary = {word for word, count in word_stats.items()
                       if 2 <= count <= 200}

    # 5. Restrict every document to the kept vocabulary. Documents are never
    #    removed here, so indices stay aligned with paragraph_info.
    filtered_texts = [[word for word in text if word in kept_vocabulary]
                      for text in all_texts]

    # 6. Build the dictionary and the bag-of-words corpus.
    dictionary = corpora.Dictionary(filtered_texts)
    corpus = [dictionary.doc2bow(text) for text in filtered_texts]

    logging.info(f"Total documents: {len(corpus)}")
    logging.info(f"Total paragraphs: {len(paragraph_info)}")
    logging.info(f"Vocabulary size: {len(dictionary)}")

    # 7. Persist the paragraph metadata next to the other run artefacts.
    pd.DataFrame(paragraph_info).to_csv(
        f'{log_dir}/paragraph_info_{timestamp}.csv',
        index=False
    )

    return dictionary, corpus, filtered_texts, paragraph_info

def normalize_npmi(npmi):
    """Linearly rescale an NPMI score from [-1, 1] onto [0, 1].

    No clamping is applied: an input outside [-1, 1] maps outside [0, 1].
    """
    shifted = npmi + 1  # move the range from [-1, 1] to [0, 2]
    return shifted / 2

def normalize_diversity(diversity):
    """Clamp a diversity score so that it lies within [0, 1]."""
    capped = min(1, diversity)  # enforce the upper bound first
    return max(0, capped)       # then the lower bound

def evaluate_model(model, corpus, dictionary, texts):
    """Score an LDA model by normalised NPMI coherence and topic diversity.

    Args:
        model: Trained gensim LDA model.
        corpus: Not used by the evaluation; kept so existing callers keep working.
        dictionary: gensim dictionary handed to the coherence model.
        texts: Tokenised documents the ``c_npmi`` coherence is estimated on.

    Returns:
        tuple[float, float]: ``(npmi_normalized, diversity)``, both in [0, 1].
        ``npmi_normalized`` is the coherence rescaled from [-1, 1];
        ``diversity`` is the share of unique words among the top words of all
        topics. If anything raises, the error is logged and ``(0.0, 0.0)`` is
        returned so that a single failing evaluation does not stop the caller.
    """
    try:
        # NPMI coherence of the model's topics on the given texts.
        coherence_model = models.coherencemodel.CoherenceModel(
            model=model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_npmi'
        )
        npmi = coherence_model.get_coherence()

        # Treat non-finite coherence as the worst possible value, then clamp
        # to the theoretical NPMI range before rescaling to [0, 1].
        if np.isinf(npmi) or np.isnan(npmi):
            npmi = -1.0
        npmi = max(-1.0, min(1.0, npmi))
        npmi_normalized = normalize_npmi(npmi)

        # Topic diversity: unique top words / all top words.
        # num_topics=-1 requests every topic. The default (10) would return
        # only an arbitrary subset for models with more than 10 topics, so the
        # diversity of e.g. a 15-topic model would ignore 5 topics.
        topics = model.show_topics(num_topics=-1, formatted=False)
        unique_words = set()
        total_words = 0
        for _, topic in topics:
            words = [w for w, _ in topic]
            unique_words.update(words)
            total_words += len(words)

        # No top words at all means no measurable diversity; report 0.0
        # instead of discarding the NPMI through a ZeroDivisionError.
        ratio = len(unique_words) / total_words if total_words else 0.0
        diversity = normalize_diversity(ratio)

        logging.info(f"Evaluation metrics - NPMI: {npmi_normalized:.4f}, Diversity: {diversity:.4f}")

        return float(npmi_normalized), float(diversity)

    except Exception as e:
        # Deliberate best-effort: log with traceback and fall back to the
        # lowest scores.
        logging.error(f"Error in evaluate_model: {str(e)}", exc_info=True)
        return 0.0, 0.0

In [3]:
def _topic_word_rows(model, experiment_id, fold, num_topics, alpha, eta):
    """Return one record per topic of ``model`` with its top words and their probabilities."""
    rows = []
    # num_topics=-1 returns every topic; the default (10) would silently drop
    # topics for models with more than 10 topics.
    for topic_id, topic in model.show_topics(num_topics=-1, formatted=False):
        rows.append({
            'experiment_id': experiment_id,
            'fold': fold,
            'num_topics': num_topics,
            'alpha': str(alpha),
            'eta': str(eta),
            'topic_id': topic_id,
            'words': ', '.join([word for word, _ in topic]),
            'word_probs': ', '.join([f"{prob:.4f}" for _, prob in topic])
        })
    return rows


def _paragraph_topic_rows(model, val_corpus, val_idx, paragraph_info, experiment_id, fold):
    """Return one record per validation paragraph with its dominant topic and topic distribution."""
    rows = []
    for bow, paragraph_id in zip(val_corpus, val_idx):
        topic_dist = model.get_document_topics(bow)
        # (-1, 0) marks a paragraph for which no topic was returned.
        main_topic = max(topic_dist, key=lambda x: x[1]) if topic_dist else (-1, 0)
        info = paragraph_info[paragraph_id]
        rows.append({
            'experiment_id': experiment_id,
            'fold': fold,
            'paragraph_id': paragraph_id,
            'file': info['file'],
            'paragraph_idx': info['paragraph_idx'],
            'main_topic': main_topic[0],
            'topic_prob': main_topic[1],
            'topic_distribution': str(dict(topic_dist))
        })
    return rows


def run_lda_experiment(dictionary, corpus, texts, paragraph_info, k=5,
                       num_topics_range=None, alpha_range=None, eta_range=None):
    """Grid-search LDA hyperparameters with k-fold cross-validation and save the results.

    For every ``(num_topics, alpha, eta)`` combination a model is trained on
    each training fold and scored on the held-out fold with ``evaluate_model``.
    The per-fold score is ``0.5 * npmi + 0.5 * diversity``.

    Args:
        dictionary: gensim dictionary used as ``id2word``.
        corpus: Bag-of-words documents, index-aligned with ``texts`` and
            ``paragraph_info``.
        texts: Tokenised documents (used for coherence on the validation fold).
        paragraph_info: Per-document metadata dicts with at least the keys
            ``'file'`` and ``'paragraph_idx'``.
        k: Number of cross-validation folds.
        num_topics_range: Topic counts to try. Defaults to ``[10, 15]``.
        alpha_range: ``alpha`` values to try. Defaults to
            ``['symmetric', 0.1, 0.3, 0.5]``.
        eta_range: ``eta`` values to try. Defaults to
            ``['symmetric', 0.1, 0.3, 0.5]``.

    Side effects:
        Writes ``intermediate_results_*.csv`` after every grid point and
        ``experiment_results_*.csv``, ``topic_words_*.csv`` and
        ``paragraph_topics_*.csv`` at the end, all into the module-level
        ``log_dir`` and suffixed with the module-level ``timestamp``.

    Returns:
        tuple: ``(results_df, topic_words_df, paragraph_topics_df)``.
        ``results_df`` has one row per grid point that completed at least one
        fold; it is empty (without columns) if every fold of every grid point
        failed.
    """
    from itertools import product

    # Hyperparameter search space (defaults reproduce the original grid).
    num_topics_range = [10, 15] if num_topics_range is None else list(num_topics_range)
    alpha_range = ['symmetric', 0.1, 0.3, 0.5] if alpha_range is None else list(alpha_range)
    eta_range = ['symmetric', 0.1, 0.3, 0.5] if eta_range is None else list(eta_range)

    experiment_results = []
    topic_words_results = []
    paragraph_topic_results = []

    # The KFold is seeded, so every grid point would get the same split;
    # compute it once instead of once per experiment.
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_splits = list(kf.split(corpus))

    total_experiments = len(num_topics_range) * len(alpha_range) * len(eta_range)
    logging.info(f"Starting {total_experiments} experiments with {k} folds each")

    # product() iterates in the same order as the nested loops it replaces:
    # num_topics outermost, eta innermost. experiment_id starts at 1.
    grid = product(num_topics_range, alpha_range, eta_range)
    for experiment_id, (num_topics, alpha, eta) in enumerate(grid, 1):
        logging.info(f"Running experiment {experiment_id}/{total_experiments}: "
                   f"topics={num_topics}, alpha={alpha}, eta={eta}")

        fold_scores = []

        for fold, (train_idx, val_idx) in enumerate(fold_splits):
            logging.info(f"Processing fold {fold+1}/{k}")

            train_corpus = [corpus[i] for i in train_idx]
            val_corpus = [corpus[i] for i in val_idx]
            val_texts = [texts[i] for i in val_idx]

            try:
                model = models.LdaModel(
                    corpus=train_corpus,
                    id2word=dictionary,
                    num_topics=num_topics,
                    alpha=alpha,
                    eta=eta,
                    random_state=42
                )

                # Both metrics are in [0, 1]; weight them equally.
                npmi, diversity = evaluate_model(model, val_corpus, dictionary, val_texts)
                optimal_score = 0.5 * npmi + 0.5 * diversity

                fold_scores.append({
                    'experiment_id': experiment_id,
                    'fold': fold,
                    'npmi': npmi,
                    'diversity': diversity,
                    'optimal_score': optimal_score
                })

                topic_words_results.extend(
                    _topic_word_rows(model, experiment_id, fold, num_topics, alpha, eta)
                )
                paragraph_topic_results.extend(
                    _paragraph_topic_rows(
                        model, val_corpus, val_idx, paragraph_info, experiment_id, fold
                    )
                )

            except Exception as e:
                # Best-effort: a failing fold is logged (with traceback) and
                # skipped so that the remaining grid still runs.
                logging.error(
                    f"Error in experiment {experiment_id}, fold {fold}: {str(e)}",
                    exc_info=True
                )

        # Average the metrics over the folds that completed.
        if fold_scores:
            avg_scores = pd.DataFrame(fold_scores).mean()
            experiment_results.append({
                'experiment_id': experiment_id,
                'num_topics': num_topics,
                'alpha': str(alpha),
                'eta': str(eta),
                'avg_npmi': avg_scores['npmi'],
                'avg_diversity': avg_scores['diversity'],
                'avg_optimal_score': avg_scores['optimal_score'],
                'fold_count': len(fold_scores)
            })

        # Checkpoint after every grid point so a crash does not lose the
        # summary rows collected so far.
        if experiment_results:
            pd.DataFrame(experiment_results).to_csv(
                f'{log_dir}/intermediate_results_{timestamp}.csv',
                index=False
            )

    results_df = pd.DataFrame(experiment_results)
    topic_words_df = pd.DataFrame(topic_words_results)
    paragraph_topics_df = pd.DataFrame(paragraph_topic_results)

    results_df.to_csv(f'{log_dir}/experiment_results_{timestamp}.csv', index=False)
    topic_words_df.to_csv(f'{log_dir}/topic_words_{timestamp}.csv', index=False)
    paragraph_topics_df.to_csv(f'{log_dir}/paragraph_topics_{timestamp}.csv', index=False)

    return results_df, topic_words_df, paragraph_topics_df

# Main execution flow
if __name__ == "__main__":
    try:
        # 1. Prepare the data. prepare_corpus_and_dictionary() logs its own
        #    "Preparing corpus and dictionary..." message, so it is not logged
        #    a second time here.
        dictionary, corpus, texts, paragraph_info = prepare_corpus_and_dictionary()

        # 2. Run the grid search.
        logging.info("Starting LDA experiments...")
        results_df, topic_words_df, paragraph_topics_df = run_lda_experiment(
            dictionary, corpus, texts, paragraph_info
        )

        if results_df.empty:
            # Every fold of every experiment failed; without this guard the
            # lookup of 'avg_optimal_score' below would fail with a KeyError
            # that hides the real cause.
            logging.error("No experiment produced results; skipping best-model selection.")
        else:
            # 3. Report the configuration with the highest average score.
            best_result = results_df.loc[results_df['avg_optimal_score'].idxmax()]
            logging.info("\nBest Model Configuration:")
            logging.info(f"Experiment ID: {best_result['experiment_id']}")
            logging.info(f"Number of Topics: {best_result['num_topics']}")
            logging.info(f"Alpha: {best_result['alpha']}")
            logging.info(f"Eta: {best_result['eta']}")
            logging.info(f"Average NPMI: {best_result['avg_npmi']:.4f}")
            logging.info(f"Average Diversity: {best_result['avg_diversity']:.4f}")
            logging.info(f"Average Optimal Score: {best_result['avg_optimal_score']:.4f}")

            # 4. Save the topic words (all folds) of the best configuration.
            best_topics = topic_words_df[
                topic_words_df['experiment_id'] == best_result['experiment_id']
            ]
            best_topics.to_csv(f'{log_dir}/best_topics_{timestamp}.csv', index=False)

    except Exception as e:
        logging.error(f"Error occurred: {str(e)}", exc_info=True)

2024-11-24 18:54:33,251 - INFO - Preparing corpus and dictionary...
2024-11-24 18:54:33,251 - INFO - Preparing corpus and dictionary...
2024-11-24 18:54:33,253 - INFO - Trying to load la_core_web_lg model directly...
2024-11-24 18:54:35,610 - INFO - Processing documents and building vocabulary...
100%|██████████| 306/306 [01:50<00:00,  2.77it/s]
2024-11-24 18:56:26,066 - INFO - adding document #0 to Dictionary<0 unique tokens: []>
2024-11-24 18:56:26,130 - INFO - built Dictionary<7048 unique tokens: ['aegeus', 'andreo', 'declaro', 'dominicus', 'epistula']...> from 4515 documents (total 124220 corpus positions)
2024-11-24 18:56:26,130 - INFO - Dictionary lifecycle event {'msg': "built Dictionary<7048 unique tokens: ['aegeus', 'andreo', 'declaro', 'dominicus', 'epistula']...> from 4515 documents (total 124220 corpus positions)", 'datetime': '2024-11-24T18:56:26.130885', 'gensim': '4.3.3', 'python': '3.10.15 (main, Oct  3 2024, 02:24:49) [Clang 14.0.6 ]', 'platform': 'macOS-15.1-arm64-arm