In [28]:
import os
import logging
import json
import numpy as np
import pandas as pd
from datetime import datetime
from itertools import product
from typing import List, Tuple, Dict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from tqdm import tqdm
import sys

# 设置工作目录和路径
base_dir = "/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling"
os.chdir(base_dir)

# 设置实验相关目录
dirs = {
    'input': 'experiments/lda/cusanus/preprocessed',  # 修改为正确的输入目录
    'output': 'experiments/lda/cusanus/threshold',
    'logs': 'experiments/lda/cusanus/threshold'
}

# 创建所需目录
for dir_path in dirs.values():
    os.makedirs(dir_path, exist_ok=True)


In [29]:
def load_paragraphs(input_dir: str) -> List[List[str]]:
    """
    加载所有文档的段落
    返回: 段落列表，每个段落是词列表
    """
    paragraphs = []
    
    # 调试信息
    print(f"当前工作目录: {os.getcwd()}")
    print(f"尝试加载目录: {input_dir}")
    
    files = [f for f in os.listdir(input_dir) if f.endswith('.txt')]
    print(f"找到的txt文件: {files}")
    
    for filename in files:
        file_path = os.path.join(input_dir, filename)
        print(f"\n处理文件: {filename}")
        
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
            
        # 按"Paragraph"分割，但跳过空字符串
        parts = [p.strip() for p in content.split('Paragraph') if p.strip()]
        
        for part in parts:
            # 分割段落编号和内容
            lines = part.split('\n', 1)  # 最多分割一次
            if len(lines) == 2:  # 确保有内容行
                content_line = lines[1].strip()
                if content_line:  # 确保内容不为空
                    words = content_line.split()
                    if words:  # 确保有词
                        paragraphs.append(words)
    
    print(f"\n总共加载了 {len(paragraphs)} 个段落")
    
    if len(paragraphs) == 0:
        raise ValueError(f"未能从目录 {input_dir} 加载到任何段落，请检查文件路径和文件内容格式是否正确")
        
    return paragraphs

In [30]:
# 设置日志配置
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
log_file = os.path.join(dirs['logs'], f'cusanus_threshold_{timestamp}.log')

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)

In [32]:
def evaluate_thresholds(
    corpus: List[List[str]],
    min_freqs: List[int] = [2, 3, 4],
    max_freqs: List[int] = [200, 800, 1400, 2000],
    n_topics: int = 15,
    n_splits: int = 5,
    alpha: float = 0.5,
    random_state: int = 42
) -> Tuple[pd.DataFrame, Dict, List]:
    """评估不同词频阈值组合的效果"""
    
    # 初始化结果存储
    results = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    logging.info("开始阈值评估实验")
    logging.info(f"参数设置: n_topics={n_topics}, n_splits={n_splits}, alpha={alpha}")
    
    # 对所有参数组合进行网格搜索
    total_combinations = len(min_freqs) * len(max_freqs)
    
    with tqdm(total=total_combinations*n_splits, desc="实验进度") as pbar:
        for min_freq, max_freq in product(min_freqs, max_freqs):
            if min_freq >= max_freq:
                continue
                
            logging.info(f"\n评估阈值组合: min_freq={min_freq}, max_freq={max_freq}")
            fold_results = []
            
            # K折交叉验证
            for fold, (train_idx, val_idx) in enumerate(kf.split(corpus), 1):
                try:
                    # 构建词典
                    train_docs = [corpus[i] for i in train_idx]
                    dictionary = Dictionary(train_docs)
                    
                    # 应用词频过滤
                    original_tokens = len(dictionary)
                    dictionary.filter_extremes(
                        no_below=min_freq,
                        no_above=0.5,
                        keep_n=max_freq
                    )
                    filtered_tokens = len(dictionary)
                    
                    logging.info(f"词典过滤: {original_tokens} -> {filtered_tokens} 个词")
                    
                    # 转换为词袋表示
                    corpus_bow = [dictionary.doc2bow(doc) for doc in train_docs]
                    
                    # 训练LDA模型
                    lda = LdaModel(
                        corpus=corpus_bow,
                        id2word=dictionary,
                        num_topics=n_topics,
                        random_state=random_state
                    )
                    
                    # 计算评估指标
                    npmi_score = calculate_npmi(lda, corpus_bow, dictionary, train_docs)
                    diversity_score = calculate_diversity(lda)
                    
                    # 计算optimal score (已标准化的指标)
                    optimal_score = alpha * npmi_score + (1 - alpha) * diversity_score
                    
                    # 获取主题词
                    topics = lda.show_topics(formatted=False)
                    
                    fold_results.append({
                        'fold': fold,
                        'npmi': npmi_score,
                        'diversity': diversity_score,
                        'optimal_score': optimal_score,
                        'topics': topics
                    })
                    
                    logging.info(f"第 {fold} 折评估完成: "
                               f"NPMI={npmi_score:.4f}, "
                               f"Diversity={diversity_score:.4f}, "
                               f"Optimal Score={optimal_score:.4f}")
                    
                except Exception as e:
                    logging.error(f"处理第 {fold} 折时发生错误: {str(e)}")
                    continue
                finally:
                    pbar.update(1)
            
            # 计算平均分数
            if fold_results:
                avg_scores = {
                    'npmi': np.mean([r['npmi'] for r in fold_results]),
                    'diversity': np.mean([r['diversity'] for r in fold_results]),
                    'optimal_score': np.mean([r['optimal_score'] for r in fold_results])
                }
                
                std_scores = {
                    'npmi_std': np.std([r['npmi'] for r in fold_results]),
                    'diversity_std': np.std([r['diversity'] for r in fold_results]),
                    'optimal_score_std': np.std([r['optimal_score'] for r in fold_results])
                }
                
                results.append({
                    'min_freq': min_freq,
                    'max_freq': max_freq,
                    **avg_scores,
                    **std_scores,
                    'fold_results': fold_results
                })
    
    # 转换为DataFrame
    results_df = pd.DataFrame([
        {k: v for k, v in r.items() if k != 'fold_results'}
        for r in results
    ])
    
    # 找出最佳参数组合
    best_idx = results_df['optimal_score'].idxmax()
    best_params = {
        'min_freq': results_df.loc[best_idx, 'min_freq'],
        'max_freq': results_df.loc[best_idx, 'max_freq'],
        'optimal_score': results_df.loc[best_idx, 'optimal_score']
    }
    
    return results_df, best_params, results

In [33]:
def save_experiment_results(results_df: pd.DataFrame, best_params: Dict, raw_results: List, experiment_dir: str):
    """保存实验结果"""
    try:
        # 1. 保存DataFrame结果
        results_df.to_csv(os.path.join(experiment_dir, 'evaluation_results.csv'), index=False)
        logging.info("已保存评估结果DataFrame")
        
        # 2. 保存每折详细结果
        fold_results_df = pd.DataFrame([
            {
                'min_freq': result['min_freq'],
                'max_freq': result['max_freq'],
                'fold': fold_data['fold'],
                'npmi': fold_data['npmi'],
                'diversity': fold_data['diversity'],
                'optimal_score': fold_data['optimal_score']
            }
            for result in raw_results
            for fold_data in result['fold_results']
        ])
        fold_results_df.to_csv(os.path.join(experiment_dir, 'fold_results.csv'), index=False)
        logging.info("已保存每折详细结果")
        
        # 3. 保存主题词 - 添加类型转换
        topics_dir = os.path.join(experiment_dir, 'topics')
        os.makedirs(topics_dir, exist_ok=True)
        for result in raw_results:
            # 转换fold_results中的数据类型
            processed_fold_results = []
            for fold_data in result['fold_results']:
                processed_fold = {
                    'fold': int(fold_data['fold']),
                    'npmi': float(fold_data['npmi']),
                    'diversity': float(fold_data['diversity']),
                    'optimal_score': float(fold_data['optimal_score']),
                    'topics': [
                        [(str(word), float(score)) for word, score in topic]
                        for topic_id, topic in fold_data['topics']
                    ]
                }
                processed_fold_results.append(processed_fold)
                
            topics_file = os.path.join(
                topics_dir,
                f'topics_min{int(result["min_freq"])}_max{int(result["max_freq"])}.json'
            )
            with open(topics_file, 'w') as f:
                json.dump(processed_fold_results, f, indent=2)
        logging.info("已保存主题词结果")
        
        # 4. 保存最佳参数 - 添加类型转换
        processed_best_params = {
            'min_freq': int(best_params['min_freq']),
            'max_freq': int(best_params['max_freq']),
            'optimal_score': float(best_params['optimal_score'])
        }
        with open(os.path.join(experiment_dir, 'best_params.json'), 'w') as f:
            json.dump(processed_best_params, f, indent=2)
        logging.info("已保存最佳参数")
        
    except Exception as e:
        logging.error(f"保存结果时发生错误: {str(e)}")
        raise

In [34]:
def visualize_results(results_df: pd.DataFrame, experiment_dir: str):
    """生成可视化结果"""
    try:
        plt.figure(figsize=(12, 8))
        pivot_table = results_df.pivot(
            index='min_freq',
            columns='max_freq',
            values='optimal_score'
        )
        sns.heatmap(
            pivot_table,
            annot=True,
            fmt='.3f',
            cmap='YlOrRd',
            cbar_kws={'label': 'Optimal Score'}
        )
        plt.title('词频阈值组合评估结果')
        plt.xlabel('最大词频阈值')
        plt.ylabel('最小词频阈值')
        plt.tight_layout()
        plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
        plt.close()
        logging.info("已保存评估结果热力图")
    except Exception as e:
        logging.error(f"生成可视化结果时发生错误: {str(e)}")

In [35]:
# 主程序
if __name__ == "__main__":
    # 设置时间戳和实验目录
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    experiment_dir = os.path.join(dirs['output'], f'experiment_{timestamp}')
    os.makedirs(experiment_dir, exist_ok=True)

    # 设置日志
    log_file = os.path.join(dirs['logs'], f'threshold_evaluation_{timestamp}.log')
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )

    try:
        # 加载段落数据
        paragraphs = load_paragraphs(dirs['input'])
        logging.info(f"成功加载语料库，共 {len(paragraphs)} 个段落")

        # 运行实验
        results_df, best_params, raw_results = evaluate_thresholds(
            corpus=paragraphs,
            min_freqs=[2, 3, 4],
            max_freqs=[200, 800, 1400, 2000],
            n_topics=15,
            n_splits=5,
            alpha=0.5
        )

        # 保存结果
        save_experiment_results(results_df, best_params, raw_results, experiment_dir)
        print("实验结果已成功保存!")

        # 生成可视化
        visualize_results(results_df, experiment_dir)
        print("可视化结果已生成!")

        # 打印最佳参数
        print("\n最佳参数组合:")
        print(f"最小词频阈值: {best_params['min_freq']}")
        print(f"最大词频阈值: {best_params['max_freq']}")
        print(f"Optimal Score: {best_params['optimal_score']:.3f}")

    except Exception as e:
        logging.error(f"实验过程中发生错误: {str(e)}")
        raise

当前工作目录: /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling
尝试加载目录: experiments/lda/cusanus/preprocessed
找到的txt文件: ['h160_013_cleaned.txt', 'h190_264_cleaned.txt', 'h180_167_cleaned.txt', 'h190_214_cleaned.txt', 'h180_149_cleaned.txt', 'h180_139_cleaned.txt', 'h170_065_cleaned.txt', 'h170_107_cleaned.txt', 'h170_098_cleaned.txt', 'h170_084_cleaned.txt', 'h170_079_cleaned.txt', 'h190_226_cleaned.txt', 'h180_125_cleaned.txt', 'h160_021_cleaned.txt', 'h190_256_cleaned.txt', 'h180_155_cleaned.txt', 'h170_057_cleaned.txt', 'h190_208_cleaned.txt', 'h170_027_cleaned.txt', 'h190_278_cleaned.txt', 'h190_285_cleaned.txt', 'h180_186_cleaned.txt', 'h170_042_cleaned.txt', 'h170_120_cleaned.txt', 'h170_032_cleaned.txt', 'h190_290_cleaned.txt', 'h180_193_cleaned.txt', 'h170_091_cleaned.txt', 'h190_233_cleaned.txt', 'h180_130_cleaned.txt', 'h190_243_cleaned.txt', 'h180_140_cleaned.txt', 'h170_070_cleaned.txt', 'h170_112_cleaned.txt', 'h160_006_cleaned.txt', 'h190_271_cleaned.txt', 'h180_172_cleane

2024-11-24 21:10:32,267 - INFO - 成功加载语料库，共 4645 个段落
2024-11-24 21:10:32,267 - INFO - 开始阈值评估实验
2024-11-24 21:10:32,268 - INFO - 参数设置: n_topics=15, n_splits=5, alpha=0.5



处理文件: h170_111_cleaned.txt

处理文件: h170_073_cleaned.txt

处理文件: h180_133_cleaned.txt

处理文件: h190_230_cleaned.txt

处理文件: h170_092_cleaned.txt

处理文件: h180_143_cleaned.txt

处理文件: h190_240_cleaned.txt

处理文件: h170_041_cleaned.txt

处理文件: h180_190_cleaned.txt

处理文件: h160_019_cleaned.txt

处理文件: h190_293_cleaned.txt

处理文件: h170_031_cleaned.txt

处理文件: h170_054_cleaned.txt

处理文件: h180_185_cleaned.txt

处理文件: h190_286_cleaned.txt

处理文件: h180_178_cleaned.txt

处理文件: h180_126_cleaned.txt

处理文件: h190_225_cleaned.txt

处理文件: h170_118_cleaned.txt

处理文件: h170_087_cleaned.txt

处理文件: h180_156_cleaned.txt

处理文件: h190_255_cleaned.txt

处理文件: h160_022_cleaned.txt

处理文件: h190_249_cleaned.txt

处理文件: h170_104_cleaned.txt

处理文件: h170_066_cleaned.txt

处理文件: h190_239_cleaned.txt

处理文件: h180_164_cleaned.txt

处理文件: h170_038_cleaned.txt

处理文件: h190_267_cleaned.txt

处理文件: h160_010_cleaned.txt

处理文件: h180_199_cleaned.txt

处理文件: h170_048_cleaned.txt

处理文件: h190_217_cleaned.txt

处理文件: h170_053_cleaned.txt

处理文件: h180_182_clea

实验进度:   0%|          | 0/60 [00:00<?, ?it/s]2024-11-24 21:10:32,270 - INFO - 
评估阈值组合: min_freq=2, max_freq=200
2024-11-24 21:10:32,276 - INFO - adding document #0 to Dictionary<0 unique tokens: []>
2024-11-24 21:10:32,400 - INFO - built Dictionary<8839 unique tokens: ['apocalypsis', 'appareo', 'behalten', 'constantinus', 'corona']...> from 3716 documents (total 216188 corpus positions)
2024-11-24 21:10:32,401 - INFO - Dictionary lifecycle event {'msg': "built Dictionary<8839 unique tokens: ['apocalypsis', 'appareo', 'behalten', 'constantinus', 'corona']...> from 3716 documents (total 216188 corpus positions)", 'datetime': '2024-11-24T21:10:32.401050', 'gensim': '4.3.3', 'python': '3.10.15 (main, Oct  3 2024, 02:24:49) [Clang 14.0.6 ]', 'platform': 'macOS-15.1-arm64-arm-64bit', 'event': 'created'}
2024-11-24 21:10:32,406 - INFO - discarding 8639 tokens: [('apocalypsis', 16), ('constantinus', 35), ('corona', 25), ('element', 9), ('expono', 81), ('generalis', 29), ('gloriosus', 75), ('gra

实验结果已成功保存!


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment_dir, 'threshold_heatmap.png'))
  plt.savefig(os.path.join(experiment

可视化结果已生成!

最佳参数组合:
最小词频阈值: 2
最大词频阈值: 200
Optimal Score: 0.476


In [36]:
def visualize_metrics_trends(df, experiment_dir):
    """Create comprehensive visualization for all metrics"""
    plt.figure(figsize=(15, 8))
    
    # 创建阈值组合标签
    df['threshold_range'] = df['min_freq'].astype(str) + '-' + df['max_freq'].astype(str)
    
    # 计算每个阈值组合的平均指标值
    metrics_avg = df.groupby('threshold_range').agg({
        'npmi': 'mean',
        'diversity': 'mean',
        'optimal_score': 'mean'
    }).reset_index()
    
    # 排序以确保x轴顺序合理
    metrics_avg['min_freq'] = metrics_avg['threshold_range'].str.split('-').str[0].astype(int)
    metrics_avg['max_freq'] = metrics_avg['threshold_range'].str.split('-').str[1].astype(int)
    metrics_avg = metrics_avg.sort_values(['min_freq', 'max_freq'])
    
    # 绘制三个指标的折线
    plt.plot(metrics_avg['threshold_range'], metrics_avg['npmi'], 
            marker='o', label='NPMI', color='#FFA500', linewidth=2)
    plt.plot(metrics_avg['threshold_range'], metrics_avg['diversity'], 
            marker='s', label='Diversity', color='#FF6B6B', linewidth=2)
    plt.plot(metrics_avg['threshold_range'], metrics_avg['optimal_score'], 
            marker='^', label='Optimal Score', color='#4ECDC4', linewidth=2)
    
    # 设置图表格式
    plt.title('Relationship Between Threshold Range and Metrics', fontsize=14, pad=20)
    plt.xlabel('Threshold Range (min_freq-max_freq)', fontsize=12)
    plt.ylabel('Scores', fontsize=12)
    
    # 添加网格
    plt.grid(True, linestyle='--', alpha=0.7)
    
    # 调整x轴标签
    plt.xticks(rotation=45, ha='right')
    
    # 添加图例
    plt.legend(loc='upper right')
    
    # 调整布局
    plt.tight_layout()
    
    # 保存图片
    plt.savefig(os.path.join(experiment_dir, 'metrics_trends.png'), 
                dpi=300, bbox_inches='tight')
    plt.close()
    
    # 打印最优组合
    best_combo = metrics_avg.loc[metrics_avg['optimal_score'].idxmax()]
    print(f"\nBest Parameter Combination:")
    print(f"Threshold Range: {best_combo['threshold_range']}")
    print(f"NPMI: {best_combo['npmi']:.4f}")
    print(f"Diversity: {best_combo['diversity']:.4f}")
    print(f"Optimal Score: {best_combo['optimal_score']:.4f}")

# 读取数据并生成可视化
df = pd.read_csv(os.path.join(experiment_dir, 'fold_results.csv'))
visualize_metrics_trends(df, experiment_dir)
print(f"Visualization saved in: {experiment_dir}/metrics_trends.png")

2024-11-24 21:13:43,207 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2024-11-24 21:13:43,210 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2024-11-24 21:13:43,211 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2024-11-24 21:13:43,212 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
2024-11-24 21:13:43,214 - INFO - Using categorical units to plot a list of strings that are all parsable as 


Best Parameter Combination:
Threshold Range: 2-200
NPMI: 0.4981
Diversity: 0.4547
Optimal Score: 0.4764
Visualization saved in: experiments/lda/cusanus/threshold/experiment_20241124_211031/metrics_trends.png
