In [1]:
import os
import pandas as pd
import numpy as np
from gensim import corpora, models
from gensim.models import CoherenceModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import logging
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 设置工作目录
project_root = '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling'
os.chdir(project_root)
print("Current working directory: ", os.getcwd())

# 设置日志记录
os.makedirs('results/parameter_search', exist_ok=True)
logging.basicConfig(
    filename='results/parameter_search/experiment.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

Current working directory:  /Users/jessie/Documents/Projects/Cusanus_Topic_Modeling


In [3]:
# 方法参数配置
PARAMETERS = {
    'cltk': {
        'tfidf_threshold': 0.12,
        'alpha_weight': 0.3,    # NPMI的权重
        'lambda_weight': 0.1    # Overlap的惩罚权重
    },
    'cusanus': {
        'tfidf_threshold': 0.30,
        'alpha_weight': 0.3,
        'lambda_weight': 0.1
    },
    'stanza': {
        'tfidf_threshold': 0.19,
        'alpha_weight': 0.3,
        'lambda_weight': 0.1
    }
}

# LDA参数搜索空间
num_topics_list = [10]
alpha_list = ['symmetric', 0.1, 0.3, 0.5, 0.7]
eta_list = ['symmetric', 0.1, 0.3, 0.5, 0.7]

In [4]:
def load_files(method, data_type='train'):
    """加载文件列表"""
    if data_type == 'train':
        data_dir = f'data/preprocessed/{method}'
    else:
        data_dir = f'data/testset/{method}'
    
    files = glob.glob(os.path.join(data_dir, '*.txt'))
    logging.info(f"从 {data_dir} 加载了 {len(files)} 个文件")
    return files

def load_paragraphs_from_file(file_path):
    """从单个文件加载段落"""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
        
        # 处理每个段落中的词
        processed_paragraphs = []
        for para in paragraphs:
            words = []
            for token in para.split():
                if '/' in token:  # 如果有词性标注
                    word = token.rsplit('/', 1)[0]  # 只保留词形
                    words.append(word)
                else:
                    words.append(token)
            if words:  # 只添加非空段落
                processed_paragraphs.append(' '.join(words))
        
        return processed_paragraphs

In [5]:
def load_all_paragraphs(method, data_type='train'):
    """加载所有段落"""
    files = load_files(method, data_type)
    all_paragraphs = []
    paragraph_info = []
    
    for file_path in tqdm(files, desc=f"加载{data_type}集文件"):
        paragraphs = load_paragraphs_from_file(file_path)
        file_name = os.path.basename(file_path)
        
        for i, para in enumerate(paragraphs):
            all_paragraphs.append(para)
            paragraph_info.append({
                'file': file_name,
                'paragraph_num': i
            })
    
    logging.info(f"{method} {data_type}集:")
    logging.info(f"- 总文件数: {len(files)}")
    logging.info(f"- 总段落数: {len(all_paragraphs)}")
    if all_paragraphs:
        logging.info(f"- 段落样本:\n{all_paragraphs[0]}\n{all_paragraphs[-1]}")
    
    return all_paragraphs, paragraph_info

In [19]:
def filter_paragraphs_with_tfidf(paragraphs, threshold):
    """使用TF-IDF过滤段落"""
    if not paragraphs:
        raise ValueError("输入的段落列表为空！")
    
    logging.info(f"TF-IDF过滤前的段落数：{len(paragraphs)}")
    
    vectorizer = TfidfVectorizer()
    
    try:
        tfidf_matrix = vectorizer.fit_transform(paragraphs)
        max_tfidf = np.max(tfidf_matrix.toarray(), axis=0)
        
        # 打印TF-IDF值的分布情况
        logging.info(f"TF-IDF值分布:")
        logging.info(f"最小值: {max_tfidf.min():.4f}")
        logging.info(f"最大值: {max_tfidf.max():.4f}")
        logging.info(f"平均值: {max_tfidf.mean():.4f}")
        logging.info(f"中位数: {np.median(max_tfidf):.4f}")
        
        feature_names = np.array(vectorizer.get_feature_names_out())
        kept_words = set(feature_names[max_tfidf >= threshold])
        
        filtered_paragraphs = []
        for para in paragraphs:
            words = para.split()
            filtered_words = [w for w in words if w in kept_words]
            if filtered_words:  # 只保留非空段落
                filtered_paragraphs.append(' '.join(filtered_words))
        
        logging.info(f"过滤后的段落数：{len(filtered_paragraphs)}")
        logging.info(f"保留的词汇数：{len(kept_words)}")
        
        return filtered_paragraphs, kept_words
        
    except Exception as e:
        logging.error(f"TF-IDF过滤失败：{str(e)}")
        raise

In [17]:
def compute_optimal_score(npmi, diversity, overlap, alpha_weight, lambda_weight):
    """计算综合得分
    
    Args:
        npmi: NPMI一致性分数
        diversity: 主题多样性分数
        overlap: 主题重叠度
        alpha_weight: NPMI权重
        lambda_weight: 重叠度惩罚权重
    
    Returns:
        float: 综合得分
    """
    return alpha_weight * npmi + (1 - alpha_weight) * diversity - lambda_weight * overlap

def compute_topic_overlap(model, num_topics, topn=10):
    """计算主题重叠度
    
    Args:
        model: 训练好的LDA模型
        num_topics: 主题数量
        topn: 每个主题考虑的top词数量
    
    Returns:
        float: 平均重叠度
    """
    # 获取所有主题的前topn个词
    topic_words = []
    for i in range(num_topics):
        top_words = [word for word, _ in model.show_topic(i, topn=topn)]
        topic_words.append(set(top_words))
    
    # 计算平均重叠度
    overlap_scores = []
    for i in range(num_topics):
        for j in range(i + 1, num_topics):
            overlap = len(topic_words[i] & topic_words[j]) / topn
            overlap_scores.append(overlap)
    
    return np.mean(overlap_scores) if overlap_scores else 0.0

def prepare_corpus(paragraphs):
    """准备语料库
    
    Args:
        paragraphs: 段落列表
    
    Returns:
        tuple: (gensim词典, 语料库)
    """
    # 将文本分词
    texts = [para.split() for para in paragraphs]
    
    # 创建词典
    dictionary = corpora.Dictionary(texts)
    
    # 创建语料库
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    logging.info(f"词典大小: {len(dictionary)}")
    logging.info(f"语料库大小: {len(corpus)}")
    
    return dictionary, corpus

def train_lda_model(corpus, dictionary, num_topics, alpha, eta):
    """训练LDA模型
    
    Args:
        corpus: 语料库
        dictionary: 词典
        num_topics: 主题数量
        alpha: 文档-主题分布的先验
        eta: 主题-词分布的先验
    
    Returns:
        LdaModel: 训练好的模型
    """
    model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        alpha=alpha,
        eta=eta,
        passes=10,
        iterations=100,
        random_state=42
    )
    return model

def evaluate_model(model, val_paragraphs, dictionary, num_topics):
    """评估模型性能
    
    Args:
        model: 训练好的LDA模型
        val_paragraphs: 验证集段落
        dictionary: 词典
        num_topics: 主题数量
    
    Returns:
        tuple: (NPMI分数, 多样性分数, 重叠度分数)
    """
    # 准备验证集
    val_texts = [para.split() for para in val_paragraphs]
    
    # 计算NPMI
    coherence_model = CoherenceModel(
        model=model,
        texts=val_texts,
        dictionary=dictionary,
        coherence='c_npmi'
    )
    npmi = coherence_model.get_coherence()
    
    # 计算多样性
    top_words_per_topic = []
    for topic_id in range(num_topics):
        top_words = [word for word, prob in model.show_topic(topic_id, topn=10)]
        top_words_per_topic.extend(top_words)
    diversity = len(set(top_words_per_topic)) / (num_topics * 10)
    
    # 计算重叠度
    overlap = compute_topic_overlap(model, num_topics)
    
    return npmi, diversity, overlap

In [21]:
# 在主循环开始前添加测试代码
for method in PARAMETERS.keys():
    train_paragraphs, _ = load_all_paragraphs(method, 'train')
    test_paragraphs, _ = load_all_paragraphs(method, 'test')
    print(f"\n{method}方法:")
    print(f"训练集段落数: {len(train_paragraphs)}")
    print(f"测试集段落数: {len(test_paragraphs)}")
    print(f"训练集样本:\n{train_paragraphs[0][:200]}")
    
# 对每个方法测试TF-IDF过滤
for method in PARAMETERS.keys():
    train_paragraphs, _ = load_all_paragraphs(method, 'train')
    filtered_paras, kept_words = filter_paragraphs_with_tfidf(
        train_paragraphs,
        PARAMETERS[method]['tfidf_threshold']
    )
    print(f"\n{method}方法:")
    print(f"过滤前段落数: {len(train_paragraphs)}")
    print(f"过滤后段落数: {len(filtered_paras)}")
    print(f"保留词汇数: {len(kept_words)}")

加载train集文件: 100%|██████████| 306/306 [00:00<00:00, 2151.23it/s]
加载test集文件: 100%|██████████| 30/30 [00:00<00:00, 3356.43it/s]



cltk方法:
训练集段落数: 4516
测试集段落数: 514
训练集样本:
Paragraph 1: remitto pecco diligo mare peccatrix peccator sordes tenebrae circumvoluta sanctus spiritus receptaculis purus caelum1 gaudium gloriosissimo triumphus proveho diligo careo operio multitudo


加载train集文件: 100%|██████████| 308/308 [00:00<00:00, 3341.36it/s]
加载test集文件: 100%|██████████| 30/30 [00:00<00:00, 3051.29it/s]



cusanus方法:
训练集段落数: 4645
测试集段落数: 666
训练集样本:
Paragraph 1: epistula dominicus debitor plato constantius romanos concludo praemitto paulus vita osdroena ismael vita spiritus


加载train集文件: 100%|██████████| 306/306 [00:00<00:00, 3245.24it/s]
加载test集文件: 100%|██████████| 30/30 [00:00<00:00, 3770.27it/s]



stanza方法:
训练集段落数: 4516
测试集段落数: 514
训练集样本:
Paragraph 1: remitto peccatum diligo maria peccatrix peccatum sordus tenebra circumvo sanctus spiritus receptaculum purus caelum gaudia gloriosus triumphus provecto diligo caritas operio multitudo pec


加载train集文件: 100%|██████████| 306/306 [00:00<00:00, 7980.85it/s]



cltk方法:
过滤前段落数: 4516
过滤后段落数: 4515
保留词汇数: 13348


加载train集文件: 100%|██████████| 308/308 [00:00<00:00, 6515.10it/s]



cusanus方法:
过滤前段落数: 4645
过滤后段落数: 4645
保留词汇数: 2749


加载train集文件: 100%|██████████| 306/306 [00:00<00:00, 7786.22it/s]



stanza方法:
过滤前段落数: 4516
过滤后段落数: 4515
保留词汇数: 6977


In [22]:
# 主实验循环
results = []

# 1. 加载数据
for method in PARAMETERS.keys():
    print(f"\n处理 {method} 方法...")
    
    try:
        # 加载训练集和测试集
        train_paragraphs, train_info = load_all_paragraphs(method, 'train')
        test_paragraphs, test_info = load_all_paragraphs(method, 'test')
        
        if not train_paragraphs or not test_paragraphs:
            logging.error(f"{method}: 训练集或测试集为空")
            continue
            
        # 2. TF-IDF过滤
        filtered_train_paras, kept_words = filter_paragraphs_with_tfidf(
            train_paragraphs,
            PARAMETERS[method]['tfidf_threshold']
        )
        
        if not filtered_train_paras:
            logging.error(f"{method}: TF-IDF过滤后没有剩余段落")
            continue
            
        # 3. 准备语料库
        dictionary, corpus = prepare_corpus(filtered_train_paras)
        
        # 4. 参数搜索
        for num_topics in num_topics_list:
            for alpha in alpha_list:
                for eta in eta_list:
                    try:
                        # 5. 训练模型
                        model = train_lda_model(corpus, dictionary, num_topics, alpha, eta)
                        
                        # 6. 评估模型
                        npmi, diversity, overlap = evaluate_model(
                            model, test_paragraphs, dictionary, num_topics
                        )
                        
                        # 7. 计算综合得分
                        optimal_score = compute_optimal_score(
                            npmi, diversity, overlap,
                            PARAMETERS[method]['alpha_weight'],
                            PARAMETERS[method]['lambda_weight']
                        )
                        
                        # 8. 保存结果
                        result = {
                            'method': method,
                            'num_topics': num_topics,
                            'alpha': str(alpha),
                            'eta': str(eta),
                            'npmi': round(npmi, 4),
                            'diversity': round(diversity, 4),
                            'overlap': round(overlap, 4),
                            'optimal_score': round(optimal_score, 4)
                        }
                        results.append(result)
                        
                        # 定期保存中间结果
                        if len(results) % 5 == 0:
                            pd.DataFrame(results).to_excel(
                                'results/parameter_search/temp_results.xlsx',
                                index=False
                            )
                            
                    except Exception as e:
                        error_msg = f"Error with {method}, topics={num_topics}, alpha={alpha}, eta={eta}: {str(e)}"
                        print(error_msg)
                        logging.error(error_msg)
                        continue
                        
    except Exception as e:
        error_msg = f"Error processing method {method}: {str(e)}"
        print(error_msg)
        logging.error(error_msg)
        continue


处理 cltk 方法...


加载train集文件: 100%|██████████| 306/306 [00:00<00:00, 3331.62it/s]
加载test集文件: 100%|██████████| 30/30 [00:00<00:00, 3800.34it/s]



处理 cusanus 方法...


加载train集文件: 100%|██████████| 308/308 [00:00<00:00, 4526.68it/s]
加载test集文件: 100%|██████████| 30/30 [00:00<00:00, 2484.04it/s]



处理 stanza 方法...


加载train集文件: 100%|██████████| 306/306 [00:00<00:00, 2887.15it/s]
加载test集文件: 100%|██████████| 30/30 [00:00<00:00, 3381.87it/s]


In [23]:
# 保存最终结果
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('optimal_score', ascending=False)
results_df.to_excel('results/parameter_search/all_results.xlsx', index=False)

# 分析最佳参数
best_params = results_df.loc[results_df.groupby('method')['optimal_score'].idxmax()]
best_params.to_excel('results/parameter_search/best_parameters.xlsx', index=False)

print("\n各方法的最佳参数组合：")
for _, row in best_params.iterrows():
    print(f"\n{row['method']}方法:")
    print(f"主题数量: {row['num_topics']}")
    print(f"alpha: {row['alpha']}")
    print(f"eta: {row['eta']}")
    print(f"NPMI: {row['npmi']:.4f}")
    print(f"Diversity: {row['diversity']:.4f}")
    print(f"Overlap: {row['overlap']:.4f}")
    print(f"Optimal Score: {row['optimal_score']:.4f}")


各方法的最佳参数组合：

cltk方法:
主题数量: 10
alpha: symmetric
eta: symmetric
NPMI: inf
Diversity: 0.7100
Overlap: 0.1156
Optimal Score: inf

cusanus方法:
主题数量: 10
alpha: 0.7
eta: 0.7
NPMI: inf
Diversity: 0.7500
Overlap: 0.1222
Optimal Score: inf

stanza方法:
主题数量: 10
alpha: 0.1
eta: 0.1
NPMI: inf
Diversity: 0.7300
Overlap: 0.0978
Optimal Score: inf
