# 主题建模

围绕这些关键词（long COVID、Post COVID syndrome、long-haul COVID、post-acute sequelae of COVID），查看出现这些关键词的句子是如何描述长新冠的。使用 BERTopic 进行主题建模，追踪主题随时间的演变，发现不同时期的热点话题（每个 topic 取 15 个关键词；尽量把不同国家的 topic 数量控制在同一范围内，例如 18–25 个，并尽量通过调整参数来实现，而不是直接把超出 25 个的部分删掉）。

## 1.用BERTopic对关键句子进行主题建模

In [8]:
import glob
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import torch
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import numpy as np
from sklearn.decomposition import PCA
import os
from docx import Document

# Load the multilingual sentence-embedding model from a local directory.
# The path is relative, so it is resolved against the current working directory.
local_model_path = "./paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = SentenceTransformer(local_model_path)
def rescale(x, inplace=False):
    """Shrink an embedding so its first column has a standard deviation of 1e-4.

    Every entry is divided by ``np.std(x[:, 0]) * 10000``; keeping the
    coordinates this small avoids convergence issues in later optimization.

    Args:
        x: 2-D array-like of coordinates.
        inplace: When true, ``x`` itself is divided and returned; otherwise
            a copy is made first and ``x`` is left untouched.

    Returns:
        The rescaled array.
    """
    scaled = x if inplace else np.array(x, copy=True)
    scaled /= np.std(scaled[:, 0]) * 10000
    return scaled

# Run the sentence-level BERTopic pipeline once for every CSV in './key words'.
# For each file this writes a Word summary of the topics, the hierarchy /
# over-time / inter-topic HTML figures, and the fitted model.
for csv_file in glob.glob('./key words/*.csv'):
    df = pd.read_csv(csv_file, encoding='utf-8')

    # Rows lacking either the text or the date cannot be modelled over time.
    df = df.dropna(subset=['key_sentences', 'load_date'])

    print('文档名称：', csv_file)
    print('文档长度：', len(df))

    # Parse the dates and bucket every document by calendar month.
    df['load_date'] = pd.to_datetime(df['load_date'])
    df['year_month'] = df['load_date'].dt.strftime('%Y-%m')
    timestamps = df['year_month'].tolist()

    texts = df['key_sentences'].tolist()

    # Embed once; the same vectors are handed to every fit below.
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    # Initialise UMAP from the rescaled PCA projection of the embeddings.
    pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
        init=pca_embeddings,
    )

    vectorizer_model = CountVectorizer(stop_words='english', ngram_range=(1, 2))

    # First pass: let BERTopic merge topics automatically.
    topic_model = BERTopic(
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        top_n_words=15,
        nr_topics='auto',
    )
    topics, _ = topic_model.fit_transform(texts, embeddings)

    # Second pass: if the automatic result has more than 10 topics, refit
    # with an explicit cap of 10.
    num_topics = len(topic_model.get_topics())
    if num_topics > 10:
        topic_model = BERTopic(
            umap_model=umap_model,
            vectorizer_model=vectorizer_model,
            top_n_words=15,
            nr_topics=10,
        )
        topics, _ = topic_model.fit_transform(texts, embeddings)

    # Word document: one paragraph per topic with its keywords and size.
    doc = Document()
    for _idx, row in topic_model.get_topic_info().iterrows():
        topic_number = row['Topic']
        frequency = row['Count']
        keywords = [word for word, _ in topic_model.get_topic(topic_number)]
        doc.add_paragraph(f"Topic {topic_number}: {keywords}, Frequency: {frequency}")

    # Hierarchical clustering figure. The plain list `texts` is passed (not
    # the DataFrame column) so the documents line up positionally with the
    # fitted topics, exactly as in topics_over_time below.
    linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
    hierarchical_topics = topic_model.hierarchical_topics(texts, linkage_function=linkage_function)

    # The 15-word custom labels are shared by every figure of this model.
    topic_labels = topic_model.generate_topic_labels(nr_words=15)
    topic_model.set_topic_labels(topic_labels)
    fig_hierarchy = topic_model.visualize_hierarchy(
        hierarchical_topics=hierarchical_topics,
        custom_labels=True,
    )

    print("主题数量:", len(topic_model.get_topics()))

    # Topics-over-time figure. It stays None when this step fails, so that a
    # failure can neither raise NameError at save time nor cause the figure
    # of the previously processed file to be written into this folder.
    fig_over_time = None
    try:
        topics_over_time = topic_model.topics_over_time(
            docs=texts,
            timestamps=timestamps,
            nr_bins=9
        )
        fig_over_time = topic_model.visualize_topics_over_time(
            topics_over_time,
            custom_labels=True,
            width=1800,
            height=600,
        )
    except Exception as e:
        print("错误类型:", type(e))
        print("错误信息:", str(e))

    # Every input file gets its own output folder named after the file stem.
    base_name = os.path.splitext(os.path.basename(csv_file))[0]
    output_dir = os.path.join('./输出文件', base_name)
    os.makedirs(output_dir, exist_ok=True)

    doc.save(os.path.join(output_dir, 'topics_and_frequencies.docx'))
    fig_hierarchy.write_html(os.path.join(output_dir, 'hierarchy_clustering.html'))
    if fig_over_time is not None:
        fig_over_time.write_html(os.path.join(output_dir, 'over_time.html'))

    # The inter-topic distance map is best effort: report a failure and
    # carry on with saving the model instead of hiding it silently.
    try:
        fig_topics = topic_model.visualize_topics()
        fig_topics.write_html(os.path.join(output_dir, 'intertopic_distance.html'))
    except Exception as e:
        print("错误类型:", type(e))
        print("错误信息:", str(e))

    model_dir = os.path.join(output_dir, 'model')
    os.makedirs(model_dir, exist_ok=True)
    # With safetensors serialization `save_embedding_model` is a string
    # pointer to the embedding model, so pass the model path rather than the
    # loaded SentenceTransformer object.
    # NOTE(review): the path is relative; loading the saved model presumably
    # needs the same working directory - confirm.
    topic_model.save(
        model_dir,
        serialization="safetensors",
        save_ctfidf=True,
        save_embedding_model=local_model_path,
    )

文档名称： ./key words\result_Australia 澳大利亚.csv
文档长度： 150


Batches: 100%|██████████| 5/5 [00:03<00:00,  1.41it/s]
100%|██████████| 3/3 [00:00<00:00, 220.01it/s]


主题数量: 5
文档名称： ./key words\result_Germany 德国.csv
文档长度： 144


Batches: 100%|██████████| 5/5 [00:04<00:00,  1.02it/s]
100%|██████████| 1/1 [00:00<00:00, 200.56it/s]


主题数量: 3
文档名称： ./key words\result_India 印度.csv
文档长度： 219


Batches: 100%|██████████| 7/7 [00:10<00:00,  1.45s/it]
100%|██████████| 1/1 [00:00<00:00, 167.15it/s]


主题数量: 3
文档名称： ./key words\result_Singapore 新加坡.csv
文档长度： 103


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.11s/it]
100%|██████████| 1/1 [00:00<00:00, 164.27it/s]


主题数量: 3
文档名称： ./key words\result_UK 英国.csv
文档长度： 921


Batches: 100%|██████████| 29/29 [00:31<00:00,  1.07s/it]
100%|██████████| 8/8 [00:00<00:00, 115.13it/s]


主题数量: 10
文档名称： ./key words\result_USA 美国.csv
文档长度： 846


Batches: 100%|██████████| 27/27 [00:28<00:00,  1.04s/it]
100%|██████████| 8/8 [00:00<00:00, 180.84it/s]


主题数量: 10


## 2.用BERTopic对关键段落进行主题建模

In [10]:
import glob
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import torch
from bertopic import BERTopic
from scipy.cluster import hierarchy as sch
import numpy as np
from sklearn.decomposition import PCA
import os
from docx import Document
# Load the multilingual sentence-embedding model from a local directory.
# The path is relative, so it is resolved against the current working directory.
local_model_path = "./paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = SentenceTransformer(local_model_path)
def rescale(x, inplace=False):
    """Shrink an embedding so its first column has a standard deviation of 1e-4.

    Every entry is divided by ``np.std(x[:, 0]) * 10000``; keeping the
    coordinates this small avoids convergence issues in later optimization.

    Args:
        x: 2-D array-like of coordinates.
        inplace: When true, ``x`` itself is divided and returned; otherwise
            a copy is made first and ``x`` is left untouched.

    Returns:
        The rescaled array.
    """
    scaled = x if inplace else np.array(x, copy=True)
    scaled /= np.std(scaled[:, 0]) * 10000
    return scaled

# Run the paragraph-level BERTopic pipeline once for every CSV in './key words'.
# For each file this writes a Word summary of the topics, the hierarchy /
# over-time / inter-topic HTML figures, and the fitted model.
for csv_file in glob.glob('./key words/*.csv'):
    df = pd.read_csv(csv_file, encoding='utf-8')

    # Rows lacking either the text or the date cannot be modelled over time.
    df = df.dropna(subset=['key_paragraphs', 'load_date'])

    print('文档名称：', csv_file)
    print('文档长度：', len(df))

    # Parse the dates and bucket every document by calendar month.
    df['load_date'] = pd.to_datetime(df['load_date'])
    df['year_month'] = df['load_date'].dt.strftime('%Y-%m')
    timestamps = df['year_month'].tolist()

    texts = df['key_paragraphs'].tolist()

    # Embed once; the same vectors are handed to every fit below.
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    # Initialise UMAP from the rescaled PCA projection of the embeddings.
    pca_embeddings = rescale(PCA(n_components=5).fit_transform(embeddings))
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
        init=pca_embeddings,
    )

    vectorizer_model = CountVectorizer(stop_words='english', ngram_range=(1, 2))

    # First pass: let BERTopic merge topics automatically.
    topic_model = BERTopic(
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        top_n_words=15,
        nr_topics='auto',
    )
    topics, _ = topic_model.fit_transform(texts, embeddings)

    # Second pass: if the automatic result has more than 15 topics, refit
    # with an explicit cap of 15.
    num_topics = len(topic_model.get_topics())
    if num_topics > 15:
        topic_model = BERTopic(
            umap_model=umap_model,
            vectorizer_model=vectorizer_model,
            top_n_words=15,
            nr_topics=15,
        )
        topics, _ = topic_model.fit_transform(texts, embeddings)

    # Word document: one paragraph per topic with its keywords and size.
    doc = Document()
    for _idx, row in topic_model.get_topic_info().iterrows():
        topic_number = row['Topic']
        frequency = row['Count']
        keywords = [word for word, _ in topic_model.get_topic(topic_number)]
        doc.add_paragraph(f"Topic {topic_number}: {keywords}, Frequency: {frequency}")

    # Hierarchical clustering figure. The plain list `texts` is passed (not
    # the DataFrame column) so the documents line up positionally with the
    # fitted topics, exactly as in topics_over_time below.
    linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
    hierarchical_topics = topic_model.hierarchical_topics(texts, linkage_function=linkage_function)

    # The 15-word custom labels are shared by every figure of this model.
    topic_labels = topic_model.generate_topic_labels(nr_words=15)
    topic_model.set_topic_labels(topic_labels)
    fig_hierarchy = topic_model.visualize_hierarchy(
        hierarchical_topics=hierarchical_topics,
        custom_labels=True,
    )

    print("主题数量:", len(topic_model.get_topics()))

    # Topics-over-time figure. It stays None when this step fails, so that a
    # failure can neither raise NameError at save time nor cause the figure
    # of the previously processed file to be written into this folder.
    fig_over_time = None
    try:
        topics_over_time = topic_model.topics_over_time(
            docs=texts,
            timestamps=timestamps,
            nr_bins=9
        )
        fig_over_time = topic_model.visualize_topics_over_time(
            topics_over_time,
            custom_labels=True,
            width=1800,
            height=600,
        )
        print("成功生成时间序列图")
    except Exception as e:
        print("错误类型:", type(e))
        print("错误信息:", str(e))

    # Every input file gets its own output folder named after the file stem.
    base_name = os.path.splitext(os.path.basename(csv_file))[0]
    output_dir = os.path.join('./段落输出文件', base_name)
    os.makedirs(output_dir, exist_ok=True)

    doc.save(os.path.join(output_dir, 'topics_and_frequencies.docx'))
    fig_hierarchy.write_html(os.path.join(output_dir, 'hierarchy_clustering.html'))
    if fig_over_time is not None:
        fig_over_time.write_html(os.path.join(output_dir, 'over_time.html'))

    # The inter-topic distance map is best effort: report a failure and
    # carry on with saving the model instead of hiding it silently.
    try:
        fig_topics = topic_model.visualize_topics()
        fig_topics.write_html(os.path.join(output_dir, 'intertopic_distance.html'))
    except Exception as e:
        print("错误类型:", type(e))
        print("错误信息:", str(e))

    model_dir = os.path.join(output_dir, 'model')
    os.makedirs(model_dir, exist_ok=True)
    # With safetensors serialization `save_embedding_model` is a string
    # pointer to the embedding model, so pass the model path rather than the
    # loaded SentenceTransformer object.
    # NOTE(review): the path is relative; loading the saved model presumably
    # needs the same working directory - confirm.
    topic_model.save(
        model_dir,
        serialization="safetensors",
        save_ctfidf=True,
        save_embedding_model=local_model_path,
    )

文档名称： ./key words\result_Australia 澳大利亚.csv
文档长度： 150


Batches: 100%|██████████| 5/5 [00:06<00:00,  1.32s/it]
100%|██████████| 3/3 [00:00<00:00, 158.28it/s]


主题数量: 5
成功生成时间序列图
文档名称： ./key words\result_Germany 德国.csv
文档长度： 144


Batches: 100%|██████████| 5/5 [00:07<00:00,  1.52s/it]
100%|██████████| 2/2 [00:00<00:00, 231.68it/s]


主题数量: 4



k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



成功生成时间序列图
文档名称： ./key words\result_India 印度.csv
文档长度： 219


Batches: 100%|██████████| 7/7 [00:11<00:00,  1.66s/it]
100%|██████████| 1/1 [00:00<00:00, 111.40it/s]


主题数量: 3
成功生成时间序列图
文档名称： ./key words\result_Singapore 新加坡.csv
文档长度： 103


Batches: 100%|██████████| 4/4 [00:04<00:00,  1.13s/it]
100%|██████████| 1/1 [00:00<00:00, 143.16it/s]


主题数量: 3
成功生成时间序列图
文档名称： ./key words\result_UK 英国.csv
文档长度： 921


Batches: 100%|██████████| 29/29 [00:40<00:00,  1.39s/it]
100%|██████████| 13/13 [00:00<00:00, 160.94it/s]


主题数量: 15
成功生成时间序列图
文档名称： ./key words\result_USA 美国.csv
文档长度： 846


Batches: 100%|██████████| 27/27 [00:35<00:00,  1.30s/it]
100%|██████████| 9/9 [00:00<00:00, 164.18it/s]


主题数量: 11
成功生成时间序列图
