In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch.utils.data import DataLoader, Dataset



In [None]:
# Project root directory. Set the CUSANUS_BASE_DIR environment variable to run
# the notebook on another machine; the original author's path is the fallback.
BASE_DIR = os.environ.get(
    'CUSANUS_BASE_DIR',
    '/Users/jessie/Documents/Projects/Cusanus_Topic_Modeling',
)
# All relative paths used later in the notebook are resolved against BASE_DIR.
os.chdir(BASE_DIR)

In [None]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch.utils.data import DataLoader, Dataset

# Load the preprocessed documents
testset_dir = 'data/testset'
documents = []

# os.listdir() returns entries in arbitrary order; sorting the names makes
# row i of the bag-of-words matrix refer to the same document on every run.
for file_name in sorted(os.listdir(testset_dir)):
    if file_name.endswith('.txt'):
        file_path = os.path.join(testset_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())

# Fail early with a clear message instead of an opaque vectorizer error.
if not documents:
    raise ValueError(f"No .txt documents found in {testset_dir!r}")

# Build the bag-of-words model with CountVectorizer (directly on the
# preprocessed text). max_df=0.95 drops terms that occur in more than 95% of
# the documents; min_df=2 drops terms that occur in fewer than 2 documents.
vectorizer = CountVectorizer(max_df=0.95, min_df=2)  # tune max_df / min_df as needed
X = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()
# Dense (n_documents, vocab_size) count matrix consumed by the dataset below.
bow_data = X.toarray()


In [None]:
class BOWDataset(Dataset):
    """Dataset that serves bag-of-words rows as float tensors.

    Args:
        bow_data: Array-like of per-document term counts; it is copied into a
            tensor and cast to float32.
    """

    def __init__(self, bow_data):
        counts = torch.tensor(bow_data)
        self.bow_data = counts.to(dtype=torch.float32)

    def __len__(self):
        """Return the number of entries along the first dimension."""
        n_docs = len(self.bow_data)
        return n_docs

    def __getitem__(self, idx):
        """Return the entry at ``idx`` wrapped in a dict under the 'bow' key."""
        row = self.bow_data[idx]
        return {'bow': row}

# Wrap the bag-of-words matrix in a dataset and serve it in shuffled
# mini-batches of 16 documents.
bow_dataset = BOWDataset(bow_data)
data_loader = DataLoader(
    dataset=bow_dataset,
    batch_size=16,
    shuffle=True,
)


In [None]:
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning import Trainer

class ETM(pl.LightningModule):
    """Embedded-topic-model-style autoencoder over bag-of-words vectors.

    An MLP encoder maps each bag-of-words vector to topic proportions. Each
    topic's distribution over the vocabulary is the softmax of the inner
    products between that topic's embedding and every word embedding; a
    document's word distribution is the proportion-weighted mixture of the
    topic distributions.

    Args:
        vocab_size: Number of vocabulary entries (width of a bag-of-words row).
        num_topics: Number of topics.
        embed_size: Dimension shared by the word and topic embeddings.
        hidden_size: Width of the encoder's hidden layer.
        dropout: Dropout probability applied after the hidden layer.
    """

    def __init__(self, vocab_size, num_topics, embed_size, hidden_size, dropout=0.2):
        super().__init__()
        self.num_topics = num_topics
        self.embed_size = embed_size
        self.hidden_size = hidden_size

        # Word embeddings
        self.word_embeddings = nn.Embedding(vocab_size, embed_size)

        # Topic embeddings
        self.topic_embeddings = nn.Embedding(num_topics, embed_size)

        # Encoder: bag-of-words -> topic proportions (softmax over topics)
        self.encoder = nn.Sequential(
            nn.Linear(vocab_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_topics),
            nn.Softmax(dim=-1)
        )

    def forward(self, bow):
        """Reconstruct a word distribution for each document.

        Args:
            bow: Float tensor of bag-of-words rows, shape (batch, vocab_size).

        Returns:
            Tuple ``(word_distr, doc_topic_distr)``: ``word_distr`` has shape
            (batch, vocab_size) and each row is a probability distribution
            over the vocabulary; ``doc_topic_distr`` has shape
            (batch, num_topics) and is the encoder's softmax output.
        """
        # Encode documents into topic proportions
        doc_topic_distr = self.encoder(bow)

        # Use the embedding matrix directly; it already lives on the module's
        # device, so no index tensor has to be built.
        topic_vectors = self.topic_embeddings.weight

        # Per-topic distribution over the vocabulary. The softmax is required:
        # raw inner products can be negative, so they are not probabilities
        # and taking their log yields NaN.
        topic_word_logits = torch.matmul(topic_vectors, self.word_embeddings.weight.t())
        topic_word_distr = torch.softmax(topic_word_logits, dim=-1)

        # A mixture of distributions with weights that sum to one is itself a
        # distribution over the vocabulary.
        word_distr = torch.matmul(doc_topic_distr, topic_word_distr)

        return word_distr, doc_topic_distr

    def training_step(self, batch, batch_idx):
        """Compute the reconstruction loss for one batch.

        Args:
            batch: Dict holding the bag-of-words tensor under the 'bow' key.
            batch_idx: Index of the batch (unused).

        Returns:
            Scalar loss tensor.
        """
        bow = batch['bow']
        word_distr, doc_topic_distr = self(bow)

        # Clamp before the log so an underflowed probability cannot give -inf.
        log_word_distr = torch.log(word_distr.clamp(min=1e-10))

        # KL-divergence loss. The target is the raw count vector, so this
        # equals the count-weighted negative log-likelihood plus a term that
        # does not depend on the model parameters.
        kl_loss = torch.nn.functional.kl_div(log_word_distr, bow, reduction="batchmean")
        return kl_loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=0.001)."""
        return torch.optim.Adam(self.parameters(), lr=0.001)


In [None]:
# Model hyperparameters
vocab_size = len(vocab)
num_topics = 5  # adjust to your needs
embed_size = 300
hidden_size = 128

# Seed the Python, NumPy and PyTorch random generators before the model is
# created so weight initialisation and batch shuffling are repeatable.
pl.seed_everything(42)

# Initialise the ETM model
etm_model = ETM(vocab_size, num_topics, embed_size, hidden_size)

# Train with the Lightning Trainer
trainer = Trainer(max_epochs=50)  # adjust the number of epochs as needed
trainer.fit(etm_model, data_loader)


In [None]:
# Extract the top keywords of every topic.
# A topic's score for a word is the inner product of the topic embedding with
# that word's embedding, so words are ranked by these scores. Sorting the raw
# topic embedding would only rank embedding dimensions, which are not
# vocabulary indices.
topic_vectors = etm_model.topic_embeddings.weight.cpu().detach().numpy()
word_vectors = etm_model.word_embeddings.weight.cpu().detach().numpy()
topic_word_scores = topic_vectors @ word_vectors.T  # (num_topics, vocab_size)

for topic_idx, word_scores in enumerate(topic_word_scores):
    top_word_indices = word_scores.argsort()[-10:][::-1]  # 10 best-scoring words, best first
    top_words = [vocab[i] for i in top_word_indices]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")
