In [1]:
import pickle
import torch
from transformers import BertModel
from tqdm import tqdm 
from octis.preprocessing.preprocessing import Preprocessing
from sentence_transformers import SentenceTransformer
from Code.TNTM.TNTM_SentenceTransformer import TNTM_SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


current device: cuda
current device: cuda


In [2]:
torch.manual_seed(41)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'current device: {device}')

current device: cuda


In [3]:
with open("Data/Preprocessed_Data/octis_dataset_20ng.pickle", "rb") as f: 
  dataset_raw_20ng = pickle.load(f)

vocab = dataset_raw_20ng.get_vocabulary()  # alternative way of getting unique word list
corpus = dataset_raw_20ng.get_corpus()    # list of documents as words

In [4]:
# number of unique words
len(vocab)

3349

In [5]:
# number of docs and each doc is converted as inner list of words
print("totoal number of docs:", len(corpus))

totoal number of docs: 18846


In [6]:
# load the original docs as sentence format
with open("Data/Auxillary_Data/twng_textData.txt", "r") as file:
    data20ng_text = file.readlines()

data20ng_text[0]

"From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu> Subject: Pens fans reactions Organization: Post Office, Carnegie Mellon, Pittsburgh, PA Lines: 12 NNTP-Posting-Host: po4.andrew.cmu.edu    I am sure some bashers of Pens fans are pretty confused about the lack of any kind of posts about the recent Pens massacre of the Devils. Actually, I am  bit puzzled too and a bit relieved. However, I am going to put an end to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they are killing those Devils worse than I thought. Jagr just showed you why he is much better than his regular season stats. He is also a lot fo fun to watch in the playoffs. Bowman should let JAgr have a lot of fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final regular season game.          PENS RULE!!!  \n"

In [7]:
embeddings_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [8]:
# embeddings of unique words
unique_words_embeddings = [embeddings_model.encode(word) for word in tqdm(vocab)]
unique_words_embeddings = torch.Tensor(unique_words_embeddings)
unique_words_embeddings.shape

100%|██████████| 3349/3349 [00:10<00:00, 305.12it/s]
  unique_words_embeddings = torch.Tensor(unique_words_embeddings)


torch.Size([3349, 384])

In [9]:
# create sentence embeddings
# Load the model
embeddings_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

sentence_embedding = []
for line in tqdm(data20ng_text):
    line_embedded =embeddings_model.encode(line.lower())
    sentence_embedding.append(line_embedded)
    
sentence_embedding = torch.tensor(sentence_embedding)
sentence_embedding.shape

100%|██████████| 18846/18846 [01:31<00:00, 207.03it/s]


torch.Size([18846, 384])

In [16]:
#sentence_embedding = torch.randn(18846, 384)

In [10]:

tntm = TNTM_SentenceTransformer(
    n_topics = 20,
    save_path = f"Data/Auxillary_Data/{20}_topics",
    n_dims = 11,
    n_hidden_units = 200,
    n_encoder_layers = 2,
    enc_lr = 1e-3,
    dec_lr = 1e-3,
    n_epochs = 20,
    #batch_size = 128,
    batch_size = 256,
    dropout_rate_encoder = 0.3,
    prior_variance =  0.995, 
    prior_mean = None,
    n_topwords = 200,
    device = device, 
    validation_set_size = 0.2, 
    early_stopping = True,
    n_epochs_early_stopping = 10,
)

In [14]:
# from TNTM_SentenceTransformer import TNTM_SentenceTransformer
# tntm = TNTM_SentenceTransformer(
#       n_topics  = 10, 
#       save_path = f"Data/Auxillary_Data/{20}_topics", 
#       enc_lr    = 1e-3,
#       dec_lr    = 1e-3
#       )

In [11]:
result = tntm.fit(
              corpus              = corpus,
              vocab               = vocab, 
              word_embeddings     = unique_words_embeddings,
              document_embeddings = sentence_embedding)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  mus_init_ten = torch.tensor(mus_init).to(self.device)
  L_lower_init_ten = torch.tensor(L_lower_init).to(self.device)
  log_diag_init_ten = torch.tensor(log_diag_init).to(self.device)


Epoch nr 0: mean_train_loss = -3193.902099609375, mean_train_nl = -3216.8564453125, mean_train_kld = 22.954605102539062, elapsed time: 2.9492733478546143
Epoch nr 0: median_train_loss = -3099.603515625, median_train_nl = -3119.6005859375, median_train_kld = 26.644495010375977, elapsed time: 2.9492733478546143
Epoch nr 0: mean_val_loss = -3349.61669921875, mean_val_nl = -3377.052490234375, mean_val_kld = 27.43627166748047
Epoch nr 0: median_val_loss = -3162.52197265625, median_val_nl = -3189.9443359375, median_val_kld = 27.44211196899414
gradient norm: mean: 3580.283273556512, median: 2703.9600836174473, max: 14830.10748260662


Epoch nr 1: mean_train_loss = -3384.67626953125, mean_train_nl = -3409.788818359375, mean_train_kld = 25.11225128173828, elapsed time: 2.793236255645752
Epoch nr 1: median_train_loss = -3330.80078125, median_train_nl = -3356.02734375, median_train_kld = 25.349475860595703, elapsed time: 2.793236255645752
Epoch nr 1: mean_val_loss = -3461.30078125, mean_val_nl = 

In [15]:
# result
# result[0] is word distribution with shape #num_of_topics x num_unique_words
# result[1] is weights corresponding to each word with shape #num_of_topics x num_unique_words

### Test the Model

In [12]:

weights = result[1]
# normalize weights for each corresponding unique word
normalize_weights = weights/weights.sum(axis=1, keepdims=True)

In [13]:
# Select top-k words for each cluster
top_k = 10
top_words_per_cluster = []
for cluster_idx in range(normalize_weights.shape[0]):  # Iterate over clusters
    # Get weights for all words in the cluster
    word_weights = normalize_weights[cluster_idx]
    
    # Get indices of the top-k words
    top_k_indices = word_weights.argsort()[-top_k:]
    
    # Map indices to words using resulttt[0]
    top_words = [result[0][cluster_idx][i] for i in top_k_indices]
    top_words_per_cluster.append(top_words)

# Print the top-k words for each cluster
for cluster_idx, words in enumerate(top_words_per_cluster):
    print(f"Cluster {cluster_idx + 1}: {words}")

Cluster 1: ['to', 'having', 'have', 'an', 'a', 'with', 'and', 'of', 'the', 'as']
Cluster 2: ['sin', 'biblical', 'atheist', 'testament', 'orthodox', 'bible', 'atheism', 'christian', 'christianity', 'religion']
Cluster 3: ['air', 'physics', 'motor', 'wind', 'sky', 'shuttle', 'train', 'rocket', 'engine', 'propulsion']
Cluster 4: ['careful', 'take', 'save', 'go', 'proceed', 'control', 'away', 'block', 'flee', 'escape']
Cluster 5: ['na', 'sp', 'pointer', 'pp', 'm', 'p', 'y', 'gw', 'l', 'f']
Cluster 6: ['useless', 'care', 'medicine', 'infection', 'no', 'lack', 'neither', 'non', 'never', 'not']
Cluster 7: ['most', 'these', 'etc', 'such', 'significantly', 'either', 'another', 'like', 'widely', 'some']
Cluster 8: ['practical', 'participate', 'organize', 'capacity', 'integrate', 'qualify', 'incorporate', 'apply', 'accomplish', 'advanced']
Cluster 9: ['jeff', 'randy', 'tom', 'james', 'george', 'walker', 'walter', 'brian', 'christopher', 'bob']
Cluster 10: ['guy', 'anybody', 'person', 'friend', 'w

# BERTopic 
### using BERTopic to assing one of the above topic to a document  

In [14]:
from bertopic import BERTopic

### Topic_embeddings

In [15]:
# compute weighted sum: weights_in_each_cluster x vocab_embeddings
# np.dot(num_cluster x 3349, 3349 x embedding_dim) 
# output: num_cluster x embedding_dim
vocab_embeddings = np.array(unique_words_embeddings)

topic_embeddings = np.dot(normalize_weights, vocab_embeddings)
print(f"Topic Embeddings Shape: {topic_embeddings.shape}") 

Topic Embeddings Shape: (20, 384)


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Example: Document embeddings
document_embeddings = np.array(sentence_embedding)  

# Compute cosine similarity between document and topic embeddings
similarity_matrix = cosine_similarity(document_embeddings, topic_embeddings)  # shape (50, 20)
# Assign each document to the most similar topic
document_topics = similarity_matrix.argmax(axis=1)  # len(corpus)

# Print document-topic assignments
for doc_idx, topic_idx in enumerate(document_topics):
    print(f"Document {doc_idx + 1} is assigned to Topic {topic_idx + 1}")

Document 1 is assigned to Topic 8
Document 2 is assigned to Topic 19
Document 3 is assigned to Topic 8
Document 4 is assigned to Topic 8
Document 5 is assigned to Topic 3
Document 6 is assigned to Topic 2
Document 7 is assigned to Topic 1
Document 8 is assigned to Topic 19
Document 9 is assigned to Topic 16
Document 10 is assigned to Topic 20
Document 11 is assigned to Topic 8
Document 12 is assigned to Topic 8
Document 13 is assigned to Topic 16
Document 14 is assigned to Topic 8
Document 15 is assigned to Topic 20
Document 16 is assigned to Topic 8
Document 17 is assigned to Topic 3
Document 18 is assigned to Topic 5
Document 19 is assigned to Topic 16
Document 20 is assigned to Topic 4
Document 21 is assigned to Topic 1
Document 22 is assigned to Topic 8
Document 23 is assigned to Topic 2
Document 24 is assigned to Topic 10
Document 25 is assigned to Topic 1
Document 26 is assigned to Topic 8
Document 27 is assigned to Topic 20
Document 28 is assigned to Topic 8
Document 29 is assig

In [26]:
# since the lenght of top_words_per_cluster is not the same as corpus, here we create pseudo_document to be the same lenght as corpus
pseudo_documents = [" ".join(top_words_per_cluster[topic]) for topic in document_topics]
len(pseudo_documents)

18846

### Use BERTopic to find the topic

In [44]:
# bert topic modelling
topic_model = BERTopic()

# Fit BERTopic with pseudo-documents and document embeddings
topics, probs = topic_model.fit_transform(pseudo_documents, document_embeddings)

# Display topics
print(topic_model.get_topic_info())

     Topic  Count                                          Name  \
0       -1   7816  -1_accomplish_incorporate_integrate_organize   
1        0    545                0_place_center_location_ground   
2        1    381                            1_not_non_never_no   
3        2    302                         2_having_with_of_have   
4        3    149                                 3_na_gw_pp_sp   
..     ...    ...                                           ...   
321    320     10          320_such_significantly_these_another   
322    321     10               321_whom_whoever_anybody_fellow   
323    322     10         322_accomplish_advanced_qualify_apply   
324    323     10                        323_and_an_have_having   
325    324     10                324_whose_whom_whoever_anybody   

                                        Representation  \
0    [accomplish, incorporate, integrate, organize,...   
1    [place, center, location, ground, area, where,...   
2    [not, non, never