# **Topic Models for Regular-Sized Documents**
**Daniel Voskergian, Rashid Jayouse and Malik Yousef**

---



**To perform topic modeling** --> A CSV file containing a pre-processed dataset is required. Pre-processing includes removing punctuation, numbers, stop-words and words shorter than n characters, stemming, etc. Each line contains one document.

*Note:* for embedding-based topic models, we use a non-stemmed dataset.

In [8]:
file_name = '/content/title+abstract_1.csv' #stemmed dataset

In [None]:
file_name_ns = '/content/title+abstract_2.csv' #not_stemmed dataset



---


# ***A) Algebraic Topic Models***

# **NMF - Non-negative Matrix Factorization**

In [None]:
!pip uninstall scikit-learn -y
!pip install scikit-learn==0.24

In [4]:
import pandas as pd

In [None]:
data = pd.read_csv(file_name, encoding = "ISO-8859-1", header = None)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

analyzer = TfidfVectorizer().build_analyzer()

# Override TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer with an overridable analyzer hook.

    Despite the name, no stemming happens here: the analyzer built by the
    parent class is returned unchanged, because the input corpus is expected
    to be stemmed already. The subclass only exists as the place to plug in
    extra per-token processing if that ever becomes necessary.
    """

    def build_analyzer(self):
        """Return the callable that converts one raw document into tokens."""
        # Zero-argument super() resolves relative to this class, so an
        # analyzer defined on TfidfVectorizer itself is never skipped.
        return super().build_analyzer()

In [None]:
# Build the TF-IDF document-term matrix from the first CSV column.
# min_df=50 keeps only terms that occur in at least 50 documents.
vectorizer = StemmedTfidfVectorizer(min_df=50)
matrix = vectorizer.fit_transform(data[0])
# Dense copy used only to preview the weights.
# NOTE(review): toarray() materialises the whole matrix in memory; confirm this
# is acceptable for the corpus size. get_feature_names() is the accessor of the
# scikit-learn 0.24 release pinned at the top of this section.
words_df = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names())
words_df.head()

In [None]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF matrix into 20 topics. The seed is fixed so repeated
# runs produce the same topics, matching the seeded LSI model in this notebook.
model = NMF(n_components=20, random_state=42)
model.fit(matrix)

In [None]:
n_words = 20
feature_names = vectorizer.get_feature_names()

# Print the strongest terms of every NMF topic and keep a short label per
# topic, made of its three heaviest terms.
topic_list = []
for topic_idx, topic in enumerate(model.components_):
    # Term indices ordered from heaviest to lightest, cut to n_words.
    strongest = topic.argsort()[::-1][:n_words]
    top_features = [feature_names[i] for i in strongest]

    label = '_'.join(top_features[:3])
    topic_list.append(f"topic_{label}")

    top_n = ' '.join(top_features)
    print(f"Topic {topic_idx}: {top_n}")

In [17]:
n_words = 20
feature_names = vectorizer.get_feature_names()

# Write the n_words strongest terms of every topic, one term per line, topics
# back to back. The encoding is set explicitly because the corpus is read as
# ISO-8859-1 and may contain non-ASCII terms; the platform default encoding is
# not portable. topic_list is deliberately not reset here, so the labels built
# by the previous cell remain available.
with open('NMF_topics_words.txt', 'w', encoding='utf-8') as f:
    for topic in model.components_:
        strongest = topic.argsort()[::-1][:n_words]
        f.writelines(f"{feature_names[i]}\n" for i in strongest)

In [18]:
# Project the documents onto the fitted NMF topics and export the resulting
# document-topic weights to an Excel sheet (row index omitted).
# NOTE(review): NMF weights are presumably not normalised to sum to 1 per
# document — confirm before treating them as probabilities downstream.
amounts = model.transform(matrix) 
probs_df=pd.DataFrame(amounts)
probs_df.to_excel(r'NMF_topics_distibutions.xlsx', sheet_name='NMF', index=False)

# **LSI - Latent Semantic Analysis/Indexing**

In [20]:
import pandas as pd

In [21]:
data = pd.read_csv(file_name, header = None) 

In [22]:
# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [23]:
vect =TfidfVectorizer() # to play with. min_df,max_df,max_features etc...

In [24]:
vect_text=vect.fit_transform(data[0])

In [25]:
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=1000, random_state=42)

In [26]:
lsa_top=lsa_model.fit_transform(vect_text)

In [None]:
# most important words for each topic
vocab = vect.get_feature_names()
import codecs

# Two files are produced from the same ranking:
#   LSIreadme.txt    - one word per line, topics back to back
#   LSI_topics_words - one line per topic, words separated by spaces
# Both handles are owned by the with-statement, so they are flushed and
# closed even if an error interrupts the loop.
with open('LSIreadme.txt', 'w', encoding='utf-8') as f, \
        codecs.open('LSI_topics_words', 'w', 'utf-8') as topics_file:
    for i, comp in enumerate(lsa_model.components_):
        # 20 terms with the largest component weight, strongest first.
        ranked = sorted(zip(vocab, comp), key=lambda pair: pair[1], reverse=True)[:20]
        top_words = [pair[0] for pair in ranked]

        print("Topic "+str(i)+": ")
        for word in top_words:
            f.write(word + '\n')
            print(word, end=" ")

        # Every word, including the last, is followed by a space so the line
        # layout of previously generated files is preserved.
        topics_file.write(''.join(word + ' ' for word in top_words) + '\n')

        print("\n")


# **FLSA-W - Fuzzy Latent Semantic Analysis**

In [None]:
pip install FuzzyTM

In [None]:
from FuzzyTM import FLSA_W

In [None]:
import pandas as pd

In [None]:
# The CSV has no header row (one document per line). header=None keeps the
# first document as data instead of consuming it as column names, and names
# the text column 0 as in the other sections of this notebook.
data = pd.read_csv(file_name, header=None)

In [None]:
import nltk
nltk.download('punkt')

In [None]:
# Tokenise the text column. It is selected by position, which works whether or
# not the frame was read with a header row and avoids an integer-key lookup on
# a label-indexed row (deprecated by pandas).
data['tokenized_sents'] = data.iloc[:, 0].map(nltk.word_tokenize)

In [None]:
data['tokenized_sents'].values.tolist()

In [None]:
flsaW = FLSA_W(input_file = data['tokenized_sents'].values.tolist(), num_topics=20, num_words=20)

In [None]:
pwgt, ptgd = flsaW.get_matrices()

In [None]:
topics = flsaW.show_topics(representation='words', num_words=20)

In [None]:
# Write up to 20 words for each of the first 20 topics, one word per line,
# topics back to back; the words are echoed to the cell output with a blank
# gap before each topic. Slicing (rather than fixed indices) tolerates a model
# that returns fewer topics or words, and the explicit encoding keeps the
# output independent of the platform default.
with open('FLSA_topics_words.txt', 'w', encoding='utf-8') as f:
    for topic_words in topics[:20]:
        print("\n")
        for word in topic_words[:20]:
            print(word)
            f.write(word + '\n')
print(topics[0][0])

In [None]:
probs_df=pd.DataFrame(ptgd)
probs_df.to_excel(r'FLSA_topics_distibutions.xlsx', sheet_name='FLSA', index=False)



---


# ***B) Probabilistic Topic Models***

# **PLSA - Probabilistic Latent Semantic Analysis**

In [None]:
import sys
import matplotlib.pyplot as plt

In [None]:
%matplotlib notebook

In [None]:
sys.path.append('..')

In [None]:
!pip install plsa

In [None]:
from plsa import Corpus, Pipeline, Visualize
from plsa.pipeline import DEFAULT_PIPELINE
from plsa.algorithms import PLSA

In [None]:
import nltk
nltk.download('omw-1.4')

In [None]:
!pip install preprocessor

In [None]:
# NOTE: some parameters of the Pipeline class may need to be changed.
# Build the preprocessing pipeline from the library's default steps and
# display it.
pipeline = Pipeline(* DEFAULT_PIPELINE)
pipeline

In [None]:
corpus = Corpus.from_csv(file_name, pipeline)
corpus

In [None]:
n_topics = 20

In [None]:
plsa = PLSA(corpus, n_topics, True)
plsa

In [None]:
result = plsa.fit()
plsa

In [None]:
probs= result.topic_given_doc

In [None]:
# Write up to 20 words for each of the first 20 topics, one word per line.
# Each entry of a topic is indexed as (word, ...) and only the word is kept.
# Slicing avoids an IndexError when the fit yields fewer topics or words than
# the fixed bounds, and the encoding is explicit so the file does not depend
# on the platform default.
with open('PLSA_topics_words.txt', 'w', encoding='utf-8') as f:
    for topic_words in result.word_given_topic[:20]:
        print("\n")
        for entry in topic_words[:20]:
            print(entry[0])
            f.write(entry[0] + '\n')

In [None]:
import pandas as pd
probs_df=pd.DataFrame(probs)
probs_df.to_excel(r'PLSA_topics_distibutions.xlsx', sheet_name='PLSA', index=False)

# **CTM - Correlated Topic Model**

In [None]:
!pip install --upgrade pip
!pip install tomotopy

In [None]:
import tomotopy as tp

In [None]:
# k = number of topics
mdl = tp.CTMModel(k=20)
# Each line of the corpus is one document, tokenised on whitespace. The file
# is read inside a with-block so the handle is closed deterministically.
with open(file_name, encoding="ISO-8859-1") as stemmed_docs:
    for line in stemmed_docs:
        mdl.add_doc(line.strip().split())

# Train in steps of 10 iterations and report the per-word log-likelihood
# after each step.
for i in range(0, 100, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

In [None]:
# Show the topic-word distribution list of every topic (the raw value
# returned by get_topic_words, limited to 20 entries).
for k in range(mdl.k):
    print('Top 20 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=20))  # top_n = number of words per topic

In [None]:
# Showing only words in each topic
for k in range(mdl.k):
    print()
    # Fetch the ranking once per topic instead of once per printed word; the
    # first element of each entry is the word itself.
    for entry in mdl.get_topic_words(k, top_n=20):
        print(entry[0])

In [None]:
# Inference: estimate a topic distribution for every document.
# Documents are read from file_name, the stemmed corpus the model was trained
# on; tokens from the non-stemmed file would largely miss the model's
# vocabulary. NOTE(review): confirm that the stemmed corpus is the intended
# input for the exported distributions.
topic_distribution = []
with open(file_name, encoding="ISO-8859-1") as stemmed_docs:
    for line in stemmed_docs:
        doc_inst = mdl.make_doc(line.strip().split())
        # infer() returns a pair; the first element is the topic distribution.
        doc_topics, _ = mdl.infer(doc_inst)
        topic_distribution.append(doc_topics)

In [None]:
# Saving topic distribution over documents in excel sheet
import pandas as pd
probs_df=pd.DataFrame(topic_distribution)
probs_df.to_excel(r'ts_CorrelatedTM_distibutions_T2.xlsx', sheet_name='CorrelatedTM', index=False)



---


# ***C) Embedding-based Topic Models***

# **BERTopic**

In [None]:
pip install --upgrade joblib==1.1.0; 
#Restart Runtime after executing this cell

In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic

In [None]:
import pandas as pd 
data = pd.read_csv(file_name_ns, header = None) #write header name if exists

In [None]:
# Main model: per-document topic probabilities, 20 words per topic, clusters
# of at least 50 documents. Pass nr_topics=20 here to cap the topic count
# during fitting instead of reducing afterwards.
topic_model = BERTopic(calculate_probabilities=True, top_n_words=20, min_topic_size=50)
# Alternative model with a larger sentence-transformer. The model name has to
# be given as embedding_model: BERTopic's first positional parameter is
# `language`, so a positional model name would be read as a language.
topic_model_large = BERTopic(embedding_model="all-mpnet-base-v2")

In [None]:
topics, probs = topic_model.fit_transform(data[0])

In [None]:
all_topics = topic_model.get_topics()
print(len(all_topics))

In [None]:
frequency = topic_model.get_topic_freq()
frequency

In [None]:
# Further reduce topics (if needed)
topic_model.reduce_topics(data[0], nr_topics=20)

In [None]:
topic_model.visualize_barchart(n_words=20)

In [None]:
# Ask the model for its current topics so the export stays correct after
# reduce_topics() has changed their number. Topic -1 collects outliers and is
# not written; all other topics are written in ascending id order, up to 20
# words each, one word per line.
current_topics = topic_model.get_topics()
with open('BERTopic_topic_words.txt', 'w', encoding='utf-8') as f:
    for topic_id in sorted(current_topics):
        if topic_id == -1:
            continue
        print("\n")
        for entry in current_topics[topic_id][:20]:
            print(entry[0])
            f.write(entry[0] + '\n')

In [None]:
# Prefer the probabilities stored on the model: they are remapped when topics
# are reduced, whereas `probs` keeps the columns of the initial fit.
# NOTE(review): verify that the installed BERTopic version exposes
# `probabilities_`; when it does not, the original `probs` is exported.
current_probs = getattr(topic_model, "probabilities_", None)
if current_probs is None:
    current_probs = probs
probs_df=pd.DataFrame(current_probs)
probs_df.to_excel(r'BERTopics_topics_distributions.xlsx', sheet_name='BERTopics', index=False)

# **Top2Vec**

In [None]:
!pip install top2vec --no-cache-dir --no-binary :all:

In [None]:
import numpy as np 
import pandas as pd 
import json
import os
import ipywidgets as widgets
from IPython.display import clear_output, display
from top2vec import Top2Vec
import pandas as pd

In [None]:
# The CSV has no header row. header=None keeps the first line as a document
# and names the text column 0, which is how the following cells address it
# (metadata_df[0]).
metadata_df = pd.read_csv(file_name_ns, header=None)

In [None]:
top2vec = Top2Vec(documents=metadata_df[0].values.tolist(), speed="learn", workers=4)

In [None]:
top2vec.get_num_topics()

In [None]:
topic_mapping = top2vec.hierarchical_topic_reduction(num_topics=20)
topic_mapping

In [None]:
# Build one row of 20 scores per document, with column j holding the score of
# reduced topic j. get_documents_topics() returns the topics of a document
# ranked by score, so each score is placed at the position of its topic id
# instead of being stored in ranking order.
# NOTE(review): assumes reduced topic ids are 0..19 — confirm against Top2Vec.
topic_distribution=[]
for doc_id in range(len(metadata_df)):
    topic_nums, topic_scores, _, _ = top2vec.get_documents_topics([doc_id], reduced=True, num_topics=20)
    row = [0.0] * 20
    for topic_num, score in zip(topic_nums[0].tolist(), topic_scores[0].tolist()):
        row[topic_num] = score
    topic_distribution.append(row)

In [None]:
# Size the export from the model rather than from a fixed topic count, and
# fetch the word lists once instead of on every loop iteration.
# NOTE(review): these are the original (non-reduced) topics, while the exported
# distributions use the 20 reduced topics — confirm which set is intended.
n_found = top2vec.get_num_topics()
all_topic_words = top2vec.get_topics(n_found, reduced=False)[0]
with open('T2v1_topics_words.txt', 'w', encoding='utf-8') as f:
    for topic_words in all_topic_words:
        print("\n")
        # Up to 20 words per topic, one per line.
        for word in topic_words[:20]:
            print(word)
            f.write(word + '\n')

In [None]:
probs_df=pd.DataFrame(topic_distribution)
probs_df.to_excel(r'T2V_topics_distibutions.xlsx', sheet_name='T2V', index=False)

# **CombinedTM**

In [None]:
pip install -U contextualized_topic_models

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.data_preparation import bert_embeddings_from_file
import pandas as pd

In [None]:
qt = TopicModelDataPreparation("all-mpnet-base-v2")

In [None]:
list_of_preprocessed_documents = pd.read_csv(file_name, header = None)[0].values.tolist()

In [None]:
list_of_unpreprocessed_documents = pd.read_csv(file_name_ns, header = None)[0].values.tolist()

In [None]:
training_dataset = qt.fit(text_for_contextual=list_of_unpreprocessed_documents, text_for_bow=list_of_preprocessed_documents)

In [None]:
# NOTE(review): contextual_size=768 presumably matches the embedding width of
# the model passed to TopicModelDataPreparation — confirm if that model changes.
ctm = CombinedTM(bow_size=len(qt.vocab), contextual_size=768, n_components=20) # 20 topics

In [None]:
ctm.fit(training_dataset) # run the model

In [None]:
# Fetch the 20-word lists once; looking them up inside the loops would
# recompute them for every printed and written word.
top_words = ctm.get_topics(20)
with open('CombinedTM_topics_words.txt', 'w', encoding='utf-8') as f:
    for k in range(20):
        # One word per line, topics back to back.
        for word in top_words[k][:20]:
            print(word)
            f.write(word + '\n')

# **EmbeddedTM**

In [None]:
pip install -U embedded_topic_model

In [None]:
from embedded_topic_model.utils import preprocessing
import json

# Load the dataset from a JSON file. Every record is indexed by the
# 'List(Term as String)' key to obtain its document.
corpus_file = '/content/abstract_full_filtered_no_stopwords.json'
# The with-block closes the file as soon as parsing is done; JSON text is
# decoded as UTF-8 regardless of the platform default.
with open(corpus_file, 'r', encoding='utf-8') as corpus_handle:
    documents_raw = json.load(corpus_handle)
documents = [document['List(Term as String)'] for document in documents_raw]

# Preprocessing the dataset; only the vocabulary and the training split are
# kept, the remaining return value is discarded.
vocabulary, train_dataset, _, = preprocessing.create_etm_datasets(
    documents, 
    min_df=0.01, 
    max_df=0.75, 
    train_size=0.85, 
)

In [None]:
from embedded_topic_model.utils import embedding

# Training word2vec embeddings
embeddings_mapping = embedding.create_word2vec_embedding_from_dataset(documents)

In [None]:
from embedded_topic_model.models.etm import ETM

# Training an ETM instance
etm_instance = ETM(
    vocabulary,
    embeddings=embeddings_mapping, # You can pass here the path to a word2vec file or
                                   # a KeyedVectors instance
    num_topics=20,
    num_words=20,
    epochs=300,
    debug_mode=True,
    train_embeddings=False, # Optional. If True, ETM will learn word embeddings jointly with
                            # topic embeddings. By default, is False. If 'embeddings' argument
                            # is being passed, this argument must not be True
)

etm_instance.fit(train_dataset)

In [None]:
topics = etm_instance.get_topics(20)

In [None]:
# Write up to 20 words for each of the first 20 topics, one word per line,
# topics back to back. Slicing tolerates shorter results than the fixed
# bounds, and the explicit encoding keeps the file independent of the
# platform default.
with open('ETM_topics_words.txt', 'w', encoding='utf-8') as f:
    for topic_words in topics[:20]:
        for word in topic_words[:20]:
            print(word)
            f.write(word + '\n')

# **LDA2VEC**