In [1]:
######################
# Author: Omar Masood
# Class: ISYE6740
# Final Project
# Title: Topic Modeling of Religious Texts - Measuring Coherence and Performance. 
#####################

In [54]:
import numpy as np 
import random
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import plotly.io as pio
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import clone
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
import gensim.utils as gu 
from gensim import corpora
from gensim.models import CoherenceModel, Phrases
from gensim.models.phrases import Phraser
from top2vec import Top2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pyLDAvis
import pyLDAvis.lda_model as ldavis
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import umap_ as UMAP
import umap
from hdbscan import HDBSCAN
import os
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
#import ssl

In [4]:
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
sw = stopwords.words('english')

In [12]:
#return each verse with the words in their lemmetized form.
def clean_docs(data):
    lem_text = []
    items = []

    cln_txt = [t.lemma_ for t in nlp(data) if (t not in sw) and (len(t)>=3)]
    lem = " ".join(cln_txt)
    cln = gu.simple_preprocess(lem, deacc=True)
    
    cleaned = " ".join(cln)
    
    return cleaned

In [20]:
def view_topics(lda_model, vector):
    terms = vector.get_feature_names_out()

    for i, comp in enumerate(lda_model.components_):
        vocab_dic = zip(terms, comp)
        sorted_words = sorted(vocab_dic, key=lambda x:x[1], reverse=True)[:20]
        print("Topic ", i)
        for t in sorted_words: print(t[0])

In [21]:
def getWords(data):
    words = []

    words = [gu.simple_preprocess(i, deacc=True) for i in data]
    
    return words

In [22]:
def getTopics(lda_model, vector):
    features = vector.get_feature_names_out()

    topics = []
    weights = []
    
    for topic_idx, topic in enumerate(lda_model.components_):
        top_features_ind = topic.argsort()[-20:]
        topics.append(features[top_features_ind])
        weights.append(topic[top_features_ind])
    
    return topics, weights

In [None]:
def LDATopicDiveristy(lda_model, vector):
    topics = getTopics(lda_model=lda_model, vector=vector)
    output = {'topics':topics}
    
    score = TopicDiversity(topk=5)
    
    return score.score(output)


In [53]:
#This function just get the pieces ready that are needed to calculate the topic coherence score for our LDA models. 
def setupCoherence(data, lda_model, vector):

    token_docs = [gu.simple_preprocess(d, deacc=True) for d in data]
    dictionary = corpora.Dictionary(token_docs)
    corpus = [dictionary.doc2bow(t) for t in token_docs]
    topics = getTopics(lda_model, vector)

    return token_docs, dictionary, corpus, topics[0]

In [44]:
#Calculate LDA topic coherence score using gensim package
def getCoherenceScore(data, lda_model, vector):
    token_docs, dictionary, corpus, topics = setupCoherence(data, lda_model, vector)

    cm = CoherenceModel(topics=topics, texts=token_docs, 
                        dictionary=dictionary, coherence='c_v')
    
    return cm.get_coherence()
    

In [26]:
#This is a wrapper function so that I can use the gensim topic coherence score in my RandomSearchCV
def scorer(lda_model, data):
    sw = stopwords.words('english')
    vector = TfidfVectorizer(stop_words=sw, max_features=1000)
    vect_text = vector.fit_transform(data)

    lda = clone(lda_model)
    topics = lda.fit_transform(vect_text)
    
    feature_names = vector.get_feature_names_out()

    token_docs, dictionary, corpus, topics = setupCoherence(data, lda_model, vector)
    cm = CoherenceModel(topics=topics, texts=token_docs, dictionary=dictionary, coherence='c_v')

    return cm.get_coherence()

In [28]:
##Borrowed code from here to calculate BERTopic topic coherence score, https://github.com/MaartenGr/BERTopic/issues/90
def BERTCoherence(brt_model, data, topics):
    vectorizer = brt_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    words = vectorizer.get_feature_names_out
    tokens = [analyzer(doc) for doc in data]

    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in brt_model.get_topic(topic)] 
                   for topic in range(len(set(topics))-1)]
    
    coherence_model = CoherenceModel(topics=topic_words, 
                                        texts=tokens, 
                                        corpus=corpus,
                                        dictionary=dictionary, 
                                        coherence='c_v')
    
    coherence = coherence_model.get_coherence()

    return coherence

In [29]:
def setupLDA(vect_text):

    lda_model = LatentDirichletAllocation(learning_method='online')
    topics = lda_model.fit_transform(vect_text)

    return lda_model, topics

In [None]:
#I borrowed the code from here: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
def plot_top_words(model, feature_names, n_top_words, title):
        fig, axes = plt.subplots(2, 4, figsize=(30, 15), sharex=True)
        axes = axes.flatten()

        for topic_idx, topic in enumerate(model.components_):
            top_features_ind = topic.argsort()[-n_top_words:]
            top_features = feature_names[top_features_ind]
            weights = topic[top_features_ind]

            ax = axes[topic_idx]
            ax.barh(top_features, weights, height=0.7)
            ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
            ax.tick_params(axis="both", which="major", labelsize=20)
            for i in "top right left".split():
                ax.spines[i].set_visible(False)
            fig.suptitle(title, fontsize=40)

        plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
        plt.show()

In [31]:
def Top2VecCoherence(data, topics):
    doc_tokens = [d.split() for d in data]

    vectorizer = CountVectorizer(ngram_range=(1,3),
                                 tokenizer=lambda x: x, 
                                 preprocessor=lambda x: x,
                                 max_df=0.9)
    
    X = vectorizer.fit_transform(doc_tokens)

    topic_word_lists = [list(words[:20]) for words in topics]

    dictionary = corpora.Dictionary(doc_tokens)

    coh_model = CoherenceModel(topics=topic_word_lists, 
                               texts=doc_tokens, 
                               dictionary=dictionary, 
                               coherence='c_v')

    return coh_model.get_coherence()

In [32]:
def getBestParams(lda_model, vect_text):
    batch = range(2000,10000,2000)
    learning = [1,10,50,100,500,1000]
    decay = [0.6,0.7,0.75,0.8]
    doc_top = [0.001, 0.01, 0.1, 1]
    n_comp = [1,2,3,4,5,7,10,15,20,25]

    parameters = {'n_components':n_comp,
                'doc_topic_prior':doc_top,
                'learning_decay':decay,
                'learning_offset':learning,
                'batch_size':batch}

    clf = RandomizedSearchCV(lda_model, param_distributions=parameters, cv=5, random_state=6740, scoring=scorer)
    search = clf.fit(vect_text)
    return search.best_params_

In [None]:
#Calculate the topic diversity of the BERTopic model
def calculateTopicDiversity(brt_model):
    n = len(brt_model.get_topics())

    topic_list = []
    for i in range(n-1):
        topic = brt_model.get_topic(i)
        words_list = [t[0].replace(" ", "_") for t in topic]
        topic_list.append(words_list)

    output = {'topics':topic_list}
    score = TopicDiversity(topk=5)
    return score.score(output)

In [None]:
#calculate the topic diversity of the Top2Vec model
def Top2VecTopicDiversity(topics):
    word_list = []
    
    for t in topics:
        test = [w.replace(" ","_") for w in t]
        word_list.append(test)

    output = {'topics':word_list}
    score = TopicDiversity(topk=5)

    return score.score(output)

In [None]:
##I needed this to download some stuff for the nltk package
# os.environ['NTLK_DATA'] = "Users/xxxxxx/Desktop/Spring 20XX/??????/Project/nltk_data/"
# nltk.data.path.append("Users/xxxxxx/Desktop/Spring 20XX/?????/Project/nltk_data/")
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

# nltk.download('punkt')
# nltk.download('stopwords')

In [34]:
random.seed(6740)

files = ['data/quran.csv', 'data/oldtestament.csv', 'data/newtesament.csv']
texts = [pd.read_csv(f) for f in files]

quran = texts[0]
oldt = texts[1]
newt = texts[2]

In [35]:
test_quran = quran.ayah_en
test_oldt = oldt.text
test_newt = newt.text

In [36]:
vector = TfidfVectorizer(stop_words=sw, max_features=1000,
                             ngram_range=(1,3))

clean_quran = test_quran.map(clean_docs)
qur_vec_text = vector.fit_transform(clean_quran)

clean_old_testament = test_oldt.map(clean_docs)
oldt_vec_text = vector.fit_transform(clean_old_testament)

clean_new_testament = test_newt.map(clean_docs)
newt_vec_text = vector.fit_transform(clean_new_testament)

In [17]:
##Commenting this part out - used RandomSearchCV to find the best parameters for our LDA model

#qur_best_params = getBestParams(new_lda_model, qur_vec_text)
# oldt_best_params = getBestParams(olt_lda_model, oldt_vec_text)
# # newt_best_params = getBestParams(newt_lda_model, newt_vec_text)
# # {'n_components': 7,
# #  'learning_offset': 500,
# #  'learning_decay': 0.7,
# #  'doc_topic_prior': 1,
# #  'batch_size': 4000}

In [42]:
#Best parameters for LDA model after tuning
new_lda_model = LatentDirichletAllocation(learning_method='online',learning_offset=500, n_components=7,
                                      learning_decay=0.7, doc_topic_prior=1, batch_size=4000)

In [49]:
qur_topics = new_lda_model.fit_transform(qur_vec_text)
topic_div = LDATopicDiveristy(new_lda_model, vector)
coherence = getCoherenceScore(clean_quran,new_lda_model,vector)
perplexity = new_lda_model.perplexity(qur_vec_text)

print(f'The scores for this LDA mode are as follows, Coherence: {coherence}, Topic Diversity: {topic_div}, and perplexity: {perplexity}')
#The scores for this LDA mode are as follows, Coherence: 0.39811689145911894, Topic Diversity: 0.6571428571428571, and perplexity: 2406.5657013769955

<class 'tuple'>
The scores for this LDA mode are as follows, Coherence: 0.508835159600508, Topic Diversity: 0.8, and perplexity: 2407.5811181354275


In [50]:
oldt_topic = new_lda_model.fit_transform(oldt_vec_text)
topic_div = LDATopicDiveristy(new_lda_model, vector)
coherence = getCoherenceScore(clean_old_testament,new_lda_model,vector)
perplexity = new_lda_model.perplexity(oldt_vec_text)

print(f'The scores for this LDA mode are as follows, Coherence: {coherence}, Topic Diversity: {topic_div}, and perplexity: {perplexity}')
#The scores for this LDA mode are as follows, Coherence: 0.38077568084480656, Topic Diversity: 0.5714285714285714, and perplexity: 1221.0900062361586

<class 'tuple'>
The scores for this LDA mode are as follows, Coherence: 0.5256238686035462, Topic Diversity: 0.6571428571428571, and perplexity: 1220.9917771332357


In [52]:
newtt_topic = new_lda_model.fit_transform(newt_vec_text)
topic_div = LDATopicDiveristy(new_lda_model, vector)
coherence = getCoherenceScore(clean_new_testament,new_lda_model,vector)
perplexity = new_lda_model.perplexity(newt_vec_text)

print(f'The scores for this LDA mode are as follows, Coherence: {coherence}, Topic Diversity: {topic_div}, and perplexity: {perplexity}')
#The scores for this LDA mode are as follows, Coherence: 0.30797002263842777, Topic Diversity: 0.6, and perplexity: 1940.0431470305832

<class 'tuple'>
The scores for this LDA mode are as follows, Coherence: 0.3029100953841114, Topic Diversity: 0.6571428571428571, and perplexity: 1940.52763345102


In [None]:
tf_feature_names = vector.get_feature_names_out()
plot_top_words(new_lda_model, tf_feature_names, 10, "Topics in LDA model")
# topics, weights1 = getTopics(new_lda_model, qur_vec)
# print(weights1)
# print(topics)

In [None]:
# tf_feature_names = oldt_vec.get_feature_names_out()
# plot_top_words(new_lda_model, tf_feature_names, 15, "Topics in LDA model")

In [None]:
# tf_feature_names = newt_vec.get_feature_names_out()
# plot_top_words(new_lda_model, tf_feature_names, 15, "Topics in LDA model")

In [None]:
# prep_data = ldavis.prepare(new_lda_model, qur_vec_text, qur_vec)
#prep_data = ldavis.prepare(new_lda_model, oldt_vec_text, oldt_vec)
# pyLDAvis.enable_notebook()
# pyLDAvis.display(prep_data)

# pyLDAvis.save_html(prep_data,'QuranLDA.html')
# prep_data = ldavis.prepare(new_lda_model, oldt_vec_text, oldt_vec)
# pyLDAvis.save_html(prep_data,'OLDLDA.html')
# prep_data = ldavis.prepare(new_lda_model, newt_vec_text, newt_vec)
# pyLDAvis.save_html(prep_data,'NEWLDA.html')

In [55]:
##BERTopic
os.environ["TOKENIZERS_PARALLELISM"] = "false"
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True)
umap_model = umap.UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False)
tuned_brt_model = BERTopic(language='english', embedding_model='all-MiniLM-L6-v2', n_gram_range=(1,3), 
                          umap_model=umap_model, hdbscan_model=hdbscan_model, top_n_words=5, nr_topics="auto")

# brt_model = BERTopic(language='english', embedding_model='all-MiniLM-L6-v2', n_gram_range=(1,2), 
#                           umap_model=umap_model, hdbscan_model=hdbscan_model)

# n_words = range(5,30,5)
# topic_size = range(100,500,50)



In [27]:
qur_tops, probs = tuned_brt_model.fit_transform(clean_quran)
coherence = BERTCoherence(tuned_brt_model, clean_quran, qur_tops)
topic_diversity = calculateTopicDiversity(tuned_brt_model)

topics_n = range(1,9)
n = len(tuned_brt_model.get_topics())
k = list(range(1,n-1))

tuned_brt_model.visualize_topics(topics=k).write_html("Visualizations/quran_intertopic_dist_map.html")
tuned_brt_model.visualize_barchart(topics=topics_n).write_html("Visualizations/quran_barchart.html")

print(f'There are {n} topics and the Coherence for the BERT Model is {coherence} and the Topic Diversity is {topic_diversity}')
#There are 109 topics and the Coherence for the BERT Model is 0.7682063071475818 and the Topic Diversity is 0.9037037037037037

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


There are 50 topics and the Coherence for the BERT Model is 0.7800520232897611 and the Topic Diversity is 0.963265306122449


In [28]:
oldt_tops, probs = tuned_brt_model.fit_transform(clean_old_testament)
coherence = BERTCoherence(tuned_brt_model, clean_old_testament, oldt_tops)
topic_diversity = calculateTopicDiversity(tuned_brt_model)

n = len(tuned_brt_model.get_topics())
k = list(range(1,n-1))

tuned_brt_model.visualize_topics(topics=k).write_html("Visualizations/old_testament_intertopic_dist_map.html")
tuned_brt_model.visualize_barchart(topics=topics_n).write_html("Visualizations/old_testament_barchart.html")

print(f'There are {n} topics and the Coherence for the BERT Model is {coherence} and the Topic Diversity is {topic_diversity}')
#There are 259 topics and the Coherence for the BERT Model is 0.8498131720508381 and the Topic Diversity is 0.8806201550387597

There are 15 topics and the Coherence for the BERT Model is 0.8171662854204647 and the Topic Diversity is 0.9


In [29]:
newt_tops, probs = tuned_brt_model.fit_transform(clean_new_testament)
coherence = BERTCoherence(tuned_brt_model, clean_new_testament, newt_tops)
topic_diversity = calculateTopicDiversity(tuned_brt_model)

n = len(tuned_brt_model.get_topics())
k = list(range(1,n-1))

tuned_brt_model.visualize_topics(topics=k).write_html("Visualizations/new_testament_intertopic_dist_map.html")
tuned_brt_model.visualize_barchart(topics=topics_n).write_html("Visualizations/new_testament_barchart.html")

print(f'There are {n} topics and the Coherence for the BERT Model is {coherence} and the Topic Diversity is {topic_diversity}')
#There are 105 topics and the Coherence for the BERT Model is 0.7924715016683221 and the Topic Diversity is 0.9442307692307692

There are 40 topics and the Coherence for the BERT Model is 0.7980990952545068 and the Topic Diversity is 0.9333333333333333


In [None]:
# for i in n_words:
#     for k in topic_size:
#         test_brt_model = BERTopic(language='english', embedding_model='all-MiniLM-L6-v2', n_gram_range=(1,2), 
#                           umap_model=umap_model, hdbscan_model=hdbscan_model, top_n_words=i, min_topic_size=k)
#         test_brt_model.fit_transform(clean_quran)
#         coherence = BERTCoh(test_brt_model, clean_quran)
#         print("For top_n_words: "+str(i)+" and min_topic_size: "+str(k)+" the coherence is: "+str(coherence))

In [30]:
hdbscan_params = {'min_cluster_size':10, 'metric':'euclidean', 'prediction_data':True, }
umap_params = {'n_neighbors':15, 'n_components':10, 'metric':'cosine', 'low_memory':False}

qur_t2v_model = Top2Vec(clean_quran,embedding_model='doc2vec', speed='deep-learn', 
                         min_count=25, hdbscan_args=hdbscan_params, umap_args=umap_params)

oldt_t2v_model = Top2Vec(clean_old_testament,embedding_model='doc2vec', speed='deep-learn',  
                         min_count=25, hdbscan_args=hdbscan_params, umap_args=umap_params)

newt_t2v_model = Top2Vec(clean_new_testament,embedding_model='doc2vec', speed='deep-learn', 
                         min_count=25, hdbscan_args=hdbscan_params, umap_args=umap_params)


2025-04-27 15:01:55,512 - top2vec - INFO - Pre-processing documents for training
2025-04-27 15:01:55,621 - top2vec - INFO - Creating joint document/word embedding
2025-04-27 15:02:30,312 - top2vec - INFO - Creating lower dimension embedding of documents
2025-04-27 15:02:32,035 - top2vec - INFO - Finding dense areas of documents
2025-04-27 15:02:32,292 - top2vec - INFO - Finding topics
2025-04-27 15:02:32,321 - top2vec - INFO - Pre-processing documents for training
2025-04-27 15:02:32,860 - top2vec - INFO - Creating joint document/word embedding
2025-04-27 15:04:58,281 - top2vec - INFO - Creating lower dimension embedding of documents
2025-04-27 15:05:01,427 - top2vec - INFO - Finding dense areas of documents
2025-04-27 15:05:02,669 - top2vec - INFO - Finding topics
2025-04-27 15:05:02,806 - top2vec - INFO - Pre-processing documents for training
2025-04-27 15:05:03,035 - top2vec - INFO - Creating joint document/word embedding
2025-04-27 15:05:46,696 - top2vec - INFO - Creating lower dim

In [31]:
qur_t2v_model.hierarchical_topic_reduction(50)
topics = qur_t2v_model.topic_words_reduced
td = Top2VecTopicDiversity(topics)
c = Top2VecCoherence(clean_quran, topics)

print(f'For this Top2Vec model the Topic Diversity is {td}, coherence is {c}')

Reducing topics: 100%|██████████| 85/85 [00:00<00:00, 280.66it/s]


For this Top2Vec model the Topic Diversity is 0.808, coherence is 0.38736377901991825


In [32]:
oldt_t2v_model.hierarchical_topic_reduction(30)
topics = oldt_t2v_model.topic_words_reduced
td = Top2VecTopicDiversity(topics)
c = Top2VecCoherence(clean_old_testament, topics)

print(f'For this Top2Vec model the Topic Diversity is {td}, coherence is {c}')

Reducing topics: 100%|██████████| 356/356 [00:14<00:00, 25.22it/s]


For this Top2Vec model the Topic Diversity is 0.9466666666666667, coherence is 0.5222067723212561


In [33]:
newt_t2v_model.hierarchical_topic_reduction(35)
topics = newt_t2v_model.topic_words_reduced
td = Top2VecTopicDiversity(topics)
c = Top2VecCoherence(clean_new_testament, topics)

print(f'For this Top2Vec model the Topic Diversity is {td}, coherence is {c}')

Reducing topics: 100%|██████████| 92/92 [00:00<00:00, 192.47it/s]


For this Top2Vec model the Topic Diversity is 0.8857142857142857, coherence is 0.41965162062590566


In [None]:
umap_args = {"n_neighbors": 15, "n_components": 2,"metric": "cosine", }
umap_model = umap.UMAP(**umap_args).fit_transform(qur_t2v_model.document_vectors)

# binary_top = np.array([gettopthree(t) for t in qur_t2v_model.doc_top])
# umap.plot.points(umap_model, labels=qur_t2v_model.doc_top)

x,y = umap_model[:,0], umap_model[:,1]
plt.scatter(x,y,c=qur_t2v_model.doc_top_reduced, 
            cmap='terrain', marker=".", s=8)
ax = plt.gca()
ax.set_facecolor('black')
plt.title("Top2Vec Model Quran (UMAP)")
plt.show()

In [None]:
umap_model = umap.UMAP(**umap_args).fit_transform(oldt_t2v_model.document_vectors)

#binary_top = np.array([gettopthree(t) for t in oldt_t2v_model.doc_top])
#umap.plot.points(umap_model, labels=binary_top, theme='fire')

x,y = umap_model[:,0], umap_model[:,1]
plt.scatter(x,y,c=oldt_t2v_model.doc_top_reduced, 
            cmap='bwr', marker=".", s=8)
ax = plt.gca()
ax.set_facecolor('black')
plt.title("Top2Vec Model Old Testament (UMAP)")
plt.show()

In [None]:
umap_model = umap.UMAP(**umap_args).fit_transform(newt_t2v_model.document_vectors)

#binary_top = np.array([gettopthree(t) for t in newt_t2v_model.doc_top])
#umap.plot.points(umap_model, labels=binary_top, theme='fire')

x,y = umap_model[:,0], umap_model[:,1]
plt.scatter(x,y,c=newt_t2v_model.doc_top_reduced, 
            cmap='Spectral', marker=".", s=8)
ax = plt.gca()
ax.set_facecolor('black')
plt.title("Top2Vec Model New Testament (UMAP)")
plt.show()

In [None]:
# hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=False)
# umap_model = umap.UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False)
# quran_brt_model = BERTopic(language='english', embedding_model='all-MiniLM-L6-v2', n_gram_range=(1,3), 
#                           umap_model=umap_model, hdbscan_model=hdbscan_model, top_n_words=5, nr_topics="auto")
# oldt_brt_model = BERTopic(language='english', embedding_model='all-MiniLM-L6-v2', n_gram_range=(1,3), 
#                           umap_model=umap_model, hdbscan_model=hdbscan_model, top_n_words=5, nr_topics="auto")
# newt_brt_model = BERTopic(language='english', embedding_model='all-MiniLM-L6-v2', n_gram_range=(1,3), 
#                           umap_model=umap_model, hdbscan_model=hdbscan_model, top_n_words=5, nr_topics="auto")

# qur_tops, probs = quran_brt_model.fit_transform(clean_quran)
# oldt_top, _ = oldt_brt_model.fit_transform(clean_old_testament)
# newt_tops,_ = newt_brt_model.fit_transform(clean_new_testament)

# q_o_matrix = cosine_similarity(quran_brt_model.topic_embeddings_, 
#                                oldt_brt_model.topic_embeddings_) 
# q_n_matrix = cosine_similarity(quran_brt_model.topic_embeddings_, 
#                                newt_brt_model.topic_embeddings_)

# col1 = []
# col2 = []
# col3 = []

# for i in range(2,12):
#     sim_topics1 = np.argmax(q_o_matrix[i])-1
#     sim2_topics = np.argmax(q_n_matrix[i])-1
#     print(i-1, sim_topics1, sim2_topics)
#     col1.append([k[0] for k in quran_brt_model.get_topic(i-1)])
#     col2.append([k[0] for k in oldt_brt_model.get_topic(sim_topics1)])
#     col3.append([k[0] for k in newt_brt_model.get_topic(sim2_topics)])

# table_data = {'quran':col1, 'old_t':col2, 'new_t': col3}
# df2 = pd.DataFrame(table_data)
# df2.to_latex('BURTtable.txt')


In [None]:
x = qur_t2v_model.topic_vectors_reduced
y = oldt_t2v_model.topic_vectors_reduced
z = newt_t2v_model.topic_vectors_reduced

sim_matrix = cosine_similarity(x,y)
sim_matrix2 = cosine_similarity(x,z)

In [None]:
col1 = []
col2 = []
col3 = []

for i in range(1,11):
    sim_topics1 = np.argmax(sim_matrix[i])-1
    sim2_topics = np.argmax(sim_matrix2[i])-1
    print(i, sim_topics1, sim2_topics)
    col1.append(qur_t2v_model.topic_words_reduced[i][0:10])
    col2.append(oldt_t2v_model.topic_words_reduced[sim_topics1][0:10])
    col3.append(newt_t2v_model.topic_words_reduced[sim2_topics][0:10])

table_data = {'quran':col1, 'old_t':col2, 'new_t': col3}
df2 = pd.DataFrame(table_data)
df2.to_latex('t2vtable.txt')

In [None]:
def findSimilarTopics(model1, model2, model3):
    x = model1.topic_vectors
    y = model2.topic_vectors
    z = model3.topic_vectors
    col1 = []
    col2 = []
    col3 = []
    sim_matrix = cosine_similarity(x,y)
    sim_matrix2 = cosine_similarity(x,z)

    for i in range(2,12):
        sim_topics1 = np.argmax(sim_matrix[i])-1
        sim2_topics = np.argmax(sim_matrix2[i])-1
        print(i, sim_topics1, sim2_topics)
        col1.append(model1.topic_words[i][0:10])
        col2.append(model2.topic_words[sim_topics1][0:10])
        col3.append(model3.topic_words[sim2_topics][0:10])

    table_data = {'model1':col1, 'model2':col2, 'model3': col3}
    return pd.DataFrame(table_data)
