# get_LDA_topic_coherence.ipynb

This notebook:
* Fits LDA topic models over a range of topic counts and calculates the `c_v` coherence score of each model's topics.

In [None]:
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import pandas as pd

nltk.download('stopwords')

SEED = 42  # seed constant for this notebook

# Build the working frame in one chain:
#   1. read the feature table,
#   2. keep only the first row seen for each show_uri,
#   3. discard every column whose name contains "Topic" (old topic
#      distributions, which are redone in this file),
#   4. replace missing transcripts with empty strings.
df = (
    pd.read_csv("./csv/df-all-features.csv")
    .drop_duplicates(subset="show_uri", keep="first")
    .pipe(lambda frame: frame.drop(columns=frame.columns[frame.columns.str.contains("Topic")]))
    .assign(transcript=lambda frame: frame["transcript"].fillna(""))
)

# One raw transcript string per remaining row; show the first as a sanity check.
docs = list(df["transcript"])
print(docs[0])

# Tokenizer data: recent NLTK releases make word_tokenize load the
# `punkt_tab` resource rather than the pickled `punkt` models, so fetch it
# here as well; otherwise tokenization fails with a LookupError on a fresh
# environment.
nltk.download('punkt_tab')

# remove nltk stopwords from docs
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Lowercase and tokenize every raw transcript.
tokenized_documents = [word_tokenize(doc.lower()) for doc in docs]

# Keep purely alphanumeric tokens that are not NLTK English stop words.
processed_docs = [
    [word for word in doc if word.isalnum() and word not in stop_words]
    for doc in tokenized_documents
]

# Backward-compatible alias: this cell has always left `docs` bound to the
# token lists (same object as `processed_docs`). Prefer `processed_docs` in
# new code so the name does not silently change type from str to list.
docs = processed_docs

# Gensim vocabulary and bag-of-words corpus built from the token lists.
dictionary = Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Document-term count matrix for scikit-learn, built from the re-joined
# tokens. The vectorizer additionally applies its own 'english' stop-word
# list and lowercasing.
vectorizer = CountVectorizer(analyzer='word', stop_words='english', lowercase=True)
X = vectorizer.fit_transform([' '.join(doc) for doc in processed_docs])

# Sweep over candidate topic counts: fit one LDA model per count on the
# document-term matrix X and record the c_v coherence of its topics.
COHERENCE_SCORES = []  # one score per entry of N_TOPICS, in the same order
N_TOPICS = [40, 60, 80, 100, 120, 140, 160]
TOP_N_WORDS = 10  # number of highest-weighted words used to describe each topic

# The vocabulary is fixed once the vectorizer has been fit, so look it up a
# single time instead of on every iteration.
feature_names = vectorizer.get_feature_names_out()

for n_topics in N_TOPICS:

    # Use the notebook-level SEED rather than a second hard-coded literal so
    # the seed only has to be changed in one place.
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=SEED)
    lda.fit(X)

    # For each topic, take the indices of its weights in descending order
    # and keep the first TOP_N_WORDS, mapped back to vocabulary terms.
    topics = [
        [feature_names[i] for i in topic.argsort()[::-1][:TOP_N_WORDS]]
        for topic in lda.components_
    ]

    print(topics)

    # Coherence is scored against the tokenized documents and the gensim
    # dictionary built from them.
    coherence_model_lda = CoherenceModel(topics=topics,
                                         texts=processed_docs,
                                         dictionary=dictionary,
                                         coherence='c_v')

    coherence_lda = coherence_model_lda.get_coherence()
    COHERENCE_SCORES.append(coherence_lda)

    print(f'Coherence Score: {coherence_lda}')

In [None]:
# Show the swept topic counts and their coherence scores, one list per line
# (the two lists are index-aligned).
print(N_TOPICS, COHERENCE_SCORES, sep="\n")