# In [None]:
import gensim
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import pandas as pd
import requests
import json
from gensim import corpora
from gensim.models import CoherenceModel
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer

# Download the NLTK resources requested by this script.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(_resource)

# This script follows the topic-modeling walkthrough at
# https://towardsdatascience.com/6-tips-to-optimize-an-nlp-topic-model-for-interpretability-20742f3047e2?gi=9d97408c65b1
# English stopwords, held in a set for fast membership tests.
stop_word_list = {*stopwords.words('english')}

# Download the JSON-lines dataset and pull out the first transcript.
# NOTE(review): `url` is not defined anywhere in the visible part of this
# file; it must be assigned before this point or the next line raises NameError.
response = requests.get(url, timeout=30)  # bounded wait instead of hanging forever
response.raise_for_status()  # fail loudly rather than parsing an HTTP error page as JSON
data = response.text.strip().splitlines()
# One JSON object per line; blank lines inside the payload are skipped.
comedy_data = [json.loads(line) for line in data if line.strip()]
df = pd.DataFrame(comedy_data, columns=['title', 'transcript'])
mitch = df['transcript'][0]

# mitch hedberg's comedy special
# Everything downstream iterates over `documents` and calls str methods on
# each item, so it has to be a collection of strings. Iterating a bare string
# would yield single characters, so a string transcript is split into one
# document per non-empty line.
# NOTE(review): confirm that line-level granularity is what is wanted here.
if isinstance(mitch, str):
    documents = [line for line in mitch.splitlines() if line.strip()]
else:
    documents = mitch

# Build the bigram finder from the whitespace-tokenized documents and apply
# a minimum-frequency filter of 2 to the candidate pairs.
finder = BigramCollocationFinder.from_documents(
    [document.split() for document in documents]
)
finder.apply_freq_filter(2)

# Association measures object; its `pmi` scorer is used on the finder below.
bigram_measures = BigramAssocMeasures()

# Function to filter bigrams based on noun structures
def bigram_filter(bigram):
    """Decide whether a word pair looks like a stopword-free noun phrase.

    The pair is POS-tagged with ``pos_tag``. It is accepted only when the
    first word is tagged ``JJ`` or ``NN``, the second word is tagged ``NN``,
    and neither word appears in ``stop_word_list``.

    Args:
        bigram: Sequence whose first two items are the words of the pair.

    Returns:
        True if the pair passes both the tag check and the stopword check,
        otherwise False.
    """
    tagged = pos_tag(bigram)

    # Reject early when the tag pattern is not (JJ|NN, NN).
    if tagged[0][1] not in ('JJ', 'NN'):
        return False
    if tagged[1][1] != 'NN':
        return False

    first_word, second_word = bigram[0], bigram[1]
    return not (first_word in stop_word_list or second_word in stop_word_list)

# Score bigrams by PMI and keep those above the threshold that also pass the
# noun-structure filter. The numeric comparison comes first so that
# bigram_filter (which runs the POS tagger) is only called for bigrams that
# already clear the PMI threshold; the resulting list is unchanged.
filtered_bigrams = [(bigram, score) for bigram, score in finder.score_ngrams(bigram_measures.pmi)
                    if score > 5 and bigram_filter(bigram)]

# Prepare bigram phrases for replacement: each pair becomes one space-joined string.
bigrams = [' '.join(bigram) for bigram, _ in filtered_bigrams]

# Function to replace bigrams and filter nouns
def replace_and_filter_ngrams(text):
    """Fuse known bigram phrases and reduce the text to its noun tokens.

    Each phrase in the module-level ``bigrams`` list is replaced in ``text``
    by its underscore-joined form (plain substring replacement). The text is
    then split on whitespace; tokens that are in ``stop_word_list`` or are
    two characters or shorter are dropped. The remaining tokens are
    POS-tagged and only those whose tag starts with ``NN`` are kept.

    Args:
        text: The document text to process.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty
        string).
    """
    merged = text
    for phrase in bigrams:
        merged = merged.replace(phrase, '_'.join(phrase.split()))

    candidates = []
    for token in merged.split():
        if len(token) <= 2 or token in stop_word_list:
            continue
        candidates.append(token)

    nouns = (token for token, tag in pos_tag(candidates) if tag.startswith('NN'))
    return ' '.join(nouns)

# Apply the bigram/noun filter to every non-blank document.
filtered_documents = [replace_and_filter_ngrams(doc) for doc in documents if doc.strip()]

# Build the vocabulary: English stopwords removed, terms must occur in at
# least two documents.
vectorizer = CountVectorizer(stop_words='english', min_df=2)
doc_term_matrix = vectorizer.fit_transform(filtered_documents)
terms = vectorizer.get_feature_names_out()

# CountVectorizer lower-cases the text and extracts tokens with its own
# pattern, so a plain str.split() of the documents yields tokens (capitalized
# words, words with attached punctuation) that never match the vocabulary and
# are silently dropped by doc2bow. Tokenize with the vectorizer's own analyzer
# so the corpus agrees with the vocabulary it was built from.
analyzer = vectorizer.build_analyzer()
tokenized_documents = [analyzer(doc) for doc in filtered_documents]
# Keep the string form in sync so that later `doc.split()` calls on
# filtered_documents see the same normalized tokens.
filtered_documents = [' '.join(tokens) for tokens in tokenized_documents]

# Create dictionary and corpus for Gensim. The dictionary holds exactly the
# vectorizer's vocabulary; doc2bow ignores tokens that are not in it.
dictionary = corpora.Dictionary([terms])
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_documents]

# Calculate coherence for a range of topic counts (k = 1..8).
# Tune chunksize and passes for better performance.

# The reference texts are identical for every candidate model, so tokenize once.
coherence_texts = [doc.split() for doc in filtered_documents]

coherence = []
for k in range(1, 9):
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, num_topics=k, id2word=dictionary, passes=40, iterations=200,
        chunksize=4000, eval_every=None,
        # Fixed seed (value is arbitrary) so runs are reproducible and the
        # scores for different k can be compared with each other.
        random_state=42,
    )

    # Use 'c_v' coherence for interpretability
    coherence_model = CoherenceModel(model=lda_model, texts=coherence_texts,
                                     dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    coherence.append((k, coherence_score))

# Report the coherence score recorded for each candidate topic count.
for num_topics, score in coherence:
    print(f"{num_topics} topics: Coherence score = {score}")

# Prepare and display the pyLDAvis view of the model.
# NOTE(review): `lda_model` is whatever model the loop above trained last
# (the largest topic count), not one chosen by coherence score.
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(vis)
