In [6]:
import gensim
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import pandas as pd
import requests
import json
from gensim import corpora
from gensim.models import CoherenceModel
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
import csv

# Fetch the NLTK resources this script asks for (each call is a no-op once
# the package is already present locally).
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

# English stop words as a set so membership tests are O(1).
stop_word_list = set(stopwords.words('english'))

# Load the data and select a single transcript.
# NOTE(review): `url` is not defined in this cell; presumably it is set
# earlier in the session — confirm.
response = requests.get(url, timeout=30)  # never hang forever on a dead host
response.raise_for_status()  # fail on HTTP errors instead of parsing an error page
data = response.text.strip().splitlines()
# One JSON object per line; skip blank lines so json.loads never sees ''.
comedy_data = [json.loads(line) for line in data if line.strip()]
df = pd.DataFrame(comedy_data, columns=['title', 'transcript'])
if df.empty:
    raise ValueError("No transcripts were found in the downloaded data.")

# Use only the first transcript (or specify by position)
transcript = df['transcript'].iloc[0]

# If transcript is a list, join it into a single string.
# NOTE(review): joining with a space means list items only become separate
# documents below if they themselves contain '\n' — confirm the data shape.
if isinstance(transcript, list):
    transcript = " ".join(transcript)

# One document per non-blank line of the transcript, surrounding whitespace removed.
documents = [joke.strip() for joke in transcript.split('\n') if joke.strip()]

# Simplified tokenization, filtering, and POS tagging
def preprocess_text(text):
    """Return the nouns found in *text* as one space-separated string.

    The text is split on whitespace; tokens of two characters or fewer and
    stop words are dropped, the remaining tokens are POS-tagged, and only
    tokens whose tag starts with ``NN`` are kept.

    Stop-word matching is done on a normalised copy of each token
    (lower-cased, curly apostrophes straightened, surrounding punctuation
    stripped) so that e.g. ``The``, ``him.”`` and ``don’t`` are recognised.
    The tokens that survive are returned unmodified.

    Args:
        text: Raw text of one document.

    Returns:
        The surviving noun tokens joined by single spaces ('' if none).
    """
    # Characters removed from both ends of a token before the stop-word lookup.
    edge_punctuation = '.,!?;:"\'()[]{}…“”‘’-'

    tokens = []
    for word in text.split():
        if len(word) <= 2:
            continue
        # Normalise for the lookup only; the original token is what gets kept.
        lookup_key = word.replace('’', "'").strip(edge_punctuation).lower()
        if lookup_key in stop_word_list:
            continue
        tokens.append(word)

    # NOTE(review): the tagger sees the tokens after stop words were removed,
    # i.e. without their original sentence context.
    pos_tags = pos_tag(tokens)
    return ' '.join(word for word, pos in pos_tags if pos.startswith('NN'))

# Reduce every document that is non-empty after stripping to its noun-only text.
filtered_documents = list(map(preprocess_text, filter(str.strip, documents)))

# Whitespace tokenization with a minimum token length
def tokenize_and_filter(doc):
    """Split *doc* on whitespace and keep tokens of three or more characters."""
    return [token for token in doc.split() if len(token) >= 3]

tokenized_docs = list(map(tokenize_and_filter, filtered_documents))

# Build the vocabulary, then prune it with filter_extremes
# (no_below=1, no_above=0.8).
dictionary = corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes(no_below=1, no_above=0.8)

# Bag-of-words vectors, keeping only the non-empty ones; token lists are
# likewise reduced to the non-empty ones. The two filters are independent
# and both shift positions, so an index into `corpus` is not an index into
# `documents`.
corpus = [bow for bow in map(dictionary.doc2bow, tokenized_docs) if bow]
tokenized_docs = [tokens for tokens in tokenized_docs if tokens]

# Calculate coherence for a range of topic counts and keep the best model
best_coherence_score = float('-inf')  # any real score beats this, whatever the metric's range
best_k = None  # Number of topics of the best model
best_lda_model = None  # The best LDA model seen so far

for k in range(23, 58):  # Adjust the range for fewer topics if needed
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, num_topics=k, id2word=dictionary, passes=80, iterations=240,
        chunksize=70, eval_every=None
    )

    coherence_model = CoherenceModel(model=lda_model, texts=tokenized_docs,
                                     dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print(f"{k} topics: Coherence score = {coherence_score}")

    # The coherence computation can yield NaN; such a score cannot be ranked,
    # so say so explicitly instead of letting the comparison silently fail.
    if pd.isna(coherence_score):
        print(f"  -> ignoring {k} topics: coherence score is NaN")
        continue

    # Update the best model if this coherence score is higher
    if coherence_score > best_coherence_score:
        best_coherence_score = coherence_score
        best_k = k
        best_lda_model = lda_model

# Fail here, with a clear message, rather than later on a None model.
if best_lda_model is None:
    raise RuntimeError(
        "No LDA model produced a valid coherence score; cannot select a best model."
    )

# Print the model with the highest coherence score
print(f"\nBest model with {best_k} topics has the highest coherence score: {best_coherence_score}")

# Map each joke to its dominant topic and the top word/phrase from that topic.
#
# The bag-of-words vector is rebuilt per joke from `filtered_documents`
# rather than read from `corpus`: `corpus` had its empty entries removed, so
# its positions no longer line up with `documents`.
if best_lda_model is None:
    raise RuntimeError("No LDA model was selected; cannot map jokes to topics.")
if len(filtered_documents) != len(documents):
    raise ValueError(
        "documents and filtered_documents differ in length; cannot pair jokes with their tokens."
    )

joke_topic_mapping = []
top_word_by_topic = {}  # topic id -> highest-weighted word, looked up once per topic
for joke, noun_text in zip(documents, filtered_documents):
    bow = dictionary.doc2bow(tokenize_and_filter(noun_text))
    if not bow:
        # Nothing of this joke is in the vocabulary, so there is nothing to infer from.
        continue

    # minimum_probability=0.0 keeps every topic in the result, so max() below
    # never receives an empty list.
    topic_distribution = best_lda_model.get_document_topics(bow, minimum_probability=0.0)
    # Find the dominant topic with the highest probability
    dominant_topic = max(topic_distribution, key=lambda x: x[1])[0]

    # Get the top word/phrase for the dominant topic
    if dominant_topic not in top_word_by_topic:
        top_word_by_topic[dominant_topic] = best_lda_model.show_topic(dominant_topic, topn=1)[0][0]

    joke_topic_mapping.append({
        "Joke": joke,
        "Dominant Topic": dominant_topic,
        "Top Word/Phrase": top_word_by_topic[dominant_topic]
    })

# Tabulate the joke/topic records and save them as CSV without the index column.
joke_topic_df = pd.DataFrame(data=joke_topic_mapping)
joke_topic_df.to_csv(path_or_buf='jokes_to_topics_and_words.csv', index=False)

print("\nJokes mapped to topics and top words/phrases written to 'jokes_to_topics_and_words.csv'.")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robinwoodfamily/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/robinwoodfamily/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robinwoodfamily/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['answers, they’ll', 'drug use.', 'roast beef.', 'standup comedian.', 'use liquor', 'death metal', 'emergency brake.', 'him.” “And', 'on, man.', 'gonna redo', 'right. You’re', 'think that’s', 'don’t know.', 'didn’t get', '“I don’t', 'can’t get']
1 topics: Coherence score = 0.29581227450939973
2 topics: Coherence score = 0.30874377104061446
3 topics: Coherence score = 0.4041151018496619
4 topics: Coherence score = 0.415306893211147
5 topics: Coherence score = 0.5312187843974392
6 topics: Coherence score = 0.5501523252688106
7 topics: Coherence score = 0.4503830131641474
8 topics: Coherence score = nan
LdaModel<num_terms=545, num_topics=8, decay=0.5, chunksize=4000>


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))
