# get_LDA_df.ipynb

This notebook:
* Trains the LDA model on the podcast transcripts. 
    * Tokenizes the text with nltk's `word_tokenize` and removes nltk's English stopwords. 
    * Uses sklearn's `CountVectorizer` and `LatentDirichletAllocation`.
* Writes out the top 50 words for each topic to `./LDA_topics.csv`. 
* Adds the LDA document-topic distributions to the df as feature columns, and writes this df out to `./csv/df-LDA_topics.csv`. 

In [1]:
import os
import json
import sys

import pandas as pd
import numpy as np

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import utils_general

# how many of the highest-weighted words are reported for each topic
n_top_words = 50

# render every column when a DataFrame is shown in this notebook
pd.set_option('display.max_columns', None)

# fix NumPy's global seed so repeated runs draw the same random numbers
np.random.seed(0)

# load the podcast DataFrame
df = pd.read_csv("./csv/df.csv")

# rows where nothing was transcribed are read as NaN; use an empty string instead
df["transcript"] = df["transcript"].fillna("")

# the corpus: one transcript string per row, in row order
data_samples = df["transcript"].tolist()
            
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic to stdout.

    Each topic begins on a new line with `` <topic number> : `` (topics are
    numbered from 1), followed by its words in descending weight order, each
    word followed by ``", "``. No newline is printed after the last topic.

    Args:
        model: Object exposing ``components_``, iterated as one weight array
            per topic.
        feature_names: Sequence mapping a vocabulary index to its word.
        n_top_words: Number of words to print per topic.
    """
    for position, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        listing = "".join(f"{feature_names[i]}, " for i in ranked)
        print(f"\n {position + 1} : {listing}", end="")
            
def save_top_words(model, feature_names, n_top_words, path="./LDA_topics.csv"):
    """Write the highest-weighted words of every topic to a text file.

    One line is written per topic, in the form
    ``<topic number>,<word 1>,<word 2>,...,`` -- topics are numbered from 1
    and every field, including the last one, is followed by a comma. Fields
    are written verbatim (no quoting or escaping).

    Args:
        model: Object exposing ``components_``, iterated as one weight array
            per topic.
        feature_names: Sequence mapping a vocabulary index to its word.
        n_top_words: Number of words to keep per topic, highest weight first.
        path: Destination file, overwritten if it already exists. Defaults to
            ``"./LDA_topics.csv"``.
    """
    # Encoding is pinned to UTF-8 so that non-ASCII vocabulary is written the
    # same way everywhere instead of depending on the locale's default codec.
    with open(path, "w", encoding="utf-8") as f:
        for topic_idx, topic in enumerate(model.components_):
            # argsort is ascending; the reversed tail slice yields the
            # indices of the n_top_words largest weights, largest first
            top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
            fields = [str(topic_idx + 1)]
            fields.extend(feature_names[i] for i in top_words_idx)
            # trailing comma kept so the file layout is unchanged
            f.write(",".join(fields) + ",\n")
            
# nltk resources: the stopword list plus the Punkt models that word_tokenize
# loads internally -- without them a fresh environment raises LookupError.
# Both "punkt" and "punkt_tab" are requested because which one is looked up
# depends on the installed nltk release (NOTE(review): confirm against the
# pinned nltk version and drop the one that is not needed).
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))

# tokenize and filter stopwords
tokenized_documents = [word_tokenize(doc.lower()) for doc in data_samples]
# keep purely alphanumeric tokens (drops punctuation tokens) that are not stopwords
filtered_documents = [
    [word for word in doc if word.isalnum() and word not in stop_words]
    for doc in tokenized_documents
]

# CountVectorizer takes strings, so re-join each filtered token list
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform([" ".join(doc) for doc in filtered_documents])

print("Fitting LDA model...")
n_topics = 100
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method="online",
    random_state=0, 
    max_iter=5,
    evaluate_every=1,
    verbose=1
    # we use the default value for perplexity tolerance
)
lda.fit(X)

# report the top words per topic on screen and in ./LDA_topics.csv
count_feature_names = count_vectorizer.get_feature_names_out()
print_top_words(lda, count_feature_names, n_top_words)
save_top_words(lda, count_feature_names, n_top_words)

# Transform the fitted LDA model to get document-topic distribution
document_topic_probs = lda.transform(X)
print("Doc #0 probabilities =", document_topic_probs[0])

# Create a new DataFrame with the original data and add the topic probabilities.
# The topic frame reuses df's index: concat(axis=1) aligns on index labels, so
# this keeps row i of the probabilities attached to row i of df even if df
# does not carry a default 0..n-1 index.
topic_columns = [f'Topic_{i+1}_Probability' for i in range(n_topics)]
topic_df = pd.DataFrame(document_topic_probs, columns=topic_columns, index=df.index)
df = pd.concat([df, topic_df], axis=1)

# drop columns containing 'Unnamed' in their name because these are old indexes
columns_to_drop = [col for col in df.columns if "Unnamed" in col]
df = df.drop(columns=columns_to_drop)

# Display the new DataFrame
display(df)

csv_path = "./csv/df-LDA_topics.csv"
utils_general.delete_file_if_already_exists(csv_path)
df.to_csv(csv_path, header=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/grads/m/mariateleki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fitting LDA model...
iteration: 1 of max_iter: 5, perplexity: 9715.0403
iteration: 2 of max_iter: 5, perplexity: 9687.5349
iteration: 3 of max_iter: 5, perplexity: 9673.9227
iteration: 4 of max_iter: 5, perplexity: 9667.2046
iteration: 5 of max_iter: 5, perplexity: 9662.9406

 1 : cartilage, hydrogen, o2, tentacles, philo, calcification, deployments, lipids, leona, violins, avascular, celiac, unifying, potts, agreeance, pythagoras, copd, brca, pancreatic, dissolution, whipple, calcified, seasonality, v3, emits, hardening, cerberus, reactors, broskis, cervantes, pth, thruster, scalding, perot, colic, underlies, radcast, fistula, popularize, malignancies, asin, metabolically, rpsb, frcr, malformations, tca, cholo, cardiopulmonary, musing, landmine, 
 2 : blah, language, english, say, word, words, use, learning, learn, speak, spanish, means, something, okay, example, different, languages, would, speaking, used, understand, native, let, lesson, another, read, one, right, reading, brazil, t

Unnamed: 0,show_uri,show_name,show_description,publisher,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,language,transcript,transcript_length,Topic_1_Probability,Topic_2_Probability,Topic_3_Probability,Topic_4_Probability,Topic_5_Probability,Topic_6_Probability,Topic_7_Probability,Topic_8_Probability,Topic_9_Probability,Topic_10_Probability,Topic_11_Probability,Topic_12_Probability,Topic_13_Probability,Topic_14_Probability,Topic_15_Probability,Topic_16_Probability,Topic_17_Probability,Topic_18_Probability,Topic_19_Probability,Topic_20_Probability,Topic_21_Probability,Topic_22_Probability,Topic_23_Probability,Topic_24_Probability,Topic_25_Probability,Topic_26_Probability,Topic_27_Probability,Topic_28_Probability,Topic_29_Probability,Topic_30_Probability,Topic_31_Probability,Topic_32_Probability,Topic_33_Probability,Topic_34_Probability,Topic_35_Probability,Topic_36_Probability,Topic_37_Probability,Topic_38_Probability,Topic_39_Probability,Topic_40_Probability,Topic_41_Probability,Topic_42_Probability,Topic_43_Probability,Topic_44_Probability,Topic_45_Probability,Topic_46_Probability,Topic_47_Probability,Topic_48_Probability,Topic_49_Probability,Topic_50_Probability,Topic_51_Probability,Topic_52_Probability,Topic_53_Probability,Topic_54_Probability,Topic_55_Probability,Topic_56_Probability,Topic_57_Probability,Topic_58_Probability,Topic_59_Probability,Topic_60_Probability,Topic_61_Probability,Topic_62_Probability,Topic_63_Probability,Topic_64_Probability,Topic_65_Probability,Topic_66_Probability,Topic_67_Probability,Topic_68_Probability,Topic_69_Probability,Topic_70_Probability,Topic_71_Probability,Topic_72_Probability,Topic_73_Probability,Topic_74_Probability,Topic_75_Probability,Topic_76_Probability,Topic_77_Probability,Topic_78_Probability,Topic_79_Probability,Topic_80_Probability,Topic_81_Probability,Topic_82_Probability,Topic_83_Probability,Topic_84_Probability,Topic_85_Probability,Topic_86_Probability,
Topic_87_Probability,Topic_88_Probability,Topic_89_Probability,Topic_90_Probability,Topic_91_Probability,Topic_92_Probability,Topic_93_Probability,Topic_94_Probability,Topic_95_Probability,Topic_96_Probability,Topic_97_Probability,Topic_98_Probability,Topic_99_Probability,Topic_100_Probability
0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj,en,"Hello, hello, hello everyone. This is Katie an...",1716,0.000012,0.000012,0.002161,0.002365,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.034438,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.001214,0.000012,0.000012,0.003613,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.001192,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.030293,0.000012,0.000012,0.000012,0.000012,0.008942,0.263176,0.000012,0.360638,0.000012,0.000012,0.004425,0.000012,0.000012,0.007946,0.000012,0.000012,0.225073,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.001251,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.052268
1,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa,en,Welcome to Inside the 18. Today's episode is a...,2017,0.000011,0.000011,0.005914,0.000011,0.000011,0.000011,0.000011,0.000011,0.017988,0.000011,0.000011,0.000011,0.125472,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.001155,0.003559,0.000011,0.000011,0.015363,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.001143,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.103951,0.000011,0.000011,0.000011,0.000011,0.046425,0.000011,0.000011,0.000011,0.005094,0.000011,0.060959,0.000011,0.222170,0.007833,0.001051,0.000011,0.000011,0.309811,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.005237,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.005860,0.000011,0.000011,0.000011,0.000011,0.060087
2,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.189200,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm,en,"Hey Cheese fans! Before we get started, I want...",1518,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.017489,0.000015,0.000015,0.000015,0.000015,0.016417,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.375438,0.000015,0.000015,0.000015,0.015987,0.176725,0.000015,0.000015,0.000015,0.010951,0.000015,0.214406,0.000015,0.028262,0.000015,0.007246,0.000015,0.000015,0.039921,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.004716,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.013284,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.000015,0.077872
3,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.782050,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG,en,"Sorry to interrupt the show, but I do have to ...",1707,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.010266,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.024359,0.000013,0.000013,0.000013,0.007594,0.000013,0.000013,0.000013,0.000013,0.000013,0.127155,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.018362,0.000013,0.000013,0.000013,0.010327,0.017344,0.004134,0.000013,0.000013,0.264712,0.017538,0.000013,0.000013,0.002205,0.000013,0.000013,0.000013,0.000013,0.105184,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.113735,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.020751,0.000013,0.000013,0.000013,0.000013,0.255202
4,spotify:show:5ljREb8VLogQLT7AKGwav1,UPSC Podcasts,Podcasts useful for UPSC aspirants! Mainly dis...,UPSC Podcast,https://anchor.fm/s/8afceec/podcast/rss,spotify:episode:0025w0gdgkl11Nzkmg1wnm,Tourism in India : Opportunities and Challenges,.,13.788000,show_5ljREb8VLogQLT7AKGwav1,0025w0gdgkl11Nzkmg1wnm,en,This is All India Radio. In the program Spotli...,1755,0.000012,0.000012,0.001215,0.015956,0.001344,0.001357,0.006328,0.002416,0.060408,0.008331,0.020441,0.076749,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.006456,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.014816,0.035791,0.000012,0.000012,0.004240,0.000012,0.005102,0.105217,0.000012,0.000012,0.524370,0.000012,0.000012,0.000012,0.081295,0.000012,0.000012,0.000012,0.006944,0.000012,0.000012,0.000012,0.000012,0.000012,0.001214,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.001214,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.017861,0.000012,0.000012,0.000012,0.000012,0.000012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82596,spotify:show:416U8ZhubKrFHq8ynOaxfH,The Top 10,"Each week, John Rocha and Matt Knost breakdown...",The Top 10,http://thetop10.podomatic.com/rss2.xml,spotify:episode:7zzQnjBXqDApvnm1hLPzVY,The Top 10 - Re-List - Steve Martin Moves,Thanks to our patreon members for their suppor...,51.025850,show_416U8ZhubKrFHq8ynOaxfH,7zzQnjBXqDApvnm1hLPzVY,en,"Hey guys, this is John Rocha again. And Matt N...",1881,0.000012,0.003839,0.001750,0.000012,0.000012,0.000012,0.001368,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.002387,0.001209,0.005192,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.007214,0.000012,0.000012,0.000012,0.059179,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.006120,0.000012,0.000012,0.000012,0.001894,0.000012,0.019589,0.186117,0.000012,0.000012,0.000012,0.015903,0.009304,0.187023,0.000012,0.012680,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.006590,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.165647,0.000012,0.000012,0.000012,0.000012,0.000012,0.000012,0.002645,0.000012,0.000012,0.000012,0.000012,0.303402
82597,spotify:show:5rgmBAzsJ5znpV2b4WNDsb,Let's Grab Coffee Podcast,"After connecting with someone, what's the next...",George Khalife,https://anchor.fm/s/9043d60/podcast/rss,spotify:episode:7zzRRsjuymax0YSczpi0SU,Let's Grab Coffee E45 with Ross Paquette | Gro...,Ross founded Maropost in 2011 as a customer-ce...,33.364750,show_5rgmBAzsJ5znpV2b4WNDsb,7zzRRsjuymax0YSczpi0SU,en,What's going on everyone? This is George Khali...,2005,0.000011,0.000011,0.000011,0.000011,0.001313,0.000011,0.001640,0.000011,0.000011,0.000011,0.024764,0.207258,0.000011,0.000011,0.000011,0.000011,0.000011,0.001123,0.000011,0.000011,0.000011,0.001158,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.001124,0.000011,0.000011,0.000011,0.139293,0.000011,0.000011,0.000011,0.000011,0.000011,0.070767,0.000011,0.064722,0.000011,0.000011,0.000011,0.000011,0.482370,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.003492,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011,0.000011
82598,spotify:show:56CjYLQWyMx1MkOEQmlubi,Coach Corey Wayne,Life & Peak Performance Coach. I Teach Self-Re...,Coach Corey Wayne,https://anchor.fm/s/4dd625c/podcast/rss,spotify:episode:7zzZJGsL8fwDOrduUkX91D,Maybe She Is Just Testing Me?,How to know if your woman is maybe just testin...,11.799950,show_56CjYLQWyMx1MkOEQmlubi,7zzZJGsL8fwDOrduUkX91D,en,"Hi, I'm Coach Cory Wayne and this is my video ...",1850,0.000013,0.000013,0.022701,0.000013,0.000013,0.000013,0.004343,0.000013,0.040474,0.015293,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.001298,0.000013,0.000013,0.000013,0.000013,0.000013,0.001371,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.074772,0.000013,0.612834,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.040161,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.136312,0.000013,0.001298,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.048011
82599,spotify:show:7uddSH8MhaK3Q6YFlllbVZ,The Cricket Podcast,The best & funniest independent cricket podcas...,The Cricket Podcast,https://anchor.fm/s/9d3dcf0/podcast/rss,spotify:episode:7zzoT4r0Rhffyegk2HJ9N8,Ep 16: England In Danger,"In Episode 16, the boys evaluate England's per...",69.215350,show_7uddSH8MhaK3Q6YFlllbVZ,7zzoT4r0Rhffyegk2HJ9N8,en,I think it should never be permitted to happen...,1699,0.000013,0.000013,0.003494,0.000013,0.000013,0.000013,0.000013,0.000013,0.022790,0.002655,0.003881,0.000013,0.000013,0.000013,0.001277,0.000013,0.000013,0.000013,0.001326,0.000013,0.003087,0.001366,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.026579,0.011159,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.011391,0.000013,0.000013,0.000013,0.000013,0.211715,0.000013,0.000013,0.000013,0.008884,0.007995,0.000013,0.001548,0.000013,0.000013,0.000013,0.309667,0.000013,0.000013,0.087228,0.001391,0.000013,0.001613,0.057847,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.000013,0.020150,0.000013,0.001277,0.000013,0.000013,0.000013,0.000013,0.006334,0.080595,0.000013,0.000013,0.000013,0.000013,0.001316,0.000013,0.000013,0.001277,0.000013,0.000013,0.000013,0.111237
