# Latent Dirichlet Allocation (LDA)

In [10]:
from pathlib import Path

import pandas as pd
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.models import TfidfModel

In [11]:
# Read the tokenized, preprocessed tweets back from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df = pd.read_pickle('tokenized_tweets.pkl')

# Plain-text preview of the first rows
print(df.head(n=5))

                                                text
0                           [american, harem, metoo]
1    [guy, resigned, yet, liberal, hypocrisy, metoo]
3  [woman, talking, crap, entire, time, finally, ...
4  [please, speak, sexual, assault, interview, me...
5  [cant, keep, turning, blind_eye, pretend, isnt...


In [12]:
# gensim's LDA needs two inputs: a dictionary of all words and a numeric corpus.

# Dictionary: maps integer IDs to words
dictionary = corpora.Dictionary(df['text'])

# Bag-of-words corpus: every document converted to its numerical representation
bow_corpus = list(map(dictionary.doc2bow, df['text']))

# TF-IDF serves as an alternative weighting, to see whether it produces different topics
tfidf_model = TfidfModel(bow_corpus)  # initialise the model from the bag-of-words corpus
tfidf_corpus = tfidf_model[bow_corpus]  # Term Frequency - Inverse Document Frequency version of the corpus


### Training the model

In [20]:
# LDA needs the number of topics (k) and the number of training passes fixed up front
num_topics, passes = 4, 5

# Number of documents processed at a time -> be careful with memory!
chunksize = 100

# CPU cores used for parallel computing
workers = 4

#### Using TF-IDF

In [21]:
# Train an LDA model on the TF-IDF corpus.
# LDA training is stochastic; passing a fixed random_state seeds the model so a
# re-run of the notebook does not start from a different random state each time.
# NOTE(review): with several workers the topics may still vary slightly between runs -- confirm.
lda_model_tfidf = LdaMulticore(
    tfidf_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=passes,
    chunksize=chunksize,
    workers=workers,
    random_state=42,
)

# Print the topics suggested by the LDA model (-1 requests all topics)
for index, topic in lda_model_tfidf.print_topics(-1):
    print(f"Topic {index}: {topic}")

Topic 0: 0.015*"hertoo" + 0.015*"corey_lewandowski" + 0.014*"tapped" + 0.014*"believe" + 0.014*"proof_sht" + 0.014*"joy_villa" + 0.012*"claim" + 0.010*"shocking" + 0.010*"like" + 0.009*"leading_israeli"
Topic 1: 0.029*"choose_onemetoo" + 0.012*"damon" + 0.012*"matt" + 0.010*"men" + 0.010*"far" + 0.008*"need" + 0.008*"try" + 0.008*"movement" + 0.008*"microsoft_learned" + 0.008*"bullshit"
Topic 2: 0.011*"story" + 0.010*"want" + 0.009*"new" + 0.009*"hay" + 0.009*"might" + 0.008*"thought" + 0.008*"know" + 0.008*"precious_little" + 0.008*"spent_entire" + 0.008*"lifetime_hara"
Topic 3: 0.014*"voice" + 0.014*"year" + 0.013*"emergence" + 0.013*"hire_hugh" + 0.013*"hewitt_fire" + 0.013*"msnbc_soooo" + 0.013*"joan_wal" + 0.012*"movement" + 0.012*"woman" + 0.012*"powerful"


In [23]:
# Save the TF-IDF topics into a text file.

# Build the path with pathlib so the separator is correct on every OS
# (a hard-coded '\\' is only a directory separator on Windows).
output_dir = Path('model_outputs')
output_dir.mkdir(parents=True, exist_ok=True)  # create the folder if it does not exist yet
output_file = output_dir / f'LDA_TF-IDF_topics_{num_topics}_{passes}.txt'

# Explicit UTF-8 so writing does not depend on the platform's default encoding
with open(output_file, 'w', encoding='utf-8') as f:
    for index, topic in lda_model_tfidf.print_topics(-1):
        f.write(f"Topic {index}: {topic}\n")  # one topic per line

print("Topics have been saved!")

Topics have been saved!


#### Using Bag of Words (BOW)

In [24]:
# Train an LDA model on the bag-of-words (BOW) corpus.
# LDA training is stochastic; passing a fixed random_state seeds the model so a
# re-run of the notebook does not start from a different random state each time.
# NOTE(review): with several workers the topics may still vary slightly between runs -- confirm.
lda_model_bow = LdaMulticore(
    bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=passes,
    chunksize=chunksize,
    workers=workers,
    random_state=42,
)

# Print the topics suggested by the LDA model (-1 requests all topics)
for index, topic in lda_model_bow.print_topics(-1):
    print(f"Topic {index}: {topic}")

Topic 0: 0.125*"metoo" + 0.059*"year" + 0.046*"story" + 0.039*"woman" + 0.039*"time" + 0.017*"new" + 0.016*"thank" + 0.015*"movement" + 0.014*"silence" + 0.014*"latest"
Topic 1: 0.136*"metoo" + 0.026*"moment" + 0.022*"movement" + 0.019*"womens" + 0.018*"believe" + 0.016*"voice" + 0.015*"powerful" + 0.012*"call" + 0.012*"trump" + 0.012*"dont"
Topic 2: 0.130*"metoo" + 0.032*"movement" + 0.018*"woman" + 0.016*"like" + 0.015*"men" + 0.014*"amp" + 0.012*"girl" + 0.010*"one" + 0.008*"need" + 0.008*"say"
Topic 3: 0.125*"metoo" + 0.037*"sexual" + 0.019*"harassment" + 0.017*"want" + 0.015*"thought" + 0.014*"claim" + 0.014*"know" + 0.014*"trump" + 0.013*"might" + 0.013*"joy_villa"


In [25]:
# Save the BOW topics into a text file.

# Build the path with pathlib so the separator is correct on every OS
# (a hard-coded '\\' is only a directory separator on Windows).
output_dir = Path('model_outputs')
output_dir.mkdir(parents=True, exist_ok=True)  # create the folder if it does not exist yet
output_file = output_dir / f'LDA_BOW_topics_{num_topics}_{passes}.txt'

# Explicit UTF-8 so writing does not depend on the platform's default encoding
with open(output_file, 'w', encoding='utf-8') as f:
    for index, topic in lda_model_bow.print_topics(-1):
        f.write(f"Topic {index}: {topic}\n")  # one topic per line

print("Topics have been saved!")

Topics have been saved!
