# Topic Modeling with Gensim

In [2]:
# code adapted from https://nlpforhackers.io/topic-modeling/
# and applied to new corpus:
# https://github.com/andrewts129/transcript-scraping/
# tree/master/Donald%20Trump%20Speeches

# import packages for assignment
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

In [3]:
# pattern for tokens worth keeping: the token must begin with at least three
# characters that are ASCII letters or hyphens (written as a raw string so
# that "\-" reaches the regex engine instead of being an invalid str escape)
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


# function to clean text by tokenizing & filtering stop words
def clean_text(text):
    """Lowercase and tokenize ``text``, dropping stop words and non-word tokens.

    Args:
        text: raw document string.

    Returns:
        List of lowercase tokens, in their original order, that are not in
        the NLTK English stop-word list and whose first three characters are
        letters or hyphens.
    """
    # a set gives constant-time membership tests; the stop-word list would
    # otherwise be scanned linearly once per token
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    # match() anchors only at the start of the token, so a token qualifies
    # as soon as its leading three characters fit the pattern
    return [t for t in tokens
            if t not in stop_words and _TOKEN_PATTERN.match(t)]

# function that cleans the corpus, fits an LSI and an LDA
# model on it, and prints the topics found by each model
def compare_topics(data, n_topics, words):
    """Fit LSI and LDA topic models on ``data`` and print their topics.

    Args:
        data: iterable of raw document strings.
        n_topics: number of topics requested from each model; also the
            number of topics printed per model.
        words: passed to ``print_topic`` as the number of terms to show
            for each topic.

    Returns:
        None. The topics are written to stdout, LSI first and then LDA.
    """
    # tokenize and filter every document in the corpus
    docs_as_tokens = [clean_text(doc) for doc in data]

    # word <-> integer id mapping built from the cleaned corpus
    vocab = corpora.Dictionary(docs_as_tokens)

    # bag-of-words representation of each document
    bow_corpus = [vocab.doc2bow(tokens) for tokens in docs_as_tokens]

    # both models are configured identically; LSI is fitted before LDA
    shared_args = dict(corpus=bow_corpus, num_topics=n_topics, id2word=vocab)
    lsi_model = models.LsiModel(**shared_args)
    lda_model = models.LdaModel(**shared_args)

    # print a heading per model followed by one line per topic
    for heading, model in (("\n\nLSI Model:", lsi_model),
                           ("\n\nLDA Model:", lda_model)):
        print(heading)
        for topic_id in range(n_topics):
            # topics are numbered from 1 in the output
            print("Topic #%s:" % (topic_id + 1),
                  model.print_topic(topic_id, words))

In [4]:
# load the speech corpus; a blank line separates one document from the next
with open('AllSpeechesTrump.txt', encoding='utf-8') as inputfile:
    raw_text = inputfile.read()
data = raw_text.split('\n\n')

# report the document count and the character length of the first 10 documents
print(len(data), [len(d) for d in data[:10]])

# fit both models and print 10 topics with 5 words each
compare_topics(data, 10, 5)

450 [48255, 4381, 4434, 4676, 5051, 4367, 4352, 4593, 3101, 1910]


LSI Model:
Topic #1: 0.442*"going" + 0.356*"know" + 0.293*"people" + 0.214*"said" + 0.184*"want"
Topic #2: 0.511*"going" + -0.374*"know" + -0.312*"said" + -0.166*"like" + 0.162*"back"
Topic #3: 0.452*"going" + -0.406*"people" + -0.246*"country" + 0.200*"know" + -0.183*"jobs"
Topic #4: -0.366*"going" + 0.263*"think" + 0.199*"want" + -0.194*"great" + -0.190*"country"
Topic #5: -0.366*"trump" + -0.336*"donald" + -0.308*"question" + -0.257*"inaudible" + -0.171*"think"
Topic #6: 0.264*"said" + 0.255*"wall" + 0.238*"want" + -0.165*"know" + -0.164*"money"
Topic #7: 0.325*"hillary" + -0.285*"people" + 0.265*"clinton" + -0.262*"going" + 0.168*"jobs"
Topic #8: 0.216*"get" + 0.213*"need" + 0.204*"israel" + -0.193*"people" + 0.190*"deal"
Topic #9: 0.349*"people" + 0.307*"thank" + -0.287*"jobs" + -0.230*"china" + 0.228*"want"
Topic #10: -0.399*"want" + 0.382*"people" + -0.259*"great" + -0.243*"thank" + 0.178*"clinton"


LDA Model:


The two models appear to behave somewhat similarly on this corpus. Since the corpus is a collection of Trump's campaign speeches, the topics should reflect what he promises to do as president. So it's not surprising that words like "going", "people", "great", "country", "want", "know", "hillary", and "clinton" appear in both models.

From the LSI model, we also get other topic words such as:
    wall, money, jobs, china, israel, deal, donald, and trump.

The LDA model doesn't really surface any other topics, so we get a better variety of topics with the LSI model.