# Import Dependencies

In [1]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image

from wordcloud import WordCloud
from textwrap import wrap

import pyLDAvis
import pyLDAvis.sklearn

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [2]:
# Web scrape most recent speeches using our imported 'get_transcript'
goodyear = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28')
bullhead = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28')
omaha = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27')
wsalem = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27')
lansing = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27')
martinsburg = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26')
lititz = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26')
allentown = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26')

In [3]:
# Create corpus
corpus = [goodyear, bullhead, omaha, wsalem, lansing, martinsburg, lititz, allentown]

In [4]:
# Clean corpus using our imported 'clean' function
clean_corpus = clean(corpus)

In [6]:
# Preprocess the speeches using our imported 'preprocess' function
processed_corpus = preprocess(clean_corpus)

# Doc-Term Matrix

In [7]:
# Countvectorizer
cv = CountVectorizer(stop_words='english')

X_cv = cv.fit_transform(processed_corpus)

df_cv = pd.DataFrame(X_cv.toarray(), columns=cv.get_feature_names())

In [8]:
# TF_IDF
tfidf_vectorizer = TfidfVectorizer(**cv.get_params())

X_tfidf = tfidf_vectorizer.fit_transform(processed_corpus)

df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names())



# LSA

In [9]:
# Latent Semantic Analysis (LSA) is just another name for Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
# Let's compare explained variance using each vectorizer
lsa_cv = TruncatedSVD(5)
lsa_tfidf = TruncatedSVD(5)

# Doc_topic matrices will be used later
doc_topic_cv = lsa_cv.fit_transform(X_cv)
doc_topic_tfidf = lsa_tfidf.fit_transform(X_tfidf)

total_expl_var_cv = round(sum(lsa_cv.explained_variance_ratio_), 4) * 100
total_expl_var_tfidf = round(sum(lsa_tfidf.explained_variance_ratio_), 4) * 100

print(f"Explained variance using CountVectorizer: {total_expl_var_cv}%")
print(f"Explained variance using TFIDF: {total_expl_var_tfidf}%")

Explained variance using CountVectorizer: 7.37%
Explained variance using TFIDF: 7.31%


In [22]:
# CountVectorizer has a slightly higher explained variance, so we will use this model from here on out

In [10]:
# Topic matrix
topics = pd.DataFrame(lsa_cv.components_.round(3),
             index = ["c1","c2","c3","c4","c5"],
             columns = cv.get_feature_names())
topics

Unnamed: 0,able,abolish,abortion,abraham,absentee,absolute,absolutely,abuse,accept,acceptance,...,york,young,youtube,yu,yuma,zero,zippo,zone,zoning,zucker
c1,0.005,0.005,0.001,0.004,0.0,0.0,0.01,0.002,0.0,0.0,...,0.008,0.012,0.001,0.0,0.001,0.001,0.0,0.002,0.0,-0.0
c2,0.002,0.006,0.001,-0.006,0.0,0.0,0.0,0.0,0.0,-0.0,...,-0.001,0.01,-0.0,0.0,0.0,-0.0,0.0,-0.001,0.0,0.0
c3,0.004,0.006,0.001,0.001,0.0,-0.0,0.002,-0.0,0.0,0.0,...,-0.001,0.0,0.0,-0.0,-0.001,0.0,0.0,0.0,-0.0,-0.0
c4,0.001,0.0,0.0,-0.002,0.0,0.001,0.014,0.001,0.0,0.0,...,-0.003,-0.003,-0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.0
c5,0.001,0.0,0.0,-0.005,0.0,-0.002,-0.006,0.0,-0.0,0.0,...,0.001,-0.002,0.0,0.0,-0.003,0.0,0.0,0.0,0.0,-0.0


In [11]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [12]:
display_topics(lsa_cv, cv.get_feature_names(), 10)


Topic  0
great, want, know, people, year, like, right, country, win, biden

Topic  1
want, know, right, thing, talk, border, joe, life, lock, hell

Topic  2
want, great, win, people, china, country, year, american, world, border

Topic  3
right, year, win, like, china, biden, joe, president, ago, vote

Topic  4
win, year, biden, know, china, vote, like, think, joe, ago


In [13]:
# Vt matrix
Vt = pd.DataFrame(doc_topic_cv.round(5),
             index = processed_corpus,
             columns = ["c1","c2","c3","c4","c5"])
Vt.head(10)

Unnamed: 0,c1,c2,c3,c4,c5
"deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus,china,plague,quickly,end,pandemic",0.42474,-0.15341,0.12672,0.21529,0.23074
"normal,life,want,want,normal,life",0.87541,1.33947,1.19383,-0.31805,-0.29664
"like,seven,month,ago",0.28586,-0.02939,-0.15301,0.33185,0.13105
"fully,resume,year,great,economic,power",0.68624,-0.83955,0.17621,-0.08504,-0.09506
strong,0.01877,-0.01072,0.00074,-0.00525,-0.00202
happen,0.05103,0.01515,-0.03285,0.01576,0.00545
"year,good,economic,year,country,history",0.83381,-0.59056,0.11311,0.7346,0.28099
"year,interrupt",0.24854,-0.19716,0.03373,0.33202,0.16794
interrupt,0.00034,-0.00038,6e-05,0.00069,0.00036
"america,great",0.48632,-0.69893,0.15625,-0.41083,-0.23615


In [14]:
cosine_similarity(Vt.head(5))

array([[ 1.        ,  0.15643375,  0.76256168,  0.64355656,  0.6422796 ],
       [ 0.15643375,  1.        , -0.1177528 , -0.11426277,  0.11442145],
       [ 0.76256168, -0.1177528 ,  1.        ,  0.28680709,  0.3298587 ],
       [ 0.64355656, -0.11426277,  0.28680709,  1.        ,  0.91642632],
       [ 0.6422796 ,  0.11442145,  0.3298587 ,  0.91642632,  1.        ]])

In [15]:
cos_dtm = cosine_similarity(df_cv).round(2)
cos_dtm

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

# Non-negative Matrix Factorization

In [16]:
processed_corpus
ex_label = [e[:70]+"..." for e in processed_corpus]

In [17]:
df_nmf = pd.DataFrame(X_cv .toarray(), index=ex_label, columns=cv.get_feature_names())

In [19]:
df_nmf.head(10)

Unnamed: 0,able,abolish,abortion,abraham,absentee,absolute,absolutely,abuse,accept,acceptance,...,york,young,youtube,yu,yuma,zero,zippo,zone,zoning,zucker
"deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"normal,life,want,want,normal,life...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"like,seven,month,ago...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"fully,resume,year,great,economic,power...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
strong...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
happen...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"year,good,economic,year,country,history...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"year,interrupt...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
interrupt...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"america,great...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
nmf_model = NMF(5)
doc_topic_nmf = nmf_model.fit_transform(X_cv)

The W matrix shows us the 2 resulting topics, and the terms that are associated with each topic. In this case:
- Component 1 (topic 1) seems to be about _____
- Component 2 (topic 2) seems to be about _____

In [23]:
topic_word_nmf = pd.DataFrame(nmf_model.components_.round(3),
             index = ["c1","c2","c3","c4","c5"],
             columns = cv.get_feature_names())
topic_word_nmf

Unnamed: 0,able,abolish,abortion,abraham,absentee,absolute,absolutely,abuse,accept,acceptance,...,york,young,youtube,yu,yuma,zero,zippo,zone,zoning,zucker
c1,0.009,0.0,0.0,0.062,0.0,0.0,0.0,0.0,0.0,0.001,...,0.038,0.009,0.004,0.0,0.0,0.0,0.0,0.012,0.0,0.0
c2,0.023,0.04,0.008,0.0,0.0,0.0,0.017,0.002,0.0,0.0,...,0.008,0.053,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,0.0,0.0,...,0.019,0.033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c4,0.017,0.012,0.003,0.0,0.002,0.0,0.057,0.01,0.001,0.001,...,0.014,0.008,0.0,0.002,0.0,0.005,0.0,0.007,0.0,0.0
c5,0.0,0.0,0.0,0.01,0.0,0.01,0.057,0.001,0.0,0.0,...,0.0,0.016,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


In [24]:
display_topics(nmf_model, cv.get_feature_names(), 10)


Topic  0
great, people, job, country, america, history, world, american, stand, nation

Topic  1
want, people, thing, border, talk, life, normal, come, million, lock

Topic  2
know, people, happen, lot, time, thing, think, big, hell, good

Topic  3
year, win, like, biden, china, vote, joe, country, think, president

Topic  4
right, liberty, big, defend, thing, speech, free, good, religious, bear


The H matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. In this case:
- The first document seems to be about _____
- The last document seems to be about _____
- Everything in between is a combination of _____

In [26]:
H = pd.DataFrame(doc_topic_nmf.round(4),
             index = ex_label,
             columns = ["c1","c2","c3","c4","c5"])
H.head(5)

Unnamed: 0,c1,c2,c3,c4,c5
"deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus...",0.0242,0.0021,0.0,0.0992,0.0
"normal,life,want,want,normal,life...",0.0,0.5018,0.0,0.0,0.0055
"like,seven,month,ago...",0.0,0.0,0.0039,0.1036,0.0006
"fully,resume,year,great,economic,power...",0.1349,0.0,0.0,0.1037,0.0
strong...,0.003,0.0002,0.0007,0.0013,0.0001


In [32]:
topic_c1 = H[['c1']]
topic_c1.sort_values(by='c1',ascending=False).head(10)

Unnamed: 0,c1
"know,great,great,country,country,tremendous,potential,country,great...",0.461
"day,feel,exactly,great,mike,great,congressman,feel,exactly,great...",0.4361
"year,american,dream,plan,bring,million,new,job,hispanic,community,crea...",0.4322
"remember,guy,great,great,great...",0.4238
"stand,shoulder,american,hero,cross,ocean,settle,continent,tame,wildern...",0.3788
"stand,shoulder,american,hero,cross,ocean,settle,continent,tame,wildern...",0.3784
"raise,great,skyscraper,lay,battleship,win,world,wars,defeat,fascism,co...",0.3477
"place,generation,tough,strong,pennsylvania,worker,mind,work,railroad,f...",0.3469
"governor,talk,great,job,great,job,fight,china,virus...",0.3209
"great,job,anchor,thing,fella,great,job...",0.3203
