# Import Dependencies

In [1]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image

from wordcloud import WordCloud
from textwrap import wrap

import pyLDAvis
import pyLDAvis.sklearn

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [2]:
# Web scrape most recent speeches using our imported 'get_transcript'
goodyear = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28')
bullhead = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28')
omaha = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27')
wsalem = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27')
lansing = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27')
martinsburg = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26')
lititz = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26')
allentown = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26')

In [3]:
# Clean all of the speeches in the corpus using our imported 'clean' function
clean_goodyear = clean(goodyear)

In [4]:
# Preprocess the speeches using our imported 'preprocess' function
processed_goodyear = preprocess(clean_goodyear)

# Doc-Term Matrix

In [6]:
# Countvectorizer
cv = CountVectorizer(stop_words='english')

X_cv = cv.fit_transform(processed_goodyear)

df_cv = pd.DataFrame(X_cv.toarray(), columns=cv.get_feature_names())

In [7]:
df_cv

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# TF_IDF
tfidf_vectorizer = TfidfVectorizer(**cv.get_params())

X_tfidf = tfidf_vectorizer.fit_transform(processed_goodyear)

df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names())



# LSA

In [54]:
# Latent Semantic Analysis (LSA) is just another name for Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
# Let's compare explained variance using each vectorizer
lsa_cv = TruncatedSVD(6)
lsa_tfidf = TruncatedSVD(6)

# Doc_topic matrices will be used later
doc_topic_cv = lsa_cv.fit_transform(X_cv)
doc_topic_tfidf = lsa_tfidf.fit_transform(X_tfidf)

total_expl_var_cv = round(sum(lsa_cv.explained_variance_ratio_), 4) * 100
total_expl_var_tfidf = round(sum(lsa_tfidf.explained_variance_ratio_), 4) * 100

print(f"Explained variance using CountVectorizer: {total_expl_var_cv}%")
print(f"Explained variance using TFIDF: {total_expl_var_tfidf}%")

Explained variance using CountVectorizer: 11.48%
Explained variance using TFIDF: 10.100000000000001%


In [22]:
# CountVectorizer has a slightly higher explained variance, so we will use this model from here on out

In [56]:
# Topic matrix
topics = pd.DataFrame(lsa_cv.components_.round(3),
             index = ["c1","c2","c3","c4","c5","c6"],
             columns = cv.get_feature_names())
topics

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
c1,0.004,0.004,0.001,0.014,0.001,0.047,0.001,0.003,0.001,0.01,...,0.002,0.0,0.156,0.0,0.001,0.0,0.022,0.015,0.001,0.001
c2,-0.001,-0.002,-0.001,0.001,-0.001,0.014,-0.0,-0.001,-0.0,0.003,...,-0.001,0.0,-0.012,-0.0,-0.001,-0.0,-0.014,-0.007,-0.001,-0.001
c3,-0.0,-0.0,0.0,-0.004,0.0,-0.02,-0.0,-0.001,-0.0,-0.003,...,-0.0,-0.0,-0.034,0.0,-0.0,0.0,0.01,-0.001,0.001,0.0
c4,0.004,0.008,0.001,-0.013,0.004,-0.006,0.001,0.006,0.001,-0.005,...,0.002,-0.0,0.06,0.0,0.001,-0.0,0.006,0.041,0.003,0.001
c5,0.003,-0.002,-0.002,-0.011,0.001,0.055,0.001,0.004,-0.001,0.001,...,0.0,0.0,0.046,0.0,-0.003,0.0,0.006,0.009,-0.002,-0.002
c6,0.001,-0.009,0.0,0.002,0.0,-0.005,-0.001,-0.001,0.001,0.001,...,0.0,-0.0,0.123,-0.0,-0.001,-0.0,0.025,-0.015,-0.003,-0.001


In [64]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [65]:
display_topics(lsa_cv, cv.get_feature_names(), 10)


Topic  0
great, job, people, business, know, year, american, win, want, hispanic

Topic  1
usa, job, rand, great, business, know, good, million, governor, opportunity

Topic  2
win, usa, number, rand, election, carolina, maybe, vote, right, time

Topic  3
want, vote, people, joe, right, biden, president, americans, usa, know

Topic  4
know, people, business, win, hispanic, million, new, job, look, american

Topic  5
vote, biden, joe, year, job, sleepy, corrupt, china, great, history


In [71]:
# Vt matrix
Vt = pd.DataFrame(doc_topic_cv.round(5),
             index = processed_goodyear,
             columns = ["c1","c2","c3","c4","c5","c6"])
Vt.head(10)

Unnamed: 0,c1,c2,c3,c4,c5,c6
"deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus,china,plague,quickly,end,pandemic",0.56616,0.25444,-0.02699,-0.14776,0.20746,0.26449
"normal,life,want,want,normal,life",0.38865,-0.26249,0.05933,0.78532,-0.56159,-1.01172
"like,seven,month,ago",0.11068,-0.03502,0.01116,0.08277,-0.11057,-0.09609
"fully,resume,year,great,economic,power",0.77313,0.07285,-0.20634,-0.47415,-0.27348,0.20094
strong,0.04135,-0.00723,-0.00761,0.00445,-0.04316,0.00027
happen,0.02561,-0.03393,0.04617,-0.00072,0.05608,-0.05735
"year,good,economic,year,country,history",0.5279,-0.02862,-0.08231,0.09165,-0.03102,0.27232
"year,interrupt",0.15765,-0.01218,-0.03424,0.06078,0.04678,0.12521
interrupt,0.0013,-0.00014,-0.00043,0.00088,0.00055,0.00224
"america,great",0.6687,0.02538,-0.14455,-0.45491,-0.52644,0.03192


In [74]:
cosine_similarity(Vt.head(5))

array([[ 1.        , -0.32710971, -0.04827846,  0.73735032,  0.27864264],
       [-0.32710971,  1.        ,  0.91718693, -0.10437074,  0.50070961],
       [-0.04827846,  0.91718693,  1.        ,  0.25782562,  0.79146665],
       [ 0.73735032, -0.10437074,  0.25782562,  1.        ,  0.70783417],
       [ 0.27864264,  0.50070961,  0.79146665,  0.70783417,  1.        ]])

In [68]:
cos_dtm = cosine_similarity(df_cv).round(2)
cos_dtm

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

# Non-negative Matrix Factorization

In [88]:
processed_goodyear
ex_label = [e[:70]+"..." for e in processed_goodyear]

In [89]:
df_nmf = pd.DataFrame(X_cv .toarray(), index=ex_label, columns=cv.get_feature_names())

In [90]:
df_nmf

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
"deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"normal,life,want,want,normal,life...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"like,seven,month,ago...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"fully,resume,year,great,economic,power...",0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
strong...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
thank...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"thank,arizona...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
vote...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"thank,weekly,digest,week,important,transcript,inbox...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
nmf_model = NMF(6)
doc_topic_nmf = nmf_model.fit_transform(X_cv)

The W matrix shows us the 2 resulting topics, and the terms that are associated with each topic. In this case:
- Component 1 (topic 1) seems to be about _____
- Component 2 (topic 2) seems to be about _____

In [92]:
topic_word_nmf = pd.DataFrame(nmf_model.components_.round(3),
             index = ["c1","c2","c3","c4","c5","c6"],
             columns = cv.get_feature_names())
topic_word_nmf

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
c1,0.0,0.0,0.003,0.088,0.0,0.025,0.0,0.0,0.001,0.049,...,0.0,0.0,0.19,0.0,0.0,0.0,0.024,0.0,0.0,0.001
c2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c3,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018,0.0,0.0,0.0,0.055,0.0,0.0,0.001
c4,0.016,0.006,0.0,0.0,0.003,0.246,0.005,0.021,0.0,0.011,...,0.006,0.0,0.441,0.0,0.002,0.0,0.038,0.037,0.0,0.0
c5,0.003,0.03,0.004,0.002,0.004,0.0,0.002,0.003,0.002,0.0,...,0.005,0.0,0.012,0.0,0.008,0.0,0.0,0.065,0.015,0.009
c6,0.007,0.0,0.003,0.0,0.005,0.0,0.0,0.0,0.004,0.0,...,0.0,0.0,0.257,0.0,0.0,0.0,0.071,0.049,0.0,0.0


In [93]:
display_topics(nmf_model, cv.get_feature_names(), 10)


Topic  0
great, job, think, state, governor, country, good, doctor, history, american

Topic  1
usa, job, rand, good, trillion, missile, tank, stealth, rocket, fine

Topic  2
win, number, election, carolina, maybe, time, day, happen, democrats, north

Topic  3
know, people, business, hispanic, million, new, job, american, year, look

Topic  4
want, right, president, war, america, like, life, hate, support, overseas

Topic  5
vote, biden, joe, year, corrupt, sleepy, americans, world, lock, second


The H matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. In this case:
- The first document seems to be about _____
- The last document seems to be about _____
- Everything in between is a combination of _____

In [94]:
H = pd.DataFrame(doc_topic_nmf.round(4),
             index = ex_label,
             columns = ["c1","c2","c3","c4","c5","c6"])
H.head(5)

Unnamed: 0,c1,c2,c3,c4,c5,c6
"deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus...",0.0998,0.0438,0.0046,0.0959,0.0,0.0668
"normal,life,want,want,normal,life...",0.0,0.0,0.0,0.0,0.5801,0.0
"like,seven,month,ago...",0.0017,0.0006,0.0,0.0076,0.0834,0.0043
"fully,resume,year,great,economic,power...",0.2555,0.0,0.0,0.0143,0.0,0.0236
strong...,0.011,0.0,0.0,0.0,0.0134,0.0036
