# Import Dependencies

In [8]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image

from wordcloud import WordCloud
from textwrap import wrap

import pyLDAvis
import pyLDAvis.sklearn

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [2]:
# Web scrape the most recent rally speeches using our imported 'get_transcript'.
# Each call is given one rev.com transcript URL and its result is bound to a
# variable named after the rally location.
# NOTE(review): presumably every call performs a network request and returns the
# transcript text - confirm against web_scrape.py.
# NOTE(review): in the cells shown in this notebook only `goodyear` is used afterwards.
goodyear = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28')
bullhead = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28')
omaha = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27')
wsalem = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27')
lansing = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27')
martinsburg = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26')
lititz = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26')
allentown = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26')

In [3]:
# Clean the Goodyear speech using our imported 'clean' function.
# NOTE(review): only `goodyear` is cleaned here; the other scraped speeches are
# not passed through `clean` in this cell.
clean_goodyear = clean(goodyear)

In [5]:
# Preprocess the cleaned Goodyear speech using our imported 'preprocess' function.
# NOTE(review): the result is fed to the vectorizers as a collection of documents
# (one string per row of the doc-term matrix) - confirm the return type in preprocess.py.
processed_goodyear = preprocess(clean_goodyear)

# Doc-Term Matrix

In [6]:
# CountVectorizer: build the document-term count matrix, one row per entry of
# `processed_goodyear`, dropping scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words='english')

X_cv = cv.fit_transform(processed_goodyear)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
cv_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
            else cv.get_feature_names())

df_cv = pd.DataFrame(X_cv.toarray(), columns=cv_terms)

In [24]:
# Inspect the document-term count matrix (rows: documents, columns: vocabulary terms).
df_cv

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# TF-IDF: reuse the CountVectorizer parameters so both matrices share the same
# tokenisation and stop-word configuration.
tfidf_vectorizer = TfidfVectorizer(**cv.get_params())

X_tfidf = tfidf_vectorizer.fit_transform(processed_goodyear)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
tfidf_terms = (tfidf_vectorizer.get_feature_names_out()
               if hasattr(tfidf_vectorizer, "get_feature_names_out")
               else tfidf_vectorizer.get_feature_names())

df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)



# LSA

In [9]:
# Latent Semantic Analysis (LSA) is just another name for Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
# TruncatedSVD uses a randomized solver by default, so pin `random_state` to make
# the components (and every table derived from them) reproducible on re-run.
lsa = TruncatedSVD(n_components=2, random_state=42)
doc_topic = lsa.fit_transform(X_cv)
lsa.explained_variance_ratio_

array([0.02330904, 0.0209941 ])

In [10]:
# Topic matrix: one row per LSA component, one column per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
lsa_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
             else cv.get_feature_names())
# Derive the row labels from the fitted model so the table still builds if the
# number of components is changed.
topics = pd.DataFrame(lsa.components_.round(3),
             index = ["component_{}".format(k + 1) for k in range(lsa.components_.shape[0])],
             columns = lsa_terms)
topics

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
component_1,0.004,0.004,0.001,0.014,0.001,0.047,0.001,0.004,0.001,0.01,...,0.002,-0.0,0.156,0.0,0.001,0.0,0.022,0.015,0.001,0.001
component_2,-0.001,-0.002,-0.001,-0.0,-0.001,0.014,-0.0,-0.001,-0.0,0.004,...,-0.0,-0.0,-0.013,0.0,-0.0,-0.0,-0.018,-0.006,-0.001,-0.001


In [12]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``; it is iterated row by row and
        each row is treated as one topic.
    feature_names : sequence indexed by the column positions of ``components_``.
    no_top_words : int, number of terms printed per topic.
    topic_names : optional sequence of labels. When it is omitted, or the
        entry for a topic is falsy, the topic's numeric index is printed instead.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[pos] for pos in strongest))

In [21]:
# Print the 20 highest-weighted terms of each LSA component.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
lsa_top_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())
display_topics(lsa, lsa_top_terms, 20)


Topic  0
great, job, people, business, know, american, year, want, win, hispanic, vote, million, new, president, right, state, biden, country, america, think

Topic  1
usa, job, rand, great, business, good, know, million, governor, end, community, opportunity, dollar, easy, federal, create, billion, natural, expand, contracting


In [19]:
# Document-topic table: one row per processed document, one column per LSA component.
# NOTE(review): despite the name `Vt`, this frame wraps `doc_topic` (the output of
# `lsa.fit_transform`), not the term-side factor held in `lsa.components_`.
# The name is left unchanged because a later cell references it.
Vt = pd.DataFrame(doc_topic.round(5),
             index = processed_goodyear,
             columns = ["component_1","component_2" ])
Vt.head(20)

Unnamed: 0,component_1,component_2
"deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus,china,plague,quickly,end,pandemic",0.56507,0.27978
"normal,life,want,want,normal,life",0.38925,-0.26137
"like,seven,month,ago",0.11145,-0.03081
"fully,resume,year,great,economic,power",0.77298,0.06831
strong,0.04139,-0.0087
happen,0.02579,-0.03687
"year,good,economic,year,country,history",0.52979,-0.019
"year,interrupt",0.15748,-0.01346
interrupt,0.0013,-0.0002
"america,great",0.66816,0.02378


In [22]:
# Pairwise cosine similarity between documents in the 2-component LSA space
# (rows of `Vt` are documents, so the result is documents x documents).
cosine_similarity(Vt)

array([[1.        , 0.49665064, 0.74554078, ..., 0.47803219, 0.57256335,
        0.84892932],
       [0.49665064, 1.        , 0.94872763, ..., 0.99977268, 0.9959623 ,
        0.8803387 ],
       [0.74554078, 0.94872763, 1.        , ..., 0.94177252, 0.97327355,
        0.9851398 ],
       ...,
       [0.47803219, 0.99977268, 0.94177252, ..., 1.        , 0.99382187,
        0.87002507],
       [0.57256335, 0.9959623 , 0.97327355, ..., 0.99382187, 1.        ,
        0.91936732],
       [0.84892932, 0.8803387 , 0.9851398 , ..., 0.87002507, 0.91936732,
        1.        ]])

In [23]:
# Pairwise cosine similarity between documents on the raw count matrix,
# rounded to two decimals for display.
cos_dtm = np.round(cosine_similarity(df_cv), decimals=2)
cos_dtm

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

# Non-negative Matrix Factorization

In [25]:
# Short row labels: the first 30 characters of each processed document plus an ellipsis.
ex_label = [e[:30]+"..." for e in processed_goodyear]

# A bare expression is only displayed when it is the last statement of a cell,
# so the listing of the processed documents goes last.
processed_goodyear

['deliver,record,prosperity,epic,job,growth,safe,vaccine,eradicate,virus,china,plague,quickly,end,pandemic',
 'normal,life,want,want,normal,life',
 'like,seven,month,ago',
 'fully,resume,year,great,economic,power',
 'strong',
 'happen',
 'year,good,economic,year,country,history',
 'year,interrupt',
 'interrupt',
 'america,great',
 'vote,joe,biden,sleepy,joe,vote,big,tax,hike,history',
 'man,campaign,fact,raise,taxis',
 '',
 '',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock,crush,regulation,crush',
 'idea,want',
 'regulation,administration,history,country',
 'regulation,cause,waste,lot,problem,slash,medicare,social,security,want,abolish,american,energy',
 'know,fracke',
 'fracke',
 'pennsylvania,maybe,frack',
 'people,let,frack,joe,clue',
 'know',
 '',
 '',
 'state',
 'wrong,seven,time',
 '',
 'arizona,nevada',
 '',
 'tell',
 'iowa',
 'oh,oh,oh',
 'oh',
 'hot',
 'know',
 'florida',
 'joe',
 'joe,vote,biden,vote,open,border,offshore,job,shred,second,amend

In [29]:
# Same count matrix as the CountVectorizer output, indexed by the truncated
# document previews in `ex_label`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
nmf_input_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                   else cv.get_feature_names())
df_nmf = pd.DataFrame(X_cv.toarray(), index=ex_label, columns=nmf_input_terms)

In [30]:
# Inspect the count matrix with the shortened document labels as its index.
df_nmf

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
"deliver,record,prosperity,epic...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"normal,life,want,want,normal,l...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"like,seven,month,ago...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"fully,resume,year,great,econom...",0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
strong...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
thank...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"thank,arizona...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
vote...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"thank,weekly,digest,week,impor...",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Factorise the count matrix into 2 non-negative components.
# Pin `random_state` so the initialisation, and therefore the fitted topics,
# is reproducible on re-run.
nmf_model = NMF(n_components=2, random_state=42)
doc_topic_nmf = nmf_model.fit_transform(X_cv)

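The W matrix shows us the 2 resulting topics, and the terms that are associated with each topic (in scikit-learn this topic-term factor is `nmf_model.components_`). In this case:
- Component 1 (topic 1) seems to be about jobs, business and the economy (top terms include *great*, *job*, *people*, *business*, *american*).
- Component 2 (topic 2) seems to be about military spending and hardware (top terms include *trillion*, *missile*, *submarine*, *tank*, *rocket*, *stealth*, *jet*).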

In [32]:
# Topic-term table for NMF: one row per component, one column per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
nmf_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
             else cv.get_feature_names())
# Derive the row labels from the fitted model so the table still builds if the
# number of components is changed.
topic_word_nmf = pd.DataFrame(nmf_model.components_.round(3),
             index = ["component_{}".format(k + 1) for k in range(nmf_model.components_.shape[0])],
             columns = nmf_terms)
topic_word_nmf

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
component_1,0.013,0.013,0.004,0.047,0.005,0.159,0.003,0.012,0.003,0.034,...,0.006,0.0,0.538,0.0,0.005,0.0,0.076,0.052,0.003,0.005
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Print the 20 highest-weighted terms of each NMF component.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
nmf_top_terms = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())
display_topics(nmf_model, nmf_top_terms, 20)


Topic  0
great, job, people, business, know, year, american, win, want, hispanic, vote, million, new, president, right, state, biden, country, america, think

Topic  1
usa, job, rand, good, trillion, missile, submarine, fine, tank, rocket, stealth, jet, paul, china, governor, virus, happen, worried, extremely, vaccine


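The H matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics (in scikit-learn these document-topic weights are the output of `nmf_model.fit_transform`). In this case:
- The first document seems to be about mostly component 1, with a smaller contribution from component 2 (weights of roughly 0.158 and 0.066).
- The last document seems to be about neither topic in particular; both of its weights are close to zero (roughly 0.009 and 0.00007).
- Everything in between is a combination of the two components, with most of the displayed documents loading only on component 1.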

In [35]:
# Document-topic weights from the NMF fit, rounded to 5 decimals and labelled
# with the shortened document previews.
H = pd.DataFrame(
    data=doc_topic_nmf.round(5),
    index=ex_label,
    columns=["component_1", "component_2"],
)
H

Unnamed: 0,component_1,component_2
"deliver,record,prosperity,epic...",0.15805,0.06566
"normal,life,want,want,normal,l...",0.11645,0.00000
"like,seven,month,ago...",0.03290,0.00000
"fully,resume,year,great,econom...",0.22855,0.00000
strong...,0.01232,0.00000
...,...,...
thank...,0.00209,0.00054
"thank,arizona...",0.01802,0.00000
vote...,0.04022,0.00000
"thank,weekly,digest,week,impor...",0.00918,0.00007
