# Importing Dependencies

In [31]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image

from wordcloud import WordCloud
from textwrap import wrap
from sklearn.manifold import TSNE

import pyLDAvis
import pyLDAvis.sklearn

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [2]:
# Web scrape most recent speeches using our imported 'get_transcript'
goodyear_az = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28')
bullhead_az = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28')
omaha_ne = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27')
wsalem_wi = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27')
lansing_mi = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27')
martinsburg_pa = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26')
lititz_pa = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26')
allentown_pa = get_transcript('https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26')

# Create corpus for each state and the for all of the speaces
corpus = [goodyear_az, bullhead_az, omaha_ne, wsalem_wi, lansing_mi, martinsburg_pa, lititz_pa, allentown_pa]
az = [goodyear_az, bullhead_az]
ne = [omaha_ne]
wi = [wsalem_wi]
mi = [lansing_mi]
pa = [martinsburg_pa, lititz_pa, allentown_pa]

# Clean corpuses using our imported 'clean' function
clean_corpus = clean(corpus)
clean_az = clean(az)
clean_ne = clean(ne)
clean_wi = clean(wi)
clean_mi = clean(mi)
clean_pa = clean(pa)

# Preprocess corpuses using our import 'clean' function
processed_corpus = preprocess(clean_corpus)
processed_az = preprocess(clean_az)
processed_ne = preprocess(clean_ne)
processed_wi = preprocess(clean_wi)
processed_mi = preprocess(clean_mi)
processed_pa = preprocess(clean_pa)

# Doc-Term Matrices

In [None]:
# Let's create our doc-term matrices for the entire corpus and for each individual state

In [3]:
# Countvectorizer, make dataframes
cv = CountVectorizer(stop_words='english', ngram_range=(1,3))
cv_az = CountVectorizer(stop_words='english', ngram_range=(1,3))
cv_ne = CountVectorizer(stop_words='english', ngram_range=(1,3))
cv_wi = CountVectorizer(stop_words='english', ngram_range=(1,3))
cv_mi = CountVectorizer(stop_words='english', ngram_range=(1,3))
cv_pa = CountVectorizer(stop_words='english', ngram_range=(1,3))

X = cv.fit_transform(processed_corpus)

X_az = cv_az.fit_transform(processed_az)
df_az = pd.DataFrame(X_az.toarray(), columns=cv_az.get_feature_names())

X_ne = cv_ne.fit_transform(processed_ne)
df_ne = pd.DataFrame(X_ne.toarray(), columns=cv_ne.get_feature_names())

X_wi = cv_wi.fit_transform(processed_wi)
df_wi = pd.DataFrame(X_wi.toarray(), columns=cv_wi.get_feature_names())

X_mi = cv_mi.fit_transform(processed_mi)
df_mi = pd.DataFrame(X_mi.toarray(), columns=cv_mi.get_feature_names())

X_pa = cv_pa.fit_transform(processed_pa)
df_pa = pd.DataFrame(X_pa.toarray(), columns=cv_pa.get_feature_names())

# Add 'State' column to each dataframe
df_az['State'] = [0] * len(df_az) # AZ
df_ne['State'] = [1] * len(df_ne) # NE
df_wi['State'] = [2] * len(df_wi) # WI
df_mi['State'] = [3] * len(df_mi) # MI
df_pa['State'] = [4] * len(df_pa) # PA

# Concatenate the state dataframes into one and fillnas with 0
df = pd.concat([df_az, df_ne, df_wi, df_mi, df_pa])
df.fillna(0, inplace=True)

# EDA Using NMF

In [4]:
# Import NMF, fit_transform all corpuses
nmf = NMF(10)
nmf_az = NMF(10)
nmf_ne = NMF(10)
nmf_wi = NMF(10)
nmf_mi = NMF(10)
nmf_pa = NMF(10)

doc_topic_nmf = nmf.fit_transform(X)
doc_topic_nmf_az = nmf_az.fit_transform(X_az)
doc_topic_nmf_ne = nmf_ne.fit_transform(X_ne)
doc_topic_nmf_wi = nmf_wi.fit_transform(X_wi)
doc_topic_nmf_mi = nmf_mi.fit_transform(X_mi)
doc_topic_nmf_pa = nmf_pa.fit_transform(X_pa)

In [5]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Input: A model, feature names object, and number of top words to display, and list-like of topic names
    Output: topics and their top words
    """
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

## Overall Topics

1. Jobs/Economy/Patriotism
2. Winning election
3. Sleepy Joe/radical socialist (trolling Joe Biden)
4. China/world trade
5. America/Patriotism
6. 1st/2nd Amendment
7. Healthcare/drug prices
8. 'Great', but what is great? He may be boasting that everything is great

In [120]:
df_legend

Unnamed: 0,Topics,Related Words
0,Patriotism,"[great, country, history, america, nation]"
1,China/World Trade,"[china, world, trade, deal, world trade]"
2,Trolling Dems,"[biden, joe, sleepy, sleepy joe, corrupt, soci..."
3,Winning Election,"[vote, president, trump, vote republicans, rep..."
4,Conservative Values,"[right, religious liberty, defend, speech, rep..."
5,COVID,"[covid, turn, news, talk covid, fake, talk]"
6,Economy,"[job, cut, tax, tax, big tax, economy]"
7,Border,"[border, wall, hispanic]"


In [119]:
legend_dict = {'Topics':['Patriotism','China/World Trade','Trolling Dems','Winning Election','Conservative Values','COVID','Economy','Border'],
               'Related Words': [['great','country','history','america','nation'],
                                 ['china','world','trade','deal','world trade'],
                                 ['biden','joe','sleepy','sleepy joe','corrupt','socialist'],
                                 ['vote','president','trump','vote republicans','republicans','win'],
                                 ['right','religious liberty','defend','speech', 'republicans','bear'],
                                 ['covid','turn','news','talk covid', 'fake','talk'],
                                 ['job','cut','tax','tax','big tax','economy'],
                                 ['border','wall','hispanic']]
                      }
df_legend = pd.DataFrame(data=legend_dict)

In [108]:
# Create list
topic_names = np.array(['Patriotism','China/World Trade','Trolling Dems','Winning Election','Conservative Values','COVID','Economy'])

# Create list
topic_values = np.array([np.sum(df[['great','country','history','america','nation']].sum().values),
                np.sum(df[['china','world','trade','deal','world trade']].sum().values),
                np.sum(df[['biden','joe','sleepy','sleepy joe','corrupt','socialist']].sum().values),
                np.sum(df[['vote','president','trump','vote republicans','republicans','win']].sum().values),
                np.sum(df[['right','religious liberty','defend','speech', 'republicans','bear']].sum().values),
                np.sum(df[['covid','turn','news','talk covid', 'fake','talk']].sum().values),
                np.sum(df[['job','cut','tax','tax','big tax','economy']].sum().values)])

# Sort both lists by descending order of topic values
topic_names = topic_names[np.argsort(topic_values)[::-1]]
topic_values = topic_values[np.argsort(topic_values)[::-1]]

# Create dictionary to be used in dataframe
topic_dict = {'Topics': topic_names,'Number of Related Words': topic_values}

# Dataframe for plotly
df_topic = pd.DataFrame(data=topic_dict)

In [109]:
fig = px.bar(data_frame=df_topic, x='Topics', y='Number of Related Words', title='Number of Related Words', labels={'Number of Related Words':''})
fig.show();

In [8]:
# Entire corpus
display_topics(nmf, cv.get_feature_names(), 10)


Topic  0
great, job, great job, world, america, history, nation, stand, state, america great

Topic  1
china, world, trade, support, united, states, united states, deal, world trade, organization

Topic  2
right, liberty, speech, good, defend, free, religious liberty, religious, speech right, bear

Topic  3
win, win win, america, fighting, fighting win win, fighting win, win win win, america win, win america, win america win

Topic  4
biden, joe, joe biden, sleepy, sleepy joe, sleepy joe biden, corrupt, plan, biden corrupt, socialist

Topic  5
usa, usa usa, usa usa usa, job, usa job, usa usa job, catch, spy, catch spy, usa catch

Topic  6
people, american, people people, incredible, great people, million, pennsylvania, stand, movement, incredible people

Topic  7
covid, covid covid, covid covid covid, turn, turn covid covid, turn covid, talk, news, talk covid, fake

Topic  8
country, big, history, history country, cut, tax, tax cut, love, big tax, american

Topic  9
vote, president, a

## Arizona Topics
1. Jobs/Economy/Patriotism
2. Sleepy Joe/radical socialist (trolling Joe Biden)
3. Winning Election
4. Drug prices/healthcare
5. Hispanic/border (appeals to demographic)
6. Hate war?
7. 2nd Amendment/Patriotism/Freedom/Liberty

In [113]:
# Create list
topic_names_az = np.array(['Patriotism','China/World Trade','Trolling Dems','Winning Election','Conservative Values','Border','Economy'])

# Create list
topic_values_az = np.array([np.sum(df_az[['usa','great','history','america','nation']].sum().values),
                np.sum(df_az[['china','world','trade','deal']].sum().values),
                np.sum(df_az[['biden','joe','sleepy','sleepy joe','corrupt','socialist','spy','catch']].sum().values),
                np.sum(df_az[['vote','president','trump','vote republicans','republicans','win']].sum().values),
                np.sum(df_az[['right','religious liberty','defend','speech', 'republicans','bear','free','life','right']].sum().values),
                np.sum(df_az[['border','wall','hispanic']].sum().values),
                np.sum(df_az[['economy','job','million','business','business people','million new']].sum().values)])

# Sort both lists by descending order of topic values
topic_names_az = topic_names_az[np.argsort(topic_values_az)[::-1]]
topic_values_az = topic_values_az[np.argsort(topic_values_az)[::-1]]

# Create dictionary to be used in dataframe
topic_dict_az = {'Topics': topic_names_az,'Number of Related Words': topic_values_az}

# Dataframe for plotly
df_topic_az = pd.DataFrame(data=topic_dict_az)

In [114]:
fig_az = px.bar(data_frame=df_topic_az, x='Topics', y='Number of Related Words', title='Number of Related Words', labels={'Number of Related Words':''})
fig_az.show();

In [92]:
# AZ corpus
display_topics(nmf_az, cv_az.get_feature_names(), 10)


Topic  0
usa, usa usa, usa usa usa, job, usa job, usa usa job, catch, spy, catch spy, usa catch

Topic  1
great, job, great job, governor, great great, state, great state, good, doctor, great doctor

Topic  2
biden, joe, vote, joe biden, sleepy, sleepy joe, second, border, amendment, second amendment

Topic  3
win, win win, nevada, election, carolina, number, maybe, arizona, win nevada, win number

Topic  4
business, hispanic, million, job, new, business people, million new, people, american, dream

Topic  5
right, life, arizona, second, amendment, second amendment, free, life right, defend, liberty

Topic  6
american, brave, state, arizona, west, generation, sun, hand, pioneer, brave blazing

Topic  7
people, nevada, arizona, people arizona, arizona nevada, people nevada, people arizona nevada, stand, american people, people people

Topic  8
country, history, love, big, history country, happen, election, number, love country, socialist

Topic  9
war, america, president, support, hate

## Nebraska
1. Normal life (going back to normal, aka ignoring COVID?)
2. Jobs/Economy/Tax cut/Unemployment
3. Joe Biden-corrupt (trolling Joe Biden)
4. Winning election

In [None]:
# NE corpus
display_topics(nmf_ne, cv_ne.get_feature_names(), 10)

## Wisconsin
1. America/Patriotism/Liberty/Religious Liberty
2. China/World Trade
3. Wall
4. Trolling Joe Biden/Clinton and the left

In [None]:
# WI corpus
display_topics(nmf_wi, cv_wi.get_feature_names(), 10)

## Michigan
1. Covid/Scaring people with Covid
2. China/world trade
3. Supreme court justices
4. Winning Michigan/election
5. Social security/medicare
6. Veterans
7. Record low unemployment/Economy
8. Michigan governer

In [131]:
# Create list
topic_names_mi = np.array(['Patriotism','China/World Trade','Trolling Dems','Winning Election','Conservative Values','COVID','Economy'])

# Create list
topic_values_mi = np.array([np.sum(df_mi[['usa','great','history','america','nation']].sum().values),
                np.sum(df_mi[['china','world','trade','deal']].sum().values),
                np.sum(df_mi[['biden','joe','sleepy','sleepy joe','corrupt','socialist','catch']].sum().values),
                np.sum(df_mi[['vote','president','trump','vote republicans','republicans','win']].sum().values),
                np.sum(df_mi[['right','religious liberty','defend','speech', 'republicans','bear','free','life','right']].sum().values),
                np.sum(df_mi[['covid','turn','news', 'fake','talk','covid scare']].sum().values),
                np.sum(df_mi[['economy','job','million','business','cut','social','social security','unemployment','african american unemployment']].sum().values)])

# Sort both lists by descending order of topic values
topic_names_mi = topic_names_mi[np.argsort(topic_values_mi)[::-1]]
topic_values_mi = topic_values_mi[np.argsort(topic_values_mi)[::-1]]

# Create dictionary to be used in dataframe
topic_dict_mi = {'Topics': topic_names_mi,'Number of Related Words': topic_values_mi}

# Dataframe for plotly
df_topic_mi = pd.DataFrame(data=topic_dict_mi)

In [132]:
fig_mi = px.bar(data_frame=df_topic_mi, x='Topics', y='Number of Related Words', title='Number of Related Words', labels={'Number of Related Words':''})
fig_mi.show();

In [121]:
# MI corpus
display_topics(nmf_mi, cv_mi.get_feature_names(),10)


Topic  0
covid, covid covid, covid covid covid, turn, turn covid, turn covid covid, covid scare people, covid scare, heavy covid, heavy

Topic  1
china, trade, world, support, trade organization, organization, world trade, world trade organization, states, united states

Topic  2
people, people people, help, people help, vote, car, people vote, blame, understand, big

Topic  3
win, win win, win michigan, michigan, michigan win, america, win win michigan, win michigan win, win america win, win america

Topic  4
great, right, stand, white house, white, house, doctor, winston churchill, winston, churchill

Topic  5
right, court, supreme court, supreme, justice, liberty, defend, barrett, second, amendment

Topic  6
cut, time, security, social, social security, veteran, cut social, cut social security, talk, medicare

Topic  7
american, unemployment, american unemployment, low, record, level, low level, african american unemployment, reach, american unemployment asian

Topic  8
michigan, m

## Pennsylvania
1. China/world trade
2. Sleepy Joe (trolling Joe Biden)
3. Great country/Patriotism
4. President Trump (talks about him self alot, potential boasting)
5. Tax cut/jobs/unemployment/economy
6. Winning Pennsylvania and the election

In [None]:
# PA corpus
display_topics(nmf_pa, cv_pa.get_feature_names(), 10)