# Import Dependencies

In [1]:
import re
import pickle
import string
import numpy as np
import pandas as pd

# import spacy
# from spacy.lang.en.stop_words import STOP_WORDS
# from spacy.attrs import IS_ALPHA
# from spacy.lang.en import English
# from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import matplotlib.pyplot as plt

# from wordcloud import WordCloud
# from textwrap import wrap

import pyLDAvis
import pyLDAvis.sklearn

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [2]:
# Scrape the most recent rally speeches with the imported 'get_transcript'.
# The URLs are listed in the same order as the names they are unpacked into below.
speech_urls = [
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26',
]

(
    goodyear_az,
    bullhead_az,
    omaha_ne,
    wsalem_wi,
    lansing_mi,
    martinsburg_pa,
    lititz_pa,
    allentown_pa,
) = [get_transcript(speech_url) for speech_url in speech_urls]

In [3]:
# Group the scraped transcripts by state: one list of speeches per state
az = [
    goodyear_az,
    bullhead_az,
]
ne = [omaha_ne]
wi = [wsalem_wi]
mi = [lansing_mi]
pa = [
    martinsburg_pa,
    lititz_pa,
    allentown_pa,
]

In [4]:
# Run the imported 'clean' over each state corpus (AZ, NE, WI, MI, PA, in that order)
clean_az, clean_ne, clean_wi, clean_mi, clean_pa = (
    clean(state_corpus) for state_corpus in (az, ne, wi, mi, pa)
)

In [5]:
# Run the imported 'preprocess' over each cleaned state corpus (AZ, NE, WI, MI, PA, in that order)
processed_az, processed_ne, processed_wi, processed_mi, processed_pa = (
    preprocess(cleaned) for cleaned in (clean_az, clean_ne, clean_wi, clean_mi, clean_pa)
)

In [6]:
# Combined corpus: every scraped speech, listed state by state
corpus = [
    goodyear_az, bullhead_az,                 # AZ
    omaha_ne,                                 # NE
    wsalem_wi,                                # WI
    lansing_mi,                               # MI
    martinsburg_pa, lititz_pa, allentown_pa,  # PA
]

In [7]:
# Clean the combined corpus with 'clean' (imported from preprocess.py),
# the same function applied to the per-state corpora.
clean_corpus = clean(corpus)

In [8]:
# Preprocess the cleaned combined corpus with 'preprocess' (imported from preprocess.py);
# the result is what the corpus-wide CountVectorizer is fitted on.
processed_corpus = preprocess(clean_corpus)

# Document-Term Matrix

In [27]:
# CountVectorizer: build document-term matrices and per-state dataframes

def _feature_names(vectorizer):
    """Return the fitted vocabulary of `vectorizer` in column order.

    scikit-learn 1.0 introduced `get_feature_names_out()` and 1.2 removed the
    older `get_feature_names()`, so prefer the new accessor when it exists and
    fall back to the old one on older installations.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def _fit_doc_term(docs):
    """Fit a fresh English-stop-word CountVectorizer on `docs`.

    Returns a `(vectorizer, matrix, frame)` tuple: the fitted vectorizer, the
    sparse count matrix from `fit_transform`, and a dense DataFrame of the same
    counts with one column per vocabulary term.
    """
    vectorizer = CountVectorizer(stop_words='english')
    matrix = vectorizer.fit_transform(docs)
    frame = pd.DataFrame(matrix.toarray(), columns=_feature_names(vectorizer))
    return vectorizer, matrix, frame


# Whole-corpus matrix (this is what the LDA model is fitted on)
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(processed_corpus)

# Per-state matrices; each state gets its own vectorizer and therefore its own vocabulary
cv_az, X_az, df_az = _fit_doc_term(processed_az)
cv_ne, X_ne, df_ne = _fit_doc_term(processed_ne)
cv_wi, X_wi, df_wi = _fit_doc_term(processed_wi)
cv_mi, X_mi, df_mi = _fit_doc_term(processed_mi)
cv_pa, X_pa, df_pa = _fit_doc_term(processed_pa)

In [28]:
# Label every row with an integer code for the state its speech was given in.
# Position in the list is the code: AZ=0, NE=1, WI=2, MI=3, PA=4.
state_frames = [df_az, df_ne, df_wi, df_mi, df_pa]
for state_code, state_frame in enumerate(state_frames):
    state_frame['State'] = [state_code] * len(state_frame)

In [29]:
# Stack the per-state dataframes into a single frame.
# ignore_index=True renumbers the rows 0..n-1; without it every state keeps its
# own 0-based labels and the combined index contains duplicates, which makes
# label-based selection (df.loc[...]) ambiguous.
# The per-state frames have different word columns, so the concat leaves NaN where
# a word never occurred in a state's speeches; those are filled with a count of 0.
df = pd.concat([df_az, df_ne, df_wi, df_mi, df_pa], ignore_index=True).fillna(0)

# Topic Modeling with Latent Dirichlet Allocation (LDA)

In [34]:
# Fit LDA on the term-frequency (raw count) document-term matrix X.
# The seed is pinned so repeated runs give the same topics.
N_TOPICS = 5
RANDOM_SEED = 0
lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_SEED)
lda.fit(X)

LatentDirichletAllocation(n_components=5, random_state=0)

In [35]:
# Interactive pyLDAvis view of the LDA model fitted on the CountVectorizer matrix.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.lda_model rather than pyLDAvis.sklearn; confirm against the installed version.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.sklearn.prepare(lda, X, cv)
lda_vis