# Import Dependencies

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import matplotlib.pyplot as plt

import pyLDAvis
import pyLDAvis.sklearn

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [2]:
# Rev.com transcript pages for the rallies analysed in this notebook.
# The order here must match the order of the names unpacked below.
speech_urls = [
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26',
]

# Scrape each speech with our imported 'get_transcript' (one call per URL, in list order)
(
    goodyear_az, bullhead_az,
    omaha_ne,
    wsalem_wi,
    lansing_mi,
    martinsburg_pa, lititz_pa, allentown_pa,
) = [get_transcript(url) for url in speech_urls]

# Corpus containing every speech
corpus = [
    goodyear_az, bullhead_az,
    omaha_ne,
    wsalem_wi,
    lansing_mi,
    martinsburg_pa, lititz_pa, allentown_pa,
]

# One corpus per state (a list even when the state has a single speech)
az = [goodyear_az, bullhead_az]
ne = [omaha_ne]
wi = [wsalem_wi]
mi = [lansing_mi]
pa = [martinsburg_pa, lititz_pa, allentown_pa]

# Stage 1: run our imported 'clean' over the full corpus, then over each state corpus
clean_corpus, clean_az, clean_ne, clean_wi, clean_mi, clean_pa = [
    clean(speeches) for speeches in (corpus, az, ne, wi, mi, pa)
]

# Stage 2: run our imported 'preprocess' over each cleaned corpus, in the same order
processed_corpus, processed_az, processed_ne, processed_wi, processed_mi, processed_pa = [
    preprocess(cleaned)
    for cleaned in (clean_corpus, clean_az, clean_ne, clean_wi, clean_mi, clean_pa)
]

# Document-Term Matrix

In [3]:
# CountVectorizer -> document-term counts -> DataFrame, for the full corpus and each state

def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's terms in column order.

    `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
    `get_feature_names_out()` is its replacement. Prefer the new method and fall
    back to the old one so the notebook runs on either side of that change.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def build_doc_term_matrix(documents):
    """Fit a fresh CountVectorizer on `documents` and tabulate the term counts.

    Parameters
    ----------
    documents : iterable of str
        The preprocessed texts, one entry per document.

    Returns
    -------
    (vectorizer, counts, frame)
        The fitted CountVectorizer, the matrix returned by `fit_transform`,
        and a dense DataFrame of the same counts with one column per term.
    """
    vectorizer = CountVectorizer(stop_words='english')
    counts = vectorizer.fit_transform(documents)
    frame = pd.DataFrame(counts.toarray(), columns=_vocabulary_terms(vectorizer))
    return vectorizer, counts, frame


# All speeches
cv, X, df = build_doc_term_matrix(processed_corpus)

# Per-state corpora (each gets its own independently fitted vocabulary)
cv_az, X_az, df_az = build_doc_term_matrix(processed_az)
cv_ne, X_ne, df_ne = build_doc_term_matrix(processed_ne)
cv_wi, X_wi, df_wi = build_doc_term_matrix(processed_wi)
cv_mi, X_mi, df_mi = build_doc_term_matrix(processed_mi)
cv_pa, X_pa, df_pa = build_doc_term_matrix(processed_pa)

# Latent Dirichlet Allocation (LDA)

In [5]:
# Shared LDA settings, so every model below is configured identically
N_TOPICS = 9
LDA_SEED = 0


def fit_topic_model(doc_term_counts):
    """Fit and return a LatentDirichletAllocation model on a document-term matrix.

    Every model uses N_TOPICS components and LDA_SEED as its random state.
    """
    model = LatentDirichletAllocation(n_components=N_TOPICS, random_state=LDA_SEED)
    return model.fit(doc_term_counts)  # fit() returns the estimator itself


# Model over all speeches
lda = fit_topic_model(X)

# One model per state.
# NOTE(review): ne, wi and mi each hold a single speech, so these models are
# fitted on a one-document corpus; confirm that is intended.
lda_az = fit_topic_model(X_az)
lda_ne = fit_topic_model(X_ne)
lda_wi = fit_topic_model(X_wi)
lda_mi = fit_topic_model(X_mi)
lda_pa = fit_topic_model(X_pa)

# Keep the last fitted estimator as the cell's displayed value
lda_pa

LatentDirichletAllocation(n_components=9, random_state=0)

In [15]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Interactive LDA visualization for the model fitted on all speeches.
# Leaving the prepared object as the cell's last expression displays it.
all_speeches_vis = pyLDAvis.sklearn.prepare(lda, X, cv)
all_speeches_vis