# Import Dependencies

In [None]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from wordcloud import WordCloud
from textwrap import wrap

import pyLDAvis
import pyLDAvis.sklearn

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [None]:
# Download the most recent rally transcripts with our imported 'get_transcript'.
# The URLs are listed in the same order as the names they are unpacked into, and
# the requests are issued in that order.
speech_urls = (
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26',
)
(goodyear, bullhead, omaha, wsalem,
 lansing, martinsburg, lititz, allentown) = [get_transcript(url) for url in speech_urls]

In [None]:
# Clean the Goodyear transcript using our imported 'clean' function.
# NOTE(review): only 'goodyear' is cleaned in this cell; the other scraped
# transcripts are not passed to 'clean' here.
clean_goodyear = clean(goodyear)

In [None]:
# Preprocess the cleaned Goodyear speech using our imported 'preprocess' function.
# NOTE(review): the result is passed straight to the vectorizers' fit_transform
# further down, so it is presumably an iterable of document strings -- confirm
# against preprocess.py.
processed_goodyear = preprocess(clean_goodyear)

# Doc-Term Matrix

In [None]:
# CountVectorizer: build the raw term-count document-term matrix (DTM),
# dropping scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words='english')

# Learn the vocabulary and produce the sparse count matrix in one pass.
X_cv = cv.fit_transform(processed_goodyear)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(cv, 'get_feature_names_out'):
    cv_terms = cv.get_feature_names_out()
else:
    cv_terms = cv.get_feature_names()

# Dense DataFrame view of the counts: one row per document, one column per term.
df_cv = pd.DataFrame(X_cv.toarray(), columns=cv_terms)

In [None]:
# TF-IDF: reuse the CountVectorizer configuration so both document-term matrices
# share the same tokenisation and stop-word settings.
tfidf_params = cv.get_params()

# CountVectorizer defaults to an integer 'dtype', which TfidfVectorizer does not
# support: it emits a UserWarning and converts to float64 anyway. Drop a
# non-float dtype so the TF-IDF default is used without the warning.
if not np.issubdtype(tfidf_params['dtype'], np.floating):
    del tfidf_params['dtype']

tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

# Learn vocabulary and IDF weights and produce the sparse TF-IDF matrix.
X_tfidf = tfidf_vectorizer.fit_transform(processed_goodyear)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()

# Dense DataFrame view of the weights: one row per document, one column per term.
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_terms)

# LDA

In [None]:
# Fit one two-topic LDA model per document-term matrix, with a fixed seed so the
# topics are reproducible between runs.
#
# The models are fitted on the sparse matrices (X_cv / X_tfidf) rather than on
# their dense DataFrame copies: these are the same matrices later handed to
# pyLDAvis, so the fitted model and the visualisation input agree (scikit-learn
# >= 1.0 otherwise warns about missing feature names when a model fitted on a
# DataFrame is given a plain matrix), and no dense copy has to be processed.

# Term-count (CountVectorizer) DTM
lda_cv = LatentDirichletAllocation(n_components=2, random_state=0)
lda_cv.fit(X_cv)

# TF-IDF DTM
# NOTE(review): LDA is usually fitted on raw term counts; the TF-IDF model is
# kept as in the original analysis -- confirm this comparison is intentional.
lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)
lda_tfidf.fit(X_tfidf)

In [None]:
# LDA Visualization for CountVectorizer
# enable_notebook() is called once, before the first prepare(), so the prepared
# visualisation is displayed inline as the cell's output.
# NOTE(review): the 'pyLDAvis.sklearn' module appears to have been renamed to
# 'pyLDAvis.lda_model' in pyLDAvis 3.4 -- confirm against the installed version.
pyLDAvis.enable_notebook()
# Arguments: fitted LDA model, the sparse count matrix, and its vectorizer.
pyLDAvis.sklearn.prepare(lda_cv, X_cv, cv)

In [None]:
# LDA Visualization for TF_IDF
# Arguments: fitted LDA model, the sparse TF-IDF matrix, and its vectorizer.
# NOTE(review): the 'pyLDAvis.sklearn' module appears to have been renamed to
# 'pyLDAvis.lda_model' in pyLDAvis 3.4 -- confirm against the installed version.
pyLDAvis.sklearn.prepare(lda_tfidf, X_tfidf, tfidf_vectorizer)