# Importing Dependencies

In [13]:
import re
import pickle
import string
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF


import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from PIL import Image

from wordcloud import WordCloud
from textwrap import wrap
from sklearn.manifold import TSNE

# Custom functions from .py files
from web_scrape import get_transcript
from preprocess import clean, preprocess

# Import/Preprocess Data

In [14]:
# Rev.com transcript pages for the most recent rally speeches, in the order they are scraped
speech_urls = [
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-goodyear-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-bullhead-city-az-october-28',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-omaha-ne-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-west-salem-wisconsin-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lansing-michigan-october-27',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-martinsburg-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-lititz-pa-october-26',
    'https://www.rev.com/blog/transcripts/donald-trump-rally-speech-transcript-allentown-pa-october-26',
]

# Scrape every page with our imported 'get_transcript'; one variable per speech
(goodyear_az, bullhead_az, omaha_ne, wsalem_wi,
 lansing_mi, martinsburg_pa, lititz_pa, allentown_pa) = [get_transcript(url) for url in speech_urls]

# Corpus holding all of the speeches, followed by one corpus per state
corpus = [goodyear_az, bullhead_az, omaha_ne, wsalem_wi, lansing_mi, martinsburg_pa, lititz_pa, allentown_pa]
az, ne, wi, mi, pa = (
    [goodyear_az, bullhead_az],
    [omaha_ne],
    [wsalem_wi],
    [lansing_mi],
    [martinsburg_pa, lititz_pa, allentown_pa],
)

# Clean every corpus with our imported 'clean' function (full corpus first, then each state)
clean_corpus, clean_az, clean_ne, clean_wi, clean_mi, clean_pa = [
    clean(speeches) for speeches in (corpus, az, ne, wi, mi, pa)
]

# Preprocess every cleaned corpus with our imported 'preprocess' function, in the same order
processed_corpus, processed_az, processed_ne, processed_wi, processed_mi, processed_pa = [
    preprocess(speeches)
    for speeches in (clean_corpus, clean_az, clean_ne, clean_wi, clean_mi, clean_pa)
]

# Doc-Term Matrices

In [15]:
# Let's create our doc-term matrices for the entire corpus and for each individual state

In [16]:
# CountVectorizer settings shared by every corpus: English stop words removed, unigrams through trigrams
vectorizer_kwargs = dict(stop_words='english', ngram_range=(1,3))


def _feature_names(vectorizer):
    """
    Input: A fitted CountVectorizer
    Output: the vocabulary terms in doc-term-matrix column order

    `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
    `get_feature_names_out`; use whichever one the installed version provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# One vectorizer per corpus so that each keeps its own fitted vocabulary
cv = CountVectorizer(**vectorizer_kwargs)
cv_az = CountVectorizer(**vectorizer_kwargs)
cv_ne = CountVectorizer(**vectorizer_kwargs)
cv_wi = CountVectorizer(**vectorizer_kwargs)
cv_mi = CountVectorizer(**vectorizer_kwargs)
cv_pa = CountVectorizer(**vectorizer_kwargs)

# Doc-term matrix for the entire corpus (kept sparse; no dataframe is built for it)
X = cv.fit_transform(processed_corpus)

# Doc-term matrix and dataframe (one column per term) for each state
X_az = cv_az.fit_transform(processed_az)
df_az = pd.DataFrame(X_az.toarray(), columns=_feature_names(cv_az))

X_ne = cv_ne.fit_transform(processed_ne)
df_ne = pd.DataFrame(X_ne.toarray(), columns=_feature_names(cv_ne))

X_wi = cv_wi.fit_transform(processed_wi)
df_wi = pd.DataFrame(X_wi.toarray(), columns=_feature_names(cv_wi))

X_mi = cv_mi.fit_transform(processed_mi)
df_mi = pd.DataFrame(X_mi.toarray(), columns=_feature_names(cv_mi))

X_pa = cv_pa.fit_transform(processed_pa)
df_pa = pd.DataFrame(X_pa.toarray(), columns=_feature_names(cv_pa))

# Add 'State' column to each dataframe (integer code per state, broadcast to every row)
df_az['State'] = 0 # AZ
df_ne['State'] = 1 # NE
df_wi['State'] = 2 # WI
df_mi['State'] = 3 # MI
df_pa['State'] = 4 # PA

# Concatenate the state dataframes into one; a term missing from a state's vocabulary
# shows up as NaN after the concat, so fill those with a count of 0
df = pd.concat([df_az, df_ne, df_wi, df_mi, df_pa]).fillna(0)

# EDA Using NMF

In [24]:
# Number of topics extracted from every corpus
n_topics = 9
# Fixed seed for NMF initialisation and solver so re-running the notebook reproduces the
# same factorization. plot_topics labels the components by column position, so the
# hard-coded topic names there should be re-checked against the components fitted here.
nmf_seed = 42

# Instantiate NMF (one model per corpus so each keeps its own fitted components)
nmf = NMF(n_components=n_topics, random_state=nmf_seed)
nmf_az = NMF(n_components=n_topics, random_state=nmf_seed)
nmf_ne = NMF(n_components=n_topics, random_state=nmf_seed)
nmf_wi = NMF(n_components=n_topics, random_state=nmf_seed)
nmf_mi = NMF(n_components=n_topics, random_state=nmf_seed)
nmf_pa = NMF(n_components=n_topics, random_state=nmf_seed)

# Create doc_topic for all corpuses (rows follow the rows of the doc-term matrix, one column per topic)
doc_topic = nmf.fit_transform(X)
doc_topic_az = nmf_az.fit_transform(X_az)
doc_topic_ne = nmf_ne.fit_transform(X_ne)
doc_topic_wi = nmf_wi.fit_transform(X_wi)
doc_topic_mi = nmf_mi.fit_transform(X_mi)
doc_topic_pa = nmf_pa.fit_transform(X_pa)

In [20]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """
    Print the top words of every topic in a fitted topic model.

    Input: A model exposing `components_` (one array of term weights per topic), a feature names
           object indexable by term position, the number of top words to display, and an optional
           list-like of topic names (may be a numpy array, may be shorter than the number of
           topics, and may contain empty/None entries)
    Output: None; prints a heading per topic (its name when one is available, otherwise its index)
            followed by that topic's top words, highest weight first
    """
    for ix, topic in enumerate(model.components_):
        # Look the name up defensively: `topic_names` may be None, a numpy array (whose
        # truth value is ambiguous, so it cannot be tested with `not`), or too short.
        name = None
        if topic_names is not None and ix < len(topic_names):
            name = topic_names[ix]

        if not name:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",name,"'")

        # argsort is ascending, so walk it backwards to take the `no_top_words` largest weights
        top_term_ids = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_term_ids]))

In [97]:
def plot_topics(doc_topic, processed_corpus, top_n, threshold=0.10):
    """
    Takes in a doc-topic matrix, a processed corpus, and the number of topics the user would like to see,
    and outputs the bar chart of topics in descending order.

    Input:
        doc_topic: array with one row per document in `processed_corpus` and nine columns, one per topic
        processed_corpus: list-like of the document strings `doc_topic` was built from
        top_n: number of topics to show in the chart
        threshold: a document counts toward a topic when its (rounded) weight for that topic is
                   strictly greater than this value; defaults to 0.10
    Output: None; shows a plotly bar chart of the `top_n` topics with the most documents above `threshold`
    """
    # After analysis and inspection, it was determined these are the 9 most prominent topics in Trump's speeches.
    # The position of each name is the column of `doc_topic` that it labels.
    column_names = [
        'Patriotism',
        'China/World Trade',
        'Conservative Values',
        "Winning Election",
        "Trolling Dems",
        "Law & Order",
        "Border",
        "COVID",
        'Economy',
    ]

    # Order in which the topics are counted; values are looked up by name, so this order
    # only influences how topics with equal counts are arranged in the chart
    topic_names = np.array(['Patriotism','China/World Trade','Trolling Dems','Winning Election','Conservative Values','COVID','Economy','Border','Law & Order'])

    # Shorten the documents so our dataframes are easier to see
    ex_label = [e[:70]+"..." for e in processed_corpus]

    # Let's put the topic column names on our doc-topic matrix
    H = pd.DataFrame(doc_topic.round(4),
             index=ex_label,
             columns=column_names)

    # Number of documents whose weight for each topic exceeds the threshold
    topic_values = np.array([int((H[name] > threshold).sum()) for name in topic_names])

    # Sort both arrays by descending topic value, computing the ordering only once
    order = np.argsort(topic_values)[::-1][:top_n]
    topic_names = topic_names[order]
    topic_values = topic_values[order]

    # Create dictionary to be used in dataframe
    topic_dict = {'Topics': topic_names,'Number of Related Words': topic_values}

    # Dataframe for plotly
    df_topic = pd.DataFrame(data=topic_dict)

    # Plot the bar graph using plotly (the y-axis label is intentionally blanked via `labels`)
    fig = px.bar(data_frame=df_topic, x='Topics', y='Number of Related Words', title='Number of Related Sentences',
                 labels={'Number of Related Words':''})
    fig.show()

## Entire Corpus

In [98]:
# All speeches: use the doc-topic matrix fitted on the full corpus
# (`doc_topic_nmf` is never defined in this notebook; the NMF cell names it `doc_topic`)
plot_topics(doc_topic, processed_corpus, 9)

## Arizona

In [99]:
# Arizona: top 4 topics, using the doc-topic matrix fitted on the Arizona speeches
# (`doc_topic_nmf_az` is never defined in this notebook; the NMF cell names it `doc_topic_az`)
plot_topics(doc_topic_az, processed_az, 4)

## Nebraska

In [100]:
# Nebraska: top 4 topics, using the doc-topic matrix fitted on the Nebraska speech
# (`doc_topic_nmf_ne` is never defined in this notebook; the NMF cell names it `doc_topic_ne`)
plot_topics(doc_topic_ne, processed_ne, 4)

## Wisconsin

In [101]:
# Wisconsin: top 4 topics, using the doc-topic matrix fitted on the Wisconsin speech
# (`doc_topic_nmf_wi` is never defined in this notebook; the NMF cell names it `doc_topic_wi`)
plot_topics(doc_topic_wi, processed_wi, 4)

## Michigan

In [102]:
# Michigan: top 4 topics, using the doc-topic matrix fitted on the Michigan speech
# (`doc_topic_nmf_mi` is never defined in this notebook; the NMF cell names it `doc_topic_mi`)
plot_topics(doc_topic_mi, processed_mi, 4)

## Pennsylvania

In [103]:
# Pennsylvania: top 4 topics, using the doc-topic matrix fitted on the Pennsylvania speeches
# (`doc_topic_nmf_pa` is never defined in this notebook; the NMF cell names it `doc_topic_pa`)
plot_topics(doc_topic_pa, processed_pa, 4)